Skip to content

Commit 9ed0264

Browse files
epugh and Copilot committed
fix ci flakiness leaderelection test (#4388)
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: epugh <22395+epugh@users.noreply.github.com>
(cherry picked from commit 89c5413)
1 parent e3a4d1b commit 9ed0264

3 files changed

Lines changed: 38 additions & 39 deletions

File tree

solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java

Lines changed: 25 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,6 @@
1919
import java.io.IOException;
2020
import java.util.ArrayList;
2121
import java.util.List;
22-
import java.util.concurrent.TimeUnit;
2322
import org.apache.solr.client.solrj.SolrServerException;
2423
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
2524
import org.apache.solr.common.cloud.ZkNodeProps;
@@ -61,7 +60,10 @@ public void testSimpleSliceLeaderElection() throws Exception {
6160
String collection = "collection1";
6261
createCollection(collection);
6362

64-
cluster.waitForActiveCollection(collection, 10, TimeUnit.SECONDS, 2, 6);
63+
waitForState(
64+
"Timeout waiting for collection to become active",
65+
collection,
66+
clusterShape(2, NUM_REPLICAS_OF_SHARD1 + 1));
6567
List<JettySolrRunner> stoppedRunners = new ArrayList<>();
6668
for (int i = 0; i < 4; i++) {
6769
// who is the leader?
@@ -107,15 +109,20 @@ public void testSimpleSliceLeaderElection() throws Exception {
107109
assertNotNull(jetty);
108110
cluster.expireZkSession(jetty);
109111

110-
for (int i = 0; i < 60; i++) { // wait till leader is changed
111-
if (jetty != getRunner(getLeader(collection))) {
112-
break;
113-
}
114-
Thread.sleep(100);
115-
}
116-
117-
// make sure we have waited long enough for the first leader to have come back
118-
Thread.sleep(ZkTestServer.TICK_TIME * 2 + 100);
112+
// Wait until leadership has moved away from the expired-session node
113+
waitForState(
114+
"Expected leader to move away after expiring zk session",
115+
collection,
116+
c -> {
117+
var l = c.getLeader("shard1");
118+
return l != null && !jetty.getNodeName().equals(l.getNodeName());
119+
});
120+
121+
// Wait until the expired-session node is live again before stopping others
122+
waitForState(
123+
"Expected expired-session node to rejoin live nodes",
124+
collection,
125+
(liveNodes, c) -> liveNodes.contains(jetty.getNodeName()));
119126

120127
// kill everyone but the first leader that should have reconnected by now
121128
for (JettySolrRunner jetty2 : cluster.getJettySolrRunners()) {
@@ -124,18 +131,13 @@ public void testSimpleSliceLeaderElection() throws Exception {
124131
}
125132
}
126133

127-
for (int i = 0; i < 320; i++) { // wait till leader is changed
128-
try {
129-
if (jetty == getRunner(getLeader(collection))) {
130-
break;
131-
}
132-
Thread.sleep(100);
133-
} catch (Exception e) {
134-
continue;
135-
}
136-
}
137-
138-
assertEquals(jetty, getRunner(getLeader(collection)));
134+
waitForState(
135+
"Expected original node to become leader after others stopped",
136+
collection,
137+
c -> {
138+
var l = c.getLeader("shard1");
139+
return l != null && jetty.getNodeName().equals(l.getNodeName());
140+
});
139141
}
140142

141143
private JettySolrRunner getRunner(String nodeName) {

solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java

Lines changed: 5 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -23,9 +23,7 @@
2323
import org.apache.solr.client.solrj.request.SolrQuery;
2424
import org.apache.solr.client.solrj.response.QueryResponse;
2525
import org.apache.solr.common.SolrInputDocument;
26-
import org.apache.solr.common.cloud.ZkStateReader;
2726
import org.apache.solr.embedded.JettySolrRunner;
28-
import org.apache.zookeeper.KeeperException;
2927
import org.junit.AfterClass;
3028
import org.junit.BeforeClass;
3129
import org.slf4j.Logger;
@@ -60,12 +58,15 @@ public void testRestartZkWhenClusterDown() throws Exception {
6058
// This attempt will fail since it will time out after 1 second
6159
System.setProperty("solr.cloud.wait.for.zk.seconds", "1");
6260
restartSolrAndZk();
63-
waitForLiveNodes(0);
61+
waitForState("Timeout waiting for 0 live nodes", coll, (liveNodes, c) -> liveNodes.isEmpty());
6462

6563
// This attempt will succeed since there will be enough time to connect
6664
System.setProperty("solr.cloud.wait.for.zk.seconds", "20");
6765
restartSolrAndZk();
68-
waitForLiveNodes(cluster.getJettySolrRunners().size());
66+
waitForState(
67+
"Timeout waiting for all nodes to come up",
68+
coll,
69+
(liveNodes, c) -> liveNodes.size() == cluster.getJettySolrRunners().size());
6970
waitForState("Timeout waiting for " + coll, coll, clusterShape(2, 2));
7071
QueryResponse rsp =
7172
new QueryRequest(new SolrQuery("*:*")).process(cluster.getSolrClient(), coll);
@@ -99,14 +100,4 @@ private void restartSolrAndZk() throws Exception {
99100
thread.join();
100101
}
101102
}
102-
103-
private void waitForLiveNodes(int numNodes) throws InterruptedException, KeeperException {
104-
ZkStateReader zkStateReader = cluster.getZkStateReader();
105-
for (int i = 0; i < 100; i++) {
106-
zkStateReader.updateLiveNodes();
107-
if (zkStateReader.getClusterState().getLiveNodes().size() == numNodes) return;
108-
Thread.sleep(200);
109-
}
110-
fail("Timeout waiting for number of live nodes = " + numNodes);
111-
}
112103
}

solr/core/src/test/org/apache/solr/cloud/ZkShardTermsRecoveryTest.java

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -52,14 +52,20 @@ public static void setupCluster() throws Exception {
5252
CollectionAdminRequest.createCollection(COLLECTION, "conf", NUM_SHARDS, NUM_REPLICAS)
5353
.process(cluster.getSolrClient())
5454
.getStatus());
55-
cluster.waitForActiveCollection(COLLECTION, 10, TimeUnit.SECONDS, 2, NUM_SHARDS * NUM_REPLICAS);
55+
waitForState(
56+
"Timeout waiting for collection to be active after creation",
57+
COLLECTION,
58+
clusterShape(NUM_SHARDS, NUM_SHARDS * NUM_REPLICAS));
5659
}
5760

5861
@Before
5962
public void waitForActiveState() throws Exception {
6063
CollectionAdminRequest.modifyCollection(COLLECTION, Map.of("readOnly", false))
6164
.process(cluster.getSolrClient());
62-
cluster.waitForActiveCollection(COLLECTION, 10, TimeUnit.SECONDS, 2, NUM_SHARDS * NUM_REPLICAS);
65+
waitForState(
66+
"Timeout waiting for active collection",
67+
COLLECTION,
68+
clusterShape(NUM_SHARDS, NUM_SHARDS * NUM_REPLICAS));
6369
}
6470

6571
@Test

0 commit comments

Comments
 (0)