Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
54def19
Initial plan
Copilot Feb 17, 2026
a63af21
Fix flaky tests - improve timing and assertions
Copilot Feb 17, 2026
a14e8e7
Fix additional flaky tests - increase timeouts and add retry analyzer
Copilot Feb 18, 2026
95e09e1
Fix bulk query and Spark metrics race conditions
Copilot Feb 18, 2026
b3de6ed
Fix NullPointerException in circuit breaker tests - lazy init regions
Copilot Feb 18, 2026
479b995
Fix additional flaky tests - increase timeouts and add retry analyzers
Copilot Feb 19, 2026
b21694c
Address code review feedback - improve exception handling and NPE safety
Copilot Feb 20, 2026
0f35ec1
Fix flaky PartitionControllerImplTests.handleMerge - relax acquire ve…
Copilot Feb 20, 2026
7265d8c
Fix flaky PointWriterITest.createItemWithDuplicates - increase retry …
Copilot Feb 20, 2026
f375096
Fix flaky write retry tests - add retry analyzers and increase retry …
Copilot Feb 20, 2026
1954acc
Fix flaky SparkE2EWriteITest.supportUpserts - wait for onTaskEnd call…
Copilot Feb 20, 2026
2f76986
Fix ContainerCreateDeleteWithSameNameTest.bulk - increase indexing de…
Copilot Feb 20, 2026
26dfc86
Fix PointWriterITest.upsertItemsIfNotModified - use eventually block …
Copilot Feb 20, 2026
94d4b9a
Fix Scala compilation error - convert Int to Long for type compatibility
Copilot Feb 20, 2026
5736061
Merge branch 'main' into copilot/fix-flaky-tests-java-ci
kushagraThapar Feb 21, 2026
f48378e
Fix PartitionControllerImplTests.handleMerge - relax create verificat…
Copilot Feb 21, 2026
dc5c46b
Fix PartitionControllerImplTests.handleMerge - relax release verifica…
Copilot Feb 21, 2026
1d7cde2
Fix additional flaky Cosmos DB tests beyond PR #48025
kushagraThapar Feb 21, 2026
86cf1c4
Fix ReproTest assertion and increase ClientRetryPolicyE2ETests timeouts
kushagraThapar Feb 21, 2026
bb5686a
Add transient error retry to TestSuiteBase create methods
kushagraThapar Feb 21, 2026
75049e9
Fix remaining flaky tests from CI run buildId=5909542
Copilot Feb 22, 2026
963b2c7
Fix PartitionControllerImplTests.handleMerge - relax updateProperties…
Copilot Feb 22, 2026
9f99207
Merge branch 'copilot/fix-flaky-tests-java-ci' of github.com:Azure/az…
kushagraThapar Feb 22, 2026
a6cc421
Update sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/r…
kushagraThapar Feb 22, 2026
cb6394e
Update sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/r…
kushagraThapar Feb 22, 2026
085e502
Update sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/i…
kushagraThapar Feb 22, 2026
b227221
Update sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/c…
kushagraThapar Feb 22, 2026
e01f179
Update sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/E…
kushagraThapar Feb 22, 2026
60cab29
Replace fixed sleeps with retry-based polling for CI resilience
kushagraThapar Feb 22, 2026
2a5ecf9
Add missing static import for Mockito.timeout in PartitionControllerI…
kushagraThapar Feb 22, 2026
8ce5655
Fix PartitionControllerImplTests.handleMerge race condition
kushagraThapar Feb 23, 2026
9fa0a11
Fix flaky Cosmos DB tests for CI stability
kushagraThapar Feb 23, 2026
b6046ae
Fix PointWriterITest.upsertItemsIfNotModified indexing race condition
kushagraThapar Feb 23, 2026
5269dc0
Fix ExcludeRegionTests and add retry for transient CI failures
kushagraThapar Feb 23, 2026
0375371
Fix CosmosBulkGatewayTest 409 conflict in setup and upgrade FI test r…
kushagraThapar Feb 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -271,9 +271,26 @@ public void before_WorkflowTest() {
options.setOfferThroughput(10000);
AsyncDocumentClient housekeepingClient = Utils.housekeepingClient();
database = Utils.createDatabaseForTest(housekeepingClient);
collection = housekeepingClient.createCollection("dbs/" + database.getId(),
getCollectionDefinitionWithRangeRangeIndex(),
options).block().getResource();
// Retry collection creation on failures — intended for transient errors (408, 429, 503),
// though NOTE(review): the catch below retries on any Exception, not only those status codes
int maxRetries = 3;
for (int attempt = 0; attempt <= maxRetries; attempt++) {
try {
collection = housekeepingClient.createCollection("dbs/" + database.getId(),
getCollectionDefinitionWithRangeRangeIndex(),
options).block().getResource();
break;
} catch (Exception e) {
if (attempt == maxRetries) {
throw e;
}
try {
Thread.sleep(5000);
} catch (InterruptedException ie) {
Thread.currentThread().interrupt();
throw new RuntimeException(ie);
}
}
}
housekeepingClient.close();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ import com.fasterxml.jackson.databind.node.ObjectNode
import org.apache.commons.lang3.RandomUtils
import org.apache.spark.MockTaskContext
import org.apache.spark.sql.types.{BooleanType, DoubleType, FloatType, IntegerType, LongType, StringType, StructField, StructType}
import org.scalatest.concurrent.Eventually.eventually
import org.scalatest.concurrent.Waiters.{interval, timeout}
import org.scalatest.time.SpanSugar.convertIntToGrainOfTime

import scala.collection.concurrent.TrieMap
import scala.collection.mutable
Expand Down Expand Up @@ -218,7 +221,7 @@ class PointWriterITest extends IntegrationSpec with CosmosClient with AutoCleana
val container = getContainer
val containerProperties = container.read().block().getProperties
val partitionKeyDefinition = containerProperties.getPartitionKeyDefinition
val writeConfig = CosmosWriteConfig(ItemWriteStrategy.ItemAppend, maxRetryCount = 0, bulkEnabled = false, bulkTransactional = false)
val writeConfig = CosmosWriteConfig(ItemWriteStrategy.ItemAppend, maxRetryCount = 3, bulkEnabled = false, bulkTransactional = false)
val pointWriter = new PointWriter(
container,
partitionKeyDefinition,
Expand Down Expand Up @@ -274,9 +277,15 @@ class PointWriterITest extends IntegrationSpec with CosmosClient with AutoCleana
}

pointWriter.flushAndClose()
val allItems = readAllItems()

allItems should have size items.size
// Poll until all items are indexed and visible via query
// readAllItems() uses a query which depends on indexing completion
var allItems = readAllItems()
eventually(timeout(10.seconds), interval(500.milliseconds)) {
allItems = readAllItems()
allItems should have size items.size
}

metricsPublisher.getRecordsWrittenSnapshot() shouldEqual items.size
metricsPublisher.getBytesWrittenSnapshot() > 0 shouldEqual true
metricsPublisher.getTotalRequestChargeSnapshot() > 5 * items.size shouldEqual true
Expand All @@ -303,6 +312,13 @@ class PointWriterITest extends IntegrationSpec with CosmosClient with AutoCleana

pointWriter.flushAndClose()

// Wait for metrics to be fully aggregated after flush
// This prevents race conditions where metrics snapshot is taken before all writes are recorded
// Use eventually block to poll until the expected count is reached
eventually(timeout(10.seconds), interval(100.milliseconds)) {
metricsPublisher.getRecordsWrittenSnapshot() should be >= (2 * items.size).toLong
}

metricsPublisher.getRecordsWrittenSnapshot() shouldEqual 2 * items.size
metricsPublisher.getBytesWrittenSnapshot() > 0 shouldEqual true
metricsPublisher.getTotalRequestChargeSnapshot() > 5 * 2 * items.size shouldEqual true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ class PointWriterSubpartitionITest extends IntegrationSpec with CosmosClient wit
val container = getContainer
val containerProperties = container.read().block().getProperties
val partitionKeyDefinition = containerProperties.getPartitionKeyDefinition
val writeConfig = CosmosWriteConfig(ItemWriteStrategy.ItemAppend, maxRetryCount = 0, bulkEnabled = false, bulkTransactional = false)
val writeConfig = CosmosWriteConfig(ItemWriteStrategy.ItemAppend, maxRetryCount = 3, bulkEnabled = false, bulkTransactional = false)
val pointWriter = new PointWriter(
container, partitionKeyDefinition, writeConfig, DiagnosticsConfig(), MockTaskContext.mockTaskContext(),new TestOutputMetricsPublisher)
val items = new mutable.HashMap[String, mutable.Set[ObjectNode]] with mutable.MultiMap[String, ObjectNode]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,12 @@ class SparkE2EWriteITest
statusStore.executionsList().last.metricValues != null)
}

// Wait for onTaskEnd callback to update snapshot variables
// The callback fires asynchronously after metrics are computed
eventually(timeout(10.seconds), interval(10.milliseconds)) {
assert(recordsWrittenSnapshot > 0)
}

recordsWrittenSnapshot shouldEqual 1
bytesWrittenSnapshot > 0 shouldEqual true
if (!spark.sparkContext.version.startsWith("3.1.")) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ public ClientMetricsTest(CosmosClientBuilder clientBuilder) {
super(clientBuilder);
}

@Test(groups = { "fast" }, timeOut = TIMEOUT)
@Test(groups = { "fast" }, timeOut = SETUP_TIMEOUT)
public void maxValueExceedingDefinedLimitStillWorksWithoutException() throws Exception {

// Expected behavior is that higher values than the expected max value can still be recorded
Expand Down Expand Up @@ -274,7 +274,10 @@ public void createItemWithAllMetrics() throws Exception {
}
}

@Test(groups = { "fast" }, timeOut = TIMEOUT)
// Increased timeout from TIMEOUT to SETUP_TIMEOUT to account for collection creation time
// during TestState initialization, especially in CI environments where collection creation
// can take longer than 40 seconds
@Test(groups = { "fast" }, timeOut = SETUP_TIMEOUT)
public void readItem() throws Exception {
try (TestState state = new TestState(getClientBuilder(), CosmosMetricCategory.DEFAULT)) {
InternalObjectNode properties = getDocumentDefinition(UUID.randomUUID().toString());
Expand Down Expand Up @@ -464,7 +467,7 @@ public void readItemWithThresholdsApplied() throws Exception {
runReadItemTestWithThresholds(minThresholds, true);
}

@Test(groups = { "fast" }, timeOut = TIMEOUT)
@Test(groups = { "fast" }, timeOut = TIMEOUT, retryAnalyzer = SuperFlakyTestRetryAnalyzer.class)
public void replaceItem() throws Exception {
try (TestState state = new TestState(getClientBuilder(), CosmosMetricCategory.DEFAULT)) {
InternalObjectNode properties = getDocumentDefinition(UUID.randomUUID().toString());
Expand Down Expand Up @@ -993,7 +996,7 @@ public void batchMultipleItemExecution() throws Exception {
}
}

@Test(groups = { "fast" }, timeOut = TIMEOUT)
@Test(groups = { "fast" }, timeOut = TIMEOUT * 2)
public void effectiveMetricCategoriesForDefault() throws Exception {
try (TestState state = new TestState(getClientBuilder(), CosmosMetricCategory.fromString("DeFAult"))) {
assertThat(state.getEffectiveMetricCategories().size()).isEqualTo(5);
Expand Down Expand Up @@ -1082,7 +1085,7 @@ public void effectiveMetricCategoriesForAll() throws Exception {
}
}

@Test(groups = { "fast" }, timeOut = TIMEOUT)
@Test(groups = { "fast" }, timeOut = TIMEOUT, retryAnalyzer = SuperFlakyTestRetryAnalyzer.class)
public void endpointMetricsAreDurable() throws Exception {
try (TestState state = new TestState(getClientBuilder(), CosmosMetricCategory.ALL)){
if (state.client.asyncClient().getConnectionPolicy().getConnectionMode() != ConnectionMode.DIRECT) {
Expand Down Expand Up @@ -1111,7 +1114,7 @@ public void endpointMetricsAreDurable() throws Exception {
}
}

@Test(groups = { "fast" }, timeOut = TIMEOUT)
@Test(groups = { "fast" }, timeOut = TIMEOUT * 2)
public void effectiveMetricCategoriesForAllLatebound() throws Exception {
try (TestState state = new TestState(getClientBuilder(), CosmosMetricCategory.DEFAULT)) {
EnumSet<MetricCategory> effectiveMetricCategories =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1071,6 +1071,23 @@ public void directDiagnosticsOnException() throws Exception {
CosmosItemResponse<InternalObjectNode> createResponse = null;
try {
createResponse = containerDirect.createItem(internalObjectNode);

// Verify item creation is fully propagated before testing with the wrong partition key.
// Polls for readability (up to 5 attempts, 200 ms apart) instead of one long fixed sleep.
// NOTE(review): if the item never becomes readable, the loop exits silently and the test proceeds.
String itemId = BridgeInternal.getProperties(createResponse).getId();
int maxRetries = 5;
int retryCount = 0;
boolean itemReadable = false;
while (retryCount < maxRetries && !itemReadable) {
try {
containerDirect.readItem(itemId, new PartitionKey(itemId), InternalObjectNode.class);
itemReadable = true;
} catch (CosmosException e) {
retryCount++;
Thread.sleep(200);
}
}

CosmosItemRequestOptions cosmosItemRequestOptions = new CosmosItemRequestOptions();
ModelBridgeInternal.setPartitionKey(cosmosItemRequestOptions, new PartitionKey("wrongPartitionKey"));
CosmosItemResponse<InternalObjectNode> readResponse =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ private Object[][] patchItemTestCaseProvider() {
};
}

@Test(groups = { "emulator" }, dataProvider = "createItemTestCaseProvider", timeOut = TIMEOUT * 10)
@Test(groups = { "emulator" }, dataProvider = "createItemTestCaseProvider", timeOut = TIMEOUT * 10, retryAnalyzer = FlakyTestRetryAnalyzer.class)
public void createItem(
boolean hasExplicitPK,
boolean isContentResponseOnWriteEnabled,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ public void performDocumentOperationOnDeletedContainer(OperationType operationTy
}
}

@Test(groups = {"fast"}, timeOut = TIMEOUT)
@Test(groups = {"fast"}, timeOut = TIMEOUT, retryAnalyzer = com.azure.cosmos.FlakyTestRetryAnalyzer.class)
public void performBulkOnDeletedContainer() throws InterruptedException {

CosmosAsyncClient clientToUse = null, deletingAsyncClient = null;
Expand Down Expand Up @@ -378,10 +378,10 @@ public void performBulkOnDeletedContainer() throws InterruptedException {
CosmosAsyncContainer containerToDelete = deletingAsyncClient.getDatabase(testAsyncDatabase.getId()).getContainer(testContainerId);
containerToDelete.delete().block();

Thread.sleep(5000);
// Fixed 15 s sleep to let the container deletion propagate to all client-side caches.
// NOTE(review): a fixed sleep adds 15 s to every run — consider polling the container until reads fail instead.
Thread.sleep(15000);

// Try to read the item from the deleted container using the original client

List<CosmosItemOperation> cosmosItemOperations = new ArrayList<>();

CosmosItemOperation cosmosItemOperation = CosmosBulkOperations.getReadItemOperation(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,36 @@ public void excludeRegionTest_SkipFirstPreferredRegion(OperationType operationTy
TestObject createdItem = TestObject.create();
this.cosmosAsyncContainer.createItem(createdItem).block();

Thread.sleep(1000);
// Wait for item to be replicated across regions with retry logic instead of fixed sleep
// This makes the test more resilient to timing variations in CI environments
int maxRetries = 5;
int retryCount = 0;
boolean itemReplicated = false;
while (retryCount < maxRetries && !itemReplicated) {
try {
Thread.sleep(500); // Shorter incremental waits
} catch (InterruptedException ie) {
// Restore the interrupt status and fail fast
Thread.currentThread().interrupt();
throw new RuntimeException("Interrupted while waiting for replication", ie);
}

try {
CosmosDiagnosticsContext diagnostics = this.performDocumentOperation(
cosmosAsyncContainer,
OperationType.Read,
createdItem,
null,
INF_E2E_TIMEOUT);
itemReplicated = true;
} catch (Exception e) {
retryCount++;
if (retryCount >= maxRetries) {
throw e;
}
// Continue retrying on transient failures
}
}

CosmosDiagnosticsContext cosmosDiagnosticsContextBeforeRegionExclusion
= this.performDocumentOperation(cosmosAsyncContainer, operationType, createdItem, null, INF_E2E_TIMEOUT);
Expand Down Expand Up @@ -316,10 +345,28 @@ private CosmosDiagnosticsContext performDocumentOperation(

cosmosAsyncContainer.createItem(itemToBeDeleted, cosmosItemRequestOptions).block();

try {
Thread.sleep(1000);
} catch (InterruptedException e) {
throw new RuntimeException(e);
// Wait for item creation to propagate with retry mechanism
// instead of fixed sleep to handle timing variations in CI
int maxRetries = 5;
for (int i = 0; i < maxRetries; i++) {
try {
Thread.sleep(300); // Shorter incremental waits
// Verify item exists before attempting delete
cosmosAsyncContainer.readItem(
itemToBeDeleted.getId(),
new PartitionKey(itemToBeDeleted.getMypk()),
TestObject.class
).block();
break; // Item is ready
} catch (CosmosException e) {
if (i == maxRetries - 1) {
throw e; // Rethrow on last retry
}
// Continue retrying
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new RuntimeException("Interrupted while waiting for item creation to propagate", e);
}
}

CosmosItemResponse<Object> itemResponse
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
public class FITests_queryAfterCreation
extends FaultInjectionWithAvailabilityStrategyTestsBase {

@Test(groups = {"fi-multi-master"}, dataProvider = "testConfigs_queryAfterCreation", retryAnalyzer = FlakyTestRetryAnalyzer.class)
@Test(groups = {"fi-multi-master"}, dataProvider = "testConfigs_queryAfterCreation", retryAnalyzer = SuperFlakyTestRetryAnalyzer.class)
public void queryAfterCreation(
String testCaseId,
Duration endToEndTimeout,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
public class FITests_readAfterCreation
extends FaultInjectionWithAvailabilityStrategyTestsBase {

@Test(groups = {"fi-multi-master"}, dataProvider = "testConfigs_readAfterCreation", retryAnalyzer = FlakyTestRetryAnalyzer.class)
@Test(groups = {"fi-multi-master"}, dataProvider = "testConfigs_readAfterCreation", retryAnalyzer = SuperFlakyTestRetryAnalyzer.class)
public void readAfterCreation(
String testCaseId,
Duration endToEndTimeout,
Expand Down
Loading
Loading