Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
5.1
* GCInspector should use different thresholds on GC events for warning messages (CASSANDRA-20980)
* Add cqlsh autocompletion for the identity mapping feature (CASSANDRA-20021)
* Add DDL Guardrail enabling administrators to disallow creation/modification of keyspaces with durable_writes = false (CASSANDRA-20913)
* Optimize Counter, Meter and Histogram metrics using thread local counters (CASSANDRA-20250)
Expand Down
16 changes: 16 additions & 0 deletions conf/cassandra.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2048,6 +2048,11 @@ batch_size_fail_threshold: 50KiB
# Log WARN on any batches not of type LOGGED than span across more partitions than this limit
unlogged_batch_across_partitions_warn_threshold: 10

# GCInspector configs:
# For GC like ShenandoahGC/ZGC etc., there are GC events that do not have STW pauses (Concurrent phases)
# Operator might find it reasonable to use lower thresholds for events require STW pauses and higher thresholds
# for concurrent phases.
#
# GC Pauses greater than 200 ms will be logged at INFO level
# This threshold can be adjusted to minimize logging if necessary
# Min unit: ms
Expand All @@ -2059,6 +2064,17 @@ unlogged_batch_across_partitions_warn_threshold: 10
# Min unit: ms
# gc_warn_threshold: 1000ms

# GC Concurrent phase greater than 1000 ms will be logged at INFO level
# This threshold can be adjusted to minimize logging if necessary
# Min unit: ms
# gc_concurrent_phase_log_threshold: 1000ms

# GC Concurrent phase than gc_concurrent_phase_warn_threshold will be logged at WARN level
# Adjust the threshold based on your application throughput requirement. Setting to 0
# will deactivate the feature.
# Min unit: ms
# gc_concurrent_phase_warn_threshold: 2000ms

# Maximum size of any value in SSTables. Safety measure to detect SSTable corruption
# early. Any value size larger than this threshold will result into marking an SSTable
# as corrupted. This should be positive and less than 2GiB.
Expand Down
16 changes: 16 additions & 0 deletions conf/cassandra_latest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1913,6 +1913,11 @@ batch_size_fail_threshold: 50KiB
# Log WARN on any batches not of type LOGGED than span across more partitions than this limit
unlogged_batch_across_partitions_warn_threshold: 10

# GCInspector configs:
# For GC like ShenandoahGC/ZGC etc., there are GC events that do not have STW pauses.
# Such events are called Concurrent phases. Operator might find it reasonable to use lower thresholds
# for events require STW pauses and higher thresholds for concurrent phases.
#
# GC Pauses greater than 200 ms will be logged at INFO level
# This threshold can be adjusted to minimize logging if necessary
# Min unit: ms
Expand All @@ -1924,6 +1929,17 @@ unlogged_batch_across_partitions_warn_threshold: 10
# Min unit: ms
# gc_warn_threshold: 1000ms

# GC Concurrent phase greater than 1000 ms will be logged at INFO level
# This threshold can be adjusted to minimize logging if necessary
# Min unit: ms
# gc_concurrent_phase_log_threshold: 1000ms

# GC Concurrent phase than gc_concurrent_phase_warn_threshold will be logged at WARN level
# Adjust the threshold based on your application throughput requirement. Setting to 0
# will deactivate the feature.
# Min unit: ms
# gc_concurrent_phase_warn_threshold: 2000ms

# Maximum size of any value in SSTables. Safety measure to detect SSTable corruption
# early. Any value size larger than this threshold will result into marking an SSTable
# as corrupted. This should be positive and less than 2GiB.
Expand Down
2 changes: 2 additions & 0 deletions src/java/org/apache/cassandra/config/Config.java
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,8 @@ public static class SSTableConfig
public volatile DurationSpec.IntMillisecondsBound gc_log_threshold = new DurationSpec.IntMillisecondsBound("200ms");
@Replaces(oldName = "gc_warn_threshold_in_ms", converter = Converters.MILLIS_DURATION_INT, deprecated = true)
public volatile DurationSpec.IntMillisecondsBound gc_warn_threshold = new DurationSpec.IntMillisecondsBound("1s");
public volatile DurationSpec.IntMillisecondsBound gc_concurrent_phase_log_threshold = new DurationSpec.IntMillisecondsBound("1s");
public volatile DurationSpec.IntMillisecondsBound gc_concurrent_phase_warn_threshold = new DurationSpec.IntMillisecondsBound("2s");

// TTL for different types of trace events.
@Replaces(oldName = "tracetype_query_ttl", converter = Converters.SECONDS_DURATION, deprecated=true)
Expand Down
21 changes: 21 additions & 0 deletions src/java/org/apache/cassandra/config/DatabaseDescriptor.java
Original file line number Diff line number Diff line change
Expand Up @@ -4698,6 +4698,27 @@ public static void setGCWarnThreshold(int threshold)
conf.gc_warn_threshold = new DurationSpec.IntMillisecondsBound(threshold);
}

public static int getGCConcurrentPhaseLogThreshold()
{
return conf.gc_concurrent_phase_log_threshold.toMilliseconds();
}

public static void setGCConcurrentPhaseLogThreshold(int threshold)
{
conf.gc_concurrent_phase_log_threshold = new DurationSpec.IntMillisecondsBound(threshold);
}


public static int getGCConcurrentPhaseWarnThreshold()
{
return conf.gc_concurrent_phase_warn_threshold.toMilliseconds();
}

public static void setGCConcurrentPhaseWarnThreshold(int threshold)
{
conf.gc_concurrent_phase_warn_threshold = new DurationSpec.IntMillisecondsBound(threshold);
}

public static boolean isCDCEnabled()
{
return conf.cdc_enabled;
Expand Down
82 changes: 73 additions & 9 deletions src/java/org/apache/cassandra/service/GCInspector.java
Original file line number Diff line number Diff line change
Expand Up @@ -287,23 +287,53 @@ public void handleNotification(final Notification notification, final Object han
if (state.compareAndSet(prev, new State(duration, bytes, prev)))
break;
}

if (getGcWarnThresholdInMs() != 0 && duration > getGcWarnThresholdInMs())
logger.warn(sb.toString());
else if (duration > getGcLogThresholdInMs())
logger.info(sb.toString());
else if (logger.isTraceEnabled())
logger.trace(sb.toString());

if (duration > this.getStatusThresholdInMs())
StatusLogger.log();
if (isConcurrentPhase(info.getGcCause(), info.getGcName()))
{
if (getGcConcurrentPhaseWarnThresholdInMs() != 0 && duration > getGcConcurrentPhaseWarnThresholdInMs())
logger.warn(sb.toString());
else if (duration > getGcConcurrentPhaseLogThresholdInMs())
logger.info(sb.toString());
else if (logger.isTraceEnabled())
logger.trace(sb.toString());
// TODO: trigger StatusLogger if concurrent phases take too long?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

regarding the TODO, yes, I think it makes sense to do a symmetric logic: too long GC can steal some computation resources from request processing and it maybe helpful to flush stats to see a possible impact.
Note: the threshold for status logging should be correspondent (based on concurrent phase log thresholds)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be this comment removed as I see this patch contains StatusLogger.log already?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we have it for non-concurrent branch only, so it is actual for the current branch with a concurrent threshold logic

}
else
{
if (getGcWarnThresholdInMs() != 0 && duration > getGcWarnThresholdInMs())
logger.warn(sb.toString());
else if (duration > getGcLogThresholdInMs())
logger.info(sb.toString());
else if (logger.isTraceEnabled())
logger.trace(sb.toString());
if (duration > this.getStatusThresholdInMs())
StatusLogger.log();
}

// if we just finished an old gen collection and we're still using a lot of memory, try to reduce the pressure
if (gcState.assumeGCIsOldGen)
LifecycleTransaction.rescheduleFailedDeletions();
}
}

static boolean isConcurrentPhase(String cause, String name) {
// Mostly taken from: https://github.com/Netflix/spectator/blob/v1.7.x/spectator-ext-gc/src/main/java/com/netflix/spectator/gc/GcLogger.java
// So far the only indicator known is that the cause will be reported as "No GC"
// when using CMS.
//
// For ZGC, behavior was changed in JDK17: https://bugs.openjdk.java.net/browse/JDK-8265136
// For ZGC in older versions, there is no way to accurately get the amount of time
// in STW pauses.
//
// For G1, a new bean is added in JDK20 to indicate time spent in concurrent phases:
// https://bugs.openjdk.org/browse/JDK-8297247

return "No GC".equals(cause) // CMS
|| "G1 Concurrent GC".equals(name) // G1 in JDK20+
|| name.endsWith(" Cycles"); // Shenandoah, ZGC
}


public State getTotalSinceLastCheck()
{
return state.getAndSet(new State());
Expand Down Expand Up @@ -407,6 +437,40 @@ public void setGcLogThresholdInMs(long threshold)
DatabaseDescriptor.setGCLogThreshold((int) threshold);
}

public int getGcConcurrentPhaseWarnThresholdInMs()
{
return DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold();
}

public void setGcConcurrentPhaseWarnThresholdInMs(int threshold)
{
long gcConcurrentPhaseLogThresholdInMs = getGcConcurrentPhaseLogThresholdInMs();
Copy link
Contributor

@smiklosovic smiklosovic Oct 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can move the resolution of this after the first "if". I that throws you have called this unnecessarily.

You have already done it like that in setGcConcurrentPhaseLogThresholdInMs

if (threshold < 0)
throw new IllegalArgumentException("Threshold must be greater than or equal to 0");
if (threshold != 0 && threshold <= gcConcurrentPhaseLogThresholdInMs)
throw new IllegalArgumentException("Threshold must be greater than gcConcurrentPhaseLogThresholdInMs which is currently "
+ gcConcurrentPhaseLogThresholdInMs);
DatabaseDescriptor.setGCConcurrentPhaseWarnThreshold(threshold);
}

public int getGcConcurrentPhaseLogThresholdInMs()
{
return DatabaseDescriptor.getGCConcurrentPhaseLogThreshold();
}

public void setGcConcurrentPhaseLogThresholdInMs(int threshold)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get that what you did here just follows the style which was already there and there is nothing wrong with it as such but I just wonder if we could not move the logic of this into DD's method and this one would just call it. I get that the methods need to be here because we are exposing them in MBean but the logic itself might be just moved to DD, no?

Because if somebody every calls DD methods, there is no validation. But it is just here. So everybody has to go through this method to get the validation. If we moved it to DD then it would be validated every time, regardless from where it is called.

DD currently also contains methods which have some basic validation around its parameters so ...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also as I understand it .... what happens when you put into yaml logically invalid values and you start the node? because the validation occurs only when you call mbean methods. If we want to be sure that what we started the node with makes sense already then it would need to be validated in DatabaseDesciptor.applySimpleConfig that is called as part of node startup and validated there so we are sure it is all valid already

{
if (threshold <= 0)
throw new IllegalArgumentException("Threshold must be greater than 0");

long gcConcurrentPhaseWarnThresholdInMs = getGcConcurrentPhaseWarnThresholdInMs();
if (gcConcurrentPhaseWarnThresholdInMs != 0 && threshold > gcConcurrentPhaseWarnThresholdInMs)
throw new IllegalArgumentException("Threshold must be less than gcConcurrentPhaseWarnThresholdInMs which is currently "
+ gcConcurrentPhaseWarnThresholdInMs);

DatabaseDescriptor.setGCConcurrentPhaseLogThreshold(threshold);
}

public long getGcLogThresholdInMs()
{
return DatabaseDescriptor.getGCLogThreshold();
Expand Down
4 changes: 4 additions & 0 deletions src/java/org/apache/cassandra/service/GCInspectorMXBean.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ public interface GCInspectorMXBean
void setGcWarnThresholdInMs(long threshold);
long getGcWarnThresholdInMs();
void setGcLogThresholdInMs(long threshold);
int getGcConcurrentPhaseLogThresholdInMs();
void setGcConcurrentPhaseWarnThresholdInMs(int threshold);
int getGcConcurrentPhaseWarnThresholdInMs();
void setGcConcurrentPhaseLogThresholdInMs(int threshold);
long getGcLogThresholdInMs();
long getStatusThresholdInMs();
}
104 changes: 87 additions & 17 deletions test/unit/org/apache/cassandra/service/GCInspectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,16 @@
package org.apache.cassandra.service;

import org.apache.cassandra.config.DatabaseDescriptor;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;


public class GCInspectorTest
{

Expand All @@ -43,49 +48,98 @@ public void before()
@Test
public void ensureStaticFieldsHydrateFromConfig()
{
Assert.assertEquals(DatabaseDescriptor.getGCLogThreshold(), gcInspector.getGcLogThresholdInMs());
Assert.assertEquals(DatabaseDescriptor.getGCWarnThreshold(), gcInspector.getGcWarnThresholdInMs());
assertEquals(DatabaseDescriptor.getGCLogThreshold(), gcInspector.getGcLogThresholdInMs());
assertEquals(DatabaseDescriptor.getGCWarnThreshold(), gcInspector.getGcWarnThresholdInMs());
assertEquals(DatabaseDescriptor.getGCConcurrentPhaseLogThreshold(), gcInspector.getGcConcurrentPhaseLogThresholdInMs());
assertEquals(DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold(), gcInspector.getGcConcurrentPhaseWarnThresholdInMs());
}

@Test
public void ensureStatusIsCalculated()
{
Assert.assertTrue(gcInspector.getStatusThresholdInMs() > 0);
assertTrue(gcInspector.getStatusThresholdInMs() > 0);
}

@Test(expected=IllegalArgumentException.class)
@Test
public void ensureWarnGreaterThanLog()
{
gcInspector.setGcWarnThresholdInMs(gcInspector.getGcLogThresholdInMs());
try
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better style is to use Assertions.assertThrownBy

{
gcInspector.setGcWarnThresholdInMs(gcInspector.getGcLogThresholdInMs());
fail("Expect IllegalArgumentException");
}
catch (IllegalArgumentException e)
{
// expected
}
try
{
gcInspector.setGcConcurrentPhaseWarnThresholdInMs(gcInspector.getGcConcurrentPhaseLogThresholdInMs());
fail("Expect IllegalArgumentException");
}
catch (IllegalArgumentException e)
{
// expected
}
}

@Test
public void ensureZeroIsOk()
{
gcInspector.setGcWarnThresholdInMs(0);
Assert.assertEquals(gcInspector.getStatusThresholdInMs(), gcInspector.getGcLogThresholdInMs());
Assert.assertEquals(0, DatabaseDescriptor.getGCWarnThreshold());
Assert.assertEquals(200, DatabaseDescriptor.getGCLogThreshold());
assertEquals(gcInspector.getStatusThresholdInMs(), gcInspector.getGcLogThresholdInMs());
assertEquals(0, DatabaseDescriptor.getGCWarnThreshold());
assertEquals(200, DatabaseDescriptor.getGCLogThreshold());

gcInspector.setGcConcurrentPhaseWarnThresholdInMs(0);
assertEquals(0, DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold());
assertEquals(1000, DatabaseDescriptor.getGCConcurrentPhaseLogThreshold());
}

@Test(expected=IllegalArgumentException.class)
@Test
public void ensureLogLessThanWarn()
{
Assert.assertEquals(200, gcInspector.getGcLogThresholdInMs());
assertEquals(200, gcInspector.getGcLogThresholdInMs());
gcInspector.setGcWarnThresholdInMs(1000);
Assert.assertEquals(1000, gcInspector.getGcWarnThresholdInMs());
gcInspector.setGcLogThresholdInMs(gcInspector.getGcWarnThresholdInMs() + 1);
assertEquals(1000, gcInspector.getGcWarnThresholdInMs());
try
{
gcInspector.setGcLogThresholdInMs(gcInspector.getGcWarnThresholdInMs() + 1);
fail("Expect IllegalArgumentException");
}
catch (IllegalArgumentException e)
{
// expected
}
assertEquals(1000, gcInspector.getGcConcurrentPhaseLogThresholdInMs());
gcInspector.setGcConcurrentPhaseWarnThresholdInMs(2000);
assertEquals(2000, gcInspector.getGcConcurrentPhaseWarnThresholdInMs());
try
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assertions.assertThrownBy

{
gcInspector.setGcConcurrentPhaseLogThresholdInMs(gcInspector.getGcConcurrentPhaseWarnThresholdInMs() + 1);
fail("Expect IllegalArgumentException");
}
catch (IllegalArgumentException e)
{
// expected
}
}

@Test
public void testDefaults()
{
gcInspector.setGcLogThresholdInMs(200);
gcInspector.setGcWarnThresholdInMs(1000);
Assert.assertEquals(200, DatabaseDescriptor.getGCLogThreshold());
Assert.assertEquals(200, gcInspector.getGcLogThresholdInMs());
Assert.assertEquals(1000, DatabaseDescriptor.getGCWarnThreshold());
Assert.assertEquals(1000, gcInspector.getGcWarnThresholdInMs());
gcInspector.setGcConcurrentPhaseLogThresholdInMs(1000);
gcInspector.setGcConcurrentPhaseWarnThresholdInMs(2000);
assertEquals(200, DatabaseDescriptor.getGCLogThreshold());
assertEquals(200, gcInspector.getGcLogThresholdInMs());
assertEquals(1000, DatabaseDescriptor.getGCWarnThreshold());
assertEquals(1000, gcInspector.getGcWarnThresholdInMs());
assertEquals(1000, DatabaseDescriptor.getGCConcurrentPhaseLogThreshold());
assertEquals(1000, gcInspector.getGcConcurrentPhaseLogThresholdInMs());
assertEquals(2000, DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold());
assertEquals(2000, gcInspector.getGcConcurrentPhaseWarnThresholdInMs());
}

@Test(expected=IllegalArgumentException.class)
Expand All @@ -94,4 +148,20 @@ public void testMaxValue()
gcInspector.setGcLogThresholdInMs(200);
gcInspector.setGcWarnThresholdInMs(Integer.MAX_VALUE+1L);
}

@Test
public void testIsConcurrentPhase()
{
assertTrue("No GC cause should be considered concurrent",
GCInspector.isConcurrentPhase("No GC", "SomeGCName"));
assertTrue("Shenandoah Cycles should be considered concurrent",
GCInspector.isConcurrentPhase("SomeCause", "Shenandoah Cycles"));
assertTrue("ZGC Cycles should be considered concurrent",
GCInspector.isConcurrentPhase("SomeCause", "ZGC Cycles"));

assertFalse("ParallelGC should not be considered concurrent",
GCInspector.isConcurrentPhase("Allocation Failure", "PS Scavenge"));
assertFalse("G1 Young Generation should not be considered concurrent",
GCInspector.isConcurrentPhase("G1 Evacuation Pause", "G1 Young Generation"));
}
}