Skip to content

Commit d57e89f

Browse files
committed
GCInspector should use different thresholds on GC events
1 parent 547307f commit d57e89f

File tree

8 files changed

+220
-26
lines changed

8 files changed

+220
-26
lines changed

CHANGES.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
5.1
2+
* GCInspector should use different thresholds on GC events for warning messages (CASSANDRA-20980)
23
* Add cqlsh autocompletion for the identity mapping feature (CASSANDRA-20021)
34
* Add DDL Guardrail enabling administrators to disallow creation/modification of keyspaces with durable_writes = false (CASSANDRA-20913)
45
* Optimize Counter, Meter and Histogram metrics using thread local counters (CASSANDRA-20250)

conf/cassandra.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2048,6 +2048,11 @@ batch_size_fail_threshold: 50KiB
20482048
# Log WARN on any batches not of type LOGGED than span across more partitions than this limit
20492049
unlogged_batch_across_partitions_warn_threshold: 10
20502050

2051+
# GCInspector configs:
2052+
# For GC like ShenandoahGC/ZGC etc., there are GC events that do not have STW pauses (Concurrent phases)
2053+
# Operator might find it reasonable to use lower thresholds for events require STW pauses and higher thresholds
2054+
# for concurrent phases.
2055+
#
20512056
# GC Pauses greater than 200 ms will be logged at INFO level
20522057
# This threshold can be adjusted to minimize logging if necessary
20532058
# Min unit: ms
@@ -2059,6 +2064,17 @@ unlogged_batch_across_partitions_warn_threshold: 10
20592064
# Min unit: ms
20602065
# gc_warn_threshold: 1000ms
20612066

2067+
# GC Concurrent phase greater than 200 ms will be logged at INFO level
2068+
# This threshold can be adjusted to minimize logging if necessary
2069+
# Min unit: ms
2070+
# gc_concurrent_phase_log_threshold: 1000ms
2071+
2072+
# GC Concurrent phase than gc_concurrent_phase_warn_threshold will be logged at WARN level
2073+
# Adjust the threshold based on your application throughput requirement. Setting to 0
2074+
# will deactivate the feature.
2075+
# Min unit: ms
2076+
# gc_concurrent_phase_warn_threshold: 2000ms
2077+
20622078
# Maximum size of any value in SSTables. Safety measure to detect SSTable corruption
20632079
# early. Any value size larger than this threshold will result into marking an SSTable
20642080
# as corrupted. This should be positive and less than 2GiB.

conf/cassandra_latest.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1913,6 +1913,11 @@ batch_size_fail_threshold: 50KiB
19131913
# Log WARN on any batches not of type LOGGED than span across more partitions than this limit
19141914
unlogged_batch_across_partitions_warn_threshold: 10
19151915

1916+
# GCInspector configs:
1917+
# For GC like ShenandoahGC/ZGC etc., there are GC events that do not have STW pauses.
1918+
# Such events are called Concurrent phases. Operator might find it reasonable to use lower thresholds
1919+
# for events require STW pauses and higher thresholds for concurrent phases.
1920+
#
19161921
# GC Pauses greater than 200 ms will be logged at INFO level
19171922
# This threshold can be adjusted to minimize logging if necessary
19181923
# Min unit: ms
@@ -1924,6 +1929,17 @@ unlogged_batch_across_partitions_warn_threshold: 10
19241929
# Min unit: ms
19251930
# gc_warn_threshold: 1000ms
19261931

1932+
# GC Concurrent phase greater than 200 ms will be logged at INFO level
1933+
# This threshold can be adjusted to minimize logging if necessary
1934+
# Min unit: ms
1935+
# gc_concurrent_phase_log_threshold: 1000ms
1936+
1937+
# GC Concurrent phase than gc_concurrent_phase_warn_threshold will be logged at WARN level
1938+
# Adjust the threshold based on your application throughput requirement. Setting to 0
1939+
# will deactivate the feature.
1940+
# Min unit: ms
1941+
# gc_concurrent_phase_warn_threshold: 2000ms
1942+
19271943
# Maximum size of any value in SSTables. Safety measure to detect SSTable corruption
19281944
# early. Any value size larger than this threshold will result into marking an SSTable
19291945
# as corrupted. This should be positive and less than 2GiB.

src/java/org/apache/cassandra/config/Config.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -582,6 +582,8 @@ public static class SSTableConfig
582582
public volatile DurationSpec.IntMillisecondsBound gc_log_threshold = new DurationSpec.IntMillisecondsBound("200ms");
583583
@Replaces(oldName = "gc_warn_threshold_in_ms", converter = Converters.MILLIS_DURATION_INT, deprecated = true)
584584
public volatile DurationSpec.IntMillisecondsBound gc_warn_threshold = new DurationSpec.IntMillisecondsBound("1s");
585+
public volatile DurationSpec.IntMillisecondsBound gc_concurrent_phase_log_threshold = new DurationSpec.IntMillisecondsBound("200ms");
586+
public volatile DurationSpec.IntMillisecondsBound gc_concurrent_phase_warn_threshold = new DurationSpec.IntMillisecondsBound("1s");
585587

586588
// TTL for different types of trace events.
587589
@Replaces(oldName = "tracetype_query_ttl", converter = Converters.SECONDS_DURATION, deprecated=true)

src/java/org/apache/cassandra/config/DatabaseDescriptor.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4698,6 +4698,27 @@ public static void setGCWarnThreshold(int threshold)
46984698
conf.gc_warn_threshold = new DurationSpec.IntMillisecondsBound(threshold);
46994699
}
47004700

4701+
public static int getGCConcurrentPhaseLogThreshold()
4702+
{
4703+
return conf.gc_concurrent_phase_log_threshold.toMilliseconds();
4704+
}
4705+
4706+
public static void setGCConcurrentPhaseLogThreshold(int threshold)
4707+
{
4708+
conf.gc_concurrent_phase_log_threshold = new DurationSpec.IntMillisecondsBound(threshold);
4709+
}
4710+
4711+
4712+
public static int getGCConcurrentPhaseWarnThreshold()
4713+
{
4714+
return conf.gc_concurrent_phase_warn_threshold.toMilliseconds();
4715+
}
4716+
4717+
public static void setGCConcurrentPhaseWarnThreshold(int threshold)
4718+
{
4719+
conf.gc_concurrent_phase_warn_threshold = new DurationSpec.IntMillisecondsBound(threshold);
4720+
}
4721+
47014722
public static boolean isCDCEnabled()
47024723
{
47034724
return conf.cdc_enabled;

src/java/org/apache/cassandra/service/GCInspector.java

Lines changed: 73 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -287,23 +287,53 @@ public void handleNotification(final Notification notification, final Object han
287287
if (state.compareAndSet(prev, new State(duration, bytes, prev)))
288288
break;
289289
}
290-
291-
if (getGcWarnThresholdInMs() != 0 && duration > getGcWarnThresholdInMs())
292-
logger.warn(sb.toString());
293-
else if (duration > getGcLogThresholdInMs())
294-
logger.info(sb.toString());
295-
else if (logger.isTraceEnabled())
296-
logger.trace(sb.toString());
297290

298-
if (duration > this.getStatusThresholdInMs())
299-
StatusLogger.log();
291+
if (isConcurrentPhase(info.getGcCause(), info.getGcName()))
292+
{
293+
if (getGcWarnThresholdInMs() != 0 && duration > getGcWarnThresholdInMs())
294+
logger.warn(sb.toString());
295+
else if (duration > getGcLogThresholdInMs())
296+
logger.info(sb.toString());
297+
else if (logger.isTraceEnabled())
298+
logger.trace(sb.toString());
299+
// TODO: trigger StatusLogger if concurrent phases take too long?
300+
}
301+
else
302+
{
303+
if (getGcConcurrentPhaseWarnThresholdInMs() != 0 && duration > getGcConcurrentPhaseWarnThresholdInMs())
304+
logger.warn(sb.toString());
305+
else if (duration > getGcConcurrentPhaseLogThresholdInMs())
306+
logger.info(sb.toString());
307+
else if (logger.isTraceEnabled())
308+
logger.trace(sb.toString());
309+
if (duration > this.getStatusThresholdInMs())
310+
StatusLogger.log();
311+
}
300312

301313
// if we just finished an old gen collection and we're still using a lot of memory, try to reduce the pressure
302314
if (gcState.assumeGCIsOldGen)
303315
LifecycleTransaction.rescheduleFailedDeletions();
304316
}
305317
}
306318

319+
static boolean isConcurrentPhase(String cause, String name) {
320+
// Mostly taken from: https://github.com/Netflix/spectator/blob/v1.7.x/spectator-ext-gc/src/main/java/com/netflix/spectator/gc/GcLogger.java
321+
// So far the only indicator known is that the cause will be reported as "No GC"
322+
// when using CMS.
323+
//
324+
// For ZGC, behavior was changed in JDK17: https://bugs.openjdk.java.net/browse/JDK-8265136
325+
// For ZGC in older versions, there is no way to accurately get the amount of time
326+
// in STW pauses.
327+
//
328+
// For G1, a new bean is added in JDK20 to indicate time spent in concurrent phases:
329+
// https://bugs.openjdk.org/browse/JDK-8297247
330+
331+
return "No GC".equals(cause) // CMS
332+
|| "G1 Concurrent GC".equals(name) // G1 in JDK20+
333+
|| name.endsWith(" Cycles"); // Shenandoah, ZGC
334+
}
335+
336+
307337
public State getTotalSinceLastCheck()
308338
{
309339
return state.getAndSet(new State());
@@ -407,6 +437,40 @@ public void setGcLogThresholdInMs(long threshold)
407437
DatabaseDescriptor.setGCLogThreshold((int) threshold);
408438
}
409439

440+
public int getGcConcurrentPhaseWarnThresholdInMs()
441+
{
442+
return DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold();
443+
}
444+
445+
public void setGcConcurrentPhaseWarnThresholdInMs(int threshold)
446+
{
447+
long gcConcurrentPhaseLogThresholdInMs = getGcConcurrentPhaseLogThresholdInMs();
448+
if (threshold < 0)
449+
throw new IllegalArgumentException("Threshold must be greater than or equal to 0");
450+
if (threshold != 0 && threshold <= gcConcurrentPhaseLogThresholdInMs)
451+
throw new IllegalArgumentException("Threshold must be greater than gcConcurrentPhaseLogThresholdInMs which is currently "
452+
+ gcConcurrentPhaseLogThresholdInMs);
453+
DatabaseDescriptor.setGCConcurrentPhaseWarnThreshold(threshold);
454+
}
455+
456+
public int getGcConcurrentPhaseLogThresholdInMs()
457+
{
458+
return DatabaseDescriptor.getGCConcurrentPhaseLogThreshold();
459+
}
460+
461+
public void setGcConcurrentPhaseLogThresholdInMs(int threshold)
462+
{
463+
if (threshold <= 0)
464+
throw new IllegalArgumentException("Threshold must be greater than 0");
465+
466+
long gcConcurrentPhaseWarnThresholdInMs = getGcConcurrentPhaseWarnThresholdInMs();
467+
if (gcConcurrentPhaseWarnThresholdInMs != 0 && threshold > gcConcurrentPhaseWarnThresholdInMs)
468+
throw new IllegalArgumentException("Threshold must be less than gcConcurrentPhaseWarnThresholdInMs which is currently "
469+
+ gcConcurrentPhaseWarnThresholdInMs);
470+
471+
DatabaseDescriptor.setGCConcurrentPhaseLogThreshold(threshold);
472+
}
473+
410474
public long getGcLogThresholdInMs()
411475
{
412476
return DatabaseDescriptor.getGCLogThreshold();

src/java/org/apache/cassandra/service/GCInspectorMXBean.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ public interface GCInspectorMXBean
2727
void setGcWarnThresholdInMs(long threshold);
2828
long getGcWarnThresholdInMs();
2929
void setGcLogThresholdInMs(long threshold);
30+
int getGcConcurrentPhaseLogThresholdInMs();
31+
void setGcConcurrentPhaseWarnThresholdInMs(int threshold);
32+
int getGcConcurrentPhaseWarnThresholdInMs();
33+
void setGcConcurrentPhaseLogThresholdInMs(int threshold);
3034
long getGcLogThresholdInMs();
3135
long getStatusThresholdInMs();
3236
}

test/unit/org/apache/cassandra/service/GCInspectorTest.java

Lines changed: 87 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,16 @@
1818
package org.apache.cassandra.service;
1919

2020
import org.apache.cassandra.config.DatabaseDescriptor;
21-
import org.junit.Assert;
2221
import org.junit.Before;
2322
import org.junit.BeforeClass;
2423
import org.junit.Test;
2524

25+
import static org.junit.Assert.assertEquals;
26+
import static org.junit.Assert.assertFalse;
27+
import static org.junit.Assert.assertTrue;
28+
import static org.junit.Assert.fail;
29+
30+
2631
public class GCInspectorTest
2732
{
2833

@@ -43,49 +48,98 @@ public void before()
4348
@Test
4449
public void ensureStaticFieldsHydrateFromConfig()
4550
{
46-
Assert.assertEquals(DatabaseDescriptor.getGCLogThreshold(), gcInspector.getGcLogThresholdInMs());
47-
Assert.assertEquals(DatabaseDescriptor.getGCWarnThreshold(), gcInspector.getGcWarnThresholdInMs());
51+
assertEquals(DatabaseDescriptor.getGCLogThreshold(), gcInspector.getGcLogThresholdInMs());
52+
assertEquals(DatabaseDescriptor.getGCWarnThreshold(), gcInspector.getGcWarnThresholdInMs());
53+
assertEquals(DatabaseDescriptor.getGCConcurrentPhaseLogThreshold(), gcInspector.getGcConcurrentPhaseLogThresholdInMs());
54+
assertEquals(DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold(), gcInspector.getGcConcurrentPhaseWarnThresholdInMs());
4855
}
4956

5057
@Test
5158
public void ensureStatusIsCalculated()
5259
{
53-
Assert.assertTrue(gcInspector.getStatusThresholdInMs() > 0);
60+
assertTrue(gcInspector.getStatusThresholdInMs() > 0);
5461
}
5562

56-
@Test(expected=IllegalArgumentException.class)
63+
@Test
5764
public void ensureWarnGreaterThanLog()
5865
{
59-
gcInspector.setGcWarnThresholdInMs(gcInspector.getGcLogThresholdInMs());
66+
try
67+
{
68+
gcInspector.setGcWarnThresholdInMs(gcInspector.getGcLogThresholdInMs());
69+
fail("Expect IllegalArgumentException");
70+
}
71+
catch (IllegalArgumentException e)
72+
{
73+
// expected
74+
}
75+
try
76+
{
77+
gcInspector.setGcConcurrentPhaseWarnThresholdInMs(gcInspector.getGcConcurrentPhaseLogThresholdInMs());
78+
fail("Expect IllegalArgumentException");
79+
}
80+
catch (IllegalArgumentException e)
81+
{
82+
// expected
83+
}
6084
}
6185

6286
@Test
6387
public void ensureZeroIsOk()
6488
{
6589
gcInspector.setGcWarnThresholdInMs(0);
66-
Assert.assertEquals(gcInspector.getStatusThresholdInMs(), gcInspector.getGcLogThresholdInMs());
67-
Assert.assertEquals(0, DatabaseDescriptor.getGCWarnThreshold());
68-
Assert.assertEquals(200, DatabaseDescriptor.getGCLogThreshold());
90+
assertEquals(gcInspector.getStatusThresholdInMs(), gcInspector.getGcLogThresholdInMs());
91+
assertEquals(0, DatabaseDescriptor.getGCWarnThreshold());
92+
assertEquals(200, DatabaseDescriptor.getGCLogThreshold());
93+
gcInspector.setGcConcurrentPhaseWarnThresholdInMs(0);
94+
assertEquals(gcInspector.getGcConcurrentPhaseLogThresholdInMs(), gcInspector.getGcConcurrentPhaseWarnThresholdInMs());
95+
assertEquals(0, DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold());
96+
assertEquals(1000, DatabaseDescriptor.getGCConcurrentPhaseLogThreshold());
6997
}
7098

71-
@Test(expected=IllegalArgumentException.class)
99+
@Test
72100
public void ensureLogLessThanWarn()
73101
{
74-
Assert.assertEquals(200, gcInspector.getGcLogThresholdInMs());
102+
assertEquals(200, gcInspector.getGcLogThresholdInMs());
75103
gcInspector.setGcWarnThresholdInMs(1000);
76-
Assert.assertEquals(1000, gcInspector.getGcWarnThresholdInMs());
77-
gcInspector.setGcLogThresholdInMs(gcInspector.getGcWarnThresholdInMs() + 1);
104+
assertEquals(1000, gcInspector.getGcWarnThresholdInMs());
105+
try
106+
{
107+
gcInspector.setGcLogThresholdInMs(gcInspector.getGcWarnThresholdInMs() + 1);
108+
fail("Expect IllegalArgumentException");
109+
}
110+
catch (IllegalArgumentException e)
111+
{
112+
// expected
113+
}
114+
assertEquals(1000, gcInspector.getGcLogThresholdInMs());
115+
gcInspector.setGcConcurrentPhaseWarnThresholdInMs(2000);
116+
assertEquals(2000, gcInspector.getGcConcurrentPhaseWarnThresholdInMs());
117+
try
118+
{
119+
gcInspector.setGcConcurrentPhaseLogThresholdInMs(gcInspector.getGcConcurrentPhaseWarnThresholdInMs() + 1);
120+
fail("Expect IllegalArgumentException");
121+
}
122+
catch (IllegalArgumentException e)
123+
{
124+
// expected
125+
}
78126
}
79127

80128
@Test
81129
public void testDefaults()
82130
{
83131
gcInspector.setGcLogThresholdInMs(200);
84132
gcInspector.setGcWarnThresholdInMs(1000);
85-
Assert.assertEquals(200, DatabaseDescriptor.getGCLogThreshold());
86-
Assert.assertEquals(200, gcInspector.getGcLogThresholdInMs());
87-
Assert.assertEquals(1000, DatabaseDescriptor.getGCWarnThreshold());
88-
Assert.assertEquals(1000, gcInspector.getGcWarnThresholdInMs());
133+
gcInspector.setGcConcurrentPhaseLogThresholdInMs(1000);
134+
gcInspector.setGcConcurrentPhaseWarnThresholdInMs(2000);
135+
assertEquals(200, DatabaseDescriptor.getGCLogThreshold());
136+
assertEquals(200, gcInspector.getGcLogThresholdInMs());
137+
assertEquals(1000, DatabaseDescriptor.getGCWarnThreshold());
138+
assertEquals(1000, gcInspector.getGcWarnThresholdInMs());
139+
assertEquals(1000, DatabaseDescriptor.getGCConcurrentPhaseLogThreshold());
140+
assertEquals(1000, gcInspector.getGcConcurrentPhaseLogThresholdInMs());
141+
assertEquals(2000, DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold());
142+
assertEquals(2000, gcInspector.getGcConcurrentPhaseWarnThresholdInMs());
89143
}
90144

91145
@Test(expected=IllegalArgumentException.class)
@@ -94,4 +148,20 @@ public void testMaxValue()
94148
gcInspector.setGcLogThresholdInMs(200);
95149
gcInspector.setGcWarnThresholdInMs(Integer.MAX_VALUE+1L);
96150
}
151+
152+
@Test
153+
public void testIsConcurrentPhase()
154+
{
155+
assertTrue("No GC cause should be considered concurrent",
156+
GCInspector.isConcurrentPhase("No GC", "SomeGCName"));
157+
assertTrue("Shenandoah Cycles should be considered concurrent",
158+
GCInspector.isConcurrentPhase("SomeCause", "Shenandoah Cycles"));
159+
assertTrue("ZGC Cycles should be considered concurrent",
160+
GCInspector.isConcurrentPhase("SomeCause", "ZGC Cycles"));
161+
162+
assertFalse("ParallelGC should not be considered concurrent",
163+
GCInspector.isConcurrentPhase("Allocation Failure", "PS Scavenge"));
164+
assertFalse("G1 Young Generation should not be considered concurrent",
165+
GCInspector.isConcurrentPhase("G1 Evacuation Pause", "G1 Young Generation"));
166+
}
97167
}

0 commit comments

Comments
 (0)