- 
                Notifications
    You must be signed in to change notification settings 
- Fork 3.8k
GCInspector should use different thresholds on GC events #4433
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: trunk
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -287,23 +287,53 @@ public void handleNotification(final Notification notification, final Object han | |
| if (state.compareAndSet(prev, new State(duration, bytes, prev))) | ||
| break; | ||
| } | ||
|  | ||
| if (getGcWarnThresholdInMs() != 0 && duration > getGcWarnThresholdInMs()) | ||
| logger.warn(sb.toString()); | ||
| else if (duration > getGcLogThresholdInMs()) | ||
| logger.info(sb.toString()); | ||
| else if (logger.isTraceEnabled()) | ||
| logger.trace(sb.toString()); | ||
|  | ||
| if (duration > this.getStatusThresholdInMs()) | ||
| StatusLogger.log(); | ||
| if (isConcurrentPhase(info.getGcCause(), info.getGcName())) | ||
| { | ||
| if (getGcConcurrentPhaseWarnThresholdInMs() != 0 && duration > getGcConcurrentPhaseWarnThresholdInMs()) | ||
| logger.warn(sb.toString()); | ||
| else if (duration > getGcConcurrentPhaseLogThresholdInMs()) | ||
| logger.info(sb.toString()); | ||
| else if (logger.isTraceEnabled()) | ||
| logger.trace(sb.toString()); | ||
| // TODO: trigger StatusLogger if concurrent phases take too long? | ||
| } | ||
| else | ||
| { | ||
| if (getGcWarnThresholdInMs() != 0 && duration > getGcWarnThresholdInMs()) | ||
| logger.warn(sb.toString()); | ||
| else if (duration > getGcLogThresholdInMs()) | ||
| logger.info(sb.toString()); | ||
| else if (logger.isTraceEnabled()) | ||
| logger.trace(sb.toString()); | ||
| if (duration > this.getStatusThresholdInMs()) | ||
| StatusLogger.log(); | ||
| } | ||
|  | ||
| // if we just finished an old gen collection and we're still using a lot of memory, try to reduce the pressure | ||
| if (gcState.assumeGCIsOldGen) | ||
| LifecycleTransaction.rescheduleFailedDeletions(); | ||
| } | ||
| } | ||
|  | ||
| static boolean isConcurrentPhase(String cause, String name) { | ||
| // Mostly taken from: https://github.com/Netflix/spectator/blob/v1.7.x/spectator-ext-gc/src/main/java/com/netflix/spectator/gc/GcLogger.java | ||
| // So far the only indicator known is that the cause will be reported as "No GC" | ||
| // when using CMS. | ||
| // | ||
| // For ZGC, behavior was changed in JDK17: https://bugs.openjdk.java.net/browse/JDK-8265136 | ||
| // For ZGC in older versions, there is no way to accurately get the amount of time | ||
| // in STW pauses. | ||
| // | ||
| // For G1, a new bean is added in JDK20 to indicate time spent in concurrent phases: | ||
| // https://bugs.openjdk.org/browse/JDK-8297247 | ||
|  | ||
| return "No GC".equals(cause) // CMS | ||
| || "G1 Concurrent GC".equals(name) // G1 in JDK20+ | ||
| || name.endsWith(" Cycles"); // Shenandoah, ZGC | ||
| } | ||
|  | ||
|  | ||
| public State getTotalSinceLastCheck() | ||
| { | ||
| return state.getAndSet(new State()); | ||
|  | @@ -407,6 +437,40 @@ public void setGcLogThresholdInMs(long threshold) | |
| DatabaseDescriptor.setGCLogThreshold((int) threshold); | ||
| } | ||
|  | ||
| public int getGcConcurrentPhaseWarnThresholdInMs() | ||
| { | ||
| return DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold(); | ||
| } | ||
|  | ||
| public void setGcConcurrentPhaseWarnThresholdInMs(int threshold) | ||
| { | ||
| long gcConcurrentPhaseLogThresholdInMs = getGcConcurrentPhaseLogThresholdInMs(); | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can move the resolution of this after the first "if". I that throws you have called this unnecessarily. You have already done it like that in setGcConcurrentPhaseLogThresholdInMs | ||
| if (threshold < 0) | ||
| throw new IllegalArgumentException("Threshold must be greater than or equal to 0"); | ||
| if (threshold != 0 && threshold <= gcConcurrentPhaseLogThresholdInMs) | ||
| throw new IllegalArgumentException("Threshold must be greater than gcConcurrentPhaseLogThresholdInMs which is currently " | ||
| + gcConcurrentPhaseLogThresholdInMs); | ||
| DatabaseDescriptor.setGCConcurrentPhaseWarnThreshold(threshold); | ||
| } | ||
|  | ||
| public int getGcConcurrentPhaseLogThresholdInMs() | ||
| { | ||
| return DatabaseDescriptor.getGCConcurrentPhaseLogThreshold(); | ||
| } | ||
|  | ||
| public void setGcConcurrentPhaseLogThresholdInMs(int threshold) | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I get that what you did here just follows the style which was already there and there is nothing wrong with it as such but I just wonder if we could not move the logic of this into DD's method and this one would just call it. I get that the methods need to be here because we are exposing them in MBean but the logic itself might be just moved to DD, no? Because if somebody every calls DD methods, there is no validation. But it is just here. So everybody has to go through this method to get the validation. If we moved it to DD then it would be validated every time, regardless from where it is called. DD currently also contains methods which have some basic validation around its parameters so ... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also as I understand it .... what happens when you put into yaml logically invalid values and you start the node? because the validation occurs only when you call mbean methods. If we want to be sure that what we started the node with makes sense already then it would need to be validated in DatabaseDesciptor.applySimpleConfig that is called as part of node startup and validated there so we are sure it is all valid already | ||
| { | ||
| if (threshold <= 0) | ||
| throw new IllegalArgumentException("Threshold must be greater than 0"); | ||
|  | ||
| long gcConcurrentPhaseWarnThresholdInMs = getGcConcurrentPhaseWarnThresholdInMs(); | ||
| if (gcConcurrentPhaseWarnThresholdInMs != 0 && threshold > gcConcurrentPhaseWarnThresholdInMs) | ||
| throw new IllegalArgumentException("Threshold must be less than gcConcurrentPhaseWarnThresholdInMs which is currently " | ||
| + gcConcurrentPhaseWarnThresholdInMs); | ||
|  | ||
| DatabaseDescriptor.setGCConcurrentPhaseLogThreshold(threshold); | ||
| } | ||
|  | ||
| public long getGcLogThresholdInMs() | ||
| { | ||
| return DatabaseDescriptor.getGCLogThreshold(); | ||
|  | ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -18,11 +18,16 @@ | |
| package org.apache.cassandra.service; | ||
|  | ||
| import org.apache.cassandra.config.DatabaseDescriptor; | ||
| import org.junit.Assert; | ||
| import org.junit.Before; | ||
| import org.junit.BeforeClass; | ||
| import org.junit.Test; | ||
|  | ||
| import static org.junit.Assert.assertEquals; | ||
| import static org.junit.Assert.assertFalse; | ||
| import static org.junit.Assert.assertTrue; | ||
| import static org.junit.Assert.fail; | ||
|  | ||
|  | ||
| public class GCInspectorTest | ||
| { | ||
|  | ||
|  | @@ -43,49 +48,98 @@ public void before() | |
| @Test | ||
| public void ensureStaticFieldsHydrateFromConfig() | ||
| { | ||
| Assert.assertEquals(DatabaseDescriptor.getGCLogThreshold(), gcInspector.getGcLogThresholdInMs()); | ||
| Assert.assertEquals(DatabaseDescriptor.getGCWarnThreshold(), gcInspector.getGcWarnThresholdInMs()); | ||
| assertEquals(DatabaseDescriptor.getGCLogThreshold(), gcInspector.getGcLogThresholdInMs()); | ||
| assertEquals(DatabaseDescriptor.getGCWarnThreshold(), gcInspector.getGcWarnThresholdInMs()); | ||
| assertEquals(DatabaseDescriptor.getGCConcurrentPhaseLogThreshold(), gcInspector.getGcConcurrentPhaseLogThresholdInMs()); | ||
| assertEquals(DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold(), gcInspector.getGcConcurrentPhaseWarnThresholdInMs()); | ||
| } | ||
|  | ||
| @Test | ||
| public void ensureStatusIsCalculated() | ||
| { | ||
| Assert.assertTrue(gcInspector.getStatusThresholdInMs() > 0); | ||
| assertTrue(gcInspector.getStatusThresholdInMs() > 0); | ||
| } | ||
|  | ||
| @Test(expected=IllegalArgumentException.class) | ||
| @Test | ||
| public void ensureWarnGreaterThanLog() | ||
| { | ||
| gcInspector.setGcWarnThresholdInMs(gcInspector.getGcLogThresholdInMs()); | ||
| try | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better style is to use Assertions.assertThrownBy | ||
| { | ||
| gcInspector.setGcWarnThresholdInMs(gcInspector.getGcLogThresholdInMs()); | ||
| fail("Expect IllegalArgumentException"); | ||
| } | ||
| catch (IllegalArgumentException e) | ||
| { | ||
| // expected | ||
| } | ||
| try | ||
| { | ||
| gcInspector.setGcConcurrentPhaseWarnThresholdInMs(gcInspector.getGcConcurrentPhaseLogThresholdInMs()); | ||
| fail("Expect IllegalArgumentException"); | ||
| } | ||
| catch (IllegalArgumentException e) | ||
| { | ||
| // expected | ||
| } | ||
| } | ||
|  | ||
| @Test | ||
| public void ensureZeroIsOk() | ||
| { | ||
| gcInspector.setGcWarnThresholdInMs(0); | ||
| Assert.assertEquals(gcInspector.getStatusThresholdInMs(), gcInspector.getGcLogThresholdInMs()); | ||
| Assert.assertEquals(0, DatabaseDescriptor.getGCWarnThreshold()); | ||
| Assert.assertEquals(200, DatabaseDescriptor.getGCLogThreshold()); | ||
| assertEquals(gcInspector.getStatusThresholdInMs(), gcInspector.getGcLogThresholdInMs()); | ||
| assertEquals(0, DatabaseDescriptor.getGCWarnThreshold()); | ||
| assertEquals(200, DatabaseDescriptor.getGCLogThreshold()); | ||
|  | ||
| gcInspector.setGcConcurrentPhaseWarnThresholdInMs(0); | ||
| assertEquals(0, DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold()); | ||
| assertEquals(1000, DatabaseDescriptor.getGCConcurrentPhaseLogThreshold()); | ||
| } | ||
|  | ||
| @Test(expected=IllegalArgumentException.class) | ||
| @Test | ||
| public void ensureLogLessThanWarn() | ||
| { | ||
| Assert.assertEquals(200, gcInspector.getGcLogThresholdInMs()); | ||
| assertEquals(200, gcInspector.getGcLogThresholdInMs()); | ||
| gcInspector.setGcWarnThresholdInMs(1000); | ||
| Assert.assertEquals(1000, gcInspector.getGcWarnThresholdInMs()); | ||
| gcInspector.setGcLogThresholdInMs(gcInspector.getGcWarnThresholdInMs() + 1); | ||
| assertEquals(1000, gcInspector.getGcWarnThresholdInMs()); | ||
| try | ||
| { | ||
| gcInspector.setGcLogThresholdInMs(gcInspector.getGcWarnThresholdInMs() + 1); | ||
| fail("Expect IllegalArgumentException"); | ||
| } | ||
| catch (IllegalArgumentException e) | ||
| { | ||
| // expected | ||
| } | ||
| assertEquals(1000, gcInspector.getGcConcurrentPhaseLogThresholdInMs()); | ||
| gcInspector.setGcConcurrentPhaseWarnThresholdInMs(2000); | ||
| assertEquals(2000, gcInspector.getGcConcurrentPhaseWarnThresholdInMs()); | ||
| try | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Assertions.assertThrownBy | ||
| { | ||
| gcInspector.setGcConcurrentPhaseLogThresholdInMs(gcInspector.getGcConcurrentPhaseWarnThresholdInMs() + 1); | ||
| fail("Expect IllegalArgumentException"); | ||
| } | ||
| catch (IllegalArgumentException e) | ||
| { | ||
| // expected | ||
| } | ||
| } | ||
|  | ||
| @Test | ||
| public void testDefaults() | ||
| { | ||
| gcInspector.setGcLogThresholdInMs(200); | ||
| gcInspector.setGcWarnThresholdInMs(1000); | ||
| Assert.assertEquals(200, DatabaseDescriptor.getGCLogThreshold()); | ||
| Assert.assertEquals(200, gcInspector.getGcLogThresholdInMs()); | ||
| Assert.assertEquals(1000, DatabaseDescriptor.getGCWarnThreshold()); | ||
| Assert.assertEquals(1000, gcInspector.getGcWarnThresholdInMs()); | ||
| gcInspector.setGcConcurrentPhaseLogThresholdInMs(1000); | ||
| gcInspector.setGcConcurrentPhaseWarnThresholdInMs(2000); | ||
| assertEquals(200, DatabaseDescriptor.getGCLogThreshold()); | ||
| assertEquals(200, gcInspector.getGcLogThresholdInMs()); | ||
| assertEquals(1000, DatabaseDescriptor.getGCWarnThreshold()); | ||
| assertEquals(1000, gcInspector.getGcWarnThresholdInMs()); | ||
| assertEquals(1000, DatabaseDescriptor.getGCConcurrentPhaseLogThreshold()); | ||
| assertEquals(1000, gcInspector.getGcConcurrentPhaseLogThresholdInMs()); | ||
| assertEquals(2000, DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold()); | ||
| assertEquals(2000, gcInspector.getGcConcurrentPhaseWarnThresholdInMs()); | ||
| } | ||
|  | ||
| @Test(expected=IllegalArgumentException.class) | ||
|  | @@ -94,4 +148,20 @@ public void testMaxValue() | |
| gcInspector.setGcLogThresholdInMs(200); | ||
| gcInspector.setGcWarnThresholdInMs(Integer.MAX_VALUE+1L); | ||
| } | ||
|  | ||
| @Test | ||
| public void testIsConcurrentPhase() | ||
| { | ||
| assertTrue("No GC cause should be considered concurrent", | ||
| GCInspector.isConcurrentPhase("No GC", "SomeGCName")); | ||
| assertTrue("Shenandoah Cycles should be considered concurrent", | ||
| GCInspector.isConcurrentPhase("SomeCause", "Shenandoah Cycles")); | ||
| assertTrue("ZGC Cycles should be considered concurrent", | ||
| GCInspector.isConcurrentPhase("SomeCause", "ZGC Cycles")); | ||
|  | ||
| assertFalse("ParallelGC should not be considered concurrent", | ||
| GCInspector.isConcurrentPhase("Allocation Failure", "PS Scavenge")); | ||
| assertFalse("G1 Young Generation should not be considered concurrent", | ||
| GCInspector.isConcurrentPhase("G1 Evacuation Pause", "G1 Young Generation")); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
regarding the TODO, yes, I think it makes sense to do a symmetric logic: too long GC can steal some computation resources from request processing and it maybe helpful to flush stats to see a possible impact.
Note: the threshold for status logging should be correspondent (based on concurrent phase log thresholds)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should be this comment removed as I see this patch contains StatusLogger.log already?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we have it for non-concurrent branch only, so it is actual for the current branch with a concurrent threshold logic