Skip to content

Commit 6dc58bd

Browse files
committed
Bug 37846546 - [36898832->14.1.2.0.3] RFA: Proxy leaking processedEvents causing heap stress (14.1.2.0 cl 115728 --> 14.1.2.0 CE)
[git-p4: depot-paths = "//dev/coherence-ce/release/coherence-ce-v14.1.2.0/": change = 115761]
1 parent cb8437c commit 6dc58bd

File tree

2 files changed

+46
-22
lines changed
  • prj/coherence-core-components/src/main/java/com/tangosol/coherence/component

2 files changed

+46
-22
lines changed

prj/coherence-core-components/src/main/java/com/tangosol/coherence/component/net/Message.java

+15-5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11

22
/*
3-
* Copyright (c) 2000, 2024, Oracle and/or its affiliates.
3+
* Copyright (c) 2000, 2025, Oracle and/or its affiliates.
44
*
55
* Licensed under the Universal Permissive License v 1.0 as shown at
66
* https://oss.oracle.com/licenses/upl.
@@ -87,7 +87,14 @@ public class Message
8787
* MessageHandler.sendMulti
8888
*/
8989
private transient int __m_BufferUsageCounter;
90-
90+
91+
/**
92+
* Property fDelivered
93+
*
94+
* True iff releaseOutgoing determined that the message has been delivered to all recipients.
95+
*/
96+
private transient volatile boolean m_fDelivered = false;
97+
9198
/**
9299
* Property DeserializationRequired
93100
*
@@ -141,7 +148,7 @@ public class Message
141148
* Messages generated by the Cluster Service (service 0).
142149
*/
143150
private int __m_MessageType;
144-
151+
145152
/**
146153
* Property NotifyDelivery
147154
*
@@ -788,7 +795,7 @@ public com.tangosol.internal.tracing.SpanContext getTracingSpanContext()
788795
public boolean isDelivered()
789796
{
790797
// see releaseOutgoing
791-
return getBufferController() == null;
798+
return m_fDelivered;
792799
}
793800

794801
// Accessor for the property "DeserializationRequired"
@@ -1250,12 +1257,13 @@ public void releaseOutgoing(boolean fSuspect, boolean fOrdered)
12501257
setBufferUsageCounter(cUsage);
12511258
}
12521259
}
1253-
1260+
12541261
if ((cUsage & ~BUFFER_COUNT_ORDER_BIT) == 0)
12551262
{
12561263
// all receipts are in
12571264
controller.dispose();
12581265
setBufferController(null);
1266+
m_fDelivered = true;
12591267

12601268
if (cUsage == 0) // not marked as unordered
12611269
{
@@ -1276,6 +1284,7 @@ public void releaseOutgoingComplete()
12761284
if (getBufferUsageCounter() == BUFFER_COUNT_ORDER_BIT) // dirty read; no users left; common case
12771285
{
12781286
setBufferUsageCounter(0);
1287+
m_fDelivered = true;
12791288
notifyDelivery();
12801289
}
12811290
else // potentially some users left; may be concurrently releasing
@@ -1291,6 +1300,7 @@ public void releaseOutgoingComplete()
12911300

12921301
if (cUsage == BUFFER_COUNT_ORDER_BIT) // no users left
12931302
{
1303+
m_fDelivered = true;
12941304
notifyDelivery();
12951305
}
12961306
// else; still users left; they will notify if needed during release

prj/coherence-core-components/src/main/java/com/tangosol/coherence/component/util/daemon/queueProcessor/service/grid/partitionedService/PartitionedCache.java

+31-17
Original file line numberDiff line numberDiff line change
@@ -5117,12 +5117,21 @@ protected void onInterval()
51175117

51185118
getRequestCoordinator().onInterval();
51195119

5120-
// on some environments the "NotifyDelivery" ack may become lost or be
5121-
// held up (e.g.: hundreds of ViewCache clients with heavy writes)
5122-
// the following attempts to get stuck events "unstuck" by re-sending
5123-
// the oldest and hopefully cause a chain of events to clear the
5124-
// pending events long array which can become clogged on backup members
5125-
// due to said clean-up being solely based on oldest ack'ed event.
5120+
// event sending may miss one or more events due to concurrency on
5121+
// entry status processing, see ResourceCoordinator.processEvent.
5122+
// Synthetic events are processed in finalizeInvoke*, which in turn
5123+
// processes events through ResourceCoordinator.processEvent one by
5124+
// one. If the event is not "managed" (== synthetic), it is then passed
5125+
// back to the caller as an OOB event to be sent to the client.
5126+
// If the event is "managed", it is assumed that the caller is
5127+
// responsible for sending the event.
5128+
// There is a narrow race condition where an entry status can be locked
5129+
// (setting isManaged() to true) while synthetic events are being
5130+
// processed, for example when entries are evicted.
5131+
// In this case the path is that of synthetic events, but the event
5132+
// is not returned into the OOB set.
5133+
// This is caught below when we see the oldest event staying too long
5134+
// in the pending events LongArray structure.
51265135
LongArray laPending = getPendingEvents();
51275136
if (laPending == null) // used in getOldestPendingEventSUID
51285137
{
@@ -5131,7 +5140,6 @@ protected void onInterval()
51315140

51325141
long ldtNow = Base.getSafeTimeMillis();
51335142
long lOldestSUID = getOldestPendingEventSUID();
5134-
51355143
if (ldtNow > getOldestEventResendNextMillis())
51365144
{
51375145
if (lOldestSUID > 0 &&
@@ -8690,7 +8698,7 @@ protected void postEvent(PartitionedCache.MapEvent msgEvent)
86908698
{
86918699
return;
86928700
}
8693-
8701+
86948702
post(msgEvent);
86958703
}
86968704

@@ -9566,7 +9574,7 @@ protected void publishChanges(com.tangosol.coherence.component.net.RequestContex
95669574

95679575
// remember the events-holder
95689576
oHolder = status.getMapEventHolder();
9569-
9577+
95709578
// increment the partition version
95719579
int nPartition = status.getPartition();
95729580
PartitionedCache.PartitionControl ctrlPart = (PartitionedCache.PartitionControl) getPartitionControl(nPartition);
@@ -9965,8 +9973,11 @@ protected void publishChanges(com.tangosol.coherence.component.net.RequestContex
99659973
//
99669974
// The batch context data structure is used for "synchronization" and both
99679975
// conditions are checked by $BatchContext#onJobCompleted
9968-
9969-
ctxBatch.onJobCompleted(job); // may be null
9976+
9977+
synchronized (ctxBatch)
9978+
{
9979+
ctxBatch.onJobCompleted(job); // may be null
9980+
}
99709981

99719982
// Now, only after backup messages/events have been queued/sent, unlock the
99729983
// keys that were locked during #processEvent
@@ -17604,7 +17615,7 @@ protected boolean tryBatchCompletion()
1760417615

1760517616
// all jobs have run, and all backups have completed; respond to the client
1760617617
service.publishToClients(getPrimaryResponse(), getEvents());
17607-
17618+
1760817619
return true;
1760917620
}
1761017621
}
@@ -29725,7 +29736,7 @@ public void onDelivery()
2972529736
public void onReceived()
2972629737
{
2972729738
super.onReceived();
29728-
29739+
2972929740
((PartitionedCache) getService()).onMapEvent(this);
2973029741
}
2973129742
}
@@ -38046,7 +38057,7 @@ protected void checkResourceDeadlock(Object oContender)
3804638057
}
3804738058

3804838059
/**
38049-
* Specialized helper method to collect aynchronously observed
38060+
* Specialized helper method to collect asynchronously observed
3805038061
* $EventStatus objects.
3805138062
*
3805238063
* @param setStatus the Set<$EntryStatus> of asynchronously observed
@@ -38731,7 +38742,7 @@ else if (!service.isPrimaryOwner(status.getPartition()))
3873138742
// else this is a troubling case; process the com.tangosol.util.MapEvent updating ancillary
3873238743
// data structures
3873338744
}
38734-
38745+
3873538746
// event could be null if this is a "synthetic" event holder used
3873638747
// to force a flush of the backup & client event changes
3873738748
if (event != null)
@@ -38929,7 +38940,7 @@ else if (fSynthetic)
3892938940
// 3) the status is managed by another thread
3893038941
//
3893138942
// If the status is unmanaged, we should try to lock the key and "take"
38932-
// ownership of the EntryStatus (and become reponsible for publishing
38943+
// ownership of the EntryStatus (and become responsible for publishing
3893338944
// backup and client-event changes). Note: we must not be overly aggressive
3893438945
// in attempting to lock due to the possibility of deadlock (see COH-5436).
3893538946
//
@@ -38978,7 +38989,10 @@ else if (fSynthetic)
3897838989
}
3897938990

3898038991
// return true iff the status has been locked by this method
38981-
// and therefore needs to be added to the OOB status collection
38992+
// and therefore needs to be added to the OOB status collection.
38993+
// Caveat: if a "managed" event has grabbed the same lock it could
38994+
// lead to a "forgotten" event if the caller is the synthetic event
38995+
// path. See onInterval.
3898238996
return !fOOBEvent;
3898338997
}
3898438998

0 commit comments

Comments
 (0)