Skip to content

Commit 704ea17

Browse files
committed
Bug 37846548 - [36898832->22.06.13] RFA: Proxy leaking processedEvents causing heap stress
[git-p4: depot-paths = "//dev/coherence-ce/release/coherence-ce-v22.06/": change = 115733]
1 parent 8fdfb30 commit 704ea17

File tree

2 files changed

+46
-22
lines changed
  • prj/coherence-core-components/src/main/java/com/tangosol/coherence/component

2 files changed

+46
-22
lines changed

prj/coherence-core-components/src/main/java/com/tangosol/coherence/component/net/Message.java

+15-5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11

22
/*
3-
* Copyright (c) 2000, 2024, Oracle and/or its affiliates.
3+
* Copyright (c) 2000, 2025, Oracle and/or its affiliates.
44
*
55
* Licensed under the Universal Permissive License v 1.0 as shown at
66
* https://oss.oracle.com/licenses/upl.
@@ -87,7 +87,14 @@ public class Message
8787
* MessageHandler.sendMulti
8888
*/
8989
private transient int __m_BufferUsageCounter;
90-
90+
91+
/**
92+
* Property fDelivered
93+
*
94+
* Only true iff releaseOutgoing determined that message has been delivered to all recipients.
95+
*/
96+
private transient volatile boolean m_fDelivered = false;
97+
9198
/**
9299
* Property DeserializationRequired
93100
*
@@ -141,7 +148,7 @@ public class Message
141148
* Messages generated by the Cluster Service (service 0).
142149
*/
143150
private int __m_MessageType;
144-
151+
145152
/**
146153
* Property NotifyDelivery
147154
*
@@ -788,7 +795,7 @@ public com.tangosol.internal.tracing.SpanContext getTracingSpanContext()
788795
public boolean isDelivered()
789796
{
790797
// see releaseOutgoing
791-
return getBufferController() == null;
798+
return m_fDelivered;
792799
}
793800

794801
// Accessor for the property "DeserializationRequired"
@@ -1250,12 +1257,13 @@ public void releaseOutgoing(boolean fSuspect, boolean fOrdered)
12501257
setBufferUsageCounter(cUsage);
12511258
}
12521259
}
1253-
1260+
12541261
if ((cUsage & ~BUFFER_COUNT_ORDER_BIT) == 0)
12551262
{
12561263
// all receipts are in
12571264
controller.dispose();
12581265
setBufferController(null);
1266+
m_fDelivered = true;
12591267

12601268
if (cUsage == 0) // not marked as unordered
12611269
{
@@ -1276,6 +1284,7 @@ public void releaseOutgoingComplete()
12761284
if (getBufferUsageCounter() == BUFFER_COUNT_ORDER_BIT) // dirty read; no users left; common case
12771285
{
12781286
setBufferUsageCounter(0);
1287+
m_fDelivered = true;
12791288
notifyDelivery();
12801289
}
12811290
else // potentially some users left; may be concurrently releasing
@@ -1291,6 +1300,7 @@ public void releaseOutgoingComplete()
12911300

12921301
if (cUsage == BUFFER_COUNT_ORDER_BIT) // no users left
12931302
{
1303+
m_fDelivered = true;
12941304
notifyDelivery();
12951305
}
12961306
// else; still users left; they will notify if needed during release

prj/coherence-core-components/src/main/java/com/tangosol/coherence/component/util/daemon/queueProcessor/service/grid/partitionedService/PartitionedCache.java

+31-17
Original file line numberDiff line numberDiff line change
@@ -5088,12 +5088,21 @@ protected void onInterval()
50885088

50895089
getRequestCoordinator().onInterval();
50905090

5091-
// on some environments the "NotifyDelivery" ack may become lost or be
5092-
// held up (e.g.: hundreds of ViewCache clients with heavy writes)
5093-
// the following attempts to get stuck events "unstuck" by re-sending
5094-
// the oldest and hopefully cause a chain of events to clear the
5095-
// pending events long array which can become clogged on backup members
5096-
// due to said clean-up being solely based on oldest ack'ed event.
5091+
// event sending may miss one or more events due to concurrency on
5092+
// entry status processing, see ResourceCoordinator.processEvent.
5093+
// Synthetic events are processed in finalizeInvoke*, which in turn
5094+
// processes events through ResourceCoordinator.processEvent one by
5095+
// one. If the event is not "managed" (== synthetic), it is then passed
5096+
// back to the caller as an OOB event to be sent to the client.
5097+
// If the event is "managed", it is assumed that the caller is
5098+
// responsible for sending the event.
5099+
// There is a narrow race condition where an entry status can be locked
5100+
// (setting isManaged() to true) while synthetic events are being
5101+
// processed, for example entries evicted.
5102+
// In this case the path is that of synthetic events, but the event
5103+
// is not returned into the OOB set.
5104+
// This is caught below when we see the oldest event staying too long
5105+
// in the pending events LongArray structure.
50975106
LongArray laPending = getPendingEvents();
50985107
if (laPending == null) // used in getOldestPendingEventSUID
50995108
{
@@ -5102,7 +5111,6 @@ protected void onInterval()
51025111

51035112
long ldtNow = Base.getSafeTimeMillis();
51045113
long lOldestSUID = getOldestPendingEventSUID();
5105-
51065114
if (ldtNow > getOldestEventResendNextMillis())
51075115
{
51085116
if (lOldestSUID > 0 &&
@@ -8662,7 +8670,7 @@ protected void postEvent(PartitionedCache.MapEvent msgEvent)
86628670
{
86638671
return;
86648672
}
8665-
8673+
86668674
post(msgEvent);
86678675
}
86688676

@@ -9538,7 +9546,7 @@ protected void publishChanges(com.tangosol.coherence.component.net.RequestContex
95389546

95399547
// remember the events-holder
95409548
oHolder = status.getMapEventHolder();
9541-
9549+
95429550
// increment the partition version
95439551
int nPartition = status.getPartition();
95449552
PartitionedCache.PartitionControl ctrlPart = (PartitionedCache.PartitionControl) getPartitionControl(nPartition);
@@ -9926,8 +9934,11 @@ protected void publishChanges(com.tangosol.coherence.component.net.RequestContex
99269934
//
99279935
// The batch context data structure is used for "synchronization" and both
99289936
// conditions are checked by $BatchContext#onJobCompleted
9929-
9930-
ctxBatch.onJobCompleted(job); // may be null
9937+
9938+
synchronized (ctxBatch)
9939+
{
9940+
ctxBatch.onJobCompleted(job); // may be null
9941+
}
99319942

99329943
// Now, only after backup messages/events have been queued/sent, unlock the
99339944
// keys that were locked during #processEvent
@@ -17566,7 +17577,7 @@ protected boolean tryBatchCompletion()
1756617577

1756717578
// all jobs have run, and all backups have completed; respond to the client
1756817579
service.publishToClients(getPrimaryResponse(), getEvents());
17569-
17580+
1757017581
return true;
1757117582
}
1757217583
}
@@ -29688,7 +29699,7 @@ public void onDelivery()
2968829699
public void onReceived()
2968929700
{
2969029701
super.onReceived();
29691-
29702+
2969229703
((PartitionedCache) getService()).onMapEvent(this);
2969329704
}
2969429705
}
@@ -38012,7 +38023,7 @@ protected void checkResourceDeadlock(Object oContender)
3801238023
}
3801338024

3801438025
/**
38015-
* Specialized helper method to collect aynchronously observed
38026+
* Specialized helper method to collect asynchronously observed
3801638027
* $EventStatus objects.
3801738028
*
3801838029
* @param setStatus the Set<$EntryStatus> of asynchronously observed
@@ -38697,7 +38708,7 @@ else if (!service.isPrimaryOwner(status.getPartition()))
3869738708
// else this is a troubling case; process the com.tangosol.util.MapEvent updating ancillary
3869838709
// data structures
3869938710
}
38700-
38711+
3870138712
// event could be null if this is a "synthetic" event holder used
3870238713
// to force a flush of the backup & client event changes
3870338714
if (event != null)
@@ -38895,7 +38906,7 @@ else if (fSynthetic)
3889538906
// 3) the status is managed by another thread
3889638907
//
3889738908
// If the status is unmanaged, we should try to lock the key and "take"
38898-
// ownership of the EntryStatus (and become reponsible for publishing
38909+
// ownership of the EntryStatus (and become responsible for publishing
3889938910
// backup and client-event changes). Note: we must not be overly aggressive
3890038911
// in attempting to lock due to the possibility of deadlock (see COH-5436).
3890138912
//
@@ -38944,7 +38955,10 @@ else if (fSynthetic)
3894438955
}
3894538956

3894638957
// return true iff the status has been locked by this method
38947-
// and therefore needs to be added to the OOB status collection
38958+
// and therefore needs to be added to the OOB status collection.
38959+
// Caveat: if a "managed" event has grabbed the same lock it could
38960+
// lead to a "forgotten" event if the caller is the synthetic event
38961+
// path. See onInterval
3894838962
return !fOOBEvent;
3894938963
}
3895038964

0 commit comments

Comments
 (0)