@@ -5088,12 +5088,21 @@ protected void onInterval()
50885088
50895089 getRequestCoordinator().onInterval();
50905090
5091- // on some environments the "NotifyDelivery" ack may become lost or be
5092- // held up (e.g.: hundreds of ViewCache clients with heavy writes)
5093- // the following attempts to get stuck events "unstuck" by re-sending
5094- // the oldest and hopefully cause a chain of events to clear the
5095- // pending events long array which can become clogged on backup members
5096- // due to said clean-up being solely based on oldest ack'ed event.
5091+ // event sending may miss one or more events due to concurrency on
5092+ // entry status processing, see ResourceCoordinator.processEvent.
5093+ // Synthetic events are processed in finalizeInvoke*, which in turn
5094+ // processes events through ResourceCoordinator.processEvent one by
5095+ // one. If the event is not "managed" (== synthetic), it is then passed
5096+ // back to the caller as an OOB event to be sent to the client.
5097+ // If the event is "managed", it is assumed that the caller is
5098+ // responsible for sending the event.
5099+ // There is a narrow race condition where an entry status can be locked
5100+ // (setting isManaged() to true) while synthetic events are being
5101+ // processed, for example entries evicted.
5102+ // In this case the path is that of synthetic events, but the event
5103+ // is not returned into the OOB set.
5104+ // This is caught below when we see the oldest event staying too long
5105+ // in the pending events LongArray structure.
50975106 LongArray laPending = getPendingEvents();
50985107 if (laPending == null) // used in getOldestPendingEventSUID
50995108 {
@@ -5102,7 +5111,6 @@ protected void onInterval()
51025111
51035112 long ldtNow = Base.getSafeTimeMillis();
51045113 long lOldestSUID = getOldestPendingEventSUID();
5105-
51065114 if (ldtNow > getOldestEventResendNextMillis())
51075115 {
51085116 if (lOldestSUID > 0 &&
@@ -8662,7 +8670,7 @@ protected void postEvent(PartitionedCache.MapEvent msgEvent)
86628670 {
86638671 return;
86648672 }
8665-
8673+
86668674 post(msgEvent);
86678675 }
86688676
@@ -9538,7 +9546,7 @@ protected void publishChanges(com.tangosol.coherence.component.net.RequestContex
95389546
95399547 // remember the events-holder
95409548 oHolder = status.getMapEventHolder();
9541-
9549+
95429550 // increment the partition version
95439551 int nPartition = status.getPartition();
95449552 PartitionedCache.PartitionControl ctrlPart = (PartitionedCache.PartitionControl) getPartitionControl(nPartition);
@@ -9926,8 +9934,11 @@ protected void publishChanges(com.tangosol.coherence.component.net.RequestContex
99269934 //
99279935 // The batch context data structure is used for "synchronization" and both
99289936 // conditions are checked by $BatchContext#onJobCompleted
9929-
9930- ctxBatch.onJobCompleted(job); // may be null
9937+
9938+ synchronized (ctxBatch)
9939+ {
9940+ ctxBatch.onJobCompleted(job); // may be null
9941+ }
99319942
99329943 // Now, only after backup messages/events have been queued/sent, unlock the
99339944 // keys that were locked during #processEvent
@@ -17566,7 +17577,7 @@ protected boolean tryBatchCompletion()
1756617577
1756717578 // all jobs have run, and all backups have completed; respond to the client
1756817579 service.publishToClients(getPrimaryResponse(), getEvents());
17569-
17580+
1757017581 return true;
1757117582 }
1757217583 }
@@ -29688,7 +29699,7 @@ public void onDelivery()
2968829699 public void onReceived()
2968929700 {
2969029701 super.onReceived();
29691-
29702+
2969229703 ((PartitionedCache) getService()).onMapEvent(this);
2969329704 }
2969429705 }
@@ -38012,7 +38023,7 @@ protected void checkResourceDeadlock(Object oContender)
3801238023 }
3801338024
3801438025 /**
38015- * Specialized helper method to collect aynchronously observed
38026+ * Specialized helper method to collect asynchronously observed
3801638027 * $EventStatus objects.
3801738028 *
3801838029 * @param setStatus the Set<$EntryStatus> of asynchronously observed
@@ -38697,7 +38708,7 @@ else if (!service.isPrimaryOwner(status.getPartition()))
3869738708 // else this is a troubling case; process the com.tangosol.util.MapEvent updating ancillary
3869838709 // data structures
3869938710 }
38700-
38711+
3870138712 // event could be null if this is a "synthetic" event holder used
3870238713 // to force a flush of the backup & client event changes
3870338714 if (event != null)
@@ -38895,7 +38906,7 @@ else if (fSynthetic)
3889538906 // 3) the status is managed by another thread
3889638907 //
3889738908 // If the status is unmanaged, we should try to lock the key and "take"
38898- // ownership of the EntryStatus (and become reponsible for publishing
38909+ // ownership of the EntryStatus (and become responsible for publishing
3889938910 // backup and client-event changes). Note: we must not be overly aggressive
3890038911 // in attempting to lock due to the possibility of deadlock (see COH-5436).
3890138912 //
@@ -38944,7 +38955,10 @@ else if (fSynthetic)
3894438955 }
3894538956
3894638957 // return true iff the status has been locked by this method
38947- // and therefore needs to be added to the OOB status collection
38958+ // and therefore needs to be added to the OOB status collection.
38959+ // Caveat: if a "managed" event has grabbed the same lock it could
38960+ // lead to a "forgotten" event if the caller is the synthetic event
38961+ // path. See onInterval
3894838962 return !fOOBEvent;
3894938963 }
3895038964
0 commit comments