@@ -5088,12 +5088,21 @@ protected void onInterval()
5088
5088
5089
5089
getRequestCoordinator().onInterval();
5090
5090
5091
- // on some environments the "NotifyDelivery" ack may become lost or be
5092
- // held up (e.g.: hundreds of ViewCache clients with heavy writes)
5093
- // the following attempts to get stuck events "unstuck" by re-sending
5094
- // the oldest and hopefully cause a chain of events to clear the
5095
- // pending events long array which can become clogged on backup members
5096
- // due to said clean-up being solely based on oldest ack'ed event.
5091
+ // event sending may miss one or more events due to concurrency on
5092
+ // entry status processing, see ResourceCoordinator.processEvent.
5093
+ // Synthetic events are processed in finalizeInvoke*, which in turn
5094
+ // processes events through ResourceCoordinator.processEvent one by
5095
+ // one. If the event is not "managed" (== synthetic), it is then passed
5096
+ // back to the caller as an OOB event to be sent to the client.
5097
+ // If the event is "managed", it is assumed that the caller is
5098
+ // responsible for sending the event.
5099
+ // There is a narrow race condition where an entry status can be locked
5100
+ // (setting isManaged() to true) while synthetic events are being
5101
+ // processed, for example when entries are evicted.
5102
+ // In this case the path is that of synthetic events, but the event
5103
+ // is not returned into the OOB set.
5104
+ // This is caught below when we see the oldest event staying too long
5105
+ // in the pending events LongArray structure.
5097
5106
LongArray laPending = getPendingEvents();
5098
5107
if (laPending == null) // used in getOldestPendingEventSUID
5099
5108
{
@@ -5102,7 +5111,6 @@ protected void onInterval()
5102
5111
5103
5112
long ldtNow = Base.getSafeTimeMillis();
5104
5113
long lOldestSUID = getOldestPendingEventSUID();
5105
-
5106
5114
if (ldtNow > getOldestEventResendNextMillis())
5107
5115
{
5108
5116
if (lOldestSUID > 0 &&
@@ -8662,7 +8670,7 @@ protected void postEvent(PartitionedCache.MapEvent msgEvent)
8662
8670
{
8663
8671
return;
8664
8672
}
8665
-
8673
+
8666
8674
post(msgEvent);
8667
8675
}
8668
8676
@@ -9538,7 +9546,7 @@ protected void publishChanges(com.tangosol.coherence.component.net.RequestContex
9538
9546
9539
9547
// remember the events-holder
9540
9548
oHolder = status.getMapEventHolder();
9541
-
9549
+
9542
9550
// increment the partition version
9543
9551
int nPartition = status.getPartition();
9544
9552
PartitionedCache.PartitionControl ctrlPart = (PartitionedCache.PartitionControl) getPartitionControl(nPartition);
@@ -9926,8 +9934,11 @@ protected void publishChanges(com.tangosol.coherence.component.net.RequestContex
9926
9934
//
9927
9935
// The batch context data structure is used for "synchronization" and both
9928
9936
// conditions are checked by $BatchContext#onJobCompleted
9929
-
9930
- ctxBatch.onJobCompleted(job); // may be null
9937
+
9938
+ synchronized (ctxBatch)
9939
+ {
9940
+ ctxBatch.onJobCompleted(job); // may be null
9941
+ }
9931
9942
9932
9943
// Now, only after backup messages/events have been queued/sent, unlock the
9933
9944
// keys that were locked during #processEvent
@@ -17566,7 +17577,7 @@ protected boolean tryBatchCompletion()
17566
17577
17567
17578
// all jobs have run, and all backups have completed; respond to the client
17568
17579
service.publishToClients(getPrimaryResponse(), getEvents());
17569
-
17580
+
17570
17581
return true;
17571
17582
}
17572
17583
}
@@ -29688,7 +29699,7 @@ public void onDelivery()
29688
29699
public void onReceived()
29689
29700
{
29690
29701
super.onReceived();
29691
-
29702
+
29692
29703
((PartitionedCache) getService()).onMapEvent(this);
29693
29704
}
29694
29705
}
@@ -38012,7 +38023,7 @@ protected void checkResourceDeadlock(Object oContender)
38012
38023
}
38013
38024
38014
38025
/**
38015
- * Specialized helper method to collect aynchronously observed
38026
+ * Specialized helper method to collect asynchronously observed
38016
38027
* $EntryStatus objects.
38017
38028
*
38018
38029
* @param setStatus the Set<$EntryStatus> of asynchronously observed
@@ -38697,7 +38708,7 @@ else if (!service.isPrimaryOwner(status.getPartition()))
38697
38708
// else this is a troubling case; process the com.tangosol.util.MapEvent updating ancillary
38698
38709
// data structures
38699
38710
}
38700
-
38711
+
38701
38712
// event could be null if this is a "synthetic" event holder used
38702
38713
// to force a flush of the backup & client event changes
38703
38714
if (event != null)
@@ -38895,7 +38906,7 @@ else if (fSynthetic)
38895
38906
// 3) the status is managed by another thread
38896
38907
//
38897
38908
// If the status is unmanaged, we should try to lock the key and "take"
38898
- // ownership of the EntryStatus (and become reponsible for publishing
38909
+ // ownership of the EntryStatus (and become responsible for publishing
38899
38910
// backup and client-event changes). Note: we must not be overly aggressive
38900
38911
// in attempting to lock due to the possibility of deadlock (see COH-5436).
38901
38912
//
@@ -38944,7 +38955,10 @@ else if (fSynthetic)
38944
38955
}
38945
38956
38946
38957
// return true iff the status has been locked by this method
38947
- // and therefore needs to be added to the OOB status collection
38958
+ // and therefore needs to be added to the OOB status collection.
38959
+ // Caveat: if a "managed" event has grabbed the same lock it could
38960
+ // lead to a "forgotten" event if the caller is the synthetic event
38961
+ // path. See onInterval(), which detects such stuck pending events.
38948
38962
return !fOOBEvent;
38949
38963
}
38950
38964
0 commit comments