@@ -5117,12 +5117,21 @@ protected void onInterval()
5117
5117
5118
5118
getRequestCoordinator().onInterval();
5119
5119
5120
- // on some environments the "NotifyDelivery" ack may become lost or be
5121
- // held up (e.g.: hundreds of ViewCache clients with heavy writes)
5122
- // the following attempts to get stuck events "unstuck" by re-sending
5123
- // the oldest and hopefully cause a chain of events to clear the
5124
- // pending events long array which can become clogged on backup members
5125
- // due to said clean-up being solely based on oldest ack'ed event.
5120
+ // event sending may miss one or more events due to concurrency on
5121
+ // entry status processing, see ResourceCoordinator.processEvent.
5122
+ // Synthetic events are processed in finalizeInvoke*, which in turn
5123
+ // processes events through ResourceCoordinator.processEvent one by
5124
+ // one. If the event is not "managed" (== synthetic), it is then passed
5125
+ // back to the caller as an OOB event to be sent to the client.
5126
+ // If the event is "managed", it is assumed that the caller is
5127
+ // responsible for sending the event.
5128
+ // There is a narrow race condition where an entry status can be locked
5129
+ // (setting isManaged() to true) while synthetic events are being
5130
+ // processed, for example entries evicted.
5131
+ // In this case the path is that of synthetic events, but the event
5132
+ // is not returned into the OOB set.
5133
+ // This is caught below when we see the oldest event staying too long
5134
+ // in the pending events LongArray structure.
5126
5135
LongArray laPending = getPendingEvents();
5127
5136
if (laPending == null) // used in getOldestPendingEventSUID
5128
5137
{
@@ -5131,7 +5140,6 @@ protected void onInterval()
5131
5140
5132
5141
long ldtNow = Base.getSafeTimeMillis();
5133
5142
long lOldestSUID = getOldestPendingEventSUID();
5134
-
5135
5143
if (ldtNow > getOldestEventResendNextMillis())
5136
5144
{
5137
5145
if (lOldestSUID > 0 &&
@@ -8690,7 +8698,7 @@ protected void postEvent(PartitionedCache.MapEvent msgEvent)
8690
8698
{
8691
8699
return;
8692
8700
}
8693
-
8701
+
8694
8702
post(msgEvent);
8695
8703
}
8696
8704
@@ -9566,7 +9574,7 @@ protected void publishChanges(com.tangosol.coherence.component.net.RequestContex
9566
9574
9567
9575
// remember the events-holder
9568
9576
oHolder = status.getMapEventHolder();
9569
-
9577
+
9570
9578
// increment the partition version
9571
9579
int nPartition = status.getPartition();
9572
9580
PartitionedCache.PartitionControl ctrlPart = (PartitionedCache.PartitionControl) getPartitionControl(nPartition);
@@ -9965,8 +9973,11 @@ protected void publishChanges(com.tangosol.coherence.component.net.RequestContex
9965
9973
//
9966
9974
// The batch context data structure is used for "synchronization" and both
9967
9975
// conditions are checked by $BatchContext#onJobCompleted
9968
-
9969
- ctxBatch.onJobCompleted(job); // may be null
9976
+
9977
+ synchronized (ctxBatch)
9978
+ {
9979
+ ctxBatch.onJobCompleted(job); // may be null
9980
+ }
9970
9981
9971
9982
// Now, only after backup messages/events have been queued/sent, unlock the
9972
9983
// keys that were locked during #processEvent
@@ -17604,7 +17615,7 @@ protected boolean tryBatchCompletion()
17604
17615
17605
17616
// all jobs have run, and all backups have completed; respond to the client
17606
17617
service.publishToClients(getPrimaryResponse(), getEvents());
17607
-
17618
+
17608
17619
return true;
17609
17620
}
17610
17621
}
@@ -29725,7 +29736,7 @@ public void onDelivery()
29725
29736
public void onReceived()
29726
29737
{
29727
29738
super.onReceived();
29728
-
29739
+
29729
29740
((PartitionedCache) getService()).onMapEvent(this);
29730
29741
}
29731
29742
}
@@ -38046,7 +38057,7 @@ protected void checkResourceDeadlock(Object oContender)
38046
38057
}
38047
38058
38048
38059
/**
38049
- * Specialized helper method to collect aynchronously observed
38060
+ * Specialized helper method to collect asynchronously observed
38050
38061
* $EventStatus objects.
38051
38062
*
38052
38063
* @param setStatus the Set<$EntryStatus> of asynchronously observed
@@ -38731,7 +38742,7 @@ else if (!service.isPrimaryOwner(status.getPartition()))
38731
38742
// else this is a troubling case; process the com.tangosol.util.MapEvent updating ancillary
38732
38743
// data structures
38733
38744
}
38734
-
38745
+
38735
38746
// event could be null if this is a "synthetic" event holder used
38736
38747
// to force a flush of the backup & client event changes
38737
38748
if (event != null)
@@ -38929,7 +38940,7 @@ else if (fSynthetic)
38929
38940
// 3) the status is managed by another thread
38930
38941
//
38931
38942
// If the status is unmanaged, we should try to lock the key and "take"
38932
- // ownership of the EntryStatus (and become reponsible for publishing
38943
+ // ownership of the EntryStatus (and become responsible for publishing
38933
38944
// backup and client-event changes). Note: we must not be overly aggressive
38934
38945
// in attempting to lock due to the possibility of deadlock (see COH-5436).
38935
38946
//
@@ -38978,7 +38989,10 @@ else if (fSynthetic)
38978
38989
}
38979
38990
38980
38991
// return true iff the status has been locked by this method
38981
- // and therefore needs to be added to the OOB status collection
38992
+ // and therefore needs to be added to the OOB status collection.
38993
+ // Caveat: if a "managed" event has grabbed the same lock it could
38994
+ // lead to a "forgotten" event if the caller is the synthetic event
38995
+ // path. See onInterval
38982
38996
return !fOOBEvent;
38983
38997
}
38984
38998
0 commit comments