Skip to content

Commit 62dc392

Browse files
authored
MCOL-5499 Enable ControlFlow for same node communication processing path to avoid DEC queue overloading (mariadb-corporation#2848)
1 parent cacbbee commit 62dc392

9 files changed

+192
-201
lines changed

dbcon/joblist/bpp-jl.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
#include "serializeable.h"
3434
#include "brm.h"
3535
#include "jobstep.h"
36-
// #include "primitiveprocessor.h"
3736
#include "batchprimitiveprocessor-jl.h"
3837
#include "command-jl.h"
3938
#include "columncommand-jl.h"

dbcon/joblist/distributedenginecomm.cpp

Lines changed: 39 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -241,10 +241,11 @@ int32_t DistributedEngineComm::Setup()
241241
newLocks.clear();
242242

243243
uint32_t newPmCount = fRm->getPsCount();
244-
throttleThreshold = fRm->getDECThrottleThreshold();
245244
tbpsThreadCount = fRm->getJlNumScanReceiveThreads();
246245
fDECConnectionsPerQuery = fRm->getDECConnectionsPerQuery();
247246
unsigned numConnections = getNumConnections();
247+
flowControlEnableBytesThresh = fRm->getDECEnableBytesThresh();
248+
flowControlDisableBytesThresh = fRm->getDECDisableBytesThresh();
248249
oam::Oam oam;
249250
ModuleTypeConfig moduletypeconfig;
250251

@@ -282,6 +283,8 @@ int32_t DistributedEngineComm::Setup()
282283
if (clientAtTheSameHost(cl))
283284
{
284285
cl->atTheSameHost(true);
286+
assert(connectionId <= std::numeric_limits<uint32_t>::max());
287+
localConnectionId_ = connectionId;
285288
}
286289
std::shared_ptr<std::mutex> nl(new std::mutex());
287290

@@ -433,33 +436,6 @@ void DistributedEngineComm::Listen(boost::shared_ptr<MessageQueueClient> client,
433436
os << "DEC: lost connection to " << client->addr2String();
434437
writeToLog(__FILE__, __LINE__, os.str(), LOG_TYPE_ERROR);
435438
}
436-
437-
/*
438-
// reset the pmconnection vector
439-
ClientList tempConns;
440-
boost::mutex::scoped_lock onErrLock(fOnErrMutex);
441-
string moduleName = client->moduleName();
442-
//cout << "moduleName=" << moduleName << endl;
443-
for ( uint32_t i = 0; i < fPmConnections.size(); i++)
444-
{
445-
if (moduleName != fPmConnections[i]->moduleName())
446-
tempConns.push_back(fPmConnections[i]);
447-
//else
448-
//cout << "DEC remove PM" << fPmConnections[i]->otherEnd() << " moduleName=" <<
449-
fPmConnections[i]->moduleName() << endl;
450-
}
451-
452-
if (tempConns.size() == fPmConnections.size()) return;
453-
454-
fPmConnections.swap(tempConns);
455-
pmCount = (pmCount == 0 ? 0 : pmCount - 1);
456-
//cout << "PMCOUNT=" << pmCount << endl;
457-
458-
// log it
459-
ostringstream os;
460-
os << "DEC: lost connection to " << client->addr2String();
461-
writeToLog(__FILE__, __LINE__, os.str(), LOG_TYPE_CRITICAL);
462-
*/
463439
}
464440
return;
465441
}
@@ -472,7 +448,7 @@ void DistributedEngineComm::addQueue(uint32_t key, bool sendACKs)
472448
condition* cond = new condition();
473449
uint32_t firstPMInterleavedConnectionId =
474450
key % (fPmConnections.size() / pmCount) * fDECConnectionsPerQuery * pmCount % fPmConnections.size();
475-
boost::shared_ptr<MQE> mqe(new MQE(pmCount, firstPMInterleavedConnectionId));
451+
boost::shared_ptr<MQE> mqe(new MQE(pmCount, firstPMInterleavedConnectionId, flowControlEnableBytesThresh));
476452

477453
mqe->queue = StepMsgQueue(lock, cond);
478454
mqe->sendACKs = sendACKs;
@@ -540,7 +516,7 @@ void DistributedEngineComm::read(uint32_t key, SBS& bs)
540516
{
541517
std::unique_lock lk(ackLock);
542518

543-
if (mqe->throttled && !mqe->hasBigMsgs && queueSize.size <= disableThreshold)
519+
if (mqe->throttled && !mqe->hasBigMsgs && queueSize.size <= flowControlDisableBytesThresh)
544520
setFlowControl(false, key, mqe);
545521

546522
vector<SBS> v;
@@ -578,7 +554,7 @@ const ByteStream DistributedEngineComm::read(uint32_t key)
578554
{
579555
std::unique_lock lk(ackLock);
580556

581-
if (mqe->throttled && !mqe->hasBigMsgs && queueSize.size <= disableThreshold)
557+
if (mqe->throttled && !mqe->hasBigMsgs && queueSize.size <= flowControlDisableBytesThresh)
582558
setFlowControl(false, key, mqe);
583559

584560
vector<SBS> v;
@@ -645,7 +621,7 @@ void DistributedEngineComm::read_some(uint32_t key, uint32_t divisor, vector<SBS
645621
{
646622
std::unique_lock lk(ackLock);
647623

648-
if (mqe->throttled && !mqe->hasBigMsgs && queueSize.size <= disableThreshold)
624+
if (mqe->throttled && !mqe->hasBigMsgs && queueSize.size <= flowControlDisableBytesThresh)
649625
setFlowControl(false, key, mqe);
650626

651627
sendAcks(key, v, mqe, queueSize.size);
@@ -726,12 +702,6 @@ void DistributedEngineComm::sendAcks(uint32_t uniqueID, const vector<SBS>& msgs,
726702

727703
msg->advanceInputPtr(sizeof(ISMPacketHeader));
728704
// There must be only one local connection here.
729-
uint32_t localConnectionId = std::numeric_limits<uint32_t>::max();
730-
for (uint32_t i = 0; i < pmCount; ++i)
731-
{
732-
if (fPmConnections[i]->atTheSameHost() && fIsExeMgr)
733-
localConnectionId = i;
734-
}
735705
bool sendToLocal = false;
736706
while (l_msgCount > 0)
737707
{
@@ -743,23 +713,23 @@ void DistributedEngineComm::sendAcks(uint32_t uniqueID, const vector<SBS>& msgs,
743713
nextPMToACK(mqe, l_msgCount, &sockIndex, toAck);
744714
idbassert(*toAck <= l_msgCount);
745715
l_msgCount -= *toAck;
746-
if (sockIndex == localConnectionId)
716+
if (sockIndex == localConnectionId_ && fIsExeMgr)
747717
{
748718
sendToLocal = true;
749719
continue;
750720
}
751721
pmAcked[sockIndex] = true;
752722
writeToClient(sockIndex, msg);
753723
}
754-
if (sendToLocal && localConnectionId < fPmConnections.size())
724+
if (sendToLocal)
755725
{
756-
pmAcked[localConnectionId] = true;
757-
writeToClient(localConnectionId, msg);
726+
pmAcked[localConnectionId_] = true;
727+
writeToClient(localConnectionId_, msg);
758728
}
759729

760730
// @bug4436, when no more unacked work, send an ack to all PMs that haven't been acked.
761731
// This applies to the big message case only. For small messages, the flow control is
762-
// disabled when the queue size is below the disableThreshold.
732+
// disabled when the queue size is at or below flowControlDisableBytesThresh.
763733
if (mqe->hasBigMsgs)
764734
{
765735
uint64_t totalUnackedWork = 0;
@@ -775,16 +745,16 @@ void DistributedEngineComm::sendAcks(uint32_t uniqueID, const vector<SBS>& msgs,
775745
{
776746
if (!pmAcked[i])
777747
{
778-
if (i == localConnectionId)
748+
if (i == localConnectionId_ && fIsExeMgr)
779749
{
780750
continue;
781751
}
782752
writeToClient(i, msg);
783753
}
784754
}
785-
if (!pmAcked[localConnectionId])
755+
if (!pmAcked[localConnectionId_] && fIsExeMgr)
786756
{
787-
writeToClient(localConnectionId, msg);
757+
writeToClient(localConnectionId_, msg);
788758
}
789759
}
790760
}
@@ -863,28 +833,18 @@ void DistributedEngineComm::setFlowControl(bool enabled, uint32_t uniqueID, boos
863833
ism->Command = BATCH_PRIMITIVE_ACK;
864834
ism->Size = (enabled ? 0 : -1);
865835

866-
#ifdef VALGRIND
867-
/* XXXPAT: For testing in valgrind, init the vars that don't get used */
868-
ism->Flags = 0;
869-
ism->Type = 0;
870-
ism->MsgCount = 0;
871-
ism->Status = 0;
872-
#endif
873-
874836
msg->advanceInputPtr(sizeof(ISMPacketHeader));
875-
uint32_t localConnectionId = std::numeric_limits<uint32_t>::max();
876837

877838
for (uint32_t i = 0; i < mqe->pmCount; ++i)
878839
{
879-
if (fPmConnections[i]->atTheSameHost() && fIsExeMgr)
840+
if (i == localConnectionId_ && fIsExeMgr)
880841
{
881-
localConnectionId = i;
882842
continue;
883843
}
884844
writeToClient(i, msg);
885845
}
886-
if (localConnectionId < fPmConnections.size())
887-
writeToClient(localConnectionId, msg);
846+
if (fIsExeMgr)
847+
writeToClient(localConnectionId_, msg);
888848
}
889849

890850
int32_t DistributedEngineComm::write(uint32_t senderID, const SBS& msg)
@@ -911,23 +871,23 @@ int32_t DistributedEngineComm::write(uint32_t senderID, const SBS& msg)
911871
/* XXXPAT: This relies on the assumption that the first pmCount "PMS*"
912872
entries in the config file point to unique PMs */
913873
{
914-
uint32_t localConnectionId = std::numeric_limits<uint32_t>::max();
915874
int32_t rc = 0;
916-
917875
for (uint32_t i = 0; i < pmCount; ++i)
918876
{
919-
if (fPmConnections[i]->atTheSameHost() && fIsExeMgr)
877+
if (i == localConnectionId_ && fIsExeMgr)
920878
{
921-
localConnectionId = i;
922879
continue;
923880
}
924881

925-
rc =writeToClient(i, msg, senderID);
926-
if (rc)
882+
if ((rc = writeToClient(i, msg, senderID)))
883+
{
927884
return rc;
885+
}
886+
}
887+
if (fIsExeMgr)
888+
{
889+
return writeToClient(localConnectionId_, msg);
928890
}
929-
if (localConnectionId < fPmConnections.size())
930-
rc = writeToClient(localConnectionId, msg);
931891
return rc;
932892
}
933893

@@ -985,56 +945,27 @@ void DistributedEngineComm::StartClientListener(boost::shared_ptr<MessageQueueCl
985945

986946
void DistributedEngineComm::addDataToOutput(SBS sbs)
987947
{
988-
ISMPacketHeader* hdr = (ISMPacketHeader*)(sbs->buf());
989-
PrimitiveHeader* p = (PrimitiveHeader*)(hdr + 1);
990-
uint32_t uniqueId = p->UniqueID;
991-
boost::shared_ptr<MQE> mqe;
992-
993-
std::unique_lock lk(fMlock);
994-
MessageQueueMap::iterator map_tok = fSessionMessages.find(uniqueId);
995-
996-
// The message for a session that doesn't exist.
997-
if (map_tok == fSessionMessages.end())
998-
{
999-
// Here gets the dead session ByteStream that is already removed
1000-
// from DEC queue.
1001-
return;
1002-
}
1003-
1004-
mqe = map_tok->second;
1005-
lk.unlock();
1006-
1007-
if (pmCount > 0)
1008-
{
1009-
// I hardcoded the unacked Worker id here. ACK isn't important
1010-
// for the local exchange b/c there is no need to
1011-
// enable flowcontrol localy on PM.
1012-
(void)atomicops::atomicInc(&mqe->unackedWork[0]);
1013-
}
1014-
1015-
[[maybe_unused]] TSQSize_t queueSize = mqe->queue.push(sbs);
1016-
// There will be no statistics about data transfered
1017-
// over the memory.
948+
assert(localConnectionId_ < pmCount);
949+
return addDataToOutput(sbs, localConnectionId_, nullptr);
1018950
}
1019951

1020952
void DistributedEngineComm::addDataToOutput(SBS sbs, uint32_t connIndex, Stats* stats)
1021953
{
1022954
ISMPacketHeader* hdr = (ISMPacketHeader*)(sbs->buf());
1023955
PrimitiveHeader* p = (PrimitiveHeader*)(hdr + 1);
1024956
uint32_t uniqueId = p->UniqueID;
1025-
boost::shared_ptr<MQE> mqe;
1026957
std::unique_lock lk(fMlock);
1027958
MessageQueueMap::iterator map_tok = fSessionMessages.find(uniqueId);
1028959

960+
// The message for a session that doesn't exist.
1029961
if (map_tok == fSessionMessages.end())
1030962
{
1031-
// For debugging...
1032-
// cerr << "DistributedEngineComm::AddDataToOutput: tried to add a message to a dead session: " <<
1033-
// uniqueId << ", size " << sbs->length() << ", step id " << p->StepID << endl;
963+
// A ByteStream for a dead session arrives here; that session has already
964+
// been removed from the DEC queue.
1034965
return;
1035966
}
1036967

1037-
mqe = map_tok->second;
968+
auto mqe = map_tok->second;
1038969
lk.unlock();
1039970

1040971
if (pmCount > 0)
@@ -1049,9 +980,9 @@ void DistributedEngineComm::addDataToOutput(SBS sbs, uint32_t connIndex, Stats*
1049980
std::lock_guard lk(ackLock);
1050981
uint64_t msgSize = sbs->lengthWithHdrOverhead();
1051982

1052-
if (!mqe->throttled && msgSize > (targetRecvQueueSize / 2))
1053-
doHasBigMsgs(mqe, (300 * 1024 * 1024 > 3 * msgSize ? 300 * 1024 * 1024
1054-
: 3 * msgSize)); // buffer at least 3 big msgs
983+
if (!mqe->throttled && msgSize > (flowControlEnableBytesThresh / 2))
984+
doHasBigMsgs(
985+
mqe, (bigMessageSize > 3 * msgSize ? bigMessageSize : 3 * msgSize)); // buffer at least 3 big msgs
1055986

1056987
if (!mqe->throttled && queueSize.size >= mqe->targetQueueSize)
1057988
setFlowControl(true, uniqueId, mqe);
@@ -1271,8 +1202,9 @@ Stats DistributedEngineComm::getNetworkStats(uint32_t uniqueID)
12711202
return empty;
12721203
}
12731204

1274-
DistributedEngineComm::MQE::MQE(const uint32_t pCount, const uint32_t initialInterleaverValue)
1275-
: ackSocketIndex(0), pmCount(pCount), hasBigMsgs(false), targetQueueSize(targetRecvQueueSize)
1205+
DistributedEngineComm::MQE::MQE(const uint32_t pCount, const uint32_t initialInterleaverValue,
1206+
const uint64_t flowControlEnableBytesThresh)
1207+
: ackSocketIndex(0), pmCount(pCount), hasBigMsgs(false), targetQueueSize(flowControlEnableBytesThresh)
12761208
{
12771209
unackedWork.reset(new volatile uint32_t[pmCount]);
12781210
interleaver.reset(new uint32_t[pmCount]);

dbcon/joblist/distributedenginecomm.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ class DistributedEngineComm
229229
/* To keep some state associated with the connection. These aren't copyable. */
230230
struct MQE : public boost::noncopyable
231231
{
232-
MQE(const uint32_t pmCount, const uint32_t initialInterleaverValue);
232+
MQE(const uint32_t pmCount, const uint32_t initialInterleaverValue, const uint64_t recvQueueSize);
233233
uint32_t getNextConnectionId(const size_t pmIndex, const size_t pmConnectionsNumber,
234234
const uint32_t DECConnectionsPerQuery);
235235
messageqcpp::Stats stats;
@@ -297,9 +297,9 @@ class DistributedEngineComm
297297
bool fIsExeMgr;
298298

299299
// send-side throttling vars
300-
uint64_t throttleThreshold;
301-
static const uint32_t targetRecvQueueSize = 50000000;
302-
static const uint32_t disableThreshold = 10000000;
300+
uint64_t flowControlEnableBytesThresh = 50000000;
301+
uint64_t flowControlDisableBytesThresh = 10000000;
302+
uint64_t bigMessageSize = 300 * 1024 * 1024;
303303
uint32_t tbpsThreadCount;
304304
uint32_t fDECConnectionsPerQuery;
305305

@@ -310,6 +310,8 @@ class DistributedEngineComm
310310
void doHasBigMsgs(boost::shared_ptr<MQE> mqe, uint64_t targetSize);
311311
boost::mutex ackLock;
312312

313+
// localConnectionId_ is set running Setup() method
314+
uint32_t localConnectionId_ = std::numeric_limits<uint32_t>::max();
313315
std::vector<struct in_addr> localNetIfaceSins_;
314316
std::mutex inMemoryEM2PPExchMutex_;
315317
std::condition_variable inMemoryEM2PPExchCV_;

dbcon/joblist/resourcemanager.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -239,8 +239,6 @@ ResourceManager::ResourceManager(bool runningInExeMgr, config::Config* aConfig)
239239
fAllowedDiskAggregation =
240240
getBoolVal(fRowAggregationStr, "AllowDiskBasedAggregation", defaultAllowDiskAggregation);
241241

242-
fMaxBPPSendQueue = getUintVal(fPrimitiveServersStr, "MaxBPPSendQueue", defaultMaxBPPSendQueue);
243-
244242
if (!load_encryption_keys())
245243
{
246244
Logger log;

0 commit comments

Comments
 (0)