Skip to content

Commit

Permalink
[rqd] Fix rqd cache spill issue (#1531)
Browse files Browse the repository at this point in the history
Ensure frames are removed from the rqd cache when they fail to complete
their report process

Co-authored-by: Ramon Figueired <[email protected]>
  • Loading branch information
DiegoTavares and Ramon Figueired authored Oct 4, 2024
1 parent 810bee5 commit e343152
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 28 deletions.
7 changes: 3 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ htmlcov/
.vscode
.venv/
.eggs/*
.gradle/*
/cuebot/logs
/cuebot/bin
/logs
/cuebot/bin/*
/logs/*
/.gradle/*
75 changes: 51 additions & 24 deletions rqd/rqd/rqcore.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,29 +237,6 @@ def __writeFooter(self):
"Unable to write footer: %s due to %s at %s",
self.runFrame.log_dir_file, e, traceback.extract_tb(sys.exc_info()[2]))

def __sendFrameCompleteReport(self):
    """Build a FrameCompleteReport for this frame and hand it to cuebot.

    The report carries the host info from the rqCore machine and the
    running-frame info of self.frameInfo. A missing exit status (None) is
    reported as 1. When nimby is locked and the frame does not ignore nimby,
    the exit status is replaced by EXITSTATUS_FOR_NIMBY_KILL.
    """
    core = self.rqCore
    info = self.frameInfo

    report = rqd.compiled_proto.report_pb2.FrameCompleteReport()
    # pylint: disable=no-member
    report.host.CopyFrom(core.machine.getHostInfo())
    report.frame.CopyFrom(info.runningFrameInfo())
    # pylint: enable=no-member

    # A frame that never recorded an exit status is reported as failed (1).
    report.exit_status = 1 if info.exitStatus is None else info.exitStatus
    report.exit_signal = info.exitSignal
    report.run_time = int(info.runTime)

    # An active nimby lock on a frame that honours nimby means the frame
    # must have been killed by nimby; override the exit status to say so.
    killedByNimby = core.nimby.locked and not self.runFrame.ignore_nimby
    if killedByNimby:
        report.exit_status = rqd.rqconstants.EXITSTATUS_FOR_NIMBY_KILL

    core.network.reportRunningFrameCompletion(report)

def __cleanup(self):
"""Cleans up temporary files"""
rqd.rqutil.permissionsHigh()
Expand Down Expand Up @@ -551,7 +528,7 @@ def run(self):

self.rqCore.deleteFrame(self.runFrame.frame_id)

self.__sendFrameCompleteReport()
self.rqCore.sendFrameCompleteReport(self.frameInfo)
time_till_next = (
(self.rqCore.intervalStartTime + self.rqCore.intervalSleepTime) - time.time())
if time_till_next > (2 * rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC):
Expand Down Expand Up @@ -723,6 +700,9 @@ def deleteFrame(self, frameId):
self.cores.reserved_cores)
# pylint: disable=no-member
self.cores.reserved_cores.clear()
log.info("Successfully delete frame with Id: %s", frameId)
else:
log.warning("Frame with Id: %s not found in cache", frameId)

def killAllFrame(self, reason):
"""Will execute .kill() on every frame in cache until no frames remain
Expand Down Expand Up @@ -1080,3 +1060,50 @@ def sendStatusReport(self):
def isWaitingForIdle(self):
"""Returns whether the host is waiting until idle to take some action.

Simply exposes the current value of the private __whenIdle attribute."""
return self.__whenIdle

def sendFrameCompleteReport(self, runningFrame):
    """Send a FrameCompleteReport for runningFrame to Cuebot, at most once.

    Does nothing when runningFrame.completeReportSent is already set.
    Otherwise builds the report from the machine's host info and the frame's
    running-frame info, sends it through self.network, and only then marks
    the frame as reported. If sending raises, the flag stays unset.
    """
    if runningFrame.completeReportSent:
        return

    report = rqd.compiled_proto.report_pb2.FrameCompleteReport()
    # pylint: disable=no-member
    report.host.CopyFrom(self.machine.getHostInfo())
    report.frame.CopyFrom(runningFrame.runningFrameInfo())
    # pylint: enable=no-member

    # A frame without a recorded exit status is reported with status 1.
    report.exit_status = (
        1 if runningFrame.exitStatus is None else runningFrame.exitStatus)
    report.exit_signal = runningFrame.exitSignal
    report.run_time = int(runningFrame.runTime)

    # If nimby is active and the frame honours it, the frame must have been
    # killed by nimby; override the exit status to indicate this event.
    nimbyKilled = self.nimby.locked and not runningFrame.ignoreNimby
    if nimbyKilled:
        report.exit_status = rqd.rqconstants.EXITSTATUS_FOR_NIMBY_KILL

    self.network.reportRunningFrameCompletion(report)
    runningFrame.completeReportSent = True

def sanitizeFrames(self):
    """
    Iterate over the cache and update the status of frames that might have
    completed but never reported back to cuebot.

    A frame is retried when its exitStatus is set and its completeReportSent
    flag is still false. After a successful report the frame is passed to
    deleteFrame(). Any exception raised while handling a frame is logged and
    the loop moves on to the next frame.
    """
    # Walk a snapshot of the cache: deleteFrame() is called inside the loop
    # and appears to remove the entry from the cache. Changing a dict's size
    # while iterating over its live view raises RuntimeError, which would
    # escape the per-frame try/except below.
    for frameId, runningFrame in list(self.__cache.items()):
        # Only frames marked as completed (exitStatus set) whose report has
        # not been sent yet need another attempt.
        if runningFrame.exitStatus is None or runningFrame.completeReportSent:
            continue
        try:
            self.sendFrameCompleteReport(runningFrame)
            self.deleteFrame(frameId)
            log.info("Successfully deleted frame from cache for %s/%s (%s)",
                     runningFrame.runFrame.job_name,
                     runningFrame.runFrame.frame_name,
                     frameId)
        # pylint: disable=broad-except
        except Exception:
            # Best effort: keep the frame so a later pass can retry it.
            log.exception("Failed to sanitize frame %s/%s",
                          runningFrame.runFrame.job_name,
                          runningFrame.runFrame.frame_name)
1 change: 1 addition & 0 deletions rqd/rqd/rqmachine.py
Original file line number Diff line number Diff line change
Expand Up @@ -809,6 +809,7 @@ def getHostReport(self):
self.__hostReport.host.CopyFrom(self.getHostInfo())

self.__hostReport.ClearField('frames')
self.__rqCore.sanitizeFrames()
for frameKey in self.__rqCore.getFrameKeys():
try:
info = self.__rqCore.getFrame(frameKey).runningFrameInfo()
Expand Down
1 change: 1 addition & 0 deletions rqd/rqd/rqnetwork.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def __init__(self, rqCore, runFrame):

self.lluTime = 0
self.childrenProcs = {}
self.completeReportSent = False

def runningFrameInfo(self):
"""Returns the RunningFrameInfo object"""
Expand Down

0 comments on commit e343152

Please sign in to comment.