diff --git a/.gitignore b/.gitignore
index ecdefd280..28f8248ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,7 +14,6 @@ htmlcov/
 .vscode
 .venv/
 .eggs/*
-.gradle/*
-/cuebot/logs
-/cuebot/bin
-/logs
\ No newline at end of file
+/cuebot/bin/*
+/logs/*
+/.gradle/*
\ No newline at end of file
diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py
index a72d94c0a..5b85efe75 100644
--- a/rqd/rqd/rqcore.py
+++ b/rqd/rqd/rqcore.py
@@ -237,29 +237,6 @@ def __writeFooter(self):
                 "Unable to write footer: %s due to %s at %s",
                 self.runFrame.log_dir_file, e, traceback.extract_tb(sys.exc_info()[2]))
 
-    def __sendFrameCompleteReport(self):
-        """Send report to cuebot that frame has finished"""
-        report = rqd.compiled_proto.report_pb2.FrameCompleteReport()
-        # pylint: disable=no-member
-        report.host.CopyFrom(self.rqCore.machine.getHostInfo())
-        report.frame.CopyFrom(self.frameInfo.runningFrameInfo())
-        # pylint: enable=no-member
-
-        if self.frameInfo.exitStatus is None:
-            report.exit_status = 1
-        else:
-            report.exit_status = self.frameInfo.exitStatus
-
-        report.exit_signal = self.frameInfo.exitSignal
-        report.run_time = int(self.frameInfo.runTime)
-
-        # If nimby is active, then frame must have been killed by nimby
-        # Set the exitSignal to indicate this event
-        if self.rqCore.nimby.locked and not self.runFrame.ignore_nimby:
-            report.exit_status = rqd.rqconstants.EXITSTATUS_FOR_NIMBY_KILL
-
-        self.rqCore.network.reportRunningFrameCompletion(report)
-
     def __cleanup(self):
         """Cleans up temporary files"""
         rqd.rqutil.permissionsHigh()
@@ -551,7 +528,7 @@ def run(self):
 
             self.rqCore.deleteFrame(self.runFrame.frame_id)
 
-            self.__sendFrameCompleteReport()
+            self.rqCore.sendFrameCompleteReport(self.frameInfo)
             time_till_next = (
                     (self.rqCore.intervalStartTime + self.rqCore.intervalSleepTime) - time.time())
             if time_till_next > (2 * rqd.rqconstants.RQD_MIN_PING_INTERVAL_SEC):
@@ -723,6 +700,9 @@ def deleteFrame(self, frameId):
                         self.cores.reserved_cores)
                     # pylint: disable=no-member
self.cores.reserved_cores.clear() + log.info("Successfully delete frame with Id: %s", frameId) + else: + log.warning("Frame with Id: %s not found in cache", frameId) def killAllFrame(self, reason): """Will execute .kill() on every frame in cache until no frames remain @@ -1080,3 +1060,50 @@ def sendStatusReport(self): def isWaitingForIdle(self): """Returns whether the host is waiting until idle to take some action.""" return self.__whenIdle + + def sendFrameCompleteReport(self, runningFrame): + """Send a frameCompleteReport to Cuebot""" + if not runningFrame.completeReportSent: + report = rqd.compiled_proto.report_pb2.FrameCompleteReport() + # pylint: disable=no-member + report.host.CopyFrom(self.machine.getHostInfo()) + report.frame.CopyFrom(runningFrame.runningFrameInfo()) + # pylint: enable=no-member + + if runningFrame.exitStatus is None: + report.exit_status = 1 + else: + report.exit_status = runningFrame.exitStatus + + report.exit_signal = runningFrame.exitSignal + report.run_time = int(runningFrame.runTime) + + # If nimby is active, then frame must have been killed by nimby + # Set the exitSignal to indicate this event + if self.nimby.locked and not runningFrame.ignoreNimby: + report.exit_status = rqd.rqconstants.EXITSTATUS_FOR_NIMBY_KILL + + self.network.reportRunningFrameCompletion(report) + runningFrame.completeReportSent = True + + def sanitizeFrames(self): + """ + Iterate over the cache and update the status of frames that might have + completed but never reported back to cuebot. 
+ """ + for frameId, runningFrame in self.__cache.items(): + # If the frame was marked as completed (exitStatus) and a report has not been sent + # try to file the report again + if runningFrame.exitStatus is not None and not runningFrame.completeReportSent: + try: + self.sendFrameCompleteReport(runningFrame) + self.deleteFrame(frameId) + log.info("Successfully deleted frame from cache for %s/%s (%s)", + runningFrame.runFrame.job_name, + runningFrame.runFrame.frame_name, + frameId) + # pylint: disable=broad-except + except Exception: + log.exception("Failed to sanitize frame %s/%s", + runningFrame.runFrame.job_name, + runningFrame.runFrame.frame_name) diff --git a/rqd/rqd/rqmachine.py b/rqd/rqd/rqmachine.py index fc7fa59ab..1f67798e3 100644 --- a/rqd/rqd/rqmachine.py +++ b/rqd/rqd/rqmachine.py @@ -809,6 +809,7 @@ def getHostReport(self): self.__hostReport.host.CopyFrom(self.getHostInfo()) self.__hostReport.ClearField('frames') + self.__rqCore.sanitizeFrames() for frameKey in self.__rqCore.getFrameKeys(): try: info = self.__rqCore.getFrame(frameKey).runningFrameInfo() diff --git a/rqd/rqd/rqnetwork.py b/rqd/rqd/rqnetwork.py index de1b38475..b4b955b10 100644 --- a/rqd/rqd/rqnetwork.py +++ b/rqd/rqd/rqnetwork.py @@ -79,6 +79,7 @@ def __init__(self, rqCore, runFrame): self.lluTime = 0 self.childrenProcs = {} + self.completeReportSent = False def runningFrameInfo(self): """Returns the RunningFrameInfo object"""