diff --git a/docker/docker-compose.eea.yml b/docker/docker-compose.eea.yml index 1482ad04..f505c01a 100644 --- a/docker/docker-compose.eea.yml +++ b/docker/docker-compose.eea.yml @@ -2,7 +2,7 @@ version: '3' services: frontend: - image: eeacms/copernicus-qctool-frontend:2.1.3 + image: eeacms/copernicus-qctool-frontend:2.1.4 ports: - 8000:8000 environment: @@ -28,7 +28,7 @@ services: - qc_tool_frontend:/mnt/qc_tool_frontend worker: - image: eeacms/copernicus-qctool-worker:2.1.3 + image: eeacms/copernicus-qctool-worker:2.1.4 shm_size: 1gb environment: - PRODUCT_DIRS diff --git a/docker/docker-compose.service_provider.yml b/docker/docker-compose.service_provider.yml index d651d23c..d6d82663 100644 --- a/docker/docker-compose.service_provider.yml +++ b/docker/docker-compose.service_provider.yml @@ -2,7 +2,7 @@ version: '3' services: frontend: - image: eeacms/copernicus-qctool-frontend:2.1.3 + image: eeacms/copernicus-qctool-frontend:2.1.4 ports: - 8000:8000 environment: @@ -23,7 +23,7 @@ services: - qc_tool_volume:/mnt/qc_tool_volume worker: - image: eeacms/copernicus-qctool-worker:2.1.3 + image: eeacms/copernicus-qctool-worker:2.1.4 shm_size: 1gb environment: - PRODUCT_DIRS diff --git a/src/qc_tool/common.py b/src/qc_tool/common.py index b8d10aa7..7313bd75 100644 --- a/src/qc_tool/common.py +++ b/src/qc_tool/common.py @@ -3,11 +3,10 @@ import json import re -import time +import socket import xml.etree.ElementTree as ET from importlib import import_module from os import environ -from os.path import normpath from pathlib import Path from shutil import copyfile from urllib.error import URLError @@ -34,6 +33,8 @@ JOB_PARTIAL = "partial" JOB_FAILED = "failed" JOB_ERROR = "error" +JOB_TIMEOUT = "worker timeout" +JOB_LOST = "worker lost" JOB_INPUT_DIRNAME = "input.d" JOB_OUTPUT_DIRNAME = "output.d" @@ -59,7 +60,7 @@ UNKNOWN_REFERENCE_YEAR_LABEL = "ury" UPDATE_JOB_STATUSES_INTERVAL = 30000 -WORKER_ALIVE_TIMEOUT = 5 +WORKER_ALIVE_TIMEOUT = 20 
REFRESH_JOB_STATUSES_BACKGROUND_INTERVAL = 60 INSPIRE_SERVICE_URL_DEFAULT = "https://sdi.eea.europa.eu/validator/v2/" @@ -134,6 +135,13 @@ def compose_job_dir(job_uuid): job_dir = CONFIG["work_dir"].joinpath("job_{:s}".format(job_uuid)) return job_dir +def compose_job_stdout_filepath(job_uuid): + return CONFIG["work_dir"].joinpath(("job.{:s}.stdout").format(job_uuid)) + +def compose_job_log_filepath(job_uuid): + job_dir = compose_job_dir(job_uuid) + return job_dir.joinpath("job.log") + def create_job_dir(job_uuid): job_dir = compose_job_dir(job_uuid) job_dir.mkdir(parents=True) @@ -260,32 +268,46 @@ def compile_job_report_data(job_uuid, product_ident=None): job_report["steps"][i].update(job_step) return job_report + +def load_job_status(job_uuid): + try: + job_result = load_job_result(job_uuid) + job_status = job_result.get("status", JOB_ERROR) + if job_status is None: + job_status = JOB_ERROR + except FileNotFoundError: + # If the job has already finished there must be correct job result orelse there is some error. + # FIXME: inform logger. + job_status = JOB_ERROR + return job_status + + def check_running_job(job_uuid, worker_url, timeout): job_status = None worker_info = None url = urljoin(worker_url, "/jobs/{:s}.json".format(job_uuid)) try: - with urlopen(url, timeout=int(timeout)) as resp: + with urlopen(url, timeout=float(timeout)) as resp: if resp.status != 200: - # Bad request. - # FIXME: inform logger about such awkward situation. - return JOB_ERROR + # Bad request or timeout. + # This situation might be the case of worker timeout / worker unreachable. + job_status = load_job_status(job_uuid) + if job_status == JOB_ERROR: + return JOB_TIMEOUT worker_info = json.loads(resp.read()) + except (TimeoutError, socket.timeout) as ex: + # This situation might be the case of worker timeout / worker not responding. 
+ job_status = load_job_status(job_uuid) + if job_status == JOB_ERROR: + return JOB_TIMEOUT except URLError as ex: # Cannot connect to worker, maybe the job had already finished and then the worker was shutdown. - # FIXME: make notice to log. - pass + job_status = load_job_status(job_uuid) + if job_status == JOB_ERROR: + return JOB_LOST if worker_info is None: # The job has already finished so load status from job result. - try: - job_result = load_job_result(job_uuid) - job_status = job_result.get("status", JOB_ERROR) - if job_status is None: - job_status = JOB_ERROR - except FileNotFoundError: - # If the job has already finished there must be correct job result orelse there is some error. - # FIXME: inform logger. - return JOB_ERROR + job_status = load_job_status(job_uuid) return job_status diff --git a/src/qc_tool/frontend/dashboard/static/dashboard/js/deliveries.js b/src/qc_tool/frontend/dashboard/static/dashboard/js/deliveries.js index a62ad0c6..9dd88e03 100644 --- a/src/qc_tool/frontend/dashboard/static/dashboard/js/deliveries.js +++ b/src/qc_tool/frontend/dashboard/static/dashboard/js/deliveries.js @@ -73,9 +73,12 @@ function actionsFormatter(value, row) { // for example /setup_job/1234 var btn_data = '
'; - if (row.last_job_status === "waiting" || row.last_job_status === "running" || row.date_submitted) { + if (IS_TEST_GROUP || row.last_job_status === "waiting" || row.last_job_status === "running" || row.date_submitted) { // job is running --> QC button disabled, Delete button disabled var tooltip_message = "QC job is currently running."; + if (IS_TEST_GROUP) { + tooltip_message = "As a test user account you are not allowed to run QC."; + } if (row.is_submitted) { tooltip_message = "Delivery has been already submitted to EEA."; } @@ -89,13 +92,13 @@ function actionsFormatter(value, row) { btn_data += 'QC'; if (IS_TEST_GROUP) { - btn_data += ' '; + btn_data += ' '; } else { - btn_data += '
diff --git a/src/qc_tool/frontend/dashboard/urls.py b/src/qc_tool/frontend/dashboard/urls.py index 6ac206cc..8c8c7164 100644 --- a/src/qc_tool/frontend/dashboard/urls.py +++ b/src/qc_tool/frontend/dashboard/urls.py @@ -31,6 +31,7 @@ path("data/product_descriptions/", views.get_product_descriptions_dropdown, name="product_descriptions_dropdown"), path("data/report//report.json", views.get_job_report, name="job_report_json"), path("data/report//report.pdf", views.get_pdf_report, name="job_report_pdf"), + path("data/log//log.txt", views.get_combined_job_log, name="job_combined_log"), path("upload/", views.resumable_upload_page, name="file_upload"), path("resumable_upload/", views.resumable_upload, name="resumable_upload"), diff --git a/src/qc_tool/frontend/dashboard/views.py b/src/qc_tool/frontend/dashboard/views.py index b99b0d92..3bc1624e 100644 --- a/src/qc_tool/frontend/dashboard/views.py +++ b/src/qc_tool/frontend/dashboard/views.py @@ -40,6 +40,8 @@ from qc_tool.common import JOB_RUNNING from qc_tool.common import JOB_WAITING from qc_tool.common import compose_attachment_filepath +from qc_tool.common import compose_job_log_filepath +from qc_tool.common import compose_job_stdout_filepath from qc_tool.common import compile_job_form_data from qc_tool.common import compile_job_report_data from qc_tool.common import get_job_report_filepath @@ -1069,10 +1071,6 @@ def get_job_info(request, product_ident): job_report = compile_job_form_data(product_ident) return JsonResponse({'job_result': job_report}) -def get_job_report(request, job_uuid): - job = models.Job.objects.get(job_uuid=job_uuid) - job_result = compile_job_report_data(job_uuid, job.product_ident) - return JsonResponse(job_result, safe=False) def get_job_history_json(request, delivery_id): """ @@ -1121,6 +1119,10 @@ def get_result(request, job_uuid): delivery = job.delivery job_report = compile_job_report_data(job_uuid, job.product_ident) + # if job status is not set in the report then try get status from the 
DB table (case of TIMEOUT or LOST) + if job_report.get("status") is None: + job_report["status"] = job.job_status + for step in job_report["steps"]: # Strip initial qc_tool. from check idents. if step["check_ident"].startswith("qc_tool."): @@ -1147,6 +1149,30 @@ def get_pdf_report(request, job_uuid): raise Http404() return response +def get_job_report(request, job_uuid): + job = models.Job.objects.get(job_uuid=job_uuid) + job_result = compile_job_report_data(job_uuid, job.product_ident) + return JsonResponse(job_result, safe=False) + +def get_combined_job_log(request, job_uuid): + stdout_filepath = compose_job_stdout_filepath(job_uuid) + joblog_filepath = compose_job_log_filepath(job_uuid) + + stdout_log_text = "Loading stdout log .." + joblog_log_text = "Loading job log .." + try: + stdout_log_text = stdout_filepath.read_text() + except FileNotFoundError: + stdout_log_text = "stdout log: no data." + + try: + joblog_log_text = joblog_filepath.read_text() + except FileNotFoundError: + joblog_log_text = "job log: no data." 
+ + combined_log = "STDOUT LOG:" + "\n" + stdout_log_text + "\n" + "DETAILED JOB LOG:" + "\n" + joblog_log_text + return HttpResponse(combined_log, content_type="text/plain") + @login_required def download_delivery_file(request, delivery_id): delivery = get_object_or_404(models.Delivery, pk=int(delivery_id)) @@ -1435,5 +1461,5 @@ def refresh_job_statuses(): if job_status != JOB_RUNNING: job.update_status(job_status) updated_count += 1 - logger.info("Status of {:d} running jobs has been updated.".format(updated_count)) + logger.info("refresh_job_statuses: Status of {:d} running jobs has been updated.".format(updated_count)) time.sleep(int(CONFIG["refresh_job_statuses_background_interval"])) diff --git a/src/qc_tool/worker/dispatch.py b/src/qc_tool/worker/dispatch.py index c381bb51..22fe9692 100644 --- a/src/qc_tool/worker/dispatch.py +++ b/src/qc_tool/worker/dispatch.py @@ -37,7 +37,8 @@ log = logging.getLogger(__name__) class TimedOutExc(Exception): - print ("The check has failed due to a timeout.") + pass + #print ("The check has failed due to a timeout.") def signal_handler(signum, frame): raise TimedOutExc() diff --git a/src/qc_tool/worker/report.py b/src/qc_tool/worker/report.py index be16804a..03b7b267 100644 --- a/src/qc_tool/worker/report.py +++ b/src/qc_tool/worker/report.py @@ -22,6 +22,8 @@ from qc_tool.common import JOB_FAILED from qc_tool.common import JOB_OK from qc_tool.common import JOB_PARTIAL +from qc_tool.common import JOB_TIMEOUT +from qc_tool.common import JOB_LOST from qc_tool.common import QCException from qc_tool.common import TIME_FORMAT from qc_tool.common import CONFIG @@ -112,7 +114,7 @@ def footer(canvas, doc): job_status = job_report["status"] if job_status is None: job_status = JOB_ERROR - if job_status in (JOB_ERROR, JOB_FAILED): + if job_status in (JOB_ERROR, JOB_FAILED, JOB_LOST, JOB_TIMEOUT): job_status_style = style_check_failed elif job_status == JOB_OK: job_status_style = style_check_ok diff --git a/src/qc_tool/worker/scheduler.py
b/src/qc_tool/worker/scheduler.py index f33dde42..f6aa3fef 100644 --- a/src/qc_tool/worker/scheduler.py +++ b/src/qc_tool/worker/scheduler.py @@ -176,7 +176,7 @@ def run(self, put_event): log.debug("The job has stdout and stderr redirected to %s.".format(stdout_filepath)) with open(stdout_filepath, "a") as stdout_f: stdout_f.write("\n\n") - stdout_f.write("The job {:s} has started at {:s}+00:00.\n".format(self.job_args["job_uuid"], datetime.utcnow())) + stdout_f.write("The job {:s} has started at {:s}+00:00.\n".format(self.job_args["job_uuid"], datetime.utcnow().isoformat())) stdout_f.write("stdout and stderr of the job is redirected to this file.\n".format(self.job_args["job_uuid"])) stdout_f.write("\n") stdout_f.flush()