Commit 8a76eea

Merge pull request #100 from legend-exp/ci

CI improvements

2 parents: 9d7303a + 8961beb

17 files changed: +239, -64 lines

.github/workflows/main.yml (+2, -15)

@@ -68,21 +68,8 @@ jobs:
           token: ${{ secrets.CLONE_LEGEND_METADATA }}
           path: ${{ env.LEGEND_METADATA }}
 
-      - name: Get dependencies and install legend-dataflow
-        run: |
-          python -m pip install --upgrade uv
-          python -m uv pip install --upgrade .[runprod]
-
-      - name: Set the PRODENV variable
-        run: |
-          echo "PRODENV=$(realpath $GITHUB_WORKSPACE/..)" >> $GITHUB_ENV
-
-      - name: run workflows in dry-run mode
-        run: |
-          snakemake --workflow-profile workflow/profiles/lngs-build-raw -n all-*-daq.gen
-          snakemake --workflow-profile workflow/profiles/lngs-build-raw -n all-*-raw.gen
-          snakemake --workflow-profile workflow/profiles/lngs -n all-*-evt.gen
-          snakemake --workflow-profile workflow/profiles/lngs -n all-*-skm.gen
+      - name: Run data production tests
+        run: ./tests/runprod/run-all.sh
 
   test-coverage:
     name: Calculate and upload test coverage
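The replaced dry-run steps are now encapsulated in a single entry point, so the same checks can be reproduced outside of CI. A minimal sketch, assuming a local legend-dataflow checkout and access to the legend-metadata repository (which CI clones using a token):

    # sketch: reproduce the CI step locally, from the repository root;
    # run-all.sh installs the package into a test environment and then
    # executes every tests/runprod/test-*.sh script
    cd legend-dataflow
    ./tests/runprod/run-all.sh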

codecov.yml (+17)

@@ -0,0 +1,17 @@
+codecov:
+  require_ci_to_pass: true
+
+coverage:
+  status:
+    project:
+      default:
+        enabled: no
+    patch:
+      default:
+        enabled: no
+    changes:
+      default:
+        enabled: no
+
+github_checks:
+  annotations: false

pyproject.toml (+2, -2)

@@ -51,10 +51,10 @@ dynamic = ["version"]
 
 dependencies = [
     "colorlog",
-    "dbetto>=1.2.0",
+    "dbetto>=1.2",
     "pygama>=2",
     "dspeed>=1.6",
-    "pylegendmeta==1.2.0a2",
+    "pylegendmeta>=1.2",
     "legend-pydataobj>=1.11.6",
     "legend-daq2lh5>=1.4",
     "pip",

tests/runprod/conftest.sh (+30)

@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+# IMPORTANT: this script must be *sourced* from the legend-dataflow directory
+
+_prod_cycle="$(realpath .)"
+
+function get_dataflow_config_value() {
+    python -c "import dbetto; print(dbetto.AttrsDict(dbetto.utils.load_dict('${_prod_cycle}/dataflow-config.yaml')).${1})" \
+        | sed "s|\$_|${_prod_cycle}|g"
+}
+
+run_test_command() {
+    echo "::notice::$*"
+
+    output=$("$@" 2>&1)
+    status=$?
+
+    if [ $status -ne 0 ]; then
+        echo "::error::command failed with status $status"
+        echo "$output"
+    fi
+
+    return $status
+}
+
+
+export -f get_dataflow_config_value run_test_command
+
+PRODENV="$(realpath ..)"
+export PRODENV
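For illustration, a hedged sketch of how these helpers combine once sourced: get_dataflow_config_value resolves a dotted key from dataflow-config.yaml via dbetto and expands the $_ placeholder to the production-cycle directory, while run_test_command wraps a command in GitHub Actions annotations. The paths.tier_raw key is taken from the test scripts below; the expanded value is illustrative only.

    # must be *sourced* from the legend-dataflow directory
    source tests/runprod/conftest.sh

    # e.g. a config value "$_/generated/tier/raw" expands to
    # "<prod-cycle>/generated/tier/raw"
    rawdir="$(get_dataflow_config_value paths.tier_raw)"

    # echoes the command as a ::notice::; on failure, emits an ::error::
    # annotation plus the captured output, and returns the exit status
    run_test_command ls "${rawdir}"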

tests/runprod/install.sh (+15)

@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+
+# IMPORTANT: this script must be executed from the legend-dataflow directory
+
+echo "::group::setting up test environment"
+
+PRODENV="$(realpath ..)"
+export PRODENV
+
+python -m pip --quiet install --upgrade pip wheel setuptools
+python -m pip --quiet install --upgrade '.[runprod]'
+
+dataprod -v install --remove --system bare -- dataflow-config.yaml
+
+echo "::endgroup::"

tests/runprod/run-all.sh (+11)

@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+# IMPORTANT: this script must be executed from the legend-dataflow directory
+
+./tests/runprod/install.sh
+
+for test in tests/runprod/test-*.sh; do
+    echo "::group::test $test"
+    ./"$test" || exit 1
+    echo "::endgroup::"
+done
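Because every test script sources conftest.sh itself, individual tests can presumably also be run in isolation after the one-time setup:

    # from the legend-dataflow directory
    ./tests/runprod/install.sh    # set up the test environment once
    ./tests/runprod/test-raw.sh   # then run a single test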

tests/runprod/test-evt.sh (+50)

@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
+# IMPORTANT: this script must be executed from the legend-dataflow directory
+
+# shellcheck disable=SC1091
+source "$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)/conftest.sh"
+
+rawdir="$(get_dataflow_config_value paths.tier_raw)"
+mkdir -p "${rawdir}" || exit 1
+
+function mkdir_n_touch() {
+    mkdir -p "$(dirname "${1}")" || return 1
+    touch "${1}" || return 1
+}
+
+rawfiles=(
+    phy/p04/r001/l200-p04-r001-phy-20230421T174901Z-tier_raw.lh5
+    phy/p04/r000/l200-p04-r000-phy-20230415T033517Z-tier_raw.lh5
+    phy/p03/r001/l200-p03-r001-phy-20230318T015140Z-tier_raw.lh5
+    phy/p03/r000/l200-p03-r000-phy-20230312T043356Z-tier_raw.lh5
+    phy/p03/r002/l200-p03-r002-phy-20230324T205907Z-tier_raw.lh5
+    cal/p04/r001/l200-p04-r001-cal-20230421T131817Z-tier_raw.lh5
+    cal/p04/r000/l200-p04-r000-cal-20230414T215158Z-tier_raw.lh5
+    cal/p03/r001/l200-p03-r001-cal-20230317T211819Z-tier_raw.lh5
+    cal/p03/r000/l200-p03-r000-cal-20230311T235840Z-tier_raw.lh5
+    cal/p03/r002/l200-p03-r002-cal-20230324T161401Z-tier_raw.lh5
+    anp/p13/r002/l200-p13-r002-anp-20241217T094846Z-tier_raw.lh5
+    anc/p13/r006/l200-p13-r006-anc-20241221T150249Z-tier_raw.lh5
+    acs/p13/r006/l200-p13-r006-acs-20241221T150307Z-tier_raw.lh5
+)
+
+(
+    cd "${rawdir}" || exit 1
+    for file in "${rawfiles[@]}"; do
+        mkdir_n_touch "$file"
+    done
+)
+
+inputs="$(get_dataflow_config_value paths.metadata)"
+
+# FIXME: remove these at some point
+touch "$inputs/dataprod/overrides/dsp/cal/p03/r000/l200-p03-r000-cal-20230311T235840Z-par_dsp_svm_train.lh5"
+touch "$inputs/dataprod/overrides/dsp/cal/p04/r000/l200-p04-r000-cal-20230414T215158Z-par_dsp_svm_train.lh5"
+
+_smk_opts=(
+    --touch
+    --workflow-profile workflow/profiles/default
+)
+
+run_test_command snakemake "${_smk_opts[@]}" "all-*-evt.gen" || exit 1

tests/runprod/test-raw.sh (+43)

@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+# IMPORTANT: this script must be executed from the legend-dataflow directory
+
+# shellcheck disable=SC1091
+source "$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)/conftest.sh"
+
+sandbox=$(get_dataflow_config_value paths.sandbox_path)
+mkdir -p "${sandbox}"
+
+(
+    cd "${sandbox}" || exit 1
+    touch \
+        l200-p03-r000-cal-20230311T235840Z.orca \
+        l200-p03-r001-cal-20230317T211819Z.orca \
+        l200-p03-r002-cal-20230324T161401Z.orca \
+        l200-p04-r000-cal-20230414T215158Z.orca \
+        l200-p04-r001-cal-20230421T131817Z.orca \
+        l200-p03-r000-phy-20230312T043356Z.orca \
+        l200-p03-r001-phy-20230318T015140Z.orca \
+        l200-p03-r002-phy-20230324T205907Z.orca \
+        l200-p04-r000-phy-20230415T033517Z.orca \
+        l200-p04-r001-phy-20230421T174901Z.orca \
+        l200-p13-r006-acs-20241221T150307Z.fcio \
+        l200-p13-r006-anc-20241221T150249Z.fcio \
+        l200-p13-r002-anp-20241217T094846Z.fcio
+)
+
+# FIXME: --touch does not do what I thought. need to add this functionality to
+# the future plugin
+_smk_opts=(
+    --forcerun
+    --touch
+    --config system=bare
+    --cores all
+    --workflow-profile workflow/profiles/lngs-build-raw
+)
+
+for tier in daq raw; do
+    run_test_command snakemake "${_smk_opts[@]}" "all-*-${tier}.gen" || exit 1
+done
+
+rm -rf "${sandbox}"

workflow/Snakefile (+4, -21)

@@ -80,10 +80,11 @@ localrules:
 onstart:
     print("INFO: starting workflow")
     # Make sure some packages are initialized before we begin to avoid race conditions
-    for pkg in ["dspeed", "lgdo", "matplotlib"]:
-        shell(execenv.execenv_pyexe(config, "python") + "-c 'import " + pkg + "'")
+    if not workflow.touch:
+        for pkg in ["dspeed", "lgdo", "matplotlib"]:
+            shell(execenv.execenv_pyexe(config, "python") + "-c 'import " + pkg + "'")
 
-    # Log parameter catalogs in validity files
+    # Log parameter catalogs in validity files
     hit_par_cat_file = Path(utils.pars_path(config)) / "hit" / "validity.yaml"
     if hit_par_cat_file.is_file():
         hit_par_cat_file.unlink()
@@ -122,29 +123,11 @@ onstart:
 
 
 onsuccess:
-    from snakemake.report import auto_report
-
-
-    rep_dir = f"{log_path(config)}/report-{datetime.strftime(datetime.utcnow(), '%Y%m%dT%H%M%SZ')}"
-    Path(rep_dir).mkdir(parents=True, exist_ok=True)
-    # auto_report(workflow.persistence.dag, f"{rep_dir}/report.html")
-    auto_report(workflow.persistence.dag, report_plugin, report_settings)
-
-    with open(os.path.join(rep_dir, "dag.txt"), "w") as f:
-        f.writelines(str(workflow.persistence.dag))
-    # shell(f"cat {rep_dir}/dag.txt | dot -Tpdf > {rep_dir}/dag.pdf")
-
-    with open(f"{rep_dir}/rg.txt", "w") as f:
-        f.writelines(str(workflow.persistence.dag.rule_dot()))
-    # shell(f"cat {rep_dir}/rg.txt | dot -Tpdf > {rep_dir}/rg.pdf")
-
-    # remove .gen files
     files = glob.glob("*.gen")
     for file in files:
         if os.path.isfile(file):
             os.remove(file)
 
-    # remove filelists
     files = glob.glob(os.path.join(utils.filelist_path(config), "*"))
     for file in files:
         if os.path.isfile(file):

workflow/Snakefile-build-raw (+17, -6)

@@ -54,7 +54,10 @@ onstart:
     print("INFO: initializing workflow")
 
     # Make sure some packages are initialized before we send jobs to avoid race conditions
-    shell(execenv.execenv_pyexe(config, "python") + " -c 'import daq2lh5, matplotlib'")
+    if not workflow.touch:
+        shell(
+            execenv.execenv_pyexe(config, "python") + " -c 'import daq2lh5, matplotlib'"
+        )
 
     raw_par_cat_file = Path(utils.pars_path(config)) / "raw" / "validity.yaml"
     if raw_par_cat_file.is_file():
@@ -87,16 +90,24 @@ rule gen_filelist:
 
 
 rule sort_data:
-    """
-    This rules moves the daq data from the unsorted sandbox dir
-    to the sorted dirs under generated
+    """Move DAQ data from sandbox to organized folder.
+
+    This rules moves the DAQ data from the unsorted sandbox directory to the
+    correct location in the `tier_raw` folder.
     """
     input:
-        patt.get_pattern_tier_daq_unsorted(config, extension="fcio"),
+        patt.get_pattern_tier_daq_unsorted(config),
     output:
-        patt.get_pattern_tier_daq(config, extension="fcio"),
+        patt.get_pattern_tier_daq(config),
     shell:
         "mv {input} {output}"
 
 
+use rule sort_data as sort_data_fcio with:
+    input:
+        patt.get_pattern_tier_daq_unsorted(config, extension="fcio"),
+    output:
+        patt.get_pattern_tier_daq(config, extension="fcio"),
+
+
 # vim: filetype=snakemake

workflow/rules/ann.smk (+6, -2)

@@ -15,7 +15,9 @@ from legenddataflow.execenv import execenv_pyexe
 rule build_ann:
     input:
         dsp_file=get_pattern_tier(config, "dsp", check_in_cycle=False),
-        pars_file=lambda wildcards: get_input_par_file(wildcards, "ann", "cuts"),
+        pars_file=lambda wildcards: get_input_par_file(
+            setup=config, wildcards=wildcards, tier="ann", name="cuts"
+        ),
     params:
         timestamp="{timestamp}",
         datatype="{datatype}",
@@ -45,7 +47,9 @@ rule build_ann:
 rule build_pan:
     input:
         dsp_file=get_pattern_tier(config, "psp", check_in_cycle=False),
-        pars_file=lambda wildcards: get_input_par_file(wildcards, "ann", "cuts"),
+        pars_file=lambda wildcards: get_input_par_file(
+            setup=config, wildcards=wildcards, tier="ann", name="cuts"
+        ),
     params:
         timestamp="{timestamp}",
         datatype="{datatype}",

workflow/rules/common.smk (+7, -2)

@@ -66,16 +66,21 @@ def set_last_rule_name(workflow, new_name):
     workflow.check_localrules()
 
 
-def get_input_par_file(wildcards, tier, name):
+def get_input_par_file(setup, wildcards, tier, name):
+    allow_none = setup.get("allow_none", False)
     par_overwrite_file = Path(patt.par_overwrite_path(config)) / tier / "validity.yaml"
     pars_files_overwrite = Catalog.get_files(
         par_overwrite_file,
         wildcards.timestamp,
+        category=wildcards.datatype if hasattr(wildcards, "datatype") else "all",
     )
     for pars_file in pars_files_overwrite:
         if name in str(pars_file):
             return Path(patt.par_overwrite_path(config)) / tier / pars_file
-    raise ValueError(f"Could not find model in {pars_files_overwrite}")
+    if allow_none or (wildcards.datatype != "phy"):
+        return []
+    else:
+        raise ValueError(f"Could not find model in {pars_files_overwrite}")
 
 
 def get_overwrite_file(tier, wildcards=None, timestamp=None, name=None):
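Since every call site passes the workflow config as setup, the new allow_none fallback can presumably be toggled per invocation through Snakemake's --config flag; a sketch under that assumption:

    # assumption: with allow_none set, a missing parameter-override file
    # resolves to an empty input list instead of raising ValueError
    snakemake --config allow_none=True -n "all-*-evt.gen"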

workflow/rules/dsp_pars_geds.smk (+4, -2)

@@ -214,10 +214,12 @@ rule build_pars_dsp_eopt_geds:
 rule build_svm_dsp_geds:
     input:
         hyperpars=lambda wildcards: get_input_par_file(
-            wildcards, "dsp", "svm_hyperpars"
+            setup=config, wildcards=wildcards, tier="dsp", name="svm_hyperpars"
         ),
         train_data=lambda wildcards: str(
-            get_input_par_file(wildcards, "dsp", "svm_hyperpars")
+            get_input_par_file(
+                setup=config, wildcards=wildcards, tier="dsp", name="svm_hyperpars"
+            )
         ).replace("hyperpars.yaml", "train.lh5"),
     params:
         timestamp="{timestamp}",

workflow/rules/evt.smk (+2, -2)

@@ -26,7 +26,7 @@ rule build_evt:
             config, wildcards.timestamp, "hit"
         ),
         xtalk_matrix=lambda wildcards: get_input_par_file(
-            tier="evt", wildcards=wildcards, name="xtc"
+            setup=config, tier="evt", wildcards=wildcards, name="xtc"
         ),
     output:
         get_pattern_tier(config, "evt", check_in_cycle=check_in_cycle),
@@ -77,7 +77,7 @@ rule build_pet:
             config, wildcards.timestamp, "pht"
         ),
         xtalk_matrix=lambda wildcards: get_input_par_file(
-            tier="pet", wildcards=wildcards, name="xtc"
+            setup=config, tier="pet", wildcards=wildcards, name="xtc"
         ),
     output:
         get_pattern_tier(config, "pet", check_in_cycle=check_in_cycle),

workflow/rules/psp_pars_geds.smk (+4, -2)

@@ -167,10 +167,12 @@ workflow._ruleorder.add(*rule_order_list) # [::-1]
 rule build_svm_psp:
     input:
         hyperpars=lambda wildcards: get_input_par_file(
-            wildcards, "psp", "svm_hyperpars"
+            setup=config, wildcards=wildcards, tier="psp", name="svm_hyperpars"
         ),
         train_data=lambda wildcards: str(
-            get_input_par_file(wildcards, "psp", "svm_hyperpars")
+            get_input_par_file(
+                setup=config, wildcards=wildcards, tier="psp", name="svm_hyperpars"
+            )
         ).replace("hyperpars.yaml", "train.lh5"),
     output:
         dsp_pars=get_pattern_pars(config, "psp", "svm", "pkl"),
