34 changes: 34 additions & 0 deletions .github/workflows/DEV_prtncon_site_list.yml
@@ -0,0 +1,34 @@
name: DEV-prtncon-site-list
on:
  push:
    branches:
      - 'master'
    paths:
      - 'pipe/prtncon/site-list.json'
  workflow_dispatch: {} # Allows trigger of workflow from web interface
jobs:
  put_files:
    runs-on: arc-neon-gke
    #runs-on: ubuntu-latest
    env:
      PACHD_ADDRESS: grpcs://pachyderm-dev.transitions-nonprod.gcp.neoninternal.org:443
      PACH_TOKEN: ${{ secrets.RepoOwnerPachydermDev }}
      REPO: prtncon_site_list # Pachyderm repo
      BRANCH: master
      IN_PATHS: 'pipe/prtncon/site-list.json' # Comma-separated list (no spaces) of one or more paths or directories. Length must match OUT_PATHS. If a directory, all files in the directory will be placed in Pachyderm at the corresponding entry of OUT_PATHS.
      OUT_PATHS: 'site-list.json' # Comma-separated list (no spaces) of the corresponding path(s) at which to place the file(s) in Pachyderm. Must be the same length as IN_PATHS. If the corresponding entry in IN_PATHS is a file, specify the path to the file. If the corresponding entry in IN_PATHS is a directory, specify the directory.
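      # Hypothetical multi-entry example of the IN_PATHS/OUT_PATHS mapping (illustrative only; the second path does not exist in this repo):
      #   IN_PATHS: 'pipe/prtncon/site-list.json,pipe/prtncon/conf'
      #   OUT_PATHS: 'site-list.json,conf'
      # i.e. the Nth entry of IN_PATHS is written to the Nth entry of OUT_PATHS in the Pachyderm repo.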
    steps:
      - uses: actions/checkout@v4
      - run: ls -la

      - name: Put file
        uses: ./.github/actions/put-files
        with:
          pachd_address: ${{ env.PACHD_ADDRESS }}
          pach_token: ${{ env.PACH_TOKEN }}
          repo_name: ${{ env.REPO }}
          branch_name: ${{ env.BRANCH }}
          in_paths: ${{ env.IN_PATHS }}
          out_paths: ${{ env.OUT_PATHS }}


Comment on lines +11 to +34

Check warning

Code scanning / CodeQL

Workflow does not contain permissions (Medium)

Actions job or workflow does not limit the permissions of the GITHUB_TOKEN. Consider setting an explicit permissions block, using the following as a minimal starting point: {contents: read}

Copilot Autofix

AI 7 months ago

To fix the issue, we will add a permissions block at the root level of the workflow file. This block will specify the least privileges required for the workflow to function correctly. Based on the provided workflow, the most likely required permission is contents: read, as the workflow primarily interacts with repository files. If the custom action (put-files) requires additional permissions, they can be added later after further analysis.


Suggested changeset 1
.github/workflows/DEV_prtncon_site_list.yml

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/.github/workflows/DEV_prtncon_site_list.yml b/.github/workflows/DEV_prtncon_site_list.yml
--- a/.github/workflows/DEV_prtncon_site_list.yml
+++ b/.github/workflows/DEV_prtncon_site_list.yml
@@ -1,2 +1,4 @@
 name: DEV-prtncon-site-list
+permissions:
+  contents: read
 on:
EOF
61 changes: 61 additions & 0 deletions .github/workflows/DEV_prtncon_update_dag.yml
@@ -0,0 +1,61 @@
name: DEV-prtncon-update-dag
on:
  push:
    branches:
      - 'master'
    paths:
      - 'pipe/prtncon/*.yaml'
      - 'pipe/prtncon/pipe_list_prtncon.txt'
  workflow_dispatch: {} # Allows trigger of workflow from web interface

jobs:
  # -------------------------------------------------------------
  # Using GitHub's API is not supported for push events
  # -------------------------------------------------------------
  #
  # ----------------------------------------------------------------------------------------------
  # Using local .git history
  # ----------------------------------------------------------------------------------------------
  # Event `push`: Compare the preceding remote commit -> to the current commit of the main branch
  # ----------------------------------------------------------------------------------------------
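  #
  # A minimal sketch of that comparison (an illustration only, not the actual implementation of the
  # local ./.github/actions/get-changed-files action used below):
  #   git diff --name-only ${{ github.event.before }} ${{ github.sha }}
  # ----------------------------------------------------------------------------------------------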

  changed_files:
    runs-on: ubuntu-latest # windows-latest || macos-latest
    outputs:
      # Use this changed_file_list if you plan to use get-changed-files-action
      changed_file_list: ${{ steps.changed-files-action.outputs.changed_file_list }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0 # OR "2" -> To retrieve the preceding commit.

      # Using get-changed-files-action
      - name: Get changed files action
        id: changed-files-action
        uses: ./.github/actions/get-changed-files

  update_pipelines:
Comment on lines +23 to +37

Check warning

Code scanning / CodeQL

Workflow does not contain permissions (Medium)

Actions job or workflow does not limit the permissions of the GITHUB_TOKEN. Consider setting an explicit permissions block, using the following as a minimal starting point: {contents: read}

Copilot Autofix

AI 7 months ago

To fix the issue, add a permissions block to the workflow to explicitly limit the GITHUB_TOKEN permissions. Based on the workflow's functionality, it primarily reads repository contents and processes changes, so contents: read is sufficient. This block should be added at the root level of the workflow to apply to all jobs unless overridden.


Suggested changeset 1
.github/workflows/DEV_prtncon_update_dag.yml

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/.github/workflows/DEV_prtncon_update_dag.yml b/.github/workflows/DEV_prtncon_update_dag.yml
--- a/.github/workflows/DEV_prtncon_update_dag.yml
+++ b/.github/workflows/DEV_prtncon_update_dag.yml
@@ -1,2 +1,4 @@
 name: DEV-prtncon-update-dag
+permissions:
+  contents: read
 on:
EOF
    needs: changed_files
    runs-on: arc-neon-gke
    #runs-on: ubuntu-latest
    env:
      PACHD_ADDRESS: grpcs://pachyderm-dev.transitions-nonprod.gcp.neoninternal.org:443
      PACH_TOKEN: ${{ secrets.RepoOwnerPachydermDev }}
      PATHS: 'pipe/prtncon=pipe_list_prtncon.txt' # Format: '<directory>=<pipe_list_file>'. Separate multiple with comma (e.g. 'pipe/pqs1=pipe_list_pqs1.txt,pipe/parWaterSurface=pipe_list_parWaterSurface.txt'). Order matters.
      TRANSACTION: True
      UPDATE_SCOPE: changed # 'all' or 'changed'. If not specified, all will be updated. 'changed' will update/create any changed/non-existent pipelines.
      CHANGED_FILES: ${{needs.changed_files.outputs.changed_file_list}}
    steps:
      - uses: actions/checkout@v4
      - run: ls -la

      - name: Update pipelines
        uses: ./.github/actions/update-pipelines
        with:
          pachd_address: ${{ env.PACHD_ADDRESS }}
          pach_token: ${{ env.PACH_TOKEN }}
          paths: ${{ env.PATHS }}
          transaction: ${{ env.TRANSACTION }}
          update_scope: ${{ env.UPDATE_SCOPE }}
          changed_files: ${{ env.CHANGED_FILES }}

Comment on lines +38 to +61

Check warning

Code scanning / CodeQL

Workflow does not contain permissions (Medium)

Actions job or workflow does not limit the permissions of the GITHUB_TOKEN. Consider setting an explicit permissions block, using the following as a minimal starting point: {contents: read}

Copilot Autofix

AI 7 months ago

To fix the issue, we will add a permissions block to the workflow. This block will specify the minimal permissions required for the workflow to function correctly. Based on the workflow's operations, the contents: read permission is sufficient for the changed_files job, as it only checks for changed files. For the update_pipelines job, additional permissions might be required depending on its interaction with the repository or external systems. However, since the provided code does not indicate any repository write operations, we will start with contents: read for both jobs.


Suggested changeset 1
.github/workflows/DEV_prtncon_update_dag.yml

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/.github/workflows/DEV_prtncon_update_dag.yml b/.github/workflows/DEV_prtncon_update_dag.yml
--- a/.github/workflows/DEV_prtncon_update_dag.yml
+++ b/.github/workflows/DEV_prtncon_update_dag.yml
@@ -1,2 +1,4 @@
 name: DEV-prtncon-update-dag
+permissions:
+  contents: read
 on:
EOF
3 changes: 3 additions & 0 deletions pipe/prtncon/pipe_list_prtncon.txt
@@ -0,0 +1,3 @@
prtncon_cron_daily_and_date_control.yaml
prtncon_data_source_kafka.yaml
prtncon_data_source_trino.yaml
57 changes: 57 additions & 0 deletions pipe/prtncon/prtncon_cron_daily_and_date_control.yaml
@@ -0,0 +1,57 @@
---
pipeline:
  name: prtncon_cron_daily_and_date_control
transform:
  image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-date-cntl:v2.0.1
  cmd: ["/bin/bash"]
  env:
    # START_DATE ("YYYY-MM-DD") and END_DATE ("YYYY-MM-DD") indicate the max date range (inclusive) to create the /Y/M/D folder structure
    # If START_DATE is not set (remove line entirely to unset), the start_date and/or the kafka_start_date for each site will be used, as indicated in the site-list json file
    # start_date field in the site-list file is the earliest date to pull data from a site
    # kafka_start_date in the site-list file is the first full day from which data began streaming via Kafka
    # END_DATE can be set or unset (remove line entirely to unset). If unset, end date will be yesterday.
    OUT_PATH: /pfs/out
    START_DATE: "2025-05-01" # Inclusive
    # END_DATE: "2023-11-23" # Inclusive
    SOURCE_TYPE: "prtncon"
  stdin:
    - "#!/bin/bash"
    - python3 -m cron_daily_and_date_control.cron_daily_and_date_control_main
input:
  cross:
    # This cron is the central driver for daily scheduled updates, such as data ingest and metadata loaders.
    - cron:
        name: tick
        spec: "0 7 * * *" # Run at 00:00 MST (07:00 GMT)
        overwrite: true
    - pfs:
        name: SITE_FILE
        repo: prtncon_site_list
        glob: /site-list.json
resource_requests:
  memory: 100M
  cpu: 1
resource_limits:
  memory: 300M
  cpu: 1.5
sidecar_resource_requests:
  memory: 500M
  cpu: 0.5
autoscaling: true
scheduling_spec:
  node_selector:
    cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
    nodepool.neonscience.org/pipeline: "yes"
    cloud.google.com/gke-spot: "true"
pod_spec: |-
  { "tolerations": [
    {
      "key": "nodepool.neonscience.org/pipeline",
      "operator": "Exists"
    },
    {
      "effect": "NoSchedule",
      "key": "cloud.google.com/gke-spot",
      "operator": "Exists"
    }
  ] }
177 changes: 177 additions & 0 deletions pipe/prtncon/prtncon_data_source_kafka.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
---
pipeline:
  name: prtncon_data_source_kafka
transform:
  image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:v4.7.4
  image_pull_secrets:
    - battelleecology-quay-read-all-pull-secret
  env:
    OUT_PATH: /pfs/out
    SOURCE_TYPE: "prtncon"
    LOG_LEVEL: INFO
    YEAR_INDEX: "5"
    MONTH_INDEX: "6"
    DAY_INDEX: "7"
    KAFKA_RETENTION_DAYS: "15"
  secrets:
    - name: pachyderm-kafka-auth
      env_var: KAFKA_USER
      key: KAFKA_USER
    - name: pachyderm-kafka-auth
      env_var: KAFKA_PASSWORD
      key: KAFKA_PASSWORD
    - name: pachyderm-kafka-auth
      env_var: KAFKA_BROKER
      key: KAFKA_BROKER
    - name: l0-bucket
      env_var: BUCKET_NAME
      key: LO_BUCKET
    - name: pdr-secret
      env_var: PDR_HOST
      key: hostname
    - name: pdr-secret
      env_var: PDR_DBNAME
      key: database
    - name: pdr-secret
      env_var: PDR_USER
      key: username
    - name: pdr-secret
      env_var: PDR_PASSWORD
      key: password
  cmd:
    - sh
    - "-c"
    - |-
      /bin/bash <<'EOF'

      # Use bash strict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/
      set -euo pipefail
      IFS=$'\n\t'

      # Get GCP zone
      meta=$(curl -sH "Metadata-Flavor: Google" "http://metadata/computeMetadata/v1/instance/zone")
      zone=$(echo $meta | cut -d "/" -f 4)
      echo $zone

      # Get today's date for evaluating kafka data retention period
      date_today=$(date -u +%Y-%m-%d)
      kafka_min_date=$(date -u -d "$KAFKA_RETENTION_DAYS days ago" +%Y-%m-%d)
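      # Hypothetical worked example (dates are illustrative only): with KAFKA_RETENTION_DAYS=15 and
      # date_today=2025-05-20, kafka_min_date=2025-05-05, so datums dated before 2025-05-05 are skipped below.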

      # Get date from input path. Terminal path structure must be /SOURCE_TYPE/YYYY/MM/DD/SITE_FILE
      # Datum must be set at /SOURCE_TYPE/YYYY/MM/DD or /SOURCE_TYPE/YYYY/MM/DD/SITE_FILE
      date_path=$(echo $import_trigger | cut -f $YEAR_INDEX,$MONTH_INDEX,$DAY_INDEX -d "/")
      echo $date_path
      date_str=$(date -u +%Y-%m-%d -d $date_path)

      # Get each site to run
      if [[ -f ${import_trigger} ]]; then
        import_trigger_glob="${import_trigger}"
      else
        import_trigger_glob="${import_trigger}/*"
      fi

      sites_output=()

      for site_kafka in $import_trigger_glob; do
        site_file=$(basename $site_kafka) # Strip off any path prefix
        site=$(echo $site_file | cut -f 1 -d "." --only-delimited) # Extract the site from site.kafka. Ignore site-only files (e.g. CPER vs. CPER.kafka)
        type=$(echo $site_file | cut -f 2 -d "." --only-delimited) # Extract the 'kafka' from site.kafka
        if [ "$type" != "kafka" ]
        then
          echo "$site_file is not indicated to be streaming from Kafka. Skipping..."
          continue
        elif [ "$(date -u +%s -d "$date_str")" -lt "$(date -u +%s -d "$kafka_min_date")" ]
        then
          echo -n "Cannot extract $date_str Kafka data for $site. "
          echo -n "Today's date ($date_today) is beyond the Kafka retention period ($KAFKA_RETENTION_DAYS days). Skipping..."
          continue
        fi

        # We are ok to run
        echo "Extracting $date_str kafka data for $site"

        # Get "current data" - data that came in on the specified day, which is the same day it was measured
        # Note: We cannot use the --removeoffset flag on the kafka loader (which removes the offsets from the filenames), because doing so would often violate the Pachyderm requirement that different datums cannot write the same file
        ./extract-kafka-sensor.py -s $site -S $SOURCE_TYPE -D "$OUT_PATH/$SOURCE_TYPE" -d $date_str --only current --consumer client.rack=$zone

# Get "non-current data" - data that came in on the specified day, which is NOT the same day it was measured
date_str_1=$(date +%Y-%m-%d -d "$date_str + 1 day")
./extract-kafka-sensor.py -s $site -S $SOURCE_TYPE -D "$OUT_PATH/$SOURCE_TYPE" -d $date_str_1 --only noncurrent --consumer client.rack=$zone
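        # Hypothetical example of the two calls above: for date_str=2023-01-01, the first call pulls records
        # received on 2023-01-01 for measurements taken that same day; the second pulls records received on
        # 2023-01-02 (date_str_1) for measurements taken on an earlier day, presumably to capture late-arriving data.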

        sites_output+=($site)

      done

      # Upload L0 files to bucket, compacting with any existing file with the same name
      if [[ -d "$OUT_PATH/$SOURCE_TYPE" ]]; then
        linkdir=$(mktemp -d)
        shopt -s globstar
        out_parquet_glob="${OUT_PATH}/**/*.parquet"
        # /pfs/out/prtncon/2023/01/01/12345/data/file.parquet
        echo "Linking output files to ${linkdir}"
        # set -x # Uncomment for debugging
        for f in $out_parquet_glob; do
          # Parse the path
          [[ "$f" =~ ^$OUT_PATH/(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)/data/(.*)$ ]]
          fsourcetype="${BASH_REMATCH[1]}"
          fyear="${BASH_REMATCH[2]}"
          fmonth="${BASH_REMATCH[3]}"
          fday="${BASH_REMATCH[4]}"
          fsourceid="${BASH_REMATCH[5]}"
          fname="${BASH_REMATCH[6]}"
          fname_out="${fsourcetype}_${fsourceid}_${fyear}-${fmonth}-${fday}.parquet" # Remove offsets from the filename
          outdir="${linkdir}/v2/${fsourcetype}/ms=${fyear}-${fmonth}/source_id=${fsourceid}"
          mkdir -p "${outdir}"
          ln -s "${f}" "${outdir}/${fname_out}"

          # Upload to bucket, compacting with any existing file
          ./compact-bucket-copy.py --sourcepath "${linkdir}" --destbucket "${BUCKET_NAME}"
          rm -rf "${outdir}"
        done

        # Update the airflow triggering table
        for site_output in "${sites_output[@]}"; do
          ./update-trigger-table.py -s $site_output -S $SOURCE_TYPE -D "$OUT_PATH/$SOURCE_TYPE"
        done

        # set +x # Uncomment for debugging
        rm -rf $linkdir
      fi
      EOF
input:
  pfs:
    name: import_trigger
    repo: prtncon_cron_daily_and_date_control
    # Must be datum by day (e.g. /SOURCE_TYPE/*/*/*) or by day/site (e.g. /SOURCE_TYPE/*/*/*/*)
    glob: "/prtncon/*/*/*"
parallelism_spec:
  constant: 3
autoscaling: true
resource_requests:
  memory: 300M
  cpu: 1.6
resource_limits:
  memory: 1.5G
  cpu: 2
sidecar_resource_requests:
  memory: 2G
  cpu: 0.5
datum_set_spec:
  number: 1
scheduling_spec:
  node_selector:
    cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
    nodepool.neonscience.org/pipeline: "yes"
    cloud.google.com/gke-spot: "true"
pod_spec: |-
  { "tolerations": [
    {
      "key": "nodepool.neonscience.org/pipeline",
      "operator": "Exists"
    },
    {
      "effect": "NoSchedule",
      "key": "cloud.google.com/gke-spot",
      "operator": "Exists"
    }
  ] }