add github-envs for tinybird upload scripts (localstack#8131)

F944 · Apr 19, 2023 · 86a5da8 · 86a5da8
1 parent 13a1419
commit 86a5da8
Show file tree

Hide file tree

Showing 3 changed files with 73 additions and 37 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -288,7 +288,8 @@ jobs:
             source .venv/bin/activate
             mkdir parity_metrics && mv target/metric_reports/metric-report-raw-data-*amd64*.csv parity_metrics
             METRIC_REPORT_DIR_PATH=parity_metrics \
-            COMMUNITY_IMPL_COV_PATH=scripts/implementation_coverage_full.csv \
+            IMPLEMENTATION_COVERAGE_FILE=scripts/implementation_coverage_full.csv \
+            SOURCE_TYPE=community \
             python -m scripts.tinybird.upload_raw_test_metrics_and_coverage
       - store_artifacts:
           path: parity_metrics/

diff --git a/scripts/tinybird/retrieve_legacy_data_from_circleci.py b/scripts/tinybird/retrieve_legacy_data_from_circleci.py
@@ -223,7 +223,9 @@ def collect_workflows_past_30_days():
                 os.environ["CIRCLE_WORKFLOW_ID"] = str(workflow_id)
 
                 # trigger the tinybird_upload
-                send_metric_report(metric_report_file_path, timestamp)
+                send_metric_report(
+                    metric_report_file_path, source_type="community", timestamp=timestamp
+                )
                 send_implemented_coverage(
                     community_coverage_file_path, timestamp=timestamp, type="community"
                 )

diff --git a/scripts/tinybird/upload_raw_test_metrics_and_coverage.py b/scripts/tinybird/upload_raw_test_metrics_and_coverage.py
@@ -55,18 +55,18 @@
 service, operation, status_code, error_code, is_implemented*
 
 Additionally, we add the following information:
-- *build_id*: the workflow-id (for CircleCI)
+- *build_id*: the workflow-id (for CircleCI) or run-id (for GitHub)
 - *timestamp*: a timestamp as string which will be the same for the CircleCI run
 - *ls_source*: “community” for the CircleCI run, “pro” for the Github action
 
 In order to get more metadata from the build, we also send some general information to tests_raw_builds.datasource:
-- *build_id:* the workflow-id (for CircleCI)
-- *timestamp:* a timestamp as string which will be the same for the CircleCI run
-- *branch:* env value from *`CIRCLE_BRANCH`*
-- *build_url:* env value from *`CIRCLE_BUILD_URL`*
-- *pull_requests:* env value from *`CIRCLE_PULL_REQUESTS`*
-- *build_num:* env value from *`CIRCLE_BUILD_NUM`*
-- *workflow_id:* env value from *`CIRCLE_WORKFLOW_ID`*
+- *build_id:* the workflow-id (for CircleCI) or run-id (for GitHub)
+- *timestamp:* a timestamp as string which will be the same for the CI run
+- *branch:* env value from *`CIRCLE_BRANCH`* or *`GITHUB_HEAD_REF`* (only set for pull_requests) or *`GITHUB_REF_NAME`*
+- *build_url:* env value from *`CIRCLE_BUILD_URL`* or *$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID*
+- *pull_requests:* env value from *`CIRCLE_PULL_REQUESTS`* or *`GITHUB_REF`*
+- *build_num:* env value from *`CIRCLE_BUILD_NUM`* (empty for GitHub, there seems to be no equivalent)
+- *workflow_id:* env value from *`CIRCLE_WORKFLOW_ID`* or *`GITHUB_RUN_ID`*
 """
 
 import csv
@@ -138,21 +138,53 @@ def send_metadata_for_build(build_id: str, timestamp: str):
     CIRCLE_BUILD_NUM=78206
     CIRCLE_BUILD_URL=https://circleci.com/gh/localstack/localstack/78206
     CIRCLE_WORKFLOW_ID=b86a4bc4-bcd1-4170-94d6-4af66846c1c1
+
+    GitHub env examples:
+    GITHUB_REF=ref/heads/master or ref/pull/<pr_number>/merge (will be used for 'pull_requests')
+    GITHUB_HEAD_REF=tinybird_data (used for 'branch', set only for pull_requests)
+    GITHUB_REF_NAME=feature-branch-1 (will be used for 'branch' if GITHUB_HEAD_REF is not set)
+    GITHUB_RUN_ID=1658821493 (will be used for 'workflow_id')
+
+    workflow run's URL (will be used for 'build_url'):
+    $GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID
+
+    could not find anything that corresponds to "build_num" (number of current job in CircleCI)
+         -> leaving it blank for Github
     """
+    # on GitHub the GITHUB_HEAD_REF is only set for pull_request, else we use the GITHUB_REF_NAME
+    branch = (
+        os.environ.get("CIRCLE_BRANCH", "")
+        or os.environ.get("GITHUB_HEAD_REF", "")
+        or os.environ.get("GITHUB_REF_NAME", "")
+    )
+    workflow_id = os.environ.get("CIRCLE_WORKFLOW_ID", "") or os.environ.get("GITHUB_RUN_ID", "")
+
+    build_url = os.environ.get("CIRCLE_BUILD_URL", "")
+    if not build_url and os.environ.get("GITHUB_SERVER_URL"):
+        # construct the build-url for Github
+        server = os.environ.get("GITHUB_SERVER_URL", "")
+        repo = os.environ.get("GITHUB_REPOSITORY", "")
+        build_url = f"{server}/{repo}/actions/runs/{workflow_id}"
+
+    pull_requests = os.environ.get("CIRCLE_PULL_REQUESTS", "") or os.environ.get("GITHUB_REF", "")
+    build_num = os.environ.get(
+        "CIRCLE_BUILD_NUM", ""
+    )  # TODO could not find equivalent job-id ENV in github
+
     data = {
         "build_id": build_id,
         "timestamp": timestamp,
-        "branch": os.environ.get("CIRCLE_BRANCH", ""),
-        "build_url": os.environ.get("CIRCLE_BUILD_URL", ""),
-        "pull_requests": os.environ.get("CIRCLE_PULL_REQUESTS", ""),
-        "build_num": os.environ.get("CIRCLE_BUILD_NUM", ""),
-        "workflow_id": os.environ.get("CIRCLE_WORKFLOW_ID", ""),
+        "branch": branch,
+        "build_url": build_url,
+        "pull_requests": pull_requests,
+        "build_num": build_num,
+        "workflow_id": workflow_id,
     }
     data_to_send = [json.dumps(data)]
     send_data_to_tinybird(data_to_send, data_name=DATA_SOURCE_RAW_BUILDS)
 
 
-def send_metric_report(metric_path: str, timestamp: str):
+def send_metric_report(metric_path: str, source_type: str, timestamp: str):
     """
 
     SCHEMA >
@@ -172,7 +204,7 @@ def send_metric_report(metric_path: str, timestamp: str):
     """
     tmp: list[str] = []
     count: int = 0
-    build_id = os.environ.get("CIRCLE_WORKFLOW_ID", "")
+    build_id = os.environ.get("CIRCLE_WORKFLOW_ID", "") or os.environ.get("GITHUB_RUN_ID", "")
     send_metadata_for_build(build_id, timestamp)
 
     pathlist = Path(metric_path).rglob("metric-report-raw-data-*.csv")
@@ -187,7 +219,7 @@ def send_metric_report(metric_path: str, timestamp: str):
                 # add timestamp, build_id, ls_source
                 row["timestamp"] = timestamp
                 row["build_id"] = build_id
-                row["ls_source"] = "community"
+                row["ls_source"] = source_type
 
                 # remove data we are currently not interested in
                 for field in data_to_remove:
@@ -226,6 +258,7 @@ def send_implemented_coverage(file: str, timestamp: str, type: str):
     tmp: list[str] = []
     count: int = 0
 
+    build_id = os.environ.get("CIRCLE_WORKFLOW_ID", "") or os.environ.get("GITHUB_RUN_ID", "")
     with open(file, "r") as csv_obj:
         reader_obj = csv.DictReader(csv_obj)
         for row in reader_obj:
@@ -240,7 +273,7 @@ def send_implemented_coverage(file: str, timestamp: str, type: str):
             # add timestamp and source
             row["timestamp"] = timestamp
             row["ls_source"] = type
-            row["build_id"] = os.environ.get("CIRCLE_WORKFLOW_ID", "")
+            row["build_id"] = build_id
 
             tmp.append(json.dumps(row))
             if len(tmp) == 500:
@@ -257,37 +290,37 @@ def send_implemented_coverage(file: str, timestamp: str, type: str):
 def main():
     token = os.environ.get("TINYBIRD_PARITY_ANALYTICS_TOKEN", "")
     metric_report_dir = os.environ.get("METRIC_REPORT_DIR_PATH", "")
-    community_impl_coverage = os.environ.get("COMMUNITY_IMPL_COV_PATH", "")
-
+    impl_coverage_file = os.environ.get("IMPLEMENTATION_COVERAGE_FILE", "")
+    source_type = os.environ.get("SOURCE_TYPE", "")
     missing_info = (
         "missing data, please check the available ENVs that are required to run the script"
     )
-    if not token:
-        print(missing_info)
-        print("missing TINYBIRD_PARITY_ANALYTICS_TOKEN")
-        return
+    print(
+        f"METRIC_REPORT_DIR_PATH={metric_report_dir}, IMPLEMENTATION_COVERAGE_FILE={impl_coverage_file}, "
+        f"SOURCE_TYPE={source_type}"
+    )
     if not metric_report_dir:
         print(missing_info)
         print("missing METRIC_REPORT_DIR_PATH")
         return
-    if not community_impl_coverage:
+    if not impl_coverage_file:
+        print(missing_info)
+        print("missing IMPLEMENTATION_COVERAGE_FILE")
+        return
+    if not source_type:
         print(missing_info)
-        print("missing COMMUNITY_IMPL_COV_PATH")
+        print("missing SOURCE_TYPE")
+        return
+    if not token:
+        print(missing_info)
+        print("missing TINYBIRD_PARITY_ANALYTICS_TOKEN")
         return
 
     # create one timestamp that will be used for all the data sent
     timestamp: str = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
 
-    # TODO re-enable pro tests (but these should be sent by the pro pipeline)
-    # pro_impl_coverage = os.environ.get("PRO_IMPL_COV_PATH", "")
-    # if not pro_impl_coverage:
-    #     print(missing_info)
-    #     print("missing PRO_IMPL_COV_PATH")
-    #     return
-    # send_implemented_coverage(pro_impl_coverage, timestamp=timestamp, type="pro")
-
-    send_metric_report(metric_report_dir, timestamp)
-    send_implemented_coverage(community_impl_coverage, timestamp=timestamp, type="community")
+    send_metric_report(metric_report_dir, source_type=source_type, timestamp=timestamp)
+    send_implemented_coverage(impl_coverage_file, timestamp=timestamp, type=source_type)
 
 
 if __name__ == "__main__":