@@ -113,9 +113,13 @@ class CommitInfo(NamedTuple):
 
 def get_file_names(
     cwd: Optional[str] = None,
+    path_filter: Optional[str] = None,
 ) -> List[CommitInfo]:
+    cmd = "git log --date=short --pretty='format:%h;%ad' --numstat"
+    if path_filter:
+        cmd += f" -- {path_filter}"
     lines = run_command(
-        "git log --date=short --pretty='format:%h;%ad' --numstat",
+        cmd,
         cwd=cwd,
         env={"TZ": "UTC"},
     ).split("\n")
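Context for this hunk: appending "-- <path>" to git log uses git's pathspec separator, which restricts the history to commits touching that path. Below is a rough sketch of how the "%h;%ad" header plus --numstat output produced by this command can be parsed; the CommitInfo fields and parsing details are assumptions for illustration, not this repo's actual implementation:

from typing import List, NamedTuple

# Assumed shape; the real CommitInfo in this file may differ.
class CommitInfo(NamedTuple):
    commit_id: str
    date: str
    filename: str
    lines_added: int
    lines_deleted: int

def parse_numstat(output: str) -> List[CommitInfo]:
    """Parse `git log --pretty='format:%h;%ad' --numstat` output."""
    commits: List[CommitInfo] = []
    commit_id = date = ""
    for line in output.split("\n"):
        if "\t" in line:
            # numstat line: "<added>\t<deleted>\t<path>"; binary files
            # report "-" for both counts, treated as 0 here.
            added, deleted, path = line.split("\t")
            commits.append(CommitInfo(
                commit_id,
                date,
                path,
                int(added) if added != "-" else 0,
                int(deleted) if deleted != "-" else 0,
            ))
        elif ";" in line:
            # commit header line produced by --pretty='format:%h;%ad'
            commit_id, date = line.split(";", 1)
    return commits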
@@ -191,13 +195,22 @@ def conv_to_csv(json_data: List[Dict[str, Any]]) -> io.StringIO:
 
 
 def main() -> None:
+    # Process the tutorials repo
+    print("Processing tutorials repo")
     tutorials_dir = os.path.expanduser("./tutorials")
-    get_history_log = get_history(tutorials_dir)
-    commits_to_files = get_file_names(tutorials_dir)
+    tutorials_history_log = get_history(tutorials_dir)
+    tutorials_commits_to_files = get_file_names(tutorials_dir)
+
+    # Process the pytorch/docs dir
+    print("Processing pytorch/docs dir")
+    pytorch_docs_dir = os.path.expanduser("./pytorch/docs")
+    pytorch_docs_history_log = get_history(pytorch_docs_dir)
+    pytorch_docs_commits_to_files = get_file_names(
+        os.path.expanduser("./pytorch"), "docs"
+    )
 
-    # Upload data to S3 as csv with gzip compression and no header line
+    # Combine the two histories
 
-    print(f"Uploading data to {METADATA_PATH}")
     history_log = [
         {
             "commit_id": i[0],
@@ -207,21 +220,54 @@ def main() -> None:
             "number_of_changed_files": int(i[4]),
             "lines_added": int(i[5]),
             "lines_deleted": int(i[6]),
+            "repo": "tutorials",
         }
-        for i in get_history_log
+        for i in tutorials_history_log
     ]
+
+    history_log.extend(
+        [
+            {
+                "commit_id": i[0],
+                "author": i[1],
+                "date": i[2],
+                "title": i[3],
+                "number_of_changed_files": int(i[4]),
+                "lines_added": int(i[5]),
+                "lines_deleted": int(i[6]),
+                "repo": "pytorch",
+            }
+            for i in pytorch_docs_history_log
+        ]
+    )
+
+    # Combine the two commits to files
+
+    filenames = []
+    for entry in tutorials_commits_to_files:
+        items = convert_to_dict(entry)
+        for item in items:
+            item["filename"] = f"tutorials/{item['filename']}"
+        filenames.extend(items)
+
+    for entry in pytorch_docs_commits_to_files:
+        items = convert_to_dict(entry)
+        for item in items:
+            item["filename"] = f"pytorch/{item['filename']}"
+        filenames.extend(items)
+
+    # Upload data to S3 as csv with gzip compression and no header line
+
+    print(f"Uploading data to {METADATA_PATH}")
     upload_to_s3(
         "ossci-raw-job-status",
         f"{METADATA_PATH}",
         history_log,
     )
     print(f"Finished uploading data to {METADATA_PATH}")
 
+    # Upload filenames to S3
     print(f"Uploading data to {FILENAMES_PATH}")
-    filenames = []
-    for entry in commits_to_files:
-        items = convert_to_dict(entry)
-        filenames.extend(items)
     upload_to_s3(
         "ossci-raw-job-status",
         f"{FILENAMES_PATH}",