Skip to content

Commit 36b2218

Browse files
authored
Add files via upload
code upload
1 parent 4079387 commit 36b2218

File tree

3 files changed

+94
-0
lines changed

3 files changed

+94
-0
lines changed

__init__.py

Whitespace-only changes.

analyze.py

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from collections import Counter
2+
import re
3+
import os
4+
import json
5+
from tqdm import tqdm
6+
7+
# Helper to recursively collect data files under a directory tree.
def find_json_files(directory, extension=".json"):
    """Return the paths of all files under *directory* ending with *extension*.

    Walks the whole tree rooted at *directory*, including subdirectories.
    *extension* defaults to ".json" so existing callers are unaffected.

    Args:
        directory: root directory to search.
        extension: filename suffix to match (compared with ``str.endswith``).

    Returns:
        A list of full paths (``os.path.join(root, name)``) in ``os.walk`` order.
    """
    matches = []
    for root, _, files in os.walk(directory):
        for name in files:
            if name.endswith(extension):
                matches.append(os.path.join(root, name))
    return matches
15+
16+
17+
if __name__ == "__main__":
    # Scan every *.json file below the current working directory, tally how
    # often each reference appears across all documents, and print the 100
    # most common references as "doi<TAB>title<TAB>id<TAB>count".
    target_folder = os.getcwd()
    json_files = find_json_files(target_folder)
    all_refs = Counter()

    for json_file in tqdm(json_files):
        # JSON is UTF-8 by spec; be explicit so the locale encoding never bites.
        with open(json_file, "r", encoding="utf-8") as file:
            try:
                data = json.load(file)
                # Each file stores its bibliography under "ref_docs"
                # (written by generate.py). Build one canonical string per
                # reference so identical references share a Counter key.
                ref_docs = data["ref_docs"]
                doc_strs = []
                for ref in ref_docs:
                    doi = str(ref["doi"])
                    # Some references lack a usable "title"; fall back to the
                    # source (journal) title so the key is still readable.
                    title = ref["title"] or ref["sourcetitle"]
                    refid = str(ref["id"])
                    doc_strs.append(f"{doi}\t{title}\t{refid}")

                # Fold this document's references into the running tally.
                all_refs.update(doc_strs)

            # KeyError covers files missing "ref_docs" or references missing
            # a field; JSONDecodeError covers truncated/corrupt files.
            # Previously only JSONDecodeError was caught, so one malformed
            # file aborted the whole run — now we log and keep going.
            except (json.JSONDecodeError, KeyError) as e:
                print(f"Error reading {json_file}: {e}")

    top_items = all_refs.most_common(100)

    for item, count in top_items:
        print(f"{item}\t{count}")

generate.py

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import os
2+
import traceback
3+
from pybliometrics.scopus import ScopusSearch, AbstractRetrieval
4+
import json
5+
from tqdm import tqdm
6+
7+
# NOTE: config file for pybliometrics is stored in $HOME/.config/pybliometrics.cfg

if __name__ == "__main__":
    # For each publication year, run a Scopus search for materials-science
    # papers about data mining / machine learning, then fetch each result's
    # reference list and dump everything to <year>/<EID>.json.
    current_path = os.getcwd()  # loop-invariant; hoisted out of the year loop
    for year in range(2013, 2024):
        # Make the folder to store the data for the year.
        # exist_ok=True replaces the racy exists()-then-makedirs pattern.
        folder_path = os.path.join(current_path, str(year))
        os.makedirs(folder_path, exist_ok=True)

        # Get the results for this year (STANDARD view: document metadata).
        x = ScopusSearch(
            f'ABS ( "data mining" ) OR ABS ( "machine learning" ) OR TITLE ( "data mining" ) OR TITLE ( "machine learning" ) AND TITLE ( "material" ) OR ABS ( "material" ) OR SRCTITLE ( "material" ) AND SUBJAREA ( mate ) AND DOCTYPE ( "AR" ) AND SRCTYPE( j ) AND PUBYEAR = {year} AND NOT SUBJAREA (medi ) AND NOT SUBJAREA ( immu ) AND NOT SUBJAREA ( BIOC ) AND NOT SUBJAREA ( busi )',
            view="STANDARD")
        print(len(x.results))

        # Store the results and add the ref_docs key to store each reference.
        for doc in tqdm(x.results):
            try:
                # One file per document, named by its Scopus EID.
                doc_dict = doc._asdict()
                eid = doc_dict["eid"]
                file_path = os.path.join(folder_path, f"{eid}.json")
                # Skip documents already on disk so the script is resumable.
                if os.path.exists(file_path):
                    print("SKIP (File already exists)")
                    continue

                # Look up the references / citations for that document.
                document = AbstractRetrieval(eid, view="REF")
                refs = []
                # Store the references (only the fields analyze.py consumes).
                for ref in document.references:
                    refs.append({"doi": ref.doi,
                                 "title": ref.title,
                                 "id": ref.id,
                                 "sourcetitle": ref.sourcetitle})
                doc_dict["ref_docs"] = refs

                # Dump the dictionary to the JSON file.
                with open(file_path, "w", encoding="utf-8") as json_file:
                    json.dump(doc_dict, json_file)

            # Best-effort by design ("we're not going to try too hard to fix
            # any of the rare errors"): log per-document API/serialization
            # failures and continue with the rest of the batch.
            except Exception as e:
                print(f"An error occurred: {e}")
                traceback.print_exc()

0 commit comments

Comments
 (0)