Skip to content

Commit 36b2218

Browse files
authored
Add files via upload
code upload
1 parent 4079387 commit 36b2218

File tree

3 files changed

+94
-0
lines changed

3 files changed

+94
-0
lines changed

__init__.py

Whitespace-only changes.

analyze.py

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from collections import Counter
2+
import re
3+
import os
4+
import json
5+
from tqdm import tqdm
6+
7+
# Helper to recursively collect data files under a directory tree.
def find_json_files(directory, extension=".json"):
    """Return the paths of all files under *directory* ending with *extension*.

    Walks the whole tree rooted at *directory*, including subdirectories.
    *extension* defaults to ".json" so existing callers are unaffected.

    Args:
        directory: root directory to search.
        extension: filename suffix to match (compared with ``str.endswith``).

    Returns:
        A list of full paths (``os.path.join(root, name)``) in ``os.walk`` order.
    """
    matches = []
    for root, _, files in os.walk(directory):
        for name in files:
            if name.endswith(extension):
                matches.append(os.path.join(root, name))
    return matches
15+
16+
17+
if __name__ == "__main__":
    # Scan every *.json file below the current working directory, tally how
    # often each reference appears across all documents, and print the 100
    # most common references as "doi<TAB>title<TAB>id<TAB>count".
    target_folder = os.getcwd()
    json_files = find_json_files(target_folder)
    all_refs = Counter()

    for json_file in tqdm(json_files):
        # JSON is UTF-8 by spec; be explicit so the locale encoding never bites.
        with open(json_file, "r", encoding="utf-8") as file:
            try:
                data = json.load(file)
                # Each file stores its bibliography under "ref_docs"
                # (written by generate.py). Build one canonical string per
                # reference so identical references share a Counter key.
                ref_docs = data["ref_docs"]
                doc_strs = []
                for ref in ref_docs:
                    doi = str(ref["doi"])
                    # Some references lack a usable "title"; fall back to the
                    # source (journal) title so the key is still readable.
                    title = ref["title"] or ref["sourcetitle"]
                    refid = str(ref["id"])
                    doc_strs.append(f"{doi}\t{title}\t{refid}")

                # Fold this document's references into the running tally.
                all_refs.update(doc_strs)

            # KeyError covers files missing "ref_docs" or references missing
            # a field; JSONDecodeError covers truncated/corrupt files.
            # Previously only JSONDecodeError was caught, so one malformed
            # file aborted the whole run — now we log and keep going.
            except (json.JSONDecodeError, KeyError) as e:
                print(f"Error reading {json_file}: {e}")

    top_items = all_refs.most_common(100)

    for item, count in top_items:
        print(f"{item}\t{count}")

generate.py

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import os
2+
import traceback
3+
from pybliometrics.scopus import ScopusSearch, AbstractRetrieval
4+
import json
5+
from tqdm import tqdm
6+
7+
# NOTE: config file for pybliometrics is stored in $HOME/.config/pybliometrics.cfg

if __name__ == "__main__":
    # For each publication year, run a Scopus search for materials-science
    # papers about data mining / machine learning, then fetch each result's
    # reference list and dump everything to <year>/<EID>.json.
    current_path = os.getcwd()  # loop-invariant; hoisted out of the year loop
    for year in range(2013, 2024):
        # Make the folder to store the data for the year.
        # exist_ok=True replaces the racy exists()-then-makedirs pattern.
        folder_path = os.path.join(current_path, str(year))
        os.makedirs(folder_path, exist_ok=True)

        # Get the results for this year (STANDARD view: document metadata).
        x = ScopusSearch(
            f'ABS ( "data mining" ) OR ABS ( "machine learning" ) OR TITLE ( "data mining" ) OR TITLE ( "machine learning" ) AND TITLE ( "material" ) OR ABS ( "material" ) OR SRCTITLE ( "material" ) AND SUBJAREA ( mate ) AND DOCTYPE ( "AR" ) AND SRCTYPE( j ) AND PUBYEAR = {year} AND NOT SUBJAREA (medi ) AND NOT SUBJAREA ( immu ) AND NOT SUBJAREA ( BIOC ) AND NOT SUBJAREA ( busi )',
            view="STANDARD")
        print(len(x.results))

        # Store the results and add the ref_docs key to store each reference.
        for doc in tqdm(x.results):
            try:
                # One file per document, named by its Scopus EID.
                doc_dict = doc._asdict()
                eid = doc_dict["eid"]
                file_path = os.path.join(folder_path, f"{eid}.json")
                # Skip documents already on disk so the script is resumable.
                if os.path.exists(file_path):
                    print("SKIP (File already exists)")
                    continue

                # Look up the references / citations for that document.
                document = AbstractRetrieval(eid, view="REF")
                refs = []
                # Store the references (only the fields analyze.py consumes).
                for ref in document.references:
                    refs.append({"doi": ref.doi,
                                 "title": ref.title,
                                 "id": ref.id,
                                 "sourcetitle": ref.sourcetitle})
                doc_dict["ref_docs"] = refs

                # Dump the dictionary to the JSON file.
                with open(file_path, "w", encoding="utf-8") as json_file:
                    json.dump(doc_dict, json_file)

            # Best-effort by design ("we're not going to try too hard to fix
            # any of the rare errors"): log per-document API/serialization
            # failures and continue with the rest of the batch.
            except Exception as e:
                print(f"An error occurred: {e}")
                traceback.print_exc()

0 commit comments

Comments
 (0)