
Commit

Mostly minor changes; added appropriate headers in the py files
The headers for the `GET` calls should perhaps be set via environment variables, though.
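As a rough sketch of that idea (the GDD_USER_AGENT variable name and the wrapper below are hypothetical; the default simply reuses the header string hard-coded in this commit, with the address left redacted as it appears in the diff):

import os
import requests

# Hypothetical environment variable; nothing in this commit defines it yet.
USER_AGENT = os.environ.get(
    "GDD_USER_AGENT",
    "Neotoma-Article-Relevance-Tool (mailto:[email protected])",
)

def gdd_get(url, timeout=600):
    """GET with the shared User-Agent header applied."""
    return requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=timeout)

The docker-compose file could then carry the contact address alongside the other environment variables instead of keeping it in the source.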
SimonGoring committed Aug 10, 2023
1 parent 04857ee commit 32320b4
Showing 5 changed files with 75 additions and 69 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -174,3 +174,7 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

models/article-relevance/logistic_regression_model.joblib

data/article-relevance/outputs/prediction_parquet/
8 changes: 4 additions & 4 deletions docker-compose.yml
@@ -6,10 +6,10 @@ services:
dockerfile: ./docker/article-relevance/Dockerfile
context: .
environment:
-      - N_RECENT=10
-      - MIN_DATE=
+      - N_RECENT=
+      - MIN_DATE=12M
       - MAX_DATE=
-      - TERM=
+      - TERM=holocene
- AUTO_MIN_DATE=False
- AUTO_CHECK_DUP=False
- OUTPUT_PATH=/outputs/
@@ -18,7 +18,7 @@
- MODEL_PATH=/models/logistic_regression_model.joblib

volumes:
-      - ./data/article-relevance/outputs:/output
+      - ./data/article-relevance/outputs:/outputs
- ./data/article-relevance/processed/prediction_parquet:/parquet
- ./data/article-relevance/raw:/raw
- ./models/article-relevance:/models
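The blank values above (N_RECENT=, MAX_DATE= and so on) reach the container as empty strings rather than unset variables, which is why gdd_api_query.py converts '' to None before validating its arguments. A small helper along these lines would centralise that conversion; the name env_or_none is only illustrative:

import os

def env_or_none(name):
    """Treat unset or empty docker-compose placeholders (e.g. MAX_DATE=) as None."""
    value = os.environ.get(name, "")
    return value if value.strip() else None

min_date = env_or_none("MIN_DATE")   # '12M' with the compose file above
max_date = env_or_none("MAX_DATE")   # None, because MAX_DATE= is blank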
90 changes: 45 additions & 45 deletions src/article_relevance/gdd_api_query.py
@@ -5,7 +5,7 @@
# Input: parameters to specify article range
# output: JSON containing article gddid, doi, url, and other metadata about the query process

"""This script takes in user specified parameter values and query the GDD API for recently acquired articles.
"""This script takes in user specified parameter values and query the GDD API for recently acquired articles.
Usage: gdd_api_query.py --doi_path=<doi_path> --parquet_path=<parquet_path> [--n_recent=<n_recent>] [--min_date=<min_date>] [--max_date=<max_date>] [--term=<term>] [--auto_min_date=<auto_min_date>] [--auto_check_dup=<auto_check_dup>]
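docopt parses that usage string into a plain dictionary keyed by option name, which is what main() unpacks further down. Roughly, with purely illustrative values:

from docopt import docopt

# __doc__ is the module docstring containing the Usage: block above.
opt = docopt(__doc__)
# opt then looks something like:
# {'--doi_path': '/raw/gdd_api_return.json', '--parquet_path': '/parquet',
#  '--n_recent': None, '--min_date': '2023-06-01', '--max_date': None,
#  '--term': 'holocene', '--auto_min_date': 'False', '--auto_check_dup': 'False'}

Every value arrives as a string or None, which is why the script later compares param_auto_min_date.lower() == 'true' instead of treating the flag as a boolean.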
@@ -33,7 +33,6 @@
from datetime import datetime, date



# Locate src module
current_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.dirname(current_dir)
@@ -43,15 +42,15 @@

logger = get_logger(__name__)  # this gets the logger object with the current module's name

def get_new_gdd_articles(output_path,
                         parquet_path,
                         n_recent_articles = None,
                         min_date = None,
                         max_date = None,
                         term = None,
                         auto_check_dup = False):
"""
Get newly acquired articles from min_date to (optional) max_date.
"""
Get newly acquired articles from min_date to (optional) max_date.
Or get the most recent new articles added to GeoDeepDive.
Return API resuls as a list of article metadata information.
@@ -62,7 +61,7 @@ def get_new_gdd_articles(output_path,
min_date (str) Lower limit of GeoDeepDive acquired date.
max_date (str) Upper limit of GeoDeepDive acquired date.
term (str) Term to search for.
Return:
Write JSON output to output_path.
@@ -72,42 +71,42 @@ def get_new_gdd_articles(output_path,
get_new_gdd_articles(n_recent_articles = 1000)
"""

-    # ======== To handle placeholder for arguments in the docker compose, convert empty str to None ===
+    # == convert empty str to None to handle placeholder for docker compose arguments ==
    if n_recent_articles == '':
        n_recent_articles = None
    if min_date == '':
        min_date = None
    if max_date == '':
        max_date = None
    if term == '':
        term = None

# ======== Tests for input data type ==========
if (n_recent_articles is None) and (min_date is None and max_date is None):
raise ValueError("Either n_recent_articles or a date range should be specified.")
raise ValueError("Either n_recent_articles or a date range should be specified.")

if (n_recent_articles is not None) and (min_date is not None or max_date is not None):
raise ValueError("Only one of n_recent_articles or a date range should be specified.")
raise ValueError("Only one of n_recent_articles or a date range should be specified.")

if (n_recent_articles is None) and (min_date is not None or max_date is not None):
pattern = r'^\d{4}-\d{2}-\d{2}$'

if min_date is not None:
            if not isinstance(min_date, str):
                raise ValueError("min_date should be a string with format 'yyyy-mm-dd'.")
            if not re.match(pattern, min_date):
                raise ValueError("min_date does not follow the correct format. min_date should be a string with format 'yyyy-mm-dd'.")

        if max_date is not None:
            if not isinstance(max_date, str):
                raise ValueError("max_date should be a string with format 'yyyy-mm-dd'.")
            if not re.match(pattern, max_date):
                raise ValueError("max_date does not follow the correct format. max_date should be a string with format 'yyyy-mm-dd'.")

if (n_recent_articles is not None) and (min_date is None and max_date is None):
if not isinstance(n_recent_articles, int):
raise ValueError("n_recent_articles should be an integer.")

# ========== Query API ==========
if n_recent_articles is not None:
logger.info(f'Querying by n_recent = {n_recent_articles}')
@@ -117,33 +116,34 @@ def get_new_gdd_articles(output_path,
elif (min_date is not None) and (max_date is not None):
logger.info(f'Querying by min_date = {min_date} and max_date = {max_date}')
api_call = f"https://xdd.wisc.edu/api/articles?min_acquired={min_date}&max_acquired={max_date}&full_results=true"

elif (min_date is not None) and (max_date is None):
logger.info(f'Querying by min_date = {min_date}.')
api_call = f"https://xdd.wisc.edu/api/articles?min_acquired={min_date}&full_results=true"

elif (min_date is None) and (max_date is not None):
logger.info(f'Querying by max_date = {max_date}.')
api_call = f"https://xdd.wisc.edu/api/articles?max_acquired={max_date}&full_results=true"

else:
raise ValueError("Please check input parameter values.")

if term is not None:
logger.info(f'Search term = {term}.')
api_extend = f"&term={term}"
api_call += api_extend


# =========== Query xDD API to get data ==========
logger.info(f'{api_call}')
session = requests.Session()
response = session.get(api_call)

n_refresh = 0
while response.status_code != 200 and n_refresh < 10:
-        response = requests.get(api_call)
+        response = requests.get(api_call, headers={"User-Agent":"Neotoma-Article-Relevance-Tool (mailto:[email protected])"})
n_refresh += 1

response_dict = response.json()
data = response_dict['success']['data']

@@ -167,7 +167,7 @@ def get_new_gdd_articles(output_path,
new_data = next_response_dict['success']['data']
logger.info(f'{len(new_data)} articles queried from GeoDeepDive (page {i}).')
data.extend(new_data)

next_page = next_response_dict['success']['next_page']
n_refresh += 1
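For reference, the paging pattern in this function (repeatedly following the success.next_page URL returned by the xDD full_results endpoint) condenses to a loop like the one below. This is only a sketch: session stands for a requests.Session, the field names come from the code above, and the stopping condition assumes next_page is empty or absent on the final page.

def fetch_all_pages(session, first_url, max_pages=100):
    """Follow xDD 'next_page' links and accumulate the 'data' payloads."""
    data, url, pages = [], first_url, 0
    while url and pages < max_pages:
        payload = session.get(url, timeout=600).json()
        result = payload["success"]
        data.extend(result["data"])
        url = result.get("next_page")   # assumed empty on the last page
        pages += 1
    return data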

@@ -188,15 +188,15 @@ def get_new_gdd_articles(output_path,
one_article_dict['DOI'] = ['Non-DOI Article ID type']
elif article['identifier'][0]['type'] == 'doi':
one_article_dict['DOI'] = [article['identifier'][0]['id']]
        else:
one_article_dict['DOI'] = ['Non-DOI Article ID type']

one_article_dict['url'] = [article['link'][0]['url']]
one_article_dict['status'] = 'queried'

one_article = pd.DataFrame(one_article_dict)
gdd_df = pd.concat([gdd_df, one_article])

gdd_df = gdd_df.reset_index(drop=True)
logger.info(f'{gdd_df.shape[0]} articles returned from GeoDeepDive.')

@@ -219,11 +219,11 @@ def get_new_gdd_articles(output_path,
# Read only the ID column from the Parquet file
gdd_one_file = pq.read_table(file_path, columns=["gddid"]).to_pandas()
existing_ids.update(gdd_one_file["gddid"])

# remove the duplicates
result_df = gdd_df[~gdd_df["gddid"].isin(existing_ids)]
        logger.info(f'{result_df.shape[0]} articles are new additions for relevance prediction.')

else:
result_df = gdd_df.copy()

@@ -232,7 +232,7 @@ def get_new_gdd_articles(output_path,

# pass the query info to prediction step (for saving in the parquet file)
result_dict['queryinfo_min_date'] = min_date

if max_date is None:
current_date = date.today()
formatted_date = current_date.strftime("%Y-%m-%d")
@@ -255,13 +255,13 @@


def find_max_date_from_parquet(parquet_path):
"""
"""
Based on the filename, find the last date when the pipeline was run.
    Return this date in yyyy-mm-dd format as a string.
Args:
parquet_path (str) The path to the folder that stores the processed parquet files.
Return:
Date as a string.
"""
@@ -273,11 +273,11 @@ def find_max_date_from_parquet(parquet_path):
# Extract date from the file_name
curr_date_str = file_name.split('_')[1][0:10]
curr_date = datetime.strptime(curr_date_str, "%Y-%m-%d").date()

# Compare with result and update if the date is newer
if curr_date > min_date:
min_date = curr_date

min_date_str = min_date.strftime("%Y-%m-%d")

return min_date_str
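The slicing above, file_name.split('_')[1][0:10], implies parquet filenames whose second underscore-separated token begins with an ISO date. As an illustration with a hypothetical filename of that shape:

from datetime import datetime

file_name = "prediction_2023-08-01T120000.parquet"   # hypothetical example
date_token = file_name.split("_")[1][0:10]           # '2023-08-01'
run_date = datetime.strptime(date_token, "%Y-%m-%d").date()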
@@ -298,7 +298,7 @@ def main():
param_min_date = opt["--min_date"]

param_auto_min_date = opt['--auto_min_date']

if param_auto_min_date.lower() == 'true':
file_list = os.listdir(parquet_file_path)
if len(file_list) == 0:
@@ -307,18 +307,18 @@
param_min_date = find_max_date_from_parquet(parquet_file_path)
logger.warning(f'auto_min_date is True. {param_min_date} is set as the min_date.')


param_auto_check_dup = opt['--auto_check_dup']
if param_auto_check_dup is None:
param_auto_check_dup = False

param_max_date = opt["--max_date"]
param_term = opt["--term"]

    get_new_gdd_articles(output_path = doi_file_storage,
                         parquet_path = parquet_file_path,
                         min_date = param_min_date,
                         max_date = param_max_date,
n_recent_articles= param_n_recent,
term = param_term,
auto_check_dup = param_auto_check_dup
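One possible refinement of the retry loop above, in the spirit of the commit note: attach the header to the session once, so that the initial request and the retries send it consistently, and pause between attempts. This is a sketch rather than part of the commit; api_call is the URL assembled in the function above, the header string is the one introduced in this commit, and the backoff policy is an assumption.

import time
import requests

session = requests.Session()
session.headers.update(
    {"User-Agent": "Neotoma-Article-Relevance-Tool (mailto:[email protected])"}
)

response = session.get(api_call, timeout=600)
n_refresh = 0
while response.status_code != 200 and n_refresh < 10:
    time.sleep(2 ** n_refresh)   # simple exponential backoff
    response = session.get(api_call, timeout=600)
    n_refresh += 1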
28 changes: 15 additions & 13 deletions src/article_relevance/relevance_prediction_parquet.py
@@ -16,19 +16,19 @@
--send_xdd=<send_xdd> When True, relevant articles will be sent to xDD through API query. Default is False.
"""

import os
import requests
import datetime
import json
import sys
from langdetect import detect
from sentence_transformers import SentenceTransformer
import joblib
from docopt import docopt
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
import numpy as np

# Locate src module
current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -54,16 +54,16 @@ def crossref_extract(doi_path):
pandas Dataframe containing CrossRef metadata.
"""

-    logger.info(f'Running crossref_extract function.')
+    logger.info("Running crossref_extract function.")


-    with open(doi_path) as json_file:
+    with open(doi_path, encoding="utf-8") as json_file:
data_dictionary = json.load(json_file)

df = pd.DataFrame(data_dictionary['data'])

if df.shape[0] == 0:
-        logger.warning(f'Last xDD API query did not retrieve any article. Please verify the arguments.')
+        logger.warning("xDD API query did not retrieve any articles. Please verify the arguments.")
raise ValueError("No article to process. Script terminated.")

doi_col = 'DOI'
@@ -74,14 +74,16 @@
# Initialize
crossref = pd.DataFrame()

-    logger.info(f'Querying CrossRef API for article metadata.')
+    logger.info("Querying CrossRef API for article metadata.")

# Loop through all doi, concatenate metadata into dataframe
for doi in input_doi:
cross_ref_url = f"https://api.crossref.org/works/{doi}"

# make a request to the API
-        cross_ref_response = requests.get(cross_ref_url)
+        cross_ref_response = requests.get(cross_ref_url,
+                                          headers={"User-Agent":"Neotoma-Article-Relevance-Tool (mailto:[email protected])"},
+                                          timeout = 600)

if cross_ref_response.status_code == 200:

@@ -161,7 +163,7 @@ def data_preprocessing(metadata_df):
pd DataFrame containing all info required for model prediction.
"""

-    logger.info(f'Prediction data preprocessing begin.')
+    logger.info("Prediction data preprocessing begin.")

metadata_df = metadata_df.reset_index(drop = True)

@@ -186,7 +188,7 @@
metadata_df['text_with_abstract'] = metadata_df['title_clean'] + ' ' + metadata_df['subtitle_clean'] + ' ' + metadata_df['abstract_clean']

# Impute missing language
-    logger.info(f'Running article language imputation.')
+    logger.info("Running article language imputation.")

metadata_df['language'] = metadata_df['language'].fillna(value = '')
metadata_df['text_with_abstract'] = metadata_df['text_with_abstract'].fillna(value = '')
@@ -254,7 +256,7 @@ def add_embeddings(input_df, text_col, model = 'allenai/specter2'):
Returns:
        pd DataFrame with original features and sentence embedding features added.
"""
-    logger.info(f'Sentence embedding start.')
+    logger.info("Sentence embedding start.")

embedding_model = SentenceTransformer(model)

@@ -272,7 +274,7 @@ def add_embeddings(input_df, text_col, model = 'allenai/specter2'):
# concatenate invalid_df with valid_df
result = pd.concat([df_with_embeddings, invalid_df])

-    logger.info(f'Sentence embedding completed.')
+    logger.info("Sentence embedding completed.")

return result

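The User-Agent and mailto added to the CrossRef call above follow CrossRef's polite-pool convention for identifying clients. A compact sketch of one per-DOI lookup with basic failure handling; the helper name is illustrative and the header string reuses the placeholder from the diff:

import requests

def fetch_crossref_record(doi, timeout=600):
    """Return the CrossRef 'message' dict for one DOI, or None if the lookup fails."""
    url = f"https://api.crossref.org/works/{doi}"
    headers = {"User-Agent": "Neotoma-Article-Relevance-Tool (mailto:[email protected])"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.json().get("message")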

