nyuolab · LW0214 · Oct 8, 2022
diff --git a/baseline_xgb/get_dates_file.py b/baseline_xgb/get_dates_file.py
@@ -0,0 +1,40 @@
+import numpy as np
+import pandas as pd
+import pickle as pkl
+import os
+
+
+def tmp(row):
+    """Prefix the fields of a 'YYYY-MM-DD' string with 'Y', 'M' and 'D'.
+
+    Tagging each field keeps year, month and day tokens distinct once the
+    string is split on '-' and dummy-encoded (e.g. month '01' vs day '01').
+    """
+    item=row.split('-')
+    item[0]='Y'+item[0]
+    item[1]='M'+item[1]
+    item[2]='D'+item[2]
+    return '-'.join(item)
+
+
+def main():
+    """Dummy-encode the tagged DOB column and pickle the resulting column names."""
+    total=pd.read_csv('/gpfs/data/oermannlab/project_data/text2table/complete_v2/dataset_v2.csv')
+    total['DOB']=total['DOB'].apply(tmp)
+
+    dum=total['DOB'].str.get_dummies(sep='-')
+
+    # just in case folder doesn't exist
+    class_dir='class_files'
+    os.makedirs(class_dir,exist_ok=True)
+
+    # save dummified columns
+    file_name='dates.pkl'
+    with open(os.path.join(class_dir,file_name),'wb') as f:
+        pkl.dump(dum.columns,f)
+
+
+# new_xgb.py imports tmp from this module; guard the script body so the
+# dataset read and pickle write do not run as a side effect of that import.
+if __name__=='__main__':
+    main()
diff --git a/baseline_xgb/new_xgb.py b/baseline_xgb/new_xgb.py
@@ -9,6 +9,7 @@
 from sklearn import metrics
 import argparse
 from sklearn.model_selection import train_test_split
+from get_dates_file import tmp
 
 
 # Code for Training and Testing the baseline model, XGBoost, on the data with entered task
@@ -164,17 +165,17 @@ def train(task, tokenizer): # Function to train the model
     else: # preprocess data
         print("preprocessing data...")
         os.makedirs(baseline_folder_path,exist_ok=True)
-        # load original dataset
+        #load original dataset
         train=pd.read_csv(data_dir+'/train.csv')
         dev=pd.read_csv(data_dir+'/dev.csv')
         test=pd.read_csv(data_dir+'/test.csv')
 
-        # recover the original dataframe
-        total=pd.concat([train, dev])
-        total=pd.concat([total, test])
-        print("total shape: ", total.shape)
-        # get rid of rows with nan in labels
-        # total=total[total['DRG_CODE'].isna()==False]
+        #append dataframe
+        total=pd.concat([train,dev])
+        total=pd.concat([total,test])
+        print("total shape: ",total.shape)
+        #get rid of rows with nan in labels
+        total=total[total[col].isna()==False]
 
         # get X and y based on the task
         X_total=total['TEXT']
@@ -201,12 +202,16 @@ def train(task, tokenizer): # Function to train the model
-            if task[0] == "GENDER" or "HOSPITAL_EXPIRE_FLAG	": # Gender and Expire Flag are binary
+            if task[0] in ("GENDER", "HOSPITAL_EXPIRE_FLAG"): # Gender and Expire Flag are binary
                 y_total=y_total.squeeze(axis=1).str.get_dummies().to_numpy()
             elif task[0] == "DOB": # DOB has format of YYYY-MM-DD
-                y_total=y_total.squeeze(axis=1).str.get_dummies(sep='-').to_numpy()
+                # add special token before y/m/d (squeeze to a Series first: tmp expects one date string)
+                y_total=y_total.squeeze(axis=1).apply(tmp)
+                y_total=y_total.str.get_dummies(sep='-').to_numpy()
             else: # other tasks are separated by <CEL>
                 y_total=y_total.squeeze(axis=1).str.get_dummies(sep=' <CEL> ').to_numpy()
 
         else: # Multi-task: Combine all columns into one column and each column is separated by <CEL>
             if "DOB" in task: # DOB has format of YYYY-MM-DD, processed it first
+                # add special token before y/m/d (DOB column only; other label columns are not dates)
+                y_total["DOB"]=y_total["DOB"].apply(tmp)
                 # Replace "-" in DOB with "<CEL>"
                 y_total["DOB"] = y_total["DOB"].str.replace("-", "<CEL>")
 
@@ -320,4 +325,4 @@ def train(task, tokenizer): # Function to train the model
 
 else: # invalid mode
     print("Invalid mode. Please choose from train, predict_train, predict_test")
-    exit()
+    exit()