Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions baseline_xgb/get_dates_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import numpy as np
import pandas as pd
import pickle as pkl
import os

# Load the complete dataset; `total` is the DataFrame the rest of this
# script works on.
total = pd.read_csv(
    '/gpfs/data/oermannlab/project_data/text2table/complete_v2/dataset_v2.csv'
)

def tmp(row):
    """Prefix the year, month and day parts of a date string with Y, M and D.

    The input is split on ``-``; the first three pieces are prefixed with
    ``Y``, ``M`` and ``D`` respectively and all pieces are joined back with
    ``-``.  For example ``"2000-01-02"`` becomes ``"Y2000-M01-D02"``.  Pieces
    after the third are kept unchanged.  Tagging the pieces keeps them
    distinct when the result is later split on ``-`` again (e.g. by
    ``Series.str.get_dummies(sep='-')``), where an untagged month ``01`` and
    day ``01`` would be the same token.

    Args:
        row: A date string with at least three ``-``-separated parts, or a
            missing value (``None`` or float NaN).

    Returns:
        The tagged string, or ``row`` itself when it is a missing value.

    Raises:
        IndexError: If ``row`` is a string with fewer than three parts.
        AttributeError: If ``row`` is any other non-string object.
    """
    # Empty CSV cells are read by pandas as float NaN, which has no `.split`.
    # Pass missing values through untouched so that downstream pandas string
    # methods can treat them as missing instead of this function crashing.
    # (`row != row` is only true for NaN.)
    if row is None or (isinstance(row, float) and row != row):
        return row

    item = row.split('-')
    # Index explicitly (rather than zip) so that malformed input with fewer
    # than three parts still raises IndexError instead of passing silently.
    item[0] = 'Y' + item[0]
    item[1] = 'M' + item[1]
    item[2] = 'D' + item[2]
    return '-'.join(item)
# NOTE(review): everything below runs at import time as well as when the
# file is executed directly (new_xgb.py does `from get_dates_file import tmp`)
# -- consider whether a `__main__` guard is wanted.

# Tag the year/month/day parts of every DOB value with Y/M/D.
total['DOB'] = total['DOB'].apply(tmp)

# One indicator column per distinct '-'-separated token in DOB.
dum = total['DOB'].str.get_dummies(sep='-')

# Make sure the output folder exists before writing into it.
class_dir = 'class_files'
os.makedirs(class_dir, exist_ok=True)

# Persist the labels of the dummified columns.
file_name = 'dates.pkl'
with open(os.path.join(class_dir, file_name), 'wb') as f:
    pkl.dump(dum.columns, f)
23 changes: 14 additions & 9 deletions baseline_xgb/new_xgb.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from sklearn import metrics
import argparse
from sklearn.model_selection import train_test_split
from get_dates_file import tmp


# Code for Training and Testing the baseline model, XGBoost, on the data with entered task
Expand Down Expand Up @@ -164,17 +165,17 @@ def train(task, tokenizer): # Function to train the model
else: # preprocess data
print("preprocessing data...")
os.makedirs(baseline_folder_path,exist_ok=True)
# load original dataset
#load original dataset
train=pd.read_csv(data_dir+'/train.csv')
dev=pd.read_csv(data_dir+'/dev.csv')
test=pd.read_csv(data_dir+'/test.csv')

# recover the original dataframe
total=pd.concat([train, dev])
total=pd.concat([total, test])
print("total shape: ", total.shape)
# get rid of rows with nan in labels
# total=total[total['DRG_CODE'].isna()==False]
#append dataframe
total=pd.concat([train,dev])
total=pd.concat([total,test])
print("total shape: ",total.shape)
#get rid of rows with nan in labels
total=total[total[col].isna()==False]

# get X and y based on the task
X_total=total['TEXT']
Expand All @@ -201,12 +202,16 @@ def train(task, tokenizer): # Function to train the model
if task[0] == "GENDER" or "HOSPITAL_EXPIRE_FLAG ": # Gender and Expire Flag are binary
y_total=y_total.squeeze(axis=1).str.get_dummies().to_numpy()
elif task[0] == "DOB": # DOB has format of YYYY-MM-DD
y_total=y_total.squeeze(axis=1).str.get_dummies(sep='-').to_numpy()
# add special token before y/m/d
y_total=y_total.apply(lambda x:tmp(x))
y_total=y_total.str.get_dummies(sep='-').to_numpy()
else: # other tasks are separated by <CEL>
y_total=y_total.squeeze(axis=1).str.get_dummies(sep=' <CEL> ').to_numpy()

else: # Multi-task: Combine all columns into one column and each column is separated by <CEL>
if "DOB" in task: # DOB has format of YYYY-MM-DD, processed it first
# add special token before y/m/d
y_total=y_total.apply(lambda x:tmp(x))
# Replace "-" in DOB with "<CEL>"
y_total["DOB"] = y_total["DOB"].str.replace("-", "<CEL>")

Expand Down Expand Up @@ -320,4 +325,4 @@ def train(task, tokenizer): # Function to train the model

else: # invalid mode
print("Invalid mode. Please choose from train, predict_train, predict_test")
exit()
exit()