#adding scripts to repo

rbenel · rbenel · commit 824c80f63855 · 2022-02-14T16:53:28.000+02:00
diff --git a/ModelsProejct.R b/ModelsProejct.R
@@ -0,0 +1,80 @@
+library(nnet) #allows for multinom glm
+library(stargazer) #for pvalue for the multinom glm
+library(knitr)
+library(quantreg) #for quantile regression
+
+#this is the second script for this project
+
+#load cleaned data from CleanDataProject script 
+load(file = paste0("/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/cleanData.RData"))
+
+################################
+#Try to fit a multinomial model
+################################
+#run a multinomial model because the dependent variable, losQuantile, is categorical (septiles)
+fit.uniqueData <- nnet::multinom(losQuantile ~ gender + binaryLang + 
+                        first_admit_age  + simpleEthnic + marital_status + 
+                        insurance + sofa + sapsii, data = noNeonateData) 
+#the summary has coefficients and standard errors but no p-values, so those are calculated further down
+summary.output <- summary(fit.uniqueData)
+
+#predict the dependent variable for the same data the model was fitted on (the result is only printed)
+predict(fit.uniqueData, noNeonateData)
+
+#try to reduce the model with step(); the reduced fit is stored but not used again in this script
+fit.uniqueData.reduced <- step(fit.uniqueData)
+
+#cross-tabulate predicted classes (rows) against observed losQuantile (columns)
+misCalcError <- table(predict(fit.uniqueData), noNeonateData$losQuantile)
+print(misCalcError)
+
+#misclassification rate: 1 - (diagonal = correct predictions) / (all predictions), i.e. how often the model is wrong
+1-sum(diag(misCalcError))/sum(misCalcError)
+
+#z statistics are simply ratios of model coefficients and standard errors
+z <- summary.output$coefficients/summary.output$standard.errors
+#two-tailed p-values from the standard normal; the upper tail is requested directly
+p <- 2 * pnorm(abs(z), lower.tail = FALSE) #1 - pnorm() would round to exactly 0 for large |z|
+
+#table for coefficient row 1 (originally labelled "first quantile"; NOTE(review): confirm which losQuantile class each row index is)
+Pclass1 <- rbind(summary.output$coefficients[1,],summary.output$standard.errors[1,],z[1,],p[1,])
+rownames(Pclass1) <- c("Coefficient","Std. Errors","z stat","p value")
+knitr::kable(Pclass1)
+
+#table for coefficient row 2 (originally labelled "third quantile"; see the note on row 1)
+Pclass3 <- rbind(summary.output$coefficients[2,],summary.output$standard.errors[2,],z[2,],p[2,])
+rownames(Pclass3) <- c("Coefficient","Std. Errors","z stat","p value")
+knitr::kable(Pclass3)
+
+#table for coefficient row 3 (originally labelled "fourth quantile"; see the note on row 1)
+Pclass4 <- rbind(summary.output$coefficients[3,],summary.output$standard.errors[3,],z[3,],p[3,])
+rownames(Pclass4) <- c("Coefficient","Std. Errors","z stat","p value")
+knitr::kable(Pclass4)
+
+########################################
+#Try to fit a quantile regression model
+########################################
+quantreg25 <- rq(los ~ gender + binaryLang + 
+                   first_admit_age  + simpleEthnic + marital_status + 
+                   insurance + sofa + sapsii, data = noNeonateData, tau = 0.25) #first quartile
+summary(quantreg25)
+
+quantreg50 <- rq(los ~ gender + binaryLang + 
+                   first_admit_age  + simpleEthnic + marital_status + 
+                   insurance + sofa + sapsii, data = noNeonateData, tau = 0.50) #median
+summary(quantreg50)
+
+quantreg75 <- rq(los ~ gender + binaryLang + 
+                   first_admit_age  + simpleEthnic + marital_status + 
+                   insurance + sofa + sapsii, data = noNeonateData, tau = 0.75) #third quartile
+summary(quantreg75)
+#compare the three fits; joint = FALSE asks for a separate test per slope rather than one joint test
+anova(quantreg25, quantreg50, quantreg75, joint = FALSE)
+
+#fit a grid of quantiles (0.05 to 0.95 in steps of 0.05) so the coefficients can be plotted across tau
+quantreg.all <- rq(los ~ gender + binaryLang + 
+                   first_admit_age  + simpleEthnic + marital_status + 
+                   insurance + sofa + sapsii, data = noNeonateData, tau = seq(0.05, 0.95, by = 0.05))
+
+quantreg.plot <- summary(quantreg.all)
+plot(quantreg.plot)
diff --git a/README.md b/README.md
@@ -1,3 +1,7 @@
-#MIMICII DB
+# MIMICII DB
 
-this is a test, test #2
+Project completed as part of the course requirements for "Machine Learning:Medicine"
+
+Order of the analysis 
+
+### 
diff --git a/ashtin_tutoiral.R b/ashtin_tutoiral.R
@@ -0,0 +1,73 @@
+library(ggcorrplot)
+library(caret)
+
+#this is the fourth script for this project, based on Ashtin's tutorial 
+
+#load cleaned data from CleanDataProject script 
+load(file = paste0("/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/cleanData.RData"))
+
+table(noNeonateData$logLOS)
+
+#logLOS levels 4 and 5 are such small groups that level 5 is merged into level 4
+noNeonateData$logLOS[noNeonateData$logLOS == 5] <- 4
+                            
+table(noNeonateData$logLOS)
+
+#predictor variables, check correlations between numeric variables
+#cormat is only a display copy; the plot below recomputes the correlations from the data
+cormat <- round(cor(as.matrix(noNeonateData[, c("first_admit_age", "sofa", "sapsii", "logLOS")])), 2)
+cormat[upper.tri(cormat)] <- "" #blank the upper triangle; this turns cormat into a character matrix
+#cormat <- as.data.frame(cormat) %>% select(-logLOS)
+
+ggcorrplot::ggcorrplot(round(cor(as.matrix(noNeonateData[, c("first_admit_age", "sofa", "sapsii", "logLOS")])), 2), 
+                       p.mat = ggcorrplot::cor_pmat(as.matrix(noNeonateData[, c("first_admit_age", "sofa", "sapsii", "logLOS")])),
+                       hc.order = TRUE, 
+                       #type = "lower",
+                       outline.col = "white",
+                       ggtheme = ggplot2::theme_minimal,
+                       colors = c("#cf222c", "white", "#3a2d7f")
+                        )
+#correlation matrix for the four columns above (output pasted by the author)
+# first_admit_age  sofa sapsii logLOS
+# first_admit_age            1.00 -0.04   0.29   0.02
+# sofa                      -0.04  1.00   0.32   0.21
+# sapsii                     0.29  0.32   1.00   0.17
+# logLOS                     0.02  0.21   0.17   1.00
+
+
+######################
+#TRAIN AND TEST DATA
+######################
+set.seed(1234) #set seed so we always get the same train/test sample
+train <- sample(nrow(noNeonateData), 0.7*nrow(noNeonateData)) #row indices for a 70% training sample
+train.df <- noNeonateData[train, ] #training rows
+
+test.df <- noNeonateData[-train, ] #every row not drawn for training goes to the test set
+#compare the logLOS distribution in the two sets
+table(train.df$logLOS)
+table(test.df$logLOS)
+
+########################
+#linear regression model
+#######################
+#create a model with the variables we are interested in looking at (caret alternative kept for reference)
+# model.lm <- caret::train(logLOS ~ gender + binaryLang + 
+#                            first_admit_age  + simpleEthnic + marital_status + 
+#                            insurance + sofa + sapsii, 
+#                            data = train.df, 
+#                            method = "lm")
+
+#fit on the training rows only, so the test.df predictions below are genuinely out-of-sample
+model.lm <- lm(logLOS ~ gender + binaryLang + 
+                 first_admit_age  + simpleEthnic + marital_status + 
+                 insurance + sofa + sapsii, 
+               data = train.df)
+summary(model.lm)
+step(model.lm)
+prob <- predict(model.lm, test.df)
+#see ML workshop hw#3 question #3
+#lm predictions are continuous, so round them to the nearest logLOS level before cross-tabulating
+table(test.df$logLOS, round(prob),
+      dnn=c("Actual", "Predicted"))
+
+
diff --git a/getTrialData.sql b/getTrialData.sql
@@ -0,0 +1,32 @@
+WITH first_admission_time AS -- one row per distinct combination of the GROUP BY columns below
+(
+  SELECT DISTINCT mimiciii.patients.subject_id,
+      mimiciii.patients.dob, mimiciii.patients.gender, mimiciii.admissions.language, mimiciii.admissions.marital_status, mimiciii.admissions.ethnicity,mimiciii.admissions.insurance, mimiciii.icustays.los, mimiciii.icustays.icustay_id
+      , EXTRACT(EPOCH FROM outtime - intime)/60.0/60.0/24.0 as icu_length_of_stay -- seconds between intime and outtime converted to days
+      , MIN (mimiciii.admissions.admittime) AS first_admittime -- NOTE(review): minimum within each GROUP BY group, which also groups on admission attributes; confirm this is the patient's first admission
+      , MIN( ROUND( (cast(mimiciii.admissions.admittime as date) - cast(mimiciii.patients.dob as date)) / 365.242,2) )
+          AS first_admit_age -- date difference divided by 365.242 (presumably days to years), rounded to 2 decimals
+  FROM mimiciii.patients
+  INNER JOIN mimiciii.admissions
+  ON mimiciii.patients.subject_id = mimiciii.admissions.subject_id
+  INNER JOIN mimiciii.icustays
+  on mimiciii.icustays.subject_id = mimiciii.admissions.subject_id -- NOTE(review): joined on subject_id only, so every admission pairs with every ICU stay of that patient; confirm an admission-level key is not intended
+   WHERE mimiciii.admissions.language IS NOT NULL -- admissions without a recorded language are excluded
+  GROUP BY mimiciii.patients.subject_id, mimiciii.patients.dob, mimiciii.patients.gender, mimiciii.admissions.language, mimiciii.admissions.marital_status, mimiciii.admissions.ethnicity,mimiciii.admissions.insurance, mimiciii.icustays.los, mimiciii.icustays.icustay_id, mimiciii.icustays.outtime, mimiciii.icustays.intime
+  ORDER BY mimiciii.patients.subject_id
+)
+SELECT
+    subject_id, icustay_id, dob,  gender, language, marital_status, ethnicity, insurance, los, icu_length_of_stay
+    , first_admittime, first_admit_age
+    , CASE -- branches are evaluated in order; the first match wins
+        -- all ages > 89 in the database were replaced with 300
+        WHEN first_admit_age > 89
+            then '>89'
+        WHEN first_admit_age >= 14
+            THEN 'adult'
+        WHEN first_admit_age <= 1
+            THEN 'neonate'
+        ELSE 'middle' -- ages above 1 and below 14
+        END AS age_group
+FROM first_admission_time
+ORDER BY subject_id
diff --git a/graphsProjectPresentation.R b/graphsProjectPresentation.R
@@ -0,0 +1,36 @@
+#this is the third script for this project
+#I want to make graphs for the project presentation
+library(ggplot2) #ggplot(), the geoms, theme() and scale_fill_manual() used below come from ggplot2
+
+#load cleaned data from CleanDataProject script 
+load(file = paste0("/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/cleanData.RData"))
+
+#density plot of length of stay by language group (binaryLang)
+#for the purpose of the plot, anything that is a log of 0 will be -Inf, so let's add 1 to all of the values. 
+#this graph uses the *original* los values!!
+plot <- ggplot(noNeonateData, aes(log(los+1), fill = binaryLang)) + geom_density(alpha = 0.35) +
+  xlab("Length of Stay (log)") +
+  theme(panel.background = element_blank()) +
+  theme(axis.line.x = element_line(color="black", size = 0.5),
+        axis.line.y = element_line(color="black", size = 0.5)) 
+plot + scale_fill_manual(values= c("#00bfc4", "#F8766D")) + guides(fill=guide_legend(title=" "))
+
+
+#density-scaled histogram of log(los+1), one semi-transparent overlaid layer per binaryLang level
+hist <- ggplot(noNeonateData, aes(log(los+1), fill = binaryLang)) + 
+  geom_histogram(alpha = 0.5, aes(y = ..density..), colour="black", position = 'identity', binwidth = 0.35) +
+  xlab("Length of Stay (log)") + ylab("") +
+  theme(legend.title = element_blank()) +
+  theme(legend.text = element_text(colour="black", size = 20, face = "plain")) +
+  theme( axis.title.x = element_text(family="sans",size = 20, face="bold", hjust=0.5, vjust=-0.5),
+         axis.title.y = element_text(family="sans",size = 20, angle=90, face="bold", hjust=0.5, vjust=1)) +
+  theme( axis.text.x = element_text(family = "sans",size = 14, angle=0, face='plain', colour="#353535",   hjust=1, vjust=1) ) +
+  theme( axis.text.y = element_text(family = "sans",size = 14, face='plain', colour="#353535",  vjust=0.5) ) +
+  theme(axis.line.x = element_line(color="black", size = 0.5),
+        axis.line.y = element_line(color="black", size = 0.5)) +
+  theme(legend.background = element_rect()) + 
+  theme(legend.position="top") +
+  theme(panel.border = element_blank(), panel.grid.major = element_blank(),
+        panel.grid.minor = element_blank(),  panel.background = element_blank()) 
+#draw the histogram with a blank legend title and the two manual fill colours
+hist + guides(fill=guide_legend(title=" ")) + scale_fill_manual(values= c("#00bfc4", "#F8766D"))
diff --git a/multinomModel.R b/multinomModel.R
@@ -0,0 +1,156 @@
+library(dplyr)
+library(ggplot2)
+library(caret)
+library(nnet)
+library(MLmetrics)
+
+#this is a script for a regression model
+local <- getwd() #NOTE(review): the load path below is prefixed with this working directory, unlike the other project scripts which load the bare "/Bigdata/..." path; confirm
+
+#load cleaned data from CleanDataProject script 
+load(file = paste0(local, "/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/cleanData.RData"))
+
+#we will use here an interpretable transformation of LOS
+#breaks = c(0, 0.5, 1, 5, 20, 117),
+#labels = c("twelve_hours", "twentyfour_hours", "few_days", "many_days", "extended_stay"),
+
+#interpretable take 2 
+#breaks = c(0, 1, 2, 3, 5, 102),
+#labels = c("twentyfour_hours", "fourtyeight_hours", "seventytwo_hours", "few_days", "many_days"),
+
+#cross-tabulate the two variables that interest us: LOS category by language group
+table(noNeonateData$InterpLos, noNeonateData$binaryLang)
+
+#check significance of the association between the two categorical variables (chi-squared test)
+chisq.test(noNeonateData$InterpLos, noNeonateData$binaryLang)
+
+################
+#caret addition
+################
+#the MNL (multinom) function itself doesn't require a tuning parameter, but if we want regularized regression
+#we can tune one through caret, which under the hood is using nnet
+
+#Bottom line though: it doesn't improve the model
+trainControl_MNL <- trainControl(method = "cv", #cross validation resampling method
+                                 number = 10,   #number of resampling iterations
+                                 search = "grid",
+                                 classProbs = TRUE,
+                                 summaryFunction = multiClassSummary) #alternative performance summaries
+
+tuneGrid_MNL <- expand.grid(decay = seq(0, 1, by = 0.1)) #11 values for decay
+#decay is the regularization parameter, used to avoid over-fitting
+
+#set seed so the partition we use for training and test will always be the same
+set.seed(2612) 
+
+#we use caret's partition function because it keeps the initial proportions of the variable we are interested in
+#for both the test and train 
+data.index <- caret::createDataPartition(noNeonateData$InterpLos,
+                                         p = 0.7, #the proportion of data that goes to training
+                                         list =FALSE) #the default returns a list; FALSE is requested so the result can index rows directly
+#separate into train and test
+train_data <- noNeonateData[data.index, ]
+
+test_data <- noNeonateData[-data.index, ]
+
+###################
+#caret continuation
+###################
+#MNL model tuned over the decay grid defined above
+MNL_model <- caret::train(InterpLos ~ gender + binaryLang +
+                        first_admit_age  + simpleEthnic +
+                        insurance + sofa + sapsii,
+                        method = "multinom",
+                        data = train_data,
+                        maxit = 100,
+                        trace = FALSE, #we don't want to output the iterations
+                        tuneGrid = tuneGrid_MNL, #a df with a column for each tuning parameter
+                        trControl = trainControl_MNL)
+#get best value for decay
+MNL_model$bestTune
+
+#get the AUC and accuracy for each decay
+MNL_model$results %>% select(decay, AUC, Accuracy)
+
+#predict the test data and get a confusion matrix against the observed InterpLos
+caret::confusionMatrix(predict(MNL_model,
+                               newdata = test_data,
+                               type = "raw"),
+                       reference = test_data$InterpLos)
+
+
+#Conclusion: even with the additional parameters I get the same accuracy, and the model can't predict 24 hours!
+#####################
+##MNL model with nnet
+#####################
+# MNL model using nnet directly; NOTE: this reuses the name MNL_model (replacing the caret fit) and models TakeTwo_InterpLos
+MNL_model <- multinom(TakeTwo_InterpLos ~ gender + binaryLang +
+                            first_admit_age  + simpleEthnic +
+                            insurance + sofa + sapsii,
+                            data = train_data)
+
+#get the summary of the model
+summary(MNL_model)
+#the reported residual deviance is the final negative log-likelihood multiplied by two
+
+#exponentiate the coefficients extracted from the model
+exp(coef(MNL_model))
+
+head(prob.tableTrain <- fitted(MNL_model)) #store the fitted values for the training data and show the first rows
+
+##################################
+#check accuracy for training data
+###################################
+train_data$predicted <- predict(MNL_model, newdata = train_data, "class")
+
+cm_tableTrain <- table(train_data$TakeTwo_InterpLos, train_data$predicted, dnn = c("actual", "predicted"))
+#accuracy in percent: correct predictions (diagonal) over all predictions
+accuracyTrain <- round((sum(diag(cm_tableTrain))/sum(cm_tableTrain))*100,2)
+
+################################
+#check accuracy for testing data
+###############################
+test_data$predicted <- predict(MNL_model, newdata = test_data, "class")
+
+cm_tableTest <- table(test_data$TakeTwo_InterpLos, test_data$predicted, 
+                      dnn = c("actual", "predicted"))
+#accuracy in percent, computed the same way as for the training data
+accuracyTest <- round((sum(diag(cm_tableTest))/sum(cm_tableTest))*100,2)
+
+
+#since they both come out about the same, with a 70% accuracy, let's take just the training set and examine closer
+#get the summary of the model
+train_summary <- summary(MNL_model)
+
+#calculate z-statistics (coefficient / standard error) and two-tailed p-values
+z <- train_summary$coefficients/train_summary$standard.errors
+p <- 2 * pnorm(abs(z), lower.tail = FALSE) #upper tail taken directly; 1 - pnorm() rounds to exactly 0 for large |z|
+#NOTE(review): the object names below match the first InterpLos labels, but MNL_model is fitted on TakeTwo_InterpLos; confirm which class each coefficient row is
+#separate each "length of stay" row to display all of the details (one predictor per row after t())
+los_12h <- rbind(train_summary$coefficients[1, ], train_summary$standard.errors[1, ], z[1, ], p[1, ])
+rownames(los_12h) <- c("Coefficient","Std. Errors","z stat","p value")
+los_12h <- as.data.frame(round(t(los_12h),4))
+#write.csv(los_12h, file = paste0(local, "/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/los12h_summaryStatistics.csv"))
+
+los_24h <- rbind(train_summary$coefficients[2, ], train_summary$standard.errors[2, ], z[2, ], p[2, ])
+rownames(los_24h) <- c("Coefficient","Std. Errors","z stat","p value")
+los_24h <- as.data.frame(round(t(los_24h),4))
+#write.csv(los_24h, file = paste0(local, "/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/los24h_summaryStatistics.csv"))
+
+
+many_days <- rbind(train_summary$coefficients[3, ], train_summary$standard.errors[3, ], z[3, ], p[3, ])
+rownames(many_days) <- c("Coefficient","Std. Errors","z stat","p value")
+many_days <- as.data.frame(round(t(many_days),4))
+#write.csv(many_days, file = paste0(local, "/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/losmany_days_summaryStatistics.csv"))
+
+
+extended_stay <- rbind(train_summary$coefficients[4, ], train_summary$standard.errors[4, ], z[4, ], p[4, ])
+rownames(extended_stay) <- c("Coefficient","Std. Errors","z stat","p value")
+extended_stay <- as.data.frame(round(t(extended_stay),4))
+#write.csv(extended_stay, file = paste0(local, "/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/losextended_stay_summaryStatistics.csv"))
+
+
+
+
+
+
diff --git a/randomForest.R b/randomForest.R
diff --git a/willieDurationsStratifiedSOL.R b/willieDurationsStratifiedSOL.R