library(dplyr)
library(ggplot2)
library(caret)
library(nnet)
library(MLmetrics)

#This script fits a multinomial logistic regression model for length of stay (LOS)
local <- getwd()

#load cleaned data from the CleanDataProject script
load(file = paste0(local, "/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/cleanData.RData"))

#we use an interpretable transformation of LOS
#breaks = c(0, 0.5, 1, 5, 20, 117),
#labels = c("twelve_hours", "twentyfour_hours", "few_days", "many_days", "extended_stay"),

#interpretable transformation, take 2
#breaks = c(0, 1, 2, 3, 5, 102),
#labels = c("twentyfour_hours", "fourtyeight_hours", "seventytwo_hours", "few_days", "many_days"),
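
#For reference, a minimal sketch of how these bins could be built with cut(),
#assuming a numeric LOS column in days (hypothetically named `los` here,
#presumably created in CleanDataProject); kept commented out like the breaks above:
#noNeonateData$InterpLos <- cut(noNeonateData$los,
#                               breaks = c(0, 0.5, 1, 5, 20, 117),
#                               labels = c("twelve_hours", "twentyfour_hours", "few_days", "many_days", "extended_stay"),
#                               include.lowest = TRUE)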

#look at a contingency table of the two variables that interest us
table(noNeonateData$InterpLos, noNeonateData$binaryLang)

#check the significance of the association between the two categorical variables
chisq.test(noNeonateData$InterpLos, noNeonateData$binaryLang)
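
#A quick way to see which cells drive a significant result is to inspect the
#standardized residuals of the test (values beyond roughly +/-2 stand out);
#a minimal sketch, with the object name our own:
losLang_chisq <- chisq.test(noNeonateData$InterpLos, noNeonateData$binaryLang)
round(losLang_chisq$stdres, 2)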

################
#caret addition
################
#the multinom function itself doesn't require a tuning parameter, but if we want to apply
#regularized regression we can tune one through caret, which under the hood uses nnet

#Bottom line though: it doesn't improve the model
trainControl_MNL <- trainControl(method = "cv", #cross validation resampling method
                                 number = 10, #number of resampling iterations
                                 search = "grid",
                                 classProbs = TRUE,
                                 summaryFunction = multiClassSummary) #alternative performance summaries

tuneGrid_MNL <- expand.grid(decay = seq(0, 1, by = 0.1)) #11 values for decay,
#the regularization parameter, tuned to avoid over-fitting

#set the seed so the train/test partition is always the same
set.seed(2612)

#we use caret's createDataPartition because it preserves the initial proportions of the outcome
#variable in both the training and test sets
data.index <- caret::createDataPartition(noNeonateData$InterpLos,
                                         p = 0.7, #the percentage of data that goes to training
                                         list = FALSE) #return a matrix of indices rather than a list
#separate into train and test
train_data <- noNeonateData[data.index, ]

test_data <- noNeonateData[-data.index, ]
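
#Sanity check (a minimal sketch): the outcome class proportions should be nearly
#identical in both sets, since createDataPartition samples within each class
round(prop.table(table(train_data$InterpLos)), 3)
round(prop.table(table(test_data$InterpLos)), 3)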

###################
#caret continuation
###################
#MNL model, including the decay tuning parameter
MNL_model <- caret::train(InterpLos ~ gender + binaryLang +
                            first_admit_age + simpleEthnic +
                            insurance + sofa + sapsii,
                          method = "multinom",
                          data = train_data,
                          maxit = 100,
                          trace = FALSE, #suppress the iteration output
                          tuneGrid = tuneGrid_MNL, #a df with columns for each tuning parameter
                          trControl = trainControl_MNL)
#get the best value for decay
MNL_model$bestTune

#get the AUC and accuracy for each decay
MNL_model$results %>% select(decay, AUC, Accuracy)
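
#ggplot2 is already loaded, so caret's ggplot method for train objects can
#visualize how performance changes across the decay grid; a minimal sketch:
ggplot(MNL_model, metric = "Accuracy")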

#predict on the test data and get a confusion matrix
caret::confusionMatrix(predict(MNL_model,
                               newdata = test_data,
                               type = "raw"),
                       reference = test_data$InterpLos)
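
#confusionMatrix also returns per-class metrics, which make the failing class
#visible directly; a minimal sketch (the cm_test object name is our own):
cm_test <- caret::confusionMatrix(predict(MNL_model, newdata = test_data, type = "raw"),
                                  reference = test_data$InterpLos)
cm_test$byClass[, c("Sensitivity", "Specificity", "Balanced Accuracy")]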


#Conclusion: even with the additional parameters I get the same accuracy, and the model can't predict 24 hours!
#####################
##MNL model with nnet
#####################
#MNL model using nnet directly, with parameters
MNL_model <- multinom(TakeTwo_InterpLos ~ gender + binaryLang +
                        first_admit_age + simpleEthnic +
                        insurance + sofa + sapsii,
                      data = train_data)

#get the summary of the model
summary(MNL_model)
#the reported residual deviance is the final negative log-likelihood multiplied by two

#exponentiate the coefficients to get relative risk ratios
exp(coef(MNL_model))
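
#nnet provides a confint method for multinom objects, so the relative risk
#ratios can also be reported with confidence intervals; a minimal sketch:
exp(confint(MNL_model, level = 0.95))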

#fitted class probabilities for the training data
head(prob.tableTrain <- fitted(MNL_model))

#################################
#check accuracy for training data
#################################
train_data$predicted <- predict(MNL_model, newdata = train_data, "class")

cm_tableTrain <- table(train_data$TakeTwo_InterpLos, train_data$predicted, dnn = c("actual", "predicted"))

accuracyTrain <- round((sum(diag(cm_tableTrain))/sum(cm_tableTrain))*100, 2)
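
#A useful reference point (a minimal sketch): the accuracy of always predicting
#the majority class; the model should beat this no-information baseline
round(max(table(train_data$TakeTwo_InterpLos)) / nrow(train_data) * 100, 2)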

################################
#check accuracy for testing data
################################
test_data$predicted <- predict(MNL_model, newdata = test_data, "class")

cm_tableTest <- table(test_data$TakeTwo_InterpLos, test_data$predicted,
                      dnn = c("actual", "predicted"))

accuracyTest <- round((sum(diag(cm_tableTest))/sum(cm_tableTest))*100, 2)


#since both come out about the same, with ~70% accuracy, let's take just the training set and examine it more closely
#get the summary of the model
train_summary <- summary(MNL_model)

#calculate z-statistics and p-values
z <- train_summary$coefficients/train_summary$standard.errors
p <- (1 - pnorm(abs(z), 0, 1))*2 #two-tailed z test

#separate each "length of stay" level to display all of the details
los_12h <- rbind(train_summary$coefficients[1, ], train_summary$standard.errors[1, ], z[1, ], p[1, ])
rownames(los_12h) <- c("Coefficient", "Std. Errors", "z stat", "p value")
los_12h <- as.data.frame(round(t(los_12h), 4))
#write.csv(los_12h, file = paste0(local, "/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/los12h_summaryStatistics.csv"))

los_24h <- rbind(train_summary$coefficients[2, ], train_summary$standard.errors[2, ], z[2, ], p[2, ])
rownames(los_24h) <- c("Coefficient", "Std. Errors", "z stat", "p value")
los_24h <- as.data.frame(round(t(los_24h), 4))
#write.csv(los_24h, file = paste0(local, "/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/los24h_summaryStatistics.csv"))


many_days <- rbind(train_summary$coefficients[3, ], train_summary$standard.errors[3, ], z[3, ], p[3, ])
rownames(many_days) <- c("Coefficient", "Std. Errors", "z stat", "p value")
many_days <- as.data.frame(round(t(many_days), 4))
#write.csv(many_days, file = paste0(local, "/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/losmany_days_summaryStatistics.csv"))


extended_stay <- rbind(train_summary$coefficients[4, ], train_summary$standard.errors[4, ], z[4, ], p[4, ])
rownames(extended_stay) <- c("Coefficient", "Std. Errors", "z stat", "p value")
extended_stay <- as.data.frame(round(t(extended_stay), 4))
#write.csv(extended_stay, file = paste0(local, "/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/losextended_stay_summaryStatistics.csv"))
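
#The four blocks above repeat the same pattern; a small helper could generalize
#it (a sketch, with the function name our own):
losCoefSummary <- function(i) {
  out <- rbind(train_summary$coefficients[i, ], train_summary$standard.errors[i, ], z[i, ], p[i, ])
  rownames(out) <- c("Coefficient", "Std. Errors", "z stat", "p value")
  as.data.frame(round(t(out), 4))
}
#e.g. losCoefSummary(1) reproduces los_12h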