Skip to content

Commit 824c80f

Browse files
committed
#adding scripts to repo
1 parent b4c4aed commit 824c80f

8 files changed

+623
-2
lines changed

ModelsProejct.R

+80
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
library(nnet) #allows for multinom glm
2+
library(stargazer) #for pvalue for the multinom glm
3+
library(knitr)
4+
library(quantreg) #for quantile regression
5+
6+
#this is the second script for this project
7+
8+
#load cleaned data from CleanDataProject script
9+
load(file = paste0("/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/cleanData.RData"))
10+
11+
################################
12+
#Try to fit a multinomial model
13+
################################
14+
#run a multinom model, because we have septiles as the dependent variable.
15+
fit.uniqueData <- nnet::multinom(losQuantile ~ gender + binaryLang +
16+
first_admit_age + simpleEthnic + marital_status +
17+
insurance + sofa + sapsii, data = noNeonateData)
18+
#but this function doesn't have p-values so we need to calculate them
19+
summary.output <- summary(fit.uniqueData)
20+
21+
#predict the dependent variable based off of the model used
22+
predict(fit.uniqueData, noNeonateData)
23+
24+
#try to reduce the model?
25+
fit.uniqueData.reduced <- step(fit.uniqueData)
26+
27+
#miscalculation error
28+
misCalcError <- table(predict(fit.uniqueData), noNeonateData$losQuantile)
29+
print(misCalcError)
30+
31+
#find out percentage of time that the model is correct
32+
1-sum(diag(misCalcError))/sum(misCalcError)
33+
34+
#Z statistics are simply ratios of model coefficients and standard errors
35+
z <- summary.output$coefficients/summary.output$standard.errors
36+
#we can get the p-values using the standard normal distribution.
37+
p <- (1 - pnorm(abs(z), 0, 1))*2 # we are using two-tailed z test
38+
39+
#make a table for the first quantile
40+
Pclass1 <- rbind(summary.output$coefficients[1,],summary.output$standard.errors[1,],z[1,],p[1,])
41+
rownames(Pclass1) <- c("Coefficient","Std. Errors","z stat","p value")
42+
knitr::kable(Pclass1)
43+
44+
#make a table for the third quantile
45+
Pclass3 <- rbind(summary.output$coefficients[2,],summary.output$standard.errors[2,],z[2,],p[2,])
46+
rownames(Pclass3) <- c("Coefficient","Std. Errors","z stat","p value")
47+
knitr::kable(Pclass3)
48+
49+
#make a table for the fourth quantile
50+
Pclass4 <- rbind(summary.output$coefficients[3,],summary.output$standard.errors[3,],z[3,],p[3,])
51+
rownames(Pclass4) <- c("Coefficient","Std. Errors","z stat","p value")
52+
knitr::kable(Pclass4)
53+
54+
########################################
55+
#Try to fit a quantile regression model
56+
########################################
57+
quantreg25 <- rq(los ~ gender + binaryLang +
58+
first_admit_age + simpleEthnic + marital_status +
59+
insurance + sofa + sapsii, data = noNeonateData, tau = 0.25)
60+
summary(quantreg25)
61+
62+
quantreg50 <- rq(los ~ gender + binaryLang +
63+
first_admit_age + simpleEthnic + marital_status +
64+
insurance + sofa + sapsii, data = noNeonateData, tau = 0.50)
65+
summary(quantreg50)
66+
67+
quantreg75 <- rq(los ~ gender + binaryLang +
68+
first_admit_age + simpleEthnic + marital_status +
69+
insurance + sofa + sapsii, data = noNeonateData, tau = 0.75)
70+
summary(quantreg75)
71+
72+
anova(quantreg25, quantreg50, quantreg75, joint = F)
73+
74+
#plotting data
75+
quantreg.all <- rq(los ~ gender + binaryLang +
76+
first_admit_age + simpleEthnic + marital_status +
77+
insurance + sofa + sapsii, data = noNeonateData, tau = seq(0.05, 0.95, by = 0.05))
78+
79+
quantreg.plot <- summary(quantreg.all)
80+
plot(quantreg.plot)

README.md

+6-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1-
#MIMICII DB
1+
# MIMICII DB
22

3-
this is a test, test #2
3+
Project completed as part of the course requirments for "Machine Learning:Medicine"
4+
5+
Order of the analysis
6+
7+
###

ashtin_tutoiral.R

+73
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
library(ggcorrplot)
2+
library(caret)
3+
4+
#this is the fourth script for this project, based on Ashtin's tutorial
5+
6+
#load cleaned data from CleanDataProject script
7+
load(file = paste0("/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/cleanData.RData"))
8+
9+
table(noNeonateData$logLOS)
10+
11+
#as log of 4 and 5 are such small groups, I think it's best to combine them...
12+
noNeonateData$logLOS[noNeonateData$logLOS == 5] <- 4
13+
14+
table(noNeonateData$logLOS)
15+
16+
#predictor variables, check correlations between numeric variables
17+
18+
cormat <- round(cor(as.matrix(noNeonateData[, c("first_admit_age", "sofa", "sapsii", "logLOS")])), 2)
19+
cormat[upper.tri(cormat)] <- ""
20+
#cormat <- as.data.frame(cormat) %>% select(-logLOS)
21+
22+
ggcorrplot::ggcorrplot(round(cor(as.matrix(noNeonateData[, c("first_admit_age", "sofa", "sapsii", "logLOS")])), 2),
23+
p.mat = ggcorrplot::cor_pmat(as.matrix(noNeonateData[, c("first_admit_age", "sofa", "sapsii", "logLOS")])),
24+
hc.order = TRUE,
25+
#type = "lower",
26+
outline.col = "white",
27+
ggtheme = ggplot2::theme_minimal,
28+
colors = c("#cf222c", "white", "#3a2d7f")
29+
)
30+
31+
# first_admit_age sofa sapsii logLOS
32+
# first_admit_age 1.00 -0.04 0.29 0.02
33+
# sofa -0.04 1.00 0.32 0.21
34+
# sapsii 0.29 0.32 1.00 0.17
35+
# logLOS 0.02 0.21 0.17 1.00
36+
37+
38+
######################
39+
#TRAIN AND TEST DATA
40+
######################
41+
set.seed(1234) #set seed so we always get the same sample train/test
42+
train <- sample(nrow(noNeonateData), 0.7*nrow(noNeonateData)) #get 70%
43+
train.df <- noNeonateData[train, ] #divide the data
44+
45+
test.df <- noNeonateData[-train, ] #everything we didnt take in the training place into test
46+
47+
table(train.df$logLOS)
48+
table(test.df$logLOS)
49+
50+
########################
51+
#linear regression model
52+
#######################
53+
#create a model with the variables we are interested in looking at
54+
# model.lm <- caret::train(logLOS ~ gender + binaryLang +
55+
# first_admit_age + simpleEthnic + marital_status +
56+
# insurance + sofa + sapsii,
57+
# data = train.df,
58+
# method = "lm")
59+
60+
#which model is correct???
61+
model.lm <- lm(logLOS ~ gender + binaryLang +
62+
first_admit_age + simpleEthnic + marital_status +
63+
insurance + sofa + sapsii,
64+
data = noNeonateData)
65+
summary(model.lm)
66+
step(model.lm)
67+
prob <- predict(model.lm, test.df)
68+
69+
#see ML workshop hw#3 question #3
70+
table(test.df$logLOS, prob,
71+
dnn=c("Actual", "Predicted"))
72+
73+

getTrialData.sql

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
WITH first_admission_time AS
2+
(
3+
SELECT DISTINCT mimiciii.patients.subject_id,
4+
mimiciii.patients.dob, mimiciii.patients.gender, mimiciii.admissions.language, mimiciii.admissions.marital_status, mimiciii.admissions.ethnicity,mimiciii.admissions.insurance, mimiciii.icustays.los, mimiciii.icustays.icustay_id
5+
, EXTRACT(EPOCH FROM outtime - intime)/60.0/60.0/24.0 as icu_length_of_stay
6+
, MIN (mimiciii.admissions.admittime) AS first_admittime
7+
, MIN( ROUND( (cast(mimiciii.admissions.admittime as date) - cast(mimiciii.patients.dob as date)) / 365.242,2) )
8+
AS first_admit_age
9+
FROM mimiciii.patients
10+
INNER JOIN mimiciii.admissions
11+
ON mimiciii.patients.subject_id = mimiciii.admissions.subject_id
12+
INNER JOIN mimiciii.icustays
13+
on mimiciii.icustays.subject_id = mimiciii.admissions.subject_id
14+
WHERE mimiciii.admissions.language IS NOT NULL
15+
GROUP BY mimiciii.patients.subject_id, mimiciii.patients.dob, mimiciii.patients.gender, mimiciii.admissions.language, mimiciii.admissions.marital_status, mimiciii.admissions.ethnicity,mimiciii.admissions.insurance, mimiciii.icustays.los, mimiciii.icustays.icustay_id, mimiciii.icustays.outtime, mimiciii.icustays.intime
16+
ORDER BY mimiciii.patients.subject_id
17+
)
18+
SELECT
19+
subject_id, icustay_id, dob, gender, language, marital_status, ethnicity, insurance, los, icu_length_of_stay
20+
, first_admittime, first_admit_age
21+
, CASE
22+
-- all ages > 89 in the database were replaced with 300
23+
WHEN first_admit_age > 89
24+
then '>89'
25+
WHEN first_admit_age >= 14
26+
THEN 'adult'
27+
WHEN first_admit_age <= 1
28+
THEN 'neonate'
29+
ELSE 'middle'
30+
END AS age_group
31+
FROM first_admission_time
32+
ORDER BY subject_id

graphsProjectPresentation.R

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#this is the third script for this project
2+
#I want to make graphs for the project presentation
3+
4+
#load cleaned data from CleanDataProject script
5+
load(file = paste0("/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/cleanData.RData"))
6+
7+
8+
#density plot los V. language
9+
#for the purpose of the plot, anything that is a log of 0 will be -Inf, so let's add 1 to all of the values.
10+
#this graph uses the *original* los values!!
11+
plot <- ggplot(noNeonateData, aes(log(los+1), fill = binaryLang)) + geom_density(alpha = 0.35) +
12+
xlab("Length of Stay (log)") +
13+
theme(panel.background = element_blank()) +
14+
theme(axis.line.x = element_line(color="black", size = 0.5),
15+
axis.line.y = element_line(color="black", size = 0.5))
16+
plot + scale_fill_manual(values= c("#00bfc4", "#F8766D")) + guides(fill=guide_legend(title=" "))
17+
18+
19+
#histogram
20+
hist <- ggplot(noNeonateData, aes(log(los+1), fill = binaryLang)) +
21+
geom_histogram(alpha = 0.5, aes(y = ..density..), colour="black", position = 'identity', binwidth = 0.35) +
22+
xlab("Length of Stay (log)") + ylab("") +
23+
theme(legend.title = element_blank()) +
24+
theme(legend.text = element_text(colour="black", size = 20, face = "plain")) +
25+
theme( axis.title.x = element_text(family="sans",size = 20, face="bold", hjust=0.5, vjust=-0.5),
26+
axis.title.y = element_text(family="sans",size = 20, angle=90, face="bold", hjust=0.5, vjust=1)) +
27+
theme( axis.text.x = element_text(family = "sans",size = 14, angle=0, face='plain', colour="#353535", hjust=1, vjust=1) ) +
28+
theme( axis.text.y = element_text(family = "sans",size = 14, face='plain', colour="#353535", vjust=0.5) ) +
29+
theme(axis.line.x = element_line(color="black", size = 0.5),
30+
axis.line.y = element_line(color="black", size = 0.5)) +
31+
theme(legend.background = element_rect()) +
32+
theme(legend.position="top") +
33+
theme(panel.border = element_blank(), panel.grid.major = element_blank(),
34+
panel.grid.minor = element_blank(), panel.background = element_blank())
35+
36+
hist + guides(fill=guide_legend(title=" ")) + scale_fill_manual(values= c("#00bfc4", "#F8766D"))

multinomModel.R

+156
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
library(dplyr)
2+
library(ggplot2)
3+
library(caret)
4+
library(nnet)
5+
library(MLmetrics)
6+
7+
#this is a script for a regression model
8+
local <- getwd()
9+
10+
#load cleaned data from CleanDataProject script
11+
load(file = paste0(local, "/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/cleanData.RData"))
12+
13+
#we will use here an intepretable transformation of LOS
14+
#breaks = c(0, 0.5, 1, 5, 20, 117),
15+
#labels = c("twelve_hours", "twentyfour_hours", "few_days", "many_days", "extended_stay"),
16+
17+
#interpretable take 2
18+
#breaks = c(0, 1, 2, 3, 5, 102),
19+
#labels = c("twentyfour_hours", "fourtyeight_hours", "seventytwo_hours", "few_days", "many_days"),
20+
21+
#can look at a table of the two variables that interest us.
22+
table(noNeonateData$InterpLos, noNeonateData$binaryLang)
23+
24+
#check significance between two categorial variables
25+
chisq.test(noNeonateData$InterpLos, noNeonateData$binaryLang)
26+
27+
################
28+
#caret addition
29+
################
30+
#since the MLN function doesn't require a tuning parameter, but if we want to apply regularlized regression
31+
#we can add this if we use caret which under the hoos is using nnet
32+
33+
#Bottom line though, that is doesn't imporve the model
34+
trainControl_MNL <- trainControl(method = "cv", #cross validation resampling method
35+
number = 10, #number of resampling iterations
36+
search = "grid",
37+
classProbs = TRUE,
38+
summaryFunction = multiClassSummary) #alternatie performance summaries
39+
40+
tuneGrid_MNL <- expand.grid(decay = seq(0, 1, by = 0.1)) #11 values for decay
41+
#regularized paramater to avoid over-fitting
42+
43+
#set seed so partition we will use for training and test will always be the same
44+
set.seed(2612)
45+
46+
#we use caret's package function, bec it leaves the same initial proportions of the variable we are interested in
47+
#for both the test and train
48+
data.index <- caret::createDataPartition(noNeonateData$InterpLos,
49+
p = 0.7, #the percentage of data that goes to training
50+
list =FALSE) #automatically returns a list
51+
#seperate to train and test
52+
train_data <- noNeonateData[data.index, ]
53+
54+
test_data <- noNeonateData[-data.index, ]
55+
56+
###################
57+
#caret continuation
58+
###################
59+
#MNL model which includes parameter
60+
MNL_model <- caret::train(InterpLos ~ gender + binaryLang +
61+
first_admit_age + simpleEthnic +
62+
insurance + sofa + sapsii,
63+
method = "multinom",
64+
data = train_data,
65+
maxit = 100,
66+
trace = FALSE, #we dont want to output the iterations
67+
tuneGrid = tuneGrid_MNL, #a df with columns for each tuning parameter
68+
trControl = trainControl_MNL)
69+
#get best value for decay
70+
MNL_model$bestTune
71+
72+
#get the AUC and accuracy for each decay
73+
MNL_model$results %>% select(decay, AUC, Accuracy)
74+
75+
#test the test data and get a confusion matrix
76+
caret::confusionMatrix(predict(MNL_model,
77+
newdata = test_data,
78+
type = "raw"),
79+
reference = test_data$InterpLos)
80+
81+
82+
#Conclusion, even with the additional paramaters I get the same accuarcy and the model can't predict 24hours!
83+
#####################
84+
##MNL model with nnet
85+
#####################
86+
# MNL model using nnet directly, with parameters
87+
MNL_model <- multinom(TakeTwo_InterpLos ~ gender + binaryLang +
88+
first_admit_age + simpleEthnic +
89+
insurance + sofa + sapsii,
90+
data = train_data)
91+
92+
#get the summary of the model
93+
summary(MNL_model)
94+
#the reported residual deviance is final negative log-likelihood multiplied by two
95+
96+
#extarct the coefficients from the model
97+
exp(coef(MNL_model))
98+
99+
head(prob.tableTrain <- fitted(MNL_model))
100+
101+
##################################
102+
#check acccuracy for training data
103+
###################################
104+
train_data$predicted <- predict(MNL_model, newdata = train_data, "class")
105+
106+
cm_tableTrain <- table(train_data$TakeTwo_InterpLos, train_data$predicted, dnn = c("actual", "predicted"))
107+
108+
accuracyTrain <- round((sum(diag(cm_tableTrain))/sum(cm_tableTrain))*100,2)
109+
110+
################################
111+
#check accuracy for testing data
112+
###############################
113+
test_data$predicted <- predict(MNL_model, newdata = test_data, "class")
114+
115+
cm_tableTest <- table(test_data$TakeTwo_InterpLos, test_data$predicted,
116+
dnn = c("actual", "predicted"))
117+
118+
accuracyTest <- round((sum(diag(cm_tableTest))/sum(cm_tableTest))*100,2)
119+
120+
121+
#since they both come out about the same, with a 70% accuracy, let's take just the training set and examine closer
122+
#get the summary of the model
123+
train_summary <- summary(MNL_model)
124+
125+
#calculate z-staistics and p values
126+
z <- train_summary$coefficients/train_summary$standard.errors
127+
p <- (1 - pnorm(abs(z), 0, 1))*2 # we are using two-tailed z test
128+
129+
#seperate each "length of stay" to display all of the details
130+
los_12h <- rbind(train_summary$coefficients[1, ], train_summary$standard.errors[1, ], z[1, ], p[1, ])
131+
rownames(los_12h) <- c("Coefficient","Std. Errors","z stat","p value")
132+
los_12h <- as.data.frame(round(t(los_12h),4))
133+
#write.csv(los_12h, file = paste0(local, "/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/los12h_summaryStatistics.csv"))
134+
135+
los_24h <- rbind(train_summary$coefficients[2, ], train_summary$standard.errors[2, ], z[2, ], p[2, ])
136+
rownames(los_24h) <- c("Coefficient","Std. Errors","z stat","p value")
137+
los_24h <- as.data.frame(round(t(los_24h),4))
138+
#write.csv(los_24h, file = paste0(local, "/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/los24h_summaryStatistics.csv"))
139+
140+
141+
many_days <- rbind(train_summary$coefficients[3, ], train_summary$standard.errors[3, ], z[3, ], p[3, ])
142+
rownames(many_days) <- c("Coefficient","Std. Errors","z stat","p value")
143+
many_days <- as.data.frame(round(t(many_days),4))
144+
#write.csv(many_days, file = paste0(local, "/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/losmany_days_summaryStatistics.csv"))
145+
146+
147+
extended_stay <- rbind(train_summary$coefficients[4, ], train_summary$standard.errors[4, ], z[4, ], p[4, ])
148+
rownames(extended_stay) <- c("Coefficient","Std. Errors","z stat","p value")
149+
extended_stay <- as.data.frame(round(t(extended_stay),4))
150+
#write.csv(extended_stay, file = paste0(local, "/Bigdata/Dropbox (Technion Dropbox)/Rina_Benel/Home/MachineLearningMedicine/results/losextended_stay_summaryStatistics.csv"))
151+
152+
153+
154+
155+
156+

0 commit comments

Comments
 (0)