-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCS_descriptives_coffee.R
126 lines (102 loc) · 5.09 KB
/
CS_descriptives_coffee.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Coffee consumer stats. Prepare metadata from two datasets
library(tidyverse)
# Read in metadata from alcohol file (which has BMI data)
sumtab <- function() {
alc_g <- read_csv("alcohol/alcohol.csv") %>% select(Idepic, Qe_Alc, R_BMI, Age_Dtq)
#cs_metadata file now contains NAs where needed for easier subsetting. Read in and subset subjects (remove QC metadata)
meta <- read.csv("data/cs_metadata.csv") %>% filter(present.pos == T & stype == "SA") %>%
left_join(alc_g, by="Idepic") %>% mutate(logQe_Alc = log(Qe_Alc + 1),
cupvols = case_when(country == "France" ~ 146.59, country == "Germany" ~ 209.32,
country == "Greece" ~ 135.48, country == "Italy" ~ 55.2), cups = cof/cupvols)
#Centre specific characteristics of the study population for paper
meta <- meta %>% select(country, centre, sex, Age_Dtq, R_BMI, smoke, cof, cups)
a <- meta %>% group_by(country, centre) %>%
summarise(n = n(), BMI = round(mean(R_BMI), 1), Cup_intake = round(median(cups),2),
Cof_intake = round(median(cof)), Mean_age = round(mean(Age_Dtq), 1))
b <- meta %>% group_by(centre, sex) %>% summarise(fem = n()) %>% filter(sex == "Female")
c <- meta %>% group_by(centre, smoke) %>% summarise(smokers = n()) %>% filter(smoke == "Yes")
d <- left_join(a, b, by = "centre")
#Final summary table for paper
sumtab <- left_join(d, c, by = "centre") %>% select(Country=country, "EPIC Center"=centre, N=n,
"of which female" = fem, "Mean age" = Mean_age,
"Median coffee intake (mL)" = Cof_intake, "Mean BMI" = BMI,
"Current smokers at sample collection"=smokers)
}
tab <- sumtab()
#-------------------------------------------------------------------------------------------
# Visualisations
# histogram of all consumers
hist(meta$cof, breaks = 50, col = "gray")
# Boxplot of coffee intake by country (can change predictor to sex, smoke, alc. bev)
boxplot(sqrt(cof) ~ country, data=meta, col="dodgerblue", ylab="Alcohol intake (mL/day)")
# Boxplot for powerpoint slide. For publication reduce font size to 10 and B&W
library(ggplot2)
# ggplot(meta, aes(x=country, y=cups)) +
ggplot(meta, aes(x=country, y=cof)) +
#geom_boxplot(fill="grey", outlier.colour="white") +
geom_boxplot(fill="dodgerblue", outlier.colour = "white") +
theme_bw(base_size = 10) +
geom_jitter(position=position_jitter(width=0.1), size=1) + #, colour="red") +
theme(panel.grid.minor=element_blank(),
#panel.grid.major = element_blank(),
panel.border=element_rect(colour = "black")) +
#ylim(c(0,1250)) +
xlab("") +
ylab("Coffee intake mL/day") #+
#ylab("Coffee intake cups/day")
ggsave("intake boxplot2.png", width=160, height = 120, units="mm")
ggsave("intake boxplot2.svg")
# For paper
ggsave("boxplot volume paper.png", width=80, height = 50, units="mm")
ggsave("boxplot cups paper.png", width=80, height = 50, units="mm")
# Boxplot for poster
library(latticeExtra)
bwplot(cof ~ country, data=meta, pch="|", ylab = "Coffee intake (mL/day)", ylim = c(-50, 1300),
par.settings = list(box.rectangle = list(fill= "darkorange", col="black")))
# with tapply
cofsum1 <- tapply(meta$cof, meta$country, summary)
write.csv(cofsum, file="Intake summary by country.csv")
#-------------------------------------------------------------------------------------------------
# statistical tests
# Kruskal test (not as good as bootstrapped CIs as don't know which groups are different)
kruskal.test(meta$cof, meta$country)
kruskal.test(meta$cups, meta$country)
boxplot(meta$cups ~ meta$country)
fit <- lm(meta$cups ~ meta$country)
summary(fit)
# Medians and bootstrapped CIs of coffee intake or cups, 451 subjects
library(simpleboot)
tapply(meta$cof, meta$country, median)
tapply(meta$cups, meta$country, median)
b <- tapply(meta$cups, meta$country, FUN = one.boot, median, R=1000)
lapply(b, boot.ci, type=c("perc", "bca")) #two types output but percentile method used here
# Medians
# France Germany Greece Italy
# 248.3863 429.7500 140.0000 90.0000
# Bootstrapped CIs
# Percentile BCa
# France (181.6, 320.0 ) (176.3, 310.8 )
# Germany (292.5, 450.0 ) (290.4, 450.0 )
# Greece (112.0, 140.0 ) ( 70.5, 140.0 )
# Italy (80.00, 92.31 ) (66.47, 90.00 )
# By base R with loop (warning, need to split data frame first, FR, D etc)
medians <- numeric(1000)
for (i in 1:1000) medians[i] <- median(sample(na.omit(FR$cof), replace = T))
ci <- quantile(medians, c(0.025, 0.975))
cat("95% confidence interval is (", ci, ")\n")
# ----
#Get coffee intake for all EPIC data
library(tidyverse)
library(haven)
foods <- read_csv("D:/dtqst_food_group.csv") %>% select(Country, Center, QGE130301, Idepic)
#Need to clean data after Python import of compressed SAS file
#See python script in MyScripts
#Practice code using data head only
hdfoods <- head(foods)
hdfoods %>% separate(Idepic, sep = "'", into = c("a", "b", "c"))
#On full data set
foods1 <- foods %>% separate(Idepic, sep = "'", into = c("a", "Idepic", "c")) %>% select(-a, -c)
class(foods1$Idepic)
saveRDS(foods1, "Coffee intakes all EPIC.rds")
intersect(foods1$Idepic, hcc$Idepic) %>% length()
dim(hcc)