-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_challenge_project.R
240 lines (178 loc) · 8.46 KB
/
data_challenge_project.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# Subset the raw data to the records required for the analysis
# Load required libraries
library(dplyr)
library(magrittr)
library(tidyr)
# Read the data from the CSV file
# Replace "/path/to/your/csvfile.csv" with the actual path to your CSV file
raw_data <- read.csv("/path/to/your/csvfile.csv")
# Keep only the rows that belong to one species in one country
# Replace "Species" and "Country" with the appropriate column names
# Replace "Species_Name" and "Country_Name" with the desired species and country names
filtered_data <- raw_data %>%
  filter(Species == "Species_Name" & Country == "Country_Name")
# Save the filtered data to a new CSV file
# Replace "/path/to/your/output/file.csv" with the desired output path and file name
# row.names = FALSE keeps R's row index out of the file (as the other exports in
# this script do); otherwise re-reading the file yields a spurious "X" column.
write.csv(filtered_data, "/path/to/your/output/file.csv", row.names = FALSE)
# Remove any empty columns (columns in which every value is NA)
# Load required libraries
library(dplyr)
# Read the data from the CSV file
# Replace "/path/to/your/input/file.csv" with the actual path to your input CSV file
data <- read.csv("/path/to/your/input/file.csv")
# Identify the positions of the columns whose NA count equals the row count
empty_cols <- which(colSums(is.na(data)) == nrow(data))
# Only drop columns when at least one empty column was found: with an empty
# index, data[, -empty_cols] would select zero columns and wipe the data set.
# drop = FALSE keeps the result a data frame even if a single column remains.
if (length(empty_cols) > 0) {
  data <- data[, -empty_cols, drop = FALSE]
}
# Save the modified data to a new CSV file
# Replace "/path/to/your/output/modified_file.csv" with the desired output path and file name
write.csv(data, "/path/to/your/output/modified_file.csv", row.names = FALSE)
# Create a distribution plot of samples across the world from the provided dataset
# Load required libraries
library(ggplot2)
library(maps)
# inner_join() below is a dplyr function; load dplyr here so this section does
# not depend on an earlier section having attached it.
library(dplyr)
# Read the CSV file into a data frame
# Replace "/path/to/your/input/file.csv" with the actual path to your input CSV file
data <- read.csv("/path/to/your/input/file.csv")
# Create a world map (one row per polygon vertex, country name in `region`)
world <- map_data("world")
# Check for missing or non-unique country names in the data.
# Each country must appear once, otherwise the join below would duplicate
# the polygon vertices of that country.
if (anyNA(data$Country)) {
  stop("There are missing values in the 'Country' column.")
}
if (anyDuplicated(data$Country) > 0) {
  stop("There are non-unique values in the 'Country' column.")
}
# Merge the data frame with the world map; countries absent from either side
# are dropped by the inner join
data_map <- inner_join(world, data, by = c("region" = "Country"))
# Create the plot using ggplot2
# NOTE(review): scale_fill_viridis_c() is a continuous scale, so SampleID is
# assumed to be numeric -- confirm against the input file.
# NOTE(review): coord_map() relies on the mapproj package being installed.
plot <- ggplot() +
  geom_polygon(
    data = data_map,
    aes(x = long, y = lat, group = group, fill = SampleID)
  ) +
  coord_map("miller") +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  scale_fill_viridis_c() +
  labs(title = "Distribution of Samples on World Map")
# Save the plot as a PNG file
# Replace "/path/to/your/output/sample_map.png" with the desired output path and file name
ggsave("/path/to/your/output/sample_map.png", plot, width = 10, height = 6, dpi = 300)
# Generate a line plot of the prevalence of each species by year
# Load required libraries
library(ggplot2)
library(dplyr)
# Read the CSV file into a data frame
# Replace "/path/to/your/input/file.csv" with the actual path to your input CSV file
data_counts <- read.csv("/path/to/your/input/file.csv")
# Check for missing values in the plotted columns
if (anyNA(data_counts$Year) || anyNA(data_counts$n)) {
  stop("There are missing values in the 'Year' or 'n' columns.")
}
# The plot draws one line per species, so the same year legitimately occurs
# once for every species. Uniqueness is therefore required for the
# Year/Species combination, not for Year alone (which would reject any
# table holding more than one species).
if (anyDuplicated(data_counts[, c("Year", "Species")]) > 0) {
  stop("There are duplicated 'Year'/'Species' combinations in the data.")
}
# Create the line plot using ggplot2
line_plot <- ggplot(
  data_counts,
  aes(x = Year, y = n, group = Species, color = Species)
) +
  geom_line() +
  geom_point() +
  labs(
    title = "Prevalence of ESKAPE pathogens by species and year",
    x = "Year",
    y = "Number of Samples",
    color = "Species"
  ) +
  theme_minimal()
# Display the line plot
print(line_plot)
# Save the plot as a PNG file
# Replace "/path/to/your/output/line_plot.png" with the desired output path and file name
ggsave("/path/to/your/output/line_plot.png", line_plot, width = 10, height = 6, dpi = 300)
# Prepare the data for the MDR analysis with the AMR package
# Step 1: recode the entries of the dataset columns from the
# Resistant/Intermediate/Susceptible wording to the R/I/S codes
# Load required libraries
library(dplyr)
# Import the raw data from its CSV file
# Replace "/path/to/your/input/raw_data.csv" with the actual path to your input CSV file
raw_data <- read.csv(file = "/path/to/your/input/raw_data.csv")
# Recode susceptibility results from words to single-letter codes.
#
# Values that are exactly "Resistant", "Susceptible" or "Intermediate"
# (ignoring surrounding whitespace) become "R", "S" and "I" respectively.
# Only whole values are recoded: free text that merely contains one of these
# words (e.g. "Methicillin Resistant Staphylococcus aureus") is returned
# unchanged instead of being mangled by a substring replacement.
#
# x: a vector of results; non-character input is coerced to character.
# Returns a character vector of the same length as x; NA stays NA.
replace_resistant_susceptible_intermediate <- function(x) {
  sir_codes <- c(Resistant = "R", Susceptible = "S", Intermediate = "I")
  if (!is.character(x)) {
    x <- as.character(x)
  }
  trimmed <- trimws(x)
  # %in% never yields NA, so missing values are simply left untouched
  is_sir_word <- trimmed %in% names(sir_codes)
  x[is_sir_word] <- unname(sir_codes[trimmed[is_sir_word]])
  x
}
# Run the R/S/I recoding over every character column of the data set
raw_data <- mutate(
  raw_data,
  across(where(is.character), replace_resistant_susceptible_intermediate)
)
# Optionally write the recoded data to a new CSV file
# Replace "/path/to/your/output/modified_data.csv" with the desired output path and file name
write.csv(
  raw_data,
  file = "/path/to/your/output/modified_data.csv",
  row.names = FALSE
)
# Determine the MDR isolates from the prepared dataset using the AMR package
# https://msberends.github.io/AMR/reference/mdro.html
# mdro() lives in the AMR package, which must be attached before the call
library(AMR)
# The call below differs from the usage template in the documentation in two
# ways: the trailing `...` is gone, because `...` is only valid inside a
# function body and is an error at the top level of a script, and `x` is given
# an actual data frame, because the call cannot run with x = NULL here.
mdro(
  # A data frame with antibiotic columns like AMX or amox.
  # NOTE(review): assumes the `raw_data` recoded to R/I/S earlier in this
  # script is the intended input -- replace with your own data frame if not.
  x = raw_data,
  # A specific supported national/international/custom guideline to be followed
  guideline = "CMI2012",
  # Column name of the names or codes of the microorganisms
  # (NULL leaves the detection of that column to mdro())
  col_mo = NULL,
  # A logical to indicate whether progress should be printed to the console
  info = interactive(),
  # Minimal required percentage of antimicrobial classes that must be available per isolate
  pct_required_classes = 0.5,
  # A logical to indicate whether all values of S and I must be merged into one,
  # so resistance is only considered when isolates are R, not I.
  combine_SI = TRUE,
  # A logical to turn verbose mode on and off (default is off).
  verbose = FALSE,
  # A logical to indicate whether only antibiotic columns must be detected
  # that were transformed to class sir
  only_sir_columns = FALSE
)
# Plot the outcome of the MDR analysis
# Load required libraries
library(ggplot2)
library(tidyverse)
# Example data: Replace these with your actual data
pathogen_names <- c(
  "E. faecium", "S. aureus", "K. pneumoniae",
  "A. baumannii", "P. aeruginosa", "Enterobacter spp"
)
total_counts <- c(16661, 147921, 88712, 38036, 94460, 48041)
mdr_counts <- c(5710, 1596, 27327, 23077, 17165, 7549)
# Assemble one row per pathogen from the three vectors
mdr_data <- data.frame(
  pathogen = pathogen_names,
  total_count = total_counts,
  mdr_count = mdr_counts
)
# Share of MDR isolates among all isolates of each pathogen
mdr_data[["mdr_proportion"]] <-
  mdr_data[["mdr_count"]] / mdr_data[["total_count"]]
# Draw the total and MDR bars per pathogen (ordered by total count) and
# label each pathogen with its MDR percentage
ggplot(
  mdr_data,
  aes(x = reorder(pathogen, total_count), y = total_count, fill = "Total")
) +
  geom_col(position = "dodge") +
  geom_col(aes(y = mdr_count, fill = "MDR"), position = "dodge") +
  geom_text(
    aes(label = paste0(round(mdr_proportion * 100), "%")),
    position = position_dodge(width = 0.9),
    vjust = -0.5,
    size = 3.5
  ) +
  labs(
    title = "MDR Isolate Count Among Total Isolates for ESKAPE Pathogens",
    x = "Pathogen",
    y = "Count",
    fill = "Isolate Type"
  ) +
  scale_fill_manual(values = c("Total" = "gray", "MDR" = "red")) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.title = element_blank(),
    axis.text = element_text(color = "black")
  )
# Forecast future antimicrobial resistance with a regression model
# AMR package (https://msberends.github.io/AMR/articles/resistance_predict.html)
# Load the necessary libraries
library(dplyr)
library(ggplot2)
library(AMR)
# Import the CSV file as a data frame
# Replace "/path/to/your/input/your_file_path.csv" with the actual path to your input CSV file
data <- read.csv(file = "/path/to/your/input/your_file_path.csv")
# Name of the antibiotic column the prediction is run for
# Replace "Your_Antibiotic_Column_Name" with the actual column name containing antibiotic resistance information
antibiotic_column <- "Your_Antibiotic_Column_Name"
# Fit the binomial regression model for the chosen antibiotic
# Replace "Year" and "Your_Antibiotic_Column_Name" with the actual column names for date and antibiotic resistance information
predict_antibiotic <- resistance_predict(
  data,
  col_ab = antibiotic_column,
  col_date = "Year",
  model = "binomial"
)
# Base-graphics plot of the resistance prediction results
plot(predict_antibiotic)
# ggplot2 version of the predicted SIR (Susceptible, Intermediate, Resistant) proportions
ggplot_sir_predict(predict_antibiotic)