-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalysis.R
151 lines (131 loc) · 7.52 KB
/
analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
library(dplyr) # Version 0.7.0 or above
library(reshape2)
library(purrr)
pull.pheno <- function(df, keep){
# Pull out only certain behaviours from data frame
#
# Args
# df: data frame where you want to subset out certain columns
# keep: columns to keep, of the format "A|B|C"
# Returns:
# df_short: data frame with only the columns that have been specified in keep
if(length(grep(keep, colnames(df))) == 0){
return(df)
} else {
df.short <- df[, grep(keep, colnames(df))]
return(df.short)
}
}
read.direc <- function(cell) {
# Read all the csv files in a directory into a list of data.frames
# Args:
# path = path to the root directory containing directories for each cell line. Likekly `results.direc`
# cell = cell line for which to read results e.g. `"SKBR3"`
# head = whether or not to use headers
# Return:
# list of data frames, one for each csv file for the cell line
file.names.direc <- list.files(path=file.path(cell), pattern="*(Attractor|Fixpoints)")
if (length(file.names.direc) == 0) {
stop(paste0("Error: Zero results files in directory: \n", file.path(path, cell),
"\n Have you specified the directory correctly?"))
}
data <- lapply(file.path(cell, file.names.direc), read.csv)
names(data) <- gsub("*.csv$", "", file.names.direc)
data
}
read.cell.line.results <- function(cell.lines) {
# Read the results for all the cell lines, and store as a list of list of data frames
# Args:
# cell.lines = list of the cell lines for which there are directories of results
# path = path to directory of results
# Returns:
# list of list of data frames of results, root list is of cell lines, leaves are of the stable states for each of those cell lines
data <- lapply(cell.lines, read.direc)
names(data) <- cell.lines
data
}
name.results <- function(l, colname) {
# Takes the name of a variable and turns it into the value of a column, with specified columnme, to allow for later merging without losing that info
# Args:
# l = a list of data frames with each data frame named such that names(l) is meaningful
# colname = name for column containing name data
# Returns:
# same list as input but each data frame has a column `colname` which contains the name of each data frame
mapply(function(x, y) {x[colname] <- y; x}, x=l, y=names(l), SIMPLIFY=F)
}
expand.range <- function(x) {
## if x is a character, interpreted as a range. Otherwise, as a scalar
## Expand ranges, e.g. "[1; 2; 3]", to a vector of numbers
if (is.character(x) && grepl("\\[", x)) {
nums <- strtoi(trimws(unlist(strsplit(substr(x, 2, nchar(x)-1), ";"))))
return(nums)
}
return(as.integer(x))
}
expand.ranges <- function(df) {
lists <- apply(df, 1, function(r) lapply(r, expand.range))
dfs <- lapply(lists, cross_df)
bind_rows(dfs)
}
expand.fixpoints <- function(l) {
fixpoints <- grepl("Fixpoints", names(l))
results <- l[!fixpoints]
num.loops <- length(results)
for (f in names(l)[fixpoints]) {
expansion <- expand.ranges(l[[f]])
expansion <- split(expansion, seq(nrow(expansion)))
names(expansion) <- sapply(num.loops - 1 + seq(length(expansion)),
function(i) paste(f, i, sep=""))
results <- append(results, expansion)
}
names(results) <- gsub("Fixpoints", "Attractor", names(results))
results
}
expand.mutations <- function(df){
# Takes a data frame with Mutation as a single identifier and splits.
# Dependacies: dplyr
#
# Args:
# df: data frame, one in which the Mutation variable is in a column, and has already had Attractors split off, e.g. summary.by.mutation.attractor
# Returns:
# data frame with a set of columns for each node and mutation value, as well as the number of mutations applied, for each attractor. Should be able to be used to make both the node and edge table for visualisation
out <- strsplit(as.character(df$Mutation), split = "__")
number.muts <- unlist(lapply(out, function(x) length(x)/2)) #Number of mutations applied, will be needed to build edge list
out <- lapply(out, FUN=function(x) c(unlist(x), rep(NA, max(lengths(out))-length(x))))
out <- as.data.frame(cbind(do.call(rbind, out)))
n.newcol <- ncol(out)
Node.names <- paste0("Node", seq(1:(n.newcol/2)))
Mut.names <- paste0("Mut", seq(1:(n.newcol/2)))
new.colnames <- c(rbind(Node.names, Mut.names))
colnames(out) <- new.colnames
df <- cbind(out, unclass(df)) #Needed to remove effects of `summarise` which prevents `cbind` and `dply::arrange` from working
df$Number.Muts <- number.muts
df <- select(df, Cell_Line, Mutation, Attractor, Number.Muts, everything())
df <- arrange(df, Number.Muts, Attractor)
df
}
analyse = function(results) {
short.results <- lapply(results, function(x) lapply(x, expand.ranges)) %>%
lapply(function (x) lapply(x, pull.pheno, "Proliferation|Apoptosis")) #Pull out the nodes that affect how well the mutations are going
short.results.muts <- lapply(short.results, name.results, colname = "Full_Mutation_Name") # Add mutations from name of data frames to a column
short.results.muts.unlist <- lapply(short.results.muts, function(x) Reduce(function(df1, df2) merge(df1, df2, by = c("Proliferation", "Apoptosis", "Full_Mutation_Name"), all = TRUE), x)) #
short.results.named <- name.results(short.results.muts.unlist, colname = "Cell_Line") %>% # Add cell_line from name of data frames to a column
lapply(mutate, Full_Mutation_Name = gsub("^Mut", Full_Mutation_Name, replacement = "")) #Remove the `"Mut"` prefix only from the very begining of a string on column `Full_Mutation_Name`
# Split Mutations into mutation and attractor. so before here we need to filter
short.results.named.split <- lapply(short.results.named, function(x) {y <- cbind(x, colsplit(x[["Full_Mutation_Name"]], pattern = "Attractor", c("Mutation", "Attractor"))); return(y)})
## # Group by mutations AND THEN by attractor
by.mutation.attractor <- lapply(short.results.named.split, group_by, Mutation, Attractor)
summary.by.mutation.attractor <- lapply(by.mutation.attractor, summarise,
min_Proliferation = min(Proliferation),
max_Proliferation = max(Proliferation),
mean_Proliferation = mean(Proliferation),
range_Proliferation = max(Proliferation)-min(Proliferation),
min_Apoptosis = min(Apoptosis),
max_Apoptosis = max(Apoptosis),
mean_Apoptosis = mean(Apoptosis),
range_Apoptosis = max(Apoptosis)-min(Apoptosis),
loop_length = n()) %>%
name.results(colname = "Cell_Line") #need to add cell_line name as lost on the `summarise` process
expanded.summary.by.mutation.attractor <- lapply(summary.by.mutation.attractor, expand.mutations)
expanded.summary.by.mutation.attractor
}