-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsd_length_distributions.R
136 lines (107 loc) · 4.58 KB
/
sd_length_distributions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env Rscript
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
source("plotutils.R")
library(ggridges)
tmp = copy(SEDEF[SEDEF$original])
peri = tmp[peri>0 | peri2 > 0]; peri$Assembly = "pericentromeric"
acro = tmp[acro>0 | acro2 > 0]; acro$Assembly = "acrocentric"
telo = tmp[telo>0 | telo2 > 0]; telo$Assembly = "telomeric"
intra = tmp[chr == chr2]; intra$Assembly = "intrachromosomal"
inter = tmp[chr != chr2]; inter$Assembly = "interchromosomal"
notacro = tmp[acro == 0 & acro2 == 0]; notacro$Assembly = "not acrocentric"
all = tmp; all$Assembly = "All"
df = rbind(intra, inter, acro, peri, telo, all, notacro)
df$length = df$matchB + df$mismatchB #df$end - df$start
df$Assembly = factor(df$Assembly, levels = unique(df$Assembly))
df$Assembly
x = "#chr1 start1 end1 name fakeScore strand1 start1 end1 color chr2 start2 end2 score strand2 max_len aln_len indel_a indel_b alnB matchB mismatchB transitionsB transversions fracMatch fracMatchIndel jck k2K aln_gaps uppercaseA uppercaseB uppercaseMatches aln_matches aln_mismatches aln_gap_bases filter_score sat_bases"
header = strsplit(x, "\\s+")[[1]]
colnames(df)[4:length(header)] = header[4:length(header)]
names = levels(df$Assembly)
colors = c(OLDCOLOR, BLUE, NEWCOLOR, "#FF0000","green", "black", 'blue')
names(colors) = names
ggplot(data=df) +
geom_point(aes(x=max_len, y = fracMatch), alpha=0.8, size=0.05) +
geom_density2d(aes(x=max_len, y = fracMatch), size=1, alpha=0.8) +
#scale_x_continuous(trans = "log10", labels = comma) +
#scale_fill_brewer(palette ="Spectral", direction = -1)+
facet_wrap(vars(Assembly)) +
theme_cowplot()
info.df = df %>% group_by(Assembly) %>%
group_modify(~length_stats(.x)) %>%
mutate(label=as.factor(label)) %>%
merge(df%>% group_by(Assembly) %>% summarise(count = n()))
info.df$y = rep(seq(.1,.8,.7/(length(unique(info.df$label))-1)), length(unique(info.df$Assembly)))
info.df
p1 = ggplot(data=df)+
geom_density_ridges(aes(max_len, y=Assembly, fill=Assembly))+
xlab("Segmental duplication length (bp)")+
geom_text(data=info.df,
aes(label=paste(label, round(value)), x= 1e6, y=as.numeric(Assembly)+y, group = Assembly, color=Assembly),
#direction = "y",
#ylim = c(0,NA),
hjust = 0)+
scale_fill_manual( values = colors) +
scale_color_manual( values = colors) +
theme_cowplot()+
theme(legend.position = "none") +
scale_x_continuous(trans = "log10", labels = comma) #facet_zoom( x = max_len >= 5e4 & max_len <= 3e5, zoom.size=2, horizontal = F) ;p1
p1
p2 = ggplot(data=df)+
geom_density_ridges(aes(fracMatch*100, y=Assembly, fill=Assembly))+
xlab("Percent Identity")+
scale_x_continuous(breaks = seq(90,100))+
scale_fill_manual( values = colors) +
theme_cowplot()+
theme(legend.position = "none")
p = plot_grid(p1,p2)
ggsave(glue("{SUPP}/sd_length_identity.pdf"), plot = p, width = 16, height = 8)
p
info.df.2 = info.df %>%
mutate(label = paste(label, round(value))) %>%
group_by(Assembly) %>%
summarise(label = paste(label, collapse = "\n"))
p1 = ggplot(data=df)+
geom_histogram(aes(max_len, fill=Assembly), bins = 200)+
facet_col(~Assembly, scales = "free_y")+
xlab("Segmental duplication length (bp)")+
geom_text(data=info.df.2,
aes(label=label, x= 0.75e6, y=25, group = Assembly),
#direction = "y",
#ylim = c(0,NA),
hjust = 0, vjust = 0, size = 3)+
scale_fill_manual( values = colors) +
scale_color_manual( values = colors) +
theme_cowplot()+
theme(legend.position = "none") +
scale_x_continuous(trans = "log10", labels = comma) #facet_zoom( x = max_len >= 5e4 & max_len <= 3e5, zoom.size=2, horizontal = F) ;p1
p1
p2 = ggplot(data=df)+
geom_histogram(aes(fracMatch*100, fill=Assembly), bins=200)+
facet_col(~Assembly, scales = "free_y")+
xlab("Percent Identity")+
scale_x_continuous(breaks = seq(90,100))+
scale_fill_manual( values = colors) +
theme_cowplot()+
theme(legend.position = "none")
p = plot_grid(p1,p2);
ggsave(glue("{SUPP}/sd_length_identity.pdf"), plot = p, width = 16, height = 12)
p
fileConn<-file(glue("{SUPP}/sd_length_identity.txt"))
out = c()
for(n in names){
for(j in names){
if(n!=j){
result = wilcox.test(df[Assembly==n]$max_len, df[Assembly==j]$max_len, alternative = "greater")
if(result$p.value<0.5){
#print(c(n,">",j))
#print(result$p.value)
out = c(out, c(paste(n,"<",j), result$p.value))
}
}
}
}
writeLines(out, fileConn)
close(fileConn)
tmp
length_stats(df[Assembly=="acrocentric"])