-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnode_genome_mapping.Rmd
148 lines (114 loc) · 5.45 KB
/
node_genome_mapping.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
---
title: "node_genome_mapping"
output: html_document
date: "2024-11-06"
---
```{r}
library(readr)
library(tidyverse)
library(stringr)
blast_result <- read_table("workflow/out/blast/sample_1_0_02_blast_results_sample0-9_reference_metagenome.out", guess_max = 100000)
genome_seq_ids <- read_delim("workflow/out/blast/sample_1_reference_metagenome_ids.txt", delim = ";", escape_double = FALSE, trim_ws = TRUE, guess_max = 200)
tax_profile <- read_csv("workflow/out/blast/taxonomic_profile_1.csv")
query_ids <- read_table("workflow/out/blast/sample_1_0_02_ids.txt")
```
```{r}
blast_result_filtered <- blast_result %>%
group_by(qseqid, sseqid) %>%
distinct() %>%
ungroup()
cat("original nrows in blast result:", nrow(blast_result), "\n")
cat("distinct rows in blast result with no duplicate query-target pairs (only one hit per query per target) :", nrow(blast_result_filtered), "\n")
qseqs_with_hits <- unique(blast_result_filtered$qseqid)
qseqs_no_hits <- setdiff(query_ids$qseqid, qseqs_with_hits)
cat("n queries with blast hits:", length(qseqs_with_hits), "\n")
cat("calculated n queries with NO blast hits:", length(qseqs_no_hits), "\n")
exp_n_qseqs_no_hits <- length(query_ids$qseqid) - length(qseqs_with_hits)
cat("expected n queries with NO blast hits:", exp_n_qseqs_no_hits, "\n")
# investigate seqs with no blast hits
View(as.data.frame(qseqs_no_hits))
write_lines(qseqs_no_hits, "workflow/out/blast/qseqs_no_blast_hits.txt")
```
```{r}
tax_table <- tax_profile %>%
rowwise() %>%
mutate(kingdom = strsplit(TAXPATHSN, "\\|")[[1]][1],
phylum = strsplit(TAXPATHSN, "\\|")[[1]][2],
class = strsplit(TAXPATHSN, "\\|")[[1]][3],
order = strsplit(TAXPATHSN, "\\|")[[1]][4],
family = strsplit(TAXPATHSN, "\\|")[[1]][5],
genus = strsplit(TAXPATHSN, "\\|")[[1]][6],
species = strsplit(TAXPATHSN, "\\|")[[1]][7],
strain = strsplit(TAXPATHSN, "\\|")[[1]][8])
# mutate(species_stripped = paste(strsplit(species, " ")[[1]][1:2], collapse = " "))
genome_node_clusters <- blast_result_filtered %>%
rowwise %>%
mutate(node = strsplit(qseqid, ":")[[1]][1]) %>%
left_join(., genome_seq_ids, by = c("sseqid" = "seqid")) %>%
select(node, genome, info) %>%
distinct() %>%
mutate(species = case_when(genome == "Erysipelotrichaceae bacterium I46" ~ genome,
genome == "Actinomyces sp. oral taxon 414 strain F0588" ~ "Actinomyces sp. oral taxon 414",
genome == "Coriobacteriaceae bacterium 68-1-3" ~ genome,
!str_detect(genome, "\\bsp\\.") ~
paste(strsplit(genome, " ")[[1]][1:2], collapse = " "),
str_detect(genome, "\\bsp\\.") ~
paste(strsplit(genome, " ")[[1]][1:3], collapse = " "))) %>%
left_join(., tax_table, by = c("species")) %>%
filter(!is.na(genome))
genome_node_clusters[genome_node_clusters == ""] <- NA
write.csv("workflow/out/node_classification/nodes_by_genome_multilabel.csv")
species_node_clusters <- genome_node_clusters %>%
select(node, species) %>%
filter(!is.na(species)) %>%
distinct() %>%
group_by(node) %>%
summarise(species = paste(species, collapse = ";")) %>%
distinct()
write.csv(species_node_clusters, "workflow/out/taxonomy/sample_1_0_02_nodes_by_species_multilabel.csv")
genus_node_clusters <- genome_node_clusters %>%
select(node, genus) %>%
filter(!is.na(genus)) %>%
distinct() %>%
group_by(node) %>%
summarise(species = paste(genus, collapse = ";")) %>%
distinct()
write.csv(genus_node_clusters, "workflow/out/taxonomy/sample_1_0_02_nodes_by_genera_multilabel.csv")
genus_node_clusters <- genome_node_clusters %>%
rowwise() %>%
mutate(genus = ifelse(is.na(genus),
strsplit(genome, " ")[[1]][1],
genus)) %>%
select(node, genus, Genera) %>%
distinct() %>%
group_by(genus) %>%
mutate(Genera = case_when(is.na(Genera) ~ first(Genera[!is.na(Genera)]),
TRUE ~ Genera)) %>%
ungroup()
genus_node_clusters[genus_node_clusters == ""] <- NA
# manually adding missing values for Genera and genus heh
genus_node_clusters$Genera <- as.character(genus_node_clusters$Genera)
genus_node_clusters$Genera[genus_node_clusters$genus == "Acetivibrio"] <- "35829"
genus_node_clusters$Genera[genus_node_clusters$genus == "Anaerotignum"] <- "2039240"
genus_node_clusters$Genera[genus_node_clusters$genus == "Peptoclostridium"] <- "1481960"
genus_node_clusters$genus[genus_node_clusters$Genera == "128827"] <- "Erysipelotrichaceae"
genus_node_clusters$genus[genus_node_clusters$Genera == "39491"] <- "Agathobacter"
genus_node_clusters$genus[genus_node_clusters$Genera == "2815775"] <- "Berryella"
genus_node_clusters_filtered <- genus_node_clusters %>%
select(node, genus) %>%
filter(!is.na(genus)) %>%
distinct() %>%
group_by(node) %>%
mutate(mult_genera = ifelse(n() > 1,
"multiple_genera",
genus)) %>%
ungroup() %>%
select(node, mult_genera) %>%
distinct()
write.csv(genus_node_clusters_filtered, "workflow/out/blast/sample1_nodes_by_genus.csv")
write.csv(species_node_clusters, "workflow/out/blast/sample1_nodes_by_species.csv")
# determine how we should group clusters by on a genus level -- by genus or genera
nrow(subset(genus_node_clusters, is.na(genus)))
nrow(subset(genus_node_clusters, is.na(Genera)))
# way more rows have values for Genera so I guess we'll use this ugh
```