-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfu1_import_refseq_euk_fun
65 lines (61 loc) · 2.66 KB
/
fu1_import_refseq_euk_fun
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
###############################################################
# Script to import RefSeq Eukaryotes annotation data #
# Data: Hiseq - metranscriptomics #
# Mona Parizadeh - 2020-2021 #
###############################################################
# import packages ####
library(scales); packageVersion("scales") #‘1.1.1’
library(reshape2); packageVersion("reshape2") #‘1.4.4’
library(vegan); packageVersion("vegan") #‘2.5.7’
library("ggplot2"); packageVersion("ggplot2") #‘3.3.3’
library("data.table"); packageVersion("data.table") #‘1.13.6’
# Import data ####
path = "~/Documents/refseq_euk_new/"
list.files(path)
# Get file names ####
files = list.files(path, pattern = "*.tsv", full.names = T, recursive = FALSE)
files
# loading the files
y <- 0
for (x in files) {
y <- y + 1
if (y == 1) {
table <- read.table(file = x, header = F, quote = "", sep = "\t")
colnames(table) = c("DELETE", x, "V3")
table <- table[,c(3,2)] }
if (y > 1) {
temp_table <- read.table(file = x, header = F, quote = "", sep = "\t")
colnames(temp_table) = c("DELETE", x, "V3")
temp_table <- temp_table[,c(2,3)]
table <- merge(table, temp_table, by = "V3", all = T) }
}
table[is.na(table)] <- 0
rownames(table) = table$V3
View(head(table))
table_trimmed <- data.frame(table[,-1]) #remove V3
# Clean sample names
file_names = ""
for (name in colnames(table_trimmed)) {
file_names = c(file_names, unlist(strsplit(name, split='_', fixed=TRUE))[4])}
file_names = file_names[-1]
file_names_trimmed = ""
for (name in file_names) {
file_names_trimmed = c(file_names_trimmed, unlist(strsplit(name, split='.', fixed=TRUE))[2])}
file_names_trimmed = file_names_trimmed[-1] #remove the empty colname
colnames(table_trimmed) = file_names_trimmed
rownames(table_trimmed)[1] = "Unannotated"
dim(table_trimmed);length(file_names_trimmed)
#(sum(table_trimmed[1,2:32])/sum(table_trimmed[2:32]))*100 #removed seqs
#table_trimmed = table_trimmed[-1,] #remove unassigned functions
head(table_trimmed); dim(table_trimmed)
# Export function names ####
fun_names = rownames(table_trimmed)
fun_lables = paste0("fun", seq(rownames(table_trimmed)))
fun_table = cbind(fun_lables,fun_names)
colnames(fun_table) = c("id","function")
head(fun_table);dim(fun_table)
write.table(fun_table,"~/Documents/article3/metatranscriptomics_dbCor/function_names_euk.tsv",sep = "\t", quote = FALSE)
# Rename functions
rownames(table_trimmed) = fun_lables
saveRDS(table_trimmed, "~/Documents/article3/metatranscriptomics_dbCor/aca_rna_refseq_euk_fun.rds")
save.image("~/Documents/article3/metatranscriptomics_dbCor/a3_fu1_import_refseq_euk_fun.Rdata")