Skip to content

Commit dde7b0b

Browse files
committed
Commenting and selecting query genes by default
1 parent 5d19ca4 commit dde7b0b

File tree

1 file changed

+134
-63
lines changed

1 file changed

+134
-63
lines changed

blast.R

+134-63
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
library("BSgenome.PCC7120.Ensembl.29")
22
upstream.distance <- 500
33
query.genes <- commandArgs()[[1]]
4+
query.genes <- c("alr2431","asr3168","alr0296")
45
print("Will analyze following genes")
56
print(query.genes)
67
print(class(query.genes))
@@ -12,109 +13,179 @@ query.seqs <- list()
1213

1314
#system("mkdir BLASTquery")
1415

15-
#extrae la secuencia de cada gen desde el bsgenome de PCC7120, teniendo en cuenta la hebra
16+
#extrae la secuencia de cada uno de los query.genes desde el bsgenome de PCC7120, teniendo en cuenta la hebra
1617
#y exportala a un archivo con el mismo nombre que el gen.fasta en la carpeta BLASTquery
1718
for(gene in 1:length(query.genes))
1819
{
19-
gene.name <- query.genes[gene]
20-
print(gene.name)
21-
positions <- nostoc.gtf %>% filter(feature == "CDS", gene_name == gene.name) %>% select(start, end, strand)
22-
start <- positions[1,1]
23-
end <- positions[1,2]
24-
strand <- positions[1,3]
25-
if(strand == "+")
26-
{
27-
seq <- getSeq(BSgenome.PCC7120.Ensembl.29)[[1]][start:end] %>% as.character()
28-
}
29-
else if(strand == "-")
30-
{
31-
seq <- getSeq(BSgenome.PCC7120.Ensembl.29)[[1]][start:end] %>% as.character()
32-
seq <- seq %>% strsplit(split = "") %>% unlist() %>% rev() %>% comp() %>% toupper() %>% paste(collapse = "")
33-
}
34-
fasta.file <- paste("BLASTquery/BLAST_", gene.name, ".fasta", sep = "")
35-
write.fasta(seq, names = gene.name , file = fasta.file)
20+
gene.name <- query.genes[gene]
21+
fasta.file <- paste("BLASTquery/BLAST_", gene.name, ".fasta", sep = "")
22+
if(file.exists(fasta.file))
23+
{
24+
next
25+
}
26+
print(paste("Exporting ", gene.name, '"s fasta sequence as extracted from the BSgenome object', sep = ""))
27+
positions <- nostoc.gtf %>% filter(feature == "CDS", gene_name == gene.name) %>% select(start, end, strand)
28+
if(nrow(positions) == 0)
29+
{
30+
print(paste(gene.name, " is not present in nostoc.gtf by that name", sep = ""))
31+
next
32+
}
33+
start <- positions[1,1]
34+
end <- positions[1,2]
35+
dna.strand <- positions[1,3]
36+
if(dna.strand == "+")
37+
{
38+
seq <- getSeq(BSgenome.PCC7120.Ensembl.29)[[1]][start:end] %>% as.character()
39+
}
40+
else if(dna.strand == "-")
41+
{
42+
seq <- getSeq(BSgenome.PCC7120.Ensembl.29)[[1]][start:end] %>% as.character()
43+
seq <- seq %>% strsplit(split = "") %>% unlist() %>% rev() %>% comp() %>% toupper() %>% paste(collapse = "")
44+
}
45+
write.fasta(seq, names = gene.name , file = fasta.file)
3646
}
3747

38-
infileNms <- list.files(path = "BLASTquery", pattern = "*.fasta")
39-
infileNms
48+
infileNms <- paste("BLAST_", query.genes, ".fasta", sep = "")
49+
4050

4151

4252
outnames <- paste(unlist(sapply(infileNms, strsplit, split = "*.fasta")),
4353
".out", sep = "")
4454

4555

46-
# create commands function
56+
# create commands function, This function creates the command required to blast each sequence in BLASTquery
4757
cmdCreate <- function(infile, outfile){
48-
paste("echo BLASTING ", infile, "; ", "blastn -db nr -query BLASTquery/", infile, " -remote -out BLASTout/", outfile,
49-
' -outfmt "7 qacc sacc sseqid sgi staxids sscinames evalue qstart qend sstrand sstart send qseq sseq"', sep = "")
58+
if(file.exists(paste("BLASTout/", outfile, sep = "")))
59+
{
60+
paste("echo File ", infile, " already downloaded and blasted, skipping", sep = "")
61+
}
62+
else
63+
{
64+
paste("echo BLASTING ", infile, "; ", "blastn -db nr -query BLASTquery/", infile, " -remote -out BLASTout/", outfile,
65+
' -outfmt "7 qacc stitle sacc sseqid sgi staxids sscinames evalue qstart qend strand sstart send"', sep = "")
66+
}
5067
}
5168

5269
# create actual commands
5370
cmds <- mapply(FUN = cmdCreate, infile = infileNms, outfile = outnames)
54-
##Blast all sequences stored in the fasta files
71+
## CALLING BLASTN
72+
## Blast all sequences stored in the fasta files
5573
sapply(cmds, system)
5674

5775
## Concatenate all BLAST output into one R data.frame
5876
system('cat BLASTout/* | grep -v "#" > "blast_output"')
77+
78+
cmd <- paste("cat ", paste("BLASTout/BLAST_", query.genes, ".out", sep = "", collapse = " "), ' | grep -v "#" > "blast_output"')
79+
system(cmd)
80+
5981
blast.output <- read.table(file = "blast_output", sep = "\t")
60-
colnames(blast.output) <- c("qacc", "sacc", "sseqid", "sgi", "staxids",
82+
colnames(blast.output) <- c("qacc", "stitle", "sacc", "sseqid", "sgi", "staxids",
6183
"sscinames", "evalue", "qstart",
6284
"qend", "strand", "sstart",
63-
"send", "qseq", "sseq")
85+
"send")
6486

87+
## Remove all entries from PCC 7120
88+
blast.output <- blast.output %>% filter(sscinames != "Nostoc sp. PCC 7120")
6589

90+
## Define a new column that states how many letters upstream will be explored. It will be upstream.distance + the distance between the query start and the query alignment (qstart)
91+
blast.output$upstream.distance <- upstream.distance + blast.output$qstart
6692

67-
## For every genome involved, extract the sequence going from -upstream.distance
68-
#up to the start of each BLAST subject hit
6993

70-
sgis <- blast.output %>% select(sgi) %>% distinct()
71-
#system("mkdir nucleotide")
72-
73-
reference.seqs <- list()
94+
## Initialize two lists that will store upstrem sequences and their identifiers
7495
upstream.sequences <- list()
7596
seq.ids <- character()
7697

77-
for (sgi in sgis[,1])
78-
#sgis[,1]
98+
# For each row in blast.output
99+
for(i in 1:nrow(blast.output))
79100
{
80-
print(sgi)
81-
#Download the genome
82-
sgid <- sgi %>% as.character()
83-
fasta.file <- paste("nucleotide/", sgid, ".fasta" , sep = "")
101+
## Extract the sequence's NCBI identifier (sgid)
102+
sgi <- (blast.output %>% select(sgi))[i,]
103+
104+
## Extract the sequence's strand
105+
dna.strand <- select(blast.output, strand)[i,] %>% as.character()
106+
107+
## Check if it has already been downloaded
108+
fasta.file <- paste("nucleotide/", sgi, ".fasta" , sep = "")
109+
110+
# If it hasn't, create the command that downloads it and execute it
84111
if(!file.exists(fasta.file))
85112
{
86-
cmd <- paste("wget -O ", fasta.file, ' "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=', sgid, '&rettype=fasta"', sep = "")
113+
cmd <- paste("wget -O ", fasta.file, ' "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=', sgi, '&rettype=fasta"', sep = "")
114+
print(paste("Downloading ", sgi, ' "s fasta file', sep = ""))
87115
system(cmd)
88116
}
117+
# If it has, warn so
118+
else
119+
{
120+
print(paste("Sequence ", sgi, " already downloaded", sep = ""))
121+
}
89122

90-
# Read the genome and assign the genome to the list reference.seqs with id sgi
123+
# Read the sequence just downloaded and assign it to "sequence"
91124
sequence <- read.fasta(fasta.file, seqtype = "DNA") %>% getSequence() %>% unlist()
92-
reference.seqs[[sgid]] <- sequence
93125

94-
# Extract start and end positions of every hit associated with genome sgi
95-
positions <- blast.output %>% filter(sgi == sgid) %>% select(sstart, send)
96-
97-
# For every hit, extract the sequence 500 upstream of it
98-
for(i in 1:nrow(positions))
126+
# Extract the elements that will make the seq.id
127+
seq.id <- select(blast.output, qacc, stitle)[i,] %>% as.list() %>% unlist() %>% as.character()
128+
seq.id[2] <- seq.id[2] %>% strsplit(split = " ") %>% unlist() %>% paste(collapse = "_")
129+
# Collapse them into one string
130+
seq.id <- paste(seq.id, collapse = "|")
131+
132+
133+
134+
if(dna.strand == "plus")
99135
{
100-
seq.id <- paste(sgid, i, sep = "_")
101-
#print(seq.id)
102-
upstream.position <- positions[i, 1] - upstream.distance
103-
if(upstream.position < 0)
104-
{
105-
window <- c((length(sequence) - upstream.position):length(sequence), 1:positions[i, 1])
106-
}
107-
else
108-
{
109-
window <- upstream.position:positions[i, 1]
110-
}
111-
upstream.sequence <- sequence[window]
112-
upstream.sequences[[seq.id]] <- upstream.sequence
113-
seq.ids <- c(seq.ids, seq.id)
136+
## Compute the subject upstream position
137+
upstream.position <- blast.output$sstart[i] - blast.output$upstream.distance[i]
138+
if(length(sequence) < upstream.distance)
139+
{
140+
## Window cannot be bigger than sequence
141+
window <- 1:length(sequence)
142+
}
143+
else if(upstream.position < 0)
144+
{
145+
## Circular
146+
window <- c((length(sequence) + upstream.position):length(sequence), 1:blast.output$sstart[i])
147+
}
148+
else
149+
{
150+
# Regular window
151+
window <- upstream.position:blast.output$sstart[i]
152+
}
114153
}
154+
155+
156+
else if(dna.strand == "minus")
157+
{
158+
## Compute the subject upstream position
159+
upstream.position <- blast.output$sstart[i] + blast.output$upstream.distance[i]
160+
if(length(sequence) < upstream.distance)
161+
{
162+
## Window cannot be bigger than sequence
163+
window <- 1:length(sequence)
164+
}
165+
else if(upstream.position > length(sequence))
166+
{
167+
## Circular
168+
window <- c(blast.output$sstart[i]:length(sequence), 1:upstream.position)
169+
}
170+
else
171+
{
172+
# Regular window
173+
window <- blast.output$sstart[i]:upstream.position
174+
}
175+
}
176+
print(length(window))
177+
178+
## Subset the sequence and retain only the one spanning the window
179+
upstream.sequence <- sequence[window]
180+
181+
## Add the seq.id and the upstream.sequence to its corresponding list
182+
upstream.sequences[[seq.id]] <- upstream.sequence
183+
seq.ids <- c(seq.ids, seq.id)
184+
115185
}
116186

117-
str(upstream.sequences)
187+
188+
189+
write.fasta(upstream.sequences, file.out = "500pb_upstream_homologues.fa", names = seq.ids)
118190

119-
str(reference.seqs)
120-
str(upstream.sequences)
191+
write.table(file = "blast_output_processed.txt", blast.output, col.names = T)

0 commit comments

Comments
 (0)