1
1
library(" BSgenome.PCC7120.Ensembl.29" )
2
2
upstream.distance <- 500
3
3
query.genes <- commandArgs()[[1 ]]
4
+ query.genes <- c(" alr2431" ," asr3168" ," alr0296" )
4
5
print(" Will analyze following genes" )
5
6
print(query.genes )
6
7
print(class(query.genes ))
@@ -12,109 +13,179 @@ query.seqs <- list()
12
13
13
14
# system("mkdir BLASTquery")
14
15
15
- # extrae la secuencia de cada gen desde el bsgenome de PCC7120, teniendo en cuenta la hebra
16
+ # extrae la secuencia de cada uno de los query.genes desde el bsgenome de PCC7120, teniendo en cuenta la hebra
16
17
# y exportala a un archivo con el mismo nombre que el gen.fasta en la carpeta BLASTquery
17
18
for (gene in 1 : length(query.genes ))
18
19
{
19
- gene.name <- query.genes [gene ]
20
- print(gene.name )
21
- positions <- nostoc.gtf %> % filter(feature == " CDS" , gene_name == gene.name ) %> % select(start , end , strand )
22
- start <- positions [1 ,1 ]
23
- end <- positions [1 ,2 ]
24
- strand <- positions [1 ,3 ]
25
- if (strand == " +" )
26
- {
27
- seq <- getSeq(BSgenome.PCC7120.Ensembl.29 )[[1 ]][start : end ] %> % as.character()
28
- }
29
- else if (strand == " -" )
30
- {
31
- seq <- getSeq(BSgenome.PCC7120.Ensembl.29 )[[1 ]][start : end ] %> % as.character()
32
- seq <- seq %> % strsplit(split = " " ) %> % unlist() %> % rev() %> % comp() %> % toupper() %> % paste(collapse = " " )
33
- }
34
- fasta.file <- paste(" BLASTquery/BLAST_" , gene.name , " .fasta" , sep = " " )
35
- write.fasta(seq , names = gene.name , file = fasta.file )
20
+ gene.name <- query.genes [gene ]
21
+ fasta.file <- paste(" BLASTquery/BLAST_" , gene.name , " .fasta" , sep = " " )
22
+ if (file.exists(fasta.file ))
23
+ {
24
+ next
25
+ }
26
+ print(paste(" Exporting " , gene.name , ' "s fasta sequence as extracted from the BSgenome object' , sep = " " ))
27
+ positions <- nostoc.gtf %> % filter(feature == " CDS" , gene_name == gene.name ) %> % select(start , end , strand )
28
+ if (nrow(positions ) == 0 )
29
+ {
30
+ print(paste(gene.name , " is not present in nostoc.gtf by that name" , sep = " " ))
31
+ next
32
+ }
33
+ start <- positions [1 ,1 ]
34
+ end <- positions [1 ,2 ]
35
+ dna.strand <- positions [1 ,3 ]
36
+ if (dna.strand == " +" )
37
+ {
38
+ seq <- getSeq(BSgenome.PCC7120.Ensembl.29 )[[1 ]][start : end ] %> % as.character()
39
+ }
40
+ else if (dna.strand == " -" )
41
+ {
42
+ seq <- getSeq(BSgenome.PCC7120.Ensembl.29 )[[1 ]][start : end ] %> % as.character()
43
+ seq <- seq %> % strsplit(split = " " ) %> % unlist() %> % rev() %> % comp() %> % toupper() %> % paste(collapse = " " )
44
+ }
45
+ write.fasta(seq , names = gene.name , file = fasta.file )
36
46
}
37
47
38
- infileNms <- list.files( path = " BLASTquery " , pattern = " *.fasta " )
39
- infileNms
48
+ infileNms <- paste( " BLAST_ " , query.genes , " .fasta " , sep = " " )
49
+
40
50
41
51
42
52
outnames <- paste(unlist(sapply(infileNms , strsplit , split = " *.fasta" )),
43
53
" .out" , sep = " " )
44
54
45
55
46
- # create commands function
56
+ # create commands function, This function creates the command required to blast each sequence in BLASTquery
47
57
cmdCreate <- function (infile , outfile ){
48
- paste(" echo BLASTING " , infile , " ; " , " blastn -db nr -query BLASTquery/" , infile , " -remote -out BLASTout/" , outfile ,
49
- ' -outfmt "7 qacc sacc sseqid sgi staxids sscinames evalue qstart qend sstrand sstart send qseq sseq"' , sep = " " )
58
+ if (file.exists(paste(" BLASTout/" , outfile , sep = " " )))
59
+ {
60
+ paste(" echo File " , infile , " already downloaded and blasted, skipping" , sep = " " )
61
+ }
62
+ else
63
+ {
64
+ paste(" echo BLASTING " , infile , " ; " , " blastn -db nr -query BLASTquery/" , infile , " -remote -out BLASTout/" , outfile ,
65
+ ' -outfmt "7 qacc stitle sacc sseqid sgi staxids sscinames evalue qstart qend strand sstart send"' , sep = " " )
66
+ }
50
67
}
51
68
52
69
# create actual commands
53
70
cmds <- mapply(FUN = cmdCreate , infile = infileNms , outfile = outnames )
54
- # #Blast all sequences stored in the fasta files
71
+ # # CALLING BLASTN
72
+ # # Blast all sequences stored in the fasta files
55
73
sapply(cmds , system )
56
74
57
75
# # Concatenate all BLAST output into one R data.frame
58
76
system(' cat BLASTout/* | grep -v "#" > "blast_output"' )
77
+
78
+ cmd <- paste(" cat " , paste(" BLASTout/BLAST_" , query.genes , " .out" , sep = " " , collapse = " " ), ' | grep -v "#" > "blast_output"' )
79
+ system(cmd )
80
+
59
81
blast.output <- read.table(file = " blast_output" , sep = " \t " )
60
- colnames(blast.output ) <- c(" qacc" , " sacc" , " sseqid" , " sgi" , " staxids" ,
82
+ colnames(blast.output ) <- c(" qacc" , " stitle " , " sacc" , " sseqid" , " sgi" , " staxids" ,
61
83
" sscinames" , " evalue" , " qstart" ,
62
84
" qend" , " strand" , " sstart" ,
63
- " send" , " qseq " , " sseq " )
85
+ " send" )
64
86
87
+ # # Remove all entries from PCC 7120
88
+ blast.output <- blast.output %> % filter(sscinames != " Nostoc sp. PCC 7120" )
65
89
90
+ # # Define a new column that states how many letters upstream will be explored. It will be upstream.distance + the distance between the query start and the query alignment (qstart)
91
+ blast.output $ upstream.distance <- upstream.distance + blast.output $ qstart
66
92
67
- # # For every genome involved, extract the sequence going from -upstream.distance
68
- # up to the start of each BLAST subject hit
69
93
70
- sgis <- blast.output %> % select(sgi ) %> % distinct()
71
- # system("mkdir nucleotide")
72
-
73
- reference.seqs <- list ()
94
+ # # Initialize two lists that will store upstrem sequences and their identifiers
74
95
upstream.sequences <- list ()
75
96
seq.ids <- character ()
76
97
77
- for ( sgi in sgis [, 1 ])
78
- # sgis[,1]
98
+ # For each row in blast.output
99
+ for ( i in 1 : nrow( blast.output ))
79
100
{
80
- print(sgi )
81
- # Download the genome
82
- sgid <- sgi %> % as.character()
83
- fasta.file <- paste(" nucleotide/" , sgid , " .fasta" , sep = " " )
101
+ # # Extract the sequence's NCBI identifier (sgid)
102
+ sgi <- (blast.output %> % select(sgi ))[i ,]
103
+
104
+ # # Extract the sequence's strand
105
+ dna.strand <- select(blast.output , strand )[i ,] %> % as.character()
106
+
107
+ # # Check if it has already been downloaded
108
+ fasta.file <- paste(" nucleotide/" , sgi , " .fasta" , sep = " " )
109
+
110
+ # If it hasn't, create the command that downloads it and execute it
84
111
if (! file.exists(fasta.file ))
85
112
{
86
- cmd <- paste(" wget -O " , fasta.file , ' "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=' , sgid , ' &rettype=fasta"' , sep = " " )
113
+ cmd <- paste(" wget -O " , fasta.file , ' "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=' , sgi , ' &rettype=fasta"' , sep = " " )
114
+ print(paste(" Downloading " , sgi , ' "s fasta file' , sep = " " ))
87
115
system(cmd )
88
116
}
117
+ # If it has, warn so
118
+ else
119
+ {
120
+ print(paste(" Sequence " , sgi , " already downloaded" , sep = " " ))
121
+ }
89
122
90
- # Read the genome and assign the genome to the list reference.seqs with id sgi
123
+ # Read the sequence just downloaded and assign it to "sequence"
91
124
sequence <- read.fasta(fasta.file , seqtype = " DNA" ) %> % getSequence() %> % unlist()
92
- reference.seqs [[sgid ]] <- sequence
93
125
94
- # Extract start and end positions of every hit associated with genome sgi
95
- positions <- blast.output %> % filter(sgi == sgid ) %> % select(sstart , send )
96
-
97
- # For every hit, extract the sequence 500 upstream of it
98
- for (i in 1 : nrow(positions ))
126
+ # Extract the elements that will make the seq.id
127
+ seq.id <- select(blast.output , qacc , stitle )[i ,] %> % as.list() %> % unlist() %> % as.character()
128
+ seq.id [2 ] <- seq.id [2 ] %> % strsplit(split = " " ) %> % unlist() %> % paste(collapse = " _" )
129
+ # Collapse them into one string
130
+ seq.id <- paste(seq.id , collapse = " |" )
131
+
132
+
133
+
134
+ if (dna.strand == " plus" )
99
135
{
100
- seq.id <- paste(sgid , i , sep = " _" )
101
- # print(seq.id)
102
- upstream.position <- positions [i , 1 ] - upstream.distance
103
- if (upstream.position < 0 )
104
- {
105
- window <- c((length(sequence ) - upstream.position ): length(sequence ), 1 : positions [i , 1 ])
106
- }
107
- else
108
- {
109
- window <- upstream.position : positions [i , 1 ]
110
- }
111
- upstream.sequence <- sequence [window ]
112
- upstream.sequences [[seq.id ]] <- upstream.sequence
113
- seq.ids <- c(seq.ids , seq.id )
136
+ # # Compute the subject upstream position
137
+ upstream.position <- blast.output $ sstart [i ] - blast.output $ upstream.distance [i ]
138
+ if (length(sequence ) < upstream.distance )
139
+ {
140
+ # # Window cannot be bigger than sequence
141
+ window <- 1 : length(sequence )
142
+ }
143
+ else if (upstream.position < 0 )
144
+ {
145
+ # # Circular
146
+ window <- c((length(sequence ) + upstream.position ): length(sequence ), 1 : blast.output $ sstart [i ])
147
+ }
148
+ else
149
+ {
150
+ # Regular window
151
+ window <- upstream.position : blast.output $ sstart [i ]
152
+ }
114
153
}
154
+
155
+
156
+ else if (dna.strand == " minus" )
157
+ {
158
+ # # Compute the subject upstream position
159
+ upstream.position <- blast.output $ sstart [i ] + blast.output $ upstream.distance [i ]
160
+ if (length(sequence ) < upstream.distance )
161
+ {
162
+ # # Window cannot be bigger than sequence
163
+ window <- 1 : length(sequence )
164
+ }
165
+ else if (upstream.position > length(sequence ))
166
+ {
167
+ # # Circular
168
+ window <- c(blast.output $ sstart [i ]: length(sequence ), 1 : upstream.position )
169
+ }
170
+ else
171
+ {
172
+ # Regular window
173
+ window <- blast.output $ sstart [i ]: upstream.position
174
+ }
175
+ }
176
+ print(length(window ))
177
+
178
+ # # Subset the sequence and retain only the one spanning the window
179
+ upstream.sequence <- sequence [window ]
180
+
181
+ # # Add the seq.id and the upstream.sequence to its corresponding list
182
+ upstream.sequences [[seq.id ]] <- upstream.sequence
183
+ seq.ids <- c(seq.ids , seq.id )
184
+
115
185
}
116
186
117
- str(upstream.sequences )
187
+
188
+
189
+ write.fasta(upstream.sequences , file.out = " 500pb_upstream_homologues.fa" , names = seq.ids )
118
190
119
- str(reference.seqs )
120
- str(upstream.sequences )
191
+ write.table(file = " blast_output_processed.txt" , blast.output , col.names = T )
0 commit comments