forked from de-Boer-Lab/CisRegModels
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalignFastqsIntoSeqs.py
executable file
·138 lines (125 loc) · 7.47 KB
/
alignFastqsIntoSeqs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/env python
"""
The purpose of this script is to ...
"""
## Code annotated by Guoyao
import warnings
from CisRegModels import MYUTILS
from CisRegModels.DNA import *
import sys
import argparse
## set up the parser command in python to run some command in shell
parser = argparse.ArgumentParser(description='Takes two fastq files for convergent reads and aligns them into one sequence in the strandedness of read1.')
parser.add_argument('-i1',dest='inFP1', metavar='<inFile>',help='Input file of FastQ for read1', required=True);
parser.add_argument('-i2',dest='inFP2', metavar='<inFile>',help='Input file of FastQ for read2', required=True);
parser.add_argument('-p',dest='overlap', metavar='<overlap>',help='Approximately how much overlap should there be?', required=True);
parser.add_argument('-r',dest='overlapRange', metavar='<overlapRange>',help='How many bases of flexibility should there be in the overlap?', required=True);
parser.add_argument('-o',dest='outFP', metavar='<outFile>',help='Where to output results [default=stdout]', required=False);
parser.add_argument('-l',dest='logFP', metavar='<logFile>',help='0 [default=stderr]', required=False);
parser.add_argument('-v',dest='verbose', action='count',help='Verbose output?', required=False, default=0);
## set up the initial
args = parser.parse_args();
## open the files
inFile1=MYUTILS.smartGZOpen(args.inFP1,'r');
inFile2=MYUTILS.smartGZOpen(args.inFP2,'r');
## if the input is not None, use the sys command to direct the location of the inputs
if (args.logFP is not None):
logFile=MYUTILS.smartGZOpen(args.logFP,'w');
sys.stderr=logFile;
## set up the details of the output, including the locations and errors
if (args.outFP is None):
### I don't know what does this mean, I assume it is when the output location is none, just print the name of the output
outFile= sys.stdout;
else:
if args.verbose>0: warnings.warn("Outputting to file "+args.outFP);
outFile = MYUTILS.smartGZOpen(args.outFP,'w');
## the function to read next file, read files by line(only one line), remove line breaks
def getNextRead(inFile):
name = inFile.readline().rstrip();
seq = inFile.readline().rstrip();
plus = inFile.readline().rstrip();
quality = inFile.readline().rstrip();
return (name,seq,quality);
## the function to test the alignment is match or not and return with the suitability
def testAlignment(seq1, seq2, offset):
score = 0.0;
for i in range(offset, len(seq1)):
if seq1[i] == seq2[i-offset]:
score+=1;
#else:
# print("mismatch at %i"%(i));
return score/(len(seq1)-offset)
## the function to return the alignment location of the sequences
def align(seq1, seq2, offset, overlapRange):
for i in range(0,overlapRange+1):
if testAlignment(seq1,seq2, offset + i) > 0.75:
return (offset + i)
elif testAlignment(seq1,seq2, offset - i) > 0.75:
return (offset - i)
## the function to output the alignment sequence
def getConsensus(read1, read2, offset):
consensus = [];
#print("from %i to %i"%(offset,len(read1[1])));
for i in range(offset, len(read1[1])):
if read1[2][i]> read2[2][i-offset]: #converts to ASCII, corresponds to quality; if equal, doesn't matter which is better since base is the same
consensus.append(read1[1][i]);
else:
consensus.append(read2[1][i-offset]);
#print("read 2 from %i"%((len(read1[1])-offset)));
return read1[1][0:(offset)] + "".join(consensus) + read2[1][(len(read1[1])-offset):]
args.overlap = int(args.overlap);
args.overlapRange = int(args.overlapRange);
#tests
#read1 = ("@M01581:878:000000000-AP1UB:1:1101:13431:1072 1:N:0:1", "TGCATTTTTTTCACATCTACTGAACGGTGACGGTTGCGGCCCACGTGGCCCCCGGGAAAGCGGAGGGCGGCTGCT", "CCCCCGGGGGGGGGGGGGFEGFGGGFGFCCFEGGGGGGGCC+CBFG8FEFFEEFEG@F<FFE7=FEDFGDECFEF");
#read2 = ("@M01581:878:000000000-AP1UB:1:1101:13431:1072 2:N:0:1", "AACAGCCGTAACCTCGACCTCGTCCACGCAGGTCAAGCAGCCGCCCTCCGCTTTCCCGGGGGCCACGTGGGCCGC", "CCCCCGGGGGGGGFGDGEG@F;FFGGEFGGGGEFGFFGF@,CFEEFCFCC@CFFGFFC@7C+@+8CFG8C7+B>F");
#print("seq1 = "+read1[1]);
#print("seq2 = "+read2[1]);
#print("rc(seq2) = "+revcomp(read2[1]));
#print("qual2 = "+read2[2]);
#read2 = (read2[0], revcomp(read2[1]), read2[2][::-1]);
#print("rev(qual2) = "+read2[2]);
#print(str(len(read1[1]) - 40));
#print("Alignment = "+str(testAlignment(read1[1], read2[1],len(read1[1]) - args.overlap)));
#print("Alignment offset = "+str(align(read1[1], read2[1],len(read1[1]) - args.overlap, args.overlapRange )));
#print("Consensus = " + getConsensus(read1, read2, align(read1[1], read2[1],len(read1[1]) - args.overlap, args.overlapRange )));
#read1 = ("@M01581:878:000000000-AP1UB:1:1101:13431:1072 1:N:0:1", "TGCATTTTTTTCACATCTACTGAACGGTGACGGTTGCGGCCCACGTGGCCCCCGGGAAAGCGGAGGGCGGCTGCG", "CCCCCGGGGGGGGGGGGGFEGFGGGFGFCCFEGGGGGGGCC+CBFG8FEFFEEFEG@F<FFE7=FEDFGDECFEJ");
#print("Consensus = " + getConsensus(read1, read2, align(read1[1], read2[1],len(read1[1]) - args.overlap, args.overlapRange )));
#read1 = ("@M01581:878:000000000-AP1UB:1:1101:13431:1072 1:N:0:1", "TGCATTTTTTTCACATCTACTGAACGGTGACGGTTGCGGCCCACGTGGCCCCCGGGAAAGCGGAGGGCGGCTGCG", "CCCCCGGGGGGGGGGGGGFEGFGGGFGFCCFEGGGGGGGCC+CBFG8FEFFEEFEG@F<FFE7=FEDFGDECFE!");
#print("Consensus = " + getConsensus(read1, read2, align(read1[1], read2[1],len(read1[1]) - args.overlap, args.overlapRange )));
#read1 = ("@NB501583:183:HYK5YBGX2:1:11101:12773:1776 1:N:0:GGAGCTAC+CTAGTCGA", "CTAATCAAGTGCTAGCTAGACATCTATATATAACAAGCACAGAACCGTCTAATTGGTATTTTTCAGGACATTTTAAACATCCGTACAACGAGAACCCATACATTACTTTTTTTAATATTCTCGACCACATAGCCCCCAATGCATGTCCACACAGAAAGTGTTGCGCACAACCCCGGTAACCATCCCTATG", "AAAAAEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEE<AEEEAAEEEEEEEEEEEEEEEEEEE<AAEEEEAEEEEEE<EEEEAEEEEAEEEEEEEEAEEAEEE<EE<AAAAEE<AAE<<<AAAAEAA66<666AA");
#read2 = ("@NB501583:183:HYK5YBGX2:1:11101:12773:1776 2:N:0:GGAGCTAC+CTAGTCGA", "CTGCTCAGGATCCTAAACGTTATAATACACGAAATGAATGCCTCTCTTAACAGTATACTTCTTTAATGATGAGTTATGTCCAACCTAGGGGGACTACTGGTGCCCATAGGGA", "AAAAAEEEEEEEEEEEEEEEEEEAEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEA<");
#read2 = (read2[0], revcomp(read2[1]), read2[2][::-1]);
#print("Alignment = "+str(testAlignment(read1[1], read2[1],len(read1[1]) - args.overlap)));
#print("Alignment offset = "+str(align(read1[1], read2[1],len(read1[1]) - args.overlap, args.overlapRange )));
#print("Best alignment = "+str(testAlignment(read1[1], read2[1],align(read1[1], read2[1],len(read1[1]) - args.overlap, args.overlapRange ))));
#print("Consensus = " + getConsensus(read1, read2, align(read1[1], read2[1],len(read1[1]) - args.overlap, args.overlapRange )));
#exit();
## determinr the alignment is done or not
notDone = True;
i=0;
while notDone:
i=i+1;
d1 = getNextRead(inFile1);
d2 = getNextRead(inFile2);
if d1[0]=="" or d2[0]=="":
if d1[0]=="" and d2[0]=="":
notDone=False;
else:
raise Exception("One of the files ended prematurely at fastq entry %i (line ~%i)"%(i,i*4));
else:
## I dont know what is the revcomp command is, dearch in google is the reverse complement
d2 = (d2[0], revcomp(d2[1]), d2[2][::-1]);
offset = align(d1[1],d2[1], len(d1[1]) - args.overlap, args.overlapRange);
if args.verbose>1:
sys.stderr.write("Alignment offset = %i\n"%(offset));
if offset is not None:
consensus = getConsensus(d1, d2, offset);
outFile.write("%s\t%s\t%i\n"%(d1[0],consensus,len(d1[1])-offset));
else:
sys.stderr.write("Skipping %s/%s because they don't align within given parameters\n"%(d1[0],d2[0]));
inFile1.close();
inFile2.close();
outFile.close();
if (args.logFP is not None):
logFile.close();
## this is the same command as before but after the if the command changed, I don't know why