forked from axelcournac/3C_tutorial
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfragment_attribution.py
233 lines (205 loc) · 7.02 KB
/
fragment_attribution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# -*- coding: utf-8 -*-
"""
This code assigns a restriction fragment to a locus (chrm-position).
@author: Remi Montagne (reimplementation of Axel Cournac's code)
"""
import sys
import glob
import os.path as op
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import Restriction
from Bio.Restriction import *
import argparse
##############################
# PARSE CL arguments
##############################
def _test():
import doctest
doctest.testmod()
def parse_args():
""" Gets the arguments from the command line."""
parser = argparse.ArgumentParser()
parser.add_argument(
"input_file",
type=str,
help="Path to the input file containing the coordinates of Hi-C "
"interacting pairs (dat format). Use '-' to read from stdin.",
)
parser.add_argument(
"output_file",
nargs="?",
type=str,
help="Path to the output file (dat.indices format). Defaults to stdout.",
)
parser.add_argument(
"-e",
"--enzyme",
required=True,
help="The restriction enzyme used in the Hi-C protocol. E.g. DpnII.",
)
parser.add_argument(
"-r",
"--reference",
type=lambda s: map(str, s.split(",")),
required=True,
help="Path to the reference genome in FASTA format. Can be multiple "
" files separated by commas.",
)
return parser.parse_args()
##############################
# FUNCTIONS
##############################
def get_restriction_table(seq, enz):
"""
Get the restriction table for a genomic sequence.
Parameters
----------
seq : Seq object
A biopython Seq object representing a chromosomes or contig.
enz : str
The name of the restriction enzyme used.
Returns
-------
list
List of restriction fragment boundary positions for the input sequence.
>>> get_restriction_table(Seq("AAGATCGATCGG"),"DpnII")
[0, 3, 7, 12]
>>> get_restriction_table(Seq("AA"),"DpnII")
[0, 2]
>>> get_restriction_table(Seq("AA"),"aeiou1")
Traceback (most recent call last):
...
ValueError: aeiou1 is not a valid restriction enzyme.
>>> get_restriction_table("AA","DpnII")
Traceback (most recent call last):
...
TypeError: Expected Seq or MutableSeq instance, got <class 'str'> instead
"""
# Restriction batch containing the restriction enzyme
try:
rb = RestrictionBatch([enz])
except ValueError:
raise ValueError("{} is not a valid restriction enzyme.".format(enz))
# Conversion from string type to restriction type
enz = rb.get(enz)
map_restriction = rb.search(seq)
map_restriction = map_restriction[enz]
map_restriction = [0] + map_restriction
map_restriction.append(len(seq))
return map_restriction
def find_frag(pos, rl):
"""
Use binary search to find the index of a chromosome restriction fragment
corresponding to an input genomic position.
Parameters
----------
pos : int
Genomic position, in base pairs.
rl : list
List of genomic positions corresponding to restriction sites.
Returns
-------
int
The 0-based index of the restriction fragment to which the position belongs.
>>> find_frag(15, [0, 20, 30])
0
>>> find_frag(15, [10, 20, 30])
Traceback (most recent call last):
...
ValueError: The first position in the restriction table is not 0.
>>> find_frag(31, [0, 20, 30])
Traceback (most recent call last):
...
ValueError: Read position is larger than last entry in restriction table.
"""
if rl[0] != 0:
raise ValueError("The first position in the restriction table is not 0.")
if pos > rl[-1]:
raise ValueError(
"Read position is larger than last entry in restriction table."
)
bottom = 0 # indice of the first restriction fragment
top = len(rl) - 1 # indice of the last restriction fragment
found = False
# binary search for the index of the read
while not found:
index = int((top + bottom) / 2) # $index <- middle of the chr
# if the position of the read is smaller than the position of the current index
if top == bottom:
found = True
if pos < rl[index]:
top = index
elif pos >= rl[index + 1]:
bottom = index
else:
found = True
return index
def write_indices_file(infile, outfile, restriction_table):
"""
Writes the "dat.indices" file, which has two more columns than the input
file corresponding to the restriction fragment index of each read.
Parameters
----------
infile: file object
File handle for the input .dat file.
outfile: file object
File handle for the output .dat.indices file.
restriction_table: dict
Dictionary with chromosome identifiers (str) as keys and list of
positions (int) of restriction sites as values.
"""
# eye candy progress animation
load_anim = ["\\", "|", "/", "-"]
prog = 0
# Open files for reading and writing
i = 0
for line in infile: # iterate over each line
i += 1
if i % 5000000 == 0:
prog += 1
print(
"Attribution of index to reads: {0}\r".format(load_anim[prog % 4]),
end="",
file=sys.stderr,
)
chr1, pos1, sens1, chr2, pos2, sens2 = line.split() # split it by whitespace
pos1 = int(pos1)
sens1 = sens1
pos2 = int(pos2)
sens2 = sens2
indice1 = find_frag(pos1, restriction_table[chr1])
indice2 = find_frag(pos2, restriction_table[chr2])
outfile.write(
"{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n".format(
chr1, pos1, sens1, indice1, chr2, pos2, sens2, indice2
)
)
if __name__ == "__main__":
print(
"Assigns restriction fragment ID to Hi-C records in a .dat file.",
file=sys.stderr,
)
# Get arguments
args = parse_args()
enz = args.enzyme
refs = args.reference
restriction_table = {}
for ref in refs:
with open(ref, "rU") as ref_handle:
for rec in SeqIO.parse(ref_handle, "fasta"):
chr_name = rec.id
# Get the lists of restriction sites of each chromosome into
# a restriction_table
restriction_table[chr_name] = get_restriction_table(rec.seq, enz)
print(
"{0}: {1} restriction sites".format(
chr_name, len(restriction_table[chr_name])
),
file=sys.stderr,
)
# Associate each read from .dat file with its corresponding restriction
# fragment index and writes the resulting line
input_handle = sys.stdin if args.input_file == "-" else open(args.input_file, "r")
output_handle = sys.stdout if not args.output_file else open(args.output_file, "w")
write_indices_file(input_handle, output_handle, restriction_table)