This repository was archived by the owner on Jul 22, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbedSubsampler.py
93 lines (80 loc) · 2.97 KB
/
bedSubsampler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import argparse
import random
from modules.BedHandler import BedHandler
"""
Subsample the bed file for homozygous cases for training faster
"""
class BedSubsampler:
"""
Downsamples a bed file
"""
def __init__(self, bed_file_path):
"""
Initialize a bed subsampler
:param bed_file_path:
"""
# --- initialize handlers ---
self.bed_handler = BedHandler(bed_file_path)
@staticmethod
def select_or_not(bed_record, downsample_rate):
"""
Determines if a bed record should be selected given a downsampling rate
:param bed_record: A bed record
:param downsample_rate: A downsampling probability
:return: Boolean
"""
chromosome_name, start_position, end_position, ref, alts, genotype = tuple(bed_record.rstrip().split('\t')[0:6])
genotype = int(genotype)
# if not homozygous, always pick
if genotype != 0:
return True
# else do a sampling based on probability
random_chance = random.uniform(0, 1)
if random_chance <= downsample_rate:
return True
return False
def downsample_bed_file(self):
"""
Downsample the bed file
:return:
"""
# calculate the downsample rate based on distribution of three classes
downsample_rate = max(self.bed_handler.total_het, self.bed_handler.total_hom_alt) / self.bed_handler.total_hom
downsample_rate = 2 * downsample_rate
for i, record in enumerate(self.bed_handler.all_bed_records):
if BedSubsampler.select_or_not(record, downsample_rate) is True:
print(record, end='')
def print_bed_stats(self):
"""
Print the distribution of the bed file
:return:
"""
print("Total records: ")
print(self.bed_handler.total_hom, self.bed_handler.total_het, self.bed_handler.total_hom_alt)
total_cases = self.bed_handler.total_hom + self.bed_handler.total_het + self.bed_handler.total_hom_alt
print("Percent homozygous records:\t", int(self.bed_handler.total_hom * 100 / total_cases))
print("Percent Heterozygous records:\t", int(self.bed_handler.total_het * 100 / total_cases))
print("Percent Hom-alt records:\t", int(self.bed_handler.total_hom_alt * 100 / total_cases))
if __name__ == '__main__':
'''
Processes arguments and performs tasks to generate the pileup.
'''
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument(
"--bed",
type=str,
required=True,
help="bed file path."
)
parser.add_argument(
"--stats",
type=bool,
help="If true, will print bed file stats."
)
FLAGS, unparsed = parser.parse_known_args()
down_sampler = BedSubsampler(FLAGS.bed)
if FLAGS.stats is True:
down_sampler.print_bed_stats()
else:
down_sampler.downsample_bed_file()