13
13
"([MDCLXVI]+[a-z]?)"
14
14
]
15
15
16
+
16
17
class ChromosomeParser :
17
18
def __init__ (self , gff ) -> None :
18
19
self .species = ''
@@ -22,21 +23,21 @@ def __init__(self, gff) -> None:
22
23
self .sequence_regions = []
23
24
self .chr_candidates = []
24
25
self ._parse (gff )
25
-
26
+
26
27
@property
def regex(self) -> str:
    """Return the complete expression: prefix, pattern and suffix, in that order."""
    pieces = (self.prefix, self.pattern, self.suffix)
    return "{}{}{}".format(*pieces)
29
-
30
+
30
31
def __repr__(self) -> str:
    """Return the species and the regex joined by a tab and a space.

    Empty values are left out, so an instance with no species shows only
    its regex, and an instance with neither shows an empty string.
    """
    shown = [value for value in (self.species, self.regex) if value]
    return "\t ".join(shown)
32
-
33
+
33
34
def _parse(self, gff) -> None:
    """Read a GFF file and record the sequence regions it declares.

    Sets three attributes:

    * ``species`` - the file's base name without its final extension.
    * ``sequence_regions`` - the second whitespace-separated field of every
      line that starts with ``##sequence-region``, in file order.
    * ``chr_candidates`` - the regions whose name contains ``.`` or ``_``.

    Args:
        gff: Path of the GFF file to read.
    """
    self.species, _ = os.path.splitext(os.path.basename(gff))

    regions = []
    with open(gff, "rt") as f:
        # Stream the file; only the directive lines are of interest.
        for line in f:
            if not line.startswith("##sequence-region"):
                continue
            # Split on any whitespace so tab-delimited directives work and
            # the trailing newline never ends up inside a region name.
            fields = line.split()
            # A directive without a name carries no region; skip it rather
            # than fail on the missing field.
            if len(fields) > 1:
                regions.append(fields[1])

    self.sequence_regions = regions
    self.chr_candidates = [r for r in regions if {".", "_"}.intersection(r)]
39
-
40
+
40
41
def generate_regex (self ) -> None :
41
42
self .has_suffix ()
42
43
prefix_pool = self ._iter_prefix_pool ()
@@ -60,7 +61,7 @@ def generate_regex(self) -> None:
60
61
max_match = len_match
61
62
self .prefix = prefix
62
63
self .pattern = pattern
63
-
64
+
64
65
def has_suffix (self ) -> bool :
65
66
"""
66
67
Assume any sequence_region with a version number "_v#" is a likely candidate for a chromosome.
@@ -72,24 +73,25 @@ def has_suffix(self) -> bool:
72
73
self .suffix = re .search (r"_v\d+" , with_suffix [0 ]).group ()
73
74
return True
74
75
return False
75
-
76
+
76
77
def _iter_prefix_pool(self):
    """Yield, one at a time, whatever common_prefixes returns for the chromosome candidates."""
    # NOTE: an earlier sketch also offered the first candidate's text up to
    # its first "_" (plus the "_") as an extra pool entry; it is not active.
    for entry in common_prefixes(self.chr_candidates):
        yield entry
82
-
83
+
83
84
def correct_prefix(self):
    """Replace the prefix with the one used by the first matching candidate.

    The current regex, anchored at the end and matched case-insensitively,
    is tried against each chromosome candidate. From the first candidate
    that matches, the text before its first ``_`` or ``.`` plus that
    separator character becomes the new ``self.prefix``.

    This is best-effort: if anything fails (no candidate matches, the
    regex does not compile, the candidate has no usable separator),
    ``self.prefix`` is left untouched and the species name is printed.
    """
    try:
        r = re.compile(r"{}$".format(self.regex), re.IGNORECASE)
        # Only the first match is needed, so stop there instead of
        # building the full list of matches.
        first_match = next(filter(r.match, self.chr_candidates))
        # Split on "_" or "." with no stray space in the pattern;
        # a leading space would keep underscore-separated names whole.
        prefix = re.split(r"_|\.", first_match)[0]
        separator = first_match.split(prefix)[1][0]
        self.prefix = prefix + separator
    except Exception:
        # Deliberately broad: report which species could not be corrected
        # and carry on with the existing prefix.
        print(self.species)
92
93
94
+
93
95
def common_prefixes (li ):
94
96
prefixes = []
95
97
for first_letter , prefix_batch in groupby (sorted (li ), key = itemgetter (0 )):
@@ -104,7 +106,7 @@ def common_prefixes(li):
104
106
prefixes .append (('' .join (prefix ), threshold ))
105
107
break
106
108
threshold = count
107
- prefix .append (char )
109
+ prefix .append (char )
108
110
return sorted (prefixes , key = itemgetter (1 ))
109
111
110
112
0 commit comments