Skip to content

Commit acd7ca3

Browse files
committed
fisque
1 parent 9926470 commit acd7ca3

File tree

1 file changed

+140
-140
lines changed

1 file changed

+140
-140
lines changed

ogs_merge/ogs_merge

+140-140
Original file line number | Diff line number | Diff line change
@@ -132,148 +132,148 @@ class OgsMerger():
132132

133133
has_multiple_isoforms = len(f.sub_features) > 1
134134

135-
for child in f.sub_features:
135+
if f.type not in ("transcript", "mRNA"):
136+
print("Found unexpected feature under gene, %s, leaving as is" % f.type)
137+
else:
136138

137-
if child.type not in ("transcript", "mRNA"):
138-
print("Found unexpected feature under gene, %s, leaving as is" % child.type)
139-
continue
139+
for child in f.sub_features:
140140

141-
rna_start = f.location.start
142-
rna_end = f.location.end
141+
rna_start = f.location.start
142+
rna_end = f.location.end
143143

144-
if not self.first_isoform_not_numbered or mrna_count > 0:
145-
isoform_suffix = self.isoform_prefix
146-
if self.use_numbers_for_isoform:
147-
isoform_suffix += str(mrna_count + 1)
144+
if not self.first_isoform_not_numbered or mrna_count > 0:
145+
isoform_suffix = self.isoform_prefix
146+
if self.use_numbers_for_isoform:
147+
isoform_suffix += str(mrna_count + 1)
148+
else:
149+
isoform_suffix += (string.ascii_uppercase[mrna_count] * mrna_count_cycle)
148150
else:
149-
isoform_suffix += (string.ascii_uppercase[mrna_count] * mrna_count_cycle)
150-
else:
151-
isoform_suffix = ""
152-
153-
child.qualifiers['source'][0] = self.source
154-
if 'filtertag' in child.qualifiers:
155-
del child.qualifiers['filtertag']
156-
if 'owner' in child.qualifiers:
157-
del child.qualifiers['owner']
158-
mrna_id = gene_id + isoform_suffix
159-
if 'Name' not in child.qualifiers:
160-
child.qualifiers['Name'] = [mrna_id]
161-
else:
162-
child.qualifiers['Name'][0] = mrna_id
151+
isoform_suffix = ""
152+
153+
child.qualifiers['source'][0] = self.source
154+
if 'filtertag' in child.qualifiers:
155+
del child.qualifiers['filtertag']
156+
if 'owner' in child.qualifiers:
157+
del child.qualifiers['owner']
158+
mrna_id = gene_id + isoform_suffix
159+
if 'Name' not in child.qualifiers:
160+
child.qualifiers['Name'] = [mrna_id]
161+
else:
162+
child.qualifiers['Name'][0] = mrna_id
163163

164-
# put apollo id to aliases and write a good ID
165-
current_id = child.qualifiers['ID'][0]
166-
if re.match("^[A-F0-9]{32}$", current_id) or re.match("^[a-f0-9-]{36}$", current_id):
167-
if 'Alias' not in child.qualifiers:
168-
child.qualifiers['Alias'] = []
169-
child.qualifiers['Alias'].append(current_id)
164+
# put apollo id to aliases and write a good ID
165+
current_id = child.qualifiers['ID'][0]
166+
if re.match("^[A-F0-9]{32}$", current_id) or re.match("^[a-f0-9-]{36}$", current_id):
167+
if 'Alias' not in child.qualifiers:
168+
child.qualifiers['Alias'] = []
169+
child.qualifiers['Alias'].append(current_id)
170170

171-
if 'ID' not in child.qualifiers:
172-
child.qualifiers['ID'] = [mrna_id]
173-
else:
174-
child.qualifiers['ID'][0] = mrna_id
175-
176-
child.qualifiers['Parent'][0] = gene_id
177-
178-
# Some qualifiers are not needed outside apollo
179-
if 'status' in child.qualifiers:
180-
del child.qualifiers['status']
181-
if 'annotGroup' in child.qualifiers:
182-
del child.qualifiers['annotGroup']
183-
184-
# Transfer attributes to mRNA level
185-
if 'symbol' in f.qualifiers:
186-
child.qualifiers['symbol'] = f.qualifiers['symbol']
187-
if 'allele' in f.qualifiers:
188-
child.qualifiers['allele'] = f.qualifiers['allele']
189-
if 'part' in f.qualifiers:
190-
child.qualifiers['part'] = f.qualifiers['part']
191-
if 'synonym' in f.qualifiers:
192-
if 'synonym' not in child.qualifiers:
193-
child.qualifiers['synonym'] = []
194-
child.qualifiers['synonym'] += f.qualifiers['synonym']
195-
if 'Note' in f.qualifiers:
196-
if 'Note' not in child.qualifiers:
197-
child.qualifiers['Note'] = []
198-
child.qualifiers['Note'] += f.qualifiers['Note']
199-
if 'Dbxref' in f.qualifiers:
200-
if 'Dbxref' not in child.qualifiers:
201-
child.qualifiers['Dbxref'] = []
202-
child.qualifiers['Dbxref'] += f.qualifiers['Dbxref']
203-
204-
# Transfer only if not multiple isoforms
205-
if not has_multiple_isoforms:
206-
if 'Name' in f.qualifiers:
207-
child.qualifiers['full_name'] = f.qualifiers['Name']
208-
209-
# Transfer without replacing isoform information
210-
if not has_multiple_isoforms or 'description' not in child.qualifiers or not child.qualifiers['description']:
211-
if 'description' in f.qualifiers:
212-
child.qualifiers['description'] = f.qualifiers['description']
213-
214-
# Add previous versions as aliases
215-
if len(gene_id_splitted) > 1:
216-
if 'Alias' not in child.qualifiers:
217-
child.qualifiers['Alias'] = []
218-
child.qualifiers['Alias'].append(gene_id_no_version + isoform_suffix)
219-
for old_version in range(1, gene_version):
220-
child.qualifiers['Alias'].append(gene_id_no_version + '.' + str(old_version) + isoform_suffix)
221-
222-
# Remove uppercase variants if any
223-
if 'Allele' in child.qualifiers:
224-
del child.qualifiers['Allele']
225-
if 'Part' in child.qualifiers:
226-
del child.qualifiers['Part']
227-
if 'Synonym' in child.qualifiers:
228-
del child.qualifiers['Synonym']
229-
230-
id_count = 1
231-
for gchild in child.sub_features: # exons, cds, ...
232-
gchild.qualifiers['source'][0] = self.source
233-
if 'filtertag' in gchild.qualifiers:
234-
del gchild.qualifiers['filtertag']
235-
if 'ID' in gchild.qualifiers:
236-
gchild.qualifiers['ID'][0] = mrna_id + "-" + gchild.type + "-" + str(id_count)
237-
if 'owner' in gchild.qualifiers:
238-
del gchild.qualifiers['owner']
239-
gchild.qualifiers['Name'] = [mrna_id + "-" + gchild.type] # Will create the Name array if not yet present
240-
gchild.qualifiers['Parent'][0] = mrna_id
241-
id_count_gg = 1
242-
for ggchild in gchild.sub_features: # exotic stuff (non_canonical_five_prime_splice_site non_canonical_three_prime_splice_site stop_codon_read_through)
243-
ggchild.qualifiers['source'][0] = self.source
244-
if 'filtertag' in ggchild.qualifiers:
245-
del ggchild.qualifiers['filtertag']
246-
if 'owner' in ggchild.qualifiers:
247-
del ggchild.qualifiers['owner']
248-
ggchild.qualifiers['ID'][0] = mrna_id + "-" + gchild.type + "-" + str(id_count) + "-" + ggchild.type + "-" + str(id_count_gg)
249-
ggchild.qualifiers['Name'][0] = mrna_id + "-" + gchild.type + "-" + ggchild.type
250-
ggchild.qualifiers['Parent'][0] = mrna_id + "-" + gchild.type + "-" + str(id_count)
251-
id_count_gg += 1
252-
id_count += 1
253-
254-
# Check exon/utrs are not outside the gene/mrna coordinates
255-
if gchild.location.start < rna_start or gchild.location.end < rna_start:
256-
rna_start = min(gchild.location.start, gchild.location.end)
257-
if gchild.location.start > rna_end or gchild.location.end > rna_end:
258-
rna_end = max(gchild.location.start, gchild.location.end)
259-
260-
if rna_start != child.location.start:
261-
print("Fixing start coordinate for mRNA {} from {} to {}".format(child.qualifiers['ID'][0], child.location.start, rna_start))
262-
child.location = FeatureLocation(rna_start, child.location.end, strand=child.location.strand)
263-
if rna_end != child.location.end:
264-
print("Fixing end coordinate for mRNA {} from {} to {}".format(child.qualifiers['ID'][0], child.location.end, rna_end))
265-
child.location = FeatureLocation(child.location.start, rna_end, strand=child.location.strand)
266-
267-
# Check mrna are not outside the gene coordinates
268-
if child.location.start < gene_start or child.location.end < gene_start:
269-
gene_start = min(child.location.start, child.location.end)
270-
if child.location.start > gene_end or child.location.end > gene_end:
271-
gene_end = max(child.location.start, child.location.end)
272-
273-
mrna_count += 1
274-
if mrna_count >= 25:
275-
mrna_count = 0
276-
mrna_count_cycle += 1
171+
if 'ID' not in child.qualifiers:
172+
child.qualifiers['ID'] = [mrna_id]
173+
else:
174+
child.qualifiers['ID'][0] = mrna_id
175+
176+
child.qualifiers['Parent'][0] = gene_id
177+
178+
# Some qualifiers are not needed outside apollo
179+
if 'status' in child.qualifiers:
180+
del child.qualifiers['status']
181+
if 'annotGroup' in child.qualifiers:
182+
del child.qualifiers['annotGroup']
183+
184+
# Transfer attributes to mRNA level
185+
if 'symbol' in f.qualifiers:
186+
child.qualifiers['symbol'] = f.qualifiers['symbol']
187+
if 'allele' in f.qualifiers:
188+
child.qualifiers['allele'] = f.qualifiers['allele']
189+
if 'part' in f.qualifiers:
190+
child.qualifiers['part'] = f.qualifiers['part']
191+
if 'synonym' in f.qualifiers:
192+
if 'synonym' not in child.qualifiers:
193+
child.qualifiers['synonym'] = []
194+
child.qualifiers['synonym'] += f.qualifiers['synonym']
195+
if 'Note' in f.qualifiers:
196+
if 'Note' not in child.qualifiers:
197+
child.qualifiers['Note'] = []
198+
child.qualifiers['Note'] += f.qualifiers['Note']
199+
if 'Dbxref' in f.qualifiers:
200+
if 'Dbxref' not in child.qualifiers:
201+
child.qualifiers['Dbxref'] = []
202+
child.qualifiers['Dbxref'] += f.qualifiers['Dbxref']
203+
204+
# Transfer only if not multiple isoforms
205+
if not has_multiple_isoforms:
206+
if 'Name' in f.qualifiers:
207+
child.qualifiers['full_name'] = f.qualifiers['Name']
208+
209+
# Transfer without replacing isoform information
210+
if not has_multiple_isoforms or 'description' not in child.qualifiers or not child.qualifiers['description']:
211+
if 'description' in f.qualifiers:
212+
child.qualifiers['description'] = f.qualifiers['description']
213+
214+
# Add previous versions as aliases
215+
if len(gene_id_splitted) > 1:
216+
if 'Alias' not in child.qualifiers:
217+
child.qualifiers['Alias'] = []
218+
child.qualifiers['Alias'].append(gene_id_no_version + isoform_suffix)
219+
for old_version in range(1, gene_version):
220+
child.qualifiers['Alias'].append(gene_id_no_version + '.' + str(old_version) + isoform_suffix)
221+
222+
# Remove uppercase variants if any
223+
if 'Allele' in child.qualifiers:
224+
del child.qualifiers['Allele']
225+
if 'Part' in child.qualifiers:
226+
del child.qualifiers['Part']
227+
if 'Synonym' in child.qualifiers:
228+
del child.qualifiers['Synonym']
229+
230+
id_count = 1
231+
for gchild in child.sub_features: # exons, cds, ...
232+
gchild.qualifiers['source'][0] = self.source
233+
if 'filtertag' in gchild.qualifiers:
234+
del gchild.qualifiers['filtertag']
235+
if 'ID' in gchild.qualifiers:
236+
gchild.qualifiers['ID'][0] = mrna_id + "-" + gchild.type + "-" + str(id_count)
237+
if 'owner' in gchild.qualifiers:
238+
del gchild.qualifiers['owner']
239+
gchild.qualifiers['Name'] = [mrna_id + "-" + gchild.type] # Will create the Name array if not yet present
240+
gchild.qualifiers['Parent'][0] = mrna_id
241+
id_count_gg = 1
242+
for ggchild in gchild.sub_features: # exotic stuff (non_canonical_five_prime_splice_site non_canonical_three_prime_splice_site stop_codon_read_through)
243+
ggchild.qualifiers['source'][0] = self.source
244+
if 'filtertag' in ggchild.qualifiers:
245+
del ggchild.qualifiers['filtertag']
246+
if 'owner' in ggchild.qualifiers:
247+
del ggchild.qualifiers['owner']
248+
ggchild.qualifiers['ID'][0] = mrna_id + "-" + gchild.type + "-" + str(id_count) + "-" + ggchild.type + "-" + str(id_count_gg)
249+
ggchild.qualifiers['Name'][0] = mrna_id + "-" + gchild.type + "-" + ggchild.type
250+
ggchild.qualifiers['Parent'][0] = mrna_id + "-" + gchild.type + "-" + str(id_count)
251+
id_count_gg += 1
252+
id_count += 1
253+
254+
# Check exon/utrs are not outside the gene/mrna coordinates
255+
if gchild.location.start < rna_start or gchild.location.end < rna_start:
256+
rna_start = min(gchild.location.start, gchild.location.end)
257+
if gchild.location.start > rna_end or gchild.location.end > rna_end:
258+
rna_end = max(gchild.location.start, gchild.location.end)
259+
260+
if rna_start != child.location.start:
261+
print("Fixing start coordinate for mRNA {} from {} to {}".format(child.qualifiers['ID'][0], child.location.start, rna_start))
262+
child.location = FeatureLocation(rna_start, child.location.end, strand=child.location.strand)
263+
if rna_end != child.location.end:
264+
print("Fixing end coordinate for mRNA {} from {} to {}".format(child.qualifiers['ID'][0], child.location.end, rna_end))
265+
child.location = FeatureLocation(child.location.start, rna_end, strand=child.location.strand)
266+
267+
# Check mrna are not outside the gene coordinates
268+
if child.location.start < gene_start or child.location.end < gene_start:
269+
gene_start = min(child.location.start, child.location.end)
270+
if child.location.start > gene_end or child.location.end > gene_end:
271+
gene_end = max(child.location.start, child.location.end)
272+
273+
mrna_count += 1
274+
if mrna_count >= 25:
275+
mrna_count = 0
276+
mrna_count_cycle += 1
277277

278278
if gene_start != f.location.start:
279279
print("Fixing start coordinate for gene {} from {} to {}".format(f.qualifiers['ID'][0], f.location.start, gene_start))
@@ -287,11 +287,11 @@ class OgsMerger():
287287
# Add exons subfeatures to a feature, guessing from UTR and CDS subfeatures
288288
def guess_exons(self, cleaned_f):
289289

290-
for child in cleaned_f.sub_features: # mRNA
290+
if cleaned_f.type not in ("transcript", "mRNA"):
291+
print("Found unexpected feature under gene, %s, not guessing exons etc" % cleaned_f.type)
292+
return cleaned_f
291293

292-
if child.type not in ("transcript", "mRNA"):
293-
print("Found unexpected feature under gene, %s, not guessing exons etc" % child.type)
294-
continue
294+
for child in cleaned_f.sub_features: # mRNA
295295

296296
cds_coords = {}
297297
utr_coords = {}

0 commit comments

Comments
 (0)