@@ -132,148 +132,148 @@ class OgsMerger():
132
132
133
133
has_multiple_isoforms = len (f .sub_features ) > 1
134
134
135
- for child in f .sub_features :
135
+ if f .type not in ("transcript" , "mRNA" ):
136
+ print ("Found unexpected feature under gene, %s, leaving as is" % f .type )
137
+ else :
136
138
137
- if child .type not in ("transcript" , "mRNA" ):
138
- print ("Found unexpected feature under gene, %s, leaving as is" % child .type )
139
- continue
139
+ for child in f .sub_features :
140
140
141
- rna_start = f .location .start
142
- rna_end = f .location .end
141
+ rna_start = f .location .start
142
+ rna_end = f .location .end
143
143
144
- if not self .first_isoform_not_numbered or mrna_count > 0 :
145
- isoform_suffix = self .isoform_prefix
146
- if self .use_numbers_for_isoform :
147
- isoform_suffix += str (mrna_count + 1 )
144
+ if not self .first_isoform_not_numbered or mrna_count > 0 :
145
+ isoform_suffix = self .isoform_prefix
146
+ if self .use_numbers_for_isoform :
147
+ isoform_suffix += str (mrna_count + 1 )
148
+ else :
149
+ isoform_suffix += (string .ascii_uppercase [mrna_count ] * mrna_count_cycle )
148
150
else :
149
- isoform_suffix += (string .ascii_uppercase [mrna_count ] * mrna_count_cycle )
150
- else :
151
- isoform_suffix = ""
152
-
153
- child .qualifiers ['source' ][0 ] = self .source
154
- if 'filtertag' in child .qualifiers :
155
- del child .qualifiers ['filtertag' ]
156
- if 'owner' in child .qualifiers :
157
- del child .qualifiers ['owner' ]
158
- mrna_id = gene_id + isoform_suffix
159
- if 'Name' not in child .qualifiers :
160
- child .qualifiers ['Name' ] = [mrna_id ]
161
- else :
162
- child .qualifiers ['Name' ][0 ] = mrna_id
151
+ isoform_suffix = ""
152
+
153
+ child .qualifiers ['source' ][0 ] = self .source
154
+ if 'filtertag' in child .qualifiers :
155
+ del child .qualifiers ['filtertag' ]
156
+ if 'owner' in child .qualifiers :
157
+ del child .qualifiers ['owner' ]
158
+ mrna_id = gene_id + isoform_suffix
159
+ if 'Name' not in child .qualifiers :
160
+ child .qualifiers ['Name' ] = [mrna_id ]
161
+ else :
162
+ child .qualifiers ['Name' ][0 ] = mrna_id
163
163
164
- # put apollo id to aliases and write a good ID
165
- current_id = child .qualifiers ['ID' ][0 ]
166
- if re .match ("^[A-F0-9]{32}$" , current_id ) or re .match ("^[a-f0-9-]{36}$" , current_id ):
167
- if 'Alias' not in child .qualifiers :
168
- child .qualifiers ['Alias' ] = []
169
- child .qualifiers ['Alias' ].append (current_id )
164
+ # put apollo id to aliases and write a good ID
165
+ current_id = child .qualifiers ['ID' ][0 ]
166
+ if re .match ("^[A-F0-9]{32}$" , current_id ) or re .match ("^[a-f0-9-]{36}$" , current_id ):
167
+ if 'Alias' not in child .qualifiers :
168
+ child .qualifiers ['Alias' ] = []
169
+ child .qualifiers ['Alias' ].append (current_id )
170
170
171
- if 'ID' not in child .qualifiers :
172
- child .qualifiers ['ID' ] = [mrna_id ]
173
- else :
174
- child .qualifiers ['ID' ][0 ] = mrna_id
175
-
176
- child .qualifiers ['Parent' ][0 ] = gene_id
177
-
178
- # Some qualifiers are not needed outside apollo
179
- if 'status' in child .qualifiers :
180
- del child .qualifiers ['status' ]
181
- if 'annotGroup' in child .qualifiers :
182
- del child .qualifiers ['annotGroup' ]
183
-
184
- # Transfer attributes to mRNA level
185
- if 'symbol' in f .qualifiers :
186
- child .qualifiers ['symbol' ] = f .qualifiers ['symbol' ]
187
- if 'allele' in f .qualifiers :
188
- child .qualifiers ['allele' ] = f .qualifiers ['allele' ]
189
- if 'part' in f .qualifiers :
190
- child .qualifiers ['part' ] = f .qualifiers ['part' ]
191
- if 'synonym' in f .qualifiers :
192
- if 'synonym' not in child .qualifiers :
193
- child .qualifiers ['synonym' ] = []
194
- child .qualifiers ['synonym' ] += f .qualifiers ['synonym' ]
195
- if 'Note' in f .qualifiers :
196
- if 'Note' not in child .qualifiers :
197
- child .qualifiers ['Note' ] = []
198
- child .qualifiers ['Note' ] += f .qualifiers ['Note' ]
199
- if 'Dbxref' in f .qualifiers :
200
- if 'Dbxref' not in child .qualifiers :
201
- child .qualifiers ['Dbxref' ] = []
202
- child .qualifiers ['Dbxref' ] += f .qualifiers ['Dbxref' ]
203
-
204
- # Transfer only if not multiple isoforms
205
- if not has_multiple_isoforms :
206
- if 'Name' in f .qualifiers :
207
- child .qualifiers ['full_name' ] = f .qualifiers ['Name' ]
208
-
209
- # Transfer without replacing isoform information
210
- if not has_multiple_isoforms or 'description' not in child .qualifiers or not child .qualifiers ['description' ]:
211
- if 'description' in f .qualifiers :
212
- child .qualifiers ['description' ] = f .qualifiers ['description' ]
213
-
214
- # Add previous versions as aliases
215
- if len (gene_id_splitted ) > 1 :
216
- if 'Alias' not in child .qualifiers :
217
- child .qualifiers ['Alias' ] = []
218
- child .qualifiers ['Alias' ].append (gene_id_no_version + isoform_suffix )
219
- for old_version in range (1 , gene_version ):
220
- child .qualifiers ['Alias' ].append (gene_id_no_version + '.' + str (old_version ) + isoform_suffix )
221
-
222
- # Remove uppercase variants if any
223
- if 'Allele' in child .qualifiers :
224
- del child .qualifiers ['Allele' ]
225
- if 'Part' in child .qualifiers :
226
- del child .qualifiers ['Part' ]
227
- if 'Synonym' in child .qualifiers :
228
- del child .qualifiers ['Synonym' ]
229
-
230
- id_count = 1
231
- for gchild in child .sub_features : # exons, cds, ...
232
- gchild .qualifiers ['source' ][0 ] = self .source
233
- if 'filtertag' in gchild .qualifiers :
234
- del gchild .qualifiers ['filtertag' ]
235
- if 'ID' in gchild .qualifiers :
236
- gchild .qualifiers ['ID' ][0 ] = mrna_id + "-" + gchild .type + "-" + str (id_count )
237
- if 'owner' in gchild .qualifiers :
238
- del gchild .qualifiers ['owner' ]
239
- gchild .qualifiers ['Name' ] = [mrna_id + "-" + gchild .type ] # Will create the Name array if not yet present
240
- gchild .qualifiers ['Parent' ][0 ] = mrna_id
241
- id_count_gg = 1
242
- for ggchild in gchild .sub_features : # exotic stuff (non_canonical_five_prime_splice_site non_canonical_three_prime_splice_site stop_codon_read_through)
243
- ggchild .qualifiers ['source' ][0 ] = self .source
244
- if 'filtertag' in ggchild .qualifiers :
245
- del ggchild .qualifiers ['filtertag' ]
246
- if 'owner' in ggchild .qualifiers :
247
- del ggchild .qualifiers ['owner' ]
248
- ggchild .qualifiers ['ID' ][0 ] = mrna_id + "-" + gchild .type + "-" + str (id_count ) + "-" + ggchild .type + "-" + str (id_count_gg )
249
- ggchild .qualifiers ['Name' ][0 ] = mrna_id + "-" + gchild .type + "-" + ggchild .type
250
- ggchild .qualifiers ['Parent' ][0 ] = mrna_id + "-" + gchild .type + "-" + str (id_count )
251
- id_count_gg += 1
252
- id_count += 1
253
-
254
- # Check exon/utrs are not outside the gene/mrna coordinates
255
- if gchild .location .start < rna_start or gchild .location .end < rna_start :
256
- rna_start = min (gchild .location .start , gchild .location .end )
257
- if gchild .location .start > rna_end or gchild .location .end > rna_end :
258
- rna_end = max (gchild .location .start , gchild .location .end )
259
-
260
- if rna_start != child .location .start :
261
- print ("Fixing start coordinate for mRNA {} from {} to {}" .format (child .qualifiers ['ID' ][0 ], child .location .start , rna_start ))
262
- child .location = FeatureLocation (rna_start , child .location .end , strand = child .location .strand )
263
- if rna_end != child .location .end :
264
- print ("Fixing end coordinate for mRNA {} from {} to {}" .format (child .qualifiers ['ID' ][0 ], child .location .end , rna_end ))
265
- child .location = FeatureLocation (child .location .start , rna_end , strand = child .location .strand )
266
-
267
- # Check mrna are not outside the gene coordinates
268
- if child .location .start < gene_start or child .location .end < gene_start :
269
- gene_start = min (child .location .start , child .location .end )
270
- if child .location .start > gene_end or child .location .end > gene_end :
271
- gene_end = max (child .location .start , child .location .end )
272
-
273
- mrna_count += 1
274
- if mrna_count >= 25 :
275
- mrna_count = 0
276
- mrna_count_cycle += 1
171
+ if 'ID' not in child .qualifiers :
172
+ child .qualifiers ['ID' ] = [mrna_id ]
173
+ else :
174
+ child .qualifiers ['ID' ][0 ] = mrna_id
175
+
176
+ child .qualifiers ['Parent' ][0 ] = gene_id
177
+
178
+ # Some qualifiers are not needed outside apollo
179
+ if 'status' in child .qualifiers :
180
+ del child .qualifiers ['status' ]
181
+ if 'annotGroup' in child .qualifiers :
182
+ del child .qualifiers ['annotGroup' ]
183
+
184
+ # Transfer attributes to mRNA level
185
+ if 'symbol' in f .qualifiers :
186
+ child .qualifiers ['symbol' ] = f .qualifiers ['symbol' ]
187
+ if 'allele' in f .qualifiers :
188
+ child .qualifiers ['allele' ] = f .qualifiers ['allele' ]
189
+ if 'part' in f .qualifiers :
190
+ child .qualifiers ['part' ] = f .qualifiers ['part' ]
191
+ if 'synonym' in f .qualifiers :
192
+ if 'synonym' not in child .qualifiers :
193
+ child .qualifiers ['synonym' ] = []
194
+ child .qualifiers ['synonym' ] += f .qualifiers ['synonym' ]
195
+ if 'Note' in f .qualifiers :
196
+ if 'Note' not in child .qualifiers :
197
+ child .qualifiers ['Note' ] = []
198
+ child .qualifiers ['Note' ] += f .qualifiers ['Note' ]
199
+ if 'Dbxref' in f .qualifiers :
200
+ if 'Dbxref' not in child .qualifiers :
201
+ child .qualifiers ['Dbxref' ] = []
202
+ child .qualifiers ['Dbxref' ] += f .qualifiers ['Dbxref' ]
203
+
204
+ # Transfer only if not multiple isoforms
205
+ if not has_multiple_isoforms :
206
+ if 'Name' in f .qualifiers :
207
+ child .qualifiers ['full_name' ] = f .qualifiers ['Name' ]
208
+
209
+ # Transfer without replacing isoform information
210
+ if not has_multiple_isoforms or 'description' not in child .qualifiers or not child .qualifiers ['description' ]:
211
+ if 'description' in f .qualifiers :
212
+ child .qualifiers ['description' ] = f .qualifiers ['description' ]
213
+
214
+ # Add previous versions as aliases
215
+ if len (gene_id_splitted ) > 1 :
216
+ if 'Alias' not in child .qualifiers :
217
+ child .qualifiers ['Alias' ] = []
218
+ child .qualifiers ['Alias' ].append (gene_id_no_version + isoform_suffix )
219
+ for old_version in range (1 , gene_version ):
220
+ child .qualifiers ['Alias' ].append (gene_id_no_version + '.' + str (old_version ) + isoform_suffix )
221
+
222
+ # Remove uppercase variants if any
223
+ if 'Allele' in child .qualifiers :
224
+ del child .qualifiers ['Allele' ]
225
+ if 'Part' in child .qualifiers :
226
+ del child .qualifiers ['Part' ]
227
+ if 'Synonym' in child .qualifiers :
228
+ del child .qualifiers ['Synonym' ]
229
+
230
+ id_count = 1
231
+ for gchild in child .sub_features : # exons, cds, ...
232
+ gchild .qualifiers ['source' ][0 ] = self .source
233
+ if 'filtertag' in gchild .qualifiers :
234
+ del gchild .qualifiers ['filtertag' ]
235
+ if 'ID' in gchild .qualifiers :
236
+ gchild .qualifiers ['ID' ][0 ] = mrna_id + "-" + gchild .type + "-" + str (id_count )
237
+ if 'owner' in gchild .qualifiers :
238
+ del gchild .qualifiers ['owner' ]
239
+ gchild .qualifiers ['Name' ] = [mrna_id + "-" + gchild .type ] # Will create the Name array if not yet present
240
+ gchild .qualifiers ['Parent' ][0 ] = mrna_id
241
+ id_count_gg = 1
242
+ for ggchild in gchild .sub_features : # exotic stuff (non_canonical_five_prime_splice_site non_canonical_three_prime_splice_site stop_codon_read_through)
243
+ ggchild .qualifiers ['source' ][0 ] = self .source
244
+ if 'filtertag' in ggchild .qualifiers :
245
+ del ggchild .qualifiers ['filtertag' ]
246
+ if 'owner' in ggchild .qualifiers :
247
+ del ggchild .qualifiers ['owner' ]
248
+ ggchild .qualifiers ['ID' ][0 ] = mrna_id + "-" + gchild .type + "-" + str (id_count ) + "-" + ggchild .type + "-" + str (id_count_gg )
249
+ ggchild .qualifiers ['Name' ][0 ] = mrna_id + "-" + gchild .type + "-" + ggchild .type
250
+ ggchild .qualifiers ['Parent' ][0 ] = mrna_id + "-" + gchild .type + "-" + str (id_count )
251
+ id_count_gg += 1
252
+ id_count += 1
253
+
254
+ # Check exon/utrs are not outside the gene/mrna coordinates
255
+ if gchild .location .start < rna_start or gchild .location .end < rna_start :
256
+ rna_start = min (gchild .location .start , gchild .location .end )
257
+ if gchild .location .start > rna_end or gchild .location .end > rna_end :
258
+ rna_end = max (gchild .location .start , gchild .location .end )
259
+
260
+ if rna_start != child .location .start :
261
+ print ("Fixing start coordinate for mRNA {} from {} to {}" .format (child .qualifiers ['ID' ][0 ], child .location .start , rna_start ))
262
+ child .location = FeatureLocation (rna_start , child .location .end , strand = child .location .strand )
263
+ if rna_end != child .location .end :
264
+ print ("Fixing end coordinate for mRNA {} from {} to {}" .format (child .qualifiers ['ID' ][0 ], child .location .end , rna_end ))
265
+ child .location = FeatureLocation (child .location .start , rna_end , strand = child .location .strand )
266
+
267
+ # Check mrna are not outside the gene coordinates
268
+ if child .location .start < gene_start or child .location .end < gene_start :
269
+ gene_start = min (child .location .start , child .location .end )
270
+ if child .location .start > gene_end or child .location .end > gene_end :
271
+ gene_end = max (child .location .start , child .location .end )
272
+
273
+ mrna_count += 1
274
+ if mrna_count >= 25 :
275
+ mrna_count = 0
276
+ mrna_count_cycle += 1
277
277
278
278
if gene_start != f .location .start :
279
279
print ("Fixing start coordinate for gene {} from {} to {}" .format (f .qualifiers ['ID' ][0 ], f .location .start , gene_start ))
@@ -287,11 +287,11 @@ class OgsMerger():
287
287
# Add exon subfeatures to a feature, inferring them from its UTR and CDS subfeatures
288
288
def guess_exons (self , cleaned_f ):
289
289
290
- for child in cleaned_f .sub_features : # mRNA
290
+ if cleaned_f .type not in ("transcript" , "mRNA" ):
291
+ print ("Found unexpected feature under gene, %s, not guessing exons etc" % cleaned_f .type )
292
+ return cleaned_f
291
293
292
- if child .type not in ("transcript" , "mRNA" ):
293
- print ("Found unexpected feature under gene, %s, not guessing exons etc" % child .type )
294
- continue
294
+ for child in cleaned_f .sub_features : # mRNA
295
295
296
296
cds_coords = {}
297
297
utr_coords = {}
0 commit comments