Skip to content

Commit 3f32768

Browse files
committed
patch for other kinds of identifiers
1 parent 930deb4 commit 3f32768

File tree

1 file changed

+23
-12
lines changed

1 file changed

+23
-12
lines changed

ogs_merge/ogs_merge

+23-12
Original file line numberDiff line numberDiff line change
@@ -137,12 +137,18 @@ class OgsMerger():
137137
rna_start = f.location.start
138138
rna_end = f.location.end
139139

140+
isoform_suffix = self.isoform_prefix
141+
if self.use_numbers_for_isoform:
142+
isoform_suffix += (string.ascii_uppercase[mrna_count] * mrna_count_cycle)
143+
else:
144+
isoform_suffix += str(mrna_count + 1)
145+
140146
child.qualifiers['source'][0] = self.source
141147
if 'filtertag' in child.qualifiers:
142148
del child.qualifiers['filtertag']
143149
if 'owner' in child.qualifiers:
144150
del child.qualifiers['owner']
145-
mrna_id = gene_id + "-" + "R" + (string.ascii_uppercase[mrna_count] * mrna_count_cycle)
151+
mrna_id = gene_id + isoform_suffix
146152
if 'Name' not in child.qualifiers:
147153
child.qualifiers['Name'] = [mrna_id]
148154
else:
@@ -202,9 +208,9 @@ class OgsMerger():
202208
if len(gene_id_splitted) > 1:
203209
if 'Alias' not in child.qualifiers:
204210
child.qualifiers['Alias'] = []
205-
child.qualifiers['Alias'].append(gene_id_no_version + "-" + "R" + (string.ascii_uppercase[mrna_count] * mrna_count_cycle))
211+
child.qualifiers['Alias'].append(gene_id_no_version + isoform_suffix)
206212
for old_version in range(1, gene_version):
207-
child.qualifiers['Alias'].append(gene_id_no_version + '.' + str(old_version) + "-" + "R" + (string.ascii_uppercase[mrna_count] * mrna_count_cycle))
213+
child.qualifiers['Alias'].append(gene_id_no_version + '.' + str(old_version) + isoform_suffix)
208214

209215
# Remove uppercase variants if any
210216
if 'Allele' in child.qualifiers:
@@ -485,15 +491,15 @@ class OgsMerger():
485491
base_gff_in = open(self.filtered_base_gff, 'r')
486492
base_gff_out = open(self.tmpdir + '/base_cds.gff', 'w+')
487493

488-
if not self.regex_rna:
494+
if not self.exon_parent_regex:
489495
rna_reg = r'Parent=([a-zA-Z0-9]+)([\.0-9]+)?([-_]R[A-Z]+)?(,[a-zA-Z0-9\.\-_]*)?'
490496
else:
491-
rna_reg = r'Parent=' + self.regex_rna
497+
rna_reg = r'Parent=' + self.exon_parent_regex
492498

493-
if not self.regex_rna_replace:
499+
if not self.exon_parent_to_gene:
494500
rna_reg_rep = r'ID=\1'
495501
else:
496-
rna_reg_rep = r'ID=' + self.regex_rna_replace
502+
rna_reg_rep = r'ID=' + self.exon_parent_to_gene
497503

498504
for li in base_gff_in:
499505
cols = li.strip().split()
@@ -706,7 +712,7 @@ class OgsMerger():
706712
base_id = ""
707713

708714
if wa_id not in self.name_map:
709-
# A new gene that have no bedtools result and that wasn't found in the previous annotation version
715+
# A new gene that has no bedtools result and that wasn't found in the previous annotation version
710716
# Give it a completely new id
711717
base_id = self.id_syntax.replace('{id}', str(self.highest_id).zfill(self.padding_length))
712718
self.name_map[wa_id] = base_id + ".1"
@@ -960,6 +966,7 @@ class OgsMerger():
960966
for prot in prot_in:
961967
prot = prot.strip()
962968
if prot.startswith(">"):
969+
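# FIXME: the substitution below hard-codes the default '-R' isoform prefix followed by uppercase letters; it needs adapting when --isoform_prefix or --use_numbers_for_isoform is used (no clear approach yet, good luck if you need to change this)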
963970
prot = re.sub(r'-R([A-Z]+)', r'-P\1', prot)
964971
print(prot, file=prot_out)
965972
prot_out.close()
@@ -977,8 +984,10 @@ class OgsMerger():
977984
parser.add_argument("-p", "--previous_gff", help="The gff from the previous annotation version (if different than <base_gff>)")
978985
parser.add_argument("-d", "--deleted", help="File containing a list of mRNAs to remove")
979986
parser.add_argument("-o", "--out_prefix", help="Prefix for output files (default=<ogs_name>_<today's date>)")
980-
parser.add_argument("--regex_rna", help="Regex matching mRNA ids, with a capturing group around the gene id without version suffix (default='([a-zA-Z0-9]+)([\\.0-9]+)?([-_]R[A-Z]+)?(,[a-zA-Z0-9\\.\\-_]*)?' )")
981-
parser.add_argument("--regex_rna_replace", help="Replacement string to create a gene id from regex_rna captured group, where {id} is the captured group (default='{id}' )")
987+
parser.add_argument("--exon_parent_regex", help="Regex matching exons' Parent ids, with a capturing group around the gene id radical (default='([a-zA-Z0-9]+)([\\.0-9]+)?([-_]R[A-Z]+)?(,[a-zA-Z0-9\\.\\-_]*)?' )")
988+
parser.add_argument("--exon_parent_to_gene", help="Replacement string to create a gene id from exon_parent_regex first captured group (aka gene id radical), where \1 is the captured group (default='\1' )")
989+
parser.add_argument("--isoform_prefix", help="Prefix for the isoform part of mRNA ids (default='-R')", default="-R")
990+
parser.add_argument('--use_numbers_for_isoform', help='By default, the script will name the isoforms of a gene with letters. If you use this flag, it will be numbers instead.', action="store_true" )
982991

983992
args = parser.parse_args()
984993

@@ -992,8 +1001,10 @@ class OgsMerger():
9921001

9931002
self.id_regex = args.id_regex
9941003
self.id_syntax = args.id_syntax
995-
self.regex_rna = args.regex_rna
996-
self.regex_rna_replace = args.regex_rna_replace
1004+
self.exon_parent_regex = args.exon_parent_regex
1005+
self.exon_parent_to_gene = args.exon_parent_to_gene
1006+
self.isoform_prefix = args.isoform_prefix
1007+
self.use_numbers_for_isoform = args.use_numbers_for_isoform
9971008

9981009
self.out_prefix = args.out_prefix
9991010
if not self.out_prefix:

0 commit comments

Comments
 (0)