Merge pull request #48 from cmbi/rebuttal

Rebuttal
cmbi · Apr 19, 2019 · d3196d2 · d3196d2
2 parents f7797f0 + 29bc725
commit d3196d2
Show file tree

Hide file tree

Showing 15 changed files with 392 additions and 107 deletions.
diff --git a/metadome/__init__.py b/metadome/__init__.py
@@ -1,7 +1,7 @@
 import logging
 from flask_debugtoolbar import DebugToolbarExtension
 
-_VERSION = '1.0.0 - alpha'
+_VERSION = '1.0.1'
 
 # for using the Flask debug toolbar throughout the application
 toolbar = DebugToolbarExtension()

diff --git a/metadome/default_settings.py b/metadome/default_settings.py
@@ -48,6 +48,7 @@
 GENCODE_HG_ANNOTATION_FILE_GFF3 = DATA_DIR+"Gencode/gencode.v19.annotation.gff3"
 GENCODE_HG_TRANSCRIPTION_FILE = DATA_DIR+"Gencode/gencode.v19.pc_transcripts.fa"
 GENCODE_HG_TRANSLATION_FILE = DATA_DIR+"Gencode/gencode.v19.pc_translations.fa"
+GENCODE_REFSEQ_FILE = DATA_DIR+"Gencode/gencode.v19.metadata.RefSeq"
 GENCODE_SWISSPROT_FILE = DATA_DIR+"Gencode/gencode.v19.metadata.SwissProt"
 GENCODE_BASIC_FILE = DATA_DIR+"Gencode/ucsc.gencode.v19.wgEncodeGencodeBasic.txt"
 

diff --git a/metadome/domain/repositories.py b/metadome/domain/repositories.py
@@ -27,6 +27,27 @@ class MalformedAARegionException(Exception):
 
 class GeneRepository:
 
+    @staticmethod
+    def retrieve_gene_names_for_multiple_transcript_ids(_transcript_ids):
+        """Retrieves the gene name for each of the given gencode transcript ids.
+
+        Returns a dict {gencode_transcription_id: gene_name} built from the Gene
+        rows whose gencode_transcription_id occurs in _transcript_ids."""
+        _session = db.create_scoped_session()
+        try:
+            matching_genes = _session.query(Gene).filter(
+                Gene.gencode_transcription_id.in_(_transcript_ids)).all()
+            return {found.gencode_transcription_id: found.gene_name for found in matching_genes}
+        except (AlchemyResourceClosedError, AlchemyOperationalError, PsycopOperationalError) as e:
+            # Surface these database errors to the caller as a RecoverableError
+            raise RecoverableError(str(e))
+        except:
+            _log.error(traceback.format_exc())
+            raise
+        finally:
+            # Remove the session on every path, also when the query failed
+            _session.remove()
+
     @staticmethod
     def retrieve_transcript_id_for_multiple_gene_ids(_gene_ids):
         """Retrieves all gencode transcripts for multiple Gene objects as {gene_id: gencode_transcription_id}"""

diff --git a/metadome/domain/wrappers/gencode.py b/metadome/domain/wrappers/gencode.py
@@ -1,7 +1,7 @@
 import logging
 from metadome.default_settings import GENCODE_HG_TRANSLATION_FILE,\
     GENCODE_SWISSPROT_FILE, GENCODE_HG_TRANSCRIPTION_FILE,\
-    GENCODE_HG_ANNOTATION_FILE_GFF3, GENCODE_BASIC_FILE
+    GENCODE_HG_ANNOTATION_FILE_GFF3, GENCODE_BASIC_FILE, GENCODE_REFSEQ_FILE
 from metadome.domain.parsers import gff3
 from Bio.Seq import translate
 import urllib
@@ -383,4 +383,36 @@ def retrieve_all_protein_coding_gene_names():
                 # add the gene name to the set
                 gene_names.add(tokens[5])
 
-    return list(gene_names)
+    return list(gene_names)
+
+def retrieve_refseq_identifiers_for_transcript(gencode_id):
+    """Retrieves the RefSeq identifiers for a Gencode transcript
+
+    Scans the tab-separated GENCODE_REFSEQ_FILE for lines whose first column
+    equals gencode_id exactly and groups the remaining columns by their prefix.
+
+    Returns a dict with the keys 'NP', 'NM' and 'NR', each mapping to the list
+    of matching identifiers (the lists stay empty when nothing was found)."""
+    refseq_prefixes = ('NP', 'NM', 'NR')
+    result = {prefix: [] for prefix in refseq_prefixes}
+    with open(GENCODE_REFSEQ_FILE) as gencode_refseq:
+        # iterate the file lazily instead of loading every line into memory
+        for line in gencode_refseq:
+            tokens = line.split('\t')
+            # only an exact match on the first column counts; a substring hit
+            # elsewhere on the line is not a match for this transcript
+            if gencode_id != tokens[0]:
+                continue
+
+            for token in tokens[1:]:
+                token = token.strip()
+                if len(token) == 0:
+                    # empty columns carry no identifier
+                    continue
+                for prefix in refseq_prefixes:
+                    if token.startswith(prefix):
+                        result[prefix].append(token)
+                        break
+                else:
+                    _log.warning("When retrieving matching RefSeq ids for "+gencode_id+" unexpected token: "+token)
+    return result
diff --git a/metadome/presentation/api/routes.py b/metadome/presentation/api/routes.py
@@ -14,6 +14,7 @@
 from metadome.controllers.job import (create_visualization_job_if_needed,
                                       get_visualization_status,
                                       retrieve_visualization)
+from metadome.domain.wrappers.gencode import retrieve_refseq_identifiers_for_transcript
 
 
 _log = logging.getLogger(__name__)
@@ -30,7 +31,7 @@ def get_transcript_ids_for_gene(gene_name):
     _log.debug('get_transcript_ids_for_gene')
     # retrieve the transcript ids for this gene
     trancripts = GeneRepository.retrieve_all_transcript_ids(gene_name)
-
+    
     # check if there was any return value
     if len(trancripts) > 0:
         message = "Retrieved transcripts for gene '"+trancripts[0].gene_name+"'"
@@ -39,9 +40,14 @@ def get_transcript_ids_for_gene(gene_name):
 
     transcript_results = []
     for t in trancripts:
+        # retrieve matching refseq identifiers for this transcript 
+        refseq_ids = retrieve_refseq_identifiers_for_transcript(t.gencode_transcription_id)
+        refseq_nm_numbers = ", ".join(nm_number for nm_number in refseq_ids['NM'])
+
         transcript_entry = {}
         transcript_entry['aa_length'] = t.sequence_length
         transcript_entry['gencode_id'] = t.gencode_transcription_id
+        transcript_entry['refseq_nm_numbers'] = refseq_nm_numbers
         transcript_entry['has_protein_data'] = not t.protein_id is None
         transcript_results.append(transcript_entry)
 

diff --git a/metadome/presentation/web/routes.py b/metadome/presentation/web/routes.py
@@ -61,9 +61,9 @@ def about():
 def method():
     return render_template('method.html')
 
-@bp.route('/help', methods=['GET'])
+@bp.route('/faq', methods=['GET'])
 def help_page():
-    return render_template('help.html')
+    return render_template('faq.html')
 
 @bp.route('/visualization_error/<transcript_id>/', methods=['GET'])
 def visualization_error(transcript_id):