Skip to content

Commit

Permalink
Commit code files
Browse files Browse the repository at this point in the history
  • Loading branch information
Damianos Melidis committed Nov 19, 2021
1 parent 005b80f commit a39d667
Show file tree
Hide file tree
Showing 74 changed files with 44,077 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
genotoscope/.idea/
genotoscope/venv/
genotoscope/__pycache__/
genotoscope/*.ini
genotoscope/*.yaml
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#GenOtoScope

GenOtoScope is a bioinformatics tool to classify the pathogenicity of variants associated with hearing loss, based on ACMG/AMP guidelines.
The official GenOtoScope installation is found at TBA, hosted by [Human Genetics department of Medical School of Hannover](https://www.mhh.de/en/human-genetics).
This work was developed [L3S Research Center](https://www.l3s.de/en), institue of [Leibniz University Hannover](https://www.uni-hannover.de/en/).


##Documentation

User documentation can be found on the [wiki]().

Submit your bug reports [here]().

##Copyright
GenOtoScope is copyright (c) 2021-2022 by Medical School of Hannover and L3S Research Center.

GenOtoScope is licensed under the [GNU Affero General Public License](http://www.gnu.org/licenses/agpl-3.0.html). Please contact the authors to discuss custom licensing.
Empty file added __init__.py
Empty file.
284 changes: 284 additions & 0 deletions beds/ExtractPhenoTranscripts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,284 @@
from genotoscope.utils import create_dir
from os.path import join
from pandas import read_excel

from pyensembl import EnsemblRelease
from pybedtools import BedTool


class ExtractPhenoTranscripts:
"""
Read hearing loss relevant transcripts and clinical significant exons from the followings two papers:
1. DiStefano, Marina T., et al. "Curating clinically relevant transcripts for the interpretation of sequence variants."
The Journal of Molecular Diagnostics 20.6 (2018): 789-801.
DOI: 10.1016/j.jmoldx.2018.06.005.
2. DiStefano, Marina T., et al. "ClinGen expert clinical validity curation of 164 hearing loss gene–disease pairs."
Genetics in Medicine 21.10 (2019): 2239-2247.
DOI: doi: 10.1038/s41436-019-0487-0.
"""

def __init__(self, data_path, output_root, output_dir):
"""
Parameters
----------
data_path : str
hearing loss specific info data path
output_root : str
output root, where the produced bed files will be saved
output_dir : str
output directory where the bed file will be placed
Returns
-------
None
"""
# input and output directories
self.data_path = data_path
self.output_dir = join(output_root, output_dir)
create_dir(self.output_dir)

### PyEnsembl ###
# release 75 uses human reference genome GRCh37
self.ensembl_data = EnsemblRelease(75)

def get_gene_chr(self, gene_name):
"""
Get chromosome number from gene name
Parameters
----------
gene_name: str
gene name
Returns
-------
str
chromosome at which gene lies in
"""
known_chromosomes = [str(i) for i in range(1, 23)]
known_chromosomes += ['X', 'Y', 'MT']
if len(gene_name.strip().split(" ")) > 1:
[gene, gene_alternative] = gene_name.strip().split(" ")
else:
gene = gene_name.strip()
gene_alternative = None
try:
# print("try to get locus for gene name: {}".format(gene))
gene_locus = self.ensembl_data.loci_of_gene_names(gene)
except ValueError:
if gene_alternative: # if gene alternative name exists
try:
gene_alternative = gene_alternative.replace("(", "")
gene_alternative = gene_alternative.replace(")", "")
# print("gene locus could not be found, try with alternative name: {}".format(gene_alternative))
gene_locus = self.ensembl_data.loci_of_gene_names(gene_alternative)
except ValueError:
print("Could not find locus for input gene name (first and alternative): {}".format(gene_name))
gene_locus = None
else:
# alternative name does not exist, and gene locus was not found for the first gene name
print("Could not find locus for input gene name: {}".format(gene_name))
gene_locus = None

# convert gene locus to chromosome number
if gene_locus:
first_locus = gene_locus[0]
chr = first_locus.to_dict()['contig']
print("Gene is at chromosome: {}".format(chr))
assert chr in known_chromosomes, "AssertionError: retrieved chromosome number should be one of [1:22] + X,Y"
return chr
else:
return None

def write_bed(self, header_bed, bed_rows, bed_filename):
"""
Write bed file
Parameters
----------
header_bed : str
header for bed file
bed_rows : list of str
bed rows
bed_filename : str
bed file name
Returns
-------
None
"""
print("Writing bed file, containing {} rows".format(len(bed_rows)))
with open(join(self.output_dir, bed_filename), 'w') as bed_handle:
bed_handle.write(header_bed + "\n")
for bed_row in bed_rows:
bed_handle.write(bed_row + "\n")

@staticmethod
def parse_exons_index(exons_column):
"""
Parse phenotype relevant exon indices
Parameters
----------
exons_column : str
clinical significant exon indices column
Returns
-------
dict of str : list of int
dictionary of clinical significant exons (to be included or excluded)
"""
print("Parse exons column: {}".format(exons_column))
clinical_exons = {"include": [], "exclude": []}
if exons_column[0].isdigit():
### ### ###
# exons to include
### ### ###
if "-" in exons_column:
# at least two ranges of exon indices to be included
for exons_range in exons_column.split(","):
if "-" in exons_range:
exon_start, exon_end = exons_range.strip().split("-")
clinical_exons["include"] = clinical_exons["include"] + [exon_idx for exon_idx in
range(int(exon_start) - 1,
int(exon_end))]
else:
clinical_exons["include"] = clinical_exons["include"] + [int(exons_range.strip()) - 1]
else:
# only one exon to include
clinical_exons["include"] = clinical_exons["include"] + [int(exons_column) - 1]
elif exons_column[0] == "-":
if "(" in exons_column and ")" in exons_column:
# at least two ranges of exon indices to be excluded
exons_column = exons_column[2:-1]
for exons_range in exons_column.split(","):
if "-" in exons_range:
exon_start, exon_end = exons_range.strip().split("-")
clinical_exons["exclude"] = clinical_exons["exclude"] + [exon_idx for exon_idx in
range(int(exon_start) - 1,
int(exon_end))]
else:
clinical_exons["exclude"] = clinical_exons["exclude"] + [int(exons_range.strip()) - 1]
else:
# only one exon to be excluded
clinical_exons["exclude"] = clinical_exons["exclude"] + [-1 * (int(exons_column) + 1)]
else:
# all exons to be included
# equivalent exclude a non existent exon index
clinical_exons["exclude"] = [-1]
print("Clinical exons to process: {}".format(clinical_exons))
### ### ###
# assert that exons are only included or only excluded
### ### ###
try:
assert bool(len(clinical_exons["include"]) > 0) ^ bool(len(clinical_exons["exclude"]) > 0)
except AssertionError:
print(
"AssertionError: Please make clinical_exons columns to described only inclusion of exons or only exclusion of them")

return clinical_exons

def extract_pheno_relevant_exons(self, pheno_relevant_transcripts_file):
"""
Create phenotype relevant exons using transcript and gene information
Parameters
----------
pheno_relevant_transcripts_file : str
full path to phenotpy relevant transcripts list
Returns
-------
list of str
phenotype relevant exons, one per list element
"""
print("Extract information for phenotype relevant exons")
pheno_transcripts_df = read_excel(join(self.data_path, pheno_relevant_transcripts_file), header=0, index_col=0)
# for each gene get the chromosome number
# and then save all exons of the transcripts to an list
pheno_relevant_exons = []
genes_transcripts_df = pheno_transcripts_df.iloc[:, 0:2]
gene_transcripts_exons_df = pheno_transcripts_df.iloc[:, 0:4]
extract_stats = {"uniq_genes": set(), "uniq_trans": set(), "uniq_exons": set(), "grch38_only": set(),
"no_distinct_exons": set(), "already_added": set()}

for gene_name, row in gene_transcripts_exons_df.iterrows():
if row["comment"] == "only_grch38":
# exclude current transcript, as is found only on GRCh38
extract_stats["grch38_only"].add(row["ensembl GRCh37.p13"].strip())
elif row["comment"] == "transcript_already_included":
# exclude current transcript, as it is already added
extract_stats["already_added"].add(row["ensembl GRCh37.p13"].strip())
elif row["clinical_exons"] == "no_distinct_coding_exons":
# exclude current transcript, as it does not contain distinct exons
extract_stats["no_distinct_exons"].add(row["ensembl GRCh37.p13"].strip())
else:
extract_stats["uniq_genes"].add(gene_name)
# get transcript and chromosome info
if len(row["ensembl GRCh37.p13"].split(".")[0]) > 1:
transcript_id = row["ensembl GRCh37.p13"].split(".")[0]
else:
transcript_id = row["ensembl GRCh37.p13"]
extract_stats["uniq_trans"].add(transcript_id)
transcript_chrom = self.get_gene_chr(gene_name)
transcript = self.ensembl_data.transcript_by_id(transcript_id)

# get clinical exons
clinical_exons = ExtractPhenoTranscripts.parse_exons_index(str(row["clinical_exons"]))
# understand if you need to only include exons or only exlude
if len(clinical_exons["include"]) > 0:
include_or_exclude = 0
elif len(clinical_exons["exclude"]) > 0:
include_or_exclude = 1
for exon_idx, exon in enumerate(transcript.exons):
if include_or_exclude == 0:
if exon_idx in clinical_exons["include"]:
extract_stats["uniq_exons"].add(exon.id)
pheno_relevant_exons.append(
"\t".join(
["chr" + transcript_chrom, str(exon.to_dict()["start"]), str(exon.to_dict()["end"]),
exon.id, transcript_id, exon.to_dict()["strand"]]))
else:
if exon_idx not in clinical_exons["exclude"]:
extract_stats["uniq_exons"].add(exon.id)
pheno_relevant_exons.append(
"\t".join(
["chr" + transcript_chrom, str(exon.to_dict()["start"]), str(exon.to_dict()["end"]),
exon.id, transcript_id, exon.to_dict()["strand"]]))
print("--")
print("### ### ###")
print("Extraction statistics:")
for k, v in extract_stats.items():
print(k + " contains: {} elements, which are:".format(len(v)))
print(v)
print("### ### ###")
return pheno_relevant_exons

def run(self, pheno_relevant_transcripts_file, header_bed, out_filename):
"""
Extract phenotype relevant transcripts and write their clinical significant exons on bed file
Parameters
----------
pheno_relevant_transcripts_file : str
phenotype relevant transcripts xslx file
header_bed : str
bed header for extracted phenotype relevant transcripts
out_filename: str
bed annotation filename
Returns
-------
None
"""
print("Extract phenotype relevant transcripts and write their clinical significant exons on bed file")
# extract phenotype relevant transcripts and their clinical significant exons
pheno_relevant_exons = self.extract_pheno_relevant_exons(pheno_relevant_transcripts_file)
# write them on a bed file
self.write_bed(header_bed, pheno_relevant_exons, out_filename)
# sort bed file
print("Sort resulted bed and save sorted version")
pheno_relevant_exons = BedTool(join(self.output_dir, out_filename)).sort()
pheno_relevant_exons.saveas(join(self.output_dir, out_filename.split(".bed")[0] + "_sorted" + ".bed"))
Loading

0 comments on commit a39d667

Please sign in to comment.