Changes from all commits
76 commits
17765ef  issue #170 (ifiddes, Apr 14, 2020)
eee10a4  Cupcake script; assorted bugs (ifiddes, Apr 28, 2020)
4958b75  Allow for users to specify singularity image via environmental variab… (ifiddes, Apr 28, 2020)
0e64724  Update __init__.py (ifiddes, Apr 28, 2020)
27d6dde  allow type (ifiddes, Apr 29, 2020)
fc47144  update reserved keys (ifiddes, Apr 29, 2020)
4991350  I see the problem (ifiddes, Apr 29, 2020)
859a323  gff parsing bugs (ifiddes, May 10, 2020)
2914597  more clear error (ifiddes, May 10, 2020)
1c4c060  update script (ifiddes, May 10, 2020)
25f521c  oops (ifiddes, May 10, 2020)
c0a8500  oops (ifiddes, May 10, 2020)
89071a1  fixing install and permissions (ifiddes, May 10, 2020)
e3c4677  bad data; chromosome name script (ifiddes, May 11, 2020)
b09781a  oops (ifiddes, May 11, 2020)
6d9fa5a  oops (ifiddes, May 11, 2020)
5780085  oops again (ifiddes, May 11, 2020)
f9c42ae  oops again (ifiddes, May 11, 2020)
815d829  oops again (ifiddes, May 11, 2020)
313e9af  oops again (ifiddes, May 11, 2020)
b4bcdc8  update validate gff3 to python3 (ifiddes, May 11, 2020)
264164f  somehow this got out of date (ifiddes, May 11, 2020)
6ab1a60  somehow this got out of date (ifiddes, May 11, 2020)
17ac972  ugh (ifiddes, May 11, 2020)
af2dd55  ugh (ifiddes, May 11, 2020)
88be608  ugh (ifiddes, May 11, 2020)
9e3abd8  ugh (ifiddes, May 11, 2020)
3fa1eee  ugh (ifiddes, May 11, 2020)
f65c076  subtle (ifiddes, May 12, 2020)
3dec4a7  allow minimal (ifiddes, May 12, 2020)
542c38b  nevermind (ifiddes, May 12, 2020)
7d2a3de  hacks (ifiddes, May 12, 2020)
18ff681  convert (ifiddes, May 12, 2020)
7bd8d7a  convert (ifiddes, May 12, 2020)
8c1e4cd  more features (ifiddes, May 12, 2020)
2be9410  fix convert (ifiddes, May 12, 2020)
7c8b04e  fix convert (ifiddes, May 12, 2020)
cb9537b  add script for HAL conversion (ifiddes, May 12, 2020)
ea3bbca  oops (ifiddes, May 12, 2020)
e11a1f3  hal gets mad (ifiddes, May 12, 2020)
228219f  oops (ifiddes, May 12, 2020)
26fa1c4  fix biotypes (ifiddes, May 12, 2020)
56692be  fix gene names (ifiddes, May 13, 2020)
162cdbe  ugh (ifiddes, May 13, 2020)
77b6754  bug in alignments (ifiddes, May 13, 2020)
a13a80b  oops (ifiddes, May 13, 2020)
8852f75  this aligner can walk off the end (ifiddes, May 13, 2020)
46a28a7  more bugs (ifiddes, May 13, 2020)
03adbd2  more bugs (ifiddes, May 13, 2020)
b0f6ad9  try to improve logging (ifiddes, May 14, 2020)
cb3bb14  oops (ifiddes, May 14, 2020)
4375845  summarize script (ifiddes, May 14, 2020)
632ab21  oops (May 14, 2020)
1886ba6  docker should understand tmpdir (ifiddes, May 15, 2020)
81bff24  Merge pull request #176 from ComparativeGenomicsToolkit/rename_cactus (ifiddes, May 17, 2020)
11ed286  Merge pull request #177 from ComparativeGenomicsToolkit/fix_gff3 (ifiddes, May 17, 2020)
279070d  docs (ifiddes, May 17, 2020)
b480c02  Merge pull request #174 from ComparativeGenomicsToolkit/issue_172 (ifiddes, May 17, 2020)
5c014b9  infer parent (ifiddes, May 20, 2020)
c0e9f54  new biotype (ifiddes, May 20, 2020)
f73c171  bug in conversion (ifiddes, May 20, 2020)
4c2fe34  cgp wont work (ifiddes, May 24, 2020)
20ade5a  empty tags (ifiddes, May 24, 2020)
13a588a  Merge pull request #178 from ComparativeGenomicsToolkit/fix_exref (ifiddes, May 27, 2020)
91baf08  Merge pull request #179 from ComparativeGenomicsToolkit/gff3 (ifiddes, May 27, 2020)
a771d00  exonerate in docker (ifiddes, Jun 4, 2020)
c6dba03  Merge pull request #182 from ComparativeGenomicsToolkit/exonerate (ifiddes, Jun 4, 2020)
6396bef  Update consensus.py (ifiddes, Jun 11, 2020)
81856ac  oops (ifiddes, Jun 11, 2020)
8d2540f  python 3.6 is fine (ifiddes, Jun 11, 2020)
6e11f82  bug in indel finding (ifiddes, Jun 12, 2020)
b204194  oops (ifiddes, Jun 19, 2020)
79b9b81  refactor parental gene assignment (ifiddes, Jun 21, 2020)
1265673  Merge branch 'master' into refactor_parent_gene (ifiddes, Jun 23, 2020)
3df1f58  fix target genomes (ifiddes, Jun 25, 2020)
e335d75  try again (ifiddes, Jun 25, 2020)
Dockerfile (6 changes: 3 additions & 3 deletions)
@@ -9,7 +9,7 @@ RUN git clone git://github.com/samtools/htslib.git
RUN cd htslib && make install

# bcftools
RUN git clone git://github.com/samtools/bcftools.git
RUN git clone git://github.com/samtools/bcftools.git
RUN cd bcftools && make

# samtools
@@ -33,7 +33,7 @@ RUN cd augustus && make
# HDF5
RUN wget -q http://www.hdfgroup.org/ftp/HDF5/releases/hdf5-1.10/hdf5-1.10.1/src/hdf5-1.10.1.tar.gz
RUN tar xzf hdf5-1.10.1.tar.gz
RUN cd hdf5-1.10.1 && ./configure --enable-cxx --prefix=/usr
RUN cd hdf5-1.10.1 && ./configure --enable-cxx --prefix=/usr
RUN cd hdf5-1.10.1 && make && make install

# sonLib
@@ -62,7 +62,7 @@ RUN tar xvjf sambamba_v0.6.7_linux.tar.bz2

FROM ubuntu:18.04
RUN apt-get update
RUN apt-get install -y wget bedtools bamtools samtools sqlite3 libgsl0-dev libcolamd2 software-properties-common libcurl4-openssl-dev
RUN apt-get install -y wget bedtools bamtools samtools sqlite3 libgsl0-dev libcolamd2 software-properties-common libcurl4-openssl-dev exonerate
RUN add-apt-repository -y ppa:deadsnakes/ppa
RUN apt-get install -y python3.7 python3-pip
# Kent
README.md (63 changes: 38 additions & 25 deletions)

Large diffs are not rendered by default.

cat/__init__.py (40 changes: 18 additions & 22 deletions)
@@ -53,7 +53,6 @@
from .exceptions import *

logger = logging.getLogger('cat')
logger.setLevel('INFO')


###
@@ -185,7 +184,7 @@ def get_pipeline_args(self):
args.set('global_near_best', self.global_near_best, True)
args.set('filter_overlapping_genes', self.filter_overlapping_genes, True)
args.set('overlapping_gene_distance', self.overlapping_gene_distance, True)

# user specified flags for consensus finding
args.set('intron_rnaseq_support', self.intron_rnaseq_support, False)
args.set('exon_rnaseq_support', self.exon_rnaseq_support, False)
@@ -398,6 +397,8 @@ def load_docker(self):
'Either install it or use a different option for --binary-mode.')
os.environ['SINGULARITY_PULLFOLDER'] = os.path.abspath(self.work_dir)
os.environ['SINGULARITY_CACHEDIR'] = os.path.abspath(self.work_dir)
if os.environ.get('SINGULARITY_IMAGE'):
return
tools.fileOps.ensure_dir(self.work_dir)
if not os.path.isfile(os.path.join(self.work_dir, 'cat.img')):
subprocess.check_call(['singularity', 'pull', '--name', 'cat.img',
@@ -565,7 +566,7 @@ def success(task):
os.remove(os.path.abspath(task.job_store))
except OSError:
pass
else:
else:
cmd = ['toil', 'stats', '--raw', os.path.abspath(task.job_store)]
raw = tools.procOps.call_proc(cmd)
parsed = raw[raw.index('{'):raw.rfind('}') + 1]
@@ -1620,14 +1621,13 @@ def get_args(pipeline_args, mode):
args = tools.misc.HashableNamespace()
if mode == 'augPB':
args.tablename = tools.sqlInterface.AugPbAlternativeGenes.__tablename__
pb_genomes = set(pipeline_args.target_genomes) & (pipeline_args.isoseq_genomes - {pipeline_args.ref_genome})
args.gps = {genome: AugustusPb.get_args(pipeline_args, genome).augustus_pb_gp
for genome in pipeline_args.isoseq_genomes}
for genome in pb_genomes}
args.filtered_tm_gps = {genome: TransMap.get_args(pipeline_args, genome).filtered_tm_gp
for genome in pipeline_args.isoseq_genomes - {pipeline_args.ref_genome}}
for genome in pb_genomes}
args.unfiltered_tm_gps = {genome: TransMap.get_args(pipeline_args, genome).tm_gp
for genome in pipeline_args.isoseq_genomes - {pipeline_args.ref_genome}}
args.chrom_sizes = {genome: GenomeFiles.get_args(pipeline_args, genome).sizes
for genome in pipeline_args.isoseq_genomes}
for genome in pb_genomes}
# add the reference annotation as a pseudo-transMap to assign parents in reference
args.filtered_tm_gps[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp
args.unfiltered_tm_gps[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp
@@ -1643,23 +1643,20 @@ def get_args(pipeline_args, mode):
unfiltered_tm_gp_files[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp
args.filtered_tm_gps = filtered_tm_gp_files
args.unfiltered_tm_gps = unfiltered_tm_gp_files
args.chrom_sizes = {genome: GenomeFiles.get_args(pipeline_args, genome).sizes
for genome in list(pipeline_args.target_genomes) + [pipeline_args.ref_genome]}
elif mode == 'exRef':
exref_genomes = set(pipeline_args.target_genomes) & set(pipeline_args.external_ref_genomes)
args.tablename = tools.sqlInterface.ExRefAlternativeGenes.__tablename__
args.gps = {genome: ExternalReferenceFiles.get_args(pipeline_args, genome).annotation_gp
for genome in pipeline_args.external_ref_genomes}
for genome in exref_genomes}
filtered_tm_gp_files = {genome: TransMap.get_args(pipeline_args, genome).filtered_tm_gp
for genome in pipeline_args.external_ref_genomes}
for genome in exref_genomes}
unfiltered_tm_gp_files = {genome: TransMap.get_args(pipeline_args, genome).tm_gp
for genome in pipeline_args.external_ref_genomes}
for genome in exref_genomes}
# add the reference annotation as a pseudo-transMap to assign parents in reference
filtered_tm_gp_files[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp
unfiltered_tm_gp_files[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp
args.filtered_tm_gps = filtered_tm_gp_files
args.unfiltered_tm_gps = unfiltered_tm_gp_files
args.chrom_sizes = {genome: GenomeFiles.get_args(pipeline_args, genome).sizes
for genome in list(pipeline_args.target_genomes) + [pipeline_args.ref_genome]}
else:
raise Exception('Invalid mode passed to FindDenovoParents')
return args
@@ -1696,8 +1693,7 @@ def run(self):
table_target = self.get_table_targets(genome, denovo_args.tablename, pipeline_args)
filtered_tm_gp = denovo_args.filtered_tm_gps[genome]
unfiltered_tm_gp = denovo_args.unfiltered_tm_gps[genome]
chrom_sizes = denovo_args.chrom_sizes[genome]
df = assign_parents(filtered_tm_gp, unfiltered_tm_gp, chrom_sizes, denovo_gp)
df = assign_parents(filtered_tm_gp, unfiltered_tm_gp, denovo_gp)
db = pipeline_args.dbs[genome]
with tools.sqlite.ExclusiveSqlConnection(db) as engine:
df.to_sql(denovo_args.tablename, engine, if_exists='replace')
@@ -2298,7 +2294,7 @@ def output(self):
def run(self):
pipeline_args = self.get_pipeline_args()
luigi_stats = tools.sqlInterface.load_luigi_stats(pipeline_args.stats_db, 'stats')

try:
toil_stats = tools.sqlInterface.load_luigi_stats(pipeline_args.stats_db, 'toil_stats')
except ValueError:
@@ -3184,7 +3180,7 @@ def construct_consensus_gp_as(has_rna, has_pb):
'''


snake_composite = '''track hubCentral
snake_composite = '''track cactus
compositeTrack on
shortLabel Cactus
longLabel Cactus Alignment Tracks
@@ -3199,11 +3195,11 @@ def construct_consensus_gp_as(has_rna, has_pb):
visibility full
type bigBed 3

track hubCentralAlignments
track cactusAlignments
shortLabel Alignments
view Alignments
visibility full
subTrack hubCentral
subTrack cactus

'''

@@ -3212,7 +3208,7 @@ def construct_consensus_gp_as(has_rna, has_pb):
shortLabel {genome}
otherSpecies {genome}
visibility {visibility}
parent hubCentralAlignments off
parent cactusAlignments off
priority 3
bigDataUrl {hal_path}
type halSnake
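The get_args() changes in the cat/__init__.py diff above restrict de novo parent assignment to genomes that are actually in the target set. A minimal sketch of that set logic, with invented genome names (only the variable names mirror the pipeline args):

# Invented genome names for illustration; only the variable names mirror
# the pipeline_args attributes used in FindDenovoParents.get_args() above.
target_genomes = {"mouse", "rat"}
isoseq_genomes = {"mouse", "human", "rat"}
external_ref_genomes = {"rat", "human"}
ref_genome = "human"

# augPB mode: only target genomes with IsoSeq data, excluding the reference.
pb_genomes = set(target_genomes) & (isoseq_genomes - {ref_genome})
assert pb_genomes == {"mouse", "rat"}

# exRef mode: only target genomes that also carry an external reference annotation.
exref_genomes = set(target_genomes) & set(external_ref_genomes)
assert exref_genomes == {"rat"}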
cat/chaining.py (5 changes: 4 additions & 1 deletion)
@@ -1,5 +1,8 @@
""""
Toil program to generate UCSC chains and nets between two genomes in a HAL file.
Toil program to generate UCSC chains and nets between one reference genome and N target genomes in a HAL file.

The reference genome is split up by contig/chromosome, and those are chained against all the things they align to
in each genome specified.
"""
import argparse
import collections
cat/classify.py (42 changes: 23 additions & 19 deletions)
@@ -42,8 +42,6 @@
2) mRNA

"""
import itertools

import pandas as pd

import tools.bio
@@ -244,29 +242,28 @@ def find_indels(tx, psl, aln_mode):
"""
def convert_coordinates_to_chromosome(left_pos, right_pos, coordinate_fn, strand):
"""convert alignment coordinates to target chromosome coordinates, inverting if negative strand"""
left_chrom_pos = coordinate_fn(left_pos)
assert left_chrom_pos is not None
right_chrom_pos = coordinate_fn(right_pos)
if right_chrom_pos is None:
right_chrom_pos = coordinate_fn(right_pos - 1)
if strand == '-':
left_chrom_pos += 1
else:
left_chrom_pos -= 1
assert right_chrom_pos is not None
if strand == '-':
left_chrom_pos, right_chrom_pos = right_chrom_pos, left_chrom_pos
assert right_chrom_pos >= left_chrom_pos
# this is a deletion, and we don't need to adjust
if left_pos == right_pos:
left_chrom_pos = right_chrom_pos = coordinate_fn(left_pos)
# for + strand insertion, just translate
elif strand == '+':
right_chrom_pos = coordinate_fn(right_pos)
left_chrom_pos = coordinate_fn(left_pos)
# for - strand insertion, translate but move outwards
else:
right_chrom_pos = coordinate_fn(left_pos) + 1
left_chrom_pos = coordinate_fn(right_pos - 1)
return left_chrom_pos, right_chrom_pos

def parse_indel(left_pos, right_pos, coordinate_fn, tx, offset, gap_type):
"""Converts either an insertion or a deletion into a output transcript"""
left_chrom_pos, right_chrom_pos = convert_coordinates_to_chromosome(left_pos, right_pos, coordinate_fn,
tx.strand)
if left_chrom_pos is None or right_chrom_pos is None:
assert aln_mode == 'CDS'
return None

assert right_chrom_pos >= left_chrom_pos

if left_chrom_pos > tx.thick_start and right_chrom_pos < tx.thick_stop:
indel_type = 'CodingMult3' if offset % 3 == 0 else 'Coding'
else:
@@ -295,13 +292,20 @@ def parse_indel(left_pos, right_pos, coordinate_fn, tx, offset, gap_type):
t_offset = t_start - block_size - t_pos
assert (q_offset >= 0 and t_offset >= 0)
if q_offset != 0: # query insertion -> insertion in target sequence
left_pos = q_start - q_offset
right_pos = q_start
if tx.strand == '+':
left_pos = q_start - q_offset
right_pos = q_start
else:
left_pos = psl.q_size - q_start - 1
right_pos = psl.q_size - q_start + q_offset
row = parse_indel(left_pos, right_pos, coordinate_fn, tx, q_offset, 'Insertion')
if row is not None:
r.append(row)
if t_offset != 0: # target insertion -> insertion in reference sequence
left_pos = right_pos = q_start
if tx.strand == '+':
left_pos = right_pos = q_start
else:
left_pos = right_pos = psl.q_size - q_start
row = parse_indel(left_pos, right_pos, coordinate_fn, tx, t_offset, 'Deletion')
if row is not None:
r.append(row)
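A toy check of the strand-aware conversion introduced in find_indels() above, using a made-up linear coordinate function in place of the PSL-backed one (all numbers are illustrative):

# Made-up mapping: transcript coordinate i lands at chromosome coordinate
# 1000 - i, as if the transcript aligned to the minus strand of the target.
coordinate_fn = lambda i: 1000 - i

# A 2 bp insertion spanning transcript coordinates [10, 12) on a '-' strand
# transcript: per the hunk above, translate and move outwards so that the
# chromosome interval still satisfies left <= right.
left_pos, right_pos = 10, 12
left_chrom_pos = coordinate_fn(right_pos - 1)   # 989
right_chrom_pos = coordinate_fn(left_pos) + 1   # 991
assert left_chrom_pos <= right_chrom_pos

# A deletion (left_pos == right_pos) needs no adjustment: both ends map to
# the same chromosome base, e.g. coordinate_fn(10) == 990.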
cat/consensus.py (25 changes: 15 additions & 10 deletions)
@@ -648,9 +648,11 @@ def add_exref_ids(s):
'source_gene_common_name': s.CommonName}

# bring in extra tags for exRef
if 'exRef' in denovo_tx_modes:
for key, val in tools.misc.parse_gff_attr_line(exref_annot.loc[aln_id].ExtraTags).items():
denovo_tx_dict[aln_id][key] = val
if tools.nameConversions.aln_id_is_exref(aln_id):
tags = exref_annot.loc[aln_id].ExtraTags
if len(tags) > 0:
for key, val in tools.misc.parse_gff_attr_line(tags).items():
denovo_tx_dict[aln_id][key] = val

# record some metrics
metrics['denovo'][tx_mode][s.TranscriptClass.replace('_', ' ').capitalize()] += 1
@@ -907,12 +909,15 @@ def convert_attrs(attrs, id_field):
else:
attrs['Name'] = attrs['gene_id']
# convert empty strings into nan
attrs = {x: y if y != '' else 'nan' for x, y in attrs.items()}
# don't include the support vectors in the string, they will be placed in their respective places
attrs_str = ['='.join([key, str(val).replace('=', '_')]) for key, val in sorted(attrs.items()) if 'support' not in key]
# explicitly escape any semicolons that may exist in the input strings
attrs_str = [x.replace(';', '%3B') for x in attrs_str]
return score, ';'.join(attrs_str)
attrs_str = []
for key, val in attrs.items():
val = str(val)
if len(val) == 0:
val = 'nan'
val = val.replace('=', '%3D').replace(';', '%3B')
key = key.replace('=', '%3D').replace(';', '%3B')
attrs_str.append(f"{key}={val}")
return score, ";".join(attrs_str)

def find_feature_support(attrs, feature, i):
"""Extracts the boolean value from the comma delimited string"""
@@ -929,7 +934,7 @@ def find_all_tx_modes(attrs_list):
for attrs in attrs_list:
tx_modes.update(attrs['transcript_modes'].split(','))
return ','.join(tx_modes)

intervals = set()
for tx in tx_objs:
intervals.update(tx.exon_intervals)
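For reference, a short sketch of what the rewritten attribute serialization in convert_attrs() above produces; the attribute keys and values here are invented:

# Invented GFF3 attributes: empty values become 'nan', and '=' / ';' are
# percent-encoded in both keys and values, mirroring the hunk above.
attrs = {"gene_id": "G1", "note": "alt=splice;low support", "tag": ""}

attrs_str = []
for key, val in attrs.items():
    val = str(val)
    if len(val) == 0:
        val = "nan"
    val = val.replace("=", "%3D").replace(";", "%3B")
    key = key.replace("=", "%3D").replace(";", "%3B")
    attrs_str.append(f"{key}={val}")

print(";".join(attrs_str))
# gene_id=G1;note=alt%3Dsplice%3Blow support;tag=nan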
cat/hints_db.py (2 changes: 1 addition & 1 deletion)
@@ -307,7 +307,7 @@ def run_protein_aln(job, protein_subset, genome_fasta_file_id):
# perform alignment
tmp_exonerate = tools.fileOps.get_tmp_toil_file()
cmd = ['exonerate', '--model', 'protein2genome', '--showvulgar', 'no', '--showalignment', 'no',
'--showquerygff', 'yes', genome_fasta, protein_fasta]
'--showquerygff', 'yes', protein_fasta, genome_fasta]
tools.procOps.run_proc(cmd, stdout=tmp_exonerate)
return job.fileStore.writeGlobalFile(tmp_exonerate)

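The one-line hints_db.py fix above swaps the positional arguments so the protein FASTA is the query and the genome FASTA is the target, which is the order exonerate's protein2genome model expects. A standalone sketch of the corrected invocation, with placeholder file names:

import subprocess

# Placeholder paths; mirrors the corrected order above (query protein FASTA
# before target genome FASTA).
cmd = ['exonerate', '--model', 'protein2genome', '--showvulgar', 'no',
       '--showalignment', 'no', '--showquerygff', 'yes',
       'proteins.fa', 'genome.fa']
with open('protein_hints.gff', 'w') as out:
    subprocess.check_call(cmd, stdout=out)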