Skip to content

Commit

Permalink
updated for Hcv1.av84
Browse files Browse the repository at this point in the history
  • Loading branch information
conchoecia committed May 2, 2020
1 parent a45634c commit 1e3f450
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 30 deletions.
49 changes: 24 additions & 25 deletions annotation/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ def attach_descriptions_to_pdf(fasta_input,

rule all:
input:
"tempfiles/ref.fa",
expand("final_output/{fix}.gff", fix = PREFIX),
expand("final_output/nucleotide_sequences/unphased/{fix}_transcripts.fasta",fix = PREFIX),
expand("final_output/vcf_files/{fix}_transcripts.vcf", fix = PREFIX),
Expand Down Expand Up @@ -121,7 +120,7 @@ rule gen_temporary_ref:
input:
ref = REF
output:
tempref = temp("tempfiles/ref.fa")
tempref = "tempfiles/ref.fa"
shell:
"""
zcat {input.ref} > {output.tempref}
Expand Down Expand Up @@ -153,7 +152,7 @@ rule gff_to_gene_description:
input:
gff = "final_output/{fix}.gff"
output:
gene_to_description = temp("tempfiles/gene_to_description_{fix}.txt")
gene_to_description = "tempfiles/gene_to_description_{fix}.txt"
shell:
"""
cat {input.gff} | grep 'Description' | \
Expand All @@ -169,7 +168,7 @@ rule gen_fasta_from_gff:
ref = "tempfiles/ref.fa",
gff = "final_output/{fix}.gff"
output:
fas = temp("final_output/nucleotide_sequences/unphased/{fix}_transcripts.temp.fasta")
fas = "final_output/nucleotide_sequences/unphased/{fix}_transcripts.temp.fasta"
shell:
"""
awk '{{if ($1 != "M"){{print($0)}} }}' {input.gff} > temp.gff
Expand Down Expand Up @@ -197,7 +196,7 @@ rule map_illumina_to_transcripts:
freads = RNAF,
rreads = RNAR
output:
bam = temp("tempfiles/bamfiles/rna_to_transcripts/ill_to_{fix}_transcripts.sorted.bam")
bam = "tempfiles/bamfiles/rna_to_transcripts/ill_to_{fix}_transcripts.sorted.bam"
threads: workflow.cores
shell:
"""
Expand All @@ -215,7 +214,7 @@ rule map_flnc_to_transcripts:
fas = "final_output/nucleotide_sequences/unphased/{fix}_transcripts.fasta",
flnc = FLNC
output:
bam = temp("tempfiles/bamfiles/rna_to_transcripts/flnc_to_{fix}_transcripts.sorted.bam")
bam = "tempfiles/bamfiles/rna_to_transcripts/flnc_to_{fix}_transcripts.sorted.bam"
threads: workflow.cores
shell:
"""
Expand All @@ -234,7 +233,7 @@ rule merge_bams:
ill_bam = "tempfiles/bamfiles/rna_to_transcripts/ill_to_{fix}_transcripts.sorted.bam",
flnc_bam = "tempfiles/bamfiles/rna_to_transcripts/flnc_to_{fix}_transcripts.sorted.bam"
output:
merged = temp("tempfiles/bamfiles/merged_to_transcripts/allreads_to_{fix}_transcripts.sorted.bam")
merged = "tempfiles/bamfiles/merged_to_transcripts/allreads_to_{fix}_transcripts.sorted.bam"
shell:
"""
# merge bams
Expand Down Expand Up @@ -279,7 +278,7 @@ rule haplotag_flnc_bam:
flnc_bam = "tempfiles/bamfiles/rna_to_transcripts/flnc_to_{fix}_transcripts.sorted.bam"
output:
gzvcf = "final_output/vcf_files/phased/{fix}_transcripts.phased.vcf.gz",
flnc_bam = temp("tempfiles/bamfiles/rna_to_transcripts/flnc_haplotagged_to_{fix}_transcripts.sorted.bam")
flnc_bam = "tempfiles/bamfiles/rna_to_transcripts/flnc_haplotagged_to_{fix}_transcripts.sorted.bam"
shell:
"""
bgzip -c {input.vcf} > {output.gzvcf}
Expand All @@ -293,7 +292,7 @@ rule generate_two_references_from_vcf:
fas = "final_output/nucleotide_sequences/unphased/{fix}_transcripts.fasta",
gzvcf = "final_output/vcf_files/phased/{fix}_transcripts.phased.vcf.gz",
output:
fas = temp("tempfiles/nucleotide_sequences/unphased/phase{index}_transcripts_{fix}.fasta"),
fas = "tempfiles/nucleotide_sequences/unphased/phase{index}_transcripts_{fix}.fasta"
params:
outstem = lambda wildcards: "tempfiles/nucleotide_sequences/unphased",
prefix = lambda wildcards: wildcards.fix
Expand All @@ -316,8 +315,8 @@ rule get_one_fastq_file_per_haplotig:
flnc_bam = "tempfiles/bamfiles/rna_to_transcripts/flnc_haplotagged_to_{fix}_transcripts.sorted.bam",
reads = FLNC
output:
rlist = temp("tempfiles/fastq/unphased/h{index}list.{fix}.reads.txt"),
reads = temp("tempfiles/fastq/unphased/h{index}.flnc.{fix}.fastq.gz"),
rlist = "tempfiles/fastq/unphased/h{index}list.{fix}.reads.txt",
reads = "tempfiles/fastq/unphased/h{index}.flnc.{fix}.fastq.gz",
params:
prefix = lambda wildcards: wildcards.fix,
dex = lambda wildcards: wildcards.index
Expand All @@ -337,7 +336,7 @@ rule map_reads_to_transcripts:
fas = "tempfiles/nucleotide_sequences/unphased/phase{index}_transcripts_{fix}.fasta",
reads = "tempfiles/fastq/unphased/h{index}.flnc.{fix}.fastq.gz",
output:
bam = temp("tempfiles/bamfiles/h{index}flnc_to_h{index}.{fix}.sorted.bam")
bam = "tempfiles/bamfiles/h{index}flnc_to_h{index}.{fix}.sorted.bam"
threads: workflow.cores
shell:
"""
Expand All @@ -354,7 +353,7 @@ rule pilon_correct_the_references:
bam = "tempfiles/bamfiles/h{index}flnc_to_h{index}.{fix}.sorted.bam",
pilon = PILONJAR
output:
seq = temp("tempfiles/pilon/h{index}_flnc_to_{fix}_h{index}.pilon/h{index}_flnc_to_{fix}_h{index}.pilon.fasta"),
seq = "tempfiles/pilon/h{index}_flnc_to_{fix}_h{index}.pilon/h{index}_flnc_to_{fix}_h{index}.pilon.fasta",
threads: workflow.cores
params:
outdir = lambda wildcards: "tempfiles/pilon/h{0}_flnc_to_{1}_h{0}.pilon".format(wildcards.index, wildcards.fix),
Expand Down Expand Up @@ -397,7 +396,7 @@ rule translate_prots:
seq = "final_output/nucleotide_sequences/partly_phased/h{index}_pilon_{fix}.fasta",
ptp = "scripts/prottrans.py"
output:
pep = temp("final_output/protein_sequences/partly_phased/temp/h{index}_{fix}.pep")
pep = "final_output/protein_sequences/partly_phased/temp/h{index}_{fix}.pep"
threads: 1
shell:
"""
Expand Down Expand Up @@ -462,7 +461,7 @@ rule make_gene_comparison_table:
make_table_of_genes = "scripts/make_table_of_genes.py",
output:
diff_length_csvs = "final_output/protein_sequences/model/protein_size_table_{fix}.csv",
protein_model = temp("final_output/protein_sequences/model/{fix}_model_proteins.temp.pep")
protein_model = "final_output/protein_sequences/model/{fix}_model_proteins.temp.pep"
shell:
"""
python {input.make_table_of_genes} {input.fasta} {input.peps} {output.diff_length_csvs} {output.protein_model}
Expand All @@ -473,7 +472,7 @@ rule attach_descriptions_to_protein_models:
pep = "final_output/protein_sequences/model/{fix}_model_proteins.temp.pep",
gene_to_description = "tempfiles/gene_to_description_{fix}.txt"
output:
pep = temp("final_output/protein_sequences/model/{fix}_model_proteins.temp2.pep")
pep = "final_output/protein_sequences/model/{fix}_model_proteins.temp2.pep"
run:
attach_descriptions_to_pdf(input.pep,
input.gene_to_description,
Expand All @@ -498,7 +497,7 @@ rule map_phased_transcripts_to_genome:
transcripts = "final_output/nucleotide_sequences/partly_phased/h{index}_pilon_{fix}.fasta",
genome = "tempfiles/ref.fa"
output:
bam = temp("tempfiles/bamfiles/transcripts_to_genome/pilon_h{index}_to_ref.{fix}.sorted.bam")
bam = "tempfiles/bamfiles/transcripts_to_genome/pilon_h{index}_to_ref.{fix}.sorted.bam"
threads: workflow.cores
shell:
"""
Expand All @@ -515,8 +514,8 @@ rule merge_bams_from_transcripts:
input:
bams = expand("tempfiles/bamfiles/transcripts_to_genome/pilon_h{index}_to_ref.{fix}.sorted.bam", fix = PREFIX, index = ["1", "2"]),
output:
merged = temp("tempfiles/bamfiles/transcripts_to_genome/pilon_merged_to_ref.{fix}.sorted.bam"),
index = temp("tempfiles/bamfiles/transcripts_to_genome/pilon_merged_to_ref.{fix}.sorted.bam.bai")
merged = "tempfiles/bamfiles/transcripts_to_genome/pilon_merged_to_ref.{fix}.sorted.bam",
index = "tempfiles/bamfiles/transcripts_to_genome/pilon_merged_to_ref.{fix}.sorted.bam.bai"

threads: 1
shell:
Expand All @@ -533,8 +532,8 @@ rule gen_temporary_whole_genome_vcf:
input:
vcfs = gVCFs
output:
gz = temp("tempfiles/whole_genome_phased.vcf.gz"),
tbi = temp("tempfiles/whole_genome_phased.vcf.gz.tbi")
gz = "tempfiles/whole_genome_phased.vcf.gz",
tbi = "tempfiles/whole_genome_phased.vcf.gz.tbi"
params:
vcf = "tempfiles/whole_genome_phased.vcf"
shell:
Expand Down Expand Up @@ -565,7 +564,7 @@ rule haplotag_transcripts:
index = "tempfiles/bamfiles/transcripts_to_genome/pilon_merged_to_ref.{fix}.sorted.bam.bai",
ref = "tempfiles/ref.fa"
output:
bam = temp("tempfiles/bamfiles/transcripts_to_genome/haplotaged_pilon_to_ref.{fix}.sorted.bam")
bam = "tempfiles/bamfiles/transcripts_to_genome/haplotaged_pilon_to_ref.{fix}.sorted.bam"
shell:
"""
whatshap haplotag --ignore-read-groups -o {output.bam} \
Expand All @@ -576,7 +575,7 @@ rule make_phased_list:
input:
bam = "tempfiles/bamfiles/transcripts_to_genome/haplotaged_pilon_to_ref.{fix}.sorted.bam"
output:
txt = temp("tempfiles/txt/h{index}_transcript_to_genome.{fix}.list")
txt = "tempfiles/txt/h{index}_transcript_to_genome.{fix}.list"
threads: 1
params:
index = lambda wildcards: wildcards.index
Expand Down Expand Up @@ -630,7 +629,7 @@ rule temp_catd_phased_peps:
peps = expand("final_output/protein_sequences/partly_phased/h{index}_{fix}.pep",
fix = PREFIX, index = ["1","2"]),
output:
temppeps = temp("tempfiles/temp_fused_phased.pep")
temppeps = "tempfiles/temp_fused_phased.pep"
shell:
"""
cat {input.peps} > {output.temppeps}
Expand Down Expand Up @@ -661,7 +660,7 @@ rule temp_catd_phased_nucls:
nucls = expand("final_output/nucleotide_sequences/partly_phased/h{index}_pilon_{fix}.fasta",
fix = PREFIX, index = ["1","2"]),
output:
tempnucls = temp("tempfiles/temp_nucl_fused_phased.fasta")
tempnucls = "tempfiles/temp_nucl_fused_phased.fasta"
shell:
"""
cat {input.nucls} > {output.tempnucls}
Expand Down
2 changes: 1 addition & 1 deletion annotation/make_release.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
RELEASE=Hcv1a1d20200414
RELEASE=Hcv1av84
mkdir ${RELEASE}_release
find ./final_output/ -type f -name "*" -exec cp {} ${RELEASE}_release \;
rm ${RELEASE}_release/*vcf*
Expand Down
17 changes: 13 additions & 4 deletions annotation/scripts/make_table_of_genes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python
"""
This program makes a table of protein sizes?
This program makes a table of protein sizes between two haplotypes,
then makes a single file with all the proteins
"""

import sys
Expand Down Expand Up @@ -33,34 +34,38 @@
import pandas as pd
import sys

print(" - Reading in the sequences from the transcripts from the gff", file=sys.stderr)
gene_dict = {}
for record in SeqIO.parse(seqs_orig, "fasta"):
seq_record = ".".join(record.id.split('.')[0:5])
seqlist.add(seq_record)
print(seq_record)
this_chr = seq_record.split('.')[2]
this_gene= int(seq_record.split('.')[3].replace('g',''))
if this_chr not in gene_dict:
gene_dict[this_chr] = set()
gene_dict[this_chr].add(this_gene)

print(" - Getting the number of genes on each chromosome.", file=sys.stderr)
# get the number of genes on each chr
for key in gene_dict:
gene_dict[key] = max(gene_dict[key])

print(" - Getting the h1 genes.", file=sys.stderr)
# get the genes in the h1 file
h1_dict = {x: -1 for x in seqlist}
for record in SeqIO.parse(h1_fasta, "fasta"):
seqid = ".".join(record.id.split(".")[0:5])
h1_dict[seqid] = len(record.seq)

print(" - Getting the h2 genes.", file=sys.stderr)
# get the genes in the h2 file
h2_dict = {x: -1 for x in seqlist}
for record in SeqIO.parse(h2_fasta, "fasta"):
seqid = ".".join(record.id.split(".")[0:5])
h2_dict[seqid] = len(record.seq)

# make the dataset
print(" - Making a table of which isoform to keep.", file=sys.stderr)
df = pd.DataFrame([h1_dict, h2_dict])
df = df.T
df.columns = ["h1", "h2"]
Expand All @@ -86,15 +91,17 @@
df.at[i, "keep"] = "h2"

print(df)
df.to_csv(diff_lengths_csv, sep='\t')
df.to_csv(diff_lengths_csv, sep='\t', index_label="gene")

print(" - Getting the isoform indices.", file=sys.stderr)
# we need this later to sort the output
df["isoform_index"] = -1
for i, row in df.iterrows():
df.at[i, "isoform_index"] = int(i.split('.')[4].replace('i', ''))

df["fasta"] = ""

print(" - Adding the protein sequences to the pandas dataframe.", file=sys.stderr)
# add the sequence to the pandas df
col_to_file = {"h1": h1_fasta, "h2": h2_fasta}
for key in col_to_file:
Expand All @@ -109,6 +116,7 @@
#print("fasta string ", fasta_string)
df.at[seqid, "fasta"] = fasta_string

print(" - Getting number of chromosomes.", file=sys.stderr)
# print the fasta file out in order
#first get the num Cs
num_c = max([int(key.replace('c','')) for key in gene_dict if key[0] == 'c'])
Expand All @@ -119,11 +127,12 @@
for i in range(1, num_s+1):
sort_list.append("sca{}".format(i))

print(" - Printing out the fasta file.", file=sys.stderr)
new_fasta = open(model_pep_file, "w")
for this_c in sort_list:
if this_c in gene_dict:
for this_gene in range(1, gene_dict[this_c]+1):
this_id = "Hcv1.1.{}.g{}.".format(this_c, this_gene)
this_id = ".{}.g{}.".format(this_c, this_gene)
print_these_df = df.filter(like=this_id, axis=0).sort_values(by ="isoform_index")
if len(print_these_df) > 0:
for i, row in print_these_df.iterrows():
Expand Down

0 comments on commit 1e3f450

Please sign in to comment.