Skip to content

Commit

Permalink
Version 3.1.2 (#31)
Browse files Browse the repository at this point in the history
Add --write-nocalls-in-vcf option to write no-call sites in the VCF
  • Loading branch information
xiao-chen-xc authored Jan 24, 2025
1 parent 75facb3 commit f4630d2
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 35 deletions.
2 changes: 1 addition & 1 deletion paraphase/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.1.1"
__version__ = "3.1.2"
8 changes: 8 additions & 0 deletions paraphase/paraphase.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ def process_gene(
sample_id,
outdir,
phaser_call,
args,
)
vcf_generater.set_parameter(
config, tmpdir=tmpdir, prog_cmd=prog_cmd
Expand All @@ -181,6 +182,7 @@ def process_gene(
sample_id,
outdir,
phaser_call,
args,
)
vcf_generater.set_parameter(
config, tmpdir=tmpdir, prog_cmd=prog_cmd
Expand Down Expand Up @@ -646,6 +648,12 @@ def load_parameters(self):
required=False,
action="store_true",
)
parser.add_argument(
"--write-nocalls-in-vcf",
help="Optional. If specified, Paraphase will write no-call sites in the VCFs, marked with LowQual filter.",
required=False,
action="store_true",
)
parser.add_argument(
"--samtools",
help="Optional path to samtools",
Expand Down
134 changes: 100 additions & 34 deletions paraphase/prepare_bam_and_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,10 +344,13 @@ class VcfGenerater:
search_range = 200
min_base_quality_for_variant_calling = 25

def __init__(self, sample_id, outdir, call_sum):
def __init__(self, sample_id, outdir, call_sum, args=None):
    """Store per-sample inputs and the no-call reporting switch.

    Parameters:
        sample_id: identifier of the sample; stored unchanged.
        outdir: output directory; stored unchanged.
        call_sum: call summary for the sample; stored unchanged (other
            methods of this class read it with ``.get(...)``).
        args: optional parsed command-line namespace. Only its
            ``write_nocalls_in_vcf`` attribute is consulted. ``None`` or a
            namespace without that attribute leaves the option off.
    """
    self.sample_id = sample_id
    self.outdir = outdir
    self.call_sum = call_sum
    # The flag is later compared with ``is True``, so keep it a real bool.
    # getattr() tolerates namespaces built without --write-nocalls-in-vcf.
    self.lowqual = bool(getattr(args, "write_nocalls_in_vcf", False))
    self.match = {}

def set_parameter(self, config, tmpdir=None, prog_cmd=None):
Expand Down Expand Up @@ -395,7 +398,7 @@ def write_header(self, fout):
"""Write VCF header"""
fout.write("##fileformat=VCFv4.2\n")
fout.write('##FILTER=<ID=PASS,Description="All filters passed">\n')
# fout.write('##FILTER=<ID=LowQual,Description="Nonpassing variant">\n')
fout.write('##FILTER=<ID=LowQual,Description="Nonpassing variant">\n')
fout.write(
'##INFO=<ID=HPBOUND,Number=.,Type=String,Description="Boundary coordinates of the phased haplotype">\n'
)
Expand Down Expand Up @@ -495,16 +498,55 @@ def merge_vcf(self, vars_list):
call_info = variants_info[pos]
# unique variants at this site
variant_observed = set([a[0] for a in call_info if a is not None])
ref_only = False
if len(variant_observed) == 1:
_, ref, alt = list(variant_observed)[0].split("_")
if ref == alt:
ref_only = True
if len(variant_observed) == 2:
has_ref = False
has_del = False
for var in variant_observed:
_, a, b = var.split("_")
if a == b:
has_ref = True
elif b == "*":
has_del = True
if has_ref and has_del:
ref_only = True

for variant in variant_observed:
_, ref, alt = variant.split("_")
valid_gts = []
merge_gt = []
merge_ad = []
merge_dp = []
for each_call in call_info:
for each_call_index, each_call in enumerate(call_info):
if each_call is None:
merge_gt.append(".")
merge_ad.append(".")
merge_dp.append(".")
hap_info = haps_info[each_call_index]
(
_,
hap_info_bound1,
hap_info_bound2,
hap_info_truncated,
) = hap_info
if (
hap_info_truncated is None
or hap_info_truncated is False
):
valid_gts.append(".")
elif hap_info_truncated == ["5p"]:
if pos > hap_info_bound1:
valid_gts.append(".")
elif hap_info_truncated == ["3p"]:
if pos < hap_info_bound2:
valid_gts.append(".")
elif hap_info_truncated == ["5p", "3p"]:
if hap_info_bound1 < pos < hap_info_bound2:
valid_gts.append(".")
else:
var_name, dp, ad, var_filter, gt, counter = each_call
if counter is None:
Expand All @@ -527,39 +569,51 @@ def merge_vcf(self, vars_list):
]
)
else:
this_ad = ",".join([str(a) for a in [ad[0], 0]])
this_ad = ",".join(
[str(a) for a in [ad[0], dp - ad[0]]]
)
if var_filter != []:
gt = "."
merge_dp.append(str(dp))
if gt == "0":
merge_gt.append(gt)
valid_gts.append(gt)
merge_ad.append(this_ad)
elif var_name == variant:
merge_gt.append(gt)
valid_gts.append(gt)
merge_ad.append(this_ad)
else:
merge_gt.append(".")
valid_gts.append(".")
merge_ad.append(this_ad)
if list_counter == 0 and haps_ids != haps_ids1:
for _ in range(len(haps_ids2)):
merge_gt.append(".")
merge_ad.append(".")
merge_dp.append(".")
elif list_counter > 0:
for _ in range(len(haps_ids1)):
merge_gt.insert(0, ".")
merge_ad.insert(0, ".")
merge_dp.insert(0, ".")
write_variant = False
if self.lowqual is True:
if (
(alt != ref or ref_only)
and alt not in [".", "*"]
and ("1" in merge_gt or "." in valid_gts)
):
write_variant = True
elif alt != ref and alt not in [".", "*"] and "1" in merge_gt:
write_variant = True
final_qual = "."
if (
alt != ref
and alt not in [".", "*"]
and "1" in merge_gt # or "." in merge_gt
):
if write_variant:
if list_counter == 0 and haps_ids != haps_ids1:
for _ in range(len(haps_ids2)):
merge_gt.append(".")
merge_ad.append(".")
merge_dp.append(".")
elif list_counter > 0:
for _ in range(len(haps_ids1)):
merge_gt.insert(0, ".")
merge_ad.insert(0, ".")
merge_dp.insert(0, ".")

if "1" in merge_gt:
variant_filter = "PASS"
# else:
# variant_filter = "LowQual"
else:
variant_filter = "LowQual"
info_field = "HPBOUND=" + ",".join(haps_bounds)
alleles = self.call_sum.get("alleles_final")
if alleles is not None and alleles != []:
Expand Down Expand Up @@ -778,7 +832,7 @@ def pileup_to_variant(
var_filter = []
if dp < min_depth:
var_filter.append("LowDP")
if ad[1] < dp * 0.7:
if (gt == "1" and ad[1] < dp * 0.7) or (gt == "0" and ad[0] < dp * 0.7):
var_filter.append("LowQual")
if var_filter != []:
gt = "."
Expand Down Expand Up @@ -843,7 +897,7 @@ def run_without_realign(
two_cp_haplotypes = self.call_sum.get("two_copy_haplotypes")
nhap = len(final_haps)
if two_cp_haplotypes is not None:
nhap += len(two_cp_haplotypes)
nhap += len([a for a in two_cp_haplotypes if a in final_haps.values()])
hap_info = []

# gene1only, or two-gene mode but gene1 side
Expand All @@ -862,6 +916,11 @@ def run_without_realign(
hap_name = f"{self.gene}_homozygous_hap1"
pileups_raw = {}
read_names = {}

for pos in range(self.left_boundary, self.right_boundary):
pileups_raw.setdefault(pos, [])
read_names.setdefault(pos, [])

for pileupcolumn in bamh.pileup(
nchr,
truncate=True,
Expand All @@ -871,8 +930,8 @@ def run_without_realign(
this_pos_bases = [
a.upper() for a in pileupcolumn.get_query_sequences(add_indels=True)
]
pileups_raw.setdefault(pos, this_pos_bases)
read_names.setdefault(pos, pileupcolumn.get_query_names())
pileups_raw[pos] = this_pos_bases
read_names[pos] = pileupcolumn.get_query_names()
variants_called = self.pileup_to_variant(
pileups_raw,
read_names,
Expand All @@ -887,13 +946,14 @@ def run_without_realign(
)

for pos, var_name, dp, ad, var_filter, gt, counter in variants_called:
variants_info.setdefault(
pos,
[
[var_name, dp, ad, var_filter, gt, counter],
[var_name, dp, ad, var_filter, gt, counter],
],
)
if pos > self.left_boundary and pos < self.right_boundary:
variants_info.setdefault(
pos,
[
[var_name, dp, ad, var_filter, gt, counter],
[var_name, dp, ad, var_filter, gt, counter],
],
)

i = 0
for hap_name in final_haps.values():
Expand Down Expand Up @@ -943,6 +1003,12 @@ def run_without_realign(
# by HP tag
pileups_raw = {}
read_names = {}

if hap_bound != []:
for pos in range(hap_bound[0], hap_bound[1]):
pileups_raw.setdefault(pos, [])
read_names.setdefault(pos, [])

for pileupcolumn in bamh.pileup(
nchr,
truncate=True,
Expand Down Expand Up @@ -1024,8 +1090,8 @@ class TwoGeneVcfGenerater(VcfGenerater):
Make vcf for two-gene scenario
"""

def __init__(self, sample_id, outdir, call_sum):
VcfGenerater.__init__(self, sample_id, outdir, call_sum)
def __init__(self, sample_id, outdir, call_sum, args=None):
    """Initialize the two-gene VCF generator via the parent initializer.

    Parameters:
        sample_id: forwarded unchanged to ``VcfGenerater.__init__``.
        outdir: forwarded unchanged to ``VcfGenerater.__init__``.
        call_sum: forwarded unchanged to ``VcfGenerater.__init__``.
        args: optional parsed command-line namespace, forwarded unchanged.
            Defaults to ``None`` to match the parent signature, so callers
            that still pass three arguments keep working.
    """
    super().__init__(sample_id, outdir, call_sum, args)

def set_parameter(self, config, tmpdir=None, prog_cmd=None):
super().set_parameter(config, tmpdir, prog_cmd)
Expand Down

0 comments on commit f4630d2

Please sign in to comment.