Skip to content

Commit

Permalink
Merge pull request #53 from CostaLab/develop
Browse files Browse the repository at this point in the history
upgrade to 0.11.1
  • Loading branch information
lzj1769 authored Dec 12, 2017
2 parents 3e272cb + 6ddfb29 commit fcae071
Show file tree
Hide file tree
Showing 5 changed files with 242 additions and 129 deletions.
69 changes: 56 additions & 13 deletions rgt/HINT/DifferentialAnalysis.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# Import
import os
import numpy as np
from pysam import Samfile, Fastafile
Expand All @@ -14,7 +13,7 @@
from argparse import SUPPRESS

# Internal
from ..Util import AuxiliaryFunctions, GenomeData
from ..Util import ErrorHandler, AuxiliaryFunctions, GenomeData
from rgt.GenomicRegionSet import GenomicRegionSet
from biasTable import BiasTable

Expand Down Expand Up @@ -67,6 +66,16 @@ def diff_analysis_args(parser):


def diff_analysis_run(args):
# Initializing Error Handler
err = ErrorHandler()

output_location = os.path.join(args.output_location, "{}_{}".format(args.condition1, args.condition2))
try:
if not os.path.isdir(output_location):
os.makedirs(output_location)
except Exception:
err.throw_error("MM_OUT_FOLDER_CREATION")

mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
mpbs1.read(args.mpbs_file1)

Expand Down Expand Up @@ -179,12 +188,15 @@ def diff_analysis_run(args):
output_factor(args, args.factor1, args.factor2)

ps_tc_results_by_tf = dict()

for mpbs_name in mpbs_name_list:
num_fp = len(signal_dict_by_tf_1[mpbs_name])

# print the line plot for each factor
line_plot(args, mpbs_name, num_fp, signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name],
pwm_dict_by_tf[mpbs_name])
fig, ax = plt.subplots()
line_plot(args, err, mpbs_name, num_fp, signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name],
pwm_dict_by_tf[mpbs_name], fig, ax)
plt.close(fig)

ps_tc_results_by_tf[mpbs_name] = list()

Expand All @@ -195,8 +207,9 @@ def diff_analysis_run(args):
res = get_ps_tc_results(signal_1, signal_2, motif_len_dict[mpbs_name])
ps_tc_results_by_tf[mpbs_name].append(res)

stat_results_by_tf = get_stat_results(ps_tc_results_by_tf)
output_stat_results(args, stat_results_by_tf)
#stat_results_by_tf = get_stat_results(ps_tc_results_by_tf)
#scatter_plot(args, stat_results_by_tf)
#output_stat_results(args, stat_results_by_tf)


def get_bc_signal(chrom, start, end, bam, bias_table, genome_file_name, forward_shift, reverse_shift):
Expand Down Expand Up @@ -362,7 +375,7 @@ def compute_factors(signal_dict_by_tf_1, signal_dict_by_tf_2):
return factor1, factor2


def line_plot(args, mpbs_name, num_fp, signal_tf_1, signal_tf_2, pwm_dict):
def line_plot(args, err, mpbs_name, num_fp, signal_tf_1, signal_tf_2, pwm_dict, fig, ax):
# compute the average signal
mean_signal_1 = np.zeros(args.window_size)
mean_signal_2 = np.zeros(args.window_size)
Expand All @@ -373,14 +386,16 @@ def line_plot(args, mpbs_name, num_fp, signal_tf_1, signal_tf_2, pwm_dict):
mean_signal_1 = (mean_signal_1 / num_fp) / args.factor1
mean_signal_2 = (mean_signal_2 / num_fp) / args.factor2

output_location = os.path.join(args.output_location, "{}_{}".format(args.condition1, args.condition2))

# Output PWM and create logo
pwm_fname = os.path.join(args.output_location, "{}.pwm".format(mpbs_name))
pwm_fname = os.path.join(output_location, "{}.pwm".format(mpbs_name))
pwm_file = open(pwm_fname, "w")
for e in ["A", "C", "G", "T"]:
pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]]) + "\n")
pwm_file.close()

logo_fname = os.path.join(args.output_location, "{}.logo.eps".format(mpbs_name))
logo_fname = os.path.join(output_location, "{}.logo.eps".format(mpbs_name))
pwm = motifs.read(open(pwm_fname), "pfm")
pwm.weblogo(logo_fname, format="eps", stack_width="large", stacks_per_line=str(args.window_size),
color_scheme="color_classic", unit_name="", show_errorbars=False, logo_title="",
Expand All @@ -391,7 +406,7 @@ def line_plot(args, mpbs_name, num_fp, signal_tf_1, signal_tf_2, pwm_dict):
end = (args.window_size / 2) - 1
x = np.linspace(start, end, num=args.window_size)

fig, ax = plt.subplots()
#fig, ax = plt.subplots()

ax.plot(x, mean_signal_1, color='red', label=args.condition1)
ax.plot(x, mean_signal_2, color='blue', label=args.condition2)
Expand All @@ -417,12 +432,12 @@ def line_plot(args, mpbs_name, num_fp, signal_tf_1, signal_tf_2, pwm_dict):
ax.legend(loc="upper right", frameon=False)
ax.spines['bottom'].set_position(('outward', 40))

figure_name = os.path.join(args.output_location, "{}.line.eps".format(mpbs_name))
figure_name = os.path.join(output_location, "{}.line.eps".format(mpbs_name))
fig.tight_layout()
fig.savefig(figure_name, format="eps", dpi=300)

# Creating canvas and printing eps / pdf with merged results
output_fname = os.path.join(args.output_location, "{}.eps".format(mpbs_name))
output_fname = os.path.join(output_location, "{}.eps".format(mpbs_name))
c = pyx.canvas.canvas()
c.insert(pyx.epsfile.epsfile(0, 0, figure_name, scale=1.0))
c.insert(pyx.epsfile.epsfile(0.45, 0.8, logo_fname, width=16.5, height=3))
Expand All @@ -435,13 +450,41 @@ def line_plot(args, mpbs_name, num_fp, signal_tf_1, signal_tf_2, pwm_dict):
os.remove(pwm_fname)


def scatter_plot(args, stat_results_by_tf):
tc_diff = list()
ps_diff = list()
mpbs_names = list()
for mpbs_name in stat_results_by_tf.keys():
mpbs_names.append(mpbs_name)
tc_diff.append(stat_results_by_tf[mpbs_name][-2])
ps_diff.append(stat_results_by_tf[mpbs_name][2])

fig, ax = plt.subplots(figsize=(12,12))
ax.scatter(tc_diff, ps_diff, alpha=0.0)
for i, txt in enumerate(mpbs_names):
ax.annotate(txt, (tc_diff[i], ps_diff[i]), alpha=0.6)
ax.margins(0.05)

tc_diff_mean = np.mean(tc_diff)
ps_diff_mean = np.mean(ps_diff)
ax.axvline(x=tc_diff_mean, linewidth=2, linestyle='dashed')
ax.axhline(y=ps_diff_mean, linewidth=2, linestyle='dashed')

ax.set_xlabel("TC DIFF of {} - {}".format(args.condition1, args.condition2), fontweight='bold')
ax.set_ylabel("Protection Score of {} - {}".format(args.condition1, args.condition2), fontweight='bold', rotation=90)

figure_name = os.path.join(args.output_location, "{}_{}_statistics.pdf".format(args.condition1, args.condition2))
fig.savefig(figure_name, format="pdf", dpi=300)


def output_stat_results(args, stat_results_by_tf):
output_fname = os.path.join(args.output_location, "footprint_statistics.txt")
output_fname = os.path.join(args.output_location, "{}_{}_statistics.txt".format(args.condition1, args.condition2))
header = ["Motif",
"Protection_Score_{}".format(args.condition1), "Protection_Score_{}".format(args.condition2),
"Protection_Diff_{}_{}".format(args.condition1, args.condition2),
"TC_{}".format(args.condition1), "TC_{}".format(args.condition2),
"TC_Diff_{}_{}".format(args.condition1, args.condition2), "P_values"]

with open(output_fname, "w") as f:
f.write("\t".join(header) + "\n")
for mpbs_name in stat_results_by_tf.keys():
Expand Down
117 changes: 92 additions & 25 deletions rgt/HINT/Footprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,23 +148,81 @@ def atac_seq(args):
bias_table = BiasTable().load_table(table_file_name_F=table_F,
table_file_name_R=table_R)

initial_clip = 50 if not args.initial_clip else args.initial_clip
sg_window_size = 9 if not args.sg_window_size else args.sg_window_size
norm_per = 98 if not args.norm_per else args.norm_per
slope_per = 98 if not args.slope_per else args.slope_per
downstream_ext = 1 if not args.downstream_ext else args.downstream_ext
upstream_ext = 0 if not args.upstream_ext else args.upstream_ext
region_total_ext = 0 if not args.region_total_ext else args.region_total_ext
forward_shift = 5 if not args.forward_shift else args.forward_shift
reverse_shift = -4 if not args.reverse_shift else args.reverse_shift
fp_max_size = 50 if not args.fp_max_size else args.fp_max_size
fp_min_size = 5 if not args.fp_min_size else args.fp_min_size
fp_ext = 5 if not args.fp_ext else args.fp_ext
tc_ext = 100 if not args.tc_ext else args.tc_ext
if args.initial_clip is None:
initial_clip = 50
else:
initial_clip = args.initial_clip

if args.sg_window_size is None:
sg_window_size = 9
else:
sg_window_size = args.sg_window_size

if args.norm_per is None:
norm_per = 98
else:
norm_per = args.norm_per

if args.slope_per is None:
slope_per = 98
else:
slope_per = args.slope_per

if args.downstream_ext is None:
downstream_ext = 1
else:
downstream_ext = args.downstream_ext

if args.upstream_ext is None:
upstream_ext = 0
else:
upstream_ext = args.upstream_ext

if args.forward_shift is None:
forward_shift = 5
else:
forward_shift = args.forward_shift

if args.reverse_shift is None:
reverse_shift = -4
else:
reverse_shift = args.reverse_shift

if args.region_total_ext is None:
region_total_ext = 0
else:
region_total_ext = args.region_total_ext

if args.fp_max_size is None:
fp_max_size = 50
else:
fp_max_size = args.fp_max_size

if args.fp_min_size is None:
fp_min_size = 5
else:
fp_min_size = args.fp_min_size

if args.fp_ext is None:
fp_ext = 5
else:
fp_ext = args.fp_ext

if args.tc_ext is None:
tc_ext = 100
else:
tc_ext = args.tc_ext

if args.paired_end:
fp_state = 8 if not args.fp_state else args.fp_state
if args.fp_state is None:
fp_state = 8
else:
fp_state = args.fp_state
else:
fp_state = 6 if not args.fp_state else args.fp_state
if args.fp_state is None:
fp_state = 6
else:
fp_state = args.fp_state

# Initializing result set
footprints = GenomicRegionSet(args.output_prefix)
Expand All @@ -183,7 +241,7 @@ def atac_seq(args):
fasta = Fastafile(genome_data.get_genome())

if args.paired_end:
for region in regions:
for region in original_regions:
input_sequence = list()

signal_bc_f_max_145, signal_bc_r_max_145 = \
Expand Down Expand Up @@ -233,33 +291,38 @@ def atac_seq(args):
input_sequence.append(signal_bc_r_min_145)
input_sequence.append(signal_bc_r_min_145_slope)

posterior_list = hmm.predict(np.array(input_sequence).T)
# Applying HMM
try:
posterior_list = hmm.predict(np.array(input_sequence).T)
except Exception:
err.throw_warning("FP_HMM_APPLIC", add_msg="in region (" + ",".join([region.chrom, str(region.initial), str(
region.final)]) + "). This iteration will be skipped.")
continue

if args.fp_bed_fname is not None:
output_bed_file(region.chrom, region.initial, region.final, posterior_list, args.fp_bed_fname, fp_state)

# Formatting results
start_pos = 0
flag_start = False
fp_state_nb = fp_state
for k in range(region.initial, region.initial + len(posterior_list)):
curr_index = k - region.initial
if flag_start:
if posterior_list[curr_index] != fp_state_nb:
if posterior_list[curr_index] != fp_state:
if k - start_pos < fp_max_size:
fp = GenomicRegion(region.chrom, start_pos, k)
footprints.add(fp)
flag_start = False
else:
if posterior_list[curr_index] == fp_state_nb:
if posterior_list[curr_index] == fp_state:
flag_start = True
start_pos = k
if flag_start:
if region.initial + len(posterior_list) - start_pos < fp_max_size:
fp = GenomicRegion(region.chrom, start_pos, region.final)
footprints.add(fp)
else:
for region in regions:
for region in original_regions:

input_sequence = list()
atac_norm_f, atac_slope_f, atac_norm_r, atac_slope_r = \
Expand All @@ -273,24 +336,28 @@ def atac_seq(args):
input_sequence.append(atac_norm_r)
input_sequence.append(atac_slope_r)

posterior_list = hmm.predict(np.array(input_sequence).T)
try:
posterior_list = hmm.predict(np.array(input_sequence).T)
except Exception:
err.throw_warning("FP_HMM_APPLIC", add_msg="in region (" + ",".join([region.chrom, str(region.initial), str(
region.final)]) + "). This iteration will be skipped.")
continue

if args.fp_bed_fname is not None:
output_bed_file(region.chrom, region.initial, region.final, posterior_list, args.fp_bed_fname, fp_state)
# Formatting results
start_pos = 0
flag_start = False
fp_state_nb = fp_state
for k in range(region.initial, region.initial + len(posterior_list)):
curr_index = k - region.initial
if flag_start:
if posterior_list[curr_index] != fp_state_nb:
if posterior_list[curr_index] != fp_state:
if k - start_pos < fp_max_size:
fp = GenomicRegion(region.chrom, start_pos, k)
footprints.add(fp)
flag_start = False
else:
if posterior_list[curr_index] == fp_state_nb:
if posterior_list[curr_index] == fp_state:
flag_start = True
start_pos = k
if flag_start:
Expand Down
Loading

0 comments on commit fcae071

Please sign in to comment.