Merge pull request #53 from CostaLab/develop

upgrade to 0.11.1
CostaLab · Dec 12, 2017 · fcae071 · fcae071
2 parents 3e272cb + 6ddfb29
commit fcae071
Show file tree

Hide file tree

Showing 5 changed files with 242 additions and 129 deletions.
diff --git a/rgt/HINT/DifferentialAnalysis.py b/rgt/HINT/DifferentialAnalysis.py
@@ -1,4 +1,3 @@
-# Import
 import os
 import numpy as np
 from pysam import Samfile, Fastafile
@@ -14,7 +13,7 @@
 from argparse import SUPPRESS
 
 # Internal
-from ..Util import AuxiliaryFunctions, GenomeData
+from ..Util import ErrorHandler, AuxiliaryFunctions, GenomeData
 from rgt.GenomicRegionSet import GenomicRegionSet
 from biasTable import BiasTable
 
@@ -67,6 +66,16 @@ def diff_analysis_args(parser):
 
 
 def diff_analysis_run(args):
+    # Initializing Error Handler
+    err = ErrorHandler()
+
+    output_location = os.path.join(args.output_location, "{}_{}".format(args.condition1, args.condition2))
+    try:
+        if not os.path.isdir(output_location):
+            os.makedirs(output_location)
+    except Exception:
+        err.throw_error("MM_OUT_FOLDER_CREATION")
+
     mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
     mpbs1.read(args.mpbs_file1)
 
@@ -179,12 +188,15 @@ def diff_analysis_run(args):
         output_factor(args, args.factor1, args.factor2)
 
     ps_tc_results_by_tf = dict()
+
     for mpbs_name in mpbs_name_list:
         num_fp = len(signal_dict_by_tf_1[mpbs_name])
 
         # print the line plot for each factor
-        line_plot(args, mpbs_name, num_fp, signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name],
-                  pwm_dict_by_tf[mpbs_name])
+        fig, ax = plt.subplots()
+        line_plot(args, err, mpbs_name, num_fp, signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name],
+                  pwm_dict_by_tf[mpbs_name], fig, ax)
+        plt.close(fig)
 
         ps_tc_results_by_tf[mpbs_name] = list()
 
@@ -195,8 +207,9 @@ def diff_analysis_run(args):
             res = get_ps_tc_results(signal_1, signal_2, motif_len_dict[mpbs_name])
             ps_tc_results_by_tf[mpbs_name].append(res)
 
-    stat_results_by_tf = get_stat_results(ps_tc_results_by_tf)
-    output_stat_results(args, stat_results_by_tf)
+    #stat_results_by_tf = get_stat_results(ps_tc_results_by_tf)
+    #scatter_plot(args, stat_results_by_tf)
+    #output_stat_results(args, stat_results_by_tf)
 
 
 def get_bc_signal(chrom, start, end, bam, bias_table, genome_file_name, forward_shift, reverse_shift):
@@ -362,7 +375,7 @@ def compute_factors(signal_dict_by_tf_1, signal_dict_by_tf_2):
     return factor1, factor2
 
 
-def line_plot(args, mpbs_name, num_fp, signal_tf_1, signal_tf_2, pwm_dict):
+def line_plot(args, err, mpbs_name, num_fp, signal_tf_1, signal_tf_2, pwm_dict, fig, ax):
     # compute the average signal
     mean_signal_1 = np.zeros(args.window_size)
     mean_signal_2 = np.zeros(args.window_size)
@@ -373,14 +386,16 @@ def line_plot(args, mpbs_name, num_fp, signal_tf_1, signal_tf_2, pwm_dict):
     mean_signal_1 = (mean_signal_1 / num_fp) / args.factor1
     mean_signal_2 = (mean_signal_2 / num_fp) / args.factor2
 
+    output_location = os.path.join(args.output_location, "{}_{}".format(args.condition1, args.condition2))
+
     # Output PWM and create logo
-    pwm_fname = os.path.join(args.output_location, "{}.pwm".format(mpbs_name))
+    pwm_fname = os.path.join(output_location, "{}.pwm".format(mpbs_name))
     pwm_file = open(pwm_fname, "w")
     for e in ["A", "C", "G", "T"]:
         pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]]) + "\n")
     pwm_file.close()
 
-    logo_fname = os.path.join(args.output_location, "{}.logo.eps".format(mpbs_name))
+    logo_fname = os.path.join(output_location, "{}.logo.eps".format(mpbs_name))
     pwm = motifs.read(open(pwm_fname), "pfm")
     pwm.weblogo(logo_fname, format="eps", stack_width="large", stacks_per_line=str(args.window_size),
                 color_scheme="color_classic", unit_name="", show_errorbars=False, logo_title="",
@@ -391,7 +406,7 @@ def line_plot(args, mpbs_name, num_fp, signal_tf_1, signal_tf_2, pwm_dict):
     end = (args.window_size / 2) - 1
     x = np.linspace(start, end, num=args.window_size)
 
-    fig, ax = plt.subplots()
+    #fig, ax = plt.subplots()
 
     ax.plot(x, mean_signal_1, color='red', label=args.condition1)
     ax.plot(x, mean_signal_2, color='blue', label=args.condition2)
@@ -417,12 +432,12 @@ def line_plot(args, mpbs_name, num_fp, signal_tf_1, signal_tf_2, pwm_dict):
     ax.legend(loc="upper right", frameon=False)
     ax.spines['bottom'].set_position(('outward', 40))
 
-    figure_name = os.path.join(args.output_location, "{}.line.eps".format(mpbs_name))
+    figure_name = os.path.join(output_location, "{}.line.eps".format(mpbs_name))
     fig.tight_layout()
     fig.savefig(figure_name, format="eps", dpi=300)
 
     # Creating canvas and printing eps / pdf with merged results
-    output_fname = os.path.join(args.output_location, "{}.eps".format(mpbs_name))
+    output_fname = os.path.join(output_location, "{}.eps".format(mpbs_name))
     c = pyx.canvas.canvas()
     c.insert(pyx.epsfile.epsfile(0, 0, figure_name, scale=1.0))
     c.insert(pyx.epsfile.epsfile(0.45, 0.8, logo_fname, width=16.5, height=3))
@@ -435,13 +450,41 @@ def line_plot(args, mpbs_name, num_fp, signal_tf_1, signal_tf_2, pwm_dict):
     os.remove(pwm_fname)
 
 
+def scatter_plot(args, stat_results_by_tf):
+    tc_diff = list()
+    ps_diff = list()
+    mpbs_names = list()
+    for mpbs_name in stat_results_by_tf.keys():
+        mpbs_names.append(mpbs_name)
+        tc_diff.append(stat_results_by_tf[mpbs_name][-2])
+        ps_diff.append(stat_results_by_tf[mpbs_name][2])
+
+    fig, ax = plt.subplots(figsize=(12,12))
+    ax.scatter(tc_diff, ps_diff, alpha=0.0)
+    for i, txt in enumerate(mpbs_names):
+        ax.annotate(txt, (tc_diff[i], ps_diff[i]), alpha=0.6)
+    ax.margins(0.05)
+
+    tc_diff_mean = np.mean(tc_diff)
+    ps_diff_mean = np.mean(ps_diff)
+    ax.axvline(x=tc_diff_mean, linewidth=2, linestyle='dashed')
+    ax.axhline(y=ps_diff_mean, linewidth=2, linestyle='dashed')
+
+    ax.set_xlabel("TC DIFF of {} - {}".format(args.condition1, args.condition2), fontweight='bold')
+    ax.set_ylabel("Protection Score of {} - {}".format(args.condition1, args.condition2), fontweight='bold', rotation=90)
+
+    figure_name = os.path.join(args.output_location, "{}_{}_statistics.pdf".format(args.condition1, args.condition2))
+    fig.savefig(figure_name, format="pdf", dpi=300)
+
+
 def output_stat_results(args, stat_results_by_tf):
-    output_fname = os.path.join(args.output_location, "footprint_statistics.txt")
+    output_fname = os.path.join(args.output_location, "{}_{}_statistics.txt".format(args.condition1, args.condition2))
     header = ["Motif",
               "Protection_Score_{}".format(args.condition1), "Protection_Score_{}".format(args.condition2),
               "Protection_Diff_{}_{}".format(args.condition1, args.condition2),
               "TC_{}".format(args.condition1), "TC_{}".format(args.condition2),
               "TC_Diff_{}_{}".format(args.condition1, args.condition2), "P_values"]
+
     with open(output_fname, "w") as f:
         f.write("\t".join(header) + "\n")
         for mpbs_name in stat_results_by_tf.keys():

diff --git a/rgt/HINT/Footprinting.py b/rgt/HINT/Footprinting.py
@@ -148,23 +148,81 @@ def atac_seq(args):
         bias_table = BiasTable().load_table(table_file_name_F=table_F,
                                             table_file_name_R=table_R)
 
-    initial_clip = 50 if not args.initial_clip else args.initial_clip
-    sg_window_size = 9 if not args.sg_window_size else args.sg_window_size
-    norm_per = 98 if not args.norm_per else args.norm_per
-    slope_per = 98 if not args.slope_per else args.slope_per
-    downstream_ext = 1 if not args.downstream_ext else args.downstream_ext
-    upstream_ext = 0 if not args.upstream_ext else args.upstream_ext
-    region_total_ext = 0 if not args.region_total_ext else args.region_total_ext
-    forward_shift = 5 if not args.forward_shift else args.forward_shift
-    reverse_shift = -4 if not args.reverse_shift else args.reverse_shift
-    fp_max_size = 50 if not args.fp_max_size else args.fp_max_size
-    fp_min_size = 5 if not args.fp_min_size else args.fp_min_size
-    fp_ext = 5 if not args.fp_ext else args.fp_ext
-    tc_ext = 100 if not args.tc_ext else args.tc_ext
+    if args.initial_clip is None:
+        initial_clip = 50
+    else:
+        initial_clip = args.initial_clip
+
+    if args.sg_window_size is None:
+        sg_window_size = 9
+    else:
+        sg_window_size = args.sg_window_size
+
+    if args.norm_per is None:
+        norm_per = 98
+    else:
+        norm_per = args.norm_per
+
+    if args.slope_per is None:
+        slope_per = 98
+    else:
+        slope_per = args.slope_per
+
+    if args.downstream_ext is None:
+        downstream_ext = 1
+    else:
+        downstream_ext = args.downstream_ext
+
+    if args.upstream_ext is None:
+        upstream_ext = 0
+    else:
+        upstream_ext = args.upstream_ext
+
+    if args.forward_shift is None:
+        forward_shift = 5
+    else:
+        forward_shift = args.forward_shift
+
+    if args.reverse_shift is None:
+        reverse_shift = -4
+    else:
+        reverse_shift = args.reverse_shift
+
+    if args.region_total_ext is None:
+        region_total_ext = 0
+    else:
+        region_total_ext = args.region_total_ext
+
+    if args.fp_max_size is None:
+        fp_max_size = 50
+    else:
+        fp_max_size = args.fp_max_size
+
+    if args.fp_min_size is None:
+        fp_min_size = 5
+    else:
+        fp_min_size = args.fp_min_size
+
+    if args.fp_ext is None:
+        fp_ext = 5
+    else:
+        fp_ext = args.fp_ext
+
+    if args.tc_ext is None:
+        tc_ext = 100
+    else:
+        tc_ext = args.tc_ext
+
     if args.paired_end:
-        fp_state = 8 if not args.fp_state else args.fp_state
+        if args.fp_state is None:
+            fp_state = 8
+        else:
+            fp_state = args.fp_state
     else:
-        fp_state = 6 if not args.fp_state else args.fp_state
+        if args.fp_state is None:
+            fp_state = 6
+        else:
+            fp_state = args.fp_state
 
     # Initializing result set
     footprints = GenomicRegionSet(args.output_prefix)
@@ -183,7 +241,7 @@ def atac_seq(args):
     fasta = Fastafile(genome_data.get_genome())
 
     if args.paired_end:
-        for region in regions:
+        for region in original_regions:
             input_sequence = list()
 
             signal_bc_f_max_145, signal_bc_r_max_145 = \
@@ -233,33 +291,38 @@ def atac_seq(args):
             input_sequence.append(signal_bc_r_min_145)
             input_sequence.append(signal_bc_r_min_145_slope)
 
-            posterior_list = hmm.predict(np.array(input_sequence).T)
+            # Applying HMM
+            try:
+                posterior_list = hmm.predict(np.array(input_sequence).T)
+            except Exception:
+                err.throw_warning("FP_HMM_APPLIC", add_msg="in region (" + ",".join([region.chrom, str(region.initial), str(
+                    region.final)]) + "). This iteration will be skipped.")
+                continue
 
             if args.fp_bed_fname is not None:
                 output_bed_file(region.chrom, region.initial, region.final, posterior_list, args.fp_bed_fname, fp_state)
 
             # Formatting results
             start_pos = 0
             flag_start = False
-            fp_state_nb = fp_state
             for k in range(region.initial, region.initial + len(posterior_list)):
                 curr_index = k - region.initial
                 if flag_start:
-                    if posterior_list[curr_index] != fp_state_nb:
+                    if posterior_list[curr_index] != fp_state:
                         if k - start_pos < fp_max_size:
                             fp = GenomicRegion(region.chrom, start_pos, k)
                             footprints.add(fp)
                         flag_start = False
                 else:
-                    if posterior_list[curr_index] == fp_state_nb:
+                    if posterior_list[curr_index] == fp_state:
                         flag_start = True
                         start_pos = k
             if flag_start:
                 if region.initial + len(posterior_list) - start_pos < fp_max_size:
                     fp = GenomicRegion(region.chrom, start_pos, region.final)
                     footprints.add(fp)
     else:
-        for region in regions:
+        for region in original_regions:
 
             input_sequence = list()
             atac_norm_f, atac_slope_f, atac_norm_r, atac_slope_r = \
@@ -273,24 +336,28 @@ def atac_seq(args):
             input_sequence.append(atac_norm_r)
             input_sequence.append(atac_slope_r)
 
-            posterior_list = hmm.predict(np.array(input_sequence).T)
+            try:
+                posterior_list = hmm.predict(np.array(input_sequence).T)
+            except Exception:
+                err.throw_warning("FP_HMM_APPLIC", add_msg="in region (" + ",".join([region.chrom, str(region.initial), str(
+                    region.final)]) + "). This iteration will be skipped.")
+                continue
 
             if args.fp_bed_fname is not None:
                 output_bed_file(region.chrom, region.initial, region.final, posterior_list, args.fp_bed_fname, fp_state)
             # Formatting results
             start_pos = 0
             flag_start = False
-            fp_state_nb = fp_state
             for k in range(region.initial, region.initial + len(posterior_list)):
                 curr_index = k - region.initial
                 if flag_start:
-                    if posterior_list[curr_index] != fp_state_nb:
+                    if posterior_list[curr_index] != fp_state:
                         if k - start_pos < fp_max_size:
                             fp = GenomicRegion(region.chrom, start_pos, k)
                             footprints.add(fp)
                         flag_start = False
                 else:
-                    if posterior_list[curr_index] == fp_state_nb:
+                    if posterior_list[curr_index] == fp_state:
                         flag_start = True
                         start_pos = k
             if flag_start: