From d6c5ab5246ffd7da26d90cb0e49bc107eee5c2fb Mon Sep 17 00:00:00 2001 From: lzj1769 Date: Fri, 5 Jun 2020 17:22:00 +0200 Subject: [PATCH 01/12] add smooth function --- rgt/HINT/DifferentialAnalysis.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/rgt/HINT/DifferentialAnalysis.py b/rgt/HINT/DifferentialAnalysis.py index 7c290e7f..88a38759 100644 --- a/rgt/HINT/DifferentialAnalysis.py +++ b/rgt/HINT/DifferentialAnalysis.py @@ -269,6 +269,8 @@ def get_raw_signal(arguments): if p1 <= cut_site < p2: signal[cut_site - p1] += 1.0 + signal = smooth(signal) + return signal @@ -296,6 +298,8 @@ def get_bc_signal(arguments): # smooth the signal signal = np.add(signal, np.array(_signal)) + signal = smooth(signal) + return signal @@ -630,3 +634,21 @@ def output_profiles(mpbs_name_list, signals, conditions, output_location): output_filename = os.path.join(output_location, "{}_{}.txt".format(condition, mpbs_name)) with open(output_filename, "w") as f: f.write("\t".join(map(str, signals[i][j])) + "\n") + + +def smooth(signal, window_size=5, rank=5): + k = len(signal) + zscore = stats.zscore(signal) + order = zscore.argsort() + ranks = order.argsort() + + signal[ranks > (k - rank)] = np.nan + + smooth_signal = np.zeros(k) + + for i in range(k): + a = max(0, i - round(window_size / 2)) + b = min(k, ceil(i + window_size / 2)) + smooth_signal[i] = np.nanmean(signal[a:b]) + + return smooth_signal From 315b746b6a3423a1a0b87e19cae3122feacee2a0 Mon Sep 17 00:00:00 2001 From: lzj1769 Date: Mon, 8 Jun 2020 09:33:55 +0200 Subject: [PATCH 02/12] fix import --- rgt/HINT/DifferentialAnalysis.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rgt/HINT/DifferentialAnalysis.py b/rgt/HINT/DifferentialAnalysis.py index 88a38759..0aa4b4b1 100644 --- a/rgt/HINT/DifferentialAnalysis.py +++ b/rgt/HINT/DifferentialAnalysis.py @@ -10,6 +10,7 @@ from scipy.stats import zscore from scipy.stats import norm +from scipy import stats from argparse import SUPPRESS from multiprocessing import Pool, cpu_count From 38feb635d8253894d5986ad8938d7919fd58ed2e Mon Sep 17 00:00:00 2001 From: lzj1769 Date: Thu, 9 Jul 2020 17:20:33 +0200 Subject: [PATCH 03/12] add option for differential footprinting --- rgt/HINT/DifferentialAnalysis.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/rgt/HINT/DifferentialAnalysis.py b/rgt/HINT/DifferentialAnalysis.py index 0aa4b4b1..fc23f094 100644 --- a/rgt/HINT/DifferentialAnalysis.py +++ b/rgt/HINT/DifferentialAnalysis.py @@ -65,6 +65,8 @@ def diff_analysis_args(parser): help="The prefix for results files. DEFAULT: differential") parser.add_argument("--standardize", action="store_true", default=False, help="If set, the signal will be rescaled to (0, 1) for plotting.") + parser.add_argument("--no-lineplots", default=False, action='store_true', + help="If set, the footprint line plots will not be generated. DEFAULT: False") parser.add_argument("--output-profiles", default=False, action='store_true', help="If set, the footprint profiles will be writen into a text, in which each row is a " "specific instance of the given motif. 
DEFAULT: False") @@ -206,7 +208,7 @@ def diff_analysis_run(args): print("signal generation is done!\n") - # compute normalization facotr for each condition + # compute normalization factor for each condition factors = compute_factors(signals) output_factor(args, factors, conditions) @@ -218,18 +220,19 @@ def diff_analysis_run(args): if args.output_profiles: output_profiles(mpbs_name_list, signals, conditions, args.output_location) - print("generating line plot for each motif...\n") - if args.nc == 1: - for i, mpbs_name in enumerate(mpbs_name_list): - output_line_plot((mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i], output_location, - args.window_size, colors)) - else: - with Pool(processes=args.nc) as pool: - arguments_list = list() + if not args.no_lineplots: + print("generating line plot for each motif...\n") + if args.nc == 1: for i, mpbs_name in enumerate(mpbs_name_list): - arguments_list.append((mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i], output_location, - args.window_size, colors)) - pool.map(output_line_plot, arguments_list) + output_line_plot((mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i], output_location, + args.window_size, colors)) + else: + with Pool(processes=args.nc) as pool: + arguments_list = list() + for i, mpbs_name in enumerate(mpbs_name_list): + arguments_list.append((mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i], output_location, + args.window_size, colors)) + pool.map(output_line_plot, arguments_list) ps_tc_results = list() for i, mpbs_name in enumerate(mpbs_name_list): From 91ae1021a2d6b4e5086c1459b51be8c52ef8b4ce Mon Sep 17 00:00:00 2001 From: Fabio Ticconi Date: Fri, 17 Jul 2020 18:55:01 +0200 Subject: [PATCH 04/12] upping to minor version; also improved GRS.subtract unittests --- rgt/GenomicRegionSet.py | 6 +- rgt/__version__.py | 2 +- unittest/test_GenomicRegionSet.py | 326 ++++++++++++++++++++---------- 3 files changed, 229 insertions(+), 105 deletions(-) diff --git a/rgt/GenomicRegionSet.py b/rgt/GenomicRegionSet.py index c8a02fc1..26fe40f1 100644 --- a/rgt/GenomicRegionSet.py +++ b/rgt/GenomicRegionSet.py @@ -1135,12 +1135,13 @@ def window(self, y, adding_length=1000): return extended_self.intersect(y) def subtract(self, y, whole_region=False, merge=True, exact=False): - """Return a GenomicRegionSet excluded the overlapping regions with y. + """Return a copy of this GenomicRegionSet minus all the regions overlapping with y. *Keyword arguments:* - y -- the GenomicRegionSet which to subtract by - whole_region -- subtract the whole region, not partially + - merge -- before subtracting, it merges any overlapping sequence within self or y - exact -- only regions which match exactly with a region within y are subtracted if set, whole_region and merge are completely ignored if set, the returned GRS is sorted and does not contain duplicates @@ -1273,10 +1274,11 @@ def subtract(self, y, whole_region=False, merge=True, exact=False): if len(self) == 0 or len(y) == 0: return self - # If there is overlap within self or y, they should be merged first. 
if not self.sorted: self.sort() + # if there is overlap within self or y, and the `merge` option is set, + # we merge any overlapping sequence and create two different GenomicRegionSets if merge: a = self.merge(w_return=True) b = y.merge(w_return=True) diff --git a/rgt/__version__.py b/rgt/__version__.py index f23a6b39..7e0dc0e8 100644 --- a/rgt/__version__.py +++ b/rgt/__version__.py @@ -1 +1 @@ -__version__ = "0.13.0" +__version__ = "0.13.1" diff --git a/unittest/test_GenomicRegionSet.py b/unittest/test_GenomicRegionSet.py index 48300b44..fb62bfa8 100644 --- a/unittest/test_GenomicRegionSet.py +++ b/unittest/test_GenomicRegionSet.py @@ -732,6 +732,7 @@ def test_subtract(self): [['chr1', 6, 15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) + """ A : ------ B : none @@ -743,6 +744,7 @@ def test_subtract(self): self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 6) self.assertEqual(result[0].final, 15) + """ A : ------ B : ------ @@ -754,6 +756,7 @@ def test_subtract(self): self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 6) + """ A : ------ B : ------ @@ -765,6 +768,7 @@ def test_subtract(self): self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 10) self.assertEqual(result[0].final, 15) + """ A : --- B : --------- @@ -774,6 +778,7 @@ def test_subtract(self): [['chr1', 1, 15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) + """ A : --------- B : --- @@ -787,6 +792,7 @@ def test_subtract(self): self.assertEqual(result[0].final, 6) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 15) + """ A : ------ B : ------ @@ -796,6 +802,7 @@ def test_subtract(self): [['chr1', 6, 15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) + """ A : ---------- ------ B : ---------- ---- @@ -809,6 +816,7 @@ def test_subtract(self): self.assertEqual(result[0].final, 20) self.assertEqual(result[1].initial, 70) self.assertEqual(result[1].final, 85) + """ A : ------ ----- B : ------ @@ -822,6 +830,7 @@ def test_subtract(self): self.assertEqual(result[0].final, 30) self.assertEqual(result[1].initial, 35) self.assertEqual(result[1].final, 55) + """ A : ch1 --------------------- ch2 ------------------------- @@ -840,6 +849,7 @@ def test_subtract(self): self.assertEqual(result[1].final, 30000) self.assertEqual(result[2].initial, 0) self.assertEqual(result[2].final, 31000) + """ A : ----------------------------------------------------------- B : --- --------- ---- ---- @@ -863,7 +873,6 @@ def test_subtract(self): [['chr1', 10, 15], ['chr1', 30, 70], ['chr1', 120, 140], ['chr1', 200, 240]]) result = self.setA.subtract(self.setB) res_list = [(r.initial, r.final) for r in result] - self.assertListEqual(res_list, [(5, 10), (15, 30), (70, 120), (140, 150), (180, 200)]) """ @@ -879,7 +888,6 @@ def test_subtract(self): [['chr1', 10, 15], ['chr1', 30, 70], ['chr1', 120, 140], ['chr1', 200, 240]]) result = self.setA.subtract(self.setB, merge=False) res_list = [(r.initial, r.final) for r in result] - self.assertListEqual(res_list, [(5, 10), (15, 30), (70, 100), (20, 30), (70, 80), (95, 120), (140, 150), (180, 200)]) @@ -896,115 +904,229 @@ def test_subtract(self): self.assertEqual(len(result), 15) # test exact subtraction + # the arguments merge and whole_region must be ignored, so we set their defaults in the most disrupting way: + # whole_region=False: allows partial subtraction (unless exact=True) + # merge=True: any overlapping 
regions in the two GRSs are merged (unless exact=True) - # only one region to subtract, 2 regions before that one, 1 region after it, region is duplicated - self.region_sets([['chr1', 10, 13], ['chr1', 10, 15], ['chr1', 10, 20], ['chr1', 10, 20], ['chr1', 12, 30]], - [['chr1', 10, 20]]) - result = self.setA.subtract(self.setB, exact=True) - self.assertEqual(len(result), 3) - self.assertEqual(result[0].chrom, 'chr1') - self.assertEqual(result[0].initial, 10) - self.assertEqual(result[0].final, 13) - self.assertEqual(result[1].chrom, 'chr1') - self.assertEqual(result[1].initial, 10) - self.assertEqual(result[1].final, 15) - self.assertEqual(result[2].chrom, 'chr1') - self.assertEqual(result[2].initial, 12) - self.assertEqual(result[2].final, 30) - - # subtract two different but overlapping regions, one duplicate region in GRS - self.region_sets([['chr1', 10, 13], ['chr1', 10, 13], ['chr1', 10, 15], ['chr1', 10, 20], ['chr1', 12, 30], - ['chr5', 4, 64]], [['chr1', 10, 20], ['chr1', 12, 30]]) - result = self.setA.subtract(self.setB, exact=True) + """ + Subtracting one region (and its duplicate), ignoring regions before/after and overlapping + + A : --- + ----- + ---------- + ---------- + ------------------ + ----- + B : ---------- + R : --- + ----- + ------------------ + ----- + """ + self.region_sets([['chr1', 10, 13], ['chr1', 13, 18], + ['chr1', 13, 23], ['chr1', 13, 23], + ['chr1', 15, 35], ['chr1', 25, 30]], + [['chr1', 13, 23]]) + result = self.setA.subtract(self.setB, exact=True, merge=True, whole_region=False) + self.assertEqual(len(result), 4) + res_list = [(r.initial, r.final) for r in result] + self.assertListEqual(res_list, [(10, 13), (13, 18), (15, 35), (25, 30)]) + + """ + Subtracting two different but overlapping regions, and one duplicate region gets removed + + A : --- + ----- + ---------- + ---------- + ------------------ + ----- + B : ----- + ------------------ + R : --- + ---------- + ----- + """ + self.region_sets([['chr1', 10, 13], ['chr1', 13, 18], + ['chr1', 13, 23], ['chr1', 13, 23], + ['chr1', 15, 35], ['chr1', 25, 30]], + [['chr1', 13, 18], ['chr1', 15, 35]]) + result = self.setA.subtract(self.setB, exact=True, merge=True, whole_region=False) self.assertEqual(len(result), 3) - self.assertEqual(result[0].chrom, 'chr1') - self.assertEqual(result[0].initial, 10) - self.assertEqual(result[0].final, 13) - self.assertEqual(result[1].chrom, 'chr1') - self.assertEqual(result[1].initial, 10) - self.assertEqual(result[1].final, 15) - self.assertEqual(result[2].chrom, 'chr5') - self.assertEqual(result[2].initial, 4) - self.assertEqual(result[2].final, 64) - - # subtract two different but not overlapping regions, one duplicate region in GRS - self.region_sets([['chr1', 10, 13], ['chr1', 10, 13], ['chr1', 10, 15], ['chr1', 10, 20], ['chr2', 12, 30], - ['chr5', 4, 64]], [['chr1', 10, 20], ['chr2', 12, 30]]) - result = self.setA.subtract(self.setB, exact=True) + res_list = [(r.initial, r.final) for r in result] + self.assertListEqual(res_list, [(10, 13), (13, 23), (25, 30)]) + + """ + Subtracting two different but overlapping regions that are on different chromosomes, + and one duplicate region gets removed + + A : --- + ----- + ---------- + ---------- + ------------------ + ----- + B : ----- (chr5) + ------------------ (chr9) + R : --- + ----- + ---------- + ------------------ + ----- + """ + self.region_sets([['chr1', 10, 13], ['chr1', 13, 18], + ['chr1', 13, 23], ['chr1', 13, 23], + ['chr1', 15, 35], ['chr1', 25, 30]], + [['chr5', 13, 18], ['chr9', 15, 35]]) + result = 
self.setA.subtract(self.setB, exact=True, merge=True, whole_region=False) + self.assertEqual(len(result), 5) + res_list = [(r.initial, r.final) for r in result] + self.assertListEqual(res_list, [(10, 13), (13, 18), (13, 23), (15, 35), (25, 30)]) + + """ + Subtracting two different but not overlapping regions, and one duplicate region gets removed + + A : --- + ----- + ---------- + ------------------ + ------------------ + ----- + B : --- ----- + R : ----- + ---------- + ------------------ + """ + self.region_sets([['chr1', 10, 13], ['chr1', 13, 18], + ['chr1', 13, 23], ['chr1', 15, 35], + ['chr1', 15, 35], ['chr1', 25, 30]], + [['chr1', 10, 13], ['chr1', 25, 30]]) + result = self.setA.subtract(self.setB, exact=True, merge=True, whole_region=False) self.assertEqual(len(result), 3) - self.assertEqual(result[0].chrom, 'chr1') - self.assertEqual(result[0].initial, 10) - self.assertEqual(result[0].final, 13) - self.assertEqual(result[1].chrom, 'chr1') - self.assertEqual(result[1].initial, 10) - self.assertEqual(result[1].final, 15) - self.assertEqual(result[2].chrom, 'chr5') - self.assertEqual(result[2].initial, 4) - self.assertEqual(result[2].final, 64) + res_list = [(r.initial, r.final) for r in result] + self.assertListEqual(res_list, [(13, 18), (13, 23), (15, 35)]) + + """ + Subtracting two different but not overlapping regions that are on different chromosomes, + and one duplicate region gets removed + + A : --- + ----- + ---------- + ------------------ + ------------------ + ----- + B : --- ----- + chr7 chr3 + R : ----- + ---------- + ------------------ + """ + self.region_sets([['chr1', 10, 13], ['chr1', 13, 18], + ['chr1', 13, 23], ['chr1', 15, 35], + ['chr1', 15, 35], ['chr1', 25, 30]], + [['chr7', 10, 13], ['chr3', 25, 30]]) + result = self.setA.subtract(self.setB, exact=True, merge=True, whole_region=False) + self.assertEqual(len(result), 5) + res_list = [(r.initial, r.final) for r in result] + self.assertListEqual(res_list, [(10, 13), (13, 18), (13, 23), (15, 35), (25, 30)]) + + """ + Start from unsorted GRS. Subtracting the same region twice. Verify that output GRS is sorted. 
+ + A : --- ----- + ---------- + ------------------ (chr2) + ------------------ + ----- + + B : ---------- + ---------- + R : --- + ----- + ------------------ + ----- + ------------------ (chr2) + """ + self.region_sets([['chr1', 10, 13], ['chr1', 25, 30], + ['chr1', 13, 23], ['chr2', 15, 35], + ['chr1', 15, 35], ['chr1', 13, 18]], + [['chr1', 13, 23], ['chr1', 13, 23]]) + result = self.setA.subtract(self.setB, exact=True, merge=True, whole_region=False) + self.assertEqual(len(result), 5) + res_list = [(r.chrom, r.initial, r.final) for r in result] + self.assertListEqual(res_list, [('chr1', 10, 13), ('chr1', 13, 18), ('chr1', 15, 35), ('chr1', 25, 30), + ('chr2', 15, 35)]) + + """ + Subtracting one region that is on same chromosome, but not in A + + A : --- + ----- + ---------- + ---------- + ------------------ + ----- + B : ----- + R : --- + ----- + ---------- + ------------------ + ----- + """ + self.region_sets([['chr1', 10, 13], ['chr1', 13, 18], + ['chr1', 13, 23], ['chr1', 13, 23], + ['chr1', 15, 35], ['chr1', 25, 30]], + [['chr1', 36, 41]]) + result = self.setA.subtract(self.setB, exact=True, merge=True, whole_region=False) + self.assertEqual(len(result), 5) + res_list = [(r.initial, r.final) for r in result] + self.assertListEqual(res_list, [(10, 13), (13, 18), (13, 23), (15, 35), (25, 30)]) + """ + Subtracting multiple regions from a 1-sized GRS - # subtract "same" region but on different chromosome - self.region_sets([['chr4', 6, 97]], [['chr7', 6, 97]]) - result = self.setA.subtract(self.setB, exact=True) - self.assertEqual(len(result), 1) - self.assertEqual(result[0].chrom, 'chr4') - self.assertEqual(result[0].initial, 6) - self.assertEqual(result[0].final, 97) + A : ------------------ + B : --- + ----- + ---------- + ---------- + ------------------ + ----- + R : + """ + self.region_sets([['chr1', 15, 35]], + [['chr1', 10, 13], ['chr1', 13, 18], + ['chr1', 13, 23], ['chr1', 13, 23], + ['chr1', 15, 35], ['chr1', 25, 30]]) + result = self.setA.subtract(self.setB, exact=True, merge=True, whole_region=False) + self.assertEqual(len(result), 0) - # subtract one duplicated region and test ordering of resulting GRS - self.region_sets([['chr2', 10, 13], ['chr2', 10, 13], ['chr1', 10, 15], ['chr1', 10, 20], ['chr1', 12, 30]], - [['chr1', 10, 20], ['chr1', 10, 20]]) - result = self.setA.subtract(self.setB, exact=True) - self.assertEqual(len(result), 3) - self.assertEqual(result[0].chrom, 'chr1') - self.assertEqual(result[0].initial, 10) - self.assertEqual(result[0].final, 15) - self.assertEqual(result[1].chrom, 'chr1') - self.assertEqual(result[1].initial, 12) - self.assertEqual(result[1].final, 30) - self.assertEqual(result[2].chrom, 'chr2') - self.assertEqual(result[2].initial, 10) - self.assertEqual(result[2].final, 13) + """ + Start from unsorted GRS. Subtract a few regions (also unsorted). Verify that output GRS is sorted - # subtract region that is not included in GRS - self.region_sets([['chr1', 7, 15], ['chr3', 8, 46], ['chr9', 3, 38]], [['chr3', 7, 19]]) - result = self. 
setA.subtract(self.setB, exact=True) - self.assertEqual(len(result), 3) - self.assertEqual(result[0].chrom, 'chr1') - self.assertEqual(result[0].initial, 7) - self.assertEqual(result[0].final, 15) - self.assertEqual(result[1].chrom, 'chr3') - self.assertEqual(result[1].initial, 8) - self.assertEqual(result[1].final, 46) - self.assertEqual(result[2].chrom, 'chr9') - self.assertEqual(result[2].initial, 3) - self.assertEqual(result[2].final, 38) - - # subtract three different regions from one GRS of length 1 - """ - first region ----------- - second region ----- - third region ----- - GRS ----- - """ - self.region_sets([['chr4', 46, 94]], [['chr4', 2, 100], ['chr4', 29, 80], ['chr4', 46, 94]]) - result = self.setA.subtract(self.setB, exact=True) - self.assertEqual(len(result), 0) + A : ----- + ---------- + ------------------ (chr2) + ------------------ + ----- + --- - # subtract unordered - self.region_sets([['chr12', 75, 117], ['chr1', 3, 8], ['chr1', 48, 56], ['chr1', 20, 36], ['chr12', 8, 15]], - [['chr12', 8, 15], ['chr6', 17, 42], ['chr1', 48, 56], ['chr4', 75, 117]]) - result = self.setA.subtract(self.setB, exact=True) + B : ---------- + ----- + ------------------ + R : --- + ----- + ------------------ (chr2) + """ + self.region_sets([['chr1', 25, 30], ['chr1', 13, 23], + ['chr2', 15, 35], ['chr1', 15, 35], + ['chr1', 13, 18], ['chr1', 10, 13]], + [['chr1', 13, 23], ['chr1', 25, 30], ['chr1', 15, 35]]) + result = self.setA.subtract(self.setB, exact=True, merge=True, whole_region=False) self.assertEqual(len(result), 3) - self.assertEqual(result[0].chrom, 'chr1') - self.assertEqual(result[0].initial, 3) - self.assertEqual(result[0].final, 8) - self.assertEqual(result[1].chrom, 'chr1') - self.assertEqual(result[1].initial, 20) - self.assertEqual(result[1].final, 36) - self.assertEqual(result[2].chrom, 'chr12') - self.assertEqual(result[2].initial, 75) - self.assertEqual(result[2].final, 117) + res_list = [(r.chrom, r.initial, r.final) for r in result] + self.assertListEqual(res_list, [('chr1', 10, 13), ('chr1', 13, 18), ('chr2', 15, 35)]) def test_merge(self): """ From 9b3f99b4d26fea53a77554c400aa24c5523620a2 Mon Sep 17 00:00:00 2001 From: Fabio Ticconi Date: Mon, 20 Jul 2020 21:36:58 +0200 Subject: [PATCH 05/12] #154 removing references to gene_regions --- setup.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index dcd0dfeb..385d852d 100644 --- a/setup.py +++ b/setup.py @@ -274,14 +274,14 @@ def recursive_chown_chmod(path_to_walk, uid, gid, file_permission, path_permissi data_config_file.write("[" + genome + "]\n") data_config_file.write("genome: " + path.join(genome, "genome_zv9_ensembl_release_79.fa\n")) data_config_file.write("chromosome_sizes: " + path.join(genome, "chrom.sizes.zv9\n")) -data_config_file.write("gene_regions: " + path.join(genome, "genes_zv9.bed\n")) +data_config_file.write("genes_RefSeq: " + path.join(genome, "genes_zv9.bed\n")) data_config_file.write("annotation: " + path.join(genome, "Danio_rerio.Zv9.79.gtf\n")) data_config_file.write("gene_alias: " + path.join(genome, "alias_zebrafish.txt\n\n")) genome = "zv10" data_config_file.write("[" + genome + "]\n") data_config_file.write("genome: " + path.join(genome, "genome_zv10_ensembl_release_84.fa\n")) data_config_file.write("chromosome_sizes: " + path.join(genome, "chrom.sizes.zv10\n")) -data_config_file.write("gene_regions: " + path.join(genome, "genes_zv10.bed\n")) +data_config_file.write("genes_RefSeq: " + path.join(genome, "genes_zv10.bed\n")) 
data_config_file.write("annotation: " + path.join(genome, "Danio_rerio.GRCz10.84.gtf\n")) data_config_file.write("gene_alias: " + path.join(genome, "alias_zebrafish.txt\n\n")) @@ -327,7 +327,8 @@ def recursive_chown_chmod(path_to_walk, uid, gid, file_permission, path_permissi user_config_file.write("#[" + genome + "]\n") user_config_file.write("#genome: undefined\n") user_config_file.write("#chromosome_sizes: undefined\n") - user_config_file.write("#gene_regions: undefined\n") + user_config_file.write("#genes_Gencode: undefined\n") + user_config_file.write("#genes_RefSeq: undefined\n") user_config_file.write("#annotation: undefined\n") user_config_file.write("#gene_alias: undefined\n\n") From 9adcfd995976b1bd618980dd2b625213c3c5405e Mon Sep 17 00:00:00 2001 From: Fabio Ticconi Date: Tue, 21 Jul 2020 19:15:16 +0200 Subject: [PATCH 06/12] #155 better motif analysis output --- rgt/motifanalysis/Enrichment.py | 19 ++++++++++++++----- rgt/motifanalysis/Match.py | 33 +++++++++++++++++++++------------ 2 files changed, 35 insertions(+), 17 deletions(-) diff --git a/rgt/motifanalysis/Enrichment.py b/rgt/motifanalysis/Enrichment.py index 7992b279..d705f3ba 100644 --- a/rgt/motifanalysis/Enrichment.py +++ b/rgt/motifanalysis/Enrichment.py @@ -147,6 +147,7 @@ def main(args): err.throw_error("ME_MATCH_NOTFOUND") # Background file must exist + print(">> loading background file..") background_original_filename = background_filename if not os.path.isfile(background_filename): err.throw_error("DEFAULT_ERROR", add_msg="Background file does not exist or is not readable.") @@ -159,9 +160,10 @@ def main(args): # Background MPBS file must exist path, ext = os.path.splitext(background_filename) + background_name = os.path.basename(path) # first we check at matching folder location - background_mpbs_filename = os.path.join(match_location, os.path.basename(path) + "_mpbs" + ext) + background_mpbs_filename = os.path.join(match_location, background_name + "_mpbs" + ext) if not os.path.isfile(background_mpbs_filename): # if not found, we search at background file location @@ -184,6 +186,10 @@ def main(args): else: err.throw_error("DEFAULT_ERROR", add_msg="Background MPBS file must be in either BED or BigBed format.") + background = GenomicRegionSet("background") + background.read(background_filename) + print(">>>", background_name, ",", len(background), "regions") + # Default genomic data genome_data = GenomeData(args.organism) @@ -191,6 +197,7 @@ def main(args): # Load motif_set (includes MotifData object), is either customized or default if args.motif_dbs: + print(">> loading external motif databases..") # args.motif_dbs is a list of paths to pwm files motif_set = MotifSet(preload_motifs=args.motif_dbs, motif_dbs=True) @@ -198,15 +205,18 @@ def main(args): if 'database' in filter_values: del filter_values['database'] else: + print(">> loading motif databases..") if 'database' in filter_values: motif_set = MotifSet(preload_motifs=filter_values['database']) else: motif_set = MotifSet(preload_motifs="default") - print(">> used database(s):", ",".join([str(db) for db in motif_set.motif_data.repositories_list])) + for db in motif_set.motif_data.get_repositories_list(): + print(">>>", db) # applying filtering pattern, taking a subset of the motif set if args.filter: + print(">> applying motif filter..") motif_set = motif_set.filter(filter_values, search=args.filter_type) motif_names = list(motif_set.motifs_map.keys()) @@ -249,6 +259,7 @@ def main(args): except Exception: err.throw_error("MM_WRONG_EXPMAT") elif 
input_files: + print(">> loading input files..") # get input files, if available for input_filename in input_files: name, _ = os.path.splitext(os.path.basename(input_filename)) @@ -258,7 +269,7 @@ def main(args): genomic_regions_dict[name] = regions - print(">> input regions BED files loaded") + print(">>>", name, ",", len(regions), "regions") if not genomic_regions_dict: err.throw_error("DEFAULT_ERROR", add_msg="You must either specify an experimental matrix, " @@ -352,8 +363,6 @@ def main(args): start = time.time() print(">> collecting background statistics...", sep="", end="") sys.stdout.flush() - background = GenomicRegionSet("background") - background.read(background_filename) background_mpbs = GenomicRegionSet("background_mpbs") background_mpbs.read(background_mpbs_filename) diff --git a/rgt/motifanalysis/Match.py b/rgt/motifanalysis/Match.py index 99f7d373..efa9de63 100644 --- a/rgt/motifanalysis/Match.py +++ b/rgt/motifanalysis/Match.py @@ -167,6 +167,7 @@ def main(args): except Exception: err.throw_error("MM_WRONG_EXPMAT") elif args.input_files: + print(">> loading input files..") # get input files, if available for input_filename in args.input_files: name, _ = os.path.splitext(os.path.basename(input_filename)) @@ -176,7 +177,7 @@ def main(args): genomic_regions_dict[name] = regions - print(">>> input file", name, "loaded:", len(regions), "regions") + print(">>>", name, ",", len(regions), "regions") # we put this here because we don't want to create the output directory unless we # are sure the initialisation (including loading input files) worked @@ -191,6 +192,7 @@ def main(args): # get promoter regions from list of genes (both target and background) # TODO: should be more clever, allow precomputed regions etc if args.target_genes_filename: + print(">> creating target promoter file..") annotation = AnnotationSet(args.organism, alias_source=args.organism, protein_coding=True, known_only=True) @@ -206,11 +208,12 @@ def main(args): genomic_regions_dict[target_regions.name] = target_regions - print(">>> target promoter file created:", len(target_regions), "regions") + print(">>>", len(target_regions), "regions") # we make a background in case it's requested, but also in case a list of target genes has not been # provided if args.promoter_make_background or (args.promoters_only and not args.target_genes_filename): + print(">> creating background promoter file..") if not annotation: annotation = AnnotationSet(args.organism, alias_source=args.organism, protein_coding=True, known_only=True) @@ -231,7 +234,7 @@ def main(args): genomic_regions_dict[background_regions.name] = background_regions - print(">>> background promoter file created:", len(background_regions), "regions") + print(">>>", len(background_regions), "regions") if not genomic_regions_dict: err.throw_error("DEFAULT_ERROR", add_msg="You must either specify an experimental matrix, or at least a " @@ -265,14 +268,13 @@ def main(args): max_region_len = curr_len max_region = curr_genomic_region - print(">> all files loaded") - ################################################################################################### # Creating random regions ################################################################################################### # if a random proportion is set, create random regions if args.rand_proportion: + print(">> creating random regions file..") # Create random coordinates and name it random_regions rand_region = max_region.random_regions(args.organism, multiply_factor=args.rand_proportion, 
chrom_X=True) @@ -302,30 +304,37 @@ def main(args): except Exception: err.throw_warning("DEFAULT_WARNING") # FIXME: maybe error instead? - print(">> random regions file created:", len(rand_region), "regions") + print(">>>", len(rand_region), "regions") ################################################################################################### # Creating PWMs ################################################################################################### + # Load motif_set (includes MotifData object), is either customized or default if args.motif_dbs: - ms = MotifSet(preload_motifs=args.motif_dbs, motif_dbs=True) + print(">> loading external motif databases..") + # args.motif_dbs is a list of paths to pwm files + motif_set = MotifSet(preload_motifs=args.motif_dbs, motif_dbs=True) + # filter for dbs only if --motif_dbs is not set if 'database' in filter_values: del filter_values['database'] else: + print(">> loading motif databases..") if 'database' in filter_values: - ms = MotifSet(preload_motifs=filter_values['database']) + motif_set = MotifSet(preload_motifs=filter_values['database']) else: - ms = MotifSet(preload_motifs="default") + motif_set = MotifSet(preload_motifs="default") - print(">> used database(s):", ",".join([str(db) for db in ms.motif_data.repositories_list])) + for db in motif_set.motif_data.get_repositories_list(): + print(">>>", db) # applying filtering pattern, taking a subset of the motif set if args.filter: - ms = ms.filter(filter_values, search=args.filter_type) + print(">> applying motif filter..") + motif_set = motif_set.filter(filter_values, search=args.filter_type) - motif_list = ms.get_motif_list(args.pseudocounts, args.fpr) + motif_list = motif_set.get_motif_list(args.pseudocounts, args.fpr) print(">> motifs loaded:", len(motif_list)) From e7a0aa19f55e7d11d2a3bc5544fb130bebb8b3d0 Mon Sep 17 00:00:00 2001 From: Fabio Ticconi Date: Tue, 21 Jul 2020 22:29:00 +0200 Subject: [PATCH 07/12] #155 further changes to matching and enrichment output --- rgt/motifanalysis/Enrichment.py | 27 +++++++++++++++------------ rgt/motifanalysis/Match.py | 23 ++++++++++++++--------- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/rgt/motifanalysis/Enrichment.py b/rgt/motifanalysis/Enrichment.py index d705f3ba..7f814f53 100644 --- a/rgt/motifanalysis/Enrichment.py +++ b/rgt/motifanalysis/Enrichment.py @@ -135,12 +135,20 @@ def main(args): output_location = args.output_location else: output_location = os.path.join(os.getcwd(), enrichment_folder_name) + print(">> output location:", output_location) # Matching folder if args.matching_location: match_location = args.matching_location else: match_location = os.path.join(os.getcwd(), matching_folder_name) + print(">> match location:", match_location) + print() + + # Default genomic data + genome_data = GenomeData(args.organism) + print(">> genome:", genome_data.organism) + print() # the matching directory must exist, for obvious reasons if not os.path.isdir(match_location): @@ -188,12 +196,8 @@ def main(args): background = GenomicRegionSet("background") background.read(background_filename) - print(">>>", background_name, ",", len(background), "regions") - - # Default genomic data - genome_data = GenomeData(args.organism) - - print(">> genome:", genome_data.organism) + print(">>> ", background_name, ", ", len(background), " regions", sep="") + print() # Load motif_set (includes MotifData object), is either customized or default if args.motif_dbs: @@ -213,6 +217,7 @@ def main(args): for db in 
motif_set.motif_data.get_repositories_list(): print(">>>", db) + print() # applying filtering pattern, taking a subset of the motif set if args.filter: @@ -220,8 +225,8 @@ def main(args): motif_set = motif_set.filter(filter_values, search=args.filter_type) motif_names = list(motif_set.motifs_map.keys()) - print(">> motifs loaded:", len(motif_names)) + print() # Default image data image_data = ImageData() @@ -255,6 +260,7 @@ def main(args): del exp_matrix print(">> experimental matrix loaded") + print() except Exception: err.throw_error("MM_WRONG_EXPMAT") @@ -269,7 +275,8 @@ def main(args): genomic_regions_dict[name] = regions - print(">>>", name, ",", len(regions), "regions") + print(">>> ", name, ", ", len(regions), " regions", sep="") + print() if not genomic_regions_dict: err.throw_error("DEFAULT_ERROR", add_msg="You must either specify an experimental matrix, " @@ -358,8 +365,6 @@ def main(args): except Exception: err.throw_error("ME_OUT_FOLDER_CREATION") - print() - start = time.time() print(">> collecting background statistics...", sep="", end="") sys.stdout.flush() @@ -404,8 +409,6 @@ def main(args): link_name = grs.name sitetest_link_dict[link_name] = os.path.join(link_location, link_name, output_stat_fulltest + ".html") - print() - # Iterating on each input object for curr_input in input_list: diff --git a/rgt/motifanalysis/Match.py b/rgt/motifanalysis/Match.py index efa9de63..bfed7258 100644 --- a/rgt/motifanalysis/Match.py +++ b/rgt/motifanalysis/Match.py @@ -139,6 +139,7 @@ def main(args): else: output_location = npath(matching_folder_name) print(">> output location:", output_location) + print() # Default genomic data genome_data = GenomeData(args.organism) @@ -146,6 +147,7 @@ def main(args): print(">> genome:", genome_data.organism) print(">> pseudocounts:", args.pseudocounts) print(">> fpr threshold:", args.fpr) + print() ################################################################################################### # Reading Input Regions @@ -162,7 +164,8 @@ def main(args): # if the matrix is present, the (empty) dictionary is overwritten genomic_regions_dict = exp_matrix.objectsDict - print(">>> experimental matrix loaded") + print(">> experimental matrix loaded") + print() except Exception: err.throw_error("MM_WRONG_EXPMAT") @@ -177,7 +180,8 @@ def main(args): genomic_regions_dict[name] = regions - print(">>>", name, ",", len(regions), "regions") + print(">>> ", name, ", ", len(regions), " regions", sep="") + print() # we put this here because we don't want to create the output directory unless we # are sure the initialisation (including loading input files) worked @@ -208,7 +212,8 @@ def main(args): genomic_regions_dict[target_regions.name] = target_regions - print(">>>", len(target_regions), "regions") + print(">>> ", len(target_regions), " regions", sep="") + print() # we make a background in case it's requested, but also in case a list of target genes has not been # provided @@ -234,7 +239,8 @@ def main(args): genomic_regions_dict[background_regions.name] = background_regions - print(">>>", len(background_regions), "regions") + print(">>> ", len(background_regions), " regions", sep="") + print() if not genomic_regions_dict: err.throw_error("DEFAULT_ERROR", add_msg="You must either specify an experimental matrix, or at least a " @@ -304,7 +310,8 @@ def main(args): except Exception: err.throw_warning("DEFAULT_WARNING") # FIXME: maybe error instead? 
- print(">>>", len(rand_region), "regions") + print(">>> ", len(rand_region), " regions", sep="") + print() ################################################################################################### # Creating PWMs @@ -328,6 +335,7 @@ def main(args): for db in motif_set.motif_data.get_repositories_list(): print(">>>", db) + print() # applying filtering pattern, taking a subset of the motif set if args.filter: @@ -335,8 +343,8 @@ def main(args): motif_set = motif_set.filter(filter_values, search=args.filter_type) motif_list = motif_set.get_motif_list(args.pseudocounts, args.fpr) - print(">> motifs loaded:", len(motif_list)) + print() # Performing normalized threshold strategy if requested if args.norm_threshold: @@ -372,8 +380,6 @@ def main(args): # Creating genome file genome_file = Fastafile(genome_data.get_genome()) - print() - # Iterating on list of genomic region sets for grs in regions_to_match: @@ -450,7 +456,6 @@ def main(args): secs = time.time() - start print("[", "%02.3f" % secs, " seconds]", sep="") - # TODO must double-check/fix the normalisation. def match_multiple(scanner, motifs, sequence, genomic_region, unique_threshold=None, normalize_bitscore=False, output=None): """ From 6878811d2e2013816abbb9e81145f8a22f3bb177 Mon Sep 17 00:00:00 2001 From: Fabio Ticconi Date: Wed, 22 Jul 2020 16:49:29 +0200 Subject: [PATCH 08/12] fixing #155 with improved matching/enrichment output --- test/motif.sh | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/test/motif.sh b/test/motif.sh index dbb16c79..41d0c91b 100755 --- a/test/motif.sh +++ b/test/motif.sh @@ -9,11 +9,13 @@ mkdir -p $DIR cd ${DIR} echo "**********************************************" -echo "Testing Motif Analysis" -echo +echo "******** Testing Motif Analysis tool *********" +echo "**********************************************" # full-site test -echo "Full-Site Test:" +echo +echo "# Full-Site Test" +echo url="http://www.regulatory-genomics.org/wp-content/uploads/2017/03/RGT_MotifAnalysis_FullSiteTest.tar.gz" @@ -29,14 +31,14 @@ fi # Run test script cd $DEST -echo "Running matching.." +echo "## MATCHING" echo -echo ".. with filter" +echo "### with filter" rgt-motifanalysis matching --organism hg19 --filter "database:jaspar_vertebrates" --input-files input/regions_K562.bed input/background.bed mv match/background_mpbs.bed match/background_mpbs_filter.bed mv match/regions_K562_mpbs.bed match/regions_K562_mpbs_filter.bed echo -echo ".. without filter" +echo "### without filter" rgt-motifanalysis matching --organism hg19 --input-files input/regions_K562.bed input/background.bed a=$(sort match/background_mpbs.bed | md5sum | awk '{ print $1 }') @@ -47,28 +49,28 @@ d=$(sort match/regions_K562_mpbs_filter.bed | md5sum | awk '{ print $1 }') echo if [ ${a} != ${b} ] then - echo "filter error in background_mpbs:" + echo "#### filter error in background_mpbs:" echo ${a} echo ${b} fi echo if [ ${c} != ${d} ] then - echo "filter error in regions_K562_mpbs:" + echo "#### filter error in regions_K562_mpbs:" echo ${c} echo ${d} fi echo -echo "Running enrichment.." +echo "## ENRICHMENT" rgt-motifanalysis enrichment --organism hg19 input/background.bed input/regions_K562.bed - echo # Promoter test cd $DIR -echo "Promoter Test:" +echo "# Promoter Test" +echo url="http://www.regulatory-genomics.org/wp-content/uploads/2017/03/RGT_MotifAnalysis_PromoterTest.tar.gz" @@ -84,15 +86,16 @@ fi # Run test script cd $DEST -echo "Running matching.." 
+echo "## MATCHING" rgt-motifanalysis matching --organism hg19 --make-background --target-genes input/genes.txt --promoters-only -echo "Running enrichment.." +echo "## ENRICHMENT" rgt-motifanalysis enrichment --organism hg19 --logo-copy match/background_regions.bed match/target_regions.bed echo # Gene-association test -echo "Gene Association Test:" +echo "# Gene Association Test:" +echo url="http://www.regulatory-genomics.org/wp-content/uploads/2017/03/RGT_MotifAnalysis_GeneAssocTest.tar.gz" @@ -109,9 +112,9 @@ fi # Run test script cd $DEST -echo "Running matching.." +echo "## MATCHING" rgt-motifanalysis matching --organism hg19 --input-matrix input_matrix.txt --rand-proportion 10 -echo "Running enrichment.." +echo "## ENRICHMENT" rgt-motifanalysis enrichment --organism hg19 --logo-embed --input-matrix input_matrix.txt match/random_regions.bed echo From f003aa30b0e735fc660e70e7e2409db75e09b232 Mon Sep 17 00:00:00 2001 From: Fabio Ticconi Date: Wed, 22 Jul 2020 18:47:18 +0200 Subject: [PATCH 09/12] #154 converted genecode/refseq duplicate entry to a single gene_regions --- rgt/Util.py | 13 ++----------- setup.py | 23 +++++++++++------------ 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/rgt/Util.py b/rgt/Util.py index 8d42fe43..4cb48e5a 100644 --- a/rgt/Util.py +++ b/rgt/Util.py @@ -90,8 +90,7 @@ def __init__(self, organism): self.organism = organism self.genome = os.path.join(self.data_dir, self.config.get(organism, 'genome')) self.chromosome_sizes = os.path.join(self.data_dir, self.config.get(organism, 'chromosome_sizes')) - self.genes_gencode = os.path.join(self.data_dir, self.config.get(organism, 'genes_Gencode')) - self.genes_refseq = os.path.join(self.data_dir, self.config.get(organism, 'genes_RefSeq')) + self.gene_regions = os.path.join(self.data_dir, self.config.get(organism, 'gene_regions')) self.annotation = os.path.join(self.data_dir, self.config.get(organism, 'annotation')) self.annotation_dump_dir = os.path.dirname(os.path.join(self.data_dir, self.annotation)) self.gene_alias = os.path.join(self.data_dir, self.config.get(organism, 'gene_alias')) @@ -114,15 +113,7 @@ def get_chromosome_sizes(self): def get_gene_regions(self): """Returns the current path to the gene_regions BED file.""" - return self.genes_gencode - - def get_genes_gencode(self): - """Returns the current path to the gene_regions BED file.""" - return self.genes_gencode - - def get_genes_refseq(self): - """Returns the current path to the gene_regions BED file.""" - return self.genes_refseq + return self.gene_regions def get_annotation(self): """ diff --git a/setup.py b/setup.py index 385d852d..68e8dd1e 100644 --- a/setup.py +++ b/setup.py @@ -239,8 +239,8 @@ def recursive_chown_chmod(path_to_walk, uid, gid, file_permission, path_permissi data_config_file.write("[" + genome + "]\n") data_config_file.write("genome: " + path.join(genome, "genome_mm9.fa\n")) data_config_file.write("chromosome_sizes: " + path.join(genome, "chrom.sizes.mm9\n")) -data_config_file.write("genes_Gencode: " + path.join(genome, "genes_Gencode_mm9.bed\n")) -data_config_file.write("genes_RefSeq: " + path.join(genome, "genes_RefSeq_mm9.bed\n")) +data_config_file.write("gene_regions: " + path.join(genome, "genes_Gencode_mm9.bed\n")) +data_config_file.write("# gene_regions: " + path.join(genome, "genes_RefSeq_mm9.bed # alternative to Gencode\n")) data_config_file.write("annotation: " + path.join(genome, "gencode.vM1.annotation.gtf\n")) data_config_file.write("gene_alias: " + path.join(genome, "alias_mouse.txt\n\n")) 
data_config_file.write("repeat_maskers: " + path.join(genome, "repeat_maskers\n\n")) @@ -248,16 +248,16 @@ def recursive_chown_chmod(path_to_walk, uid, gid, file_permission, path_permissi data_config_file.write("[" + genome + "]\n") data_config_file.write("genome: " + path.join(genome, "genome_mm10.fa\n")) data_config_file.write("chromosome_sizes: " + path.join(genome, "chrom.sizes.mm10\n")) -data_config_file.write("genes_Gencode: " + path.join(genome, "genes_Gencode_mm10.bed\n")) -data_config_file.write("genes_RefSeq: " + path.join(genome, "genes_RefSeq_mm10.bed\n")) +data_config_file.write("gene_regions: " + path.join(genome, "genes_Gencode_mm10.bed\n")) +data_config_file.write("# gene_regions: " + path.join(genome, "genes_RefSeq_mm10.bed # alternative to Gencode\n")) data_config_file.write("annotation: " + path.join(genome, "gencode.vM20.annotation.gtf\n")) data_config_file.write("gene_alias: " + path.join(genome, "alias_mouse.txt\n\n")) genome = "hg19" data_config_file.write("[" + genome + "]\n") data_config_file.write("genome: " + path.join(genome, "genome_hg19.fa\n")) data_config_file.write("chromosome_sizes: " + path.join(genome, "chrom.sizes.hg19\n")) -data_config_file.write("genes_Gencode: " + path.join(genome, "genes_Gencode_hg19.bed\n")) -data_config_file.write("genes_RefSeq: " + path.join(genome, "genes_RefSeq_hg19.bed\n")) +data_config_file.write("gene_regions: " + path.join(genome, "genes_Gencode_hg19.bed\n")) +data_config_file.write("# gene_regions: " + path.join(genome, "genes_RefSeq_hg19.bed # alternative to Gencode\n")) data_config_file.write("annotation: " + path.join(genome, "gencode.v19.annotation.gtf\n")) data_config_file.write("gene_alias: " + path.join(genome, "alias_human.txt\n\n")) data_config_file.write("repeat_maskers: " + path.join(genome, "repeat_maskers\n\n")) @@ -265,8 +265,8 @@ def recursive_chown_chmod(path_to_walk, uid, gid, file_permission, path_permissi data_config_file.write("[" + genome + "]\n") data_config_file.write("genome: " + path.join(genome, "genome_hg38.fa\n")) data_config_file.write("chromosome_sizes: " + path.join(genome, "chrom.sizes.hg38\n")) -data_config_file.write("genes_Gencode: " + path.join(genome, "genes_Gencode_hg38.bed\n")) -data_config_file.write("genes_RefSeq: " + path.join(genome, "genes_RefSeq_hg38.bed\n")) +data_config_file.write("gene_regions: " + path.join(genome, "genes_Gencode_hg38.bed\n")) +data_config_file.write("# gene_regions: " + path.join(genome, "genes_RefSeq_hg38.bed # alternative to Gencode\n")) data_config_file.write("annotation: " + path.join(genome, "gencode.v24.annotation.gtf\n")) data_config_file.write("gene_alias: " + path.join(genome, "alias_human.txt\n\n")) data_config_file.write("repeat_maskers: " + path.join(genome, "repeat_maskers\n\n")) @@ -274,14 +274,14 @@ def recursive_chown_chmod(path_to_walk, uid, gid, file_permission, path_permissi data_config_file.write("[" + genome + "]\n") data_config_file.write("genome: " + path.join(genome, "genome_zv9_ensembl_release_79.fa\n")) data_config_file.write("chromosome_sizes: " + path.join(genome, "chrom.sizes.zv9\n")) -data_config_file.write("genes_RefSeq: " + path.join(genome, "genes_zv9.bed\n")) +data_config_file.write("gene_regions: " + path.join(genome, "genes_zv9.bed\n")) data_config_file.write("annotation: " + path.join(genome, "Danio_rerio.Zv9.79.gtf\n")) data_config_file.write("gene_alias: " + path.join(genome, "alias_zebrafish.txt\n\n")) genome = "zv10" data_config_file.write("[" + genome + "]\n") data_config_file.write("genome: " + path.join(genome, 
"genome_zv10_ensembl_release_84.fa\n")) data_config_file.write("chromosome_sizes: " + path.join(genome, "chrom.sizes.zv10\n")) -data_config_file.write("genes_RefSeq: " + path.join(genome, "genes_zv10.bed\n")) +data_config_file.write("gene_regions: " + path.join(genome, "genes_zv10.bed\n")) data_config_file.write("annotation: " + path.join(genome, "Danio_rerio.GRCz10.84.gtf\n")) data_config_file.write("gene_alias: " + path.join(genome, "alias_zebrafish.txt\n\n")) @@ -327,8 +327,7 @@ def recursive_chown_chmod(path_to_walk, uid, gid, file_permission, path_permissi user_config_file.write("#[" + genome + "]\n") user_config_file.write("#genome: undefined\n") user_config_file.write("#chromosome_sizes: undefined\n") - user_config_file.write("#genes_Gencode: undefined\n") - user_config_file.write("#genes_RefSeq: undefined\n") + user_config_file.write("#gene_regions: undefined\n") user_config_file.write("#annotation: undefined\n") user_config_file.write("#gene_alias: undefined\n\n") From d3edef77fd292cd94a471f22bea769a128b2f08b Mon Sep 17 00:00:00 2001 From: jovesus Date: Thu, 23 Jul 2020 11:51:03 +0200 Subject: [PATCH 10/12] modified: rgt/tdf/Main.py --- rgt/tdf/Main.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/rgt/tdf/Main.py b/rgt/tdf/Main.py index 6fa05c24..56a72436 100644 --- a/rgt/tdf/Main.py +++ b/rgt/tdf/Main.py @@ -9,7 +9,6 @@ import datetime import subprocess import matplotlib -matplotlib.use('Agg', warn=False) from collections import OrderedDict # Local Libraries @@ -31,7 +30,7 @@ current_dir = os.getcwd() """ -Triplex Domain Finder (TDF) provides statistical tests and plotting tools for +Triplex Domain Finder (TDF) provides statistical tests and plotting tools for triplex binding site analysis Author: Joseph C.C. Kuo @@ -52,9 +51,9 @@ def main(): Author: Chao-Chung Kuo', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--version', action='version', version=version_message) - + subparsers = parser.add_subparsers(help='sub-command help', dest='mode') - + ################### Promoter test ########################################## h_promotor = "Promoter test evaluates the association between the given lncRNA to the target promoters." @@ -67,12 +66,12 @@ def main(): parser_promotertest.add_argument('-bg', default=False, metavar=' ', help="Input BED file of the promoter regions of background genes") parser_promotertest.add_argument('-o', metavar=' ', help="Output directory name for all the results") parser_promotertest.add_argument('-t', metavar=' ', default=False, help="Define the title name for the results under the Output name. 
(default is RNA name)") - + parser_promotertest.add_argument('-organism', metavar=' ', help='Define the organism') parser_promotertest.add_argument('-gtf', metavar=' ', default=None, help='Define the GTF file for annotation (optional)') parser_promotertest.add_argument('-tss', type=int, default=0, metavar=' ', help="Define the distance between the promoter regions and TSS along gene body (default: %(default)s)") parser_promotertest.add_argument('-pl', type=int, default=1000, metavar=' ', help="Define the promotor length (default: %(default)s)") - + parser_promotertest.add_argument('-showdbs', action="store_true", help="Show the plots and statistics of DBS (DNA Binding sites)") parser_promotertest.add_argument('-score', action="store_true", help="Load score column from input gene list or BED file for analysis.") parser_promotertest.add_argument('-scoreh', action="store_true", help="Use the header of scores from the given gene list or BED file.") @@ -101,7 +100,7 @@ def main(): parser_promotertest.add_argument('-mf', action="store_true", default=False, help="[Triplexes] Merge overlapping features into a cluster and report the spanning region.") parser_promotertest.add_argument('-rm', type=int, default=2, metavar=' ', help="[Triplexes] Set the multiprocessing") parser_promotertest.add_argument('-par', type=str, default="", metavar=' ', help="[Triplexes] Define other parameters for Triplexes") - + ################### Genomic Region Test ########################################## h_region = "Genomic region test evaluates the association between the given lncRNA to the target regions by randomization." parser_randomtest = subparsers.add_parser('regiontest', help=h_region) @@ -111,12 +110,12 @@ def main(): parser_randomtest.add_argument('-bed', metavar=' ', help="Input BED file for interested regions on DNA") parser_randomtest.add_argument('-o', metavar=' ', help="Output directory name for all the results and temporary files") parser_randomtest.add_argument('-t', metavar=' ', default=False, help="Define the title name for the results under the Output name. 
(default is RNA name)") - - parser_randomtest.add_argument('-n', type=int, default=10000, metavar=' ', + + parser_randomtest.add_argument('-n', type=int, default=10000, metavar=' ', help="Number of times for randomization (default: %(default)s)") parser_randomtest.add_argument('-organism', metavar=' ', help='Define the organism') - + parser_randomtest.add_argument('-showdbs', action="store_true", help="Show the plots and statistics of DBS (DNA Binding sites)") parser_randomtest.add_argument('-score', action="store_true", help="Load score column from input BED file") parser_randomtest.add_argument('-a', type=float, default=0.05, metavar=' ', help="Define significance level for rejection null hypothesis (default: %(default)s)") @@ -130,7 +129,7 @@ def main(): parser_randomtest.add_argument('-showpa', action="store_true", default=False, help="Show parallel and antiparallel bindings in the plot separately.") parser_randomtest.add_argument('-mp', type=int, default=1, metavar=' ', help="Define the number of threads for multiprocessing") parser_randomtest.add_argument('-nofile', action="store_true", default=False, help="Don't save any files in the output folder, except the statistics.") - + parser_randomtest.add_argument('-l', type=int, default=20, metavar=' ', help="[Triplexes] Define the minimum length of triplex (default: %(default)s)") parser_randomtest.add_argument('-e', type=int, default=20, metavar=' ', help="[Triplexes] Set the maximal error-rate in %% tolerated (default: %(default)s)") parser_randomtest.add_argument('-c', type=int, default=2, metavar=' ', help="[Triplexes] Sets the tolerated number of consecutive errors with respect to the canonical triplex rules as such were found to greatly destabilize triplexes in vitro (default: %(default)s)") @@ -167,7 +166,7 @@ def main(): help="[Triplexes] Autobinding offset. Maximum offset between autobinding regions (must be positive, >= 0), e.g., 1 for regions to be at least adjacent, 2 if there can be 1 bp space between segments, etc. 
(default: %(default)s)") ########################################################################## - # rgt-TDF integrate -path + # rgt-TDF integrate -path parser_integrate = subparsers.add_parser('integrate', help="Integrate the project's links and generate project-level statistics.") parser_integrate.add_argument('-path',type=str, metavar=' ', help='Define the path of the project.') parser_integrate.add_argument('-exp', action="store_true", default=False, help='Include expression score for ranking.') From 2185c657ce7f83a686e013d3e56f9bd307af200c Mon Sep 17 00:00:00 2001 From: lzj1769 Date: Wed, 19 Aug 2020 15:45:48 +0200 Subject: [PATCH 11/12] remove argument warn from matplotlib.use --- rgt/viz/Main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rgt/viz/Main.py b/rgt/viz/Main.py index acc5862d..ea6360b2 100644 --- a/rgt/viz/Main.py +++ b/rgt/viz/Main.py @@ -8,7 +8,7 @@ import datetime import matplotlib -matplotlib.use('Agg', warn=False) +matplotlib.use('Agg') from .boxplot import Boxplot from .lineplot import Lineplot From 4da74332814de192c2005a4960e56dc59f3382ea Mon Sep 17 00:00:00 2001 From: lzj1769 Date: Wed, 19 Aug 2020 16:33:58 +0200 Subject: [PATCH 12/12] fix bug of rgt-hint reporting number of reads uncorrectly --- rgt/HINT/Footprinting.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rgt/HINT/Footprinting.py b/rgt/HINT/Footprinting.py index 06864c0f..311d1992 100644 --- a/rgt/HINT/Footprinting.py +++ b/rgt/HINT/Footprinting.py @@ -853,8 +853,7 @@ def post_processing(footprints, original_regions, fp_min_size, fp_ext, genome_da footprints_overlap.write(output_file_name) # the number of reads - lines = pysam.idxstats(reads_file.file_name).splitlines() - num_reads = sum(map(int, [x.split("\t")[2] for x in lines if not x.startswith("#")])) + num_reads = pysam.AlignmentFile(reads_file.file_name).count(until_eof=True) # the number of peaks and tag count within peaks num_peaks = 0
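Note on the smooth() helper added in PATCH 01/12: it masks the positions with the highest z-scores and then takes a NaN-aware moving average over a small window. The sketch below is a hedged, self-contained restatement of that idea rather than the repository code verbatim: the explicit imports, the float copy and the name smooth_sketch are added here for illustration, while window_size, rank and the window indexing follow the patch.

    import numpy as np
    from math import ceil
    from scipy import stats

    def smooth_sketch(signal, window_size=5, rank=5):
        """Mask the highest-z-score positions, then apply a NaN-aware sliding mean."""
        signal = np.array(signal, dtype=float)  # float copy so NaN can be assigned
        k = len(signal)

        # rank every position by its z-score; positions ranked above (k - rank)
        # are treated as outliers and excluded from the averaging
        ranks = stats.zscore(signal).argsort().argsort()
        signal[ranks > (k - rank)] = np.nan

        smooth_signal = np.zeros(k)
        for i in range(k):
            a = max(0, i - round(window_size / 2))
            b = min(k, ceil(i + window_size / 2))
            smooth_signal[i] = np.nanmean(signal[a:b])  # NaN entries are ignored
        return smooth_signal

As in the patch, a constant input makes zscore() return NaN everywhere, and a window made entirely of masked positions yields NaN; the sketch keeps that behaviour rather than guessing how HINT should handle those edge cases.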
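Note on PATCH 04/12 (GRS.subtract unittests): with exact=True, only regions identical to a region of y are removed, merge and whole_region are ignored, and the returned set is sorted and duplicate-free. A minimal hedged usage sketch, assuming the usual GenomicRegion(chrom, initial, final) constructor and GenomicRegionSet.add():

    from rgt.GenomicRegion import GenomicRegion
    from rgt.GenomicRegionSet import GenomicRegionSet

    a = GenomicRegionSet("A")
    for chrom, start, end in [("chr1", 25, 30), ("chr1", 13, 23),
                              ("chr1", 13, 23), ("chr1", 10, 13)]:
        a.add(GenomicRegion(chrom, start, end))

    b = GenomicRegionSet("B")
    b.add(GenomicRegion("chr1", 13, 23))

    # exact=True: only regions matching chr1:13-23 exactly are removed; the unsorted,
    # duplicated input comes back as a sorted, duplicate-free GenomicRegionSet
    result = a.subtract(b, exact=True)
    print([(r.chrom, r.initial, r.final) for r in result])
    # expected, per the new unittests: [('chr1', 10, 13), ('chr1', 25, 30)]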
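Note on PATCH 12/12 (read counting in Footprinting.post_processing): the idxstats-based sum only tallied the mapped-read column per reference and needed a BAM index, and it is replaced by a single pass over the whole file. A hedged stand-alone illustration; the file name is a placeholder:

    import pysam

    bam_path = "atac_reads.bam"  # hypothetical input file

    # count every record up to end of file, as the patched post_processing now does
    num_reads = pysam.AlignmentFile(bam_path).count(until_eof=True)
    print("number of reads:", num_reads)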