-
Notifications
You must be signed in to change notification settings - Fork 119
/
Copy pathencode_lib_frip.py
executable file
·114 lines (95 loc) · 3.48 KB
/
encode_lib_frip.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python
# ENCODE DCC FRiP wrapper
# Author: Jin Lee ([email protected])
import sys
import os
import argparse
from encode_lib_common import (
get_num_lines, gunzip, log, ls_l, mkdir_p, rm_f,
run_shell_cmd, strip_ext, write_txt)
def parse_arguments():
parser = argparse.ArgumentParser(prog='ENCODE DCC FRiP.',
description='')
parser.add_argument('peak', type=str,
help='Peak file.')
parser.add_argument('ta', type=str,
help='TAGALIGN file.')
parser.add_argument('--chrsz', type=str,
help='2-col chromosome sizes file. \
If given, do shifted FRiP (for ChIP-Seq).')
parser.add_argument('--fraglen', type=int, default=0,
help='Fragment length for TAGALIGN file. \
If given, do shifted FRiP (for ChIP-Seq).')
parser.add_argument('--out-dir', default='', type=str,
help='Output directory.')
parser.add_argument('--log-level', default='INFO',
choices=['NOTSET', 'DEBUG', 'INFO',
'WARNING', 'CRITICAL', 'ERROR',
'CRITICAL'],
help='Log level')
args = parser.parse_args()
log.setLevel(args.log_level)
log.info(sys.argv)
return args
def frip(ta, peak, out_dir):
prefix = os.path.join(out_dir,
os.path.basename(strip_ext(peak)))
frip_qc = '{}.frip.qc'.format(prefix)
if get_num_lines(peak) == 0:
val1 = 0.0
tmp_files = []
else:
# due to bedtools bug when .gz is given for -a and -b
tmp1 = gunzip(ta, 'tmp1', out_dir)
tmp2 = gunzip(peak, 'tmp2', out_dir)
cmd = 'bedtools intersect -nonamecheck -a {} -b {} -wa -u | wc -l'
cmd = cmd.format(
tmp1, # ta
tmp2) # peak
val1 = run_shell_cmd(cmd)
tmp_files = [tmp1, tmp2]
val2 = get_num_lines(ta)
write_txt(frip_qc, str(float(val1)/float(val2)))
rm_f(tmp_files)
return frip_qc
def frip_shifted(ta, peak, chrsz, fraglen, out_dir):
prefix = os.path.join(out_dir,
os.path.basename(strip_ext(peak)))
frip_qc = '{}.frip.qc'.format(prefix)
half_fraglen = (fraglen+1)/2
if get_num_lines(peak) == 0:
val1 = 0.0
else:
# due to bedtools bug when .gz is given for -a and -b
tmp2 = gunzip(peak, 'tmp2', out_dir)
cmd = 'bedtools slop -i {} -g {} '
cmd += '-s -l {} -r {} | '
cmd += 'awk \'{{if ($2>=0 && $3>=0 && $2<=$3) print $0}}\' | '
cmd += 'bedtools intersect -nonamecheck -a stdin -b {} '
cmd += '-wa -u | wc -l'
cmd = cmd.format(
ta,
chrsz,
-half_fraglen,
half_fraglen,
tmp2) # peak
val1 = run_shell_cmd(cmd)
rm_f(tmp2)
val2 = get_num_lines(ta)
write_txt(frip_qc, str(float(val1)/float(val2)))
return frip_qc
def main():
# read params
args = parse_arguments()
log.info('Initializing and making output directory...')
mkdir_p(args.out_dir)
if args.fraglen:
frip_shifted(args.ta, args.peak,
args.chrsz, args.fraglen, args.out_dir)
else:
frip(args.ta, args.peak, args.out_dir)
log.info('List all files in output directory...')
ls_l(args.out_dir)
log.info('All done.')
if __name__ == '__main__':
main()