-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrun_master_list_simple__receiveInputFileOfFilenames.sh
More file actions
executable file
·128 lines (115 loc) · 3.83 KB
/
run_master_list_simple__receiveInputFileOfFilenames.sh
File metadata and controls
executable file
·128 lines (115 loc) · 3.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#! /bin/bash
# qsub -cwd -N JOB_ID -S /bin/bash run_master_list_simple__receiveInputFileOfFilenames.sh
if [ "$1" == "" ] || [ "$2" == "" ]; then
echo -e "Usage: $0 sampleFile.txt outfilename [blasklistfile]"
echo -e "\twhere sampleFile contains the full path to each peak file in column 1, one per line."
echo -e "\tblacklist file is optional"
exit 1
fi
fileOfFilenames=$1
if [ "$3" == "" ]; then
BLACKLIST=/dev/null
else
BLACKLIST=$3
fi
## Make master set of unique DHS positions in the genome. This will be the union of
## the MCV peaks and all single-cell strict peaks not in MCV zones.
## Also extract the "cell-type predominant" set. This is everything that's left over from the
## single-cell line union after you subtract off MCV and CTS.
out=$2
tmpd=/tmp/tmp$$
tmp=$tmpd/tmp.bed
tmpo=$tmpd/tmp.out.bed
tmp2=$tmpd/tmp2.bed
tmpm=$tmpd/tmp.mrg.bed
tmpms=$tmpd/tmp.maxscore.txt
mkdir -p $tmpd
rm -f $tmp
echo "union-ing single-cell sets..."
linenum=1
cat "$fileOfFilenames" | while read peakPath restOfLine
do
pks=$peakPath
if [ ! -s "$pks" ]; then
echo "Error: File $pks,"
echo "on line $linenum of $fileOfFilenames, was not found."
exit 1
fi
proj=$(basename "$pks" | cut -f1 -d '.')
# Each filename must end in .bed.gz or .starch. or .bed.
is_gz=$(basename "$pks" | grep -c '\.bed\.gz$')
is_starch=$(basename "$pks" | grep -c '\.starch$')
is_bed=$(basename "$pks" | grep -c '\.bed$')
score_col=7
columns=$(unstarch "$pks" | awk '{print NF; exit}' -)
if [ "$columns" -eq 5 ]; then
score_col=5
fi
if [ "$is_gz" == "1" ]; then
zcat "$pks" \
| awk -v p="$proj" -v s="$score_col" '{print $1"\t"$2"\t"$3"\t"p"\t"$s}' - \
| bedops -n -1 - "$BLACKLIST" \
>> "$tmp"
N=$(zcat "$pks" | wc -l)
elif [ "$is_starch" == "1" ]; then
bedops -n -1 "$pks" "$BLACKLIST" \
| awk -v p="$proj" -v s="$score_col" '{print $1"\t"$2"\t"$3"\t"p"\t"$s}' - \
>> "$tmp"
N=$(unstarch "$pks" | wc -l)
elif [ "$is_bed" == "1" ]; then
awk -v p="$proj" -v s="$score_col" '{print $1"\t"$2"\t"$3"\t"p"\t"$s}' "$pks" \
| bedops -n -1 - "$BLACKLIST" \
>> "$tmp"
N=$(wc -l < "$pks")
else
echo "Error: Files in $fileOfFilenames must end in .bed.gz, .starch, or .bed."
echo "File $pks, on line $linenum, does not."
exit 1
fi
printf "%-15s %7d\n" "${proj/-DS.*/}" "$N"
((linenum++))
done
echo "sorting..."
sort-bed "$tmp" > "$tmpo"
## Get non-overlapping representatives.
stop=0
while [ "$stop" == 0 ]
do
echo "merge steps..."
## This klugey bit before and after the merging is because we don't want to merge regions that are simply adjacent but not overlapping
awk '{print $1"\t"$2"\t"$3-1}' $tmpo \
| bedops -m - \
| awk '{print $1"\t"$2"\t"$3+1}' - \
> $tmpm
bedmap --max $tmpm $tmpo \
> $tmpms
awk '{print $1"\t"$2"\t"$3"\tI"}' $tmpm \
| paste - $tmpms \
| bedmap --max $tmpo - \
| paste $tmpo - \
| awk '($5==$6)' - \
| cut -f1-4,6 - \
> $tmp
## There may be some ties -- now go through and take one representative from the adjacent ties
echo "remove adjacent ties..."
awk '{print $1"\t"$2"\t"$3-1}' $tmp \
| bedops -m - \
| awk '{print $1"\t"$2"\t"$3+1}' - \
| bedmap --multidelim "\t" --echo-map - $tmp \
| cut -f1-5 \
> $tmp2
## Now add in stuff that doesn't intersect our current set
bedops -n -0% $tmpo $tmp2 \
> $tmp
if [ -s $tmp ]; then
echo "Adding $(wc -l $tmp | cut -d' ' -f1 -) elements"
bedops -u $tmp $tmp2 \
> $tmpo
else
cp $tmp2 $tmpo
stop=1
fi
done
mv "$tmpo" "$out"
cut -f1-3 "$out" > masterDHSs.bed3
exit