-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplot-barStackedAcceptedGuidesRejectedByMm10dbIEEE.py
161 lines (133 loc) · 5.94 KB
/
plot-barStackedAcceptedGuidesRejectedByMm10dbIEEE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
'''
A Benchmark of Computational CRISPR-Cas9 Guide Design Methods
Jacob Bradford, Dimitri Perrin. 2019.
This draws the stacked bar graph used in the paper.
The plot describes what percentage of exon-targeting guides reported by a
particular tool would have been rejected by mm10db. The data for this is
calculated by mm10db-rejects-accepted-by-other-tools.py and pasted here.
Run:
1: (normalise the raw data using: normalise.py)
2: (extract exon targeting guides using: normalised-extract-exon-guides.py)
3: (calculate data for this script using: mm10db-rejects-accepted-by-other-tools.py)
4: python plot-barStackedAcceptedGuidesRejectedByMm10dbIEEE.py
Input:
- TSV formatted data pasted into the multiline string named "raw_data".
Output:
- EPS image file (barStackedAcceptedGuidesRejectedByMm10dbIEEE.eps) written to OUTPUT_DIR
'''
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
# Directory the rendered figure is written into; it is interpolated into the
# output file name when the figure is saved ('.' is the working directory).
OUTPUT_DIR = '.'
# Resolution, in dots per inch, passed to savefig() when the figure is saved.
PLOT_OUTPUT_DPI = 300
# Each column is a tool, each row is a stack category.
#
# The table is parsed with a whitespace-agnostic split() so that it keeps
# working whether the pasted values are separated by tabs or by spaces, and
# with an explicit list comprehension (not a lazy map()) so that every row is
# a real list of floats on both Python 2 and Python 3. Blank lines are skipped.
raw_data = [
    [float(value) for value in line.split()]
    for line in
'''0.209568299 0.24707894 0.2489726513 0.239152371 0.248972651 0.232262883 0.248972651 0.248972651 0.248972651 0.248972651 0.248972651 0.50853549 0.252287176 0.258928571 0.248972651 0.248972651 0.242904841
0.185244845 0.178113423 0.1771290917 0.147830474 0.177129092 0.17326363 0.177129092 0.177129092 0.177129092 0.177129092 0.177129092 0.112309075 0.193053187 0.186011905 0.177129092 0.177129092 0.211185309
0.099065722 0.092049017 0.0918237211 0.052976791 0.091823721 0.093602191 0.091823721 0.091823721 0.091823721 0.091823721 0.091823721 0.001796945 0.088230733 0.048809524 0.091823721 0.091823721 0.101001669
0.000322165 0.000284981 0.0002834065 0.000504541 0.000283407 0.000497884 0.000283407 0.000283407 0.000283407 0.000283407 0.000283407 0 0.000310126 0.000297619 0.000283407 0.000283407 0.000834725
0.110985825 0.109147905 0.1085447074 0.113017154 0.108544707 0.105302465 0.108544707 0.108544707 0.108544707 0.108544707 0.108544707 0.080862534 0.118778105 0.113988095 0.108544707 0.108544707 0.133555927
0.004349227 0.004132231 0.0041093949 0.002018163 0.004109395 0.00373413 0.004109395 0.004109395 0.004109395 0.004109395 0.004109395 0.002695418 0.004496821 0.004315476 0.004109395 0.004109395 0.004173623
0.014014175 0.01154175 0.0131784044 0.01160444 0.013178404 0 0.013178404 0.013178404 0.013178404 0.013178404 0.013178404 0.012578616 0 0.013839286 0.013178404 0.013178404 0
0.376449742 0.357651753 0.3559586226 0.432896065 0.355958623 0.391336819 0.355958623 0.355958623 0.355958623 0.355958623 0.355958623 0.281221923 0.342843852 0.373809524 0.355958623 0.355958623 0.306343907'''.splitlines()
    if line.strip()
]
# 2-D float matrix: one row per stack category, one column per tool.
data = np.array(raw_data)
# Names of the benchmarked tools. They are passed to stacked_bar() as the
# category (x-axis) labels, so entry j labels column j of the data matrix and
# the order here must match the column order of the pasted data.
tools = [
'Cas-Designer',
'CasFinder',
'CCTop',
'CHOPCHOP',
'CRISPOR',
'CRISPR-DO',
'CRISPR-ERA',
'CT-Finder',
'FlashFry',
'GT-Scan',
'GuideScan',
'PhytoCRISP-Ex',
'sgRNAcas9',
'sgRNAScorer2',
'SSC',
'TUSCAN',
'WU-CRISPR'
]
# Legend labels for the stacked series. They are passed to stacked_bar() as
# the series labels, so entry i labels row i of the data matrix; the number
# of active entries must equal the number of data rows.
# NOTE(review): entries starting with '*' appear to be mm10db rejection
# reasons - confirm against mm10db-rejects-accepted-by-other-tools.py.
reasons = [
'Not considered by mm10db',
'Accepted by both',
'* Multiple exact matches',
# The two categories below are disabled and have no row in the data matrix.
#'* Multiple matches in exons',
#'* Multiple matches in genome',
'* Off-target score',
'* Secondary structure or energy',
'* Too close to reverse primer',
'* Poly-thymine',
'* GC-Content',
]
def stacked_bar(data, series_labels, category_labels=None,
                show_values=False, value_format="{}", y_label=None,
                grid=True, reverse=False):
    """Plot a stacked bar chart on the current pyplot axes.

    Keyword arguments:
    data            -- 2-dimensional numpy array or nested list; each row
                       is one series (one layer of every stack) and each
                       column is one category (one bar)
    series_labels   -- list of series labels, one per row of data (these
                       appear in the legend)
    category_labels -- optional list of category labels, one per column of
                       data (these appear on the x-axis, rotated by 25
                       degrees)
    show_values     -- If True then numeric value labels will be shown on
                       each bar segment
    value_format    -- Format string for numeric value labels
                       (default is "{}")
    y_label         -- Label for y-axis (str)
    grid            -- If True display grid
    reverse         -- If True reverse the left-to-right order of the
                       categories (the columns of data) together with
                       their labels

    Returns None. No legend is created here; the caller is expected to
    add one if it is wanted.
    """
    data = np.array(data)
    n_categories = len(data[0])
    positions = list(range(n_categories))

    if category_labels is not None:
        # Materialise once so that any iterable can be reversed, tested for
        # emptiness and handed to matplotlib as a real sequence.
        category_labels = list(category_labels)

    if reverse:
        data = np.flip(data, axis=1)
        # Labels are optional, so only reverse them when they were given.
        if category_labels is not None:
            category_labels = category_labels[::-1]

    bar_groups = []
    # Running height of each stack; the next series is drawn on top of it.
    bottoms = np.zeros(n_categories)
    for i, row_data in enumerate(data):
        bar_groups.append(plt.bar(positions, row_data, bottom=bottoms,
                                  label=series_labels[i]))
        # Rebind instead of adding in place so the array already handed to
        # plt.bar() is never modified afterwards.
        bottoms = bottoms + row_data

    if category_labels:
        plt.xticks(positions, category_labels, rotation=25)

    if y_label:
        plt.ylabel(y_label)

    if grid:
        plt.grid()

    if show_values:
        # Write each segment's height at the centre of that segment.
        for group in bar_groups:
            for bar in group:
                w, h = bar.get_width(), bar.get_height()
                plt.text(bar.get_x() + w / 2, bar.get_y() + h / 2,
                         value_format.format(h), ha="center",
                         va="center")
# Draw the chart on a single axes filling a 16x12 inch figure.
fig = plt.figure(figsize=(16, 12))
ax = fig.add_subplot(111)

stacked_bar(
    data,
    reasons,
    category_labels=tools,
    show_values=False,
    grid=False,
    value_format="{:.1f}",
    y_label="Distribution of Guides"
)

# The plotted values are fractions, so show the y-axis as percentages.
# The tick positions are pinned together with their labels (0%..100% in
# steps of 20%); setting labels alone leaves them attached to whatever ticks
# the automatic locator happens to pick, which can silently mislabel the axis.
percent_ticks = list(range(0, 120, 20))
ax.set_yticks([percent / 100.0 for percent in percent_ticks])
ax.set_yticklabels(['%s%%' % percent for percent in percent_ticks])

# Place the legend below the axes, in two columns.
lgd = plt.legend(loc=9, bbox_to_anchor=(0.5, -0.125), ncol=2)

# save to file
outFigFileName = '%s/barStackedAcceptedGuidesRejectedByMm10dbIEEE.eps' % (OUTPUT_DIR)
# The legend sits outside the axes, so it is passed as an extra artist to
# keep the tight bounding box from cropping it.
fig.savefig(outFigFileName, format='eps', dpi=PLOT_OUTPUT_DPI,
            bbox_extra_artists=(lgd,), bbox_inches='tight')
# Parenthesised single-argument form prints the same text on Python 2 and 3.
print('Wrote to: %s' % outFigFileName)
plt.clf()