-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathPixDistStats3.py
236 lines (214 loc) · 11.4 KB
/
PixDistStats3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# This script analyzes the csv files output by PixDistStats2.py
# Updated Feb 2021.
# PixDistStats2 separates the data into biological replicates instead of aggregating all data for each sample group.
# This script takes those data and does stats and makes plots.
# pixel_distance.py actually performs the measurement of minimum distance
# between tumor and lyve-1 pixels, and outputs the results for each image.
# PixDistStats.py performs stats and makes plots on ALL the data separated by sample group. However,
# this is insufficient because it isn't split up into biological replicates, or normalized.
# PixDistStats2.py separates the data into biological replicates instead of aggregating
# all data for each sample group, and experiments with plots.
# PixDistStats3.py takes data from PixDistStats2, normalizes it to total pixels for each animal,
# does statistical comparisons and makes plots.
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import pixel_distance as pxd
import pandas as pd
from scipy.stats import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison
import joypy as jpy
def load_datas(dir):
distbypercentiles = pd.read_csv(dir + 'dist_by_percentiles.csv', index_col='percentiles')
numpixbydistbins = pd.read_csv(dir + 'numpix_by_dist_bins.csv', index_col='distance bins')
normnumpixbydistbins = pd.read_csv(dir + 'norm_numpix_by_dist_bins.csv', index_col='distance bins')
print('dist by percentiles: ')
print(distbypercentiles.head(10))
print('numpix by dist bins: ')
print(numpixbydistbins.head(11))
print('normalized numpix by dist bins: ')
print(normnumpixbydistbins.head(11))
# return datas as a list
return [distbypercentiles, numpixbydistbins, normnumpixbydistbins]
def run_anova(data, savedir, labels):
# ANOVA
f_stat, p = stats.f_oneway(data[labels[0:4]],
data[labels[5:9]],
data[labels[10:14]])
# Multiple comparisons... Tukey Test:
# need a stacked dataframe with consistent labels...
# measurement is the data, group is naive, tdLN, disLN
# mc = MultiComparison(data_stacked['measurement'], data_stacked['group'])
# stack data
# data_stacked = data.stack().reset_index()
# data is in Series form... so it's already stacked. Just reset_index()
data_stacked = data.to_frame()
# data_stacked = data_stacked.rename(columns={'level_0': 'id', 'level_1': 'group', 0: 'distance'})
print(data_stacked.head(20))
# make new column with supergroups (naive, disLN, tdLN)
# data_stacked['supergroup'] = data_stacked['group'].map(lambda x: x.rstrip('12345'))
data_stacked['supergroup'] = data_stacked.index.map(lambda x: x.rstrip('12345'))
print(data_stacked.head(20))
# mc = MultiComparison(data_stacked['distance'], data_stacked['supergroup'])
mc = MultiComparison(data_stacked[data.name], data_stacked['supergroup'])
tukey = mc.tukeyhsd(alpha=0.05)
print(data_stacked[data.name])
# Save ANOVA & Tukey results in a text file
file0 = open(savedir + data.name + '_ANOVA.txt', 'a+')
file0.write('Stats: \n')
file0.write('Mean: ' + str(data_stacked.groupby(['supergroup']).mean()) + '\n')
file0.write('Standard Deviation: ' + str(data_stacked.groupby(['supergroup']).std()) + '\n')
file0.write('ANOVA Results: \n')
file0.write('F Statistic: ' + str(f_stat) + '\n')
file0.write('p-value: ' + str(p) + '\n')
file0.write('Tukey results: ' + '\n')
file0.write(str(tukey) + '\n')
file0.write('Unique groups: {}'.format(mc.groupsunique))
return
def transpose_data(data):
# remove name of indexes
data.index.names = [None]
transposed_data = data.transpose()
print('after transposing: ')
print(transposed_data)
# drop the number from the end of the indexes (make supergroups)
transposed_data = transposed_data.rename(index=lambda x: x.rstrip('12345'))
print('after renaming indexes: ')
print(transposed_data)
# stack based on supergroup
# transposed_stacked_data = transposed_data.stack()
# print('after stacking: ')
# print(transposed_stacked_data)
return transposed_data
def make_plots(dist_percentile, norm_numpix, savedir, labels):
sns.set_theme(style="whitegrid")
# keep x-axis as distance consistent across plots.
# For dist_by_percentiles_transposed, try plotting a bar graph which shows the distance tumor cells invaded to
# at each percentile
# 10th %ile |---| 10% of cells invaded less than this distance
# 20th %ile |-------| 20% of cells invaded less than this distance
# 30th %ile |------------| 30% of cells invaded less than this distance
# For norm_numpix_by_dist_bins_transposed, try plotting a histogram...
# Proportion (normalized #) of pixels at each distance.
# Can overlay all three histograms in different colors, slightly opaque
# ------------------------------------------------------------------------------------------
# bar plots for dist_percentile:
print('initial assessment: ')
dist_percentile.index.names = ['Group']
print(dist_percentile)
print(dist_percentile.index)
# convert indexes to a column so we can melt it
dist_percentile.reset_index(level=dist_percentile.index.names, inplace=True)
print('after reset index: ')
print(dist_percentile)
melt_dist_percentile = pd.melt(dist_percentile, id_vars='Group', var_name='Percentile',
value_name='Distance (microns)')
ax2 = sns.barplot(x='Distance (microns)', y='Percentile', hue='Group', data=melt_dist_percentile)
fig2 = ax2.get_figure()
fig2.set_size_inches(11, 8.5) # increase figure size
plt.gca().legend().set_title(None) # remove legend title
plt.gca().set_title('Distance from Lymphatics by Percentile') # set plot title
# Add annotations for statistical significance based on earlier anova & tukey comparisons (see txt files)
# which comparisons were significant? by tukey:
# 30th: disLN & tdLN. p-adj = 0.0401
# 40th: disLN & tdLN. p-adj = 0.0191
# 50th: disLN & tdLN. p-adj = 0.0126, naive & tdLN. p-adj = 0.0369
# 60th: disLN & tdLN. p-adj = 0.012, naive & tdLN. p-adj = 0.0177
# 70th: disLN & tdLN. p-adj = 0.0153, naive & tdLN. p-adj = 0.0122
# 80th: disLN & tdLN. p-adj = 0.0221, naive & tdLN. p-adj = 0.011
fig2.savefig(savedir + 'dist_by_percentiles.png')
fig2.clf()
# -----------------------------------------------------------------------------------------------------
# histograms for norm_numpix:
# this isn't actually a histogram... since I already have the x-labels as bins and
# the counts (proportions) for each sample. What I really want to do is create a bunch of bar plots.
# fig, ax = plt.subplots()
# for a in [x, y]:
# sns.distplot(a, bins=range(1, 110, 10), ax=ax, kde=False)
# ax.set_xlim([0, 100])
# Try melting...
print('before index rename attempt: ')
print(norm_numpix.index)
norm_numpix.index.names = ['Group']
print('after index rename attempt: ')
print(norm_numpix)
print(norm_numpix.index)
# convert indexes to a column so we can melt it
norm_numpix.reset_index(level=norm_numpix.index.names, inplace=True)
print('after reset index: ')
print(norm_numpix)
melt_norm_numpix = pd.melt(norm_numpix, id_vars='Group', var_name='Distance (microns)',
value_name='% of total pixels within group')
print('after melting: ')
print(melt_norm_numpix.head())
# # Stack Data
# norm_numpix = norm_numpix.stack()
# print('after stacking: ')
# print(norm_numpix)
# print('indexes: ')
# print(norm_numpix.index)
# # samples = ['tdLN', 'disLN', 'naive']
# # dist_bins = ['0-10um', '10-20um', '20-30um', '30-40um', '40-50um',
# # '50-60um', '60-70um', '70-80um', '80-90um', '90-100um', '100um+']
# # norm_numpix.index = pd.MultiIndex.from_product([samples, dist_bins], names=['sample', 'dist_bin'])
# # norm_numpix.rename_axis(index=['sample', 'dist_bin'])
# norm_numpix.index.names = ['sample', 'dist_bin']
# print('after rename attempt: ')
# print(norm_numpix)
# print(norm_numpix.index)
# # g = sns.FacetGrid(norm_numpix, hue='sample', palette='coolwarm')
ax = sns.barplot(x='Distance (microns)', y='% of total pixels within group', hue='Group', data=melt_norm_numpix)
fig = ax.get_figure()
fig.set_size_inches(11, 8.5) # increase figure size
plt.gca().legend().set_title(None) # remove legend title
plt.gca().set_title('% of Tumor+ Pixels vs. Distance from Lymphatics') # set plot title
# Add annotations for statistical significance based on earlier anova & tukey comparisons (see txt files)
# which comparisons were significant? by tukey:
# in general... 0-20um: tdLN sig lower. 30-50um: tdLN sig higher.
# 0-10um: disLN & tdLN. p-adj = 0.0472
# 10-20um: naive & tdLN. p-adj = 0.0306
# 30-40um: naive & tdLN. p-adj = 0.0014
# 40-50um: disLN & tdLN. p-adj = 0.0019. naive & tdLN. p-adj = 0.001
fig.savefig(savedir + 'numpix_by_dist_bins.png')
fig.clf()
return
# -------------------------------------------------------------------------------------
# MAIN --------------------------------------------------------------------------------
# -------------------------------------------------------------------------------------
# choose directory
dirname = pxd.file_import(prompt='Choose the directory containing tiff folders: ')
print(dirname)
save_dir1 = dirname + '/anova_outputs_feb2020/'
print(save_dir1)
# load datas
[dist_by_percentiles, numpix_by_dist_bins, norm_numpix_by_dist_bins] = load_datas(dirname)
data_labels = ['naive1', 'naive2', 'naive3', 'naive4', 'naive5',
'disLN1', 'disLN2', 'disLN3', 'disLN4', 'disLN5',
'tdLN1', 'tdLN2', 'tdLN3', 'tdLN4', 'tdLN5']
# STATS
# We'll try ANOVA. However, looking at the data distributions, it looks like the tumor data has a higher variance.
# One of the assumptions of ANOVA is that groups have the same variance. If this is a problem, we could try
# using Welch's ANOVA through Pengouin. https://pingouin-stats.org/generated/pingouin.welch_anova.html
# Also use this for loop to create a new dataframe where each measure is stacked (essentially a transpose)
dist_by_percentiles_transposed = pd.DataFrame()
numpix_by_dist_bins_transposed = pd.DataFrame()
for row in range(11):
print('current row: ' + str(row))
if row < 10:
print('dist_by_percentiles row name: ')
print(dist_by_percentiles.iloc[row].name) # print the current index (which is a name because it's a series)
print('row: ')
print(dist_by_percentiles.iloc[row])
run_anova(dist_by_percentiles.iloc[row], save_dir1, data_labels) # only has 10 rows
print('norm_numpix_by_dist_bins row name: ')
print(norm_numpix_by_dist_bins.iloc[row].name) # print the current index (which is a name because it's a series)
print('row: ')
print(norm_numpix_by_dist_bins.iloc[row])
run_anova(norm_numpix_by_dist_bins.iloc[row], save_dir1, data_labels) # has 11 rows
# transpose for plotting
dist_by_percentiles_transposed = transpose_data(dist_by_percentiles)
norm_numpix_by_dist_bins_transposed = transpose_data(norm_numpix_by_dist_bins)
# make plots
data_labels2 = ['tdLN', 'disLN', 'naive']
make_plots(dist_by_percentiles_transposed, norm_numpix_by_dist_bins_transposed, save_dir1, data_labels2)