-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter_result.py
224 lines (205 loc) · 8.77 KB
/
filter_result.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import os
import pickle
import numpy as np
from scipy import stats
import pandas as pd
from tqdm import tqdm
def kalman( x, P, m, R, motion, Q, F, H ):
'''
Parameters:
x: initial state
P: initial uncertainty convariance matrix
measurement: observed position ( same shape as H * x )
R: measurement noise ( same shape as H )
motion: external motion added to state vector x
Q: motion noise ( same shape as P )
F: next state function: x_prime = F * x
H: measurement function: position = H * x
Return: the updated and predicted new values for ( x, P )
See also http://en.wikipedia.org/wiki/Kalman_filter
This version of kalman can be applied to many different situations by
appropriately defining F and H
'''
# UPDATE x, P based on measurement m
# distance between measured and current position-belief
if np.isnan(m).any():
y = 0
else:
y = m - H * x
S = H * P * H.T + R # residual convariance
K = P * H.T * S.I # Kalman gain
x = x + K * y
I = np.matrix( np.eye( F.shape[0] ) ) # identity matrix
P = ( I - K * H ) * P
# PREDICT x, P based on motion
x = F * x + motion
P = F * P * F.T + Q
return x, P
# TODO refactor to reuse the kalman filter function
def kalman_leaf_length(day_array, measure_array, assumed_start_val=100, assumed_start_velocity=5,
P=None, Q=None):
for i in range(max(day_array)):
if i not in day_array:
day_array = np.append(day_array, i)
measure_array = np.append(measure_array, np.nan)
measure_array = measure_array[np.argsort(day_array)]
day_array.sort()
N = max(day_array)
x = np.matrix( [assumed_start_val, assumed_start_velocity] ).T
if P is None:
P = np.matrix( '''
1000. 0. ;
0. 100.''' )
R = 10000
F = np.matrix( '''
1. 1. ;
0. 1. ''' )
H = np.matrix( '''
1. 0. ''' )
motion = np.matrix( '0. 0.' ).T
if Q is None:
Q = np.matrix( '''
25. 0. ;
0. 4. ''' )
m = np.matrix( '0.' ).T
kalman_x = np.zeros( N )
for n in range(N):
m[0] = measure_array[n]
x, P = kalman( x, P, m, R, motion, Q, F, H )
kalman_x[n] = x[0]
return day_array, kalman_x
def kalman_leaf_width(day_array, measure_array, assumed_start_val=15.0, assumed_start_velocity=0.8,
P=None, Q=None):
for i in range(max(day_array)):
if i not in day_array:
day_array = np.append(day_array, i)
measure_array = np.append(measure_array, np.nan)
measure_array = measure_array[np.argsort(day_array)]
day_array.sort()
N = max(day_array)
x = np.matrix( [assumed_start_val, assumed_start_velocity] ).T
if P is None:
P = np.matrix( '''
200. 0. ;
0. 10.''' )
R = 1000
F = np.matrix( '''
1. 1. ;
0. 1. ''' )
H = np.matrix( '''
1. 0. ''' )
motion = np.matrix( '0. 0.' ).T
if Q is None:
Q = np.matrix( '''
16. 0. ;
0. 2. ''' )
m = np.matrix( '0.' ).T
kalman_x = np.zeros( N )
for n in range(N):
m[0] = measure_array[n]
x, P = kalman( x, P, m, R, motion, Q, F, H )
kalman_x[n] = x[0]
return day_array, kalman_x
def leaf_data_dict_to_dataframe(data_dict):
result_list = []
for (date, sensor), plot_dict in tqdm(data_dict.items(), desc='parse dict to list'):
for (plot_col, plot_range), leaves_list in plot_dict.items():
for leaf_length, leaf_width in leaves_list:
result_list.append([date, sensor, plot_col, plot_range, leaf_length, leaf_width])
leaves_df = pd.DataFrame(result_list)
leaves_df.columns=['date', 'sensor', 'col', 'range', 'leaf_length', 'leaf_width']
leaves_df['date']=leaves_df['date'].astype('datetime64[ns]')
return leaves_df
def subplot_col_range_to_site_str(subplot_col, subplot_range, season_num):
plot_range = subplot_range
plot_col = int((subplot_col + 1) / 2)
if (subplot_col % 2) != 0:
subplot = 'W'
else:
subplot = 'E'
site_str = 'MAC Field Scanner Season {} Range {} Column {} {}'\
.format(str(season_num), str(plot_range), str(plot_col), subplot)
return site_str
def remove_outliars(df, by=None, threshold=1):
if by is None:
by = df.columns
if (not hasattr(arg, "strip") and
hasattr(arg, "__iteritems__") or
hasattr(arg, "__iter__")):
by = list(by)
else:
by = [by, ]
if len(by) > 1:
return df[np.abs((stats.zscore(df[by])) < threshold).all(axis=1)]
else:
return df[np.abs(stats.zscore(df[by])) < threshold]
if __name__ == '__main__':
data_file_path = './leaves.pkl'
season_num = 4
pkl_save_folder = './s4_pkl_result'
# read data
with open(data_file_path, 'rb') as f:
leaves_dict = pickle.load(f)
leaves_df = leaf_data_dict_to_dataframe(leaves_dict)
# remove NaN
leaves_df.loc[(leaves_df[['leaf_width']] == 0).all(axis=1), 'leaf_width'] = np.nan
leaves_df = leaves_df.dropna()
# remove outliers of leaf width
leaves_df.loc[(np.abs(stats.zscore(leaves_df[['leaf_width']])) > 0.1).all(axis=1), 'leaf_width'] = np.nan
leaves_df = leaves_df.dropna()
leaves_df.loc[(np.abs(stats.zscore(leaves_df[['leaf_width']])) > 4).all(axis=1), 'leaf_width'] = np.nan
leaves_df = leaves_df.dropna()
# find all avaliable plot col-range
col_range = np.unique(leaves_df[['col', 'range']].values, axis=0)
# remove row with zeros
col_range = col_range[np.all(col_range!=0, axis=1), :]
all_plot_result_df = pd.DataFrame()
for plot_col, plot_range in tqdm(col_range, desc='kalman filtering'):
# select plot
selected_plot_df = leaves_df[(leaves_df['col']==plot_col) & (leaves_df['range']==plot_range)]
# TODO could move two filter out of for loop for better readiability
# filtered if only few days have data
if len(selected_plot_df['date'].unique()) < 30:
continue
# filter days with few data points
selected_plot_df = selected_plot_df.groupby(['date', 'col', 'range'])\
.filter(lambda x: x['leaf_length'].count() > 4)
if len(selected_plot_df['date'].unique()) < 30:
continue
# mean by average all (west east together)
selected_plot_df = selected_plot_df.groupby(['date']).mean().reset_index()
# transfer the data into discrete time_array, measure_array format
# add delta_day column
first_day = selected_plot_df['date'].min()
selected_plot_df['delta_days'] = np.nan
selected_plot_df['delta_days'] = (selected_plot_df['date'] - first_day)
selected_plot_df['delta_days'] = selected_plot_df['delta_days'].apply(lambda delta_t: delta_t.days)
length_measure_array = selected_plot_df['leaf_length'].values
width_measure_array = selected_plot_df['leaf_width'].values
time_array = selected_plot_df['delta_days'].values
# put it into kalman filter
result_day_array, result_kalman_length = kalman_leaf_length(time_array, length_measure_array)
result_day_array, result_kalman_width = kalman_leaf_width(time_array, width_measure_array)
# select from first day to the day with largest measurement
max_loc = np.argmax(result_kalman_length)
result_day_array = result_day_array[:max_loc]
result_kalman_length = result_kalman_length[:max_loc]
result_kalman_width = result_kalman_width[:max_loc]
result_df = pd.DataFrame(columns=['local_datetime', 'leaf_length', 'leaf_width', 'delta_days', 'site'])
result_df['delta_days'] = result_day_array
result_df['delta_days'] = pd.to_timedelta(result_df['delta_days'], unit='d')
result_df['leaf_length'] = result_kalman_length
result_df['leaf_width'] = result_kalman_width
result_df['local_datetime'] = first_day
result_df['local_datetime'] = result_df['local_datetime'] + result_df['delta_days']
result_df['site'] = subplot_col_range_to_site_str(plot_col, plot_range, season_num)
result_df['species'] = 'Sorghum bicolor'
result_df['citation_author'] = 'Zeyu Zhang'
result_df['citation_year'] = '2019'
result_df['citation_title'] = 'Maricopa Field Station Data and Metadata'
result_df['method'] = 'Scanner 3d ply data to leaf length'
result_df = result_df.drop(['delta_days'], axis=1)
# save result only
result_df.to_pickle(os.path.join(pkl_save_folder,'col_{}_range_{}.pkl'.format(plot_col, plot_range)))
all_plot_result_df = all_plot_result_df.append(result_df)
all_plot_result_df.to_csv('./betydb_s{}_length_result.csv'.format(season_num), index=False, float_format='%.3f')