h5_splitter.py
import h5py
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import yaml
import json
# import scipy.stats as scs
from video import split_video  # used by main() below
from utils import get_session_name, h5print, populate_depth
# so NumPy arrays don't truncate when printed
import sys
np.set_printoptions(threshold=sys.maxsize)
## READ H5 FILE WITH H5PY
# h5print(filename)
### READ THROUGH THE h5 FILE AND SPLIT EVERY DATASET AT 2 CUTTING POINTS (frames)
### e.g. def cut_frames(filename, cut_points=[18000, 36000])
### ONCE THESE ARRAYS ARE SPLIT, SAVE EACH AS A NEW H5 FILE (3 outputs)
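# Cut-point arithmetic (a sketch, assuming the depth video was recorded at
# 30 fps -- the frame rate is an assumption, not stated in this file):
#   frames [0, 18000)     -> control (10 min)
#   frames [18000, 36000) -> stim    (10 min)
#   frames [36000, end)   -> post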
def split_groups(dataset, cut_points, print_summary=False):
    control_frames = dataset[:cut_points[0]]
    stim_frames = dataset[cut_points[0]:cut_points[1]]
    post_frames = dataset[cut_points[1]:]
    if print_summary:
        print("[PRINTING SUMMARY]")
        print(control_frames)
        print(len(control_frames))
        print("--------------------------------------")
        print(stim_frames)
        print(len(stim_frames))
        print("--------------------------------------")
        print(post_frames)
        print(len(post_frames))
        print("--------------------------------------")
    return control_frames, stim_frames, post_frames
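# Minimal usage sketch: any per-frame sequence splits the same way, since
# slicing an h5py dataset yields a numpy array.
# dummy = np.arange(54000)
# control, stim, post = split_groups(dummy, [18000, 36000])
# assert len(control) == len(stim) == len(post) == 18000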
def split_h5(filename, cut_points, key=None, print_summary=False):
    with h5py.File(filename, 'r') as f:
        frame_num = f['frames'].shape[0]
        print("Frames in session:", frame_num)
        data_dict = {}
        if key is None:
            keys = list(f.keys())
        elif key not in f.keys():
            print("NOT A VALID KEY")
            print("Please use an available key in this dataset:", list(f.keys()))
            print("QUITTING")
            sys.exit()
        else:
            keys = [key]
        print("keys:", keys)
        for key in keys:
            print("[SELECTED]:", key.upper())
            data = f[key]
            if isinstance(data, h5py.Dataset):
                print(data)
                control_frames, stim_frames, post_frames = split_groups(data, cut_points, print_summary=print_summary)
                data_dict[key] = [control_frames, stim_frames, post_frames]
                print("[SUCCESS]")
            elif data.name[1:] == 'metadata':
                # flips is the only per-frame dataset in metadata, so it is
                # the only one that needs splitting
                flip_data = data['extraction']['flips']
                # flip_data = data['extraction/flips'] also works: '/' moves down the hierarchy
                print(flip_data)
                control_frames, stim_frames, post_frames = split_groups(flip_data, cut_points, print_summary=print_summary)
                data_dict[key] = [control_frames, stim_frames, post_frames]
                print("[SUCCESS]")
            elif data.name[1:] == 'scalars':
                scalars = {}
                for scalar in data:
                    scalar_data = data[scalar]
                    print(scalar_data)
                    control_frames, stim_frames, post_frames = split_groups(scalar_data, cut_points, print_summary=print_summary)
                    scalars[scalar] = [control_frames, stim_frames, post_frames]
                    print("[SUCCESS]")
                data_dict[key] = scalars
    return data_dict
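# Shape of the returned dict (a sketch, assuming a standard MoSeq extraction
# h5 -- the exact key set depends on the extraction):
# {
#     'frames':   [control, stim, post],   # numpy array slices
#     'metadata': [control, stim, post],   # the split metadata/extraction/flips
#     'scalars':  {scalar_name: [control, stim, post], ...},
#     ...
# }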
####### UNCOMMENT to run the split standalone
# split_data = split_h5(filename, [18000, 36000], print_summary=False)  # dict of split values
def create_file(old_file, split_data, location, newfile='newfile', proc=False, check_files=True):
    new_files = ['control', 'stim', 'post']
    destinations = []
    for i in range(len(new_files)):
        if proc:
            print('[CREATING SUBDIR]: /proc')
            # create session folder path with a proc subdir
            session_folder = location + '/' + get_session_name(old_file) + ' (' + new_files[i] + ')'
            proc_folder = session_folder + '/proc'
            if not os.path.exists(proc_folder):
                os.makedirs(proc_folder)
            destination = proc_folder
            # populate session folder with empty depth.dat and depth_ts.txt
            populate_depth(session_folder)
            # update session name in metadata.json and add to session folder
            json_meta_path = old_file[:-18] + 'metadata.json'  # strip trailing 'proc/results_00.h5' (18 chars)
            update_json(json_meta_path, session_folder, new_files[i])
            new_file_name = 'results_00'
        else:
            # without proc, write directly into location
            destination = location
            new_file_name = new_files[i] + '-' + newfile
        destinations.append(destination)
        print('[CREATING FILE]: ' + new_file_name + '.h5')
        with h5py.File(destination + '/' + new_file_name + '.h5', 'w') as new_h5:
            with h5py.File(old_file, 'r') as old_h5:
                for key in old_h5.keys():
                    # groups other than metadata/scalars are skipped by split_h5
                    dataset = split_data.get(key)
                    if key == 'scalars':
                        for scalar in dataset:
                            scalar_sets = dataset[scalar]
                            new_h5.create_dataset(key + '/' + scalar, data=scalar_sets[i])
                    elif isinstance(old_h5[key], h5py.Dataset):
                        new_h5.create_dataset(key, data=dataset[i])
                    else:
                        old_h5.copy(old_h5[key], new_h5['/'])
                        if key == 'metadata':
                            # replace the copied full-length flips with this condition's slice
                            del new_h5['/metadata/extraction/flips']
                            new_h5.create_dataset('metadata/extraction/flips', data=dataset[i])
                # update uuid metadata (appends 'c', 's' or 'p' to end)
                uuid = np.array(new_h5['/metadata/uuid']).item()
                str_uuid = uuid.decode("utf-8")
                new_uuid = str.encode(str_uuid + new_files[i][0])
                print("New uuid:", new_uuid)
                del new_h5['/metadata/uuid']
                new_h5['/metadata/uuid'] = new_uuid
                # update SessionName metadata (appends '(control)', '(stim)' or '(post)' to end)
                session_name = np.array(new_h5['/metadata/acquisition/SessionName']).item()
                str_session_name = session_name.decode("utf-8")
                new_session_name = str.encode(str_session_name + ' (' + new_files[i] + ')')
                print("New session name:", new_session_name)
                del new_h5['/metadata/acquisition/SessionName']
                new_h5['/metadata/acquisition/SessionName'] = new_session_name
                # update yaml
                update_dict = {}
                update_dict['uuid'] = new_uuid.decode("utf-8")
                update_dict['metadata'] = {'SessionName': new_session_name.decode("utf-8")}
                # update_dict['parameters'] = {'flip_classifier': '/fs03/am17/jacks/Moseq/Extraction/data/flip_classifier_k2_largemicewithfibre.pkl'}  # may need to change: not the original one used in extraction
                corresponding_yaml = old_file.replace('.h5', '.yaml')
                update_yaml(corresponding_yaml, destination, update_dict, newfile=new_file_name)
        print('[NEW FILE CREATED]:', new_file_name + '.h5')
        if check_files:
            # destination is recorded for both proc and non-proc runs, so one
            # check covers both cases
            print()
            h5print(destinations[i] + '/' + new_file_name + '.h5')
            print()
    return destinations
####### UNCOMMENT to write the three files
# create_file(filename, split_data, './finals')
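# Resulting layout per input session (a sketch, assuming proc=True):
#   <location>/<session name> (control)/proc/results_00.h5 + results_00.yaml
#   <location>/<session name> (stim)/proc/...
#   <location>/<session name> (post)/proc/...
# with depth.dat, depth_ts.txt and metadata.json in each session folder.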
####### UNCOMMENT to check the new files
# h5print('control-newfile.h5')
# print()
# h5print('stim-newfile.h5')
# print()
# h5print('post-newfile.h5')
def check_finals(filename):
    h5print('./finals/' + filename)
# check_finals('control-results_001.h5')
# update_dict = {'uuid': '', 'metadata': {'SessionName': ''}}
# subdir must start with '/' (but not end with one)
def update_yaml(oldfile, destination, update_dict, newfile='newfile', subdir=False):
    with open(oldfile) as old_yml:
        content = yaml.safe_load(old_yml)
        for key in content:
            if key == 'uuid':
                content[key] = update_dict[key]
            if key == 'metadata':
                if 'SessionName' in update_dict[key]:
                    content[key]['SessionName'] = update_dict[key]['SessionName']
            # if key == 'parameters':  ## not needed yet
            #     if 'flip_classifier' in update_dict[key]:
            #         content[key]['flip_classifier'] = update_dict[key]['flip_classifier']
    if subdir:
        subdir = destination + subdir + '/'
        if not os.path.exists(subdir):
            os.makedirs(subdir)
    else:
        subdir = destination + '/'
    print("[CREATING YAML]")
    with open(subdir + newfile + '.yaml', "w") as new_yaml:
        yaml.dump(content, new_yaml, default_flow_style=False)
    print("[YAML CREATED]")
# update_dict = {'uuid': 'example-uuid', 'metadata': {'SessionName': 'example session'}}
# update_yaml('./proc/results_00.yaml', './check', update_dict)
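# With subdir set (must start with '/'), the yaml is written under destination + subdir:
# update_yaml('./proc/results_00.yaml', './check', update_dict, subdir='/proc')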
def update_json(oldfile, destination, condition, newfile='metadata'):
    print('[READING METADATA.JSON]...')
    with open(oldfile, 'r') as json_file:
        data = json.load(json_file)
    data['SessionName'] = data['SessionName'] + ' (' + condition + ')'
    print('[CREATING NEW METADATA.JSON]...')
    # 'w' (not 'a'): appending to an existing file would produce invalid JSON
    with open(destination + '/' + newfile + '.json', 'w') as new_json:
        json.dump(data, new_json)
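# Effect on the copied metadata.json (a sketch; other fields pass through unchanged):
#   {"SessionName": "session_x", ...}  ->  {"SessionName": "session_x (stim)", ...}
# update_json('session_x/metadata.json', './finals/session_x (stim)', 'stim')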
def main(files, destination):
    session_count = 1
    total_sessions = len(files)
    for filename in files:
        new_file_name = get_session_name(filename)
        video_name = filename.replace('.h5', '.mp4')
        print(new_file_name)
        print(destination)
        print("[STARTING MAIN]: " + new_file_name)
        print("------------------------------------------------------------------------")
        ## READ FILE
        print("[READING FILE]: " + str(session_count) + '/' + str(total_sessions))
        h5print(filename)
        print("------------------------------------------------------------------------")
        ## SPLIT FILE
        print("[SPLITTING FILE]: " + str(session_count) + '/' + str(total_sessions))
        split_data = split_h5(filename, [18000, 36000], print_summary=False)
        print("------------------------------------------------------------------------")
        ## CREATE FILES
        # by session count:
        # create_file(filename, split_data, './finals', newfile='results_00' + str(session_count))
        # by session name (create_file returns the destinations for the video split):
        destinations = create_file(filename, split_data, destination, proc=True, newfile=new_file_name)
        split_video(video_name, destinations, [18000, 36000], newfile='results_00')
        print("------------------------------------------------------------------------")
        session_count += 1
        # break  # for testing
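# End-to-end sketch for one session (the path is a placeholder):
# main(['../../session_x/proc/results_00.h5'], './finals')
# -> reads the h5, splits every per-frame dataset at frames 18000 and 36000,
#    writes three session folders, and splits the .mp4 at the same frames.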
def start(base_dir, destination):
    files = os.listdir(base_dir)
    session_files = []
    # bad_sessions = ['session_20190205095213 WT3 10Hz', 'session_20190206115259 WT6 10Hz'] # WT 10Hz
    # bad_sessions = ['session_20190218101113 3883 10Hz coil came off', 'session_20190218112308 3907 10Hz coil came off']
    # bad_sessions = ['session_20190113151744 WT2 Sham', 'session_20190113160334 WT4 Sham', 'session_20190113163631 WT5 Sham', 'session_20190113170857 WT1 Sham', 'session_20190113174353 WT6 Sham', 'session_20190113182616 WT7 Sham', 'session_20190113190625 WT3 Sham', 'session_20190128093115 WT5 sham', 'session_20190128100411 WT6 sham', 'session_20190128103838 WT4 sham', 'session_20190131104916 WT3 Sham coil fell off', 'session_20190131121155 WT5 Sham', 'session_20190131124434 WT6 Sham coil fell off', 'session_20190131162118 WT3 Sham coil fell off', 'session_20190131165326 WT6 Sham coil fell off'] # WT 10Hz sham
    bad_sessions = ['session_20190117092631 3906 Sham', 'session_20190117100227 3928 Sham', 'session_20190117103940 3876 Sham', 'session_20190117111450 3885 Sham', 'session_20190117114725 3887 Sham', 'session_20190117122252 3897 Sham', 'session_20190128140048 3928 sham', 'session_20190128143448 3876 sham', 'session_20190128150913 3885 sham', 'session_20190128154317 3890 sham', 'session_20190128162613 3895 sham', 'session_20190128173218 3883 sham', 'session_20190128180452 3907 sham', 'session_20190128184018 3918 sham', 'session_20190131142959 3906 Sham', 'session_20190201114235 3890 Sham', 'session_20190201122324 3876 Sham', 'session_20190201130116 should be 3879 Sham', 'session_20190201133253 3883 Sham', 'session_20190201140423 3889 Sham', 'session_20190201144458 3895 Sham'] # ephrin 10Hz sham
    for session in files:
        if session.startswith('session') and session not in bad_sessions:
            result_filepath = base_dir + session + '/proc/results_00.h5'
            session_files.append(result_filepath)
    print(session_files)
    confirmation = input("confirm session files? (y/n): ")
    if confirmation.lower() == 'y':
        print('[CONFIRMED]')
        main(session_files, destination)
    elif confirmation.lower() == 'n':
        print('[REJECTED]')
        sys.exit()
    else:
        print("[ERROR]: PLEASE ENTER (y/n)")
        sys.exit()
## --------------------------------------------------START--------------------------------------------------##
# base_dir
wt_10Hz_basedir = '../../' # WT 10Hz wire
ephrin_10Hz_basedir = '../../../ephrin 10Hz wire/' # Ephrin
wt_sham = '../../../../Sham wire/WT sham wire/' # WT Sham
# destinations
final_destination = './finals'
external_destination = '/Volumes/NO NAME/Research/Moseq/Ephrin 10Hz wire'
# testing base_dir
local_test = '../../testing/'
# testing destination
test_destination = '../../testing_finals' # for testing
# data from tyler PC (raw strings so backslashes aren't treated as escapes)
raw_ephrin = r'F:\wire 10 Hz/ephrin 10Hz wire/'
raw_wt = r'F:\wire 10 Hz/WT 10 Hz wire/'
raw_wt_sham = r'F:\Sham wire/WT sham wire/'
raw_ephrin_sham = r'F:\Sham wire/ephrin sham/'
# research from tyler PC
research_ephrin = r'D:\Research/2022/Moseq/data/10Hz ephrin/'
research_wt = r'D:\Research/2022/Moseq/data/10Hz WT/'
research_wt_sham = r'D:\Research/2022/Moseq/data/10Hz WT sham/'
research_ephrin_sham = r'D:\Research/2022/Moseq/data/10Hz ephrin sham/'
# start(local_test, test_destination)
## --------------------------------------------------START--------------------------------------------------##
def extract_scalars(base_dir, raw=False, save_to_csv=False):
    files = os.listdir(base_dir)
    print("[EXTRACTING SCALARS FROM", len(files), "FILES]")
    if raw:
        session_files = []
        bad_sessions = ['session_20190205095213 WT3 10Hz', 'session_20190206115259 WT6 10Hz']
        for session in files:
            if session.startswith('session') and session not in bad_sessions:
                result_filepath = base_dir + session + '/proc/results_00.h5'
                session_files.append(result_filepath)
    else:
        session_files = files
    session_dicts = {}
    for session in session_files:
        if raw:
            session = session.split(base_dir)[1]
            condition_session_title = session.split('/')[0].replace(' ', '-')
            session_title = condition_session_title
        else:
            condition_session_title = session.split('.')[0]
            session_title = session[session.index("-") + 1:-3]
        if session_title not in session_dicts:
            session_dicts[session_title] = {}
        scalar_dict = {}
        with h5py.File(base_dir + '/' + session, 'r') as f:
            scalars = f['scalars']
            for scalar in scalars:
                # read into memory ([:]) so the data survives the file closing
                scalar_dict[scalar] = scalars[scalar][:]
            # timestamps = f['timestamps']
            # print(timestamps)
        scalar_df = pd.DataFrame(scalar_dict)
        if save_to_csv:
            scalar_df.to_csv('./scalar_csv/' + condition_session_title + ".csv")
        if session.split("-")[0] not in session_dicts[session_title]:
            if raw:
                session_dicts[session_title] = scalar_df
            else:
                session_dicts[session_title][session.split("-")[0]] = scalar_df
        # break  # for testing
    print("[EXTRACTED", len(session_dicts), "SESSIONS WITH CONTROL, STIM AND POST]")
    return session_dicts
# ./finals -- The grouped results (control, stim, post)
# ../../ -- raw results
# extracted_dicts = extract_scalars('./finals', save_to_csv=False)
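# Accessing the result (a sketch; 'session_x' is a placeholder session name,
# not taken from this file):
# extracted = extract_scalars('./finals')
# control_df = extracted['session_x']['control']   # DataFrame of scalar traces
# print(control_df.mean())                         # per-scalar means for the control block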