Skip to content

Commit 54a9241

Browse files
committed
data_import.py:
- Session object now has attribute 'prints' which is a list of namedtuples with attributes 'time' and 'string' containing the output of user print lines. - Session object now has attribute 'variables_df' which is a dataframe containing the values of variables output from the task. - The time units for imported data can now be specified to be 'second' or 'ms', default is 'second'. - Removed int_subject_IDs argument from Session and Experiment classes. Added file 'analog_temp2npy.py' to tools folder which when run converts any analog .temp files (created e.g. if the GUI was closed / crashed during data acquisition) to numpy files.
1 parent 4c3317d commit 54a9241

File tree

3 files changed

+115
-47
lines changed

3 files changed

+115
-47
lines changed

com/data_logger.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,10 @@ def open_data_file(self, data_dir, experiment_name, setup_ID, subject_ID, dateti
3535
self.write_info_line('experiment_name', self.experiment_name)
3636
self.write_info_line('task_name', self.sm_info['name'])
3737
self.write_info_line('task_file_hash', self.sm_info['task_hash'])
38-
self.write_info_line('setup_ID', self.setup_ID)
38+
self.write_info_line('setup_id', self.setup_ID)
3939
self.write_info_line('framework_version', self.sm_info['framework_version'])
4040
self.write_info_line('micropython_version', self.sm_info['micropython_version'])
41-
self.write_info_line('subject_ID', self.subject_ID)
41+
self.write_info_line('subject_id', self.subject_ID)
4242
self.write_info_line('start_time', datetime.utcnow().isoformat(timespec='milliseconds'))
4343
self.analog_writers = {ID:
4444
Analog_writer(ai['name'], ai['fs'], ai['dtype'], self.file_path)
@@ -129,7 +129,7 @@ def open_data_files(self, session_filepath):
129129
ses_path_stem, file_ext = os.path.splitext(session_filepath)
130130
self.path_stem = ses_path_stem + f'_{self.name}'
131131
self.t_tempfile_path = self.path_stem + '.time.temp'
132-
self.d_tempfile_path = self.path_stem + f'.data.1{self.data_type}.temp'
132+
self.d_tempfile_path = self.path_stem + f'.data-1{self.data_type}.temp'
133133
self.time_tempfile = open(self.t_tempfile_path, 'wb')
134134
self.data_tempfile = open(self.d_tempfile_path, 'wb')
135135
self.next_chunk_start_time = 0

tools/analog_temp2npy.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import os
2+
import numpy as np
3+
4+
def find_files_with_extension(folder_path, extension):
    '''Collect the paths of all files whose name ends with the given
    extension, searching folder_path and every sub-folder beneath it.'''
    return [os.path.join(dir_path, file_name)
            for dir_path, _, file_names in os.walk(folder_path)
            for file_name in file_names
            if file_name.endswith(extension)]
13+
14+
15+
def tempfile2npy(file_path):
    '''Convert a single temp file to a .npy file, then delete the temp file.

    The file type is the second to last dot-separated component of the file
    name. If it is 'time' the file is read as float64 timestamps and saved as
    '<stem>.time.npy'. Otherwise the file is treated as a data samples file
    whose numpy dtype code is the last character of that component, and it is
    saved as '<stem>.data.npy'.

    Temp files left behind by a crash may end part way through a sample, so
    trailing bytes that do not make up a whole sample are discarded instead of
    letting np.frombuffer raise a ValueError.
    '''
    file_type = file_path.split('.')[-2]
    path_stem = file_path.rsplit('.', 2)[0]
    if file_type == 'time':  # Timestamp file.
        dtype, suffix = np.dtype('float64'), '.time.npy'
    else:  # Data samples file.
        dtype, suffix = np.dtype(file_type[-1]), '.data.npy'
    with open(file_path, 'rb') as f:
        raw = f.read()
    n_whole = len(raw) - len(raw) % dtype.itemsize  # Bytes forming complete samples.
    np.save(path_stem + suffix, np.frombuffer(raw[:n_whole], dtype=dtype))
    os.remove(file_path)  # Only reached once the .npy file has been written.
28+
29+
30+
def all_tempfile2numpy(folder_path):
    '''Convert every .temp file found in the specified folder to .npy'''
    for temp_path in find_files_with_extension(folder_path, '.temp'):
        tempfile2npy(temp_path)
35+
36+
37+
if __name__ == "__main__":
    # Script entry point: convert all temp files in the data folder to npy,
    # report the outcome, then wait for the user before closing.
    print('Converting .temp files to .npy')
    try:
        all_tempfile2numpy(os.path.join('..', 'data'))
        print('\nFiles converted successfully.')
    except Exception as error:
        print('\nUnable to convert files.')
        print(error)
    input('\nPress enter to close.')
48+

tools/data_import.py

Lines changed: 64 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,7 @@
1010
from collections import namedtuple
1111

1212
Event = namedtuple('Event', ['time','name'])
13-
14-
Variable = namedtuple('Variable', ['time', 'type', 'value'])
15-
16-
Print = namedtuple('Print', ['time','value'])
13+
Print = namedtuple('Print', ['time','string'])
1714

1815
#----------------------------------------------------------------------------------
1916
# Session class
@@ -26,8 +23,6 @@ class Session():
2623
- experiment_name
2724
- task_name
2825
- subject_ID
29-
If argument int_subject_IDs is True, suject_ID is stored as an integer,
30-
otherwise subject_ID is stored as a string.
3126
- datetime
3227
The date and time that the session started stored as a datetime object.
3328
- datetime_string
@@ -40,12 +35,16 @@ class Session():
4035
A dictionary with keys that are the names of the framework events and states and
4136
corresponding values which are Numpy arrays of all the times (in the unit set by time_unit, since the
4237
start of the framework run) at which each event/state entry occurred.
43-
- print_lines
44-
A list of all the lines output by print statements during the framework run, each line starts
45-
with the time in milliseconds at which it was printed.
38+
- prints
39+
A list of named tuples for every user print statement output with fields 'time' and 'string'.
40+
- variables_df
41+
A Pandas dataframe containing the values of variables output by the task. For .txt files
42+
only variables output by the print_variables function are included.
4643
'''
4744

48-
def __init__(self, file_path, int_subject_IDs=False):
45+
def __init__(self, file_path, time_unit='second'):
46+
47+
assert time_unit in ('second','ms'), 'time_unit must be "second" or "ms"'
4948

5049
print('Importing data file: '+os.path.split(file_path)[1])
5150
self.file_name = os.path.split(file_path)[1]
@@ -62,8 +61,9 @@ def __init__(self, file_path, int_subject_IDs=False):
6261

6362
self.experiment_name = next(line for line in info_lines if 'Experiment name' in line).split(' : ')[1]
6463
self.task_name = next(line for line in info_lines if 'Task name' in line).split(' : ')[1]
65-
subject_ID_string = next(line for line in info_lines if 'Subject ID' in line).split(' : ')[1]
64+
self.subject_ID = next(line for line in info_lines if 'Subject ID' in line).split(' : ')[1]
6665
datetime_string = next(line for line in info_lines if 'Start date' in line).split(' : ')[1]
66+
self.time_unit = time_unit
6767

6868
self.datetime = datetime.strptime(datetime_string, '%Y/%m/%d %H:%M:%S')
6969

@@ -76,46 +76,63 @@ def __init__(self, file_path, int_subject_IDs=False):
7676

7777
data_lines = [line[2:].split(' ') for line in all_lines if line[0]=='D']
7878

79-
self.events = [Event(int(dl[0]), ID2name[int(dl[1])]) for dl in data_lines]
79+
self.events = [Event(int(dl[0]) if time_unit=='ms' else int(dl[0])/1000,
80+
ID2name[int(dl[1])]) for dl in data_lines]
8081

8182
self.times = {event_name: np.array([ev.time for ev in self.events if ev.name == event_name])
8283
for event_name in ID2name.values()}
8384

84-
self.print_lines = [line[2:].split(' ', 1) for line in all_lines if line[0]=='P']
85-
85+
print_lines = [line[2:].split(' ', 1) for line in all_lines if line[0]=='P']
86+
var_dicts = []
87+
var_times = []
8688
self.prints = []
87-
self.variables = []
8889

89-
for print_line in self.print_lines:
90-
try:
91-
value = json.loads(print_line[1])
92-
self.variables.append(Variable(time=int(print_line[0]),type='print',value=value))
93-
except json.JSONDecodeError:
94-
self.prints.append(Print(time=int(print_line[0]),value=print_line[1]))
90+
for print_line in print_lines:
91+
print_time = int(print_line[0]) if time_unit=='ms' else int(print_line[0])/1000
92+
try: # Output of print_variables function.
93+
var_dicts.append(json.loads(print_line[1]))
94+
var_times.append(print_time)
95+
except json.JSONDecodeError: # Output of user print function.
96+
self.prints.append(Print(print_time, print_line[1]))
9597

98+
# Create variables dataframe.
99+
100+
self.variables_df = pd.DataFrame(var_dicts)
101+
columns = self.variables_df.columns
102+
self.variables_df.columns = pd.MultiIndex.from_arrays([['values']*len(columns),columns])
103+
self.variables_df.insert(0,'operation', ['print']*len(self.variables_df))
104+
self.variables_df.insert(0,'time', var_times)
105+
self.variables_df.reset_index()
106+
96107
elif os.path.splitext(file_path)[1] == '.tsv':
97108

98109
# Load tsv file to pandas dataframe.
99110

100111
df = pd.read_csv(file_path, delimiter='\t')
112+
113+
if time_unit == 'ms':
114+
df = df.loc[df['type'] != 'warning', :] # Warning rows have nan time so can't convert to int.
115+
df['time'] = (df['time']*1000).astype(int)
101116

102117
# Extract and store session information.
103118

104119
self.experiment_name = df.loc[(df["type"]=="info") & (df["name"]=="experiment_name"), "value"].item()
105120
self.task_name = df.loc[(df["type"]=="info") & (df["name"]=="task_name" ), "value"].item()
106-
subject_ID_string = df.loc[(df["type"]=="info") & (df["name"]=="subject_id" ), "value"].item()
121+
self.subject_ID = df.loc[(df["type"]=="info") & (df["name"]=="subject_id" ), "value"].item()
107122
datetime_string = df.loc[(df["type"]=="info") & (df["name"]=="start_time" ), "value"].item()
108123

109124
self.datetime = datetime.fromisoformat(datetime_string)
110125

111126
# Extract and store session data.
112127

113-
self.events = [Event(int(row['time']*1000), row['name']) for i,row
128+
self.events = [Event(row['time'], row['name']) for i,row
114129
in df[df['type'].isin(['state', 'event'])].iterrows()]
115130

116131
self.times = {event_name: np.array([ev.time for ev in self.events if ev.name == event_name])
117132
for event_name in df.loc[df['type'].isin(['state', 'event']), 'name'].unique()}
118133

134+
self.prints = [Print(row.time, row.value) for row in df.loc[df.type=='print',:].itertuples()]
135+
119136
# Create variables dataframe.
120137

121138
df.loc[df['type']=='variable', 'value'] = df.loc[df['type']=='variable', 'value'].apply(json.loads) # Convert variables row value fields to dicts.
@@ -126,13 +143,6 @@ def __init__(self, file_path, int_subject_IDs=False):
126143
self.variables_df.insert(0,'time', df.loc[df['type']=='variable', 'time'].tolist())
127144
self.variables_df.reset_index()
128145

129-
# Common to both filetypes.
130-
131-
if int_subject_IDs: # Convert subject ID string to integer.
132-
self.subject_ID = int(''.join([i for i in subject_ID_string if i.isdigit()]))
133-
else:
134-
self.subject_ID = subject_ID_string
135-
136146
self.datetime_string = self.datetime.strftime('%Y-%m-%d %H:%M:%S')
137147

138148

@@ -141,14 +151,14 @@ def __init__(self, file_path, int_subject_IDs=False):
141151
#----------------------------------------------------------------------------------
142152

143153
class Experiment():
144-
def __init__(self, folder_path, int_subject_IDs=True):
154+
def __init__(self, folder_path, time_unit='second'):
145155
'''
146156
Import all sessions from specified folder to create experiment object. Only sessions in the
147157
specified folder (not in subfolders) will be imported.
148158
Arguments:
149159
folder_path: Path of data folder.
150-
int_subject_IDs: If True subject IDs are converted to integers, e.g. m012 is converted to 12.
151160
'''
161+
assert time_unit in ('second','ms'), 'time_unit must be "second" or "ms"'
152162

153163
self.folder_name = os.path.split(folder_path)[1]
154164
self.path = folder_path
@@ -160,18 +170,19 @@ def __init__(self, folder_path, int_subject_IDs=True):
160170
with open(os.path.join(self.path, 'sessions.pkl'),'rb') as sessions_file:
161171
self.sessions = pickle.load(sessions_file)
162172
print('Saved sessions loaded from: sessions.pkl')
173+
assert self.sessions[0].time_unit == time_unit, 'time_unit of saved sessions does not match time_unit argument.'
163174
except IOError:
164175
pass
165176

166177
old_files = [session.file_name for session in self.sessions]
167178
files = os.listdir(self.path)
168-
new_files = [f for f in files if f[-4:] == '.txt' and f not in old_files]
179+
new_files = [f for f in files if f[-4:] in ('.txt', '.tsv') and f not in old_files]
169180

170181
if len(new_files) > 0:
171182
print('Loading new data files..')
172183
for file_name in new_files:
173184
try:
174-
self.sessions.append(Session(os.path.join(self.path, file_name), int_subject_IDs))
185+
self.sessions.append(Session(os.path.join(self.path, file_name), time_unit))
175186
except Exception as error_message:
176187
print('Unable to import file: ' + file_name)
177188
print(error_message)
@@ -275,7 +286,7 @@ def _toDate(d): # Convert input to datetime.date object.
275286
# Session Dataframe
276287
#----------------------------------------------------------------------------------
277288

278-
def session_dataframe(file_path, paired_events={}, pair_end_suffix=None):
289+
def session_dataframe(file_path, paired_events={}, pair_end_suffix=None, time_unit='second'):
279290
'''Generate a pandas dataframe from a pyControl data file (.txt or .tsv)
280291
containing the sessions data. The data frame has columns:
281292
type : Whether the row contains session 'info', a 'state' entry,
@@ -306,6 +317,7 @@ def session_dataframe(file_path, paired_events={}, pair_end_suffix=None):
306317
-------
307318
df : session dataframe
308319
'''
320+
assert time_unit in ('second','ms'), 'time_unit must be "second" or "ms"'
309321

310322
# Load data from file.
311323

@@ -335,20 +347,20 @@ def session_dataframe(file_path, paired_events={}, pair_end_suffix=None):
335347
'value' : value})
336348
elif line[0] == 'D': # Data line.
337349
timestamp, ID = [int(i) for i in line.split(' ')[1:]]
338-
line_dicts.append({'time' : timestamp/1000,
350+
line_dicts.append({'time' : timestamp if time_unit == 'ms' else timestamp/1000,
339351
'type' : 'state' if ID in state_IDs.values() else 'event',
340352
'name' : ID2name[ID]})
341353
elif line[0] == 'P': # Print line.
342354
time_str, print_str = line[2:].split(' ',1)
343-
try:
355+
timestamp = int(time_str)
356+
try: # print_variables output.
344357
value_dict = json.loads(print_str)
345-
line_dicts.append({'time' : int(time_str)/1000,
358+
line_dicts.append({'time' : timestamp if time_unit == 'ms' else timestamp/1000,
346359
'type' : 'variable',
347360
'name' : 'print',
348361
'value' : value_dict})
349-
350-
except json.JSONDecodeError:
351-
line_dicts.append({'time' : int(time_str)/1000,
362+
except json.JSONDecodeError: # User print string.
363+
line_dicts.append({'time' : timestamp if time_unit == 'ms' else timestamp/1000,
352364
'type' : 'print',
353365
'value' : print_str})
354366

@@ -357,6 +369,11 @@ def session_dataframe(file_path, paired_events={}, pair_end_suffix=None):
357369
elif os.path.splitext(file_path)[1] == '.tsv': # Load data from .tsv file.
358370

359371
df = pd.read_csv(file_path, delimiter='\t')
372+
373+
if time_unit == 'ms':
374+
df = df.loc[df['type'] != 'warning', :] # Warning rows have nan time so can't convert to int.
375+
df['time'] = (df['time']*1000).astype(int)
376+
360377
# Convert variables row value fields to dicts from json strings.
361378
df.loc[df['type']=='variable', 'value'] = df.loc[df['type']=='variable', 'value'].apply(json.loads)
362379

@@ -401,7 +418,8 @@ def session_dataframe(file_path, paired_events={}, pair_end_suffix=None):
401418
# Experiment dataframe
402419
#----------------------------------------------------------------------------------
403420

404-
def experiment_dataframe(folder_path, paired_events={}, pair_end_suffix=None):
421+
def experiment_dataframe(folder_path, paired_events={}, pair_end_suffix=None,
422+
time_unit='second'):
405423
'''Generate a pandas dataframe from a pyControl experiment comprising
406424
many session data files in a folder. The experiment dataframe has the
407425
same columns as the session dataframe ('type', 'name', 'time', 'duration',
@@ -432,19 +450,21 @@ def experiment_dataframe(folder_path, paired_events={}, pair_end_suffix=None):
432450
-------
433451
df : session dataframe
434452
'''
435-
session_filenames = [f for f in os.listdir(folder_path) if f[-4:] == '.txt']
453+
assert time_unit in ('second','ms'), 'time_unit must be "second" or "ms"'
454+
session_filenames = [f for f in os.listdir(folder_path) if f[-4:] in ('.txt', '.tsv')]
436455
session_dataframes = []
437456
for session_filename in session_filenames:
438457
# Make session dataframe.
439458
session_df = session_dataframe(os.path.join(folder_path,session_filename),
440-
paired_events=paired_events, pair_end_suffix=pair_end_suffix)
459+
paired_events=paired_events, pair_end_suffix=pair_end_suffix, time_unit=time_unit)
441460
# Convert info rows to columns.
442461
info_rows = session_df[session_df['type']=='info']
443462
session_df = session_df[session_df['type']!='info']
444463
for name,value in zip(info_rows['name'], info_rows['value']):
445464
session_df[name] = value
446465
session_dataframes.append(session_df)
447466
experiment_df = pd.concat(session_dataframes, axis=0)
467+
experiment_df.reset_index(drop=True, inplace=True)
448468
return experiment_df
449469

450470
#----------------------------------------------------------------------------------

0 commit comments

Comments
 (0)