Skip to content

Commit 5d046d8

Browse files
committed
Added experiment_dataframe function to data_import.py which creates a single dataframe containing data from many sessions contained in an experiment folder.
1 parent e26b792 commit 5d046d8

File tree

1 file changed

+151
-104
lines changed

1 file changed

+151
-104
lines changed

tools/data_import.py

Lines changed: 151 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -82,110 +82,6 @@ def __init__(self, file_path, int_subject_IDs=True):
8282

8383
self.print_lines = [line[2:] for line in all_lines if line[0]=='P']
8484

85-
#----------------------------------------------------------------------------------
86-
# Session Dataframe
87-
#----------------------------------------------------------------------------------
88-
89-
def session_dataframe(file_path, paired_events={}, pair_end_suffix=None):
    '''Generate a pandas dataframe from a pyControl data file containing the
    session's data.  The dataframe has columns:

    type     : Whether the row contains session 'info', a 'state' entry,
               'event' or 'print' line.
    name     : The name of the state, event or session information in the row.
    time     : The time the row occurred in ms since the session start.
    duration : The duration in ms of states and paired events (see below).
    value    : The contents of 'info' and 'print' rows.

    Optionally events can be specified as coming in pairs corresponding to the
    start and end of an action, e.g. entering and exiting a nosepoke.  When a
    start-event end-event pair occurs in the data, only the start event generates
    a row in the dataframe, with the end event used to compute the duration.
    An end event with no preceding unmatched start event is kept as its own row.

    Parameters
    ----------
    file_path : path to pyControl data file.

    paired_events : Optional dict specifying paired events as
                    {start_event: end_event}, e.g.
                    {'poke_1_in':'poke_1_out', 'poke_2_in':'poke_2_out'}.
                    The dict passed in is not modified.

    pair_end_suffix : Optional string specifying a suffix used to indicate the
                      end event of paired events that share a common stem, e.g.
                      the pair {'poke_1_in':'poke_1_out'} would be found
                      automatically using pair_end_suffix='_out'.

    Returns
    -------
    df : session dataframe, indexed 0..n-1.

    Raises
    ------
    StopIteration : if the file has no state ('S') or event ('E') ID line.
    ValueError    : if an info line does not contain ' : ', or the state/event
                    ID line is not a plain dict literal.
    '''
    import ast  # Local import so this function does not depend on the file's import block.

    # Work on a copy: pairs discovered via pair_end_suffix are added to this dict,
    # and must leak neither into the caller's dict nor into the shared default.
    paired_events = dict(paired_events) if paired_events else {}

    # Load data from file.
    with open(file_path, 'r') as f:
        print('Importing data file: '+os.path.split(file_path)[1])
        all_lines = [line.strip() for line in f if line.strip()]

    # Parse the state and event ID dictionaries.
    # NOTE: the original code used eval() here; the ID lines are plain dict
    # literals, so ast.literal_eval parses them without executing file contents.
    state_IDs = ast.literal_eval(next(line for line in all_lines if line[0]=='S')[2:])
    event_IDs = ast.literal_eval(next(line for line in all_lines if line[0]=='E')[2:])
    ID2name = {v: k for k, v in {**state_IDs, **event_IDs}.items()}
    state_ID_set = set(state_IDs.values())

    # Make dataframe.
    line_dicts = []
    for line in all_lines:
        if line[0] == 'I': # Info line.
            # Split on the first separator only so values may contain ' : '.
            name, value = line[2:].split(' : ', 1)
            line_dicts.append({'type'  : 'info',
                               'name'  : name,
                               'value' : value})
        elif line[0] == 'D': # Data line.
            timestamp, ID = [int(i) for i in line.split(' ')[1:]]
            line_dicts.append({'type' : 'state' if ID in state_ID_set else 'event',
                               'name' : ID2name[ID],
                               'time' : timestamp})
        elif line[0] == 'P': # Print line.
            # partition() yields an empty string for a print line with no text.
            time_str, _, text = line[2:].partition(' ')
            line_dicts.append({'type'  : 'print',
                               'time'  : int(time_str),
                               'value' : text})

    # Fix the column set up front so every column exists even if the file has
    # no rows of a given kind (e.g. no data lines).
    df = pd.DataFrame(line_dicts).reindex(
        columns=['type','name','time','duration','value'])

    # Add state durations: time from each state entry to the next state entry.
    # The final state has no following entry so its duration stays NaN.
    is_state = df['type'] == 'state'
    if is_state.any():
        df.loc[is_state,'duration'] = -df.loc[is_state,'time'].diff(-1)

    # Find paired events with specified pair end suffix.
    if pair_end_suffix:
        for end_event in event_IDs:
            if not end_event.endswith(pair_end_suffix):
                continue
            stem = end_event[:-len(pair_end_suffix)]
            start_event = next((ev for ev in event_IDs
                                if ev.startswith(stem) and ev != end_event), None)
            if start_event is not None: # Matching start event found.
                paired_events[start_event] = end_event

    # Compute paired event durations and remove end events.
    if paired_events:
        end2start = {end: start for start, end in paired_events.items()}
        open_starts = {} # start event name -> (row index, time) of latest unmatched start.
        end_inds = []
        for i, name, time in zip(df.index, df['name'], df['time']):
            if name in paired_events: # Pair start event.
                open_starts[name] = (i, time)
            elif name in end2start: # Pair end event.
                start = open_starts.pop(end2start[name], None)
                if start is not None:
                    start_ind, start_time = start
                    df.loc[start_ind,'duration'] = time - start_time
                    end_inds.append(i)
        df.drop(index=end_inds, inplace=True)

    # Reset index (reset_index returns a new frame, so the result must be kept)
    # and set column order.
    df = df.reset_index(drop=True)
    df = df.reindex(columns=['type','name','time','duration','value'])
    return df
188-
18985
#----------------------------------------------------------------------------------
19086
# Experiment class
19187
#----------------------------------------------------------------------------------
@@ -321,6 +217,157 @@ def _toDate(d): # Convert input to datetime.date object.
321217
else:
322218
raise ValueError('Unable to convert input to date.')
323219

220+
#----------------------------------------------------------------------------------
221+
# Session Dataframe
222+
#----------------------------------------------------------------------------------
223+
224+
def session_dataframe(file_path, paired_events={}, pair_end_suffix=None):
    '''Generate a pandas dataframe from a pyControl data file containing the
    session's data.  The dataframe has columns:

    type     : Whether the row contains session 'info', a 'state' entry,
               'event' or 'print' line.
    name     : The name of the state, event or session information in the row.
    time     : The time the row occurred in ms since the session start.
    duration : The duration in ms of states and paired events (see below).
    value    : The contents of 'info' and 'print' rows.

    Optionally events can be specified as coming in pairs corresponding to the
    start and end of an action, e.g. entering and exiting a nosepoke.  When a
    start-event end-event pair occurs in the data, only the start event generates
    a row in the dataframe, with the end event used to compute the duration.
    An end event with no preceding unmatched start event is kept as its own row.

    Parameters
    ----------
    file_path : path to pyControl data file.

    paired_events : Optional dict specifying paired events as
                    {start_event: end_event}, e.g.
                    {'poke_1_in':'poke_1_out', 'poke_2_in':'poke_2_out'}.
                    The dict passed in is not modified.

    pair_end_suffix : Optional string specifying a suffix used to indicate the
                      end event of paired events that share a common stem, e.g.
                      the pair {'poke_1_in':'poke_1_out'} would be found
                      automatically using pair_end_suffix='_out'.

    Returns
    -------
    df : session dataframe, indexed 0..n-1.

    Raises
    ------
    StopIteration : if the file has no state ('S') or event ('E') ID line.
    ValueError    : if an info line does not contain ' : ', or the state/event
                    ID line is not a plain dict literal.
    '''
    import ast  # Local import so this function does not depend on the file's import block.

    # Work on a copy: pairs discovered via pair_end_suffix are added to this dict,
    # and must leak neither into the caller's dict nor into the shared default.
    paired_events = dict(paired_events) if paired_events else {}

    # Load data from file.
    with open(file_path, 'r') as f:
        print('Importing data file: '+os.path.split(file_path)[1])
        all_lines = [line.strip() for line in f if line.strip()]

    # Parse the state and event ID dictionaries.
    # NOTE: the original code used eval() here; the ID lines are plain dict
    # literals, so ast.literal_eval parses them without executing file contents.
    state_IDs = ast.literal_eval(next(line for line in all_lines if line[0]=='S')[2:])
    event_IDs = ast.literal_eval(next(line for line in all_lines if line[0]=='E')[2:])
    ID2name = {v: k for k, v in {**state_IDs, **event_IDs}.items()}
    state_ID_set = set(state_IDs.values())

    # Make dataframe.
    line_dicts = []
    for line in all_lines:
        if line[0] == 'I': # Info line.
            # Split on the first separator only so values may contain ' : '.
            name, value = line[2:].split(' : ', 1)
            line_dicts.append({'type'  : 'info',
                               'name'  : name,
                               'value' : value})
        elif line[0] == 'D': # Data line.
            timestamp, ID = [int(i) for i in line.split(' ')[1:]]
            line_dicts.append({'type' : 'state' if ID in state_ID_set else 'event',
                               'name' : ID2name[ID],
                               'time' : timestamp})
        elif line[0] == 'P': # Print line.
            # partition() yields an empty string for a print line with no text.
            time_str, _, text = line[2:].partition(' ')
            line_dicts.append({'type'  : 'print',
                               'time'  : int(time_str),
                               'value' : text})

    # Fix the column set up front so every column exists even if the file has
    # no rows of a given kind (e.g. no data lines).
    df = pd.DataFrame(line_dicts).reindex(
        columns=['type','name','time','duration','value'])

    # Add state durations: time from each state entry to the next state entry.
    # The final state has no following entry so its duration stays NaN.
    is_state = df['type'] == 'state'
    if is_state.any():
        df.loc[is_state,'duration'] = -df.loc[is_state,'time'].diff(-1)

    # Find paired events with specified pair end suffix.
    if pair_end_suffix:
        for end_event in event_IDs:
            if not end_event.endswith(pair_end_suffix):
                continue
            stem = end_event[:-len(pair_end_suffix)]
            start_event = next((ev for ev in event_IDs
                                if ev.startswith(stem) and ev != end_event), None)
            if start_event is not None: # Matching start event found.
                paired_events[start_event] = end_event

    # Compute paired event durations and remove end events.
    if paired_events:
        end2start = {end: start for start, end in paired_events.items()}
        open_starts = {} # start event name -> (row index, time) of latest unmatched start.
        end_inds = []
        for i, name, time in zip(df.index, df['name'], df['time']):
            if name in paired_events: # Pair start event.
                open_starts[name] = (i, time)
            elif name in end2start: # Pair end event.
                start = open_starts.pop(end2start[name], None)
                if start is not None:
                    start_ind, start_time = start
                    df.loc[start_ind,'duration'] = time - start_time
                    end_inds.append(i)
        df.drop(index=end_inds, inplace=True)

    # Reset index (reset_index returns a new frame, so the result must be kept)
    # and set column order.
    df = df.reset_index(drop=True)
    df = df.reindex(columns=['type','name','time','duration','value'])
    return df
321+
322+
#----------------------------------------------------------------------------------
323+
# Experiment dataframe
324+
#----------------------------------------------------------------------------------
325+
326+
def experiment_dataframe(folder_path, paired_events={}, pair_end_suffix=None):
    '''Generate a pandas dataframe from a pyControl experiment comprising
    many session data files in a folder.  The experiment dataframe has the
    same columns as the session dataframe ('type', 'name', 'time', 'duration',
    'value'), with one additional column per info line found in each session's
    data file (named after the info line, holding its value for every row of
    that session).  Each row of the dataframe corresponds to a single state
    entry, event or print line from a single session.

    As with the session_dataframe function, events can optionally be specified
    as coming in pairs corresponding to the start and end of an action, e.g.
    entering and exiting a nosepoke.  When a start-event end-event pair occurs
    in the data, only the start event generates a row in the dataframe, with
    the end event used to compute the duration.

    Parameters
    ----------
    folder_path : path to experiment data folder.  Every file in the folder
                  whose name ends in '.txt' is loaded as a session.

    paired_events : Optional dict specifying paired events as
                    {start_event: end_event}, e.g.
                    {'poke_1_in':'poke_1_out', 'poke_2_in':'poke_2_out'}.
                    The dict passed in is not modified.

    pair_end_suffix : Optional string specifying a suffix used to indicate the
                      end event of paired events that share a common stem, e.g.
                      the pair {'poke_1_in':'poke_1_out'} would be found
                      automatically using pair_end_suffix='_out'.

    Returns
    -------
    df : experiment dataframe, indexed 0..n-1 across all sessions.

    Raises
    ------
    ValueError : if the folder contains no '.txt' session files.
    '''
    # Sort so that row order does not depend on the OS directory listing order.
    session_filenames = sorted(f for f in os.listdir(folder_path) if f.endswith('.txt'))
    if not session_filenames:
        # pd.concat would raise an opaque 'No objects to concatenate' ValueError.
        raise ValueError('No .txt session data files found in folder: ' + str(folder_path))
    session_dataframes = []
    for session_filename in session_filenames:
        # Make session dataframe.  A fresh copy of paired_events is passed for
        # each session so pairs added while processing one file cannot leak
        # into later files, the caller's dict or the shared default argument.
        session_df = session_dataframe(os.path.join(folder_path,session_filename),
                        paired_events=dict(paired_events), pair_end_suffix=pair_end_suffix)
        # Convert info rows to columns.
        info_rows = session_df[session_df['type']=='info']
        # Copy the filtered frame so the column assignments below act on an
        # independent dataframe rather than a slice of the original.
        session_df = session_df[session_df['type']!='info'].copy()
        for name,value in zip(info_rows['name'], info_rows['value']):
            session_df[name] = value
        session_dataframes.append(session_df)
    # ignore_index gives each row a unique label; otherwise every session's
    # index labels would be repeated in the concatenated dataframe.
    experiment_df = pd.concat(session_dataframes, axis=0, ignore_index=True)
    return experiment_df
324371

325372
#----------------------------------------------------------------------------------
326373
# Load analog data

0 commit comments

Comments
 (0)