-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextractors.py
149 lines (118 loc) · 4.9 KB
/
extractors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# -*- coding: utf-8 -*-
"""functions and classes to extract info from HDF stores."""
import os
from abc import ABC, abstractmethod
from typing import List, Tuple
import pandas as pd
import dfget
class AbstractExtractor(ABC):
    """Interface for objects that extract a dataframe from an HDF store.

    Subclasses implement :meth:`extract`. Instances are callable; calling
    one simply forwards to :meth:`extract`.
    """

    @abstractmethod
    def extract(self, store_path) -> Tuple[pd.DataFrame, dict]:
        """Extract information from a store.

        Args:
            store_path: Location of the HDF store to read.

        Returns:
            A 2-tuple: the extracted dataframe first, then the folder data
            that accompanies it in the store.
        """

    def __call__(self, *args, **kwargs):
        """Extractors naturally call .extract."""
        return self.extract(*args, **kwargs)
class MainExtractor(AbstractExtractor):
    """Read the 'main_data' table, optionally sliced and reduced to fields."""

    def __init__(self, timeslice: slice = None, fields: List[str] = None):
        """Remember the optional row slice and field selection.

        When ``fields`` is given, the selector returned by
        ``dfget.gen(fields)`` is built once here and reused by ``extract``.
        """
        self.timeslice = timeslice
        self.fields = fields
        if fields is not None:
            self.func = dfget.gen(fields)

    def extract(self, store_path) -> Tuple[pd.DataFrame, dict]:
        """Return ``(main_data, folder_data)`` read from the HDF store.

        The main table is first indexed with ``timeslice`` (if set) and then
        passed through the field selector (if ``fields`` is set).
        """
        with pd.HDFStore(store_path, 'r') as hdf:
            meta = hdf["folder_data"]
            frame = hdf["main_data"]
            if self.timeslice is not None:
                frame = frame[self.timeslice]
            if self.fields is not None:
                frame = self.func(frame)
        return frame, meta

    def __call__(self, *args, **kwargs):
        """Calling the extractor is shorthand for ``extract``."""
        return self.extract(*args, **kwargs)
class ClusterExtractor(AbstractExtractor):
    """Build one dataframe by stacking the per-timestep tables.

    Tables are read from keys ``df2_NNNNNN`` (typed) or ``df_NNNNNN``
    (untyped), where NNNNNN is the zero-padded timestep index.
    """

    def __init__(self, typed=False, timeslice: slice = None,
                 fields: List[str] = None, write_timesteps=True):
        """Configure key prefix, timestep slice, fields and timestep tagging.

        write_timesteps=True: add a "timestep" column to every table.
        """
        self.ext_name = "df2_" if typed else "df_"
        self.write_timesteps = write_timesteps
        self.timeslice = timeslice
        self.fields = fields
        if fields is not None:
            self.func = dfget.gen(fields)

    def extract(self, store_path) -> Tuple[pd.DataFrame, dict]:
        """Return ``(stacked_tables, folder_data)`` read from the HDF store.

        The number of timesteps is taken from the length of 'main_data';
        ``timeslice`` (if set) selects which of those indices are read.
        """
        with pd.HDFStore(store_path, 'r') as hdf:
            meta = hdf["folder_data"]
            steps = range(len(hdf["main_data"]))
            if self.timeslice is not None:
                steps = steps[self.timeslice]

            def iter_tables():
                # Lazily load one table per selected timestep.
                for step in steps:
                    table = hdf[f"{self.ext_name}{step:06}"]
                    if self.fields is not None:
                        table = self.func(table)
                    if self.write_timesteps:
                        table["timestep"] = step
                    yield table

            # Consume the generator while the store is still open.
            stacked = pd.concat(iter_tables(), ignore_index=True)
        return stacked, meta

    def __call__(self, *args, **kwargs):
        """Calling the extractor is shorthand for ``extract``."""
        return self.extract(*args, **kwargs)
class ClusterExtractorAcc(AbstractExtractor):
    """Extract the average of the per-timestep tables.

    Tables are read from keys ``df2_NNNNNN`` (typed) or ``df_NNNNNN``
    (untyped), summed element-wise and divided by the number of timesteps
    that were read.
    """

    def __init__(self, typed=False, timeslice: slice = None,
                 fields: List[str] = None):
        """Configure key prefix, timestep slice and field selection.

        Args:
            typed: Read ``df2_*`` keys when true, ``df_*`` keys otherwise.
            timeslice: Optional slice selecting which timestep indices to
                average; all timesteps are used when omitted.
            fields: Optional field names handed to ``dfget.gen`` to build
                the selector applied to every table.
        """
        if typed:
            self.ext_name = "df2_"
        else:
            self.ext_name = "df_"
        self.fields = fields
        self.timeslice = timeslice
        # Build the selector only when fields were requested; extract()
        # never touches self.func otherwise.
        if fields is not None:
            self.func = dfget.gen(fields)

    def extract(self, store_path) -> Tuple[pd.DataFrame, dict]:
        """Extract cluster data from HDF store: average over timesteps.

        Args:
            store_path: Location of the HDF store to read.

        Returns:
            ``(mean_table, folder_data)``.

        Raises:
            ValueError: If no timestep is selected (empty store or a
                ``timeslice`` that matches nothing).
        """
        with pd.HDFStore(store_path, 'r') as store:
            folder_data = store["folder_data"]
            num_sims = len(store["main_data"])
            rng = range(num_sims)
            if self.timeslice is not None:
                rng = rng[self.timeslice]
            # Without this guard sum() yields the int 0 and the division
            # below fails with an uninformative ZeroDivisionError.
            if not rng:
                raise ValueError("No timesteps selected to average")

            def gen_store_with_time():
                for num in rng:
                    df = store[f"{self.ext_name}{num:06}"]
                    if self.fields is not None:
                        df = self.func(df)
                    yield df

            # The generator must be consumed while the store is open.
            df = sum(gen_store_with_time())
            df /= len(rng)
        return df, folder_data

    def __call__(self, *args, **kwargs):
        """Extractors naturally call .extract."""
        return self.extract(*args, **kwargs)
class InfoExtractor(AbstractExtractor):
    """Extract the 'done' table, tagged with the run number."""

    def extract(self, store_path) -> Tuple[pd.DataFrame, dict]:
        """Return ``(done, folder_data)`` with a "number" column added.

        The number is parsed from the name of the store's parent directory,
        expected to look like x####_xxxx (as in x####_xxxx/pystatistics.h5):
        the text before the first underscore, minus its first character, is
        read as an int.
        """
        parent_dir, _ = os.path.split(store_path)
        _, dir_name = os.path.split(parent_dir)
        prefix = dir_name.partition("_")[0]
        number = int(prefix[1:])
        with pd.HDFStore(store_path, 'r') as hdf:
            meta = hdf["folder_data"]
            done = hdf["done"]
            done["number"] = number
        return done, meta

    def __call__(self, *args, **kwargs):
        """Calling the extractor is shorthand for ``extract``."""
        return self.extract(*args, **kwargs)