2
2
from typing import Optional
3
3
4
4
import click
5
+ from gymlib .symlinks_paths import (
6
+ get_tables_dirname ,
7
+ get_workload_dirname ,
8
+ get_workload_suffix ,
9
+ name_to_linkname ,
10
+ )
5
11
6
12
from benchmark .constants import DEFAULT_SCALE_FACTOR
7
13
from util .log import DBGYM_LOGGER_NAME
8
14
from util .shell import subprocess_run
9
- from util .workspace import (
10
- DBGymWorkspace ,
11
- get_default_tables_dname ,
12
- get_workload_name ,
13
- is_fully_resolved ,
14
- link_result ,
15
- )
15
+ from util .workspace import DBGymWorkspace , fully_resolve_path
16
16
17
17
JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz"
18
18
JOB_QUERIES_URL = "https://event.cwi.nl/da/job/job.tgz"
137
137
@click.group(name="job")
@click.pass_obj
def job_group(dbgym_workspace: DBGymWorkspace) -> None:
    """Click command group for Join Order Benchmark (JOB) tasks.

    The workspace object arrives via the click context (``@click.pass_obj``).
    No per-group setup is needed anymore, so the body is intentionally a no-op.
    """
    pass
141
141
142
142
143
@job_group.command(name="tables")
# We expose this option to keep its interface consistent with other workloads,
# but you should never pass in something other than DEFAULT_SCALE_FACTOR.
@click.argument("scale-factor", type=float)
@click.pass_obj
# The reason generate data is separate from create dbdata is because generate
# data is generic to all DBMSs while create dbdata is specific to a single DBMS.
def job_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None:
    """CLI entry point that downloads the JOB (IMDB) table data."""
    _job_tables(dbgym_workspace, scale_factor)
153
def _job_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None:
    """Download the JOB tables. JOB only supports the default scale factor."""
    assert scale_factor == DEFAULT_SCALE_FACTOR
    _download_job_tables(dbgym_workspace)
152
156
153
157
154
158
@job_group .command (name = "workload" )
@@ -161,18 +165,24 @@ def job_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None:
161
165
@click.pass_obj
def job_workload(
    dbgym_workspace: DBGymWorkspace, query_subset: str, scale_factor: float
) -> None:
    """CLI entry point that generates the JOB workload; delegates to _job_workload."""
    _job_workload(dbgym_workspace, query_subset, scale_factor)
172
def _job_workload(
    dbgym_workspace: DBGymWorkspace, query_subset: str, scale_factor: float
) -> None:
    """Download the JOB queries and generate the workload directory.

    JOB has no real notion of scale factor, so only DEFAULT_SCALE_FACTOR
    is accepted (the parameter exists for interface consistency).
    """
    assert scale_factor == DEFAULT_SCALE_FACTOR
    _download_job_queries(dbgym_workspace)
    _generate_job_workload(dbgym_workspace, query_subset)
168
178
169
179
170
def _download_job_tables(dbgym_workspace: DBGymWorkspace) -> None:
    """Download and untar the IMDB table data used by JOB."""
    _download_and_untar_dir(
        dbgym_workspace,
        JOB_TABLES_URL,
        "imdb.tgz",
        get_tables_dirname("job", DEFAULT_SCALE_FACTOR),
    )
177
187
178
188
def _download_and_untar_dir(
    dbgym_workspace: DBGymWorkspace,
    download_url: str,
    download_tarred_fname: str,
    untarred_dname: str,
    untarred_original_dname: Optional[str] = None,
) -> None:
    # NOTE(review): the signature was reconstructed from the call site
    # (dbgym_workspace, URL, tarball name, untarred dirname) and the free
    # variables used below — confirm the parameter order against the repo.
    """Download a tarball into this run's directory, untar it, and symlink it.

    Skips all work if the expected `.link` symlink already exists. Sometimes
    the tarball untars to an "original" directory name; if so, set
    `untarred_original_dname` to ensure it gets renamed to `untarred_dname`.
    """
    expected_symlink_path = (
        dbgym_workspace.dbgym_cur_symlinks_path / f"{untarred_dname}.link"
    )
    if expected_symlink_path.exists():
        logging.getLogger(DBGYM_LOGGER_NAME).info(
            f"Skipping download: {expected_symlink_path}"
        )
        return

    logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_path}")
    subprocess_run(f"curl -O {download_url}", cwd=dbgym_workspace.dbgym_this_run_path)
    untarred_data_path = dbgym_workspace.dbgym_this_run_path / untarred_dname

    if untarred_original_dname is not None:
        # Untar in place, then rename the tarball's own top-level directory
        # to the name we want.
        assert not untarred_data_path.exists()
        subprocess_run(
            f"tar -zxvf {download_tarred_fname}",
            cwd=dbgym_workspace.dbgym_this_run_path,
        )
        assert (dbgym_workspace.dbgym_this_run_path / untarred_original_dname).exists()
        subprocess_run(
            f"mv {untarred_original_dname} {untarred_dname}",
            cwd=dbgym_workspace.dbgym_this_run_path,
        )
    else:
        # The tarball has no top-level directory: make one and untar into it.
        untarred_data_path.mkdir(parents=True, exist_ok=False)
        subprocess_run(f"tar -zxvf ../{download_tarred_fname}", cwd=untarred_data_path)

    assert untarred_data_path.exists()
    subprocess_run(
        f"rm {download_tarred_fname}", cwd=dbgym_workspace.dbgym_this_run_path
    )
    symlink_path = dbgym_workspace.link_result(untarred_data_path)
    assert expected_symlink_path.samefile(symlink_path)
    logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloaded: {expected_symlink_path}")
232
247
233
248
234
249
def _generate_job_workload (
235
250
dbgym_workspace : DBGymWorkspace ,
236
251
query_subset : str ,
237
252
) -> None :
238
- workload_name = get_workload_name (DEFAULT_SCALE_FACTOR , query_subset )
239
- expected_workload_symlink_dpath = dbgym_workspace .cur_symlinks_data_path (
240
- mkdir = True
241
- ) / (workload_name + ".link" )
253
+ workload_name = get_workload_dirname (
254
+ "job" ,
255
+ DEFAULT_SCALE_FACTOR ,
256
+ get_workload_suffix ("job" , query_subset = query_subset ),
257
+ )
258
+ expected_workload_symlink_path = dbgym_workspace .dbgym_cur_symlinks_path / (
259
+ name_to_linkname (workload_name )
260
+ )
261
+ if expected_workload_symlink_path .exists ():
262
+ logging .getLogger (DBGYM_LOGGER_NAME ).info (
263
+ f"Skipping generation: { expected_workload_symlink_path } "
264
+ )
265
+ return
242
266
243
267
logging .getLogger (DBGYM_LOGGER_NAME ).info (
244
- f"Generating: { expected_workload_symlink_dpath } "
268
+ f"Generating: { expected_workload_symlink_path } "
245
269
)
246
- real_dpath = dbgym_workspace .cur_task_runs_data_path (workload_name , mkdir = True )
270
+ workload_path = dbgym_workspace .dbgym_this_run_path / workload_name
271
+ workload_path .mkdir (parents = False , exist_ok = False )
247
272
248
273
query_names = None
249
274
if query_subset == "all" :
@@ -255,19 +280,17 @@ def _generate_job_workload(
255
280
else :
256
281
assert False
257
282
258
- with open (real_dpath / "order.txt" , "w" ) as f :
283
+ with open (workload_path / "order.txt" , "w" ) as f :
284
+ queries_parent_path = dbgym_workspace .dbgym_cur_symlinks_path / (
285
+ name_to_linkname (JOB_QUERIES_DNAME )
286
+ )
287
+
259
288
for qname in query_names :
260
- sql_fpath = (
261
- dbgym_workspace .cur_symlinks_data_path (mkdir = True )
262
- / (f"{ JOB_QUERIES_DNAME } .link" )
263
- ).resolve () / f"{ qname } .sql"
264
- assert is_fully_resolved (
265
- sql_fpath
266
- ), "We should only write existent real absolute paths to a file"
267
- f .write (f"Q{ qname } ,{ sql_fpath } \n " )
289
+ sql_path = fully_resolve_path (queries_parent_path / f"{ qname } .sql" )
290
+ f .write (f"Q{ qname } ,{ sql_path } \n " )
268
291
269
- workload_symlink_dpath = link_result (dbgym_workspace , real_dpath )
270
- assert workload_symlink_dpath == expected_workload_symlink_dpath
292
+ workload_symlink_path = dbgym_workspace . link_result (workload_path )
293
+ assert workload_symlink_path == expected_workload_symlink_path
271
294
logging .getLogger (DBGYM_LOGGER_NAME ).info (
272
- f"Generated: { expected_workload_symlink_dpath } "
295
+ f"Generated: { expected_workload_symlink_path } "
273
296
)
0 commit comments