Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Standardized logic around fully resolved paths #52

Merged
merged 9 commits into from
Dec 23, 2024
7 changes: 3 additions & 4 deletions benchmark/job/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
DBGymConfig,
default_tables_dname,
get_workload_name,
is_fully_resolved,
link_result,
)

Expand Down Expand Up @@ -260,10 +261,8 @@ def _generate_job_workload(
dbgym_cfg.cur_symlinks_data_path(mkdir=True)
/ (f"{JOB_QUERIES_DNAME}.link")
).resolve() / f"{qname}.sql"
assert (
sql_fpath.exists()
and not sql_fpath.is_symlink()
and sql_fpath.is_absolute()
assert is_fully_resolved(
sql_fpath
), "We should only write existent real absolute paths to a file"
f.write(f"Q{qname},{sql_fpath}\n")

Expand Down
8 changes: 3 additions & 5 deletions benchmark/job/load_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from benchmark.constants import DEFAULT_SCALE_FACTOR
from dbms.load_info_base_class import LoadInfoBaseClass
from util.workspace import DBGymConfig, default_tables_dname
from util.workspace import DBGymConfig, default_tables_dname, is_fully_resolved

JOB_SCHEMA_FNAME = "job_schema.sql"

Expand Down Expand Up @@ -55,10 +55,8 @@ def __init__(self, dbgym_cfg: DBGymConfig):
data_root_dpath / f"{default_tables_dname(DEFAULT_SCALE_FACTOR)}.link"
)
tables_dpath = tables_symlink_dpath.resolve()
assert (
tables_dpath.exists()
and tables_dpath.is_absolute()
and not tables_dpath.is_symlink()
assert is_fully_resolved(
tables_dpath
), f"tables_dpath ({tables_dpath}) should be an existent real absolute path. Make sure you have generated the TPC-H data"
self._tables_and_fpaths = []
for table in JobLoadInfo.TABLES:
Expand Down
13 changes: 4 additions & 9 deletions benchmark/tpch/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
default_tables_dname,
get_scale_factor_string,
get_workload_name,
is_fully_resolved,
link_result,
)

Expand Down Expand Up @@ -94,11 +95,7 @@ def _clone_tpch_kit(dbgym_cfg: DBGymConfig) -> None:

def _get_tpch_kit_dpath(dbgym_cfg: DBGymConfig) -> Path:
tpch_kit_dpath = (dbgym_cfg.cur_symlinks_build_path() / "tpch-kit.link").resolve()
assert (
tpch_kit_dpath.exists()
and tpch_kit_dpath.is_absolute()
and not tpch_kit_dpath.is_symlink()
)
assert is_fully_resolved(tpch_kit_dpath)
return tpch_kit_dpath


Expand Down Expand Up @@ -197,10 +194,8 @@ def _generate_tpch_workload(
symlink_data_dpath
/ (_get_queries_dname(seed, scale_factor) + ".link")
).resolve() / f"{qname}.sql"
assert (
sql_fpath.exists()
and not sql_fpath.is_symlink()
and sql_fpath.is_absolute()
assert is_fully_resolved(
sql_fpath
), "We should only write existent real absolute paths to a file"
f.write(f"S{seed}-Q{qname},{sql_fpath}\n")
# TODO(WAN): add option to deep-copy the workload.
Expand Down
8 changes: 3 additions & 5 deletions benchmark/tpch/load_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import Optional

from dbms.load_info_base_class import LoadInfoBaseClass
from util.workspace import DBGymConfig, default_tables_dname
from util.workspace import DBGymConfig, default_tables_dname, is_fully_resolved

TPCH_SCHEMA_FNAME = "tpch_schema.sql"
TPCH_CONSTRAINTS_FNAME = "tpch_constraints.sql"
Expand Down Expand Up @@ -48,10 +48,8 @@ def __init__(self, dbgym_cfg: DBGymConfig, scale_factor: float):
data_root_dpath / f"{default_tables_dname(scale_factor)}.link"
)
tables_dpath = tables_symlink_dpath.resolve()
assert (
tables_dpath.exists()
and tables_dpath.is_absolute()
and not tables_dpath.is_symlink()
assert is_fully_resolved(
tables_dpath
), f"tables_dpath ({tables_dpath}) should be an existent real absolute path. Make sure you have generated the TPC-H data"
self._tables_and_fpaths = []
for table in TpchLoadInfo.TABLES:
Expand Down
22 changes: 10 additions & 12 deletions dbms/postgres/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,11 @@
from util.workspace import (
WORKSPACE_PATH_PLACEHOLDER,
DBGymConfig,
conv_inputpath_to_realabspath,
default_dbdata_parent_dpath,
default_pgbin_path,
fully_resolve_path,
get_dbdata_tgz_name,
is_fully_resolved,
is_ssd,
link_result,
open_and_save,
Expand Down Expand Up @@ -107,9 +108,9 @@ def postgres_dbdata(
dbgym_cfg.dbgym_workspace_path
)

# Convert all input paths to absolute paths
pgbin_path = conv_inputpath_to_realabspath(dbgym_cfg, pgbin_path)
dbdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, dbdata_parent_dpath)
# Fully resolve all input paths.
pgbin_path = fully_resolve_path(dbgym_cfg, pgbin_path)
dbdata_parent_dpath = fully_resolve_path(dbgym_cfg, dbdata_parent_dpath)

# Check assertions on args
if intended_dbdata_hardware == "hdd":
Expand Down Expand Up @@ -316,26 +317,23 @@ def _start_or_stop_postgres(
dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path, is_start: bool
) -> None:
# They should be absolute paths and should exist
assert pgbin_path.is_absolute() and pgbin_path.exists()
assert dbdata_dpath.is_absolute() and dbdata_dpath.exists()
# The inputs may be symlinks so we need to resolve them first
pgbin_real_dpath = pgbin_path.resolve()
dbdata_dpath = dbdata_dpath.resolve()
assert is_fully_resolved(pgbin_path)
assert is_fully_resolved(dbdata_dpath)
pgport = DEFAULT_POSTGRES_PORT
save_file(dbgym_cfg, pgbin_real_dpath / "pg_ctl")
save_file(dbgym_cfg, pgbin_path / "pg_ctl")

if is_start:
# We use subprocess.run() because subprocess_run() never returns when running "pg_ctl start".
# The reason subprocess_run() never returns is because pg_ctl spawns a postgres process so .poll() always returns None.
# On the other hand, subprocess.run() does return normally, like calling `./pg_ctl` on the command line would do.
result = subprocess.run(
f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' start",
cwd=pgbin_real_dpath,
cwd=pgbin_path,
shell=True,
)
result.check_returncode()
else:
subprocess_run(
f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' stop",
cwd=pgbin_real_dpath,
cwd=pgbin_path,
)
29 changes: 22 additions & 7 deletions env/integtest_tuning_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,14 @@
from typing import Any, Optional

from env.integtest_util import IntegtestWorkspace
from env.tuning_agent import DBMSConfigDelta, TuningAgent
from env.tuning_agent import (
DBMSConfigDelta,
IndexesDelta,
QueryKnobsDelta,
SysKnobsDelta,
TuningAgent,
TuningAgentStepReader,
)


class MockTuningAgent(TuningAgent):
Expand All @@ -25,7 +32,11 @@ def setUpClass() -> None:

@staticmethod
def make_config(letter: str) -> DBMSConfigDelta:
return DBMSConfigDelta([letter], {letter: letter}, {letter: [letter]})
return DBMSConfigDelta(
indexes=IndexesDelta([letter]),
sysknobs=SysKnobsDelta({letter: letter}),
qknobs=QueryKnobsDelta({letter: [letter]}),
)

def test_get_step_delta(self) -> None:
agent = MockTuningAgent(IntegtestWorkspace.get_dbgym_cfg())
Expand All @@ -37,10 +48,12 @@ def test_get_step_delta(self) -> None:
agent.config_to_return = PostgresConnTests.make_config("c")
agent.step()

self.assertEqual(agent.get_step_delta(1), PostgresConnTests.make_config("b"))
self.assertEqual(agent.get_step_delta(0), PostgresConnTests.make_config("a"))
self.assertEqual(agent.get_step_delta(1), PostgresConnTests.make_config("b"))
self.assertEqual(agent.get_step_delta(2), PostgresConnTests.make_config("c"))
reader = TuningAgentStepReader(agent.dbms_cfg_deltas_dpath)

self.assertEqual(reader.get_step_delta(1), PostgresConnTests.make_config("b"))
self.assertEqual(reader.get_step_delta(0), PostgresConnTests.make_config("a"))
self.assertEqual(reader.get_step_delta(1), PostgresConnTests.make_config("b"))
self.assertEqual(reader.get_step_delta(2), PostgresConnTests.make_config("c"))

def test_get_all_deltas(self) -> None:
agent = MockTuningAgent(IntegtestWorkspace.get_dbgym_cfg())
Expand All @@ -52,8 +65,10 @@ def test_get_all_deltas(self) -> None:
agent.config_to_return = PostgresConnTests.make_config("c")
agent.step()

reader = TuningAgentStepReader(agent.dbms_cfg_deltas_dpath)

self.assertEqual(
agent.get_all_deltas(),
reader.get_all_deltas(),
[
PostgresConnTests.make_config("a"),
PostgresConnTests.make_config("b"),
Expand Down
Empty file added env/replay.py
Empty file.
56 changes: 39 additions & 17 deletions env/tuning_agent.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
import json
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import NewType, TypedDict

from util.workspace import DBGymConfig
from util.workspace import DBGymConfig, is_fully_resolved

# PostgresConn doesn't use these types because PostgresConn is used internally by tuning agents.
# These types are only given as the outputs of tuning agents.
IndexesDelta = NewType("IndexesDelta", list[str])
SysKnobsDelta = NewType("SysKnobsDelta", dict[str, str])
QueryKnobsDelta = NewType("QueryKnobsDelta", dict[str, list[str]])

@dataclass
class DBMSConfigDelta:

class DBMSConfigDelta(TypedDict):
"""
This class represents a DBMS config delta. A "DBMS config" is the indexes, system knobs,
and query knobs set by the tuning agent. A "delta" is the change from the prior config.
Expand All @@ -21,9 +26,13 @@ class DBMSConfigDelta:
because knobs can be settings ("SET (enable_sort on)") or flags ("IndexOnlyScan(it)").
"""

indexes: list[str]
sysknobs: dict[str, str]
qknobs: dict[str, list[str]]
indexes: IndexesDelta
sysknobs: SysKnobsDelta
qknobs: QueryKnobsDelta


def get_step_delta_fpath(dbms_cfg_deltas_dpath: Path, step_num: int) -> Path:
return dbms_cfg_deltas_dpath / f"step{step_num}_delta.json"


class TuningAgent:
Expand All @@ -41,11 +50,10 @@ def step(self) -> None:
curr_step_num = self.next_step_num
self.next_step_num += 1
dbms_cfg_delta = self._step()
with self.get_step_delta_fpath(curr_step_num).open("w") as f:
json.dump(asdict(dbms_cfg_delta), f)

def get_step_delta_fpath(self, step_num: int) -> Path:
return self.dbms_cfg_deltas_dpath / f"step{step_num}_delta.json"
with get_step_delta_fpath(self.dbms_cfg_deltas_dpath, curr_step_num).open(
"w"
) as f:
json.dump(dbms_cfg_delta, f)

# Subclasses should override this function.
def _step(self) -> DBMSConfigDelta:
Expand All @@ -56,11 +64,25 @@ def _step(self) -> DBMSConfigDelta:
"""
raise NotImplementedError


class TuningAgentStepReader:
def __init__(self, dbms_cfg_deltas_dpath: Path) -> None:
self.dbms_cfg_deltas_dpath = dbms_cfg_deltas_dpath
assert is_fully_resolved(self.dbms_cfg_deltas_dpath)
num_steps = 0
while get_step_delta_fpath(self.dbms_cfg_deltas_dpath, num_steps).exists():
num_steps += 1
self.num_steps = num_steps

def get_step_delta(self, step_num: int) -> DBMSConfigDelta:
assert step_num >= 0 and step_num < self.next_step_num
with self.get_step_delta_fpath(step_num).open("r") as f:
return DBMSConfigDelta(**json.load(f))
assert False
assert step_num >= 0 and step_num < self.num_steps
with get_step_delta_fpath(self.dbms_cfg_deltas_dpath, step_num).open("r") as f:
data = json.load(f)
return DBMSConfigDelta(
indexes=data["indexes"],
sysknobs=data["sysknobs"],
qknobs=data["qknobs"],
)

def get_all_deltas(self) -> list[DBMSConfigDelta]:
return [self.get_step_delta(step_num) for step_num in range(self.next_step_num)]
return [self.get_step_delta(step_num) for step_num in range(self.num_steps)]
26 changes: 11 additions & 15 deletions tune/protox/agent/hpo.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
WORKSPACE_PATH_PLACEHOLDER,
DBGymConfig,
TuningMode,
conv_inputpath_to_realabspath,
default_benchbase_config_path,
default_benchmark_config_path,
default_dbdata_parent_dpath,
Expand All @@ -44,6 +43,7 @@
default_pgbin_path,
default_pristine_dbdata_snapshot_path,
default_workload_path,
fully_resolve_path,
get_default_workload_name_suffix,
get_workload_name,
is_ssd,
Expand Down Expand Up @@ -294,22 +294,18 @@ def hpo(
if seed is None:
seed = random.randint(0, int(1e8))

# Convert all input paths to absolute paths
embedder_path = conv_inputpath_to_realabspath(dbgym_cfg, embedder_path)
benchmark_config_path = conv_inputpath_to_realabspath(
dbgym_cfg, benchmark_config_path
)
benchbase_config_path = conv_inputpath_to_realabspath(
dbgym_cfg, benchbase_config_path
)
sysknobs_path = conv_inputpath_to_realabspath(dbgym_cfg, sysknobs_path)
pristine_dbdata_snapshot_path = conv_inputpath_to_realabspath(
# Fully resolve all input paths.
embedder_path = fully_resolve_path(dbgym_cfg, embedder_path)
benchmark_config_path = fully_resolve_path(dbgym_cfg, benchmark_config_path)
benchbase_config_path = fully_resolve_path(dbgym_cfg, benchbase_config_path)
sysknobs_path = fully_resolve_path(dbgym_cfg, sysknobs_path)
pristine_dbdata_snapshot_path = fully_resolve_path(
dbgym_cfg, pristine_dbdata_snapshot_path
)
dbdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, dbdata_parent_dpath)
pgbin_path = conv_inputpath_to_realabspath(dbgym_cfg, pgbin_path)
workload_path = conv_inputpath_to_realabspath(dbgym_cfg, workload_path)
boot_config_fpath_during_hpo = conv_inputpath_to_realabspath(
dbdata_parent_dpath = fully_resolve_path(dbgym_cfg, dbdata_parent_dpath)
pgbin_path = fully_resolve_path(dbgym_cfg, pgbin_path)
workload_path = fully_resolve_path(dbgym_cfg, workload_path)
boot_config_fpath_during_hpo = fully_resolve_path(
dbgym_cfg, boot_config_fpath_during_hpo
)

Expand Down
6 changes: 3 additions & 3 deletions tune/protox/agent/replay.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@
from util.workspace import (
DBGymConfig,
TuningMode,
conv_inputpath_to_realabspath,
default_replay_data_fname,
default_tuning_steps_dpath,
fully_resolve_path,
get_default_workload_name_suffix,
get_workload_name,
link_result,
Expand Down Expand Up @@ -151,8 +151,8 @@ def replay(
boot_enabled_during_tune,
)

# Convert all input paths to absolute paths
tuning_steps_dpath = conv_inputpath_to_realabspath(dbgym_cfg, tuning_steps_dpath)
# Fully resolve all input paths.
tuning_steps_dpath = fully_resolve_path(dbgym_cfg, tuning_steps_dpath)

# Group args together to reduce the # of parameters we pass into functions
replay_args = ReplayArgs(
Expand Down
10 changes: 4 additions & 6 deletions tune/protox/agent/tune.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
WORKSPACE_PATH_PLACEHOLDER,
DBGymConfig,
TuningMode,
conv_inputpath_to_realabspath,
default_hpoed_agent_params_path,
default_tuning_steps_dname,
fully_resolve_path,
get_default_workload_name_suffix,
get_workload_name,
link_result,
Expand Down Expand Up @@ -86,11 +86,9 @@ def tune(
dbgym_cfg.dbgym_workspace_path, benchmark_name, workload_name
)

# Convert all input paths to absolute paths
hpoed_agent_params_path = conv_inputpath_to_realabspath(
dbgym_cfg, hpoed_agent_params_path
)
boot_config_fpath_during_tune = conv_inputpath_to_realabspath(
# Fully resolve all input paths.
hpoed_agent_params_path = fully_resolve_path(dbgym_cfg, hpoed_agent_params_path)
boot_config_fpath_during_tune = fully_resolve_path(
dbgym_cfg, boot_config_fpath_during_tune
)

Expand Down
Loading
Loading