Workload (#53)

wangpatrick57 · web-flow · commit 841089133315 · 2024-12-23T15:36:38.000-08:00
**Summary**: basic `Workload` class that lets you read a workload directory. **Demo**: Added tests that pass. <img width="626" alt="Screenshot 2024-12-23 at 14 59 15" src="https://github.com/user-attachments/assets/96bfd927-0842-4e37-a3c7-e9036bc36486" />
diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py
@@ -4,6 +4,7 @@
 import click
 
 from benchmark.constants import DEFAULT_SCALE_FACTOR
+from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES
 from util.log import DBGYM_LOGGER_NAME
 from util.shell import subprocess_run
 from util.workspace import (
@@ -15,8 +16,6 @@
     link_result,
 )
 
-NUM_TPCH_QUERIES = 22
-
 
 @click.group(name="tpch")
 @click.pass_obj
@@ -38,13 +37,13 @@ def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None:
 @click.option(
     "--seed-start",
     type=int,
-    default=15721,
+    default=DEFAULT_TPCH_SEED,
     help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).",
 )
 @click.option(
     "--seed-end",
     type=int,
-    default=15721,
+    default=DEFAULT_TPCH_SEED,
     help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).",
 )
 @click.option(
diff --git a/benchmark/tpch/constants.py b/benchmark/tpch/constants.py
@@ -0,0 +1,2 @@
+DEFAULT_TPCH_SEED = 15721
+NUM_TPCH_QUERIES = 22
diff --git a/env/integtest_pg_conn.py b/env/integtest_pg_conn.py
@@ -3,7 +3,11 @@
 
 import psycopg
 
-from env.integtest_util import IntegtestWorkspace
+from env.integtest_util import (
+    INTEGTEST_BENCHMARK,
+    INTEGTEST_SCALE_FACTOR,
+    IntegtestWorkspace,
+)
 from env.pg_conn import PostgresConn
 from util.pg import (
     DEFAULT_POSTGRES_PORT,
@@ -12,19 +16,13 @@
 )
 from util.workspace import (
     DEFAULT_BOOT_CONFIG_FPATH,
-    DBGymConfig,
     default_dbdata_parent_dpath,
     default_pgbin_path,
     default_pristine_dbdata_snapshot_path,
 )
 
-BENCHMARK = "tpch"
-SCALE_FACTOR = 0.01
-
 
 class PostgresConnTests(unittest.TestCase):
-    dbgym_cfg: DBGymConfig
-
     @staticmethod
     def setUpClass() -> None:
         IntegtestWorkspace.set_up_workspace()
@@ -36,16 +34,14 @@ def setUp(self) -> None:
             + "to ensure this. Be careful about accidentally taking down other people's Postgres instances though.",
         )
         self.pristine_dbdata_snapshot_path = default_pristine_dbdata_snapshot_path(
-            IntegtestWorkspace.get_dbgym_cfg().dbgym_workspace_path,
-            BENCHMARK,
-            SCALE_FACTOR,
+            IntegtestWorkspace.get_workspace_path(),
+            INTEGTEST_BENCHMARK,
+            INTEGTEST_SCALE_FACTOR,
         )
         self.dbdata_parent_dpath = default_dbdata_parent_dpath(
-            IntegtestWorkspace.get_dbgym_cfg().dbgym_workspace_path
-        )
-        self.pgbin_dpath = default_pgbin_path(
-            IntegtestWorkspace.get_dbgym_cfg().dbgym_workspace_path
+            IntegtestWorkspace.get_workspace_path()
         )
+        self.pgbin_dpath = default_pgbin_path(IntegtestWorkspace.get_workspace_path())
 
         # The reason we restart Postgres every time is to ensure a "clean" starting point
         # so that all tests are independent of each other.
diff --git a/env/integtest_util.py b/env/integtest_util.py
@@ -6,6 +6,11 @@
 
 from util.workspace import DBGymConfig
 
+# These are the values used by set_up_env_integtests.sh.
+# TODO: make set_up_env_integtests.sh take in these values directly as envvars.
+INTEGTEST_BENCHMARK = "tpch"
+INTEGTEST_SCALE_FACTOR = 0.01
+
 
 class IntegtestWorkspace:
     """
@@ -40,4 +45,3 @@ def get_dbgym_cfg() -> DBGymConfig:
     def get_workspace_path() -> Path:
         with open(IntegtestWorkspace.ENV_INTEGTESTS_DBGYM_CONFIG_FPATH) as f:
             return Path(yaml.safe_load(f)["dbgym_workspace_path"])
-        assert False
diff --git a/env/integtest_workload.py b/env/integtest_workload.py
@@ -0,0 +1,55 @@
+import unittest
+
+from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES
+from env.integtest_util import (
+    INTEGTEST_BENCHMARK,
+    INTEGTEST_SCALE_FACTOR,
+    IntegtestWorkspace,
+)
+from env.workload import Workload
+from util.workspace import (
+    default_workload_path,
+    fully_resolve_path,
+    get_default_workload_name_suffix,
+    get_workload_name,
+)
+
+
+class WorkloadTests(unittest.TestCase):
+    """Integration tests that read the default integtest workload through `Workload`."""
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        # unittest documents setUpClass() as a classmethod that receives the test class.
+        IntegtestWorkspace.set_up_workspace()
+
+    def test_workload(self) -> None:
+        """Load the default workload and check its query order and query contents."""
+        workload_dpath = fully_resolve_path(
+            IntegtestWorkspace.get_dbgym_cfg(),
+            default_workload_path(
+                IntegtestWorkspace.get_workspace_path(),
+                INTEGTEST_BENCHMARK,
+                get_workload_name(
+                    INTEGTEST_SCALE_FACTOR,
+                    get_default_workload_name_suffix(INTEGTEST_BENCHMARK),
+                ),
+            ),
+        )
+
+        workload = Workload(IntegtestWorkspace.get_dbgym_cfg(), workload_dpath)
+
+        # Check the order of query IDs: the test expects one ID per TPC-H query, all using
+        # the default seed, in ascending query number.
+        self.assertEqual(
+            workload.get_query_order(),
+            [f"S{DEFAULT_TPCH_SEED}-Q{i}" for i in range(1, NUM_TPCH_QUERIES + 1)],
+        )
+
+        # Sanity check all queries. assertIn reports the offending query text on failure.
+        for query in workload.get_queries_in_order():
+            self.assertIn("select", query.lower())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/env/set_up_env_integtests.sh b/env/set_up_env_integtests.sh
@@ -16,5 +16,7 @@ export DBGYM_CONFIG_PATH=env/env_integtests_dbgym_config.yaml # Note that this e
 WORKSPACE_PATH=$(grep 'dbgym_workspace_path:' $DBGYM_CONFIG_PATH | sed 's/dbgym_workspace_path: //')
 
 python3 task.py benchmark $BENCHMARK data $SCALE_FACTOR
+python3 task.py benchmark $BENCHMARK workload --scale-factor $SCALE_FACTOR
+
 python3 task.py dbms postgres build
 python3 task.py dbms postgres dbdata $BENCHMARK --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE
diff --git a/env/workload.py b/env/workload.py
@@ -0,0 +1,60 @@
+from pathlib import Path
+
+from util.workspace import DBGymConfig, is_fully_resolved, open_and_save
+
+
+class Workload:
+    """
+    An in-memory view of a workload directory.
+
+    The directory must contain an `order.txt` file in which each non-blank line has the form
+    `<query ID>,<fully resolved path to the query's file>`. The text of every listed query is
+    read when the object is constructed.
+    """
+
+    def __init__(self, dbgym_cfg: DBGymConfig, workload_dpath: Path) -> None:
+        """
+        Read `order.txt` in `workload_dpath` along with every query file it lists.
+
+        All files are opened through `open_and_save()`. Asserts that `workload_dpath` and every
+        query path are fully resolved and that `order.txt` exists.
+        """
+        self.dbgym_cfg = dbgym_cfg
+        self.workload_dpath = workload_dpath
+        assert is_fully_resolved(self.workload_dpath)
+
+        # Maps each query ID to the text of its query file.
+        self.queries: dict[str, str] = {}
+        # Query IDs in the order in which they appear in order.txt.
+        self.query_order: list[str] = []
+
+        order_fpath = self.workload_dpath / "order.txt"
+        assert order_fpath.exists()
+
+        with open_and_save(self.dbgym_cfg, order_fpath) as f:
+            for line in f:
+                entry = line.strip()
+                # Tolerate blank lines (e.g. extra newlines at the end of the file).
+                if not entry:
+                    continue
+
+                # Only split on the first comma so that paths containing commas stay intact.
+                qid, qpath_str = entry.split(",", 1)
+                qpath = Path(qpath_str)
+                assert is_fully_resolved(qpath)
+
+                with open_and_save(self.dbgym_cfg, qpath) as qf:
+                    self.queries[qid] = qf.read()
+                self.query_order.append(qid)
+
+    def get_query(self, qid: str) -> str:
+        """Return the text of the query with ID `qid`. Raises KeyError for an unknown ID."""
+        return self.queries[qid]
+
+    def get_query_order(self) -> list[str]:
+        """Return the query IDs in order.txt order (the internal list itself, not a copy)."""
+        return self.query_order
+
+    def get_queries_in_order(self) -> list[str]:
+        """Return a new list with the text of every query, ordered as in order.txt."""
+        return [self.queries[qid] for qid in self.query_order]
diff --git a/scripts/run_protox_e2e_test.py b/scripts/run_protox_e2e_test.py
@@ -8,6 +8,7 @@
 import yaml
 
 from benchmark.constants import DEFAULT_SCALE_FACTOR
+from benchmark.tpch.constants import DEFAULT_TPCH_SEED
 from util.pg import get_is_postgres_running
 from util.workspace import (
     default_embedder_path,
@@ -72,7 +73,7 @@ def run_e2e_for_benchmark(benchmark_name: str, intended_dbdata_hardware: str) ->
     if benchmark_name == "tpch":
         scale_factor = 0.01
         query_subset = "all"
-        workload_name_suffix = f"15721_15721_{query_subset}"
+        workload_name_suffix = f"{DEFAULT_TPCH_SEED}_{DEFAULT_TPCH_SEED}_{query_subset}"
         embedding_datagen_args = "--override-sample-limits lineitem,32768"
         embedding_train_args = "--iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2"
         tune_hpo_args = "--num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01"
diff --git a/util/workspace.py b/util/workspace.py
@@ -15,6 +15,7 @@
 import redis
 import yaml
 
+from benchmark.tpch.constants import DEFAULT_TPCH_SEED
 from util.log import DBGYM_LOGGER_NAME
 from util.shell import subprocess_run
 
@@ -96,7 +97,7 @@ def get_workload_name(scale_factor: float | str, suffix: str) -> str:
 
 def get_default_workload_name_suffix(benchmark_name: str) -> str:
     if benchmark_name == "tpch":
-        return "15721_15721_all"
+        return f"{DEFAULT_TPCH_SEED}_{DEFAULT_TPCH_SEED}_all"
     if benchmark_name == "job":
         return "all"
     else:

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+DEFAULT_TPCH_SEED = 15721`
	`2`	`+NUM_TPCH_QUERIES = 22`