Gymlib + submodule infra (#58)

wangpatrick57 · web-flow · commit b0e30da37e2c · 2024-12-25T15:33:59.000-08:00
**Summary**: Set up submodules for agents and gymlib. **Demo**: The hello-tune agent ([repo@614687b](https://github.com/wangpatrick57/hello-tune/tree/614687b70ebd85be84b10de63e74e4813a36b30e)) calls `print(gymlib.magic.get_magic_number())` which successfully prints 42. <img width="759" alt="Screenshot 2024-12-25 at 15 17 50" src="https://github.com/user-attachments/assets/0d2bac20-1224-4ff2-93f0-7d8346e5413f" /> **Details**: * Each agent will be a git submodule inside `agents/`. * Each agent will have access to `gymlib/` as a Python library. * Each agent must use its own conda environment. You can use `.python_version` and `requirements.txt` inside the agent to define the env. This can be created automatically using `build_agent_conda_env.sh`. * The base gym will use the `dbgym` conda env. This can be created automatically using `build_dbgym_conda_env.sh`. * The current `tune/` folder is being deprecated and will be removed in a future PR. This is why the E2E tests are turned off in the CI.
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -30,7 +30,9 @@ jobs:
     # We need to do `. "$HOME/.cargo/env"` in each step for it to work.
     - name: Install dependencies
       run: |
-        ./dependencies/install_dependencies.sh
+        pip install -r ./scripts/configs/requirements.txt
+        pip install -e ./gymlib
+        ./scripts/install_sysdeps.sh
 
     - name: Check formatting
       run: |
@@ -57,12 +59,12 @@ jobs:
         export 
         ./scripts/run_integration_tests.sh
 
-    - name: Run end-to-end tests
-      # End-to-end tests are like integration tests in that they require external systems to be running.
-      # Unlike integration tests though, they don't perform detailed checks for any individual module.
-      #
-      # Note that we need to run with a non-root user in order to start Postgres. This is configured in the .yaml
-      # file for our self-hosted GHA runners.
-      run: |
-        . "$HOME/.cargo/env"
-        python -m scripts.run_protox_e2e_test ssd
+    # - name: Run end-to-end tests
+    #   # End-to-end tests are like integration tests in that they require external systems to be running.
+    #   # Unlike integration tests though, they don't perform detailed checks for any individual module.
+    #   #
+    #   # Note that we need to run with a non-root user in order to start Postgres. This is configured in the .yaml
+    #   # file for our self-hosted GHA runners.
+    #   run: |
+    #     . "$HOME/.cargo/env"
+    #     python -m scripts.run_protox_e2e_test ssd
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,6 @@ __pycache__/
 .conda/
 .idea/
 test_clean_scratchspace/
-
 workspace/
-default_*_benchbase_config_*.xml
+default_*_benchbase_config_*.xml
+*.egg-info/
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "agents/hello-tune"]
+	path = agents/hello-tune
+	url = git@github.com:wangpatrick57/hello-tune.git
diff --git a/agents/hello-tune b/agents/hello-tune
@@ -0,0 +1 @@
+Subproject commit 614687b70ebd85be84b10de63e74e4813a36b30e
diff --git a/dependencies/install_dependencies.sh b/dependencies/install_dependencies.sh
diff --git a/gymlib/gymlib/__init__.py b/gymlib/gymlib/__init__.py
@@ -0,0 +1 @@
+from . import magic
diff --git a/gymlib/gymlib/magic.py b/gymlib/gymlib/magic.py
@@ -0,0 +1,2 @@
+def get_magic_number() -> int:
+    return 42
diff --git a/gymlib/pyproject.toml b/gymlib/pyproject.toml
@@ -0,0 +1,12 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "gymlib"
+version = "0.1.0"
+requires-python = ">=3.8"
+dependencies = []
+
+[tool.setuptools]
+packages = ["gymlib"]  # gymlib/ is a package directory; py-modules is only for single-file modules.
diff --git a/scripts/_build_conda_env.sh b/scripts/_build_conda_env.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# This helper script creates a conda environment.
+# You should not run this directly. Instead, use build_agent_conda_env.sh or build_dbgym_conda_env.sh.
+
+set -euo pipefail
+
+# 1. Checks.
+# 1.1. Check that conda is installed.
+if ! command -v conda &> /dev/null; then
+    echo "Error: Conda is not installed"
+    exit 1
+fi
+
+# 1.2. Input validation.
+if [ "$#" -lt 3 ]; then
+    echo "Usage: ./_build_conda_env.sh <env_name> <python_version_path> <requirements_path>"
+    exit 1
+fi
+
+env_name=$1
+python_version_path=$2
+requirements_path=$3
+
+# 1.3. Check that the environment doesn't already exist.
+# awk compares the name as an exact string (no regex surprises from names like
+# "a.b") and reads all of conda's output, so `pipefail` can't see a SIGPIPE.
+if conda info --envs | awk -v name="$env_name" '$1 == name { found = 1 } END { exit !found }'; then
+    echo "Error: Conda environment '$env_name' already exists"
+    exit 1
+fi
+
+# 2. Set up the environment.
+# Note: I am intentionally not using environment.yml. I am instead using
+# requirements.txt and .python_version. This is for two reasons:
+#   1. environment.yml sets the conda env name. However, I want to enforce
+#      that the conda env name is the same as the agent name.
+#   2. requirements.txt can be used by pip and only contains packages and
+#      not any additional conda-specific syntax, making it more modular
+#      and flexible.
+
+# 2.1. Set python_version variable.
+if [ -f "$python_version_path" ]; then
+    python_version=$(cat "$python_version_path")
+else
+    echo "Warning: .python_version not found in $python_version_path. Using default Python 3.10."
+    python_version="3.10"
+fi
+
+# 2.2. Create conda environment with specified Python version.
+echo "Creating conda environment '$env_name' with Python $python_version..."
+eval "$(conda shell.bash hook)"
+conda create -y -n "$env_name" python="$python_version"
+
+# 2.3. Install the packages.
+conda activate "$env_name"
+
+if [ -f "$requirements_path" ]; then
+    echo "Installing pip requirements from $requirements_path..."
+    pip install -r "$requirements_path"
+else
+    echo "Warning: $requirements_path not found. Skipping pip install."
+fi
+
+# We always install gymlib so that the agent has access to it.
+if [ -d "gymlib" ]; then
+    echo "Installing gymlib in editable mode..."
+    pip install -e ./gymlib
+else
+    echo "Error: gymlib directory not found in $(pwd). Please ensure you're running this script from the right folder."
+    exit 1
+fi
+
+conda deactivate
+
+# 2.4. Success message.
+echo "Conda environment '$env_name' created successfully."
+echo "It is not currently activated. To activate it, run 'conda activate $env_name'."
diff --git a/scripts/_run_tests.py b/scripts/_run_tests.py
diff --git a/scripts/build_agent_conda_env.sh b/scripts/build_agent_conda_env.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# This script creates a conda environment for a specific agent.
+# - Name matches the agent name.
+# - Python version from .python_version file in the agent's folder (if exists).
+# - Dependencies from requirements.txt file in the agent's folder (if exists).
+# - gymlib is installed.
+#
+# Using this script is *optional*. If you have a more complex environment setup
+# for your agent, just do that manually.
+#
+# Run it from the dbgym root folder (e.g. `./scripts/build_agent_conda_env.sh <agent_name>`).
+#
+# Before running this script, the user must update the folder of the agent
+# they want to create a conda environment for (e.g. by calling submodule update).
+# There are other things the user must do as well but these are all checked
+# automatically by this script.
+
+set -euo pipefail
+
+# Check $# before touching "$1": with `set -u`, expanding an unset $1 aborts the
+# script with "unbound variable" before the usage message can be printed.
+if [ "$#" -lt 1 ] || [ -z "$1" ]; then
+    echo "Usage: ./build_agent_conda_env.sh <agent_name>"
+    exit 1
+fi
+
+agent_name=$1
+
+if [ ! -d "agents/$agent_name" ]; then
+    echo "Error: Agent folder '$agent_name' does not exist"
+    exit 1
+fi
+
+./scripts/_build_conda_env.sh "$agent_name" "agents/$agent_name/.python_version" "agents/$agent_name/requirements.txt"
diff --git a/scripts/build_dbgym_conda_env.sh b/scripts/build_dbgym_conda_env.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# This script builds the conda environment used by the gym itself (i.e. the orchestrator). Run it from the dbgym root folder, since it invokes ./scripts/_build_conda_env.sh by relative path.
+# This script is optional. You don't need to use conda if you don't want to (the CI doesn't use conda, for instance).
+
+set -euo pipefail
+
+./scripts/_build_conda_env.sh "dbgym" "scripts/configs/.python_version" "scripts/configs/requirements.txt"
diff --git a/scripts/check_format.sh b/scripts/check_format.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 set -euxo pipefail
 
-black . --check
-isort . --profile black -c
+# Ignore agents/ because those are all submodules.
+black . --check --exclude agents
+isort . --profile black -c --skip agents
diff --git a/scripts/configs/.python_version b/scripts/configs/.python_version
@@ -0,0 +1 @@
+3.10.13
diff --git a/scripts/configs/apt_requirements.txt b/scripts/configs/apt_requirements.txt
diff --git a/scripts/configs/e2e_test_dbgym_config.yaml b/scripts/configs/e2e_test_dbgym_config.yaml
diff --git a/scripts/configs/mypy.ini b/scripts/configs/mypy.ini
diff --git a/scripts/configs/requirements.txt b/scripts/configs/requirements.txt
diff --git a/scripts/format.sh b/scripts/format.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-set -euxo pipefail
+set -euo pipefail
 
-black .
-isort . --profile black
+# Ignore agents/ because those are all submodules.
+black . --exclude agents
+isort . --profile black --skip agents
diff --git a/scripts/install_sysdeps.sh b/scripts/install_sysdeps.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# "sysdeps" stands for "system dependencies".
+# These are dependencies unrelated to Python that the dbgym needs.
+cat scripts/configs/apt_requirements.txt | xargs sudo apt-get install -y
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
diff --git a/scripts/mypy.sh b/scripts/mypy.sh
@@ -1,2 +1,3 @@
 #!/bin/bash
-mypy --config-file scripts/mypy.ini .
+# Ignore agents/ because those are all submodules.
+mypy --config-file scripts/configs/mypy.ini . --exclude agents/
diff --git a/scripts/run_integration_tests.sh b/scripts/run_integration_tests.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-python -m scripts.run_tests "integtest_*.py"
+python -m scripts._run_tests "integtest_*.py"
diff --git a/scripts/run_protox_e2e_test.py b/scripts/run_protox_e2e_test.py
@@ -26,7 +26,7 @@
 # Be careful when changing these constants. In some places, the E2E test is hardcoded to work for these specific constants.
 DBMS = "postgres"
 AGENT = "protox"
-E2ETEST_DBGYM_CONFIG_FPATH = Path("scripts/e2e_test_dbgym_config.yaml")
+E2ETEST_DBGYM_CONFIG_FPATH = Path("scripts/configs/e2e_test_dbgym_config.yaml")
 
 
 def get_workspace_dpath(config_fpath: Path) -> Path:
diff --git a/scripts/run_unit_tests.sh b/scripts/run_unit_tests.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-python -m scripts.run_tests "unittest_*.py"
+python -m scripts._run_tests "unittest_*.py"

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+[submodule "agents/hello-tune"]`
	`2`	`+ path = agents/hello-tune`
	`3`	`+ url = git@github.com:wangpatrick57/hello-tune.git`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+def get_magic_number() -> int:`
	`2`	`+ return 42`
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`#!/bin/bash`
`2`		`-python -m scripts.run_tests "integtest_*.py"`
	`2`	`+python -m scripts._run_tests "integtest_*.py"`
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`#!/bin/bash`
`2`		`-python -m scripts.run_tests "unittest_*.py"`
	`2`	`+python -m scripts._run_tests "unittest_*.py"`