From ed21a60d86ec703a268ff964ff0689aabf339bc4 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 28 Dec 2024 19:50:20 -0500 Subject: [PATCH 01/24] added (failing) test for test_tpch_data --- benchmark/tests/__init__.py | 0 .../benchmark_integtest_dbgym_config.yaml | 1 + benchmark/tests/integtest_benchmark.py | 24 +++++++++++++++++++ benchmark/tpch/cli.py | 9 ++++++- env/tests/gymlib_integtest_util.py | 4 ++++ 5 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 benchmark/tests/__init__.py create mode 100644 benchmark/tests/benchmark_integtest_dbgym_config.yaml create mode 100644 benchmark/tests/integtest_benchmark.py diff --git a/benchmark/tests/__init__.py b/benchmark/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/benchmark/tests/benchmark_integtest_dbgym_config.yaml b/benchmark/tests/benchmark_integtest_dbgym_config.yaml new file mode 100644 index 00000000..8a1b1cca --- /dev/null +++ b/benchmark/tests/benchmark_integtest_dbgym_config.yaml @@ -0,0 +1 @@ +dbgym_workspace_path: ../dbgym_benchmark_integtest_workspace/ diff --git a/benchmark/tests/integtest_benchmark.py b/benchmark/tests/integtest_benchmark.py new file mode 100644 index 00000000..c3fef054 --- /dev/null +++ b/benchmark/tests/integtest_benchmark.py @@ -0,0 +1,24 @@ +import unittest +from pathlib import Path + +# It's ok to import private functions from the benchmark module because this is an integration test. +from benchmark.tpch.cli import _tpch_data +from util.workspace import DBGymWorkspace, get_default_tables_path, get_workspace_path_from_config + + +class TestBenchmark(unittest.TestCase): + DBGYM_CONFIG_PATH = Path("benchmark/tests/benchmark_integtest_dbgym_config.yaml") + + def setUp(self) -> None: + self.workspace = DBGymWorkspace(get_workspace_path_from_config(TestBenchmark.DBGYM_CONFIG_PATH)) + + def test_tpch_data(self) -> None: + scale_factor = 0.01 + tables_path = get_default_tables_path(self.workspace.dbgym_workspace_path, "tpch", scale_factor) + self.assertFalse(tables_path.exists()) + _tpch_data(self.workspace, scale_factor) + self.assertTrue(tables_path.exists()) + + +if __name__ == "__main__": + unittest.main() diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index eea9dc5c..5ecac290 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -29,6 +29,13 @@ def tpch_group(dbgym_workspace: DBGymWorkspace) -> None: # The reason generate data is separate from create dbdata is because generate-data is generic # to all DBMSs while create dbdata is specific to a single DBMS. def tpch_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: + _tpch_data(dbgym_workspace, scale_factor) + + +def _tpch_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: + """ + This function exists as a hook for integration tests. 
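+    The Click-decorated tpch_data() command isn't callable as a plain Python
+    function, so tests import and invoke this undecorated helper directly.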
+ """ _clone_tpch_kit(dbgym_workspace) _generate_data(dbgym_workspace, scale_factor) @@ -87,7 +94,7 @@ def _clone_tpch_kit(dbgym_workspace: DBGymWorkspace) -> None: logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloning: {expected_symlink_dpath}") real_build_path = dbgym_workspace.cur_task_runs_build_path() subprocess_run( - f"./clone_tpch_kit.sh {real_build_path}", cwd=dbgym_workspace.cur_source_path() + f"./clone_tpch_kit.sh {real_build_path}", cwd=dbgym_workspace.base_dbgym_repo_dpath / "benchmark" / "tpch" ) symlink_dpath = link_result(dbgym_workspace, real_build_path / "tpch-kit") assert expected_symlink_dpath.samefile(symlink_dpath) diff --git a/env/tests/gymlib_integtest_util.py b/env/tests/gymlib_integtest_util.py index 86b3b829..597d6e73 100644 --- a/env/tests/gymlib_integtest_util.py +++ b/env/tests/gymlib_integtest_util.py @@ -37,6 +37,10 @@ class GymlibIntegtestManager: @staticmethod def set_up_workspace() -> None: + """ + Set up the workspace if it has not already been set up. + None of the integtest_*.py files will delete the workspace so that future tests run faster. + """ workspace_path = get_workspace_path_from_config( GymlibIntegtestManager.DBGYM_CONFIG_PATH ) From 91a6db39c69d65e2c79b68c5b87d9a46da9941f7 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 29 Dec 2024 15:14:16 -0500 Subject: [PATCH 02/24] wrote and passed test_tpch_data --- benchmark/job/cli.py | 4 +- benchmark/job/load_info.py | 39 ++++++-------- benchmark/tests/integtest_benchmark.py | 17 +++++-- benchmark/tpch/cli.py | 68 ++++++++++++------------- benchmark/tpch/load_info.py | 47 +++++++---------- dbms/load_info_base_class.py | 2 +- dbms/postgres/cli.py | 8 +-- gymlib_package/gymlib/__init__.py | 2 +- gymlib_package/gymlib/symlinks_paths.py | 34 +++++++++++++ util/workspace.py | 65 ++++++++++------------- 10 files changed, 152 insertions(+), 134 deletions(-) create mode 100644 gymlib_package/gymlib/symlinks_paths.py diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index 84df02ef..420fae7d 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -2,13 +2,13 @@ from typing import Optional import click +from gymlib.symlinks_paths import get_tables_dirname from benchmark.constants import DEFAULT_SCALE_FACTOR from util.log import DBGYM_LOGGER_NAME from util.shell import subprocess_run from util.workspace import ( DBGymWorkspace, - get_default_tables_dname, get_workload_name, is_fully_resolved, link_result, @@ -172,7 +172,7 @@ def _download_job_data(dbgym_workspace: DBGymWorkspace) -> None: dbgym_workspace, JOB_TABLES_URL, "imdb.tgz", - get_default_tables_dname(DEFAULT_SCALE_FACTOR), + get_tables_dirname("job", DEFAULT_SCALE_FACTOR), ) diff --git a/benchmark/job/load_info.py b/benchmark/job/load_info.py index e205847d..3a64ab57 100644 --- a/benchmark/job/load_info.py +++ b/benchmark/job/load_info.py @@ -1,16 +1,16 @@ from pathlib import Path from typing import Optional +from gymlib.symlinks_paths import get_tables_symlink_path + from benchmark.constants import DEFAULT_SCALE_FACTOR from dbms.load_info_base_class import LoadInfoBaseClass -from util.workspace import DBGymWorkspace, get_default_tables_dname, is_fully_resolved +from util.workspace import DBGymWorkspace, fully_resolve_path JOB_SCHEMA_FNAME = "job_schema.sql" class JobLoadInfo(LoadInfoBaseClass): - CODEBASE_PATH_COMPONENTS = ["dbgym", "benchmark", "job"] - CODEBASE_DNAME = "_".join(CODEBASE_PATH_COMPONENTS) TABLES = [ "aka_name", "aka_title", @@ -36,38 +36,29 @@ class JobLoadInfo(LoadInfoBaseClass): ] def __init__(self, 
dbgym_workspace: DBGymWorkspace): - # schema and constraints - schema_root_dpath = dbgym_workspace.base_dbgym_repo_dpath - for component in JobLoadInfo.CODEBASE_PATH_COMPONENTS[ - 1: - ]: # [1:] to skip "dbgym" - schema_root_dpath /= component - self._schema_fpath = schema_root_dpath / JOB_SCHEMA_FNAME + # Schema (directly in the codebase). + job_codebase_path = dbgym_workspace.base_dbgym_repo_dpath / "benchmark" / "job" + self._schema_fpath = job_codebase_path / JOB_SCHEMA_FNAME assert ( self._schema_fpath.exists() ), f"self._schema_fpath ({self._schema_fpath}) does not exist" # Tables - data_root_dpath = ( - dbgym_workspace.dbgym_symlinks_path / JobLoadInfo.CODEBASE_DNAME / "data" - ) - tables_symlink_dpath = ( - data_root_dpath / f"{get_default_tables_dname(DEFAULT_SCALE_FACTOR)}.link" + tables_path = fully_resolve_path( + get_tables_symlink_path( + dbgym_workspace.dbgym_workspace_path, "job", DEFAULT_SCALE_FACTOR + ) ) - tables_dpath = tables_symlink_dpath.resolve() - assert is_fully_resolved( - tables_dpath - ), f"tables_dpath ({tables_dpath}) should be an existent real absolute path. Make sure you have generated the TPC-H data" - self._tables_and_fpaths = [] + self._tables_and_paths = [] for table in JobLoadInfo.TABLES: - table_fpath = tables_dpath / f"{table}.csv" - self._tables_and_fpaths.append((table, table_fpath)) + table_fpath = tables_path / f"{table}.csv" + self._tables_and_paths.append((table, table_fpath)) def get_schema_fpath(self) -> Path: return self._schema_fpath - def get_tables_and_fpaths(self) -> list[tuple[str, Path]]: - return self._tables_and_fpaths + def get_tables_and_paths(self) -> list[tuple[str, Path]]: + return self._tables_and_paths def get_table_file_delimiter(self) -> str: return "," diff --git a/benchmark/tests/integtest_benchmark.py b/benchmark/tests/integtest_benchmark.py index c3fef054..5b4a06e8 100644 --- a/benchmark/tests/integtest_benchmark.py +++ b/benchmark/tests/integtest_benchmark.py @@ -1,20 +1,31 @@ +import shutil import unittest from pathlib import Path +from gymlib.symlinks_paths import get_tables_symlink_path + # It's ok to import private functions from the benchmark module because this is an integration test. from benchmark.tpch.cli import _tpch_data -from util.workspace import DBGymWorkspace, get_default_tables_path, get_workspace_path_from_config +from util.workspace import DBGymWorkspace, get_workspace_path_from_config class TestBenchmark(unittest.TestCase): DBGYM_CONFIG_PATH = Path("benchmark/tests/benchmark_integtest_dbgym_config.yaml") def setUp(self) -> None: - self.workspace = DBGymWorkspace(get_workspace_path_from_config(TestBenchmark.DBGYM_CONFIG_PATH)) + workspace_path = get_workspace_path_from_config(TestBenchmark.DBGYM_CONFIG_PATH) + # Get a clean start each time. 
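+        # (Note: shutil.rmtree() raises FileNotFoundError if the workspace
+        # doesn't exist yet, e.g. on a first-ever run.)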
+ shutil.rmtree(workspace_path) + self.workspace = DBGymWorkspace(workspace_path) + + # def tearDown(self) -> None: + # shutil.rmtree(self.workspace.dbgym_workspace_path) def test_tpch_data(self) -> None: scale_factor = 0.01 - tables_path = get_default_tables_path(self.workspace.dbgym_workspace_path, "tpch", scale_factor) + tables_path = get_tables_symlink_path( + self.workspace.dbgym_workspace_path, "tpch", scale_factor + ) self.assertFalse(tables_path.exists()) _tpch_data(self.workspace, scale_factor) self.assertTrue(tables_path.exists()) diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index 5ecac290..61db9d6e 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -2,6 +2,7 @@ from pathlib import Path import click +from gymlib.symlinks_paths import get_tables_dirname, get_tables_symlink_path from benchmark.constants import DEFAULT_SCALE_FACTOR from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES @@ -9,13 +10,14 @@ from util.shell import subprocess_run from util.workspace import ( DBGymWorkspace, - get_default_tables_dname, get_scale_factor_string, get_workload_name, is_fully_resolved, link_result, ) +TPCH_KIT_DIRNAME = "tpch-kit" + @click.group(name="tpch") @click.pass_obj @@ -77,13 +79,13 @@ def tpch_workload( ) -def _get_queries_dname(seed: int, scale_factor: float) -> str: +def _get_queries_dirname(seed: int, scale_factor: float) -> str: return f"queries_{seed}_sf{get_scale_factor_string(scale_factor)}" def _clone_tpch_kit(dbgym_workspace: DBGymWorkspace) -> None: - expected_symlink_dpath = ( - dbgym_workspace.cur_symlinks_build_path(mkdir=True) / "tpch-kit.link" + expected_symlink_dpath = dbgym_workspace.dbgym_cur_symlinks_path / ( + TPCH_KIT_DIRNAME + ".link" ) if expected_symlink_dpath.exists(): logging.getLogger(DBGYM_LOGGER_NAME).info( @@ -92,60 +94,57 @@ def _clone_tpch_kit(dbgym_workspace: DBGymWorkspace) -> None: return logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloning: {expected_symlink_dpath}") - real_build_path = dbgym_workspace.cur_task_runs_build_path() subprocess_run( - f"./clone_tpch_kit.sh {real_build_path}", cwd=dbgym_workspace.base_dbgym_repo_dpath / "benchmark" / "tpch" + f"./clone_tpch_kit.sh {dbgym_workspace.dbgym_this_run_path}", + cwd=dbgym_workspace.base_dbgym_repo_dpath / "benchmark" / "tpch", + ) + symlink_dpath = dbgym_workspace.link_result( + dbgym_workspace.dbgym_this_run_path / TPCH_KIT_DIRNAME ) - symlink_dpath = link_result(dbgym_workspace, real_build_path / "tpch-kit") assert expected_symlink_dpath.samefile(symlink_dpath) logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloned: {expected_symlink_dpath}") -def _get_tpch_kit_dpath(dbgym_workspace: DBGymWorkspace) -> Path: - tpch_kit_dpath = ( - dbgym_workspace.cur_symlinks_build_path() / "tpch-kit.link" - ).resolve() - assert is_fully_resolved(tpch_kit_dpath) - return tpch_kit_dpath - - def _generate_tpch_queries( dbgym_workspace: DBGymWorkspace, seed_start: int, seed_end: int, scale_factor: float ) -> None: - tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_workspace) - data_path = dbgym_workspace.cur_symlinks_data_path(mkdir=True) + tpch_kit_dpath = dbgym_workspace.dbgym_cur_symlinks_path / ( + TPCH_KIT_DIRNAME + ".link" + ) logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Generating queries: {data_path} [{seed_start}, {seed_end}]" + f"Generating queries: [{seed_start}, {seed_end}]" ) for seed in range(seed_start, seed_end + 1): - expected_queries_symlink_dpath = data_path / ( - _get_queries_dname(seed, scale_factor) + ".link" + expected_queries_symlink_dpath = 
dbgym_workspace.dbgym_cur_symlinks_path / ( + _get_queries_dirname(seed, scale_factor) + ".link" ) if expected_queries_symlink_dpath.exists(): continue - real_dir = dbgym_workspace.cur_task_runs_data_path( - _get_queries_dname(seed, scale_factor), mkdir=True + queries_parent_path = ( + dbgym_workspace.dbgym_this_run_path + / _get_queries_dirname(seed, scale_factor) ) for i in range(1, NUM_TPCH_QUERIES + 1): - target_sql = (real_dir / f"{i}.sql").resolve() + target_sql = (queries_parent_path / f"{i}.sql").resolve() subprocess_run( f"DSS_QUERY=./queries ./qgen {i} -r {seed} -s {scale_factor} > {target_sql}", cwd=tpch_kit_dpath / "dbgen", verbose=False, ) - queries_symlink_dpath = link_result(dbgym_workspace, real_dir) + queries_symlink_dpath = dbgym_workspace.link_result(queries_parent_path) assert queries_symlink_dpath.samefile(expected_queries_symlink_dpath) logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Generated queries: {data_path} [{seed_start}, {seed_end}]" + f"Generated queries: [{seed_start}, {seed_end}]" ) def _generate_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: - tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_workspace) - data_path = dbgym_workspace.cur_symlinks_data_path(mkdir=True) - expected_tables_symlink_dpath = ( - data_path / f"{get_default_tables_dname(scale_factor)}.link" + tpch_kit_dpath = dbgym_workspace.dbgym_cur_symlinks_path / ( + TPCH_KIT_DIRNAME + ".link" + ) + expected_tables_symlink_dpath = get_tables_symlink_path( + dbgym_workspace.dbgym_workspace_path, "tpch", scale_factor ) if expected_tables_symlink_dpath.exists(): logging.getLogger(DBGYM_LOGGER_NAME).info( @@ -157,12 +156,13 @@ def _generate_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None f"Generating: {expected_tables_symlink_dpath}" ) subprocess_run(f"./dbgen -vf -s {scale_factor}", cwd=tpch_kit_dpath / "dbgen") - real_dir = dbgym_workspace.cur_task_runs_data_path( - get_default_tables_dname(scale_factor), mkdir=True + tables_parent_path = dbgym_workspace.dbgym_this_run_path / get_tables_dirname( + "tpch", scale_factor ) - subprocess_run(f"mv ./*.tbl {real_dir}", cwd=tpch_kit_dpath / "dbgen") + tables_parent_path.mkdir(parents=True, exist_ok=False) + subprocess_run(f"mv ./*.tbl {tables_parent_path}", cwd=tpch_kit_dpath / "dbgen") - tables_symlink_dpath = link_result(dbgym_workspace, real_dir) + tables_symlink_dpath = dbgym_workspace.link_result(tables_parent_path) assert tables_symlink_dpath.samefile(expected_tables_symlink_dpath) logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generated: {expected_tables_symlink_dpath}" @@ -202,7 +202,7 @@ def _generate_tpch_workload( for qname in query_names: sql_fpath = ( symlink_data_dpath - / (_get_queries_dname(seed, scale_factor) + ".link") + / (_get_queries_dirname(seed, scale_factor) + ".link") ).resolve() / f"{qname}.sql" assert is_fully_resolved( sql_fpath diff --git a/benchmark/tpch/load_info.py b/benchmark/tpch/load_info.py index f9710b04..0e5bcc64 100644 --- a/benchmark/tpch/load_info.py +++ b/benchmark/tpch/load_info.py @@ -1,18 +1,16 @@ from pathlib import Path from typing import Optional +from gymlib.symlinks_paths import get_tables_symlink_path + from dbms.load_info_base_class import LoadInfoBaseClass -from util.workspace import DBGymWorkspace, get_default_tables_dname, is_fully_resolved +from util.workspace import DBGymWorkspace, fully_resolve_path TPCH_SCHEMA_FNAME = "tpch_schema.sql" TPCH_CONSTRAINTS_FNAME = "tpch_constraints.sql" class TpchLoadInfo(LoadInfoBaseClass): - # currently, hardcoding the path seems like 
the easiest solution. If the path ever changes, it'll - # just break an integration test and we can fix it. I don't want to prematurely overengineer it - CODEBASE_PATH_COMPONENTS = ["dbgym", "benchmark", "tpch"] - CODEBASE_DNAME = "_".join(CODEBASE_PATH_COMPONENTS) TABLES = [ "region", "nation", @@ -25,42 +23,35 @@ class TpchLoadInfo(LoadInfoBaseClass): ] def __init__(self, dbgym_workspace: DBGymWorkspace, scale_factor: float): - # schema and constraints - schema_root_dpath = dbgym_workspace.base_dbgym_repo_dpath - for component in TpchLoadInfo.CODEBASE_PATH_COMPONENTS[ - 1: - ]: # [1:] to skip "dbgym" - schema_root_dpath /= component - self._schema_fpath = schema_root_dpath / TPCH_SCHEMA_FNAME + # Schema and constraints (directly in the codebase). + tpch_codebase_path = ( + dbgym_workspace.base_dbgym_repo_dpath / "benchmark" / "tpch" + ) + self._schema_fpath = tpch_codebase_path / TPCH_SCHEMA_FNAME assert ( self._schema_fpath.exists() ), f"self._schema_fpath ({self._schema_fpath}) does not exist" - self._constraints_fpath = schema_root_dpath / TPCH_CONSTRAINTS_FNAME + self._constraints_fpath = tpch_codebase_path / TPCH_CONSTRAINTS_FNAME assert ( self._constraints_fpath.exists() ), f"self._constraints_fpath ({self._constraints_fpath}) does not exist" - # tables - data_root_dpath = ( - dbgym_workspace.dbgym_symlinks_path / TpchLoadInfo.CODEBASE_DNAME / "data" - ) - tables_symlink_dpath = ( - data_root_dpath / f"{get_default_tables_dname(scale_factor)}.link" + # Tables + tables_path = fully_resolve_path( + get_tables_symlink_path( + dbgym_workspace.dbgym_workspace_path, "tpch", scale_factor + ) ) - tables_dpath = tables_symlink_dpath.resolve() - assert is_fully_resolved( - tables_dpath - ), f"tables_dpath ({tables_dpath}) should be an existent real absolute path. Make sure you have generated the TPC-H data" - self._tables_and_fpaths = [] + self._tables_and_paths = [] for table in TpchLoadInfo.TABLES: - table_fpath = tables_dpath / f"{table}.tbl" - self._tables_and_fpaths.append((table, table_fpath)) + table_path = tables_path / f"{table}.tbl" + self._tables_and_paths.append((table, table_path)) def get_schema_fpath(self) -> Path: return self._schema_fpath - def get_tables_and_fpaths(self) -> list[tuple[str, Path]]: - return self._tables_and_fpaths + def get_tables_and_paths(self) -> list[tuple[str, Path]]: + return self._tables_and_paths def get_table_file_delimiter(self) -> str: return "|" diff --git a/dbms/load_info_base_class.py b/dbms/load_info_base_class.py index 847579d6..bfbe763a 100644 --- a/dbms/load_info_base_class.py +++ b/dbms/load_info_base_class.py @@ -12,7 +12,7 @@ class LoadInfoBaseClass: def get_schema_fpath(self) -> Path: raise NotImplementedError - def get_tables_and_fpaths(self) -> list[tuple[str, Path]]: + def get_tables_and_paths(self) -> list[tuple[str, Path]]: raise NotImplementedError # We assume the table file has a "csv-like" format where values are separated by a delimiter. diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index 393689a1..326fc6d2 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -33,7 +33,7 @@ WORKSPACE_PATH_PLACEHOLDER, DBGymWorkspace, fully_resolve_path, - get_dbdata_tgz_name, + get_dbdata_tgz_filename, get_default_dbdata_parent_dpath, get_default_pgbin_path, is_fully_resolved, @@ -202,7 +202,7 @@ def _create_dbdata( # Note that you can't pass "[dbdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[dbdata].tgz" as a dir. 
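+    # (cur_task_runs_data_path() mkdirs the path it is given, so the .tgz
+    # filename is instead joined onto the returned directory afterward.)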
dbdata_tgz_real_fpath = dbgym_workspace.cur_task_runs_data_path( mkdir=True - ) / get_dbdata_tgz_name(benchmark_name, scale_factor) + ) / get_dbdata_tgz_filename(benchmark_name, scale_factor) # We need to cd into dbdata_dpath so that the tar file does not contain folders for the whole path of dbdata_dpath. subprocess_run(f"tar -czf {dbdata_tgz_real_fpath} .", cwd=dbdata_dpath) @@ -276,10 +276,10 @@ def _load_into_dbdata( sql_file_execute(dbgym_workspace, conn, load_info.get_schema_fpath()) # Truncate all tables first before even loading a single one. - for table, _ in load_info.get_tables_and_fpaths(): + for table, _ in load_info.get_tables_and_paths(): sqlalchemy_conn_execute(conn, f"TRUNCATE {table} CASCADE") # Then, load the tables. - for table, table_fpath in load_info.get_tables_and_fpaths(): + for table, table_fpath in load_info.get_tables_and_paths(): with open_and_save(dbgym_workspace, table_fpath, "r") as table_csv: assert conn.connection.dbapi_connection is not None cur = conn.connection.dbapi_connection.cursor() diff --git a/gymlib_package/gymlib/__init__.py b/gymlib_package/gymlib/__init__.py index 30e9d5c3..99257dda 100644 --- a/gymlib_package/gymlib/__init__.py +++ b/gymlib_package/gymlib/__init__.py @@ -1 +1 @@ -from . import magic +from . import magic, symlinks_paths diff --git a/gymlib_package/gymlib/symlinks_paths.py b/gymlib_package/gymlib/symlinks_paths.py new file mode 100644 index 00000000..5fb86b44 --- /dev/null +++ b/gymlib_package/gymlib/symlinks_paths.py @@ -0,0 +1,34 @@ +from pathlib import Path + +# TODO: move these into workspace.py and move workspace.py into gymlib. +SYMLINKS_DNAME = "symlinks" +DBGYM_APP_NAME = "dbgym" + +SCALE_FACTOR_PLACEHOLDER: str = "[scale_factor]" +BENCHMARK_NAME_PLACEHOLDER: str = "[benchmark_name]" +WORKLOAD_NAME_PLACEHOLDER: str = "[workload_name]" + + +def get_scale_factor_string(scale_factor: float | str) -> str: + if type(scale_factor) is str and scale_factor == SCALE_FACTOR_PLACEHOLDER: + return scale_factor + else: + if float(int(scale_factor)) == scale_factor: + return str(int(scale_factor)) + else: + return str(scale_factor).replace(".", "point") + + +def get_tables_dirname(benchmark: str, scale_factor: float | str) -> str: + return f"{benchmark}_sf{get_scale_factor_string(scale_factor)}_tables" + + +def get_tables_symlink_path( + workspace_path: Path, benchmark: str, scale_factor: float | str +) -> Path: + return ( + workspace_path + / SYMLINKS_DNAME + / DBGYM_APP_NAME + / get_tables_dirname(benchmark, scale_factor) + ) diff --git a/util/workspace.py b/util/workspace.py index 11e0d620..ce7cbd1f 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -45,24 +45,8 @@ def get_latest_run_path_from_workspace_path(workspace_path: Path) -> Path: return get_runs_path_from_workspace_path(workspace_path) / "latest_run.link" -def get_scale_factor_string(scale_factor: float | str) -> str: - if type(scale_factor) is str and scale_factor == SCALE_FACTOR_PLACEHOLDER: - return scale_factor - else: - if float(int(scale_factor)) == scale_factor: - return str(int(scale_factor)) - else: - return str(scale_factor).replace(".", "point") - - -def get_dbdata_tgz_name(benchmark_name: str, scale_factor: float | str) -> str: - return f"{benchmark_name}_sf{get_scale_factor_string(scale_factor)}_pristine_dbdata.tgz" - - # Other parameters -BENCHMARK_NAME_PLACEHOLDER: str = "[benchmark_name]" -WORKLOAD_NAME_PLACEHOLDER: str = "[workload_name]" -SCALE_FACTOR_PLACEHOLDER: str = "[scale_factor]" + # Paths of config files in the codebase. 
These are always relative paths. # The reason these can be relative paths instead of functions taking in codebase_path as input is because relative paths are relative to the codebase root @@ -70,8 +54,6 @@ def get_dbdata_tgz_name(benchmark_name: str, scale_factor: float | str) -> str: # Generally useful functions -def get_workload_name(scale_factor: float | str, suffix: str) -> str: - return f"workload_sf{get_scale_factor_string(scale_factor)}_{suffix}" def get_default_workload_name_suffix(benchmark_name: str) -> str: @@ -85,8 +67,6 @@ def get_default_workload_name_suffix(benchmark_name: str) -> str: # Standard names of files/directories. These can refer to either the actual file/directory or a link to the file/directory. # Since they can refer to either the actual or the link, they do not have ".link" in them. -def get_default_tables_dname(scale_factor: float | str) -> str: - return f"tables_sf{get_scale_factor_string(scale_factor)}" # Paths of dependencies in the workspace. These are named "*_path" because they will be an absolute path @@ -102,17 +82,6 @@ def get_default_tables_dname(scale_factor: float | str) -> str: # folder called run_*/*/tuning_steps. However, replay itself generates an replay_info.log file, which goes in # run_*/*/tuning_steps/. The bug was that my replay function was overwriting the replay_info.log file of the # tuning run. By naming all symlinks "*.link", we avoid the possibility of subtle bugs like this happening. -def get_default_tables_path( - workspace_path: Path, benchmark_name: str, scale_factor: float | str -) -> Path: - return ( - get_symlinks_path_from_workspace_path(workspace_path) - / f"dbgym_benchmark_{benchmark_name}" - / "data" - / (get_default_tables_dname(scale_factor) + ".link") - ) - - def get_default_workload_path( workspace_path: Path, benchmark_name: str, workload_name: str ) -> Path: @@ -124,6 +93,27 @@ def get_default_workload_path( ) +SCALE_FACTOR_PLACEHOLDER: str = "[scale_factor]" + + +def get_scale_factor_string(scale_factor: float | str) -> str: + if type(scale_factor) is str and scale_factor == SCALE_FACTOR_PLACEHOLDER: + return scale_factor + else: + if float(int(scale_factor)) == scale_factor: + return str(int(scale_factor)) + else: + return str(scale_factor).replace(".", "point") + + +def get_dbdata_tgz_filename(benchmark_name: str, scale_factor: float | str) -> str: + return f"{benchmark_name}_sf{get_scale_factor_string(scale_factor)}_pristine_dbdata.tgz" + + +def get_workload_name(scale_factor: float | str, suffix: str) -> str: + return f"workload_sf{get_scale_factor_string(scale_factor)}_{suffix}" + + def get_default_pristine_dbdata_snapshot_path( workspace_path: Path, benchmark_name: str, scale_factor: float | str ) -> Path: @@ -131,7 +121,7 @@ def get_default_pristine_dbdata_snapshot_path( get_symlinks_path_from_workspace_path(workspace_path) / "dbgym_dbms_postgres" / "data" - / (get_dbdata_tgz_name(benchmark_name, scale_factor) + ".link") + / (get_dbdata_tgz_filename(benchmark_name, scale_factor) + ".link") ) @@ -187,6 +177,7 @@ def __init__(self, dbgym_workspace_path: Path): self.dbgym_workspace_path ) self.dbgym_symlinks_path.mkdir(parents=True, exist_ok=True) + self.dbgym_cur_symlinks_path = self.dbgym_symlinks_path / self.app_name # tmp/ is a workspace for this run only # One use for it is to place the unzipped dbdata. 
# There's no need to save the actual dbdata dir in run_*/ because we just save a symlink to @@ -357,7 +348,7 @@ def cur_source_path(self, *dirs: str) -> Path: cur_path = cur_path / dir return cur_path - def cur_symlinks_path(self, *dirs: str, mkdir: bool = False) -> Path: + def _cur_symlinks_path(self, *dirs: str, mkdir: bool = False) -> Path: flattened_structure = "_".join(self.cur_path_list) cur_path = self.dbgym_symlinks_path / flattened_structure for dir in dirs: @@ -376,13 +367,13 @@ def cur_task_runs_path(self, *dirs: str, mkdir: bool = False) -> Path: return cur_path def cur_symlinks_bin_path(self, *dirs: str, mkdir: bool = False) -> Path: - return self.cur_symlinks_path("bin", *dirs, mkdir=mkdir) + return self._cur_symlinks_path("bin", *dirs, mkdir=mkdir) def cur_symlinks_build_path(self, *dirs: str, mkdir: bool = False) -> Path: - return self.cur_symlinks_path("build", *dirs, mkdir=mkdir) + return self._cur_symlinks_path("build", *dirs, mkdir=mkdir) def cur_symlinks_data_path(self, *dirs: str, mkdir: bool = False) -> Path: - return self.cur_symlinks_path("data", *dirs, mkdir=mkdir) + return self._cur_symlinks_path("data", *dirs, mkdir=mkdir) def cur_task_runs_build_path(self, *dirs: str, mkdir: bool = False) -> Path: return self.cur_task_runs_path("build", *dirs, mkdir=mkdir) From 0f00113161e15704caa01e1117efa9b1667daeeb Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 29 Dec 2024 15:16:12 -0500 Subject: [PATCH 03/24] tpch data -> tables --- benchmark/tests/integtest_benchmark.py | 6 +++--- benchmark/tpch/cli.py | 15 +++++++-------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/benchmark/tests/integtest_benchmark.py b/benchmark/tests/integtest_benchmark.py index 5b4a06e8..3d046bcf 100644 --- a/benchmark/tests/integtest_benchmark.py +++ b/benchmark/tests/integtest_benchmark.py @@ -5,7 +5,7 @@ from gymlib.symlinks_paths import get_tables_symlink_path # It's ok to import private functions from the benchmark module because this is an integration test. -from benchmark.tpch.cli import _tpch_data +from benchmark.tpch.cli import _tpch_tables from util.workspace import DBGymWorkspace, get_workspace_path_from_config @@ -21,13 +21,13 @@ def setUp(self) -> None: # def tearDown(self) -> None: # shutil.rmtree(self.workspace.dbgym_workspace_path) - def test_tpch_data(self) -> None: + def test_tpch_tables(self) -> None: scale_factor = 0.01 tables_path = get_tables_symlink_path( self.workspace.dbgym_workspace_path, "tpch", scale_factor ) self.assertFalse(tables_path.exists()) - _tpch_data(self.workspace, scale_factor) + _tpch_tables(self.workspace, scale_factor) self.assertTrue(tables_path.exists()) diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index 61db9d6e..ce58de29 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -1,5 +1,4 @@ import logging -from pathlib import Path import click from gymlib.symlinks_paths import get_tables_dirname, get_tables_symlink_path @@ -25,21 +24,21 @@ def tpch_group(dbgym_workspace: DBGymWorkspace) -> None: dbgym_workspace.append_group("tpch") -@tpch_group.command(name="data") +@tpch_group.command(name="tables") @click.argument("scale-factor", type=float) @click.pass_obj -# The reason generate data is separate from create dbdata is because generate-data is generic +# The reason generate tables is separate from create dbdata is because tpch_tables is generic # to all DBMSs while create dbdata is specific to a single DBMS. 
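+# (E.g., the generated .tbl files could be loaded into any DBMS, while
+# creating dbdata is Postgres-specific; see dbms/postgres/cli.py.)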
-def tpch_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: - _tpch_data(dbgym_workspace, scale_factor) +def tpch_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: + _tpch_tables(dbgym_workspace, scale_factor) -def _tpch_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: +def _tpch_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: """ This function exists as a hook for integration tests. """ _clone_tpch_kit(dbgym_workspace) - _generate_data(dbgym_workspace, scale_factor) + _generate_tpch_tables(dbgym_workspace, scale_factor) @tpch_group.command(name="workload") @@ -139,7 +138,7 @@ def _generate_tpch_queries( ) -def _generate_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: +def _generate_tpch_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: tpch_kit_dpath = dbgym_workspace.dbgym_cur_symlinks_path / ( TPCH_KIT_DIRNAME + ".link" ) From 74c32e2cb039163365ba992da619ad821e810029 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 29 Dec 2024 15:44:34 -0500 Subject: [PATCH 04/24] now passing test_tpch_workload --- benchmark/job/cli.py | 19 +++++---- benchmark/tests/integtest_benchmark.py | 44 +++++++++++++++++--- benchmark/tpch/cli.py | 54 ++++++++++++++++++------- env/tests/gymlib_integtest_util.py | 20 ++++++--- env/tests/integtest_workload.py | 18 +-------- gymlib_package/gymlib/symlinks_paths.py | 31 +++++++++++++- util/workspace.py | 20 --------- 7 files changed, 134 insertions(+), 72 deletions(-) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index 420fae7d..162cf337 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -2,17 +2,16 @@ from typing import Optional import click -from gymlib.symlinks_paths import get_tables_dirname +from gymlib.symlinks_paths import ( + get_tables_dirname, + get_workload_dirname, + get_workload_suffix, +) from benchmark.constants import DEFAULT_SCALE_FACTOR from util.log import DBGYM_LOGGER_NAME from util.shell import subprocess_run -from util.workspace import ( - DBGymWorkspace, - get_workload_name, - is_fully_resolved, - link_result, -) +from util.workspace import DBGymWorkspace, is_fully_resolved, link_result JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz" JOB_QUERIES_URL = "https://event.cwi.nl/da/job/job.tgz" @@ -235,7 +234,11 @@ def _generate_job_workload( dbgym_workspace: DBGymWorkspace, query_subset: str, ) -> None: - workload_name = get_workload_name(DEFAULT_SCALE_FACTOR, query_subset) + workload_name = get_workload_dirname( + "job", + DEFAULT_SCALE_FACTOR, + get_workload_suffix("job", query_subset=query_subset), + ) expected_workload_symlink_dpath = dbgym_workspace.cur_symlinks_data_path( mkdir=True ) / (workload_name + ".link") diff --git a/benchmark/tests/integtest_benchmark.py b/benchmark/tests/integtest_benchmark.py index 3d046bcf..a7cc3402 100644 --- a/benchmark/tests/integtest_benchmark.py +++ b/benchmark/tests/integtest_benchmark.py @@ -2,11 +2,20 @@ import unittest from pathlib import Path -from gymlib.symlinks_paths import get_tables_symlink_path +from gymlib.symlinks_paths import ( + get_tables_symlink_path, + get_workload_suffix, + get_workload_symlink_path, +) # It's ok to import private functions from the benchmark module because this is an integration test. 
-from benchmark.tpch.cli import _tpch_tables -from util.workspace import DBGymWorkspace, get_workspace_path_from_config +from benchmark.tpch.cli import _tpch_tables, _tpch_workload +from benchmark.tpch.constants import DEFAULT_TPCH_SEED +from util.workspace import ( + DBGymWorkspace, + fully_resolve_path, + get_workspace_path_from_config, +) class TestBenchmark(unittest.TestCase): @@ -15,11 +24,13 @@ class TestBenchmark(unittest.TestCase): def setUp(self) -> None: workspace_path = get_workspace_path_from_config(TestBenchmark.DBGYM_CONFIG_PATH) # Get a clean start each time. - shutil.rmtree(workspace_path) + if workspace_path.exists(): + shutil.rmtree(workspace_path) self.workspace = DBGymWorkspace(workspace_path) - # def tearDown(self) -> None: - # shutil.rmtree(self.workspace.dbgym_workspace_path) + def tearDown(self) -> None: + if self.workspace.dbgym_workspace_path.exists(): + shutil.rmtree(self.workspace.dbgym_workspace_path) def test_tpch_tables(self) -> None: scale_factor = 0.01 @@ -29,6 +40,27 @@ def test_tpch_tables(self) -> None: self.assertFalse(tables_path.exists()) _tpch_tables(self.workspace, scale_factor) self.assertTrue(tables_path.exists()) + self.assertTrue(fully_resolve_path(tables_path).exists()) + + def test_tpch_workload(self) -> None: + scale_factor = 0.01 + workload_path = get_workload_symlink_path( + self.workspace.dbgym_workspace_path, + "tpch", + scale_factor, + get_workload_suffix( + "tpch", + seed_start=DEFAULT_TPCH_SEED, + seed_end=DEFAULT_TPCH_SEED, + query_subset="all", + ), + ) + self.assertFalse(workload_path.exists()) + _tpch_workload( + self.workspace, DEFAULT_TPCH_SEED, DEFAULT_TPCH_SEED, "all", scale_factor + ) + self.assertTrue(workload_path.exists()) + self.assertTrue(fully_resolve_path(workload_path).exists()) if __name__ == "__main__": diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index ce58de29..89cb5fd7 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -1,7 +1,12 @@ import logging import click -from gymlib.symlinks_paths import get_tables_dirname, get_tables_symlink_path +from gymlib.symlinks_paths import ( + get_tables_dirname, + get_tables_symlink_path, + get_workload_dirname, + get_workload_suffix, +) from benchmark.constants import DEFAULT_SCALE_FACTOR from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES @@ -9,8 +14,8 @@ from util.shell import subprocess_run from util.workspace import ( DBGymWorkspace, + fully_resolve_path, get_scale_factor_string, - get_workload_name, is_fully_resolved, link_result, ) @@ -68,6 +73,19 @@ def tpch_workload( query_subset: str, scale_factor: float, ) -> None: + _tpch_workload(dbgym_workspace, seed_start, seed_end, query_subset, scale_factor) + + +def _tpch_workload( + dbgym_workspace: DBGymWorkspace, + seed_start: int, + seed_end: int, + query_subset: str, + scale_factor: float, +) -> None: + """ + This function exists as a hook for integration tests. 
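+    Like _tpch_tables(), it lets tests bypass the Click-decorated command.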
+ """ assert ( seed_start <= seed_end ), f"seed_start ({seed_start}) must be <= seed_end ({seed_end})" @@ -124,6 +142,7 @@ def _generate_tpch_queries( dbgym_workspace.dbgym_this_run_path / _get_queries_dirname(seed, scale_factor) ) + queries_parent_path.mkdir(parents=False, exist_ok=False) for i in range(1, NUM_TPCH_QUERIES + 1): target_sql = (queries_parent_path / f"{i}.sql").resolve() subprocess_run( @@ -158,7 +177,7 @@ def _generate_tpch_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) tables_parent_path = dbgym_workspace.dbgym_this_run_path / get_tables_dirname( "tpch", scale_factor ) - tables_parent_path.mkdir(parents=True, exist_ok=False) + tables_parent_path.mkdir(parents=False, exist_ok=False) subprocess_run(f"mv ./*.tbl {tables_parent_path}", cwd=tpch_kit_dpath / "dbgen") tables_symlink_dpath = dbgym_workspace.link_result(tables_parent_path) @@ -175,16 +194,22 @@ def _generate_tpch_workload( query_subset: str, scale_factor: float, ) -> None: - symlink_data_dpath = dbgym_workspace.cur_symlinks_data_path(mkdir=True) - workload_name = get_workload_name( - scale_factor, f"{seed_start}_{seed_end}_{query_subset}" + workload_name = get_workload_dirname( + "tpch", + scale_factor, + get_workload_suffix( + "tpch", seed_start=seed_start, seed_end=seed_end, query_subset=query_subset + ), + ) + expected_workload_symlink_dpath = dbgym_workspace.dbgym_cur_symlinks_path / ( + workload_name + ".link" ) - expected_workload_symlink_dpath = symlink_data_dpath / (workload_name + ".link") logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generating: {expected_workload_symlink_dpath}" ) - real_dpath = dbgym_workspace.cur_task_runs_data_path(workload_name, mkdir=True) + workload_path = dbgym_workspace.dbgym_this_run_path / workload_name + workload_path.mkdir(parents=False, exist_ok=False) query_names = None if query_subset == "all": @@ -196,20 +221,21 @@ def _generate_tpch_workload( else: assert False - with open(real_dpath / "order.txt", "w") as f: + with open(workload_path / "order.txt", "w") as f: for seed in range(seed_start, seed_end + 1): + queries_parent_path = dbgym_workspace.dbgym_cur_symlinks_path / ( + _get_queries_dirname(seed, scale_factor) + ".link" + ) + for qname in query_names: - sql_fpath = ( - symlink_data_dpath - / (_get_queries_dirname(seed, scale_factor) + ".link") - ).resolve() / f"{qname}.sql" + sql_fpath = fully_resolve_path(queries_parent_path / f"{qname}.sql") assert is_fully_resolved( sql_fpath ), "We should only write existent real absolute paths to a file" f.write(f"S{seed}-Q{qname},{sql_fpath}\n") # TODO(WAN): add option to deep-copy the workload. 
- workload_symlink_dpath = link_result(dbgym_workspace, real_dpath) + workload_symlink_dpath = dbgym_workspace.link_result(workload_path) assert workload_symlink_dpath == expected_workload_symlink_dpath logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generated: {expected_workload_symlink_dpath}" diff --git a/env/tests/gymlib_integtest_util.py b/env/tests/gymlib_integtest_util.py index 597d6e73..115cb8d7 100644 --- a/env/tests/gymlib_integtest_util.py +++ b/env/tests/gymlib_integtest_util.py @@ -3,6 +3,10 @@ from pathlib import Path from typing import Optional +# TODO: remove symlinks_paths from the import +from gymlib.symlinks_paths import get_workload_dirname, get_workload_suffix + +from benchmark.tpch.constants import DEFAULT_TPCH_SEED from env.tuning_artifacts import TuningMetadata from util.workspace import ( DBGymWorkspace, @@ -10,9 +14,7 @@ get_default_dbdata_parent_dpath, get_default_pgbin_path, get_default_pristine_dbdata_snapshot_path, - get_default_workload_name_suffix, get_default_workload_path, - get_workload_name, get_workspace_path_from_config, ) @@ -75,16 +77,22 @@ def get_dbgym_workspace() -> DBGymWorkspace: @staticmethod def get_default_metadata() -> TuningMetadata: dbgym_workspace = GymlibIntegtestManager.get_dbgym_workspace() + assert GymlibIntegtestManager.BENCHMARK == "tpch" + suffix = get_workload_suffix( + GymlibIntegtestManager.BENCHMARK, + seed_start=DEFAULT_TPCH_SEED, + seed_end=DEFAULT_TPCH_SEED, + query_subset="all", + ) return TuningMetadata( workload_path=fully_resolve_path( get_default_workload_path( dbgym_workspace.dbgym_workspace_path, GymlibIntegtestManager.BENCHMARK, - get_workload_name( + get_workload_dirname( + GymlibIntegtestManager.BENCHMARK, GymlibIntegtestManager.SCALE_FACTOR, - get_default_workload_name_suffix( - GymlibIntegtestManager.BENCHMARK - ), + suffix, ), ), ), diff --git a/env/tests/integtest_workload.py b/env/tests/integtest_workload.py index ee6fc068..a30f975b 100644 --- a/env/tests/integtest_workload.py +++ b/env/tests/integtest_workload.py @@ -3,12 +3,7 @@ from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES from env.tests.gymlib_integtest_util import GymlibIntegtestManager from env.workload import Workload -from util.workspace import ( - fully_resolve_path, - get_default_workload_name_suffix, - get_default_workload_path, - get_workload_name, -) +from util.workspace import fully_resolve_path, get_default_workload_path class WorkloadTests(unittest.TestCase): @@ -17,16 +12,7 @@ def setUpClass() -> None: GymlibIntegtestManager.set_up_workspace() def test_workload(self) -> None: - workload_dpath = fully_resolve_path( - get_default_workload_path( - GymlibIntegtestManager.get_dbgym_workspace().dbgym_workspace_path, - GymlibIntegtestManager.BENCHMARK, - get_workload_name( - GymlibIntegtestManager.SCALE_FACTOR, - get_default_workload_name_suffix(GymlibIntegtestManager.BENCHMARK), - ), - ), - ) + workload_dpath = GymlibIntegtestManager.get_default_metadata().workload_path workload = Workload( GymlibIntegtestManager.get_dbgym_workspace(), workload_dpath diff --git a/gymlib_package/gymlib/symlinks_paths.py b/gymlib_package/gymlib/symlinks_paths.py index 5fb86b44..a25ffdac 100644 --- a/gymlib_package/gymlib/symlinks_paths.py +++ b/gymlib_package/gymlib/symlinks_paths.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import Any # TODO: move these into workspace.py and move workspace.py into gymlib. 
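+# Example with a hypothetical workspace root: get_tables_symlink_path(
+#     Path("/ws"), "tpch", 0.01
+# ) -> /ws/symlinks/dbgym/tables_tpch_sf0point01.link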
SYMLINKS_DNAME = "symlinks" @@ -20,7 +21,22 @@ def get_scale_factor_string(scale_factor: float | str) -> str: def get_tables_dirname(benchmark: str, scale_factor: float | str) -> str: - return f"{benchmark}_sf{get_scale_factor_string(scale_factor)}_tables" + return f"tables_{benchmark}_sf{get_scale_factor_string(scale_factor)}" + + +def get_workload_suffix(benchmark: str, **kwargs: Any) -> str: + if benchmark == "tpch": + assert kwargs.keys() == {"seed_start", "seed_end", "query_subset"} + return f"{kwargs['seed_start']}_{kwargs['seed_end']}_{kwargs['query_subset']}" + elif benchmark == "job": + assert kwargs.keys() == {"query_subset"} + return f"{kwargs['query_subset']}" + else: + assert False + + +def get_workload_dirname(benchmark: str, scale_factor: float | str, suffix: str) -> str: + return f"workload_{benchmark}_sf{get_scale_factor_string(scale_factor)}_{suffix}" def get_tables_symlink_path( @@ -30,5 +46,16 @@ def get_tables_symlink_path( workspace_path / SYMLINKS_DNAME / DBGYM_APP_NAME - / get_tables_dirname(benchmark, scale_factor) + / (get_tables_dirname(benchmark, scale_factor) + ".link") + ) + + +def get_workload_symlink_path( + workspace_path: Path, benchmark: str, scale_factor: float | str, suffix: str +) -> Path: + return ( + workspace_path + / SYMLINKS_DNAME + / DBGYM_APP_NAME + / (get_workload_dirname(benchmark, scale_factor, suffix) + ".link") ) diff --git a/util/workspace.py b/util/workspace.py index ce7cbd1f..2f6739df 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -53,22 +53,6 @@ def get_latest_run_path_from_workspace_path(workspace_path: Path) -> Path: DEFAULT_BOOT_CONFIG_FPATH = POSTGRES_PATH / "default_boot_config.yaml" -# Generally useful functions - - -def get_default_workload_name_suffix(benchmark_name: str) -> str: - if benchmark_name == "tpch": - return f"{DEFAULT_TPCH_SEED}_{DEFAULT_TPCH_SEED}_all" - if benchmark_name == "job": - return "all" - else: - assert False - - -# Standard names of files/directories. These can refer to either the actual file/directory or a link to the file/directory. -# Since they can refer to either the actual or the link, they do not have ".link" in them. - - # Paths of dependencies in the workspace. These are named "*_path" because they will be an absolute path # The reason these _cannot_ be relative paths is because relative paths are relative to the codebase root, not the workspace root # Note that it's okay to hardcode the codebase paths (like dbgym_dbms_postgres) here. 
In the worst case, we'll just break an @@ -110,10 +94,6 @@ def get_dbdata_tgz_filename(benchmark_name: str, scale_factor: float | str) -> s return f"{benchmark_name}_sf{get_scale_factor_string(scale_factor)}_pristine_dbdata.tgz" -def get_workload_name(scale_factor: float | str, suffix: str) -> str: - return f"workload_sf{get_scale_factor_string(scale_factor)}_{suffix}" - - def get_default_pristine_dbdata_snapshot_path( workspace_path: Path, benchmark_name: str, scale_factor: float | str ) -> Path: From 8c2c4ac83b880333543ebe13c01b61c8be7d1e6b Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 29 Dec 2024 15:53:44 -0500 Subject: [PATCH 05/24] added and passed test_job_tables --- benchmark/job/cli.py | 67 +++++++++++++++----------- benchmark/tests/integtest_benchmark.py | 12 +++++ 2 files changed, 50 insertions(+), 29 deletions(-) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index 162cf337..07685084 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -139,15 +139,19 @@ def job_group(dbgym_workspace: DBGymWorkspace) -> None: dbgym_workspace.append_group("job") -@job_group.command(name="data") +@job_group.command(name="tables") # We expose this option to keep its interface consistent with other workloads, but you should never pass in something other than DEFAULT_SCALE_FACTOR. @click.argument("scale-factor", type=float) @click.pass_obj -# The reason generate data is separate from create dbdata is because generate-data is generic +# The reason generate data is separate from create dbdata is because generate data is generic # to all DBMSs while create dbdata is specific to a single DBMS. -def job_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: +def job_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: + _job_tables(dbgym_workspace, scale_factor) + + +def _job_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: assert scale_factor == DEFAULT_SCALE_FACTOR - _download_job_data(dbgym_workspace) + _download_job_tables(dbgym_workspace) @job_group.command(name="workload") @@ -166,7 +170,7 @@ def job_workload( _generate_job_workload(dbgym_workspace, query_subset) -def _download_job_data(dbgym_workspace: DBGymWorkspace) -> None: +def _download_job_tables(dbgym_workspace: DBGymWorkspace) -> None: _download_and_untar_dir( dbgym_workspace, JOB_TABLES_URL, @@ -198,36 +202,41 @@ def _download_and_untar_dir( an "original" directory name. If this is the case, you should set `untarred_original_dname` to ensure that it gets renamed to `untarred_dname`. 
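+    (Hypothetical example: an archive that unpacks to "foo-master/" could be
+    stored as "foo/" via untarred_original_dname="foo-master", untarred_dname="foo".)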
""" - expected_symlink_dpath = ( - dbgym_workspace.cur_symlinks_data_path(mkdir=True) / f"{untarred_dname}.link" + expected_symlink_path = ( + dbgym_workspace.dbgym_cur_symlinks_path / f"{untarred_dname}.link" ) - if expected_symlink_dpath.exists(): + if expected_symlink_path.exists(): logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Skipping download: {expected_symlink_dpath}" + f"Skipping download: {expected_symlink_path}" ) return - logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_dpath}") - real_data_path = dbgym_workspace.cur_task_runs_data_path(mkdir=True) - subprocess_run(f"curl -O {download_url}", cwd=real_data_path) - untarred_data_dpath = dbgym_workspace.cur_task_runs_data_path(untarred_dname) + logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_path}") + subprocess_run(f"curl -O {download_url}", cwd=dbgym_workspace.dbgym_this_run_path) + untarred_data_path = dbgym_workspace.dbgym_this_run_path / untarred_dname if untarred_original_dname is not None: - assert not untarred_data_dpath.exists() - subprocess_run(f"tar -zxvf {download_tarred_fname}", cwd=real_data_path) - assert (real_data_path / untarred_original_dname).exists() + assert not untarred_data_path.exists() subprocess_run( - f"mv {untarred_original_dname} {untarred_dname}", cwd=real_data_path + f"tar -zxvf {download_tarred_fname}", + cwd=dbgym_workspace.dbgym_this_run_path, + ) + assert (dbgym_workspace.dbgym_this_run_path / untarred_original_dname).exists() + subprocess_run( + f"mv {untarred_original_dname} {untarred_dname}", + cwd=dbgym_workspace.dbgym_this_run_path, ) else: - untarred_data_dpath.mkdir(parents=True, exist_ok=False) - subprocess_run(f"tar -zxvf ../{download_tarred_fname}", cwd=untarred_data_dpath) + untarred_data_path.mkdir(parents=True, exist_ok=False) + subprocess_run(f"tar -zxvf ../{download_tarred_fname}", cwd=untarred_data_path) - assert untarred_data_dpath.exists() - subprocess_run(f"rm {download_tarred_fname}", cwd=real_data_path) - symlink_dpath = link_result(dbgym_workspace, untarred_data_dpath) - assert expected_symlink_dpath.samefile(symlink_dpath) - logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloaded: {expected_symlink_dpath}") + assert untarred_data_path.exists() + subprocess_run( + f"rm {download_tarred_fname}", cwd=dbgym_workspace.dbgym_this_run_path + ) + symlink_path = dbgym_workspace.link_result(untarred_data_path) + assert expected_symlink_path.samefile(symlink_path) + logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloaded: {expected_symlink_path}") def _generate_job_workload( @@ -239,12 +248,12 @@ def _generate_job_workload( DEFAULT_SCALE_FACTOR, get_workload_suffix("job", query_subset=query_subset), ) - expected_workload_symlink_dpath = dbgym_workspace.cur_symlinks_data_path( + expected_workload_symlink_path = dbgym_workspace.cur_symlinks_data_path( mkdir=True ) / (workload_name + ".link") logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Generating: {expected_workload_symlink_dpath}" + f"Generating: {expected_workload_symlink_path}" ) real_dpath = dbgym_workspace.cur_task_runs_data_path(workload_name, mkdir=True) @@ -269,8 +278,8 @@ def _generate_job_workload( ), "We should only write existent real absolute paths to a file" f.write(f"Q{qname},{sql_fpath}\n") - workload_symlink_dpath = link_result(dbgym_workspace, real_dpath) - assert workload_symlink_dpath == expected_workload_symlink_dpath + workload_symlink_path = link_result(dbgym_workspace, real_dpath) + assert workload_symlink_path == expected_workload_symlink_path 
logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Generated: {expected_workload_symlink_dpath}" + f"Generated: {expected_workload_symlink_path}" ) diff --git a/benchmark/tests/integtest_benchmark.py b/benchmark/tests/integtest_benchmark.py index a7cc3402..4c11b4fa 100644 --- a/benchmark/tests/integtest_benchmark.py +++ b/benchmark/tests/integtest_benchmark.py @@ -9,6 +9,8 @@ ) # It's ok to import private functions from the benchmark module because this is an integration test. +from benchmark.constants import DEFAULT_SCALE_FACTOR +from benchmark.job.cli import _job_tables from benchmark.tpch.cli import _tpch_tables, _tpch_workload from benchmark.tpch.constants import DEFAULT_TPCH_SEED from util.workspace import ( @@ -42,6 +44,16 @@ def test_tpch_tables(self) -> None: self.assertTrue(tables_path.exists()) self.assertTrue(fully_resolve_path(tables_path).exists()) + def test_job_tables(self) -> None: + scale_factor = DEFAULT_SCALE_FACTOR + tables_path = get_tables_symlink_path( + self.workspace.dbgym_workspace_path, "job", scale_factor + ) + self.assertFalse(tables_path.exists()) + _job_tables(self.workspace, scale_factor) + self.assertTrue(tables_path.exists()) + self.assertTrue(fully_resolve_path(tables_path).exists()) + def test_tpch_workload(self) -> None: scale_factor = 0.01 workload_path = get_workload_symlink_path( From 86f62eb2160accbdbd02cb437f1821b90deb9b7e Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 29 Dec 2024 16:07:42 -0500 Subject: [PATCH 06/24] wrote and passed test_job_workload --- benchmark/job/cli.py | 43 +++++++++++++++++--------- benchmark/tests/integtest_benchmark.py | 22 ++++++++++--- benchmark/tpch/cli.py | 16 ++++++---- 3 files changed, 57 insertions(+), 24 deletions(-) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index 07685084..b7853c80 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -11,7 +11,12 @@ from benchmark.constants import DEFAULT_SCALE_FACTOR from util.log import DBGYM_LOGGER_NAME from util.shell import subprocess_run -from util.workspace import DBGymWorkspace, is_fully_resolved, link_result +from util.workspace import ( + DBGymWorkspace, + fully_resolve_path, + is_fully_resolved, + link_result, +) JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz" JOB_QUERIES_URL = "https://event.cwi.nl/da/job/job.tgz" @@ -164,6 +169,12 @@ def _job_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: @click.pass_obj def job_workload( dbgym_workspace: DBGymWorkspace, query_subset: str, scale_factor: float +) -> None: + _job_workload(dbgym_workspace, query_subset, scale_factor) + + +def _job_workload( + dbgym_workspace: DBGymWorkspace, query_subset: str, scale_factor: float ) -> None: assert scale_factor == DEFAULT_SCALE_FACTOR _download_job_queries(dbgym_workspace) @@ -248,14 +259,20 @@ def _generate_job_workload( DEFAULT_SCALE_FACTOR, get_workload_suffix("job", query_subset=query_subset), ) - expected_workload_symlink_path = dbgym_workspace.cur_symlinks_data_path( - mkdir=True - ) / (workload_name + ".link") + expected_workload_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / ( + workload_name + ".link" + ) + if expected_workload_symlink_path.exists(): + logging.getLogger(DBGYM_LOGGER_NAME).info( + f"Skipping generation: {expected_workload_symlink_path}" + ) + return logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generating: {expected_workload_symlink_path}" ) - real_dpath = dbgym_workspace.cur_task_runs_data_path(workload_name, mkdir=True) + workload_path = dbgym_workspace.dbgym_this_run_path / workload_name + 
workload_path.mkdir(parents=False, exist_ok=False) query_names = None if query_subset == "all": @@ -267,18 +284,16 @@ def _generate_job_workload( else: assert False - with open(real_dpath / "order.txt", "w") as f: + with open(workload_path / "order.txt", "w") as f: + queries_parent_path = dbgym_workspace.dbgym_cur_symlinks_path / ( + JOB_QUERIES_DNAME + ".link" + ) + for qname in query_names: - sql_fpath = ( - dbgym_workspace.cur_symlinks_data_path(mkdir=True) - / (f"{JOB_QUERIES_DNAME}.link") - ).resolve() / f"{qname}.sql" - assert is_fully_resolved( - sql_fpath - ), "We should only write existent real absolute paths to a file" + sql_fpath = fully_resolve_path(queries_parent_path / f"{qname}.sql") f.write(f"Q{qname},{sql_fpath}\n") - workload_symlink_path = link_result(dbgym_workspace, real_dpath) + workload_symlink_path = dbgym_workspace.link_result(workload_path) assert workload_symlink_path == expected_workload_symlink_path logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generated: {expected_workload_symlink_path}" diff --git a/benchmark/tests/integtest_benchmark.py b/benchmark/tests/integtest_benchmark.py index 4c11b4fa..1e8f7910 100644 --- a/benchmark/tests/integtest_benchmark.py +++ b/benchmark/tests/integtest_benchmark.py @@ -10,7 +10,7 @@ # It's ok to import private functions from the benchmark module because this is an integration test. from benchmark.constants import DEFAULT_SCALE_FACTOR -from benchmark.job.cli import _job_tables +from benchmark.job.cli import _job_tables, _job_workload from benchmark.tpch.cli import _tpch_tables, _tpch_workload from benchmark.tpch.constants import DEFAULT_TPCH_SEED from util.workspace import ( @@ -45,12 +45,11 @@ def test_tpch_tables(self) -> None: self.assertTrue(fully_resolve_path(tables_path).exists()) def test_job_tables(self) -> None: - scale_factor = DEFAULT_SCALE_FACTOR tables_path = get_tables_symlink_path( - self.workspace.dbgym_workspace_path, "job", scale_factor + self.workspace.dbgym_workspace_path, "job", DEFAULT_SCALE_FACTOR ) self.assertFalse(tables_path.exists()) - _job_tables(self.workspace, scale_factor) + _job_tables(self.workspace, DEFAULT_SCALE_FACTOR) self.assertTrue(tables_path.exists()) self.assertTrue(fully_resolve_path(tables_path).exists()) @@ -74,6 +73,21 @@ def test_tpch_workload(self) -> None: self.assertTrue(workload_path.exists()) self.assertTrue(fully_resolve_path(workload_path).exists()) + def test_job_workload(self) -> None: + workload_path = get_workload_symlink_path( + self.workspace.dbgym_workspace_path, + "job", + DEFAULT_SCALE_FACTOR, + get_workload_suffix( + "job", + query_subset="all", + ), + ) + self.assertFalse(workload_path.exists()) + _job_workload(self.workspace, "all", DEFAULT_SCALE_FACTOR) + self.assertTrue(workload_path.exists()) + self.assertTrue(fully_resolve_path(workload_path).exists()) + if __name__ == "__main__": unittest.main() diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index 89cb5fd7..27452da1 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -201,12 +201,17 @@ def _generate_tpch_workload( "tpch", seed_start=seed_start, seed_end=seed_end, query_subset=query_subset ), ) - expected_workload_symlink_dpath = dbgym_workspace.dbgym_cur_symlinks_path / ( + expected_workload_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / ( workload_name + ".link" ) + if expected_workload_symlink_path.exists(): + logging.getLogger(DBGYM_LOGGER_NAME).info( + f"Skipping generation: {expected_workload_symlink_path}" + ) + return logging.getLogger(DBGYM_LOGGER_NAME).info( - 
f"Generating: {expected_workload_symlink_dpath}" + f"Generating: {expected_workload_symlink_path}" ) workload_path = dbgym_workspace.dbgym_this_run_path / workload_name workload_path.mkdir(parents=False, exist_ok=False) @@ -233,10 +238,9 @@ def _generate_tpch_workload( sql_fpath ), "We should only write existent real absolute paths to a file" f.write(f"S{seed}-Q{qname},{sql_fpath}\n") - # TODO(WAN): add option to deep-copy the workload. - workload_symlink_dpath = dbgym_workspace.link_result(workload_path) - assert workload_symlink_dpath == expected_workload_symlink_dpath + workload_symlink_path = dbgym_workspace.link_result(workload_path) + assert workload_symlink_path == expected_workload_symlink_path logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Generated: {expected_workload_symlink_dpath}" + f"Generated: {expected_workload_symlink_path}" ) From 883b1839be97edd63ab8c69a199a54a79f121404 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 29 Dec 2024 16:13:41 -0500 Subject: [PATCH 07/24] fixed bug in workspace where it would always sleep for 1 second --- util/workspace.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/util/workspace.py b/util/workspace.py index 2f6739df..176bfc46 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -181,6 +181,8 @@ def __init__(self, dbgym_workspace_path: Path): ) # `exist_ok` is False because we don't want to override a previous task run's data. self.dbgym_this_run_path.mkdir(parents=True, exist_ok=False) + # Break if it succeeds so we don't do it a second time. + break except FileExistsError: # In case we call task.py twice in one second, sleeping here will fix it. # Waiting one second is enough since we assume there's only one task.py running at a time. From 664728d63f5c901cbe9b1e23eecf1f5a50150b62 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 29 Dec 2024 16:29:45 -0500 Subject: [PATCH 08/24] can now run integtest_benchmark.py fully --- benchmark/tests/integtest_benchmark.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmark/tests/integtest_benchmark.py b/benchmark/tests/integtest_benchmark.py index 1e8f7910..c09b2b6f 100644 --- a/benchmark/tests/integtest_benchmark.py +++ b/benchmark/tests/integtest_benchmark.py @@ -28,6 +28,10 @@ def setUp(self) -> None: # Get a clean start each time. if workspace_path.exists(): shutil.rmtree(workspace_path) + + # Reset this to avoid the error of it being created twice. + # In real usage, the second run would be a different Python process so DBGymWorkspace.num_times_created_this_run would be 0. + DBGymWorkspace.num_times_created_this_run = 0 self.workspace = DBGymWorkspace(workspace_path) def tearDown(self) -> None: From e4235fe358527f04514226797b4c76d60264c2b1 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 29 Dec 2024 16:54:41 -0500 Subject: [PATCH 09/24] gymlib integtests no longer crash when run together with other integtests --- env/tests/gymlib_integtest_util.py | 2 ++ util/workspace.py | 1 + 2 files changed, 3 insertions(+) diff --git a/env/tests/gymlib_integtest_util.py b/env/tests/gymlib_integtest_util.py index 115cb8d7..9c859c94 100644 --- a/env/tests/gymlib_integtest_util.py +++ b/env/tests/gymlib_integtest_util.py @@ -67,6 +67,8 @@ def set_up_workspace() -> None: # # However, it also can't be created more than once so we need to check `is None`. if GymlibIntegtestManager.DBGYM_WORKSPACE is None: + # Reset this in case it had been created by a test *not* using GymlibIntegtestManager.set_up_workspace(). 
+ DBGymWorkspace.num_times_created_this_run = 0 GymlibIntegtestManager.DBGYM_WORKSPACE = DBGymWorkspace(workspace_path) @staticmethod diff --git a/util/workspace.py b/util/workspace.py index 176bfc46..7e2b4adf 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -131,6 +131,7 @@ class DBGymWorkspace: def __init__(self, dbgym_workspace_path: Path): # The logic around dbgym_tmp_path assumes that DBGymWorkspace is only constructed once. + # This is because DBGymWorkspace creates a new run_*/ dir when it's initialized. DBGymWorkspace.num_times_created_this_run += 1 assert ( DBGymWorkspace.num_times_created_this_run == 1 From c430a904e3a693fac1156cd20114ee5214a36a13 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 29 Dec 2024 19:31:22 -0500 Subject: [PATCH 10/24] fixed some paths in gymlib integtests --- env/tests/_set_up_gymlib_integtest_workspace.sh | 2 +- env/tests/gymlib_integtest_util.py | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/env/tests/_set_up_gymlib_integtest_workspace.sh b/env/tests/_set_up_gymlib_integtest_workspace.sh index bd2130a7..b85e0fdf 100755 --- a/env/tests/_set_up_gymlib_integtest_workspace.sh +++ b/env/tests/_set_up_gymlib_integtest_workspace.sh @@ -16,7 +16,7 @@ set -euo pipefail # INTENDED_DBDATA_HARDWARE can be set elsewhere (e.g. by tests_ci.yaml) but we use hdd by default. INTENDED_DBDATA_HARDWARE="${INTENDED_DBDATA_HARDWARE:-hdd}" -python3 task.py benchmark $BENCHMARK data $SCALE_FACTOR +python3 task.py benchmark $BENCHMARK tables $SCALE_FACTOR python3 task.py benchmark $BENCHMARK workload --scale-factor $SCALE_FACTOR python3 task.py dbms postgres build diff --git a/env/tests/gymlib_integtest_util.py b/env/tests/gymlib_integtest_util.py index 9c859c94..1875a138 100644 --- a/env/tests/gymlib_integtest_util.py +++ b/env/tests/gymlib_integtest_util.py @@ -4,7 +4,7 @@ from typing import Optional # TODO: remove symlinks_paths from the import -from gymlib.symlinks_paths import get_workload_dirname, get_workload_suffix +from gymlib.symlinks_paths import get_workload_suffix, get_workload_symlink_path from benchmark.tpch.constants import DEFAULT_TPCH_SEED from env.tuning_artifacts import TuningMetadata @@ -14,7 +14,6 @@ get_default_dbdata_parent_dpath, get_default_pgbin_path, get_default_pristine_dbdata_snapshot_path, - get_default_workload_path, get_workspace_path_from_config, ) @@ -88,14 +87,11 @@ def get_default_metadata() -> TuningMetadata: ) return TuningMetadata( workload_path=fully_resolve_path( - get_default_workload_path( + get_workload_symlink_path( dbgym_workspace.dbgym_workspace_path, GymlibIntegtestManager.BENCHMARK, - get_workload_dirname( - GymlibIntegtestManager.BENCHMARK, - GymlibIntegtestManager.SCALE_FACTOR, - suffix, - ), + GymlibIntegtestManager.SCALE_FACTOR, + suffix ), ), pristine_dbdata_snapshot_path=fully_resolve_path( From e0cde22b79f3c3f6ee15f94288703635eb4ec7bc Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 29 Dec 2024 19:50:02 -0500 Subject: [PATCH 11/24] added and passed test_postgres_build --- benchmark/tests/integtest_benchmark.py | 6 +- .../{build_repo.sh => _build_repo.sh} | 0 dbms/postgres/cli.py | 78 +++++++++---------- dbms/tests/__init__.py | 0 dbms/tests/dbms_integtest_dbgym_config.yaml | 1 + dbms/tests/integtest_dbms.py | 42 ++++++++++ env/tests/gymlib_integtest_util.py | 11 ++- gymlib_package/gymlib/symlinks_paths.py | 8 ++ util/workspace.py | 16 ---- 9 files changed, 100 insertions(+), 62 deletions(-) rename dbms/postgres/{build_repo.sh => _build_repo.sh} (100%) create mode 
100644 dbms/tests/__init__.py create mode 100644 dbms/tests/dbms_integtest_dbgym_config.yaml create mode 100644 dbms/tests/integtest_dbms.py diff --git a/benchmark/tests/integtest_benchmark.py b/benchmark/tests/integtest_benchmark.py index c09b2b6f..f8b9d9f8 100644 --- a/benchmark/tests/integtest_benchmark.py +++ b/benchmark/tests/integtest_benchmark.py @@ -20,11 +20,13 @@ ) -class TestBenchmark(unittest.TestCase): +class BenchmarkTests(unittest.TestCase): DBGYM_CONFIG_PATH = Path("benchmark/tests/benchmark_integtest_dbgym_config.yaml") def setUp(self) -> None: - workspace_path = get_workspace_path_from_config(TestBenchmark.DBGYM_CONFIG_PATH) + workspace_path = get_workspace_path_from_config( + BenchmarkTests.DBGYM_CONFIG_PATH + ) # Get a clean start each time. if workspace_path.exists(): shutil.rmtree(workspace_path) diff --git a/dbms/postgres/build_repo.sh b/dbms/postgres/_build_repo.sh similarity index 100% rename from dbms/postgres/build_repo.sh rename to dbms/postgres/_build_repo.sh diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index 326fc6d2..44cfbfac 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -3,7 +3,6 @@ """ import logging -import os import shutil import subprocess from pathlib import Path @@ -11,6 +10,7 @@ import click import sqlalchemy +from gymlib.symlinks_paths import get_pgbin_symlink_path, get_repo_symlink_path from benchmark.constants import DEFAULT_SCALE_FACTOR from benchmark.job.load_info import JobLoadInfo @@ -35,7 +35,6 @@ fully_resolve_path, get_dbdata_tgz_filename, get_default_dbdata_parent_dpath, - get_default_pgbin_path, is_fully_resolved, is_ssd, link_result, @@ -61,7 +60,38 @@ def postgres_group(dbgym_workspace: DBGymWorkspace) -> None: help="Include this flag to rebuild Postgres even if it already exists.", ) def postgres_build(dbgym_workspace: DBGymWorkspace, rebuild: bool) -> None: - _build_repo(dbgym_workspace, rebuild) + _postgres_build(dbgym_workspace, rebuild) + + +def _postgres_build(dbgym_workspace: DBGymWorkspace, rebuild: bool) -> None: + """ + This function exists as a hook for integration tests. + """ + expected_repo_symlink_dpath = get_repo_symlink_path( + dbgym_workspace.dbgym_workspace_path + ) + if not rebuild and expected_repo_symlink_dpath.exists(): + logging.getLogger(DBGYM_LOGGER_NAME).info( + f"Skipping _postgres_build: {expected_repo_symlink_dpath}" + ) + return + + logging.getLogger(DBGYM_LOGGER_NAME).info( + f"Setting up repo in {expected_repo_symlink_dpath}" + ) + repo_real_dpath = dbgym_workspace.dbgym_this_run_path / "repo" + repo_real_dpath.mkdir(parents=False, exist_ok=False) + subprocess_run( + f"./_build_repo.sh {repo_real_dpath}", + cwd=dbgym_workspace.base_dbgym_repo_dpath / "dbms" / "postgres", + ) + + # only link at the end so that the link only ever points to a complete repo + repo_symlink_dpath = dbgym_workspace.link_result(repo_real_dpath) + assert expected_repo_symlink_dpath.samefile(repo_symlink_dpath) + logging.getLogger(DBGYM_LOGGER_NAME).info( + f"Set up repo in {expected_repo_symlink_dpath}" + ) @postgres_group.command( @@ -75,7 +105,7 @@ def postgres_build(dbgym_workspace: DBGymWorkspace, rebuild: bool) -> None: "--pgbin-path", type=Path, default=None, - help=f"The path to the bin containing Postgres executables. The default is {get_default_pgbin_path(WORKSPACE_PATH_PLACEHOLDER)}.", + help=f"The path to the bin containing Postgres executables. 
The default is {get_pgbin_symlink_path(WORKSPACE_PATH_PLACEHOLDER)}.", ) @click.option( "--intended-dbdata-hardware", @@ -99,7 +129,7 @@ def postgres_dbdata( ) -> None: # Set args to defaults programmatically (do this before doing anything else in the function) if pgbin_path is None: - pgbin_path = get_default_pgbin_path(dbgym_workspace.dbgym_workspace_path) + pgbin_path = get_pgbin_symlink_path(dbgym_workspace.dbgym_workspace_path) if dbdata_parent_dpath is None: dbdata_parent_dpath = get_default_dbdata_parent_dpath( dbgym_workspace.dbgym_workspace_path @@ -127,40 +157,6 @@ def postgres_dbdata( ) -def _get_pgbin_symlink_path(dbgym_workspace: DBGymWorkspace) -> Path: - return dbgym_workspace.cur_symlinks_build_path( - "repo.link", "boot", "build", "postgres", "bin" - ) - - -def _get_repo_symlink_path(dbgym_workspace: DBGymWorkspace) -> Path: - return dbgym_workspace.cur_symlinks_build_path("repo.link") - - -def _build_repo(dbgym_workspace: DBGymWorkspace, rebuild: bool) -> None: - expected_repo_symlink_dpath = _get_repo_symlink_path(dbgym_workspace) - if not rebuild and expected_repo_symlink_dpath.exists(): - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Skipping _build_repo: {expected_repo_symlink_dpath}" - ) - return - - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Setting up repo in {expected_repo_symlink_dpath}" - ) - repo_real_dpath = dbgym_workspace.cur_task_runs_build_path("repo", mkdir=True) - subprocess_run( - f"./build_repo.sh {repo_real_dpath}", cwd=dbgym_workspace.cur_source_path() - ) - - # only link at the end so that the link only ever points to a complete repo - repo_symlink_dpath = link_result(dbgym_workspace, repo_real_dpath) - assert expected_repo_symlink_dpath.samefile(repo_symlink_dpath) - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Set up repo in {expected_repo_symlink_dpath}" - ) - - def _create_dbdata( dbgym_workspace: DBGymWorkspace, benchmark_name: str, @@ -216,7 +212,9 @@ def _create_dbdata( def _generic_dbdata_setup(dbgym_workspace: DBGymWorkspace) -> None: # get necessary vars - pgbin_real_dpath = _get_pgbin_symlink_path(dbgym_workspace).resolve() + pgbin_real_dpath = get_pgbin_symlink_path( + dbgym_workspace.dbgym_workspace_path + ).resolve() assert pgbin_real_dpath.exists() dbgym_pguser = DBGYM_POSTGRES_USER dbgym_pgpass = DBGYM_POSTGRES_PASS diff --git a/dbms/tests/__init__.py b/dbms/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dbms/tests/dbms_integtest_dbgym_config.yaml b/dbms/tests/dbms_integtest_dbgym_config.yaml new file mode 100644 index 00000000..e8e37294 --- /dev/null +++ b/dbms/tests/dbms_integtest_dbgym_config.yaml @@ -0,0 +1 @@ +dbgym_workspace_path: ../dbgym_dbms_integtest_workspace/ diff --git a/dbms/tests/integtest_dbms.py b/dbms/tests/integtest_dbms.py new file mode 100644 index 00000000..f5961397 --- /dev/null +++ b/dbms/tests/integtest_dbms.py @@ -0,0 +1,42 @@ +import shutil +import unittest +from pathlib import Path + +from gymlib.symlinks_paths import get_repo_symlink_path + +from dbms.postgres.cli import _postgres_build +from util.workspace import ( + DBGymWorkspace, + fully_resolve_path, + get_workspace_path_from_config, +) + + +class DBMSTests(unittest.TestCase): + DBGYM_CONFIG_PATH = Path("dbms/tests/dbms_integtest_dbgym_config.yaml") + + def setUp(self) -> None: + workspace_path = get_workspace_path_from_config(DBMSTests.DBGYM_CONFIG_PATH) + # Get a clean start each time. + if workspace_path.exists(): + shutil.rmtree(workspace_path) + + # Reset this to avoid the error of it being created twice. 
+ # In real usage, the second run would be a different Python process so DBGymWorkspace.num_times_created_this_run would be 0. + DBGymWorkspace.num_times_created_this_run = 0 + self.workspace = DBGymWorkspace(workspace_path) + + def tearDown(self) -> None: + if self.workspace.dbgym_workspace_path.exists(): + shutil.rmtree(self.workspace.dbgym_workspace_path) + + def test_postgres_build(self) -> None: + repo_path = get_repo_symlink_path(self.workspace.dbgym_workspace_path) + self.assertFalse(repo_path.exists()) + _postgres_build(self.workspace, False) + self.assertTrue(repo_path.exists()) + self.assertTrue(fully_resolve_path(repo_path).exists()) + + +if __name__ == "__main__": + unittest.main() diff --git a/env/tests/gymlib_integtest_util.py b/env/tests/gymlib_integtest_util.py index 1875a138..68a6f1b8 100644 --- a/env/tests/gymlib_integtest_util.py +++ b/env/tests/gymlib_integtest_util.py @@ -4,7 +4,11 @@ from typing import Optional # TODO: remove symlinks_paths from the import -from gymlib.symlinks_paths import get_workload_suffix, get_workload_symlink_path +from gymlib.symlinks_paths import ( + get_pgbin_symlink_path, + get_workload_suffix, + get_workload_symlink_path, +) from benchmark.tpch.constants import DEFAULT_TPCH_SEED from env.tuning_artifacts import TuningMetadata @@ -12,7 +16,6 @@ DBGymWorkspace, fully_resolve_path, get_default_dbdata_parent_dpath, - get_default_pgbin_path, get_default_pristine_dbdata_snapshot_path, get_workspace_path_from_config, ) @@ -91,7 +94,7 @@ def get_default_metadata() -> TuningMetadata: dbgym_workspace.dbgym_workspace_path, GymlibIntegtestManager.BENCHMARK, GymlibIntegtestManager.SCALE_FACTOR, - suffix + suffix, ), ), pristine_dbdata_snapshot_path=fully_resolve_path( @@ -105,6 +108,6 @@ def get_default_metadata() -> TuningMetadata: get_default_dbdata_parent_dpath(dbgym_workspace.dbgym_workspace_path), ), pgbin_path=fully_resolve_path( - get_default_pgbin_path(dbgym_workspace.dbgym_workspace_path), + get_pgbin_symlink_path(dbgym_workspace.dbgym_workspace_path), ), ) diff --git a/gymlib_package/gymlib/symlinks_paths.py b/gymlib_package/gymlib/symlinks_paths.py index a25ffdac..e1c4b880 100644 --- a/gymlib_package/gymlib/symlinks_paths.py +++ b/gymlib_package/gymlib/symlinks_paths.py @@ -59,3 +59,11 @@ def get_workload_symlink_path( / DBGYM_APP_NAME / (get_workload_dirname(benchmark, scale_factor, suffix) + ".link") ) + + +def get_repo_symlink_path(workspace_path: Path) -> Path: + return workspace_path / SYMLINKS_DNAME / DBGYM_APP_NAME / "repo.link" + + +def get_pgbin_symlink_path(workspace_path: Path) -> Path: + return get_repo_symlink_path(workspace_path) / "boot" / "build" / "postgres" / "bin" diff --git a/util/workspace.py b/util/workspace.py index 7e2b4adf..132b31ec 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -45,9 +45,6 @@ def get_latest_run_path_from_workspace_path(workspace_path: Path) -> Path: return get_runs_path_from_workspace_path(workspace_path) / "latest_run.link" -# Other parameters - - # Paths of config files in the codebase. These are always relative paths. 
# The reason these can be relative paths instead of functions taking in codebase_path as input is because relative paths are relative to the codebase root DEFAULT_BOOT_CONFIG_FPATH = POSTGRES_PATH / "default_boot_config.yaml" @@ -109,19 +106,6 @@ def get_default_dbdata_parent_dpath(workspace_path: Path) -> Path: return get_tmp_path_from_workspace_path(workspace_path) -def get_default_repo_path(workspace_path: Path) -> Path: - return ( - get_symlinks_path_from_workspace_path(workspace_path) - / "dbgym_dbms_postgres" - / "build" - / "repo.link" - ) - - -def get_default_pgbin_path(workspace_path: Path) -> Path: - return get_default_repo_path(workspace_path) / "boot" / "build" / "postgres" / "bin" - - class DBGymWorkspace: """ Global configurations that apply to all parts of DB-Gym From 9d111d303d6174d917e750adcbb0379fbdb41f50 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 29 Dec 2024 20:37:33 -0500 Subject: [PATCH 12/24] added and passed test_postgres_dbdata --- benchmark/tpch/cli.py | 14 ++-- dbms/postgres/cli.py | 100 +++++++++++++++--------- dbms/tests/integtest_dbms.py | 25 +++++- env/tests/gymlib_integtest_util.py | 4 +- gymlib_package/gymlib/symlinks_paths.py | 26 ++++++ util/workspace.py | 48 ++++++++---- 6 files changed, 156 insertions(+), 61 deletions(-) diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index 27452da1..f096c2fa 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -4,8 +4,9 @@ from gymlib.symlinks_paths import ( get_tables_dirname, get_tables_symlink_path, - get_workload_dirname, get_workload_suffix, + get_workload_symlink_path, + linkname_to_name, ) from benchmark.constants import DEFAULT_SCALE_FACTOR @@ -17,7 +18,6 @@ fully_resolve_path, get_scale_factor_string, is_fully_resolved, - link_result, ) TPCH_KIT_DIRNAME = "tpch-kit" @@ -194,16 +194,14 @@ def _generate_tpch_workload( query_subset: str, scale_factor: float, ) -> None: - workload_name = get_workload_dirname( + expected_workload_symlink_path = get_workload_symlink_path( + dbgym_workspace.dbgym_workspace_path, "tpch", scale_factor, get_workload_suffix( "tpch", seed_start=seed_start, seed_end=seed_end, query_subset=query_subset ), ) - expected_workload_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / ( - workload_name + ".link" - ) if expected_workload_symlink_path.exists(): logging.getLogger(DBGYM_LOGGER_NAME).info( f"Skipping generation: {expected_workload_symlink_path}" @@ -213,7 +211,9 @@ def _generate_tpch_workload( logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generating: {expected_workload_symlink_path}" ) - workload_path = dbgym_workspace.dbgym_this_run_path / workload_name + workload_path = dbgym_workspace.dbgym_this_run_path / linkname_to_name( + expected_workload_symlink_path.name + ) workload_path.mkdir(parents=False, exist_ok=False) query_names = None diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index 44cfbfac..262f82a9 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -10,7 +10,12 @@ import click import sqlalchemy -from gymlib.symlinks_paths import get_pgbin_symlink_path, get_repo_symlink_path +from gymlib.symlinks_paths import ( + get_dbdata_tgz_symlink_path, + get_pgbin_symlink_path, + get_repo_symlink_path, + linkname_to_name, +) from benchmark.constants import DEFAULT_SCALE_FACTOR from benchmark.job.load_info import JobLoadInfo @@ -33,13 +38,9 @@ WORKSPACE_PATH_PLACEHOLDER, DBGymWorkspace, fully_resolve_path, - get_dbdata_tgz_filename, get_default_dbdata_parent_dpath, is_fully_resolved, is_ssd, - link_result, - open_and_save, - 
save_file, ) @@ -127,6 +128,27 @@ def postgres_dbdata( intended_dbdata_hardware: str, dbdata_parent_dpath: Optional[Path], ) -> None: + _postgres_dbdata( + dbgym_workspace, + benchmark_name, + scale_factor, + pgbin_path, + intended_dbdata_hardware, + dbdata_parent_dpath, + ) + + +def _postgres_dbdata( + dbgym_workspace: DBGymWorkspace, + benchmark_name: str, + scale_factor: float, + pgbin_path: Optional[Path], + intended_dbdata_hardware: str, + dbdata_parent_dpath: Optional[Path], +) -> None: + """ + This function exists as a hook for integration tests. + """ # Set args to defaults programmatically (do this before doing anything else in the function) if pgbin_path is None: pgbin_path = get_pgbin_symlink_path(dbgym_workspace.dbgym_workspace_path) @@ -165,46 +187,54 @@ def _create_dbdata( dbdata_parent_dpath: Path, ) -> None: """ - I chose *not* for this function to skip by default if dbdata_tgz_symlink_path already exists. This - is because, while the generated data is deterministic given benchmark_name and scale_factor, any - change in the _create_dbdata() function would result in a different dbdata. Since _create_dbdata() - may change somewhat frequently, I decided to get rid of the footgun of having changes to - _create_dbdata() not propagate to [dbdata].tgz by default. + If you change the code of _create_dbdata(), you should also delete the symlink so that the next time you run + `dbms postgres dbdata` it will re-create the dbdata. """ + expected_dbdata_tgz_symlink_path = get_dbdata_tgz_symlink_path( + dbgym_workspace.dbgym_workspace_path, + benchmark_name, + scale_factor, + ) + if expected_dbdata_tgz_symlink_path.exists(): + logging.getLogger(DBGYM_LOGGER_NAME).info( + f"Skipping _create_dbdata: {expected_dbdata_tgz_symlink_path}" + ) + return # It's ok for the dbdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place. - dbdata_dpath = dbdata_parent_dpath / "dbdata_being_created" - # We might be reusing the same dbdata_parent_dpath, so delete dbdata_dpath if it already exists - if dbdata_dpath.exists(): - shutil.rmtree(dbdata_dpath) + dbdata_path = dbdata_parent_dpath / "dbdata_being_created" + # We might be reusing the same dbdata_parent_dpath, so delete dbdata_path if it already exists + if dbdata_path.exists(): + shutil.rmtree(dbdata_path) # Call initdb. # Save any script we call from pgbin_symlink_dpath because they are dependencies generated from another task run. - save_file(dbgym_workspace, pgbin_path / "initdb") - subprocess_run(f'./initdb -D "{dbdata_dpath}"', cwd=pgbin_path) + dbgym_workspace.save_file(pgbin_path / "initdb") + subprocess_run(f'./initdb -D "{dbdata_path}"', cwd=pgbin_path) # Start Postgres (all other dbdata setup requires postgres to be started). # Note that subprocess_run() never returns when running "pg_ctl start", so I'm using subprocess.run() instead. - start_postgres(dbgym_workspace, pgbin_path, dbdata_dpath) + start_postgres(dbgym_workspace, pgbin_path, dbdata_path) # Set up Postgres. _generic_dbdata_setup(dbgym_workspace) _load_benchmark_into_dbdata(dbgym_workspace, benchmark_name, scale_factor) # Stop Postgres so that we don't "leak" processes. - stop_postgres(dbgym_workspace, pgbin_path, dbdata_dpath) + stop_postgres(dbgym_workspace, pgbin_path, dbdata_path) # Create .tgz file. # Note that you can't pass "[dbdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[dbdata].tgz" as a dir. 
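The skip/build/link discipline that _postgres_build and _create_dbdata now share is worth stating once in generic form. A hedged sketch of the pattern (build_fn is a hypothetical stand-in for the actual build steps; linkname_to_name and link_result are the real helpers from these patches):

    from pathlib import Path
    from typing import Callable

    from gymlib.symlinks_paths import linkname_to_name
    from util.workspace import DBGymWorkspace

    def build_artifact_once(
        workspace: DBGymWorkspace,
        expected_symlink_path: Path,
        build_fn: Callable[[Path], None],
    ) -> None:
        # Idempotency: a previous run already produced the artifact. Delete the
        # symlink by hand to force a rebuild (per the docstring above).
        if expected_symlink_path.exists():
            return
        # Build inside this run's directory, named after the symlink minus ".link".
        real_path = workspace.dbgym_this_run_path / linkname_to_name(
            expected_symlink_path.name
        )
        build_fn(real_path)
        # Link only at the end so the symlink never points at a half-built artifact.
        symlink_path = workspace.link_result(real_path)
        assert expected_symlink_path.samefile(symlink_path)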
- dbdata_tgz_real_fpath = dbgym_workspace.cur_task_runs_data_path( - mkdir=True - ) / get_dbdata_tgz_filename(benchmark_name, scale_factor) - # We need to cd into dbdata_dpath so that the tar file does not contain folders for the whole path of dbdata_dpath. - subprocess_run(f"tar -czf {dbdata_tgz_real_fpath} .", cwd=dbdata_dpath) + dbdata_tgz_real_path = dbgym_workspace.dbgym_this_run_path / linkname_to_name( + expected_dbdata_tgz_symlink_path.name + ) + # We need to cd into dbdata_path so that the tar file does not contain folders for the whole path of dbdata_path. + subprocess_run(f"tar -czf {dbdata_tgz_real_path} .", cwd=dbdata_path) # Create symlink. # Only link at the end so that the link only ever points to a complete dbdata. - dbdata_tgz_symlink_path = link_result(dbgym_workspace, dbdata_tgz_real_fpath) + dbdata_tgz_symlink_path = dbgym_workspace.link_result(dbdata_tgz_real_path) + assert expected_dbdata_tgz_symlink_path.samefile(dbdata_tgz_symlink_path) logging.getLogger(DBGYM_LOGGER_NAME).info( f"Created dbdata in {dbdata_tgz_symlink_path}" ) @@ -221,7 +251,7 @@ def _generic_dbdata_setup(dbgym_workspace: DBGymWorkspace) -> None: pgport = DEFAULT_POSTGRES_PORT # Create user - save_file(dbgym_workspace, pgbin_real_dpath / "psql") + dbgym_workspace.save_file(pgbin_real_dpath / "psql") subprocess_run( f"./psql -c \"create user {dbgym_pguser} with superuser password '{dbgym_pgpass}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost", cwd=pgbin_real_dpath, @@ -278,7 +308,7 @@ def _load_into_dbdata( sqlalchemy_conn_execute(conn, f"TRUNCATE {table} CASCADE") # Then, load the tables. for table, table_fpath in load_info.get_tables_and_paths(): - with open_and_save(dbgym_workspace, table_fpath, "r") as table_csv: + with dbgym_workspace.open_and_save(table_fpath, "r") as table_csv: assert conn.connection.dbapi_connection is not None cur = conn.connection.dbapi_connection.cursor() try: @@ -301,41 +331,41 @@ def _load_into_dbdata( # even though they are a little redundant. It seems better than making `dbms` depend on the behavior of the # tuning environment. def start_postgres( - dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_dpath: Path + dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_path: Path ) -> None: - _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_dpath, True) + _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_path, True) def stop_postgres( - dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_dpath: Path + dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_path: Path ) -> None: - _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_dpath, False) + _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_path, False) def _start_or_stop_postgres( dbgym_workspace: DBGymWorkspace, pgbin_path: Path, - dbdata_dpath: Path, + dbdata_path: Path, is_start: bool, ) -> None: # They should be absolute paths and should exist assert is_fully_resolved(pgbin_path) - assert is_fully_resolved(dbdata_dpath) + assert is_fully_resolved(dbdata_path) pgport = DEFAULT_POSTGRES_PORT - save_file(dbgym_workspace, pgbin_path / "pg_ctl") + dbgym_workspace.save_file(pgbin_path / "pg_ctl") if is_start: # We use subprocess.run() because subprocess_run() never returns when running "pg_ctl start". # The reason subprocess_run() never returns is because pg_ctl spawns a postgres process so .poll() always returns None. # On the other hand, subprocess.run() does return normally, like calling `./pg_ctl` on the command line would do. 
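A side note on the tar invocation just rewritten: running with cwd=dbdata_path and archiving "." keeps every entry relative, so the snapshot can later be unpacked into any directory. An illustration (new_dbdata_path is hypothetical):

    # Create: entries are stored as ./base, ./pg_wal, ... rather than under the
    # absolute path of dbdata_path.
    subprocess_run(f"tar -czf {dbdata_tgz_real_path} .", cwd=dbdata_path)
    # Restore later into a fresh directory of the caller's choosing:
    subprocess_run(f"tar -xzf {dbdata_tgz_real_path} -C {new_dbdata_path}")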
result = subprocess.run( - f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' start", + f"./pg_ctl -D \"{dbdata_path}\" -o '-p {pgport}' start", cwd=pgbin_path, shell=True, ) result.check_returncode() else: subprocess_run( - f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' stop", + f"./pg_ctl -D \"{dbdata_path}\" -o '-p {pgport}' stop", cwd=pgbin_path, ) diff --git a/dbms/tests/integtest_dbms.py b/dbms/tests/integtest_dbms.py index f5961397..ed1a646d 100644 --- a/dbms/tests/integtest_dbms.py +++ b/dbms/tests/integtest_dbms.py @@ -2,9 +2,10 @@ import unittest from pathlib import Path -from gymlib.symlinks_paths import get_repo_symlink_path +from gymlib.symlinks_paths import get_dbdata_tgz_symlink_path, get_repo_symlink_path -from dbms.postgres.cli import _postgres_build +from benchmark.tpch.cli import _tpch_tables +from dbms.postgres.cli import _postgres_build, _postgres_dbdata from util.workspace import ( DBGymWorkspace, fully_resolve_path, @@ -37,6 +38,26 @@ def test_postgres_build(self) -> None: self.assertTrue(repo_path.exists()) self.assertTrue(fully_resolve_path(repo_path).exists()) + def test_postgres_dbdata(self) -> None: + # Setup + # Make sure to recreate self.workspace so that each function call counts as its own run. + scale_factor = 0.01 + _postgres_build(self.workspace, False) + DBGymWorkspace.num_times_created_this_run = 0 + self.workspace = DBGymWorkspace(self.workspace.dbgym_workspace_path) + _tpch_tables(self.workspace, scale_factor) + DBGymWorkspace.num_times_created_this_run = 0 + self.workspace = DBGymWorkspace(self.workspace.dbgym_workspace_path) + + # Test + dbdata_tgz_path = get_dbdata_tgz_symlink_path( + self.workspace.dbgym_workspace_path, "tpch", scale_factor + ) + self.assertFalse(dbdata_tgz_path.exists()) + _postgres_dbdata(self.workspace, "tpch", scale_factor, None, "hdd", None) + self.assertTrue(dbdata_tgz_path.exists()) + self.assertTrue(fully_resolve_path(dbdata_tgz_path).exists()) + if __name__ == "__main__": unittest.main() diff --git a/env/tests/gymlib_integtest_util.py b/env/tests/gymlib_integtest_util.py index 68a6f1b8..959b68a5 100644 --- a/env/tests/gymlib_integtest_util.py +++ b/env/tests/gymlib_integtest_util.py @@ -5,6 +5,7 @@ # TODO: remove symlinks_paths from the import from gymlib.symlinks_paths import ( + get_dbdata_tgz_symlink_path, get_pgbin_symlink_path, get_workload_suffix, get_workload_symlink_path, @@ -16,7 +17,6 @@ DBGymWorkspace, fully_resolve_path, get_default_dbdata_parent_dpath, - get_default_pristine_dbdata_snapshot_path, get_workspace_path_from_config, ) @@ -98,7 +98,7 @@ def get_default_metadata() -> TuningMetadata: ), ), pristine_dbdata_snapshot_path=fully_resolve_path( - get_default_pristine_dbdata_snapshot_path( + get_dbdata_tgz_symlink_path( dbgym_workspace.dbgym_workspace_path, GymlibIntegtestManager.BENCHMARK, GymlibIntegtestManager.SCALE_FACTOR, diff --git a/gymlib_package/gymlib/symlinks_paths.py b/gymlib_package/gymlib/symlinks_paths.py index e1c4b880..9c83ef3d 100644 --- a/gymlib_package/gymlib/symlinks_paths.py +++ b/gymlib_package/gymlib/symlinks_paths.py @@ -39,6 +39,10 @@ def get_workload_dirname(benchmark: str, scale_factor: float | str, suffix: str) return f"workload_{benchmark}_sf{get_scale_factor_string(scale_factor)}_{suffix}" +def get_dbdata_tgz_filename(benchmark_name: str, scale_factor: float | str) -> str: + return f"{benchmark_name}_sf{get_scale_factor_string(scale_factor)}_pristine_dbdata.tgz" + + def get_tables_symlink_path( workspace_path: Path, benchmark: str, scale_factor: float | str ) -> Path: @@ 
-67,3 +71,25 @@ def get_repo_symlink_path(workspace_path: Path) -> Path:
 
 def get_pgbin_symlink_path(workspace_path: Path) -> Path:
     return get_repo_symlink_path(workspace_path) / "boot" / "build" / "postgres" / "bin"
+
+
+def get_dbdata_tgz_symlink_path(
+    workspace_path: Path, benchmark_name: str, scale_factor: float | str
+) -> Path:
+    return (
+        workspace_path
+        / SYMLINKS_DNAME
+        / DBGYM_APP_NAME
+        / (get_dbdata_tgz_filename(benchmark_name, scale_factor) + ".link")
+    )
+
+
+# TODO: refactor stuff to use this
+def name_to_linkname(name: str) -> str:
+    assert not name.endswith(".link")
+    return f"{name}.link"
+
+
+def linkname_to_name(linkname: str) -> str:
+    assert linkname.endswith(".link")
+    return linkname[: -len(".link")]
diff --git a/util/workspace.py b/util/workspace.py
index 132b31ec..29075847 100644
--- a/util/workspace.py
+++ b/util/workspace.py
@@ -87,21 +87,6 @@ def get_scale_factor_string(scale_factor: float | str) -> str:
     return str(scale_factor).replace(".", "point")
 
 
-def get_dbdata_tgz_filename(benchmark_name: str, scale_factor: float | str) -> str:
-    return f"{benchmark_name}_sf{get_scale_factor_string(scale_factor)}_pristine_dbdata.tgz"
-
-
-def get_default_pristine_dbdata_snapshot_path(
-    workspace_path: Path, benchmark_name: str, scale_factor: float | str
-) -> Path:
-    return (
-        get_symlinks_path_from_workspace_path(workspace_path)
-        / "dbgym_dbms_postgres"
-        / "data"
-        / (get_dbdata_tgz_filename(benchmark_name, scale_factor) + ".link")
-    )
-
-
 def get_default_dbdata_parent_dpath(workspace_path: Path) -> Path:
     return get_tmp_path_from_workspace_path(workspace_path)
 
@@ -302,6 +287,38 @@ def save_file(self, fpath: Path) -> None:
             copy_fpath = self.dbgym_this_run_path / fname
             shutil.copy(fpath, copy_fpath)
 
+    def open_and_save(self, open_fpath: Path, mode: str = "r") -> IO[Any]:
+        """
+        Open a file and "save" it to [workspace]/task_runs/run_*/.
+        It takes in a str | Path to match the interface of open().
+        This function does not work if open_fpath is a symlink, to make its interface identical to that of open().
+        Make sure to resolve all symlinks with fully_resolve_path().
+        To avoid confusion, I'm enforcing this function to only work with absolute paths.
+        # TODO: maybe make it work on non-fully-resolved paths to better match open()
+        See the comment of save_file() for what "saving" means.
+        If you are generating a "result" for the run, _do not_ use this. Just use the normal open().
+        This shouldn't be too hard to remember because this function crashes if open_fpath doesn't exist,
+        and when you write results you're usually opening open_fpaths which do not exist.
+        """
+        # Validate open_fpath
+        assert isinstance(open_fpath, Path)
+        assert is_fully_resolved(
+            open_fpath
+        ), f"open_and_save(): open_fpath ({open_fpath}) should be a fully resolved path"
+        assert not os.path.islink(
+            open_fpath
+        ), f"open_fpath ({open_fpath}) should not be a symlink"
+        assert os.path.exists(open_fpath), f"open_fpath ({open_fpath}) does not exist"
+        # `open_and_save` *must* be called on files because it doesn't make sense to open a directory. Note that this doesn't mean we'll always save
+        # a file though. We sometimes save a directory (see save_file() for details).
+        assert os.path.isfile(open_fpath), f"open_fpath ({open_fpath}) is not a file"
+
+        # Save
+        self.save_file(open_fpath)
+
+        # Open
+        return open(open_fpath, mode=mode)
+
+    # `append_group()` is used to mark the "codebase path" of an invocation of the CLI. The "codebase path" is explained further in the documentation.
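Two usage notes on the helpers added above. name_to_linkname and linkname_to_name are exact inverses for any name that does not already end in ".link", and open_and_save pairs naturally with fully_resolve_path since it rejects symlinks and relative paths:

    assert name_to_linkname("repo") == "repo.link"
    assert linkname_to_name("repo.link") == "repo"

    # cfg_path is a hypothetical symlink into the workspace; resolve it first,
    # then read it while archiving a copy under this run's directory.
    with workspace.open_and_save(fully_resolve_path(cfg_path)) as f:
        contents = f.read()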
def append_group(self, name: str) -> None: self.cur_path_list.append(name) @@ -507,6 +524,7 @@ def is_child_path(child_path: os.PathLike[str], parent_dpath: os.PathLike[str]) ) +# TODO(phw2): deprecate this once I'm done with unittest_workspace.py def open_and_save( dbgym_workspace: DBGymWorkspace, open_fpath: Path, mode: str = "r" ) -> IO[Any]: From 85f40a0c913128355e1287b0aedaa4d09e004000 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 29 Dec 2024 21:00:54 -0500 Subject: [PATCH 13/24] now passing all integtests --- env/pg_conn.py | 2 +- env/tests/_set_up_gymlib_integtest_workspace.sh | 2 +- env/tests/integtest_workload.py | 1 - env/workload.py | 6 +++--- util/pg.py | 2 +- 5 files changed, 6 insertions(+), 7 deletions(-) diff --git a/env/pg_conn.py b/env/pg_conn.py index 17b36499..db0278b4 100644 --- a/env/pg_conn.py +++ b/env/pg_conn.py @@ -346,7 +346,7 @@ def restart_with_changes( # Set up Boot if we're told to do so if self.boot_config_fpath is not None: - with open_and_save(self.dbgym_workspace, self.boot_config_fpath) as f: + with self.dbgym_workspace.open_and_save(self.boot_config_fpath) as f: boot_config = yaml.safe_load(f) self._set_up_boot( diff --git a/env/tests/_set_up_gymlib_integtest_workspace.sh b/env/tests/_set_up_gymlib_integtest_workspace.sh index b85e0fdf..b1538a70 100755 --- a/env/tests/_set_up_gymlib_integtest_workspace.sh +++ b/env/tests/_set_up_gymlib_integtest_workspace.sh @@ -11,7 +11,7 @@ # the Postgres repo is very large and (b) the built binary will be different for different machines. # This script should be run from the base dbgym/ directory. -set -euo pipefail +set -euxo pipefail # INTENDED_DBDATA_HARDWARE can be set elsewhere (e.g. by tests_ci.yaml) but we use hdd by default. INTENDED_DBDATA_HARDWARE="${INTENDED_DBDATA_HARDWARE:-hdd}" diff --git a/env/tests/integtest_workload.py b/env/tests/integtest_workload.py index a30f975b..cc9d6393 100644 --- a/env/tests/integtest_workload.py +++ b/env/tests/integtest_workload.py @@ -3,7 +3,6 @@ from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES from env.tests.gymlib_integtest_util import GymlibIntegtestManager from env.workload import Workload -from util.workspace import fully_resolve_path, get_default_workload_path class WorkloadTests(unittest.TestCase): diff --git a/env/workload.py b/env/workload.py index 89dde426..22a1c0fc 100644 --- a/env/workload.py +++ b/env/workload.py @@ -1,6 +1,6 @@ from pathlib import Path -from util.workspace import DBGymWorkspace, is_fully_resolved, open_and_save +from util.workspace import DBGymWorkspace, is_fully_resolved class Workload: @@ -15,13 +15,13 @@ def __init__(self, dbgym_workspace: DBGymWorkspace, workload_dpath: Path) -> Non assert order_fpath.exists() - with open_and_save(self.dbgym_workspace, order_fpath) as f: + with self.dbgym_workspace.open_and_save(order_fpath) as f: for line in f: qid, qpath = line.strip().split(",") qpath = Path(qpath) assert is_fully_resolved(qpath) - with open_and_save(self.dbgym_workspace, qpath) as qf: + with self.dbgym_workspace.open_and_save(qpath) as qf: self.queries[qid] = qf.read() self.query_order.append(qid) diff --git a/util/pg.py b/util/pg.py index 32f2fdee..a959e11b 100644 --- a/util/pg.py +++ b/util/pg.py @@ -28,7 +28,7 @@ def sqlalchemy_conn_execute( def sql_file_queries(dbgym_workspace: DBGymWorkspace, filepath: Path) -> list[str]: - with open_and_save(dbgym_workspace, filepath) as f: + with dbgym_workspace.open_and_save(filepath) as f: lines: list[str] = [] for line in f: if line.startswith("--"): From 
395a6f83ef756d98b5b825369eb59c6cc528fc82 Mon Sep 17 00:00:00 2001
From: Patrick Wang
Date: Sun, 29 Dec 2024 21:05:18 -0500
Subject: [PATCH 14/24] fpath/dpath -> path

---
 benchmark/job/cli.py                    |   4 +-
 benchmark/job/load_info.py              |  18 +-
 benchmark/tpch/cli.py                   |  54 ++--
 benchmark/tpch/load_info.py             |  24 +-
 dbms/load_info_base_class.py            |   4 +-
 dbms/postgres/_build_repo.sh            |  22 +-
 dbms/postgres/cli.py                    |  86 +++---
 env/pg_conn.py                          |  76 ++---
 env/replay.py                           |   4 +-
 env/tests/gymlib_integtest_util.py      |   4 +-
 env/tests/integtest_replay.py           |   2 +-
 env/tests/integtest_tuning_artifacts.py |   6 +-
 env/tests/integtest_workload.py         |   6 +-
 env/tuning_artifacts.py                 |  28 +-
 env/workload.py                         |  12 +-
 experiments/load_per_machine_envvars.sh |   6 +-
 manage/cli.py                           |  86 +++---
 scripts/pat_test.sh                     |   2 +-
 task.py                                 |   6 +-
 util/log.py                             |  16 +-
 util/tests/unittest_workspace.py        |   4 +-
 util/workspace.py                       | 370 ++++++++++++------------
 22 files changed, 416 insertions(+), 424 deletions(-)

diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py
index b7853c80..c467e823 100644
--- a/benchmark/job/cli.py
+++ b/benchmark/job/cli.py
@@ -290,8 +290,8 @@ def _generate_job_workload(
     )
 
     for qname in query_names:
-        sql_fpath = fully_resolve_path(queries_parent_path / f"{qname}.sql")
-        f.write(f"Q{qname},{sql_fpath}\n")
+        sql_path = fully_resolve_path(queries_parent_path / f"{qname}.sql")
+        f.write(f"Q{qname},{sql_path}\n")
 
     workload_symlink_path = dbgym_workspace.link_result(workload_path)
     assert workload_symlink_path == expected_workload_symlink_path
diff --git a/benchmark/job/load_info.py b/benchmark/job/load_info.py
index 3a64ab57..8ea28f8a 100644
--- a/benchmark/job/load_info.py
+++ b/benchmark/job/load_info.py
@@ -37,11 +37,11 @@ class JobLoadInfo(LoadInfoBaseClass):
 
     def __init__(self, dbgym_workspace: DBGymWorkspace):
         # Schema (directly in the codebase).
-        job_codebase_path = dbgym_workspace.base_dbgym_repo_dpath / "benchmark" / "job"
-        self._schema_fpath = job_codebase_path / JOB_SCHEMA_FNAME
+        job_codebase_path = dbgym_workspace.base_dbgym_repo_path / "benchmark" / "job"
+        self._schema_path = job_codebase_path / JOB_SCHEMA_FNAME
         assert (
-            self._schema_fpath.exists()
-        ), f"self._schema_fpath ({self._schema_fpath}) does not exist"
+            self._schema_path.exists()
+        ), f"self._schema_path ({self._schema_path}) does not exist"
 
         # Tables
         tables_path = fully_resolve_path(
@@ -51,11 +51,11 @@ def __init__(self, dbgym_workspace: DBGymWorkspace):
         )
         self._tables_and_paths = []
         for table in JobLoadInfo.TABLES:
-            table_fpath = tables_path / f"{table}.csv"
-            self._tables_and_paths.append((table, table_fpath))
+            table_path = tables_path / f"{table}.csv"
+            self._tables_and_paths.append((table, table_path))
 
-    def get_schema_fpath(self) -> Path:
-        return self._schema_fpath
+    def get_schema_path(self) -> Path:
+        return self._schema_path
 
     def get_tables_and_paths(self) -> list[tuple[str, Path]]:
         return self._tables_and_paths
@@ -63,7 +63,7 @@ def get_tables_and_paths(self) -> list[tuple[str, Path]]:
     def get_table_file_delimiter(self) -> str:
         return ","
 
-    def get_constraints_fpath(self) -> Optional[Path]:
+    def get_constraints_path(self) -> Optional[Path]:
         # JOB does not have any constraints. It does have indexes, but we don't want to create
         # those indexes so that the tuning agent can start from a clean slate.
return None diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index f096c2fa..af8531b2 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -101,41 +101,41 @@ def _get_queries_dirname(seed: int, scale_factor: float) -> str: def _clone_tpch_kit(dbgym_workspace: DBGymWorkspace) -> None: - expected_symlink_dpath = dbgym_workspace.dbgym_cur_symlinks_path / ( + expected_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / ( TPCH_KIT_DIRNAME + ".link" ) - if expected_symlink_dpath.exists(): + if expected_symlink_path.exists(): logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Skipping clone: {expected_symlink_dpath}" + f"Skipping clone: {expected_symlink_path}" ) return - logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloning: {expected_symlink_dpath}") + logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloning: {expected_symlink_path}") subprocess_run( f"./clone_tpch_kit.sh {dbgym_workspace.dbgym_this_run_path}", - cwd=dbgym_workspace.base_dbgym_repo_dpath / "benchmark" / "tpch", + cwd=dbgym_workspace.base_dbgym_repo_path / "benchmark" / "tpch", ) - symlink_dpath = dbgym_workspace.link_result( + symlink_path = dbgym_workspace.link_result( dbgym_workspace.dbgym_this_run_path / TPCH_KIT_DIRNAME ) - assert expected_symlink_dpath.samefile(symlink_dpath) - logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloned: {expected_symlink_dpath}") + assert expected_symlink_path.samefile(symlink_path) + logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloned: {expected_symlink_path}") def _generate_tpch_queries( dbgym_workspace: DBGymWorkspace, seed_start: int, seed_end: int, scale_factor: float ) -> None: - tpch_kit_dpath = dbgym_workspace.dbgym_cur_symlinks_path / ( + tpch_kit_path = dbgym_workspace.dbgym_cur_symlinks_path / ( TPCH_KIT_DIRNAME + ".link" ) logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generating queries: [{seed_start}, {seed_end}]" ) for seed in range(seed_start, seed_end + 1): - expected_queries_symlink_dpath = dbgym_workspace.dbgym_cur_symlinks_path / ( + expected_queries_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / ( _get_queries_dirname(seed, scale_factor) + ".link" ) - if expected_queries_symlink_dpath.exists(): + if expected_queries_symlink_path.exists(): continue queries_parent_path = ( @@ -147,43 +147,43 @@ def _generate_tpch_queries( target_sql = (queries_parent_path / f"{i}.sql").resolve() subprocess_run( f"DSS_QUERY=./queries ./qgen {i} -r {seed} -s {scale_factor} > {target_sql}", - cwd=tpch_kit_dpath / "dbgen", + cwd=tpch_kit_path / "dbgen", verbose=False, ) - queries_symlink_dpath = dbgym_workspace.link_result(queries_parent_path) - assert queries_symlink_dpath.samefile(expected_queries_symlink_dpath) + queries_symlink_path = dbgym_workspace.link_result(queries_parent_path) + assert queries_symlink_path.samefile(expected_queries_symlink_path) logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generated queries: [{seed_start}, {seed_end}]" ) def _generate_tpch_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: - tpch_kit_dpath = dbgym_workspace.dbgym_cur_symlinks_path / ( + tpch_kit_path = dbgym_workspace.dbgym_cur_symlinks_path / ( TPCH_KIT_DIRNAME + ".link" ) - expected_tables_symlink_dpath = get_tables_symlink_path( + expected_tables_symlink_path = get_tables_symlink_path( dbgym_workspace.dbgym_workspace_path, "tpch", scale_factor ) - if expected_tables_symlink_dpath.exists(): + if expected_tables_symlink_path.exists(): logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Skipping generation: {expected_tables_symlink_dpath}" + f"Skipping generation: 
{expected_tables_symlink_path}" ) return logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Generating: {expected_tables_symlink_dpath}" + f"Generating: {expected_tables_symlink_path}" ) - subprocess_run(f"./dbgen -vf -s {scale_factor}", cwd=tpch_kit_dpath / "dbgen") + subprocess_run(f"./dbgen -vf -s {scale_factor}", cwd=tpch_kit_path / "dbgen") tables_parent_path = dbgym_workspace.dbgym_this_run_path / get_tables_dirname( "tpch", scale_factor ) tables_parent_path.mkdir(parents=False, exist_ok=False) - subprocess_run(f"mv ./*.tbl {tables_parent_path}", cwd=tpch_kit_dpath / "dbgen") + subprocess_run(f"mv ./*.tbl {tables_parent_path}", cwd=tpch_kit_path / "dbgen") - tables_symlink_dpath = dbgym_workspace.link_result(tables_parent_path) - assert tables_symlink_dpath.samefile(expected_tables_symlink_dpath) + tables_symlink_path = dbgym_workspace.link_result(tables_parent_path) + assert tables_symlink_path.samefile(expected_tables_symlink_path) logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Generated: {expected_tables_symlink_dpath}" + f"Generated: {expected_tables_symlink_path}" ) @@ -233,11 +233,11 @@ def _generate_tpch_workload( ) for qname in query_names: - sql_fpath = fully_resolve_path(queries_parent_path / f"{qname}.sql") + sql_path = fully_resolve_path(queries_parent_path / f"{qname}.sql") assert is_fully_resolved( - sql_fpath + sql_path ), "We should only write existent real absolute paths to a file" - f.write(f"S{seed}-Q{qname},{sql_fpath}\n") + f.write(f"S{seed}-Q{qname},{sql_path}\n") workload_symlink_path = dbgym_workspace.link_result(workload_path) assert workload_symlink_path == expected_workload_symlink_path diff --git a/benchmark/tpch/load_info.py b/benchmark/tpch/load_info.py index 0e5bcc64..a717173c 100644 --- a/benchmark/tpch/load_info.py +++ b/benchmark/tpch/load_info.py @@ -24,17 +24,15 @@ class TpchLoadInfo(LoadInfoBaseClass): def __init__(self, dbgym_workspace: DBGymWorkspace, scale_factor: float): # Schema and constraints (directly in the codebase). 
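For concreteness, the order.txt written above (and read back by env/workload.py in patch 13) is a two-column CSV of query label and fully resolved query path, one line per query. Hypothetical contents for a two-query TPC-H workload, with a made-up seed and placeholder paths (real paths resolve into the workspace's task_runs/ tree):

    S42-Q1,/path/to/workspace/task_runs/run_.../queries.../1.sql
    S42-Q2,/path/to/workspace/task_runs/run_.../queries.../2.sql

The JOB variant is identical except that its labels are written as Q{qname} with no seed component.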
- tpch_codebase_path = ( - dbgym_workspace.base_dbgym_repo_dpath / "benchmark" / "tpch" - ) - self._schema_fpath = tpch_codebase_path / TPCH_SCHEMA_FNAME + tpch_codebase_path = dbgym_workspace.base_dbgym_repo_path / "benchmark" / "tpch" + self._schema_path = tpch_codebase_path / TPCH_SCHEMA_FNAME assert ( - self._schema_fpath.exists() - ), f"self._schema_fpath ({self._schema_fpath}) does not exist" - self._constraints_fpath = tpch_codebase_path / TPCH_CONSTRAINTS_FNAME + self._schema_path.exists() + ), f"self._schema_path ({self._schema_path}) does not exist" + self._constraints_path = tpch_codebase_path / TPCH_CONSTRAINTS_FNAME assert ( - self._constraints_fpath.exists() - ), f"self._constraints_fpath ({self._constraints_fpath}) does not exist" + self._constraints_path.exists() + ), f"self._constraints_path ({self._constraints_path}) does not exist" # Tables tables_path = fully_resolve_path( @@ -47,8 +45,8 @@ def __init__(self, dbgym_workspace: DBGymWorkspace, scale_factor: float): table_path = tables_path / f"{table}.tbl" self._tables_and_paths.append((table, table_path)) - def get_schema_fpath(self) -> Path: - return self._schema_fpath + def get_schema_path(self) -> Path: + return self._schema_path def get_tables_and_paths(self) -> list[tuple[str, Path]]: return self._tables_and_paths @@ -56,5 +54,5 @@ def get_tables_and_paths(self) -> list[tuple[str, Path]]: def get_table_file_delimiter(self) -> str: return "|" - def get_constraints_fpath(self) -> Optional[Path]: - return self._constraints_fpath + def get_constraints_path(self) -> Optional[Path]: + return self._constraints_path diff --git a/dbms/load_info_base_class.py b/dbms/load_info_base_class.py index bfbe763a..fca7d903 100644 --- a/dbms/load_info_base_class.py +++ b/dbms/load_info_base_class.py @@ -9,7 +9,7 @@ class LoadInfoBaseClass: copy the comments or type annotations or else they might become out of sync. """ - def get_schema_fpath(self) -> Path: + def get_schema_path(self) -> Path: raise NotImplementedError def get_tables_and_paths(self) -> list[tuple[str, Path]]: @@ -21,5 +21,5 @@ def get_table_file_delimiter(self) -> str: # If the subclassing benchmark does not have constraints, you can return None here. # Constraints are also indexes. - def get_constraints_fpath(self) -> Optional[Path]: + def get_constraints_path(self) -> Optional[Path]: raise NotImplementedError diff --git a/dbms/postgres/_build_repo.sh b/dbms/postgres/_build_repo.sh index 2127c438..d6fb839b 100755 --- a/dbms/postgres/_build_repo.sh +++ b/dbms/postgres/_build_repo.sh @@ -2,14 +2,14 @@ set -euxo pipefail -REPO_REAL_PARENT_DPATH="$1" +REPO_REAL_PARENT_PATH="$1" # Download and make postgres from the boot repository. -mkdir -p "${REPO_REAL_PARENT_DPATH}" -cd "${REPO_REAL_PARENT_DPATH}" +mkdir -p "${REPO_REAL_PARENT_PATH}" +cd "${REPO_REAL_PARENT_PATH}" git clone https://github.com/lmwnshn/boot.git --single-branch --branch vldb_2024 --depth 1 cd ./boot -./cmudb/build/configure.sh release "${REPO_REAL_PARENT_DPATH}/boot/build/postgres" +./cmudb/build/configure.sh release "${REPO_REAL_PARENT_PATH}/boot/build/postgres" make clean make install-world-bin -j4 @@ -17,18 +17,18 @@ make install-world-bin -j4 cd ./cmudb/extension/boot_rs/ cargo build --release cbindgen . -o target/boot_rs.h --lang c -cd "${REPO_REAL_PARENT_DPATH}/boot" +cd "${REPO_REAL_PARENT_PATH}/boot" cd ./cmudb/extension/boot/ make clean make install -j -cd "${REPO_REAL_PARENT_DPATH}/boot" +cd "${REPO_REAL_PARENT_PATH}/boot" # Download and make hypopg. 
git clone https://github.com/HypoPG/hypopg.git cd ./hypopg -PG_CONFIG="${REPO_REAL_PARENT_DPATH}/boot/build/postgres/bin/pg_config" make install -cd "${REPO_REAL_PARENT_DPATH}/boot" +PG_CONFIG="${REPO_REAL_PARENT_PATH}/boot/build/postgres/bin/pg_config" make install +cd "${REPO_REAL_PARENT_PATH}/boot" # Download and make pg_hint_plan. # We need -L to follow links. @@ -36,6 +36,6 @@ curl -L https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL15_1_5_1.ta tar -xzf REL15_1_5_1.tar.gz rm REL15_1_5_1.tar.gz cd ./pg_hint_plan-REL15_1_5_1 -PATH="${REPO_REAL_PARENT_DPATH}/boot/build/postgres/bin:$PATH" make -PATH="${REPO_REAL_PARENT_DPATH}/boot/build/postgres/bin:$PATH" make install -cp ./pg_hint_plan.so ${REPO_REAL_PARENT_DPATH}/boot/build/postgres/lib +PATH="${REPO_REAL_PARENT_PATH}/boot/build/postgres/bin:$PATH" make +PATH="${REPO_REAL_PARENT_PATH}/boot/build/postgres/bin:$PATH" make install +cp ./pg_hint_plan.so ${REPO_REAL_PARENT_PATH}/boot/build/postgres/lib diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index 262f82a9..dbc46875 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -38,7 +38,7 @@ WORKSPACE_PATH_PLACEHOLDER, DBGymWorkspace, fully_resolve_path, - get_default_dbdata_parent_dpath, + get_default_dbdata_parent_path, is_fully_resolved, is_ssd, ) @@ -68,30 +68,30 @@ def _postgres_build(dbgym_workspace: DBGymWorkspace, rebuild: bool) -> None: """ This function exists as a hook for integration tests. """ - expected_repo_symlink_dpath = get_repo_symlink_path( + expected_repo_symlink_path = get_repo_symlink_path( dbgym_workspace.dbgym_workspace_path ) - if not rebuild and expected_repo_symlink_dpath.exists(): + if not rebuild and expected_repo_symlink_path.exists(): logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Skipping _postgres_build: {expected_repo_symlink_dpath}" + f"Skipping _postgres_build: {expected_repo_symlink_path}" ) return logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Setting up repo in {expected_repo_symlink_dpath}" + f"Setting up repo in {expected_repo_symlink_path}" ) - repo_real_dpath = dbgym_workspace.dbgym_this_run_path / "repo" - repo_real_dpath.mkdir(parents=False, exist_ok=False) + repo_real_path = dbgym_workspace.dbgym_this_run_path / "repo" + repo_real_path.mkdir(parents=False, exist_ok=False) subprocess_run( - f"./_build_repo.sh {repo_real_dpath}", - cwd=dbgym_workspace.base_dbgym_repo_dpath / "dbms" / "postgres", + f"./_build_repo.sh {repo_real_path}", + cwd=dbgym_workspace.base_dbgym_repo_path / "dbms" / "postgres", ) # only link at the end so that the link only ever points to a complete repo - repo_symlink_dpath = dbgym_workspace.link_result(repo_real_dpath) - assert expected_repo_symlink_dpath.samefile(repo_symlink_dpath) + repo_symlink_path = dbgym_workspace.link_result(repo_real_path) + assert expected_repo_symlink_path.samefile(repo_symlink_path) logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Set up repo in {expected_repo_symlink_dpath}" + f"Set up repo in {expected_repo_symlink_path}" ) @@ -112,13 +112,13 @@ def _postgres_build(dbgym_workspace: DBGymWorkspace, rebuild: bool) -> None: "--intended-dbdata-hardware", type=click.Choice(["hdd", "ssd"]), default="hdd", - help=f"The intended hardware dbdata should be on. Used as a sanity check for --dbdata-parent-dpath.", + help=f"The intended hardware dbdata should be on. 
Used as a sanity check for --dbdata-parent-path.", ) @click.option( - "--dbdata-parent-dpath", + "--dbdata-parent-path", default=None, type=Path, - help=f"The path to the parent directory of the dbdata which will be actively tuned. The default is {get_default_dbdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.", + help=f"The path to the parent directory of the dbdata which will be actively tuned. The default is {get_default_dbdata_parent_path(WORKSPACE_PATH_PLACEHOLDER)}.", ) def postgres_dbdata( dbgym_workspace: DBGymWorkspace, @@ -126,7 +126,7 @@ def postgres_dbdata( scale_factor: float, pgbin_path: Optional[Path], intended_dbdata_hardware: str, - dbdata_parent_dpath: Optional[Path], + dbdata_parent_path: Optional[Path], ) -> None: _postgres_dbdata( dbgym_workspace, @@ -134,7 +134,7 @@ def postgres_dbdata( scale_factor, pgbin_path, intended_dbdata_hardware, - dbdata_parent_dpath, + dbdata_parent_path, ) @@ -144,7 +144,7 @@ def _postgres_dbdata( scale_factor: float, pgbin_path: Optional[Path], intended_dbdata_hardware: str, - dbdata_parent_dpath: Optional[Path], + dbdata_parent_path: Optional[Path], ) -> None: """ This function exists as a hook for integration tests. @@ -152,30 +152,30 @@ def _postgres_dbdata( # Set args to defaults programmatically (do this before doing anything else in the function) if pgbin_path is None: pgbin_path = get_pgbin_symlink_path(dbgym_workspace.dbgym_workspace_path) - if dbdata_parent_dpath is None: - dbdata_parent_dpath = get_default_dbdata_parent_dpath( + if dbdata_parent_path is None: + dbdata_parent_path = get_default_dbdata_parent_path( dbgym_workspace.dbgym_workspace_path ) # Fully resolve all input paths. pgbin_path = fully_resolve_path(pgbin_path) - dbdata_parent_dpath = fully_resolve_path(dbdata_parent_dpath) + dbdata_parent_path = fully_resolve_path(dbdata_parent_path) # Check assertions on args if intended_dbdata_hardware == "hdd": assert not is_ssd( - dbdata_parent_dpath - ), f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD" + dbdata_parent_path + ), f"Intended hardware is HDD but dbdata_parent_path ({dbdata_parent_path}) is an SSD" elif intended_dbdata_hardware == "ssd": assert is_ssd( - dbdata_parent_dpath - ), f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD" + dbdata_parent_path + ), f"Intended hardware is SSD but dbdata_parent_path ({dbdata_parent_path}) is an HDD" else: assert False # Create dbdata _create_dbdata( - dbgym_workspace, benchmark_name, scale_factor, pgbin_path, dbdata_parent_dpath + dbgym_workspace, benchmark_name, scale_factor, pgbin_path, dbdata_parent_path ) @@ -184,7 +184,7 @@ def _create_dbdata( benchmark_name: str, scale_factor: float, pgbin_path: Path, - dbdata_parent_dpath: Path, + dbdata_parent_path: Path, ) -> None: """ If you change the code of _create_dbdata(), you should also delete the symlink so that the next time you run @@ -202,13 +202,13 @@ def _create_dbdata( return # It's ok for the dbdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place. - dbdata_path = dbdata_parent_dpath / "dbdata_being_created" - # We might be reusing the same dbdata_parent_dpath, so delete dbdata_path if it already exists + dbdata_path = dbdata_parent_path / "dbdata_being_created" + # We might be reusing the same dbdata_parent_path, so delete dbdata_path if it already exists if dbdata_path.exists(): shutil.rmtree(dbdata_path) # Call initdb. 
- # Save any script we call from pgbin_symlink_dpath because they are dependencies generated from another task run. + # Save any script we call from pgbin_symlink_path because they are dependencies generated from another task run. dbgym_workspace.save_file(pgbin_path / "initdb") subprocess_run(f'./initdb -D "{dbdata_path}"', cwd=pgbin_path) @@ -242,23 +242,23 @@ def _create_dbdata( def _generic_dbdata_setup(dbgym_workspace: DBGymWorkspace) -> None: # get necessary vars - pgbin_real_dpath = get_pgbin_symlink_path( + pgbin_real_path = get_pgbin_symlink_path( dbgym_workspace.dbgym_workspace_path ).resolve() - assert pgbin_real_dpath.exists() + assert pgbin_real_path.exists() dbgym_pguser = DBGYM_POSTGRES_USER dbgym_pgpass = DBGYM_POSTGRES_PASS pgport = DEFAULT_POSTGRES_PORT # Create user - dbgym_workspace.save_file(pgbin_real_dpath / "psql") + dbgym_workspace.save_file(pgbin_real_path / "psql") subprocess_run( f"./psql -c \"create user {dbgym_pguser} with superuser password '{dbgym_pgpass}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost", - cwd=pgbin_real_dpath, + cwd=pgbin_real_path, ) subprocess_run( f'./psql -c "grant pg_monitor to {dbgym_pguser}" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost', - cwd=pgbin_real_dpath, + cwd=pgbin_real_path, ) # Load shared preload libraries @@ -267,14 +267,14 @@ def _generic_dbdata_setup(dbgym_workspace: DBGymWorkspace) -> None: # You have to use TO and you can't put single quotes around the libraries (https://postgrespro.com/list/thread-id/2580120) # The method I wrote here works for both one library and multiple libraries f'./psql -c "ALTER SYSTEM SET shared_preload_libraries TO {SHARED_PRELOAD_LIBRARIES};" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost', - cwd=pgbin_real_dpath, + cwd=pgbin_real_path, ) # Create the dbgym database. Since one dbdata dir maps to one benchmark, all benchmarks will use the same database # as opposed to using databases named after the benchmark. subprocess_run( f"./psql -c \"create database {DBGYM_POSTGRES_DBNAME} with owner = '{dbgym_pguser}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost", - cwd=pgbin_real_dpath, + cwd=pgbin_real_path, ) @@ -301,14 +301,14 @@ def _load_into_dbdata( conn: sqlalchemy.Connection, load_info: LoadInfoBaseClass, ) -> None: - sql_file_execute(dbgym_workspace, conn, load_info.get_schema_fpath()) + sql_file_execute(dbgym_workspace, conn, load_info.get_schema_path()) # Truncate all tables first before even loading a single one. for table, _ in load_info.get_tables_and_paths(): sqlalchemy_conn_execute(conn, f"TRUNCATE {table} CASCADE") # Then, load the tables. - for table, table_fpath in load_info.get_tables_and_paths(): - with dbgym_workspace.open_and_save(table_fpath, "r") as table_csv: + for table, table_path in load_info.get_tables_and_paths(): + with dbgym_workspace.open_and_save(table_path, "r") as table_csv: assert conn.connection.dbapi_connection is not None cur = conn.connection.dbapi_connection.cursor() try: @@ -320,9 +320,9 @@ def _load_into_dbdata( finally: cur.close() - constraints_fpath = load_info.get_constraints_fpath() - if constraints_fpath is not None: - sql_file_execute(dbgym_workspace, conn, constraints_fpath) + constraints_path = load_info.get_constraints_path() + if constraints_path is not None: + sql_file_execute(dbgym_workspace, conn, constraints_path) # The start and stop functions slightly duplicate functionality from pg_conn.py. 
However, I chose to do it this way diff --git a/env/pg_conn.py b/env/pg_conn.py index db0278b4..bbeee54d 100644 --- a/env/pg_conn.py +++ b/env/pg_conn.py @@ -24,7 +24,7 @@ from util.log import DBGYM_LOGGER_NAME from util.pg import DBGYM_POSTGRES_DBNAME, SHARED_PRELOAD_LIBRARIES, get_kv_connstr -from util.workspace import DBGymWorkspace, open_and_save, parent_dpath_of_path +from util.workspace import DBGymWorkspace, open_and_save, parent_path_of_path CONNECT_TIMEOUT = 300 @@ -37,36 +37,36 @@ def __init__( self, dbgym_workspace: DBGymWorkspace, pgport: int, - pristine_dbdata_snapshot_fpath: Path, - dbdata_parent_dpath: Path, + pristine_dbdata_snapshot_path: Path, + dbdata_parent_path: Path, pgbin_path: Union[str, Path], # Whether this is None determines whether Boot is enabled. - boot_config_fpath: Optional[Path], + boot_config_path: Optional[Path], ) -> None: self.dbgym_workspace = dbgym_workspace self.pgport = pgport self.pgbin_path = pgbin_path - self.boot_config_fpath = boot_config_fpath + self.boot_config_path = boot_config_path self.log_step = 0 # All the paths related to dbdata - # pristine_dbdata_snapshot_fpath is the .tgz snapshot that represents the starting state + # pristine_dbdata_snapshot_path is the .tgz snapshot that represents the starting state # of the database (with the default configuration). It is generated by a call to # `python tune.py dbms postgres ...` and should not be overwritten. - self.pristine_dbdata_snapshot_fpath = pristine_dbdata_snapshot_fpath - # checkpoint_dbdata_snapshot_fpath is the .tgz snapshot that represents the current + self.pristine_dbdata_snapshot_path = pristine_dbdata_snapshot_path + # checkpoint_dbdata_snapshot_path is the .tgz snapshot that represents the current # state of the database as it is being tuned. It is generated while tuning and is # discarded once tuning is completed. - self.checkpoint_dbdata_snapshot_fpath = ( + self.checkpoint_dbdata_snapshot_path = ( dbgym_workspace.dbgym_tmp_path / "checkpoint_dbdata.tgz" ) - # dbdata_parent_dpath is the parent directory of the dbdata that is *actively being tuned*. - # It is *not* the parent directory of pristine_dbdata_snapshot_fpath. + # dbdata_parent_path is the parent directory of the dbdata that is *actively being tuned*. + # It is *not* the parent directory of pristine_dbdata_snapshot_path. # Setting this lets us control the hardware device dbdata is built on (e.g. HDD vs. SSD). 
- self.dbdata_parent_dpath = dbdata_parent_dpath - # dbdata_dpath is the dbdata that is *actively being tuned* - self.dbdata_dpath = self.dbdata_parent_dpath / f"dbdata{self.pgport}" + self.dbdata_parent_path = dbdata_parent_path + # dbdata_path is the dbdata that is *actively being tuned* + self.dbdata_path = self.dbdata_parent_path / f"dbdata{self.pgport}" self._conn: Optional[psycopg.Connection[Any]] = None self.hint_check_failed_with: Optional[str] = None @@ -102,16 +102,16 @@ def disconnect(self) -> None: self._conn = None def move_log(self) -> None: - pglog_fpath = ( + pglog_path = ( self.dbgym_workspace.cur_task_runs_artifacts_path(mkdir=True) / f"pg{self.pgport}.log" ) - pglog_this_step_fpath = ( + pglog_this_step_path = ( self.dbgym_workspace.cur_task_runs_artifacts_path(mkdir=True) / f"pg{self.pgport}.log.{self.log_step}" ) - if pglog_fpath.exists(): - shutil.move(pglog_fpath, pglog_this_step_fpath) + if pglog_path.exists(): + shutil.move(pglog_path, pglog_this_step_path) self.log_step += 1 def force_statement_timeout(self, timeout: float) -> None: @@ -202,13 +202,13 @@ def time_query( def shutdown_postgres(self) -> None: """Shuts down postgres.""" self.disconnect() - if not Path(self.dbdata_dpath).exists(): + if not Path(self.dbdata_path).exists(): return while True: logging.getLogger(DBGYM_LOGGER_NAME).debug("Shutting down postgres...") _, stdout, stderr = local[f"{self.pgbin_path}/pg_ctl"][ - "stop", "--wait", "-t", "180", "-D", self.dbdata_dpath + "stop", "--wait", "-t", "180", "-D", self.dbdata_path ].run(retcode=None) time.sleep(1) logging.getLogger(DBGYM_LOGGER_NAME).debug( @@ -225,7 +225,7 @@ def shutdown_postgres(self) -> None: DBGYM_POSTGRES_DBNAME, ].run(retcode=None) - exists = (Path(self.dbdata_dpath) / "postmaster.pid").exists() + exists = (Path(self.dbdata_path) / "postmaster.pid").exists() if not exists and retcode != 0: break @@ -241,7 +241,7 @@ def restart_with_changes( ) -> bool: """ This function is called "(re)start" because it also shuts down Postgres before starting it. - This function assumes that some snapshot has already been untarred into self.dbdata_dpath. + This function assumes that some snapshot has already been untarred into self.dbdata_path. You can do this by calling one of the wrappers around _restore_snapshot(). Note that multiple calls are not "additive". Calling this will restart from the latest saved @@ -250,7 +250,7 @@ def restart_with_changes( """ # Install the new configuration changes. if conf_changes is not None: - dbdata_auto_conf_path = self.dbdata_dpath / "postgresql.auto.conf" + dbdata_auto_conf_path = self.dbdata_path / "postgresql.auto.conf" with open(dbdata_auto_conf_path, "w") as f: f.write( "\n".join([f"{knob} = {val}" for knob, val in conf_changes.items()]) @@ -273,14 +273,14 @@ def restart_with_changes( "cf", # We append .tmp so that if we fail in the *middle* of running tar, we # still have the previous checkpoint available to us - f"{self.checkpoint_dbdata_snapshot_fpath}.tmp", + f"{self.checkpoint_dbdata_snapshot_path}.tmp", "-C", - parent_dpath_of_path(self.dbdata_dpath), - self.dbdata_dpath, + parent_path_of_path(self.dbdata_path), + self.dbdata_path, ].run() # Make sure the PID lock file doesn't exist. - pid_lock = Path(f"{self.dbdata_dpath}/postmaster.pid") + pid_lock = Path(f"{self.dbdata_path}/postmaster.pid") assert not pid_lock.exists() if dump_page_cache: @@ -292,7 +292,7 @@ def restart_with_changes( # Try starting up. 
retcode, stdout, stderr = local[f"{self.pgbin_path}/pg_ctl"][ "-D", - self.dbdata_dpath, + self.dbdata_path, "--wait", "-t", "180", @@ -345,8 +345,8 @@ def restart_with_changes( ) # Set up Boot if we're told to do so - if self.boot_config_fpath is not None: - with self.dbgym_workspace.open_and_save(self.boot_config_fpath) as f: + if self.boot_config_path is not None: + with self.dbgym_workspace.open_and_save(self.boot_config_path) as f: boot_config = yaml.safe_load(f) self._set_up_boot( @@ -362,7 +362,7 @@ def restart_with_changes( # Move the temporary over since we now know the temporary can load. if save_checkpoint: - shutil.move(f"{self.dbdata_dpath}.tgz.tmp", f"{self.dbdata_dpath}.tgz") + shutil.move(f"{self.dbdata_path}.tgz.tmp", f"{self.dbdata_path}.tgz") return True @@ -483,10 +483,10 @@ def get_system_knobs(self) -> dict[str, str]: return knobs def restore_pristine_snapshot(self) -> bool: - return self._restore_snapshot(self.pristine_dbdata_snapshot_fpath) + return self._restore_snapshot(self.pristine_dbdata_snapshot_path) def restore_checkpointed_snapshot(self) -> bool: - return self._restore_snapshot(self.checkpoint_dbdata_snapshot_fpath) + return self._restore_snapshot(self.checkpoint_dbdata_snapshot_path) def _restore_snapshot( self, @@ -494,23 +494,23 @@ def _restore_snapshot( ) -> bool: self.shutdown_postgres() - local["rm"]["-rf", self.dbdata_dpath].run() - local["mkdir"]["-m", "0700", "-p", self.dbdata_dpath].run() + local["rm"]["-rf", self.dbdata_path].run() + local["mkdir"]["-m", "0700", "-p", self.dbdata_path].run() - # Strip the "dbdata" so we can implant directly into the target dbdata_dpath. + # Strip the "dbdata" so we can implant directly into the target dbdata_path. assert dbdata_snapshot_path.exists() local["tar"][ "xf", dbdata_snapshot_path, "-C", - self.dbdata_dpath, + self.dbdata_path, "--strip-components", "1", ].run() # Imprint the required port. ( (local["echo"][f"port={self.pgport}"]) - >> f"{self.dbdata_dpath}/postgresql.conf" + >> f"{self.dbdata_path}/postgresql.conf" )() return self.restart_postgres() diff --git a/env/replay.py b/env/replay.py index 8dcfe8e4..55621284 100644 --- a/env/replay.py +++ b/env/replay.py @@ -9,7 +9,7 @@ def replay( - dbgym_workspace: DBGymWorkspace, tuning_artifacts_dpath: Path + dbgym_workspace: DBGymWorkspace, tuning_artifacts_path: Path ) -> list[tuple[float, int]]: """ Returns the total runtime and the number of timed out queries for each step. 
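The replay() call pattern itself is unchanged by this rename; only the spelling of the artifacts-directory argument moves from tuning_artifacts_dpath to tuning_artifacts_path. A minimal sketch of driving it end to end, assuming the workspace and metadata helpers from env/tests/gymlib_integtest_util.py behave as the tests later in this patch use them:

    from env.replay import replay
    from env.tests.gymlib_integtest_util import GymlibIntegtestManager
    from env.tuning_artifacts import TuningArtifactsWriter

    GymlibIntegtestManager.set_up_workspace()
    writer = TuningArtifactsWriter(
        GymlibIntegtestManager.get_dbgym_workspace(),
        GymlibIntegtestManager.get_default_metadata(),
    )
    # ... tuning happens here, calling writer.write_step(...) once per step ...
    replay_data = replay(
        GymlibIntegtestManager.get_dbgym_workspace(),
        writer.tuning_artifacts_path,
    )
    for step_num, (total_runtime, num_timed_out) in enumerate(replay_data):
        print(f"step {step_num}: {total_runtime:.2f}s total, {num_timed_out} timed out")

Each (total_runtime, num_timed_out) pair corresponds to one written step, in order.
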
@@ -18,7 +18,7 @@ def replay( """ replay_data: list[tuple[float, int]] = [] - reader = TuningArtifactsReader(tuning_artifacts_dpath) + reader = TuningArtifactsReader(tuning_artifacts_path) pg_conn = PostgresConn( dbgym_workspace, DEFAULT_POSTGRES_PORT, diff --git a/env/tests/gymlib_integtest_util.py b/env/tests/gymlib_integtest_util.py index 959b68a5..cac69827 100644 --- a/env/tests/gymlib_integtest_util.py +++ b/env/tests/gymlib_integtest_util.py @@ -16,7 +16,7 @@ from util.workspace import ( DBGymWorkspace, fully_resolve_path, - get_default_dbdata_parent_dpath, + get_default_dbdata_parent_path, get_workspace_path_from_config, ) @@ -105,7 +105,7 @@ def get_default_metadata() -> TuningMetadata: ), ), dbdata_parent_path=fully_resolve_path( - get_default_dbdata_parent_dpath(dbgym_workspace.dbgym_workspace_path), + get_default_dbdata_parent_path(dbgym_workspace.dbgym_workspace_path), ), pgbin_path=fully_resolve_path( get_pgbin_symlink_path(dbgym_workspace.dbgym_workspace_path), diff --git a/env/tests/integtest_replay.py b/env/tests/integtest_replay.py index 9523fac7..86b191f9 100644 --- a/env/tests/integtest_replay.py +++ b/env/tests/integtest_replay.py @@ -42,7 +42,7 @@ def test_replay(self) -> None: ) replay_data = replay( GymlibIntegtestManager.get_dbgym_workspace(), - writer.tuning_artifacts_dpath, + writer.tuning_artifacts_path, ) # We do some very simple sanity checks here due to the inherent randomness of executing a workload. diff --git a/env/tests/integtest_tuning_artifacts.py b/env/tests/integtest_tuning_artifacts.py index dfdc77c0..7d89bbba 100644 --- a/env/tests/integtest_tuning_artifacts.py +++ b/env/tests/integtest_tuning_artifacts.py @@ -34,7 +34,7 @@ def test_get_delta_at_step(self) -> None: writer.write_step(PostgresConnTests.make_config("b")) writer.write_step(PostgresConnTests.make_config("c")) - reader = TuningArtifactsReader(writer.tuning_artifacts_dpath) + reader = TuningArtifactsReader(writer.tuning_artifacts_path) self.assertEqual( reader.get_delta_at_step(1), PostgresConnTests.make_config("b") @@ -59,7 +59,7 @@ def test_get_all_deltas_in_order(self) -> None: writer.write_step(PostgresConnTests.make_config("b")) writer.write_step(PostgresConnTests.make_config("c")) - reader = TuningArtifactsReader(writer.tuning_artifacts_dpath) + reader = TuningArtifactsReader(writer.tuning_artifacts_path) self.assertEqual( reader.get_all_deltas_in_order(), @@ -75,7 +75,7 @@ def test_get_metadata(self) -> None: GymlibIntegtestManager.get_dbgym_workspace(), GymlibIntegtestManager.get_default_metadata(), ) - reader = TuningArtifactsReader(writer.tuning_artifacts_dpath) + reader = TuningArtifactsReader(writer.tuning_artifacts_path) metadata = reader.get_metadata() expected_metadata = GymlibIntegtestManager.get_default_metadata() self.assertEqual(metadata, expected_metadata) diff --git a/env/tests/integtest_workload.py b/env/tests/integtest_workload.py index cc9d6393..3584b05c 100644 --- a/env/tests/integtest_workload.py +++ b/env/tests/integtest_workload.py @@ -11,11 +11,9 @@ def setUpClass() -> None: GymlibIntegtestManager.set_up_workspace() def test_workload(self) -> None: - workload_dpath = GymlibIntegtestManager.get_default_metadata().workload_path + workload_path = GymlibIntegtestManager.get_default_metadata().workload_path - workload = Workload( - GymlibIntegtestManager.get_dbgym_workspace(), workload_dpath - ) + workload = Workload(GymlibIntegtestManager.get_dbgym_workspace(), workload_path) # Check the order of query IDs. 
self.assertEqual( diff --git a/env/tuning_artifacts.py b/env/tuning_artifacts.py index a60d6bbe..b5b617da 100644 --- a/env/tuning_artifacts.py +++ b/env/tuning_artifacts.py @@ -68,12 +68,12 @@ class DBMSConfigDelta: qknobs: QueryKnobsDelta -def get_delta_at_step_fpath(tuning_artifacts_dpath: Path, step_num: int) -> Path: - return tuning_artifacts_dpath / f"step{step_num}_delta.json" +def get_delta_at_step_path(tuning_artifacts_path: Path, step_num: int) -> Path: + return tuning_artifacts_path / f"step{step_num}_delta.json" -def get_metadata_fpath(tuning_artifacts_dpath: Path) -> Path: - return tuning_artifacts_dpath / "metadata.json" +def get_metadata_path(tuning_artifacts_path: Path) -> Path: + return tuning_artifacts_path / "metadata.json" class TuningArtifactsWriter: @@ -81,14 +81,14 @@ def __init__( self, dbgym_workspace: DBGymWorkspace, metadata: TuningMetadata ) -> None: self.dbgym_workspace = dbgym_workspace - self.tuning_artifacts_dpath = self.dbgym_workspace.cur_task_runs_artifacts_path( + self.tuning_artifacts_path = self.dbgym_workspace.cur_task_runs_artifacts_path( "tuning_artifacts", mkdir=True ) - assert is_fully_resolved(self.tuning_artifacts_dpath) + assert is_fully_resolved(self.tuning_artifacts_path) self.next_step_num = 0 # Write metadata file - with get_metadata_fpath(self.tuning_artifacts_dpath).open("w") as f: + with get_metadata_path(self.tuning_artifacts_path).open("w") as f: json.dump(metadata.asdict(), f) def write_step(self, dbms_cfg_delta: DBMSConfigDelta) -> None: @@ -97,23 +97,23 @@ def write_step(self, dbms_cfg_delta: DBMSConfigDelta) -> None: """ curr_step_num = self.next_step_num self.next_step_num += 1 - with get_delta_at_step_fpath(self.tuning_artifacts_dpath, curr_step_num).open( + with get_delta_at_step_path(self.tuning_artifacts_path, curr_step_num).open( "w" ) as f: json.dump(asdict(dbms_cfg_delta), f) class TuningArtifactsReader: - def __init__(self, tuning_artifacts_dpath: Path) -> None: - self.tuning_artifacts_dpath = tuning_artifacts_dpath - assert is_fully_resolved(self.tuning_artifacts_dpath) + def __init__(self, tuning_artifacts_path: Path) -> None: + self.tuning_artifacts_path = tuning_artifacts_path + assert is_fully_resolved(self.tuning_artifacts_path) num_steps = 0 - while get_delta_at_step_fpath(self.tuning_artifacts_dpath, num_steps).exists(): + while get_delta_at_step_path(self.tuning_artifacts_path, num_steps).exists(): num_steps += 1 self.num_steps = num_steps def get_metadata(self) -> TuningMetadata: - with get_metadata_fpath(self.tuning_artifacts_dpath).open("r") as f: + with get_metadata_path(self.tuning_artifacts_path).open("r") as f: data = json.load(f) return TuningMetadata( workload_path=Path(data["workload_path"]), @@ -126,7 +126,7 @@ def get_metadata(self) -> TuningMetadata: def get_delta_at_step(self, step_num: int) -> DBMSConfigDelta: assert step_num >= 0 and step_num < self.num_steps - with get_delta_at_step_fpath(self.tuning_artifacts_dpath, step_num).open( + with get_delta_at_step_path(self.tuning_artifacts_path, step_num).open( "r" ) as f: data = json.load(f) diff --git a/env/workload.py b/env/workload.py index 22a1c0fc..4b164ffc 100644 --- a/env/workload.py +++ b/env/workload.py @@ -4,18 +4,18 @@ class Workload: - def __init__(self, dbgym_workspace: DBGymWorkspace, workload_dpath: Path) -> None: + def __init__(self, dbgym_workspace: DBGymWorkspace, workload_path: Path) -> None: self.dbgym_workspace = dbgym_workspace - self.workload_dpath = workload_dpath - assert is_fully_resolved(self.workload_dpath) + 
self.workload_path = workload_path + assert is_fully_resolved(self.workload_path) self.queries: dict[str, str] = {} - order_fpath = self.workload_dpath / "order.txt" + order_path = self.workload_path / "order.txt" self.query_order: list[str] = [] - assert order_fpath.exists() + assert order_path.exists() - with self.dbgym_workspace.open_and_save(order_fpath) as f: + with self.dbgym_workspace.open_and_save(order_path) as f: for line in f: qid, qpath = line.strip().split(",") qpath = Path(qpath) diff --git a/experiments/load_per_machine_envvars.sh b/experiments/load_per_machine_envvars.sh index 22b220c8..1a992195 100644 --- a/experiments/load_per_machine_envvars.sh +++ b/experiments/load_per_machine_envvars.sh @@ -2,13 +2,13 @@ host=$(hostname) if [ "$host" == "dev4" ]; then - export DBDATA_PARENT_DPATH=/mnt/nvme1n1/phw2/dbgym_tmp/ + export DBDATA_PARENT_PATH=/mnt/nvme1n1/phw2/dbgym_tmp/ export INTENDED_DBDATA_HARDWARE=ssd elif [ "$host" == "dev6" ]; then - export DBDATA_PARENT_DPATH=/mnt/nvme0n1/phw2/dbgym_tmp/ + export DBDATA_PARENT_PATH=/mnt/nvme0n1/phw2/dbgym_tmp/ export INTENDED_DBDATA_HARDWARE=ssd elif [ "$host" == "patnuc" ]; then - export DBDATA_PARENT_DPATH=../dbgym_workspace/tmp/ + export DBDATA_PARENT_PATH=../dbgym_workspace/tmp/ export INTENDED_DBDATA_HARDWARE=hdd else echo "Did not recognize host \"$host\"" diff --git a/manage/cli.py b/manage/cli.py index 5d785d9f..4f41cbc7 100644 --- a/manage/cli.py +++ b/manage/cli.py @@ -12,7 +12,7 @@ get_runs_path_from_workspace_path, get_symlinks_path_from_workspace_path, is_child_path, - parent_dpath_of_path, + parent_path_of_path, ) @@ -52,13 +52,13 @@ def manage_count(dbgym_workspace: DBGymWorkspace) -> None: ) -def add_symlinks_in_dpath( - symlinks_stack: list[Path], root_dpath: Path, processed_symlinks: set[Path] +def add_symlinks_in_path( + symlinks_stack: list[Path], root_path: Path, processed_symlinks: set[Path] ) -> None: """ Will modify symlinks_stack and processed_symlinks. """ - for root_pathstr, dir_names, file_names in os.walk(root_dpath): + for root_pathstr, dir_names, file_names in os.walk(root_path): root_path = Path(root_pathstr) # symlinks can either be files or directories, so we go through both dir_names and file_names for file_name in chain(dir_names, file_names): @@ -101,14 +101,14 @@ def clean_workspace( any symlinks referenced in task_runs/run_*/ directories we have already decided to keep. """ # This stack holds the symlinks that are left to be processed - symlink_fpaths_to_process: list[Path] = [] + symlink_paths_to_process: list[Path] = [] # This set holds the symlinks that have already been processed to avoid infinite loops processed_symlinks: set[Path] = set() # 1. Initialize paths to process if dbgym_workspace.dbgym_symlinks_path.exists(): - add_symlinks_in_dpath( - symlink_fpaths_to_process, + add_symlinks_in_path( + symlink_paths_to_process, dbgym_workspace.dbgym_symlinks_path, processed_symlinks, ) @@ -116,64 +116,60 @@ def clean_workspace( # 2. Go through symlinks, figuring out which "children of task runs" to keep # Based on the rules of the framework, "children of task runs" should be run_*/ directories. # However, the user's workspace might happen to break these rules by putting directories not - # named "run_*/" or files directly in task_runs/. Thus, I use the term "task_run_child_fordpaths" - # instead of "run_dpaths". - task_run_child_fordpaths_to_keep = set() + # named "run_*/" or files directly in task_runs/. Thus, I use the term "task_run_child_paths" + # instead of "run_paths". 
+ task_run_child_paths_to_keep = set() if dbgym_workspace.dbgym_runs_path.exists(): - while symlink_fpaths_to_process: - symlink_fpath: Path = symlink_fpaths_to_process.pop() - assert symlink_fpath.is_symlink() + while symlink_paths_to_process: + symlink_path: Path = symlink_paths_to_process.pop() + assert symlink_path.is_symlink() # Path.resolve() resolves all layers of symlinks while os.readlink() only resolves one layer. # However, os.readlink() literally reads the string contents of the link. We need to do some # processing on the result of os.readlink() to convert it to an absolute path - real_fordpath = symlink_fpath.resolve() - one_layer_resolved_fordpath = os.readlink(symlink_fpath) - assert str(real_fordpath) == str( - os.readlink(symlink_fpath) - ), f"symlink_fpath ({symlink_fpath}) seems to point to *another* symlink. This is difficult to handle, so it is currently disallowed. Please resolve this situation manually." + real_path = symlink_path.resolve() + one_layer_resolved_path = os.readlink(symlink_path) + assert str(real_path) == str( + os.readlink(symlink_path) + ), f"symlink_path ({symlink_path}) seems to point to *another* symlink. This is difficult to handle, so it is currently disallowed. Please resolve this situation manually." # If the file doesn't exist, we'll just ignore it. - if not real_fordpath.exists(): + if not real_path.exists(): continue # We're only trying to figure out which direct children of task_runs/ to save. If the file isn't # even a descendant, we don't care about it. - if not is_child_path(real_fordpath, dbgym_workspace.dbgym_runs_path): + if not is_child_path(real_path, dbgym_workspace.dbgym_runs_path): continue - assert not real_fordpath.samefile(dbgym_workspace.dbgym_runs_path) + assert not real_path.samefile(dbgym_workspace.dbgym_runs_path) - # Figure out the task_run_child_fordpath to put into task_run_child_fordpaths_to_keep - task_run_child_fordpath = None - if parent_dpath_of_path(real_fordpath).samefile( - dbgym_workspace.dbgym_runs_path - ): + # Figure out the task_run_child_path to put into task_run_child_paths_to_keep + task_run_child_path = None + if parent_path_of_path(real_path).samefile(dbgym_workspace.dbgym_runs_path): # While it's true that it shouldn't be possible to symlink to a directory directly in task_runs/, # we'll just not delete it if the user happens to have one like this. Even if the user messed up # the structure somehow, it's just a good idea not to delete it. - task_run_child_fordpath = real_fordpath + task_run_child_path = real_path else: # Technically, it's not allowed to symlink to any files not in task_runs/run_*/[codebase]/[organization]/. # However, as with above, we won't just nuke files if the workspace doesn't follow this rule for # some reason. 
- task_run_child_fordpath = real_fordpath - while not parent_dpath_of_path(task_run_child_fordpath).samefile( + task_run_child_path = real_path + while not parent_path_of_path(task_run_child_path).samefile( dbgym_workspace.dbgym_runs_path ): - task_run_child_fordpath = parent_dpath_of_path( - task_run_child_fordpath - ) - assert task_run_child_fordpath != None - assert parent_dpath_of_path(task_run_child_fordpath).samefile( + task_run_child_path = parent_path_of_path(task_run_child_path) + assert task_run_child_path != None + assert parent_path_of_path(task_run_child_path).samefile( dbgym_workspace.dbgym_runs_path - ), f"task_run_child_fordpath ({task_run_child_fordpath}) is not a direct child of dbgym_workspace.dbgym_runs_path" - task_run_child_fordpaths_to_keep.add(task_run_child_fordpath) + ), f"task_run_child_path ({task_run_child_path}) is not a direct child of dbgym_workspace.dbgym_runs_path" + task_run_child_paths_to_keep.add(task_run_child_path) - # If on safe mode, add symlinks inside the task_run_child_fordpath to be processed + # If on safe mode, add symlinks inside the task_run_child_path to be processed if mode == "safe": - add_symlinks_in_dpath( - symlink_fpaths_to_process, - task_run_child_fordpath, + add_symlinks_in_path( + symlink_paths_to_process, + task_run_child_path, processed_symlinks, ) @@ -181,12 +177,12 @@ def clean_workspace( # It's true that symlinks might link outside of task_runs/*. We'll just not care about those starting_num_files = _count_files_in_workspace(dbgym_workspace) if dbgym_workspace.dbgym_runs_path.exists(): - for child_fordpath in dbgym_workspace.dbgym_runs_path.iterdir(): - if child_fordpath not in task_run_child_fordpaths_to_keep: - if child_fordpath.is_dir(): - shutil.rmtree(child_fordpath) + for child_path in dbgym_workspace.dbgym_runs_path.iterdir(): + if child_path not in task_run_child_paths_to_keep: + if child_path.is_dir(): + shutil.rmtree(child_path) else: - os.remove(child_fordpath) + os.remove(child_path) ending_num_files = _count_files_in_workspace(dbgym_workspace) if verbose: diff --git a/scripts/pat_test.sh b/scripts/pat_test.sh index dcbb9a90..c15a20c3 100755 --- a/scripts/pat_test.sh +++ b/scripts/pat_test.sh @@ -13,4 +13,4 @@ python3 task.py benchmark job workload --query-subset demo # postgres python3 task.py dbms postgres build -python3 task.py dbms postgres dbdata job --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH \ No newline at end of file +python3 task.py dbms postgres dbdata job --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-path $DBDATA_PARENT_PATH \ No newline at end of file diff --git a/task.py b/task.py index 1205999b..5579d9cf 100644 --- a/task.py +++ b/task.py @@ -18,9 +18,9 @@ def task(ctx: click.Context) -> None: dbgym_workspace = make_standard_dbgym_workspace() ctx.obj = dbgym_workspace - log_dpath = dbgym_workspace.cur_task_runs_artifacts_path(mkdir=True) - set_up_loggers(log_dpath) - set_up_warnings(log_dpath) + log_path = dbgym_workspace.cur_task_runs_artifacts_path(mkdir=True) + set_up_loggers(log_path) + set_up_warnings(log_path) if __name__ == "__main__": diff --git a/util/log.py b/util/log.py index 271fd92f..ae0ca936 100644 --- a/util/log.py +++ b/util/log.py @@ -8,7 +8,7 @@ DBGYM_OUTPUT_LOGGER_NAME = f"{DBGYM_LOGGER_NAME}.output" -def set_up_loggers(log_dpath: Path) -> None: +def set_up_loggers(log_path: Path) -> None: """ Set up everything related to the logging library. 
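The handler arrangement that _set_up_logger() builds appears across the next hunks: a quiet console handler plus a verbose per-run file handler under log_path. A standalone sketch of the same pattern, with an illustrative format string rather than the one util/log.py actually uses:

    import logging
    from pathlib import Path

    def make_run_logger(name: str, log_path: Path) -> logging.Logger:
        # Console stays quiet (ERROR and up); the per-run file captures
        # everything (DEBUG and up), matching _set_up_logger()'s defaults.
        logger = logging.getLogger(name)
        logger.setLevel(logging.DEBUG)
        formatter = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")

        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        console_handler.setLevel(logging.ERROR)
        logger.addHandler(console_handler)

        file_handler = logging.FileHandler(log_path / f"{name}.log")
        file_handler.setFormatter(formatter)
        file_handler.setLevel(logging.DEBUG)
        logger.addHandler(file_handler)
        return logger
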
@@ -21,7 +21,7 @@ def set_up_loggers(log_dpath: Path) -> None: _set_up_logger( logging.getLogger(DBGYM_LOGGER_NAME), log_format, - log_dpath / f"{DBGYM_LOGGER_NAME}.log", + log_path / f"{DBGYM_LOGGER_NAME}.log", ) # The output logger is meant to output things to the console. We use it instead of using print to indicate that something is @@ -40,7 +40,7 @@ def set_up_loggers(log_dpath: Path) -> None: def _set_up_logger( logger: Logger, format: str, - output_log_fpath: Optional[Path], + output_log_path: Optional[Path], console_level: int = logging.ERROR, file_level: int = logging.DEBUG, ) -> None: @@ -55,18 +55,18 @@ def _set_up_logger( logger.addHandler(console_handler) # Let it output everything to the output file. - if output_log_fpath is not None: - file_handler = logging.FileHandler(output_log_fpath) + if output_log_path is not None: + file_handler = logging.FileHandler(output_log_path) file_handler.setFormatter(formatter) file_handler.setLevel(file_level) logger.addHandler(file_handler) -def set_up_warnings(log_dpath: Path) -> None: +def set_up_warnings(log_path: Path) -> None: """ Some libraries (like torch) use warnings instead of logging for warnings. I want to redirect these too to avoid cluttering the console. """ - warnings_fpath = log_dpath / "warnings.log" + warnings_path = log_path / "warnings.log" def write_warning_to_file( message: Any, @@ -76,7 +76,7 @@ def write_warning_to_file( file: Optional[Any] = None, line: Optional[Any] = None, ) -> None: - with open(warnings_fpath, "a") as f: + with open(warnings_path, "a") as f: f.write(f"{filename}:{lineno}: {category.__name__}: {message}\n") warnings.showwarning = write_warning_to_file diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 89a1c6c1..3a0f11fc 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -273,7 +273,7 @@ def test_link_result_cannot_link_symlink(self) -> None: ) with self.assertRaisesRegex( AssertionError, - "result_fordpath \(.*\) should be a fully resolved path", + "result_path \(.*\) should be a fully resolved path", ): self.workspace.link_result(symlink_path) @@ -452,7 +452,7 @@ def test_save_file_generated_this_run_raises_error(self) -> None: result_path = self.make_result_helper() with self.assertRaisesRegex( AssertionError, - "fpath \(.*\) was generated in this task run \(.*\)\. You do not need to save it", + "path \(.*\) was generated in this task run \(.*\)\. You do not need to save it", ): self.workspace.save_file(result_path) diff --git a/util/workspace.py b/util/workspace.py index 29075847..0db0beb6 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -47,7 +47,7 @@ def get_latest_run_path_from_workspace_path(workspace_path: Path) -> Path: # Paths of config files in the codebase. These are always relative paths. # The reason these can be relative paths instead of functions taking in codebase_path as input is because relative paths are relative to the codebase root -DEFAULT_BOOT_CONFIG_FPATH = POSTGRES_PATH / "default_boot_config.yaml" +DEFAULT_BOOT_CONFIG_PATH = POSTGRES_PATH / "default_boot_config.yaml" # Paths of dependencies in the workspace. 
These are named "*_path" because they will be an absolute path @@ -87,7 +87,7 @@ def get_scale_factor_string(scale_factor: float | str) -> str: return str(scale_factor).replace(".", "point") -def get_default_dbdata_parent_dpath(workspace_path: Path) -> Path: +def get_default_dbdata_parent_path(workspace_path: Path) -> Path: return get_tmp_path_from_workspace_path(workspace_path) @@ -106,7 +106,7 @@ def __init__(self, dbgym_workspace_path: Path): DBGymWorkspace.num_times_created_this_run == 1 ), f"DBGymWorkspace has been created {DBGymWorkspace.num_times_created_this_run} times. It should only be created once per run." - self.base_dbgym_repo_dpath = get_base_dbgym_repo_dpath() + self.base_dbgym_repo_path = get_base_dbgym_repo_path() self.cur_path_list: list[str] = ["dbgym"] self.app_name = ( "dbgym" # TODO: discover this dynamically. app means dbgym or an agent @@ -169,40 +169,40 @@ def __init__(self, dbgym_workspace_path: Path): # TODO(phw2): refactor our manual symlinking in postgres/cli.py to use link_result() instead def link_result( self, - result_fordpath: Path, + result_path: Path, custom_link_name: Optional[str] = None, ) -> Path: """ - result_fordpath must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path. - Further, result_fordpath must have been generated by this invocation to task.py. This also means that - result_fordpath itself can be a file or a dir but not a symlink. + result_path must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path. + Further, result_path must have been generated by this invocation to task.py. This also means that + result_path itself can be a file or a dir but not a symlink. Given a file or directory in task_runs/run_*/[codebase]/[org], this will create a symlink inside symlinks/[codebase]/[org]/. Will override the old symlink if there is one, so that symlinks/ always contains the latest generated version of a file. This function will return the path to the symlink that was created. 
""" - assert isinstance(result_fordpath, Path) + assert isinstance(result_path, Path) assert is_fully_resolved( - result_fordpath - ), f"result_fordpath ({result_fordpath}) should be a fully resolved path" + result_path + ), f"result_path ({result_path}) should be a fully resolved path" assert is_child_path( - result_fordpath, self.dbgym_this_run_path + result_path, self.dbgym_this_run_path ), "The result must have been generated in *this* run_*/ dir" - assert not os.path.islink(result_fordpath) + assert not os.path.islink(result_path) if type(custom_link_name) is str: link_name = custom_link_name else: - if os.path.isfile(result_fordpath): - link_name = basename_of_path(result_fordpath) + ".link" - elif os.path.isdir(result_fordpath): - link_name = basename_of_path(result_fordpath) + ".link" + if os.path.isfile(result_path): + link_name = basename_of_path(result_path) + ".link" + elif os.path.isdir(result_path): + link_name = basename_of_path(result_path) + ".link" else: - raise AssertionError("result_fordpath must be either a file or dir") + raise AssertionError("result_path must be either a file or dir") - symlink_parent_dpath = self.dbgym_symlinks_path / self.app_name - symlink_parent_dpath.mkdir(parents=True, exist_ok=True) + symlink_parent_path = self.dbgym_symlinks_path / self.app_name + symlink_parent_path.mkdir(parents=True, exist_ok=True) # Remove the old symlink ("old" meaning created in an earlier run) if there is one # Note that in a multi-threaded setting, this might remove one created by a process in the same run, @@ -211,21 +211,21 @@ def link_result( assert link_name.endswith(".link") and not link_name.endswith( ".link.link" ), f'link_name ({link_name}) should end with ".link"' - symlink_path = symlink_parent_dpath / link_name + symlink_path = symlink_parent_path / link_name try_remove_file(symlink_path) - try_create_symlink(result_fordpath, symlink_path) + try_create_symlink(result_path, symlink_path) return symlink_path - def get_run_dpath_from_fpath(self, fpath: Path) -> Path: - run_dpath = fpath - while not parent_dpath_of_path(run_dpath).samefile(self.dbgym_runs_path): - run_dpath = parent_dpath_of_path(run_dpath) - return run_dpath + def get_run_path_from_path(self, path: Path) -> Path: + run_path = path + while not parent_path_of_path(run_path).samefile(self.dbgym_runs_path): + run_path = parent_path_of_path(run_path) + return run_path # TODO(phw2): really look at the clean PR to see what it changed # TODO(phw2): after merging agent-train, refactor some code in agent-train to use save_file() instead of open_and_save() - def save_file(self, fpath: Path) -> None: + def save_file(self, path: Path) -> None: """ If an external function takes in a file/directory as input, you will not be able to call open_and_save(). In these situations, just call save_file(). @@ -243,88 +243,88 @@ def save_file(self, fpath: Path) -> None: - If you save two dependencies with the same *outermost* directory, or two dependencies with the same filename both directly inside run_*/, the second save will overwrite the first. 
""" - # validate fpath - assert isinstance(fpath, Path) - assert not os.path.islink(fpath), f"fpath ({fpath}) should not be a symlink" - assert os.path.exists(fpath), f"fpath ({fpath}) does not exist" - assert os.path.isfile(fpath), f"fpath ({fpath}) is not a file" + # validate path + assert isinstance(path, Path) + assert not os.path.islink(path), f"path ({path}) should not be a symlink" + assert os.path.exists(path), f"path ({path}) does not exist" + assert os.path.isfile(path), f"path ({path}) is not a file" assert not is_child_path( - fpath, self.dbgym_this_run_path - ), f"fpath ({fpath}) was generated in this task run ({self.dbgym_this_run_path}). You do not need to save it" + path, self.dbgym_this_run_path + ), f"path ({path}) was generated in this task run ({self.dbgym_this_run_path}). You do not need to save it" # Save _something_ to dbgym_this_run_path. # Save a symlink if the opened file was generated by a run. This is for two reasons: # 1. Files or dirs generated by a run are supposed to be immutable so saving a symlink is safe. # 2. Files or dirs generated by a run may be very large (up to 100s of GBs) so we don't want to copy them. - if is_child_path(fpath, self.dbgym_runs_path): - # If the fpath file is directly in run_dpath, we symlink the file directly. - run_dpath = self.get_run_dpath_from_fpath(fpath) - parent_dpath = parent_dpath_of_path(fpath) - if parent_dpath.samefile(run_dpath): - fname = basename_of_path(fpath) - symlink_fpath = self.dbgym_this_run_path / (fname + ".link") - try_remove_file(symlink_fpath) - try_create_symlink(fpath, symlink_fpath) - # Otherwise, we know the fpath file is _not_ directly inside run_dpath dir. - # We go as far back as we can while still staying in run_dpath and symlink that "base" dir. - # This is because lots of runs create dirs within run_dpath and it creates too much clutter to symlink every individual file. + if is_child_path(path, self.dbgym_runs_path): + # If the path file is directly in run_path, we symlink the file directly. + run_path = self.get_run_path_from_path(path) + parent_path = parent_path_of_path(path) + if parent_path.samefile(run_path): + fname = basename_of_path(path) + symlink_path = self.dbgym_this_run_path / (fname + ".link") + try_remove_file(symlink_path) + try_create_symlink(path, symlink_path) + # Otherwise, we know the path file is _not_ directly inside run_path dir. + # We go as far back as we can while still staying in run_path and symlink that "base" dir. + # This is because lots of runs create dirs within run_path and it creates too much clutter to symlink every individual file. # Further, this avoids an edge case where you both save a file and the dir it's in. else: - # Set base_dpath such that its parent is run_dpath. - base_dpath = parent_dpath - while not parent_dpath_of_path(base_dpath).samefile(run_dpath): - base_dpath = parent_dpath_of_path(base_dpath) + # Set base_path such that its parent is run_path. 
+ base_path = parent_path + while not parent_path_of_path(base_path).samefile(run_path): + base_path = parent_path_of_path(base_path) # Create symlink - open_base_dname = basename_of_path(base_dpath) - symlink_dpath = self.dbgym_this_run_path / (open_base_dname + ".link") - try_remove_file(symlink_dpath) - try_create_symlink(base_dpath, symlink_dpath) + open_base_dname = basename_of_path(base_path) + symlink_path = self.dbgym_this_run_path / (open_base_dname + ".link") + try_remove_file(symlink_path) + try_create_symlink(base_path, symlink_path) # If the file wasn't generated by a run, we can't just symlink it because we don't know that it's immutable. else: - fname = basename_of_path(fpath) + fname = basename_of_path(path) # In this case, we want to copy instead of symlinking since it might disappear in the future. - copy_fpath = self.dbgym_this_run_path / fname - shutil.copy(fpath, copy_fpath) + copy_path = self.dbgym_this_run_path / fname + shutil.copy(path, copy_path) - def open_and_save(self, open_fpath: Path, mode: str = "r") -> IO[Any]: + def open_and_save(self, open_path: Path, mode: str = "r") -> IO[Any]: """ Open a file and "save" it to [workspace]/task_runs/run_*/. It takes in a str | Path to match the interface of open(). - This file does not work if open_fpath is a symlink, to make its interface identical to that of open(). + This file does not work if open_path is a symlink, to make its interface identical to that of open(). Make sure to resolve all symlinks with fully_resolve_path(). To avoid confusion, I'm enforcing this function to only work with absolute paths. # TODO: maybe make it work on non-fully-resolved paths to better match open() See the comment of save_file() for what "saving" means If you are generating a "result" for the run, _do not_ use this. Just use the normal open(). - This shouldn't be too hard to remember because this function crashes if open_fpath doesn't exist, - and when you write results you're usually opening open_fpaths which do not exist. + This shouldn't be too hard to remember because this function crashes if open_path doesn't exist, + and when you write results you're usually opening open_paths which do not exist. """ - # Validate open_fpath - assert isinstance(open_fpath, Path) + # Validate open_path + assert isinstance(open_path, Path) assert is_fully_resolved( - open_fpath - ), f"open_and_save(): open_fpath ({open_fpath}) should be a fully resolved path" + open_path + ), f"open_and_save(): open_path ({open_path}) should be a fully resolved path" assert not os.path.islink( - open_fpath - ), f"open_fpath ({open_fpath}) should not be a symlink" - assert os.path.exists(open_fpath), f"open_fpath ({open_fpath}) does not exist" + open_path + ), f"open_path ({open_path}) should not be a symlink" + assert os.path.exists(open_path), f"open_path ({open_path}) does not exist" # `open_and_save`` *must* be called on files because it doesn't make sense to open a directory. note that this doesn't mean we'll always save # a file though. we sometimes save a directory (see save_file() for details) - assert os.path.isfile(open_fpath), f"open_fpath ({open_fpath}) is not a file" + assert os.path.isfile(open_path), f"open_path ({open_path}) is not a file" # Save - self.save_file(open_fpath) + self.save_file(open_path) # Open - return open(open_fpath, mode=mode) + return open(open_path, mode=mode) # `append_group()` is used to mark the "codebase path" of an invocation of the CLI. The "codebase path" is explained further in the documentation. 
def append_group(self, name: str) -> None: self.cur_path_list.append(name) def cur_source_path(self, *dirs: str) -> Path: - cur_path = self.base_dbgym_repo_dpath + cur_path = self.base_dbgym_repo_path assert self.cur_path_list[0] == "dbgym" for folder in self.cur_path_list[1:]: cur_path = cur_path / folder @@ -408,10 +408,10 @@ def fully_resolve_path(inputpath: os.PathLike[str]) -> Path: realabspath = Path(inputpath) # `expanduser()` is always "ok" to call first. realabspath = realabspath.expanduser() - # The reason we don't call Path.absolute() is because the path should be relative to get_base_dbgym_repo_dpath(), + # The reason we don't call Path.absolute() is because the path should be relative to get_base_dbgym_repo_path(), # which is not necessary where cwd() points at the time of calling this function. if not realabspath.is_absolute(): - realabspath = get_base_dbgym_repo_dpath() / realabspath + realabspath = get_base_dbgym_repo_path() / realabspath # `resolve()` has two uses: normalize the path (remove ..) and resolve symlinks. # I believe the pathlib library (https://docs.python.org/3/library/pathlib.html#pathlib.Path.resolve) does these together this # way to avoid an edge case related to symlinks and normalizing paths (footnote 1 of the linked docs) @@ -422,15 +422,15 @@ def fully_resolve_path(inputpath: os.PathLike[str]) -> Path: return realabspath -def get_base_dbgym_repo_dpath() -> Path: +def get_base_dbgym_repo_path() -> Path: path = Path(os.getcwd()) - assert _is_base_dbgym_repo_dpath( + assert _is_base_dbgym_repo_path( path ), "This script should be invoked from the root of the dbgym repo." return path -def _is_base_dbgym_repo_dpath(path: Path) -> bool: +def _is_base_dbgym_repo_path(path: Path) -> bool: """ Returns whether we are in the base directory of some git repository """ @@ -480,121 +480,121 @@ def is_fully_resolved(path: Path) -> bool: return str(resolved_path) == str(path) -def parent_dpath_of_path(dpath: Path) -> Path: +def parent_path_of_path(path: Path) -> Path: """ This function only calls Path.parent, but in a safer way. """ - assert isinstance(dpath, Path) + assert isinstance(path, Path) assert is_fully_resolved( - dpath - ), f"dpath must be fully resolved because Path.parent has weird behavior on non-resolved paths (see https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.parent)" - parent_dpath = dpath.parent - assert isinstance(parent_dpath, Path) - return parent_dpath + path + ), f"path must be fully resolved because Path.parent has weird behavior on non-resolved paths (see https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.parent)" + parent_path = path.parent + assert isinstance(parent_path, Path) + return parent_path -def basename_of_path(dpath: Path) -> str: +def basename_of_path(path: Path) -> str: """ This function only calls Path.name, but in a safer way. """ - assert isinstance(dpath, Path) + assert isinstance(path, Path) assert is_fully_resolved( - dpath - ), f'dpath must be fully resolved because Path.name has weird behavior on non-resolved paths (like giving ".." if the path ends with a "..")' - dpath_dirname, dpath_basename = os.path.split(dpath) + path + ), f'path must be fully resolved because Path.name has weird behavior on non-resolved paths (like giving ".." 
if the path ends with a "..")' + path_dirname, path_basename = os.path.split(path) # this means the path ended with a '/' so all os.path.split() does is get rid of the slash - if dpath_basename == "": - return os.path.basename(dpath_dirname) + if path_basename == "": + return os.path.basename(path_dirname) else: - return dpath_basename + return path_basename # TODO(phw2): refactor to use Path -def is_child_path(child_path: os.PathLike[str], parent_dpath: os.PathLike[str]) -> bool: +def is_child_path(child_path: os.PathLike[str], parent_path: os.PathLike[str]) -> bool: """ - Checks whether child_path refers to a file/dir/link that is a child of the dir referred to by parent_dpath + Checks whether child_path refers to a file/dir/link that is a child of the dir referred to by parent_path If the two paths are equal, this function returns FALSE """ - assert os.path.isdir(parent_dpath) - if os.path.samefile(child_path, parent_dpath): + assert os.path.isdir(parent_path) + if os.path.samefile(child_path, parent_path): return False else: return os.path.samefile( - os.path.commonpath([parent_dpath, child_path]), parent_dpath + os.path.commonpath([parent_path, child_path]), parent_path ) # TODO(phw2): deprecate this once I'm done with unittest_workspace.py def open_and_save( - dbgym_workspace: DBGymWorkspace, open_fpath: Path, mode: str = "r" + dbgym_workspace: DBGymWorkspace, open_path: Path, mode: str = "r" ) -> IO[Any]: """ Open a file and "save" it to [workspace]/task_runs/run_*/. It takes in a str | Path to match the interface of open(). - This file does not work if open_fpath is a symlink, to make its interface identical to that of open(). + This file does not work if open_path is a symlink, to make its interface identical to that of open(). Make sure to resolve all symlinks with fully_resolve_path(). To avoid confusion, I'm enforcing this function to only work with absolute paths. # TODO: maybe make it work on non-fully-resolved paths to better match open() See the comment of save_file() for what "saving" means If you are generating a "result" for the run, _do not_ use this. Just use the normal open(). - This shouldn't be too hard to remember because this function crashes if open_fpath doesn't exist, - and when you write results you're usually opening open_fpaths which do not exist. + This shouldn't be too hard to remember because this function crashes if open_path doesn't exist, + and when you write results you're usually opening open_paths which do not exist. """ - # validate open_fpath - assert isinstance(open_fpath, Path) + # validate open_path + assert isinstance(open_path, Path) assert is_fully_resolved( - open_fpath - ), f"open_and_save(): open_fpath ({open_fpath}) should be a fully resolved path" + open_path + ), f"open_and_save(): open_path ({open_path}) should be a fully resolved path" assert not os.path.islink( - open_fpath - ), f"open_fpath ({open_fpath}) should not be a symlink" - assert os.path.exists(open_fpath), f"open_fpath ({open_fpath}) does not exist" + open_path + ), f"open_path ({open_path}) should not be a symlink" + assert os.path.exists(open_path), f"open_path ({open_path}) does not exist" # open_and_save *must* be called on files because it doesn't make sense to open a directory. note that this doesn't mean we'll always save # a file though. 
we sometimes save a directory (see save_file() for details) - assert os.path.isfile(open_fpath), f"open_fpath ({open_fpath}) is not a file" + assert os.path.isfile(open_path), f"open_path ({open_path}) is not a file" # save - save_file(dbgym_workspace, open_fpath) + save_file(dbgym_workspace, open_path) # open - return open(open_fpath, mode=mode) + return open(open_path, mode=mode) -def extract_from_task_run_fordpath( - dbgym_workspace: DBGymWorkspace, task_run_fordpath: Path +def extract_from_task_run_path( + dbgym_workspace: DBGymWorkspace, task_run_path: Path ) -> tuple[Path, str, Path, str]: """ The task_runs/ folder is organized like task_runs/run_*/[codebase]/[org]/any/path/you/want. This function extracts the [codebase] and [org] components """ - assert isinstance(task_run_fordpath, Path) - assert not task_run_fordpath.is_symlink() - parent_dpath = task_run_fordpath.parent + assert isinstance(task_run_path, Path) + assert not task_run_path.is_symlink() + parent_path = task_run_path.parent # TODO(phw2): make this a common function - assert not parent_dpath.samefile( + assert not parent_path.samefile( dbgym_workspace.dbgym_runs_path - ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/ dir instead of directly in dbgym_workspace.dbgym_runs_path ({dbgym_workspace.dbgym_runs_path})" - assert not parent_dpath_of_path(parent_dpath).samefile( + ), f"task_run_path ({task_run_path}) should be inside a run_*/ dir instead of directly in dbgym_workspace.dbgym_runs_path ({dbgym_workspace.dbgym_runs_path})" + assert not parent_path_of_path(parent_path).samefile( dbgym_workspace.dbgym_runs_path - ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/[codebase]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})" - assert not parent_dpath_of_path(parent_dpath_of_path(parent_dpath)).samefile( + ), f"task_run_path ({task_run_path}) should be inside a run_*/[codebase]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})" + assert not parent_path_of_path(parent_path_of_path(parent_path)).samefile( dbgym_workspace.dbgym_runs_path - ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/[codebase]/[organization]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})" - # org_dpath is the run_*/[codebase]/[organization]/ dir that task_run_fordpath is in - org_dpath = parent_dpath - while not parent_dpath_of_path( - parent_dpath_of_path(parent_dpath_of_path(org_dpath)) + ), f"task_run_path ({task_run_path}) should be inside a run_*/[codebase]/[organization]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})" + # org_path is the run_*/[codebase]/[organization]/ dir that task_run_path is in + org_path = parent_path + while not parent_path_of_path( + parent_path_of_path(parent_path_of_path(org_path)) ).samefile(dbgym_workspace.dbgym_runs_path): - org_dpath = parent_dpath_of_path(org_dpath) - org_dname = basename_of_path(org_dpath) - codebase_dpath = parent_dpath_of_path(org_dpath) - codebase_dname = basename_of_path(codebase_dpath) + org_path = parent_path_of_path(org_path) + org_dname = basename_of_path(org_path) + codebase_path = parent_path_of_path(org_path) + codebase_dname = basename_of_path(codebase_path) - return codebase_dpath, codebase_dname, org_dpath, org_dname + return codebase_path, codebase_dname, org_path, org_dname # TODO(phw2): deprecate this once I'm done with unittest_workspace.py -def save_file(dbgym_workspace: DBGymWorkspace, fpath: Path) -> None: +def 
save_file(dbgym_workspace: DBGymWorkspace, path: Path) -> None: """ If an external function takes in a file/directory as input, you will not be able to call open_and_save(). In these situations, just call save_file(). @@ -604,102 +604,102 @@ def save_file(dbgym_workspace: DBGymWorkspace, fpath: Path) -> None: We create a symlink if it is a "dependency", meaning a task.py command was run to generate it In these cases we create a symlink so we have full provenance for how the dependency was created """ - # validate fpath - assert is_fully_resolved(fpath), f"fpath ({fpath}) should be a fully resolved path" - assert os.path.isfile(fpath), f"fpath ({fpath}) should be a file" + # validate path + assert is_fully_resolved(path), f"path ({path}) should be a fully resolved path" + assert os.path.isfile(path), f"path ({path}) should be a file" assert not is_child_path( - fpath, dbgym_workspace.dbgym_this_run_path - ), f"fpath ({fpath}) was generated in this task run ({dbgym_workspace.dbgym_this_run_path}). You do not need to save it" + path, dbgym_workspace.dbgym_this_run_path + ), f"path ({path}) was generated in this task run ({dbgym_workspace.dbgym_this_run_path}). You do not need to save it" # save _something_ to dbgym_this_run_path # save a symlink if the opened file was generated by a run. this is for two reasons: # 1. files or dirs generated by a run are supposed to be immutable so saving a symlink is safe # 2. files or dirs generated by a run may be very large (up to 100s of GBs) so we don't want to copy them - if is_child_path(fpath, dbgym_workspace.dbgym_runs_path): + if is_child_path(path, dbgym_workspace.dbgym_runs_path): # get paths we'll need later. - _, codebase_dname, org_dpath, org_dname = extract_from_task_run_fordpath( - dbgym_workspace, fpath + _, codebase_dname, org_path, org_dname = extract_from_task_run_path( + dbgym_workspace, path ) - this_run_save_dpath = ( + this_run_save_path = ( dbgym_workspace.dbgym_this_run_path / codebase_dname / org_dname ) - os.makedirs(this_run_save_dpath, exist_ok=True) - - # if the fpath file is directly in org_dpath, we symlink the file directly - parent_dpath = parent_dpath_of_path(fpath) - if parent_dpath.samefile(org_dpath): - fname = basename_of_path(fpath) - symlink_fpath = this_run_save_dpath / (fname + ".link") - try_create_symlink(fpath, symlink_fpath) - # else, we know the fpath file is _not_ directly inside org_dpath dir - # we go as far back as we can while still staying in org_dpath and symlink that "base" dir - # this is because lots of runs create dirs within org_dpath and it's just a waste of space to symlink every individual file + os.makedirs(this_run_save_path, exist_ok=True) + + # if the path file is directly in org_path, we symlink the file directly + parent_path = parent_path_of_path(path) + if parent_path.samefile(org_path): + fname = basename_of_path(path) + symlink_path = this_run_save_path / (fname + ".link") + try_create_symlink(path, symlink_path) + # else, we know the path file is _not_ directly inside org_path dir + # we go as far back as we can while still staying in org_path and symlink that "base" dir + # this is because lots of runs create dirs within org_path and it's just a waste of space to symlink every individual file else: - # set base_dpath such that its parent is org_dpath - base_dpath = parent_dpath - while not parent_dpath_of_path(base_dpath).samefile(org_dpath): - base_dpath = parent_dpath_of_path(base_dpath) + # set base_path such that its parent is org_path + base_path = parent_path + while not 
parent_path_of_path(base_path).samefile(org_path): + base_path = parent_path_of_path(base_path) # create symlink - open_base_dname = basename_of_path(base_dpath) - symlink_dpath = this_run_save_dpath / (open_base_dname + ".link") - try_create_symlink(base_dpath, symlink_dpath) + open_base_dname = basename_of_path(base_path) + symlink_path = this_run_save_path / (open_base_dname + ".link") + try_create_symlink(base_path, symlink_path) # if it wasn't generated by a run else: # since we don't know where the file is at all, the location is "unknown" and the org is "all" - this_run_save_dpath = dbgym_workspace.dbgym_this_run_path / "unknown" / "all" - os.makedirs(this_run_save_dpath, exist_ok=True) - fname = basename_of_path(fpath) + this_run_save_path = dbgym_workspace.dbgym_this_run_path / "unknown" / "all" + os.makedirs(this_run_save_path, exist_ok=True) + fname = basename_of_path(path) # in this case, we want to copy instead of symlinking since it might disappear in the future - copy_fpath = this_run_save_dpath / fname - shutil.copy(fpath, copy_fpath) + copy_path = this_run_save_path / fname + shutil.copy(path, copy_path) # TODO(phw2): deprecate this once I'm done with unittest_workspace.py def link_result( dbgym_workspace: DBGymWorkspace, - result_fordpath: Path, + result_path: Path, custom_result_name: Optional[str] = None, ) -> Path: """ - result_fordpath must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path. - Further, result_fordpath must have been generated by this invocation to task.py. This also means that - result_fordpath itself can be a file or a dir but not a symlink. + result_path must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path. + Further, result_path must have been generated by this invocation to task.py. This also means that + result_path itself can be a file or a dir but not a symlink. Given a file or directory in task_runs/run_*/[codebase]/[org], this will create a symlink inside symlinks/[codebase]/[org]/. Will override the old symlink if there is one, so that symlinks/ always contains the latest generated version of a file. This function will return the path to the symlink that was created. 
""" - assert isinstance(result_fordpath, Path) + assert isinstance(result_path, Path) assert is_fully_resolved( - result_fordpath - ), f"result_fordpath ({result_fordpath}) should be a fully resolved path" - assert is_child_path(result_fordpath, dbgym_workspace.dbgym_this_run_path) - assert not os.path.islink(result_fordpath) + result_path + ), f"result_path ({result_path}) should be a fully resolved path" + assert is_child_path(result_path, dbgym_workspace.dbgym_this_run_path) + assert not os.path.islink(result_path) if type(custom_result_name) is str: result_name = custom_result_name else: - if os.path.isfile(result_fordpath): - result_name = basename_of_path(result_fordpath) + ".link" - elif os.path.isdir(result_fordpath): - result_name = basename_of_path(result_fordpath) + ".link" + if os.path.isfile(result_path): + result_name = basename_of_path(result_path) + ".link" + elif os.path.isdir(result_path): + result_name = basename_of_path(result_path) + ".link" else: - raise AssertionError("result_fordpath must be either a file or dir") + raise AssertionError("result_path must be either a file or dir") # Figure out the parent directory path of the symlink - codebase_dpath, codebase_dname, _, org_dname = extract_from_task_run_fordpath( - dbgym_workspace, result_fordpath + codebase_path, codebase_dname, _, org_dname = extract_from_task_run_path( + dbgym_workspace, result_path ) # We're only supposed to save files generated by us, which means they should be in cur_task_runs_path() - assert codebase_dpath.samefile( + assert codebase_path.samefile( dbgym_workspace.cur_task_runs_path() ), f"link_result should only be called on files generated by this invocation to task.py" - symlink_parent_dpath = ( + symlink_parent_path = ( dbgym_workspace.dbgym_symlinks_path / codebase_dname / org_dname ) - symlink_parent_dpath.mkdir(parents=True, exist_ok=True) + symlink_parent_path.mkdir(parents=True, exist_ok=True) # Remove the old symlink ("old" meaning created in an earlier run) if there is one # Note that in a multi-threaded setting, this might remove one created by a process in the same run, @@ -708,9 +708,9 @@ def link_result( assert result_name.endswith(".link") and not result_name.endswith( ".link.link" ), f'result_name ({result_name}) should end with ".link"' - symlink_path = symlink_parent_dpath / result_name + symlink_path = symlink_parent_path / result_name try_remove_file(symlink_path) - try_create_symlink(result_fordpath, symlink_path) + try_create_symlink(result_path, symlink_path) return symlink_path From 1dcf6697f8c73785009fd70ec9637b0db054087a Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 29 Dec 2024 21:55:09 -0500 Subject: [PATCH 15/24] dbms integtest now uses intended_dbdata_hardware --- .github/workflows/tests.yaml | 2 +- dbms/postgres/cli.py | 4 +++- dbms/tests/integtest_dbms.py | 6 +++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index a8fec259..da3543d5 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -52,7 +52,7 @@ jobs: # Integration tests do require external systems to be running (most commonly a database instance). # Unlike end-to-end tests though, they test a specific module in a detailed manner, much like a unit test does. env: - # We set `INTENDED_DBDATA_HARDWARE` so that it's seen when `integtest_pg_conn.py` executes `_set_up_gymlib_integtest_workspace.sh`. + # The CI runs on ssd so we have to set this. INTENDED_DBDATA_HARDWARE: ssd run: | . 
"$HOME/.cargo/env" diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index dbc46875..b364ff90 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -171,7 +171,9 @@ def _postgres_dbdata( dbdata_parent_path ), f"Intended hardware is SSD but dbdata_parent_path ({dbdata_parent_path}) is an HDD" else: - assert False + assert ( + False + ), f'Intended hardware is "{intended_dbdata_hardware}" which is invalid' # Create dbdata _create_dbdata( diff --git a/dbms/tests/integtest_dbms.py b/dbms/tests/integtest_dbms.py index ed1a646d..466030bd 100644 --- a/dbms/tests/integtest_dbms.py +++ b/dbms/tests/integtest_dbms.py @@ -1,3 +1,4 @@ +import os import shutil import unittest from pathlib import Path @@ -54,7 +55,10 @@ def test_postgres_dbdata(self) -> None: self.workspace.dbgym_workspace_path, "tpch", scale_factor ) self.assertFalse(dbdata_tgz_path.exists()) - _postgres_dbdata(self.workspace, "tpch", scale_factor, None, "hdd", None) + intended_dbdata_hardware = os.environ.get("INTENDED_DBDATA_HARDWARE", "hdd") + _postgres_dbdata( + self.workspace, "tpch", scale_factor, None, intended_dbdata_hardware, None + ) self.assertTrue(dbdata_tgz_path.exists()) self.assertTrue(fully_resolve_path(dbdata_tgz_path).exists()) From b5fd76ea5159b584f1c3300de15f77d4a85e43c4 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 13:36:51 -0500 Subject: [PATCH 16/24] fully removed the old open_and_save, save_file, and link_result --- benchmark/job/cli.py | 7 +- env/pg_conn.py | 2 +- util/pg.py | 2 +- util/workspace.py | 158 ------------------------------------------- 4 files changed, 3 insertions(+), 166 deletions(-) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index c467e823..136c0b19 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -11,12 +11,7 @@ from benchmark.constants import DEFAULT_SCALE_FACTOR from util.log import DBGYM_LOGGER_NAME from util.shell import subprocess_run -from util.workspace import ( - DBGymWorkspace, - fully_resolve_path, - is_fully_resolved, - link_result, -) +from util.workspace import DBGymWorkspace, fully_resolve_path JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz" JOB_QUERIES_URL = "https://event.cwi.nl/da/job/job.tgz" diff --git a/env/pg_conn.py b/env/pg_conn.py index bbeee54d..540e6134 100644 --- a/env/pg_conn.py +++ b/env/pg_conn.py @@ -24,7 +24,7 @@ from util.log import DBGYM_LOGGER_NAME from util.pg import DBGYM_POSTGRES_DBNAME, SHARED_PRELOAD_LIBRARIES, get_kv_connstr -from util.workspace import DBGymWorkspace, open_and_save, parent_path_of_path +from util.workspace import DBGymWorkspace, parent_path_of_path CONNECT_TIMEOUT = 300 diff --git a/util/pg.py b/util/pg.py index a959e11b..23c06b60 100644 --- a/util/pg.py +++ b/util/pg.py @@ -11,7 +11,7 @@ import sqlalchemy from sqlalchemy import create_engine, text -from util.workspace import DBGymWorkspace, open_and_save +from util.workspace import DBGymWorkspace DBGYM_POSTGRES_USER = "dbgym_user" DBGYM_POSTGRES_PASS = "dbgym_pass" diff --git a/util/workspace.py b/util/workspace.py index 0db0beb6..286488ec 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -524,42 +524,6 @@ def is_child_path(child_path: os.PathLike[str], parent_path: os.PathLike[str]) - ) -# TODO(phw2): deprecate this once I'm done with unittest_workspace.py -def open_and_save( - dbgym_workspace: DBGymWorkspace, open_path: Path, mode: str = "r" -) -> IO[Any]: - """ - Open a file and "save" it to [workspace]/task_runs/run_*/. - It takes in a str | Path to match the interface of open(). 
- This file does not work if open_path is a symlink, to make its interface identical to that of open(). - Make sure to resolve all symlinks with fully_resolve_path(). - To avoid confusion, I'm enforcing this function to only work with absolute paths. - # TODO: maybe make it work on non-fully-resolved paths to better match open() - See the comment of save_file() for what "saving" means - If you are generating a "result" for the run, _do not_ use this. Just use the normal open(). - This shouldn't be too hard to remember because this function crashes if open_path doesn't exist, - and when you write results you're usually opening open_paths which do not exist. - """ - # validate open_path - assert isinstance(open_path, Path) - assert is_fully_resolved( - open_path - ), f"open_and_save(): open_path ({open_path}) should be a fully resolved path" - assert not os.path.islink( - open_path - ), f"open_path ({open_path}) should not be a symlink" - assert os.path.exists(open_path), f"open_path ({open_path}) does not exist" - # open_and_save *must* be called on files because it doesn't make sense to open a directory. note that this doesn't mean we'll always save - # a file though. we sometimes save a directory (see save_file() for details) - assert os.path.isfile(open_path), f"open_path ({open_path}) is not a file" - - # save - save_file(dbgym_workspace, open_path) - - # open - return open(open_path, mode=mode) - - def extract_from_task_run_path( dbgym_workspace: DBGymWorkspace, task_run_path: Path ) -> tuple[Path, str, Path, str]: @@ -593,128 +557,6 @@ def extract_from_task_run_path( return codebase_path, codebase_dname, org_path, org_dname -# TODO(phw2): deprecate this once I'm done with unittest_workspace.py -def save_file(dbgym_workspace: DBGymWorkspace, path: Path) -> None: - """ - If an external function takes in a file/directory as input, you will not be able to call open_and_save(). - In these situations, just call save_file(). - Like open_and_save(), this function only works with real absolute paths. - "Saving" can mean either copying the file or creating a symlink to it - We copy the file if it is a "config", meaning it just exists without having been generated - We create a symlink if it is a "dependency", meaning a task.py command was run to generate it - In these cases we create a symlink so we have full provenance for how the dependency was created - """ - # validate path - assert is_fully_resolved(path), f"path ({path}) should be a fully resolved path" - assert os.path.isfile(path), f"path ({path}) should be a file" - assert not is_child_path( - path, dbgym_workspace.dbgym_this_run_path - ), f"path ({path}) was generated in this task run ({dbgym_workspace.dbgym_this_run_path}). You do not need to save it" - - # save _something_ to dbgym_this_run_path - # save a symlink if the opened file was generated by a run. this is for two reasons: - # 1. files or dirs generated by a run are supposed to be immutable so saving a symlink is safe - # 2. files or dirs generated by a run may be very large (up to 100s of GBs) so we don't want to copy them - if is_child_path(path, dbgym_workspace.dbgym_runs_path): - # get paths we'll need later. 
- _, codebase_dname, org_path, org_dname = extract_from_task_run_path( - dbgym_workspace, path - ) - this_run_save_path = ( - dbgym_workspace.dbgym_this_run_path / codebase_dname / org_dname - ) - os.makedirs(this_run_save_path, exist_ok=True) - - # if the path file is directly in org_path, we symlink the file directly - parent_path = parent_path_of_path(path) - if parent_path.samefile(org_path): - fname = basename_of_path(path) - symlink_path = this_run_save_path / (fname + ".link") - try_create_symlink(path, symlink_path) - # else, we know the path file is _not_ directly inside org_path dir - # we go as far back as we can while still staying in org_path and symlink that "base" dir - # this is because lots of runs create dirs within org_path and it's just a waste of space to symlink every individual file - else: - # set base_path such that its parent is org_path - base_path = parent_path - while not parent_path_of_path(base_path).samefile(org_path): - base_path = parent_path_of_path(base_path) - - # create symlink - open_base_dname = basename_of_path(base_path) - symlink_path = this_run_save_path / (open_base_dname + ".link") - try_create_symlink(base_path, symlink_path) - # if it wasn't generated by a run - else: - # since we don't know where the file is at all, the location is "unknown" and the org is "all" - this_run_save_path = dbgym_workspace.dbgym_this_run_path / "unknown" / "all" - os.makedirs(this_run_save_path, exist_ok=True) - fname = basename_of_path(path) - # in this case, we want to copy instead of symlinking since it might disappear in the future - copy_path = this_run_save_path / fname - shutil.copy(path, copy_path) - - -# TODO(phw2): deprecate this once I'm done with unittest_workspace.py -def link_result( - dbgym_workspace: DBGymWorkspace, - result_path: Path, - custom_result_name: Optional[str] = None, -) -> Path: - """ - result_path must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path. - Further, result_path must have been generated by this invocation to task.py. This also means that - result_path itself can be a file or a dir but not a symlink. - Given a file or directory in task_runs/run_*/[codebase]/[org], this will create a symlink inside - symlinks/[codebase]/[org]/. - Will override the old symlink if there is one, so that symlinks/ always contains the latest generated - version of a file. - This function will return the path to the symlink that was created. 
- """ - assert isinstance(result_path, Path) - assert is_fully_resolved( - result_path - ), f"result_path ({result_path}) should be a fully resolved path" - assert is_child_path(result_path, dbgym_workspace.dbgym_this_run_path) - assert not os.path.islink(result_path) - - if type(custom_result_name) is str: - result_name = custom_result_name - else: - if os.path.isfile(result_path): - result_name = basename_of_path(result_path) + ".link" - elif os.path.isdir(result_path): - result_name = basename_of_path(result_path) + ".link" - else: - raise AssertionError("result_path must be either a file or dir") - - # Figure out the parent directory path of the symlink - codebase_path, codebase_dname, _, org_dname = extract_from_task_run_path( - dbgym_workspace, result_path - ) - # We're only supposed to save files generated by us, which means they should be in cur_task_runs_path() - assert codebase_path.samefile( - dbgym_workspace.cur_task_runs_path() - ), f"link_result should only be called on files generated by this invocation to task.py" - symlink_parent_path = ( - dbgym_workspace.dbgym_symlinks_path / codebase_dname / org_dname - ) - symlink_parent_path.mkdir(parents=True, exist_ok=True) - - # Remove the old symlink ("old" meaning created in an earlier run) if there is one - # Note that in a multi-threaded setting, this might remove one created by a process in the same run, - # meaning it's not "old" by our definition of "old". However, we'll always end up with a symlink - # file of the current run regardless of the order of threads. - assert result_name.endswith(".link") and not result_name.endswith( - ".link.link" - ), f'result_name ({result_name}) should end with ".link"' - symlink_path = symlink_parent_path / result_name - try_remove_file(symlink_path) - try_create_symlink(result_path, symlink_path) - - return symlink_path - - def try_create_symlink(src_path: Path, dst_path: Path) -> None: """ Our functions that create symlinks might be called by multiple processes at once From 49364c9d74ddbe05afb99041d1070da05ee98653 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 13:37:50 -0500 Subject: [PATCH 17/24] num_times_created_this_run -> _num_times_created_this_run --- benchmark/tests/integtest_benchmark.py | 4 ++-- dbms/tests/integtest_dbms.py | 8 ++++---- env/tests/gymlib_integtest_util.py | 2 +- util/tests/unittest_workspace.py | 4 ++-- util/workspace.py | 8 ++++---- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/benchmark/tests/integtest_benchmark.py b/benchmark/tests/integtest_benchmark.py index f8b9d9f8..b4f4fdbc 100644 --- a/benchmark/tests/integtest_benchmark.py +++ b/benchmark/tests/integtest_benchmark.py @@ -32,8 +32,8 @@ def setUp(self) -> None: shutil.rmtree(workspace_path) # Reset this to avoid the error of it being created twice. - # In real usage, the second run would be a different Python process so DBGymWorkspace.num_times_created_this_run would be 0. - DBGymWorkspace.num_times_created_this_run = 0 + # In real usage, the second run would be a different Python process so DBGymWorkspace._num_times_created_this_run would be 0. 
+ DBGymWorkspace._num_times_created_this_run = 0 self.workspace = DBGymWorkspace(workspace_path) def tearDown(self) -> None: diff --git a/dbms/tests/integtest_dbms.py b/dbms/tests/integtest_dbms.py index 466030bd..822e8aac 100644 --- a/dbms/tests/integtest_dbms.py +++ b/dbms/tests/integtest_dbms.py @@ -24,8 +24,8 @@ def setUp(self) -> None: shutil.rmtree(workspace_path) # Reset this to avoid the error of it being created twice. - # In real usage, the second run would be a different Python process so DBGymWorkspace.num_times_created_this_run would be 0. - DBGymWorkspace.num_times_created_this_run = 0 + # In real usage, the second run would be a different Python process so DBGymWorkspace._num_times_created_this_run would be 0. + DBGymWorkspace._num_times_created_this_run = 0 self.workspace = DBGymWorkspace(workspace_path) def tearDown(self) -> None: @@ -44,10 +44,10 @@ def test_postgres_dbdata(self) -> None: # Make sure to recreate self.workspace so that each function call counts as its own run. scale_factor = 0.01 _postgres_build(self.workspace, False) - DBGymWorkspace.num_times_created_this_run = 0 + DBGymWorkspace._num_times_created_this_run = 0 self.workspace = DBGymWorkspace(self.workspace.dbgym_workspace_path) _tpch_tables(self.workspace, scale_factor) - DBGymWorkspace.num_times_created_this_run = 0 + DBGymWorkspace._num_times_created_this_run = 0 self.workspace = DBGymWorkspace(self.workspace.dbgym_workspace_path) # Test diff --git a/env/tests/gymlib_integtest_util.py b/env/tests/gymlib_integtest_util.py index cac69827..577d1ffc 100644 --- a/env/tests/gymlib_integtest_util.py +++ b/env/tests/gymlib_integtest_util.py @@ -70,7 +70,7 @@ def set_up_workspace() -> None: # However, it also can't be created more than once so we need to check `is None`. if GymlibIntegtestManager.DBGYM_WORKSPACE is None: # Reset this in case it had been created by a test *not* using GymlibIntegtestManager.set_up_workspace(). - DBGymWorkspace.num_times_created_this_run = 0 + DBGymWorkspace._num_times_created_this_run = 0 GymlibIntegtestManager.DBGYM_WORKSPACE = DBGymWorkspace(workspace_path) @staticmethod diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 3a0f11fc..0575b8b3 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -40,8 +40,8 @@ def tearDown(self) -> None: # Importantly though, I don't have helper functions for the complex functions that I want to test (e.g. link_result and save_file). def init_workspace_helper(self) -> None: # Reset this to avoid the error of it being created twice. - # In real usage, the second run would be a different Python process so DBGymWorkspace.num_times_created_this_run would be 0. - DBGymWorkspace.num_times_created_this_run = 0 + # In real usage, the second run would be a different Python process so DBGymWorkspace._num_times_created_this_run would be 0. + DBGymWorkspace._num_times_created_this_run = 0 self.workspace = DBGymWorkspace(self.workspace_path) if self.expected_structure is None: diff --git a/util/workspace.py b/util/workspace.py index 286488ec..28ed17fa 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -96,15 +96,15 @@ class DBGymWorkspace: Global configurations that apply to all parts of DB-Gym """ - num_times_created_this_run: int = 0 + _num_times_created_this_run: int = 0 def __init__(self, dbgym_workspace_path: Path): # The logic around dbgym_tmp_path assumes that DBGymWorkspace is only constructed once. 
# This is because DBGymWorkspace creates a new run_*/ dir when it's initialized. - DBGymWorkspace.num_times_created_this_run += 1 + DBGymWorkspace._num_times_created_this_run += 1 assert ( - DBGymWorkspace.num_times_created_this_run == 1 - ), f"DBGymWorkspace has been created {DBGymWorkspace.num_times_created_this_run} times. It should only be created once per run." + DBGymWorkspace._num_times_created_this_run == 1 + ), f"DBGymWorkspace has been created {DBGymWorkspace._num_times_created_this_run} times. It should only be created once per run." self.base_dbgym_repo_path = get_base_dbgym_repo_path() self.cur_path_list: list[str] = ["dbgym"] From 2ffa8c7acc6ec4ae872f856afbacec55bef1f440 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 13:38:38 -0500 Subject: [PATCH 18/24] rm lab.py --- lab.py | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 lab.py diff --git a/lab.py b/lab.py deleted file mode 100644 index 77595ff3..00000000 --- a/lab.py +++ /dev/null @@ -1,3 +0,0 @@ -import gymlib - -print(gymlib.magic.get_magic_number()) From 17f89ffffed8cba7b56ce682683a6a6fa6c51992 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 13:55:14 -0500 Subject: [PATCH 19/24] deleted all the cur_* functions from workspace fixed all uses of them --- dbms/postgres/cli.py | 1 - env/pg_conn.py | 10 ++---- env/tuning_artifacts.py | 5 +-- task.py | 2 +- util/workspace.py | 69 ----------------------------------------- 5 files changed, 7 insertions(+), 80 deletions(-) diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index b364ff90..91e17469 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -226,7 +226,6 @@ def _create_dbdata( stop_postgres(dbgym_workspace, pgbin_path, dbdata_path) # Create .tgz file. - # Note that you can't pass "[dbdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[dbdata].tgz" as a dir. dbdata_tgz_real_path = dbgym_workspace.dbgym_this_run_path / linkname_to_name( expected_dbdata_tgz_symlink_path.name ) diff --git a/env/pg_conn.py b/env/pg_conn.py index 540e6134..7aa4e5d8 100644 --- a/env/pg_conn.py +++ b/env/pg_conn.py @@ -102,12 +102,9 @@ def disconnect(self) -> None: self._conn = None def move_log(self) -> None: - pglog_path = ( - self.dbgym_workspace.cur_task_runs_artifacts_path(mkdir=True) - / f"pg{self.pgport}.log" - ) + pglog_path = self.dbgym_workspace.dbgym_this_run_path / f"pg{self.pgport}.log" pglog_this_step_path = ( - self.dbgym_workspace.cur_task_runs_artifacts_path(mkdir=True) + self.dbgym_workspace.dbgym_this_run_path / f"pg{self.pgport}.log.{self.log_step}" ) if pglog_path.exists(): @@ -299,8 +296,7 @@ def restart_with_changes( "-l", # We log to pg{self.pgport}.log instead of pg.log so that different PostgresConn objects # don't all try to write to the same file. 
- self.dbgym_workspace.cur_task_runs_artifacts_path(mkdir=True) - / f"pg{self.pgport}.log", + self.dbgym_workspace.dbgym_this_run_path / f"pg{self.pgport}.log", "start", ].run(retcode=None) diff --git a/env/tuning_artifacts.py b/env/tuning_artifacts.py index b5b617da..f2eb9178 100644 --- a/env/tuning_artifacts.py +++ b/env/tuning_artifacts.py @@ -81,9 +81,10 @@ def __init__( self, dbgym_workspace: DBGymWorkspace, metadata: TuningMetadata ) -> None: self.dbgym_workspace = dbgym_workspace - self.tuning_artifacts_path = self.dbgym_workspace.cur_task_runs_artifacts_path( - "tuning_artifacts", mkdir=True + self.tuning_artifacts_path = ( + self.dbgym_workspace.dbgym_this_run_path / "tuning_artifacts" ) + self.tuning_artifacts_path.mkdir(parents=False, exist_ok=False) assert is_fully_resolved(self.tuning_artifacts_path) self.next_step_num = 0 diff --git a/task.py b/task.py index 5579d9cf..f5a0e278 100644 --- a/task.py +++ b/task.py @@ -18,7 +18,7 @@ def task(ctx: click.Context) -> None: dbgym_workspace = make_standard_dbgym_workspace() ctx.obj = dbgym_workspace - log_path = dbgym_workspace.cur_task_runs_artifacts_path(mkdir=True) + log_path = dbgym_workspace.dbgym_this_run_path set_up_loggers(log_path) set_up_warnings(log_path) diff --git a/util/workspace.py b/util/workspace.py index 28ed17fa..83f1a399 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -50,30 +50,6 @@ def get_latest_run_path_from_workspace_path(workspace_path: Path) -> Path: DEFAULT_BOOT_CONFIG_PATH = POSTGRES_PATH / "default_boot_config.yaml" -# Paths of dependencies in the workspace. These are named "*_path" because they will be an absolute path -# The reason these _cannot_ be relative paths is because relative paths are relative to the codebase root, not the workspace root -# Note that it's okay to hardcode the codebase paths (like dbgym_dbms_postgres) here. In the worst case, we'll just break an -# integration test. The "source of truth" of codebase paths is based on DBGymWorkspace.cur_source_path(), which will always -# reflect the actual codebase structure. As long as we automatically enforce getting the right codebase paths when writing, it's -# ok to have to hardcode them when reading. -# Details -# - If a name already has the workload_name, I omit scale factor. This is because the workload_name includes the scale factor -# - By convention, symlinks should end with ".link". The bug that motivated this decision involved replaying a tuning run. When -# replaying a tuning run, you read the tuning_steps/ folder of the tuning run. Earlier, I created a symlink to that tuning_steps/ -# folder called run_*/*/tuning_steps. However, replay itself generates an replay_info.log file, which goes in -# run_*/*/tuning_steps/. The bug was that my replay function was overwriting the replay_info.log file of the -# tuning run. By naming all symlinks "*.link", we avoid the possibility of subtle bugs like this happening. 
-def get_default_workload_path( - workspace_path: Path, benchmark_name: str, workload_name: str -) -> Path: - return ( - get_symlinks_path_from_workspace_path(workspace_path) - / f"dbgym_benchmark_{benchmark_name}" - / "data" - / (workload_name + ".link") - ) - - SCALE_FACTOR_PLACEHOLDER: str = "[scale_factor]" @@ -323,51 +299,6 @@ def open_and_save(self, open_path: Path, mode: str = "r") -> IO[Any]: def append_group(self, name: str) -> None: self.cur_path_list.append(name) - def cur_source_path(self, *dirs: str) -> Path: - cur_path = self.base_dbgym_repo_path - assert self.cur_path_list[0] == "dbgym" - for folder in self.cur_path_list[1:]: - cur_path = cur_path / folder - for dir in dirs: - cur_path = cur_path / dir - return cur_path - - def _cur_symlinks_path(self, *dirs: str, mkdir: bool = False) -> Path: - flattened_structure = "_".join(self.cur_path_list) - cur_path = self.dbgym_symlinks_path / flattened_structure - for dir in dirs: - cur_path = cur_path / dir - if mkdir: - cur_path.mkdir(parents=True, exist_ok=True) - return cur_path - - def cur_task_runs_path(self, *dirs: str, mkdir: bool = False) -> Path: - flattened_structure = "_".join(self.cur_path_list) - cur_path = self.dbgym_this_run_path / flattened_structure - for dir in dirs: - cur_path = cur_path / dir - if mkdir: - cur_path.mkdir(parents=True, exist_ok=True) - return cur_path - - def cur_symlinks_bin_path(self, *dirs: str, mkdir: bool = False) -> Path: - return self._cur_symlinks_path("bin", *dirs, mkdir=mkdir) - - def cur_symlinks_build_path(self, *dirs: str, mkdir: bool = False) -> Path: - return self._cur_symlinks_path("build", *dirs, mkdir=mkdir) - - def cur_symlinks_data_path(self, *dirs: str, mkdir: bool = False) -> Path: - return self._cur_symlinks_path("data", *dirs, mkdir=mkdir) - - def cur_task_runs_build_path(self, *dirs: str, mkdir: bool = False) -> Path: - return self.cur_task_runs_path("build", *dirs, mkdir=mkdir) - - def cur_task_runs_data_path(self, *dirs: str, mkdir: bool = False) -> Path: - return self.cur_task_runs_path("data", *dirs, mkdir=mkdir) - - def cur_task_runs_artifacts_path(self, *dirs: str, mkdir: bool = False) -> Path: - return self.cur_task_runs_path("artifacts", *dirs, mkdir=mkdir) - def get_workspace_path_from_config(dbgym_config_path: Path) -> Path: """ From e3ac7058bee7243ffd96efd29f21c739e7cf711a Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 13:56:13 -0500 Subject: [PATCH 20/24] deleted append_group --- benchmark/cli.py | 2 +- benchmark/job/cli.py | 2 +- benchmark/tpch/cli.py | 2 +- dbms/cli.py | 2 +- dbms/postgres/cli.py | 2 +- util/workspace.py | 5 ----- 6 files changed, 5 insertions(+), 10 deletions(-) diff --git a/benchmark/cli.py b/benchmark/cli.py index 1cf63418..45943a16 100644 --- a/benchmark/cli.py +++ b/benchmark/cli.py @@ -8,7 +8,7 @@ @click.group(name="benchmark") @click.pass_obj def benchmark_group(dbgym_workspace: DBGymWorkspace) -> None: - dbgym_workspace.append_group("benchmark") + pass benchmark_group.add_command(tpch_group) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index 136c0b19..661ed938 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -136,7 +136,7 @@ @click.group(name="job") @click.pass_obj def job_group(dbgym_workspace: DBGymWorkspace) -> None: - dbgym_workspace.append_group("job") + pass @job_group.command(name="tables") diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index af8531b2..014401da 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -26,7 +26,7 @@ @click.group(name="tpch") 
@click.pass_obj def tpch_group(dbgym_workspace: DBGymWorkspace) -> None: - dbgym_workspace.append_group("tpch") + pass @tpch_group.command(name="tables") diff --git a/dbms/cli.py b/dbms/cli.py index 6d97c5e2..2804b79b 100644 --- a/dbms/cli.py +++ b/dbms/cli.py @@ -7,7 +7,7 @@ @click.group(name="dbms") @click.pass_obj def dbms_group(dbgym_workspace: DBGymWorkspace) -> None: - dbgym_workspace.append_group("dbms") + pass dbms_group.add_command(postgres_group) diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index 91e17469..702285ea 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -47,7 +47,7 @@ @click.group(name="postgres") @click.pass_obj def postgres_group(dbgym_workspace: DBGymWorkspace) -> None: - dbgym_workspace.append_group("postgres") + pass @postgres_group.command( diff --git a/util/workspace.py b/util/workspace.py index 83f1a399..a49547eb 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -83,7 +83,6 @@ def __init__(self, dbgym_workspace_path: Path): ), f"DBGymWorkspace has been created {DBGymWorkspace._num_times_created_this_run} times. It should only be created once per run." self.base_dbgym_repo_path = get_base_dbgym_repo_path() - self.cur_path_list: list[str] = ["dbgym"] self.app_name = ( "dbgym" # TODO: discover this dynamically. app means dbgym or an agent ) @@ -295,10 +294,6 @@ def open_and_save(self, open_path: Path, mode: str = "r") -> IO[Any]: # Open return open(open_path, mode=mode) - # `append_group()` is used to mark the "codebase path" of an invocation of the CLI. The "codebase path" is explained further in the documentation. - def append_group(self, name: str) -> None: - self.cur_path_list.append(name) - def get_workspace_path_from_config(dbgym_config_path: Path) -> Path: """ From f5c89e7e9b7864be6b01b082bd25a18a58d25293 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 14:01:08 -0500 Subject: [PATCH 21/24] deleted more functions from workspace.py --- benchmark/tpch/cli.py | 8 +--- dbms/postgres/cli.py | 6 +-- env/tests/gymlib_integtest_util.py | 4 +- util/workspace.py | 66 +----------------------------- 4 files changed, 8 insertions(+), 76 deletions(-) diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index 014401da..9276b7ed 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -2,6 +2,7 @@ import click from gymlib.symlinks_paths import ( + get_scale_factor_string, get_tables_dirname, get_tables_symlink_path, get_workload_suffix, @@ -13,12 +14,7 @@ from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES from util.log import DBGYM_LOGGER_NAME from util.shell import subprocess_run -from util.workspace import ( - DBGymWorkspace, - fully_resolve_path, - get_scale_factor_string, - is_fully_resolved, -) +from util.workspace import DBGymWorkspace, fully_resolve_path, is_fully_resolved TPCH_KIT_DIRNAME = "tpch-kit" diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index 702285ea..28dd1620 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -38,7 +38,7 @@ WORKSPACE_PATH_PLACEHOLDER, DBGymWorkspace, fully_resolve_path, - get_default_dbdata_parent_path, + get_tmp_path_from_workspace_path, is_fully_resolved, is_ssd, ) @@ -118,7 +118,7 @@ def _postgres_build(dbgym_workspace: DBGymWorkspace, rebuild: bool) -> None: "--dbdata-parent-path", default=None, type=Path, - help=f"The path to the parent directory of the dbdata which will be actively tuned. 
The default is {get_default_dbdata_parent_path(WORKSPACE_PATH_PLACEHOLDER)}.", + help=f"The path to the parent directory of the dbdata which will be actively tuned. The default is {get_tmp_path_from_workspace_path(WORKSPACE_PATH_PLACEHOLDER)}.", ) def postgres_dbdata( dbgym_workspace: DBGymWorkspace, @@ -153,7 +153,7 @@ def _postgres_dbdata( if pgbin_path is None: pgbin_path = get_pgbin_symlink_path(dbgym_workspace.dbgym_workspace_path) if dbdata_parent_path is None: - dbdata_parent_path = get_default_dbdata_parent_path( + dbdata_parent_path = get_tmp_path_from_workspace_path( dbgym_workspace.dbgym_workspace_path ) diff --git a/env/tests/gymlib_integtest_util.py b/env/tests/gymlib_integtest_util.py index 577d1ffc..3c71b498 100644 --- a/env/tests/gymlib_integtest_util.py +++ b/env/tests/gymlib_integtest_util.py @@ -16,7 +16,7 @@ from util.workspace import ( DBGymWorkspace, fully_resolve_path, - get_default_dbdata_parent_path, + get_tmp_path_from_workspace_path, get_workspace_path_from_config, ) @@ -105,7 +105,7 @@ def get_default_metadata() -> TuningMetadata: ), ), dbdata_parent_path=fully_resolve_path( - get_default_dbdata_parent_path(dbgym_workspace.dbgym_workspace_path), + get_tmp_path_from_workspace_path(dbgym_workspace.dbgym_workspace_path), ), pgbin_path=fully_resolve_path( get_pgbin_symlink_path(dbgym_workspace.dbgym_workspace_path), diff --git a/util/workspace.py b/util/workspace.py index a49547eb..59784aa2 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -11,20 +11,10 @@ from pathlib import Path from typing import IO, Any, Optional -import redis import yaml -from benchmark.tpch.constants import DEFAULT_TPCH_SEED from util.log import DBGYM_LOGGER_NAME -from util.shell import subprocess_run -# Relative paths of different folders in the codebase -DBMS_PATH = Path("dbms") -POSTGRES_PATH = DBMS_PATH / "postgres" -TUNE_PATH = Path("tune") - -# Paths of different parts of the workspace -# I made these Path objects even though they're not real paths just so they can work correctly with my other helper functions WORKSPACE_PATH_PLACEHOLDER = Path("[workspace]") @@ -47,24 +37,7 @@ def get_latest_run_path_from_workspace_path(workspace_path: Path) -> Path: # Paths of config files in the codebase. These are always relative paths. # The reason these can be relative paths instead of functions taking in codebase_path as input is because relative paths are relative to the codebase root -DEFAULT_BOOT_CONFIG_PATH = POSTGRES_PATH / "default_boot_config.yaml" - - -SCALE_FACTOR_PLACEHOLDER: str = "[scale_factor]" - - -def get_scale_factor_string(scale_factor: float | str) -> str: - if type(scale_factor) is str and scale_factor == SCALE_FACTOR_PLACEHOLDER: - return scale_factor - else: - if float(int(scale_factor)) == scale_factor: - return str(int(scale_factor)) - else: - return str(scale_factor).replace(".", "point") - - -def get_default_dbdata_parent_path(workspace_path: Path) -> Path: - return get_tmp_path_from_workspace_path(workspace_path) +DEFAULT_BOOT_CONFIG_PATH = Path("dbms") / "postgres" / "default_boot_config.yaml" class DBGymWorkspace: @@ -508,43 +481,6 @@ def try_remove_file(path: Path) -> None: pass -# TODO: move this stuff to shell.py -def restart_ray(redis_port: int) -> None: - """ - Stop and start Ray. 
- This is good to do between each stage to avoid bugs from carrying over across stages - """ - subprocess_run("ray stop -f") - ncpu = os.cpu_count() - # --disable-usage-stats avoids a Y/N prompt - subprocess_run( - f"OMP_NUM_THREADS={ncpu} ray start --head --port={redis_port} --num-cpus={ncpu} --disable-usage-stats" - ) - - -def make_redis_started(port: int) -> None: - """ - Start Redis if it's not already started. - Note that Ray uses Redis but does *not* use this function. It starts Redis on its own. - One current use for this function to start/stop Redis for Boot. - """ - try: - r = redis.Redis(port=port) - r.ping() - # This means Redis is running, so we do nothing - do_start_redis = False - except (redis.ConnectionError, redis.TimeoutError): - # This means Redis is not running, so we start it - do_start_redis = True - - # I'm starting Redis outside of except so that errors in r.ping get propagated correctly - if do_start_redis: - subprocess_run(f"redis-server --port {port} --daemonize yes") - # When you start Redis in daemon mode, it won't let you know if it's started, so we ping again to check - r = redis.Redis(port=port) - r.ping() - - def is_ssd(path: Path) -> bool: try: device = ( From e6ed5660a8c4529eee04cf69e4d0eb50167cc802 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 14:07:27 -0500 Subject: [PATCH 22/24] refactored everything to use linkname helpers --- benchmark/job/cli.py | 5 +++-- benchmark/tpch/cli.py | 11 ++++++----- gymlib_package/gymlib/symlinks_paths.py | 16 ++++++++++------ util/workspace.py | 17 ++++++++++------- 4 files changed, 29 insertions(+), 20 deletions(-) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index 661ed938..066fa3cc 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -6,6 +6,7 @@ get_tables_dirname, get_workload_dirname, get_workload_suffix, + name_to_linkname, ) from benchmark.constants import DEFAULT_SCALE_FACTOR @@ -255,7 +256,7 @@ def _generate_job_workload( get_workload_suffix("job", query_subset=query_subset), ) expected_workload_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / ( - workload_name + ".link" + name_to_linkname(workload_name) ) if expected_workload_symlink_path.exists(): logging.getLogger(DBGYM_LOGGER_NAME).info( @@ -281,7 +282,7 @@ def _generate_job_workload( with open(workload_path / "order.txt", "w") as f: queries_parent_path = dbgym_workspace.dbgym_cur_symlinks_path / ( - JOB_QUERIES_DNAME + ".link" + name_to_linkname(JOB_QUERIES_DNAME) ) for qname in query_names: diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index 9276b7ed..c83fae75 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -8,6 +8,7 @@ get_workload_suffix, get_workload_symlink_path, linkname_to_name, + name_to_linkname, ) from benchmark.constants import DEFAULT_SCALE_FACTOR @@ -98,7 +99,7 @@ def _get_queries_dirname(seed: int, scale_factor: float) -> str: def _clone_tpch_kit(dbgym_workspace: DBGymWorkspace) -> None: expected_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / ( - TPCH_KIT_DIRNAME + ".link" + name_to_linkname(TPCH_KIT_DIRNAME) ) if expected_symlink_path.exists(): logging.getLogger(DBGYM_LOGGER_NAME).info( @@ -122,14 +123,14 @@ def _generate_tpch_queries( dbgym_workspace: DBGymWorkspace, seed_start: int, seed_end: int, scale_factor: float ) -> None: tpch_kit_path = dbgym_workspace.dbgym_cur_symlinks_path / ( - TPCH_KIT_DIRNAME + ".link" + name_to_linkname(TPCH_KIT_DIRNAME) ) logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generating queries: [{seed_start}, {seed_end}]" ) 
for seed in range(seed_start, seed_end + 1): expected_queries_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / ( - _get_queries_dirname(seed, scale_factor) + ".link" + name_to_linkname(_get_queries_dirname(seed, scale_factor)) ) if expected_queries_symlink_path.exists(): continue @@ -155,7 +156,7 @@ def _generate_tpch_queries( def _generate_tpch_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: tpch_kit_path = dbgym_workspace.dbgym_cur_symlinks_path / ( - TPCH_KIT_DIRNAME + ".link" + name_to_linkname(TPCH_KIT_DIRNAME) ) expected_tables_symlink_path = get_tables_symlink_path( dbgym_workspace.dbgym_workspace_path, "tpch", scale_factor @@ -225,7 +226,7 @@ def _generate_tpch_workload( with open(workload_path / "order.txt", "w") as f: for seed in range(seed_start, seed_end + 1): queries_parent_path = dbgym_workspace.dbgym_cur_symlinks_path / ( - _get_queries_dirname(seed, scale_factor) + ".link" + name_to_linkname(_get_queries_dirname(seed, scale_factor)) ) for qname in query_names: diff --git a/gymlib_package/gymlib/symlinks_paths.py b/gymlib_package/gymlib/symlinks_paths.py index 9c83ef3d..90736adf 100644 --- a/gymlib_package/gymlib/symlinks_paths.py +++ b/gymlib_package/gymlib/symlinks_paths.py @@ -50,7 +50,7 @@ def get_tables_symlink_path( workspace_path / SYMLINKS_DNAME / DBGYM_APP_NAME - / (get_tables_dirname(benchmark, scale_factor) + ".link") + / name_to_linkname(get_tables_dirname(benchmark, scale_factor)) ) @@ -61,7 +61,7 @@ def get_workload_symlink_path( workspace_path / SYMLINKS_DNAME / DBGYM_APP_NAME - / (get_workload_dirname(benchmark, scale_factor, suffix) + ".link") + / name_to_linkname(get_workload_dirname(benchmark, scale_factor, suffix)) ) @@ -80,16 +80,20 @@ def get_dbdata_tgz_symlink_path( workspace_path / SYMLINKS_DNAME / DBGYM_APP_NAME - / (get_dbdata_tgz_filename(benchmark_name, scale_factor) + ".link") + / name_to_linkname(get_dbdata_tgz_filename(benchmark_name, scale_factor)) ) -# TODO: refactor stuff to use this +def is_linkname(name: str) -> bool: + assert not name.endswith(".link.link") + return name.endswith(".link") + + def name_to_linkname(name: str) -> str: - assert not name.endswith(".link") + assert not is_linkname(name) return f"{name}.link" def linkname_to_name(linkname: str) -> str: - assert linkname.endswith(".link") + assert is_linkname(linkname) return linkname[: -len(".link")] diff --git a/util/workspace.py b/util/workspace.py index 59784aa2..32bfd03d 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -12,6 +12,7 @@ from typing import IO, Any, Optional import yaml +from gymlib.symlinks_paths import is_linkname, name_to_linkname from util.log import DBGYM_LOGGER_NAME @@ -143,9 +144,9 @@ def link_result( link_name = custom_link_name else: if os.path.isfile(result_path): - link_name = basename_of_path(result_path) + ".link" + link_name = name_to_linkname(basename_of_path(result_path)) elif os.path.isdir(result_path): - link_name = basename_of_path(result_path) + ".link" + link_name = name_to_linkname(basename_of_path(result_path)) else: raise AssertionError("result_path must be either a file or dir") @@ -156,8 +157,8 @@ def link_result( # Note that in a multi-threaded setting, this might remove one created by a process in the same run, # meaning it's not "old" by our definition of "old". However, we'll always end up with a symlink # file of the current run regardless of the order of threads. 
- assert link_name.endswith(".link") and not link_name.endswith( - ".link.link" + assert is_linkname( + link_name ), f'link_name ({link_name}) should end with ".link"' symlink_path = symlink_parent_path / link_name try_remove_file(symlink_path) @@ -210,7 +211,7 @@ def save_file(self, path: Path) -> None: parent_path = parent_path_of_path(path) if parent_path.samefile(run_path): fname = basename_of_path(path) - symlink_path = self.dbgym_this_run_path / (fname + ".link") + symlink_path = self.dbgym_this_run_path / name_to_linkname(fname) try_remove_file(symlink_path) try_create_symlink(path, symlink_path) # Otherwise, we know the path file is _not_ directly inside run_path dir. @@ -225,7 +226,9 @@ def save_file(self, path: Path) -> None: # Create symlink open_base_dname = basename_of_path(base_path) - symlink_path = self.dbgym_this_run_path / (open_base_dname + ".link") + symlink_path = self.dbgym_this_run_path / name_to_linkname( + open_base_dname + ) try_remove_file(symlink_path) try_create_symlink(base_path, symlink_path) # If the file wasn't generated by a run, we can't just symlink it because we don't know that it's immutable. @@ -461,7 +464,7 @@ def try_create_symlink(src_path: Path, dst_path: Path) -> None: Our functions that create symlinks might be called by multiple processes at once during HPO. Thus, this is a thread-safe way to create a symlink. """ - assert dst_path.suffix == ".link" + assert is_linkname(dst_path.name) try: os.symlink(src_path, dst_path) except FileExistsError: From 388aeec36c6106a3d019191c258be971496cc639 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 14:37:05 -0500 Subject: [PATCH 23/24] refactored how workspace works in gymlib tests --- env/tests/gymlib_integtest_util.py | 34 +++++++++---------------- env/tests/integtest_pg_conn.py | 8 +++++- env/tests/integtest_replay.py | 10 ++++++-- env/tests/integtest_tuning_artifacts.py | 12 ++++++--- env/tests/integtest_workload.py | 9 +++++-- env/tuning_artifacts.py | 3 ++- 6 files changed, 45 insertions(+), 31 deletions(-) diff --git a/env/tests/gymlib_integtest_util.py b/env/tests/gymlib_integtest_util.py index 3c71b498..44ec3c4c 100644 --- a/env/tests/gymlib_integtest_util.py +++ b/env/tests/gymlib_integtest_util.py @@ -35,9 +35,7 @@ class GymlibIntegtestManager: BENCHMARK = "tpch" SCALE_FACTOR = 0.01 DBGYM_CONFIG_PATH = Path("env/tests/gymlib_integtest_dbgym_config.yaml") - - # This is set at most once by set_up_workspace(). - DBGYM_WORKSPACE: Optional[DBGymWorkspace] = None + WORKSPACE_PATH: Optional[Path] = None @staticmethod def set_up_workspace() -> None: @@ -45,12 +43,12 @@ def set_up_workspace() -> None: Set up the workspace if it has not already been set up. None of the integtest_*.py files will delete the workspace so that future tests run faster. """ - workspace_path = get_workspace_path_from_config( + GymlibIntegtestManager.WORKSPACE_PATH = get_workspace_path_from_config( GymlibIntegtestManager.DBGYM_CONFIG_PATH ) # This if statement prevents us from setting up the workspace twice, which saves time. - if not workspace_path.exists(): + if not GymlibIntegtestManager.WORKSPACE_PATH.exists(): subprocess.run( ["./env/tests/_set_up_gymlib_integtest_workspace.sh"], env={ @@ -64,23 +62,13 @@ def set_up_workspace() -> None: check=True, ) - # Once we get here, we have an invariant that the workspace exists. We need this - # invariant to be true in order to create the DBGymWorkspace. - # - # However, it also can't be created more than once so we need to check `is None`. 
- if GymlibIntegtestManager.DBGYM_WORKSPACE is None: - # Reset this in case it had been created by a test *not* using GymlibIntegtestManager.set_up_workspace(). - DBGymWorkspace._num_times_created_this_run = 0 - GymlibIntegtestManager.DBGYM_WORKSPACE = DBGymWorkspace(workspace_path) - @staticmethod - def get_dbgym_workspace() -> DBGymWorkspace: - assert GymlibIntegtestManager.DBGYM_WORKSPACE is not None - return GymlibIntegtestManager.DBGYM_WORKSPACE + def get_workspace_path() -> Path: + assert GymlibIntegtestManager.WORKSPACE_PATH is not None + return GymlibIntegtestManager.WORKSPACE_PATH @staticmethod def get_default_metadata() -> TuningMetadata: - dbgym_workspace = GymlibIntegtestManager.get_dbgym_workspace() assert GymlibIntegtestManager.BENCHMARK == "tpch" suffix = get_workload_suffix( GymlibIntegtestManager.BENCHMARK, @@ -91,7 +79,7 @@ def get_default_metadata() -> TuningMetadata: return TuningMetadata( workload_path=fully_resolve_path( get_workload_symlink_path( - dbgym_workspace.dbgym_workspace_path, + GymlibIntegtestManager.get_workspace_path(), GymlibIntegtestManager.BENCHMARK, GymlibIntegtestManager.SCALE_FACTOR, suffix, @@ -99,15 +87,17 @@ def get_default_metadata() -> TuningMetadata: ), pristine_dbdata_snapshot_path=fully_resolve_path( get_dbdata_tgz_symlink_path( - dbgym_workspace.dbgym_workspace_path, + GymlibIntegtestManager.get_workspace_path(), GymlibIntegtestManager.BENCHMARK, GymlibIntegtestManager.SCALE_FACTOR, ), ), dbdata_parent_path=fully_resolve_path( - get_tmp_path_from_workspace_path(dbgym_workspace.dbgym_workspace_path), + get_tmp_path_from_workspace_path( + GymlibIntegtestManager.get_workspace_path() + ), ), pgbin_path=fully_resolve_path( - get_pgbin_symlink_path(dbgym_workspace.dbgym_workspace_path), + get_pgbin_symlink_path(GymlibIntegtestManager.get_workspace_path()), ), ) diff --git a/env/tests/integtest_pg_conn.py b/env/tests/integtest_pg_conn.py index a5e11f4d..d0464aa4 100644 --- a/env/tests/integtest_pg_conn.py +++ b/env/tests/integtest_pg_conn.py @@ -10,12 +10,18 @@ get_is_postgres_running, get_running_postgres_ports, ) +from util.workspace import DBGymWorkspace class PostgresConnTests(unittest.TestCase): + workspace: DBGymWorkspace + @staticmethod def setUpClass() -> None: GymlibIntegtestManager.set_up_workspace() + PostgresConnTests.workspace = DBGymWorkspace( + GymlibIntegtestManager.get_workspace_path() + ) def setUp(self) -> None: self.assertFalse( @@ -38,7 +44,7 @@ def tearDown(self) -> None: def create_pg_conn(self, pgport: int = DEFAULT_POSTGRES_PORT) -> PostgresConn: return PostgresConn( - GymlibIntegtestManager.get_dbgym_workspace(), + PostgresConnTests.workspace, pgport, self.metadata.pristine_dbdata_snapshot_path, self.metadata.dbdata_parent_path, diff --git a/env/tests/integtest_replay.py b/env/tests/integtest_replay.py index 86b191f9..1c6e3e6c 100644 --- a/env/tests/integtest_replay.py +++ b/env/tests/integtest_replay.py @@ -10,16 +10,22 @@ SysKnobsDelta, TuningArtifactsWriter, ) +from util.workspace import DBGymWorkspace class ReplayTests(unittest.TestCase): + workspace: DBGymWorkspace + @staticmethod def setUpClass() -> None: GymlibIntegtestManager.set_up_workspace() + ReplayTests.workspace = DBGymWorkspace( + GymlibIntegtestManager.get_workspace_path() + ) def test_replay(self) -> None: writer = TuningArtifactsWriter( - GymlibIntegtestManager.get_dbgym_workspace(), + ReplayTests.workspace, GymlibIntegtestManager.get_default_metadata(), ) writer.write_step( @@ -41,7 +47,7 @@ def test_replay(self) -> None: ) ) replay_data = replay( - 
GymlibIntegtestManager.get_dbgym_workspace(), + ReplayTests.workspace, writer.tuning_artifacts_path, ) diff --git a/env/tests/integtest_tuning_artifacts.py b/env/tests/integtest_tuning_artifacts.py index 7d89bbba..1baa1fdb 100644 --- a/env/tests/integtest_tuning_artifacts.py +++ b/env/tests/integtest_tuning_artifacts.py @@ -9,6 +9,7 @@ TuningArtifactsReader, TuningArtifactsWriter, ) +from util.workspace import DBGymWorkspace class PostgresConnTests(unittest.TestCase): @@ -16,6 +17,11 @@ class PostgresConnTests(unittest.TestCase): def setUpClass() -> None: GymlibIntegtestManager.set_up_workspace() + def setUp(self) -> None: + # We re-create a workspace for each test because each test will create its own TuningArtifactsWriter. + DBGymWorkspace._num_times_created_this_run = 0 + self.workspace = DBGymWorkspace(GymlibIntegtestManager.get_workspace_path()) + @staticmethod def make_config(letter: str) -> DBMSConfigDelta: return DBMSConfigDelta( @@ -26,7 +32,7 @@ def make_config(letter: str) -> DBMSConfigDelta: def test_get_delta_at_step(self) -> None: writer = TuningArtifactsWriter( - GymlibIntegtestManager.get_dbgym_workspace(), + self.workspace, GymlibIntegtestManager.get_default_metadata(), ) @@ -51,7 +57,7 @@ def test_get_delta_at_step(self) -> None: def test_get_all_deltas_in_order(self) -> None: writer = TuningArtifactsWriter( - GymlibIntegtestManager.get_dbgym_workspace(), + self.workspace, GymlibIntegtestManager.get_default_metadata(), ) @@ -72,7 +78,7 @@ def test_get_all_deltas_in_order(self) -> None: def test_get_metadata(self) -> None: writer = TuningArtifactsWriter( - GymlibIntegtestManager.get_dbgym_workspace(), + self.workspace, GymlibIntegtestManager.get_default_metadata(), ) reader = TuningArtifactsReader(writer.tuning_artifacts_path) diff --git a/env/tests/integtest_workload.py b/env/tests/integtest_workload.py index 3584b05c..2fad33eb 100644 --- a/env/tests/integtest_workload.py +++ b/env/tests/integtest_workload.py @@ -3,17 +3,22 @@ from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES from env.tests.gymlib_integtest_util import GymlibIntegtestManager from env.workload import Workload +from util.workspace import DBGymWorkspace class WorkloadTests(unittest.TestCase): + workspace: DBGymWorkspace + @staticmethod def setUpClass() -> None: GymlibIntegtestManager.set_up_workspace() + WorkloadTests.workspace = DBGymWorkspace( + GymlibIntegtestManager.get_workspace_path() + ) def test_workload(self) -> None: workload_path = GymlibIntegtestManager.get_default_metadata().workload_path - - workload = Workload(GymlibIntegtestManager.get_dbgym_workspace(), workload_path) + workload = Workload(WorkloadTests.workspace, workload_path) # Check the order of query IDs. self.assertEqual( diff --git a/env/tuning_artifacts.py b/env/tuning_artifacts.py index f2eb9178..ab26e3fd 100644 --- a/env/tuning_artifacts.py +++ b/env/tuning_artifacts.py @@ -1,7 +1,7 @@ import json from dataclasses import asdict, dataclass from pathlib import Path -from typing import Any, NewType, TypedDict +from typing import Any, NewType from util.workspace import DBGymWorkspace, is_fully_resolved @@ -84,6 +84,7 @@ def __init__( self.tuning_artifacts_path = ( self.dbgym_workspace.dbgym_this_run_path / "tuning_artifacts" ) + # exist_ok is False because you should only create one TuningArtifactsWriter per run. 
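+        # (parents=False is safe here as well: dbgym_this_run_path is the run_*/
+        # directory that DBGymWorkspace created when it was constructed.)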
self.tuning_artifacts_path.mkdir(parents=False, exist_ok=False) assert is_fully_resolved(self.tuning_artifacts_path) self.next_step_num = 0 From e3f7bfe12fe9ad49e09599f614b12b9c3e334355 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 14:51:38 -0500 Subject: [PATCH 24/24] now resetting num times created in integtests --- env/tests/integtest_pg_conn.py | 2 ++ env/tests/integtest_replay.py | 2 ++ env/tests/integtest_workload.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/env/tests/integtest_pg_conn.py b/env/tests/integtest_pg_conn.py index d0464aa4..da586398 100644 --- a/env/tests/integtest_pg_conn.py +++ b/env/tests/integtest_pg_conn.py @@ -19,6 +19,8 @@ class PostgresConnTests(unittest.TestCase): @staticmethod def setUpClass() -> None: GymlibIntegtestManager.set_up_workspace() + # Reset _num_times_created_this_run since previous tests may have created a workspace. + DBGymWorkspace._num_times_created_this_run = 0 PostgresConnTests.workspace = DBGymWorkspace( GymlibIntegtestManager.get_workspace_path() ) diff --git a/env/tests/integtest_replay.py b/env/tests/integtest_replay.py index 1c6e3e6c..1752e192 100644 --- a/env/tests/integtest_replay.py +++ b/env/tests/integtest_replay.py @@ -19,6 +19,8 @@ class ReplayTests(unittest.TestCase): @staticmethod def setUpClass() -> None: GymlibIntegtestManager.set_up_workspace() + # Reset _num_times_created_this_run since previous tests may have created a workspace. + DBGymWorkspace._num_times_created_this_run = 0 ReplayTests.workspace = DBGymWorkspace( GymlibIntegtestManager.get_workspace_path() ) diff --git a/env/tests/integtest_workload.py b/env/tests/integtest_workload.py index 2fad33eb..494e0365 100644 --- a/env/tests/integtest_workload.py +++ b/env/tests/integtest_workload.py @@ -12,6 +12,8 @@ class WorkloadTests(unittest.TestCase): @staticmethod def setUpClass() -> None: GymlibIntegtestManager.set_up_workspace() + # Reset _num_times_created_this_run since previous tests may have created a workspace. + DBGymWorkspace._num_times_created_this_run = 0 WorkloadTests.workspace = DBGymWorkspace( GymlibIntegtestManager.get_workspace_path() )
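
Taken together, patches 17, 23, and 24 leave every gymlib integration test using the same idiom: zero out DBGymWorkspace._num_times_created_this_run, then construct a fresh DBGymWorkspace. A natural follow-up would be to centralize that idiom in a shared helper. The sketch below is hypothetical (no such make_test_workspace() exists in this series) and assumes GymlibIntegtestManager.set_up_workspace() has already run:

    from env.tests.gymlib_integtest_util import GymlibIntegtestManager
    from util.workspace import DBGymWorkspace

    def make_test_workspace() -> DBGymWorkspace:
        # DBGymWorkspace.__init__ asserts it is constructed at most once per
        # process, so reset the class-level counter before constructing another.
        DBGymWorkspace._num_times_created_this_run = 0
        return DBGymWorkspace(GymlibIntegtestManager.get_workspace_path())

With a helper like that, each setUpClass() above reduces to a single line, e.g. WorkloadTests.workspace = make_test_workspace().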