From 9e3ae0dfc04b0a6a6e2fa3edfdcae4e465fe4071 Mon Sep 17 00:00:00 2001
From: gary-huang <garyhuang@hotmail.ca>
Date: Mon, 14 Jul 2025 13:10:26 -0400
Subject: [PATCH 01/20] dataframe and csv support

Also, create_dataset no longer pushes immediately; it is more natural to push manually.
---
 ddtrace/llmobs/_experiment.py              | 43 +++++++++++++
 ddtrace/llmobs/_llmobs.py                  | 75 +++++++++++++++++++++-
 riotfile.py                                |  1 +
 setup.cfg                                  | 13 ----
 tests/llmobs/static_files/good_dataset.csv |  3 +
 tests/llmobs/test_experiments.py           | 32 +++++++++
 6 files changed, 151 insertions(+), 16 deletions(-)
 create mode 100644 tests/llmobs/static_files/good_dataset.csv

diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py
index e5a3959a67a..832e1d6e2ad 100644
--- a/ddtrace/llmobs/_experiment.py
+++ b/ddtrace/llmobs/_experiment.py
@@ -161,6 +161,49 @@ def __len__(self) -> int:
     def __iter__(self) -> Iterator[DatasetRecord]:
         return iter(self._records)
 
+    def as_dataframe(self):
+        try:
+            import pandas as pd
+        except ImportError as e:
+            raise ImportError(
+                "pandas is required to convert dataset to DataFrame. " "Please install it with `pip install pandas`"
+            ) from e
+
+        column_tuples = set()
+        data_rows = []
+        for record in self._records:
+            flat_record = {}
+
+            input_data = record.get("input_data", {})
+            if isinstance(input_data, dict):
+                for k, v in input_data.items():
+                    flat_record[("input_data", k)] = v
+                    column_tuples.add(("input_data", k))
+            else:
+                flat_record[("input_data", "")] = input_data  # Use empty string for single input
+                column_tuples.add(("input_data", ""))
+
+            expected_output = record.get("expected_output", {})
+            if isinstance(expected_output, dict):
+                for k, v in expected_output.items():
+                    flat_record[("expected_output", k)] = v
+                    column_tuples.add(("expected_output", k))
+            else:
+                flat_record[("expected_output", "")] = expected_output  # Use empty string for single output
+                column_tuples.add(("expected_output", ""))
+
+            for k, v in record.get("metadata", {}):
+                flat_record[("metadata", k)] = v
+                column_tuples.add(("metadata", k))
+
+            data_rows.append(flat_record)
+
+        records_list = []
+        for flat_record in data_rows:
+            row = [flat_record.get(col, None) for col in column_tuples]
+            records_list.append(row)
+
+        return pd.DataFrame(data=records_list, columns=pd.MultiIndex.from_tuples(column_tuples))
 
 class Experiment:
     def __init__(
diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
index d09851fae4a..d299fc146c0 100644
--- a/ddtrace/llmobs/_llmobs.py
+++ b/ddtrace/llmobs/_llmobs.py
@@ -1,3 +1,4 @@
+import csv
 from dataclasses import dataclass
 from dataclasses import field
 import inspect
@@ -41,7 +42,7 @@
 from ddtrace.internal.utils.formats import asbool
 from ddtrace.internal.utils.formats import format_trace_id
 from ddtrace.internal.utils.formats import parse_tags_str
-from ddtrace.llmobs import _constants as constants
+from ddtrace.llmobs import _constants as constants, DatasetRecord
 from ddtrace.llmobs import _telemetry as telemetry
 from ddtrace.llmobs._constants import ANNOTATIONS_CONTEXT_ID
 from ddtrace.llmobs._constants import DECORATOR
@@ -583,8 +584,76 @@ def create_dataset(cls, name: str, description: str, records: List[DatasetRecord
         ds = cls._instance._dne_client.dataset_create(name, description)
         for r in records:
             ds.append(r)
-        if len(records) > 0:
-            ds.push()
+        return ds
+
+    @classmethod
+    def create_dataset_from_csv(cls, csv_path: str, dataset_name: str, input_data_columns: List[str], expected_output_columns: List[str], metadata_columns: List[str] = [], csv_delimiter: str = ",", description="") -> Dataset:
+        ds = cls._instance._dne_client.dataset_create(dataset_name, description)
+
+        # Store the original field size limit to restore it later
+        original_field_size_limit = csv.field_size_limit()
+
+        csv.field_size_limit(10 * 1024 * 1024) # 10mb
+
+        records = []
+        try:
+            # First check if the file exists and is not empty before parsing
+            with open(csv_path, mode="r") as csvfile:
+                content = csvfile.readline().strip()
+                if not content:
+                    raise ValueError("CSV file appears to be empty or header is missing.")
+
+            with open(csv_path, mode="r") as csvfile:
+                try:
+                    rows = csv.DictReader(csvfile, delimiter=csv_delimiter)
+
+                    # Check header presence before trying to read rows
+                    if rows.fieldnames is None:
+                        # Treat files with no header at all as effectively empty
+                        raise ValueError("CSV file appears to be empty or header is missing.")
+
+                    header_columns = rows.fieldnames
+                    missing_input_columns = [col for col in input_data_columns if col not in header_columns]
+                    missing_output_columns = [col for col in expected_output_columns if col not in header_columns]
+
+                    if missing_input_columns:
+                        raise ValueError(f"Input columns not found in CSV header: {missing_input_columns}")
+                    if missing_output_columns:
+                        raise ValueError(f"Expected output columns not found in CSV header: {missing_output_columns}")
+
+                    # Determine metadata columns (all columns not used for input or expected output)
+                    metadata_columns = [
+                        col for col in header_columns if col not in input_data_columns and col not in expected_output_columns
+                    ]
+
+                    for row in rows:
+                        try:
+                            ds.append(
+                                DatasetRecord(
+                                    input_data={col: row[col] for col in input_data_columns},
+                                    expected_output={col: row[col] for col in expected_output_columns},
+                                    metadata={col: row[col] for col in metadata_columns},
+                                ))
+
+                        except KeyError as ke:
+                            # Missing columns in a data row indicates malformed CSV
+                            raise KeyError(f"Error parsing CSV file: missing column {ke} in a row")
+                        except Exception as e:
+                            # Other errors during row processing also indicate CSV issues
+                            raise ValueError(f"Error parsing CSV file (row processing): {e}")
+
+                except csv.Error as e:
+                    # Catch CSV-specific parsing errors
+                    raise ValueError(f"Error parsing CSV file: {e}")
+
+        except FileNotFoundError as e:
+            raise FileNotFoundError(f"CSV file not found: {csv_path}") from e
+        except PermissionError as e:
+            raise PermissionError(f"Permission denied when reading CSV file: {csvfile}") from e
+        finally:
+            # Always restore the original field size limit
+            csv.field_size_limit(original_field_size_limit)
+
         return ds
 
     @classmethod
diff --git a/riotfile.py b/riotfile.py
index b13e4fe7db8..828ceaea242 100644
--- a/riotfile.py
+++ b/riotfile.py
@@ -3138,6 +3138,7 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT
                 "pytest-asyncio": "==0.21.1",
                 "ragas": "==0.1.21",
                 "langchain": latest,
+                "pandas": latest,
             },
             pys=select_pys(min_version="3.8"),
         ),
diff --git a/setup.cfg b/setup.cfg
index a62b967bb25..501d7d10115 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -6,19 +6,6 @@ skip = *.json,*.h,*.cpp,*.c,.riot,.tox,.mypy_cache,.git,*ddtrace/vendor,tests/co
 exclude-file = .codespellignorelines
 ignore-words-list = asend,dne,fo,medias,ment,nin,ot,setttings,statics,ba,spawnve,doas
 
-# DEV: We use `conftest.py` as a local pytest plugin to configure hooks for collection
-[tool:pytest]
-# --cov-report is intentionally empty else pytest-cov will default to generating a report
-addopts =
-  --cov=ddtrace/
-  --cov=tests/
-  --cov-append
-  --cov-report=
-  --durations=10
-  --junitxml=test-results/junit.xml
-# DEV: The default is `test_*\.py` which will miss `test.py` files
-python_files = test*\.py
-asyncio_mode = auto
 
 [flake8]
 max-line-length = 120
diff --git a/tests/llmobs/static_files/good_dataset.csv b/tests/llmobs/static_files/good_dataset.csv
new file mode 100644
index 00000000000..be4f03d552c
--- /dev/null
+++ b/tests/llmobs/static_files/good_dataset.csv
@@ -0,0 +1,3 @@
+in0,in1,in2,out0,out1,m0
+r0v1,r0v2,r0v3,r0v4,r0v5,r0v6
+r1v1,r1v2,r1v3,r1v4,r1v5,r1v6
diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py
index e1a3f925ff1..4370b9cd8c4 100644
--- a/tests/llmobs/test_experiments.py
+++ b/tests/llmobs/test_experiments.py
@@ -89,6 +89,38 @@ def test_dataset_create_delete(llmobs):
 
     llmobs._delete_dataset(dataset_id=dataset._id)
 
+def test_dataset_as_dataframe(llmobs):
+    dataset = llmobs.create_dataset(name="test-dataset-3", description="A third test dataset")
+    dataset._records = [
+        DatasetRecord(input_data=[{"role" : "system", "content": "i am machine"}, {"role" : "user", "content": "hello"}], expected_output="label")
+    ]
+    df = dataset.as_dataframe()
+    llmobs._delete_dataset(dataset_id=dataset._id)
+
+def test_dataset_csv(llmobs):
+    test_path = os.path.dirname(__file__)
+    csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
+    dataset = llmobs.create_dataset_from_csv(csv_path=csv_path, dataset_name="test-dataset-good-csv", description="A good csv dataset", input_data_columns=["in0", "in1", "in2"], expected_output_columns=["out0", "out1"])
+    assert len(dataset) == 2
+    assert len(dataset[0]["input_data"]) == 3
+    assert dataset[0]["input_data"]["in0"] == "r0v1"
+    assert dataset[0]["input_data"]["in1"] == "r0v2"
+    assert dataset[0]["input_data"]["in2"] == "r0v3"
+    assert dataset[1]["input_data"]["in0"] == "r1v1"
+    assert dataset[1]["input_data"]["in1"] == "r1v2"
+    assert dataset[1]["input_data"]["in2"] == "r1v3"
+
+    assert len(dataset[0]["expected_output"]) == 2
+    assert dataset[0]["expected_output"]["out0"] == "r0v4"
+    assert dataset[0]["expected_output"]["out1"] == "r0v5"
+    assert dataset[1]["expected_output"]["out0"] == "r1v4"
+    assert dataset[1]["expected_output"]["out1"] == "r1v5"
+
+    assert len(dataset[0]["metadata"]) == 1
+    assert dataset[0]["metadata"]["m0"] == "r0v6"
+    assert dataset[1]["metadata"]["m0"] == "r1v6"
+
+    llmobs._delete_dataset(dataset_id=dataset._id)
 
 def test_dataset_pull_non_existent(llmobs):
     with pytest.raises(ValueError):

From e9da8e2326ed8fa2ce969ae38684790b438462ae Mon Sep 17 00:00:00 2001
From: gary-huang <garyhuang@hotmail.ca>
Date: Thu, 17 Jul 2025 03:58:54 -0400
Subject: [PATCH 02/20] black and hatch

---
 ddtrace/llmobs/_experiment.py    |  7 +++----
 ddtrace/llmobs/_llmobs.py        | 24 ++++++++++++++++++------
 tests/llmobs/test_experiments.py | 16 ++++++++++++++--
 3 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py
index 832e1d6e2ad..8d7c37869b3 100644
--- a/ddtrace/llmobs/_experiment.py
+++ b/ddtrace/llmobs/_experiment.py
@@ -145,12 +145,10 @@ def delete(self, index: int) -> None:
         del self._records[index]
 
     @overload
-    def __getitem__(self, index: int) -> DatasetRecord:
-        ...
+    def __getitem__(self, index: int) -> DatasetRecord: ...
 
     @overload
-    def __getitem__(self, index: slice) -> List[DatasetRecord]:
-        ...
+    def __getitem__(self, index: slice) -> List[DatasetRecord]: ...
 
     def __getitem__(self, index: Union[int, slice]) -> Union[DatasetRecord, List[DatasetRecord]]:
         return self._records.__getitem__(index)
@@ -205,6 +203,7 @@ def as_dataframe(self):
 
         return pd.DataFrame(data=records_list, columns=pd.MultiIndex.from_tuples(column_tuples))
 
+
 class Experiment:
     def __init__(
         self,
diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
index d299fc146c0..00a86699dac 100644
--- a/ddtrace/llmobs/_llmobs.py
+++ b/ddtrace/llmobs/_llmobs.py
@@ -78,7 +78,7 @@
 from ddtrace.llmobs._evaluators.runner import EvaluatorRunner
 from ddtrace.llmobs._experiment import Dataset
 from ddtrace.llmobs._experiment import DatasetRecordInputType
-from ddtrace.llmobs._experiment import DatasetRecordRaw as DatasetRecord
+from ddtrace.llmobs._experiment import DatasetRecord
 from ddtrace.llmobs._experiment import Experiment
 from ddtrace.llmobs._experiment import ExperimentConfigType
 from ddtrace.llmobs._experiment import JSONType
@@ -587,15 +587,23 @@ def create_dataset(cls, name: str, description: str, records: List[DatasetRecord
         return ds
 
     @classmethod
-    def create_dataset_from_csv(cls, csv_path: str, dataset_name: str, input_data_columns: List[str], expected_output_columns: List[str], metadata_columns: List[str] = [], csv_delimiter: str = ",", description="") -> Dataset:
+    def create_dataset_from_csv(
+        cls,
+        csv_path: str,
+        dataset_name: str,
+        input_data_columns: List[str],
+        expected_output_columns: List[str],
+        metadata_columns: List[str] = [],
+        csv_delimiter: str = ",",
+        description="",
+    ) -> Dataset:
         ds = cls._instance._dne_client.dataset_create(dataset_name, description)
 
         # Store the original field size limit to restore it later
         original_field_size_limit = csv.field_size_limit()
 
-        csv.field_size_limit(10 * 1024 * 1024) # 10mb
+        csv.field_size_limit(10 * 1024 * 1024)  # 10mb
 
-        records = []
         try:
             # First check if the file exists and is not empty before parsing
             with open(csv_path, mode="r") as csvfile:
@@ -623,7 +631,9 @@ def create_dataset_from_csv(cls, csv_path: str, dataset_name: str, input_data_co
 
                     # Determine metadata columns (all columns not used for input or expected output)
                     metadata_columns = [
-                        col for col in header_columns if col not in input_data_columns and col not in expected_output_columns
+                        col
+                        for col in header_columns
+                        if col not in input_data_columns and col not in expected_output_columns
                     ]
 
                     for row in rows:
@@ -633,7 +643,9 @@ def create_dataset_from_csv(cls, csv_path: str, dataset_name: str, input_data_co
                                     input_data={col: row[col] for col in input_data_columns},
                                     expected_output={col: row[col] for col in expected_output_columns},
                                     metadata={col: row[col] for col in metadata_columns},
-                                ))
+                                    record_id="",
+                                )
+                            )
 
                         except KeyError as ke:
                             # Missing columns in a data row indicates malformed CSV
diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py
index 4370b9cd8c4..defad5959cd 100644
--- a/tests/llmobs/test_experiments.py
+++ b/tests/llmobs/test_experiments.py
@@ -89,18 +89,29 @@ def test_dataset_create_delete(llmobs):
 
     llmobs._delete_dataset(dataset_id=dataset._id)
 
+
 def test_dataset_as_dataframe(llmobs):
     dataset = llmobs.create_dataset(name="test-dataset-3", description="A third test dataset")
     dataset._records = [
-        DatasetRecord(input_data=[{"role" : "system", "content": "i am machine"}, {"role" : "user", "content": "hello"}], expected_output="label")
+        DatasetRecord(
+            input_data=[{"role": "system", "content": "i am machine"}, {"role": "user", "content": "hello"}],
+            expected_output="label",
+        )
     ]
     df = dataset.as_dataframe()
     llmobs._delete_dataset(dataset_id=dataset._id)
 
+
 def test_dataset_csv(llmobs):
     test_path = os.path.dirname(__file__)
     csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
-    dataset = llmobs.create_dataset_from_csv(csv_path=csv_path, dataset_name="test-dataset-good-csv", description="A good csv dataset", input_data_columns=["in0", "in1", "in2"], expected_output_columns=["out0", "out1"])
+    dataset = llmobs.create_dataset_from_csv(
+        csv_path=csv_path,
+        dataset_name="test-dataset-good-csv",
+        description="A good csv dataset",
+        input_data_columns=["in0", "in1", "in2"],
+        expected_output_columns=["out0", "out1"],
+    )
     assert len(dataset) == 2
     assert len(dataset[0]["input_data"]) == 3
     assert dataset[0]["input_data"]["in0"] == "r0v1"
@@ -122,6 +133,7 @@ def test_dataset_csv(llmobs):
 
     llmobs._delete_dataset(dataset_id=dataset._id)
 
+
 def test_dataset_pull_non_existent(llmobs):
     with pytest.raises(ValueError):
         llmobs.pull_dataset(name="test-dataset-non-existent")

From f7fbfb6ffab7bcbdca85eb53ec7212268770f8a8 Mon Sep 17 00:00:00 2001
From: Kyle Verhoog <kyle@verhoog.ca>
Date: Tue, 22 Jul 2025 08:34:28 -0400
Subject: [PATCH 03/20] chore(llmobs): fix misc experiments stuff

---
 ddtrace/llmobs/_experiment.py | 8 +++++---
 ddtrace/llmobs/_writer.py     | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py
index 055dc319d8f..a4063c5f473 100644
--- a/ddtrace/llmobs/_experiment.py
+++ b/ddtrace/llmobs/_experiment.py
@@ -217,12 +217,14 @@ def run(
         if not self._llmobs_instance.enabled:
             logger.warning(
                 "Skipping experiment as LLMObs is not enabled. "
-                "Ensure LLM Observability is enabled via `LLMObs.enable(...)` or set `DD_LLMOBS_ENABLED=1`."
+                "Ensure LLM Observability is enabled via `LLMObs.enable(...)` or set `DD_LLMOBS_ENABLED=1` and use `ddtrace-run` to run your application."
             )
             return []
-        project_id = self._llmobs_instance._dne_client.project_get(self._project_name)
-        if not project_id:
+        try:
+            project_id = self._llmobs_instance._dne_client.project_get(self._project_name)
+        except FileNotFoundError:
             project_id = self._llmobs_instance._dne_client.project_create(self._project_name)
+
         self._project_id = project_id
         experiment_id, experiment_run_name = self._llmobs_instance._dne_client.experiment_create(
             self.name,
diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py
index af27a6efdba..884f9c861b0 100644
--- a/ddtrace/llmobs/_writer.py
+++ b/ddtrace/llmobs/_writer.py
@@ -454,7 +454,7 @@ def project_get(self, name: str) -> str:
         response_data = resp.get_json()
         data = response_data["data"]
         if not data:
-            raise ValueError(f"Project {name} not found")
+            raise FileNotFoundError(f"Project {name} not found")
         return data[0]["id"]
 
     def experiment_create(

From ce17651c8866e95633517b54753ccb14b9334a64 Mon Sep 17 00:00:00 2001
From: gary-huang <garyhuang@hotmail.ca>
Date: Tue, 22 Jul 2025 08:40:04 -0400
Subject: [PATCH 04/20] black

---
 ddtrace/llmobs/_experiment.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py
index 8d7c37869b3..4a8877846b6 100644
--- a/ddtrace/llmobs/_experiment.py
+++ b/ddtrace/llmobs/_experiment.py
@@ -145,10 +145,12 @@ def delete(self, index: int) -> None:
         del self._records[index]
 
     @overload
-    def __getitem__(self, index: int) -> DatasetRecord: ...
+    def __getitem__(self, index: int) -> DatasetRecord:
+        ...
 
     @overload
-    def __getitem__(self, index: slice) -> List[DatasetRecord]: ...
+    def __getitem__(self, index: slice) -> List[DatasetRecord]:
+        ...
 
     def __getitem__(self, index: Union[int, slice]) -> Union[DatasetRecord, List[DatasetRecord]]:
         return self._records.__getitem__(index)

From 03314734e0cf5d9d68e24a5ff1780be6cc5613d6 Mon Sep 17 00:00:00 2001
From: Gary Huang <garyhuang@hotmail.ca>
Date: Tue, 22 Jul 2025 08:40:47 -0400
Subject: [PATCH 05/20] Update ddtrace/llmobs/_experiment.py

Co-authored-by: Yun Kim <35776586+Yun-Kim@users.noreply.github.com>
---
 ddtrace/llmobs/_experiment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py
index 4a8877846b6..c051d9efe0e 100644
--- a/ddtrace/llmobs/_experiment.py
+++ b/ddtrace/llmobs/_experiment.py
@@ -166,7 +166,7 @@ def as_dataframe(self):
             import pandas as pd
         except ImportError as e:
             raise ImportError(
-                "pandas is required to convert dataset to DataFrame. " "Please install it with `pip install pandas`"
+                "pandas is required to convert dataset to DataFrame. Please install via `pip install pandas`"
             ) from e
 
         column_tuples = set()

From 7bb80ed1fc020ff888dd09963a51735187b7625d Mon Sep 17 00:00:00 2001
From: Gary Huang <garyhuang@hotmail.ca>
Date: Tue, 22 Jul 2025 08:43:41 -0400
Subject: [PATCH 06/20] Update ddtrace/llmobs/_experiment.py

Co-authored-by: Yun Kim <35776586+Yun-Kim@users.noreply.github.com>
---
 ddtrace/llmobs/_experiment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py
index c051d9efe0e..8aa7fe0e019 100644
--- a/ddtrace/llmobs/_experiment.py
+++ b/ddtrace/llmobs/_experiment.py
@@ -192,7 +192,7 @@ def as_dataframe(self):
                 flat_record[("expected_output", "")] = expected_output  # Use empty string for single output
                 column_tuples.add(("expected_output", ""))
 
-            for k, v in record.get("metadata", {}):
+            for k, v in record.get("metadata", {}).items():
                 flat_record[("metadata", k)] = v
                 column_tuples.add(("metadata", k))
 

From 319e799258f4b84a4fff7c87e71d2d4508097067 Mon Sep 17 00:00:00 2001
From: gary-huang <garyhuang@hotmail.ca>
Date: Tue, 22 Jul 2025 09:04:36 -0400
Subject: [PATCH 07/20] address comments

---
 ddtrace/llmobs/_constants.py |  1 +
 ddtrace/llmobs/_llmobs.py    | 89 ++++++++++++++----------------------
 2 files changed, 36 insertions(+), 54 deletions(-)

diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py
index 5e511d4694b..217b53cf347 100644
--- a/ddtrace/llmobs/_constants.py
+++ b/ddtrace/llmobs/_constants.py
@@ -56,6 +56,7 @@
 EVP_PAYLOAD_SIZE_LIMIT = 5 << 20  # 5MB (actual limit is 5.1MB)
 EVP_EVENT_SIZE_LIMIT = (1 << 20) - 1024  # 999KB (actual limit is 1MB)
 
+EXPERIMENT_CSV_FIELD_MAX_SIZE = 10 * 1024 * 1024
 
 DROPPED_IO_COLLECTION_ERROR = "dropped_io"
 DROPPED_VALUE_TEXT = "[This value has been dropped because this span's size exceeds the 1MB size limit.]"
diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
index 00a86699dac..4ac09031ad5 100644
--- a/ddtrace/llmobs/_llmobs.py
+++ b/ddtrace/llmobs/_llmobs.py
@@ -42,13 +42,14 @@
 from ddtrace.internal.utils.formats import asbool
 from ddtrace.internal.utils.formats import format_trace_id
 from ddtrace.internal.utils.formats import parse_tags_str
-from ddtrace.llmobs import _constants as constants, DatasetRecord
+from ddtrace.llmobs import _constants as constants
 from ddtrace.llmobs import _telemetry as telemetry
 from ddtrace.llmobs._constants import ANNOTATIONS_CONTEXT_ID
 from ddtrace.llmobs._constants import DECORATOR
 from ddtrace.llmobs._constants import DISPATCH_ON_LLM_TOOL_CHOICE
 from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL
 from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL_OUTPUT_USED
+from ddtrace.llmobs._constants import EXPERIMENT_CSV_FIELD_MAX_SIZE
 from ddtrace.llmobs._constants import EXPERIMENT_ID_KEY
 from ddtrace.llmobs._constants import INPUT_DOCUMENTS
 from ddtrace.llmobs._constants import INPUT_MESSAGES
@@ -602,66 +603,46 @@ def create_dataset_from_csv(
         # Store the original field size limit to restore it later
         original_field_size_limit = csv.field_size_limit()
 
-        csv.field_size_limit(10 * 1024 * 1024)  # 10mb
+        csv.field_size_limit(EXPERIMENT_CSV_FIELD_MAX_SIZE)  # 10mb
 
         try:
-            # First check if the file exists and is not empty before parsing
             with open(csv_path, mode="r") as csvfile:
                 content = csvfile.readline().strip()
                 if not content:
                     raise ValueError("CSV file appears to be empty or header is missing.")
 
-            with open(csv_path, mode="r") as csvfile:
-                try:
-                    rows = csv.DictReader(csvfile, delimiter=csv_delimiter)
-
-                    # Check header presence before trying to read rows
-                    if rows.fieldnames is None:
-                        # Treat files with no header at all as effectively empty
-                        raise ValueError("CSV file appears to be empty or header is missing.")
-
-                    header_columns = rows.fieldnames
-                    missing_input_columns = [col for col in input_data_columns if col not in header_columns]
-                    missing_output_columns = [col for col in expected_output_columns if col not in header_columns]
-
-                    if missing_input_columns:
-                        raise ValueError(f"Input columns not found in CSV header: {missing_input_columns}")
-                    if missing_output_columns:
-                        raise ValueError(f"Expected output columns not found in CSV header: {missing_output_columns}")
-
-                    # Determine metadata columns (all columns not used for input or expected output)
-                    metadata_columns = [
-                        col
-                        for col in header_columns
-                        if col not in input_data_columns and col not in expected_output_columns
-                    ]
-
-                    for row in rows:
-                        try:
-                            ds.append(
-                                DatasetRecord(
-                                    input_data={col: row[col] for col in input_data_columns},
-                                    expected_output={col: row[col] for col in expected_output_columns},
-                                    metadata={col: row[col] for col in metadata_columns},
-                                    record_id="",
-                                )
-                            )
-
-                        except KeyError as ke:
-                            # Missing columns in a data row indicates malformed CSV
-                            raise KeyError(f"Error parsing CSV file: missing column {ke} in a row")
-                        except Exception as e:
-                            # Other errors during row processing also indicate CSV issues
-                            raise ValueError(f"Error parsing CSV file (row processing): {e}")
-
-                except csv.Error as e:
-                    # Catch CSV-specific parsing errors
-                    raise ValueError(f"Error parsing CSV file: {e}")
-
-        except FileNotFoundError as e:
-            raise FileNotFoundError(f"CSV file not found: {csv_path}") from e
-        except PermissionError as e:
-            raise PermissionError(f"Permission denied when reading CSV file: {csvfile}") from e
+                csvfile.seek(0)
+
+                rows = csv.DictReader(csvfile, delimiter=csv_delimiter)
+
+                if rows.fieldnames is None:
+                    raise ValueError("CSV file appears to be empty or header is missing.")
+
+                header_columns = rows.fieldnames
+                missing_input_columns = [col for col in input_data_columns if col not in header_columns]
+                missing_output_columns = [col for col in expected_output_columns if col not in header_columns]
+
+                if any(col not in header_columns for col in input_data_columns):
+                    raise ValueError(f"Input columns not found in CSV header: {missing_input_columns}")
+                if any(col not in header_columns for col in expected_output_columns):
+                    raise ValueError(f"Expected output columns not found in CSV header: {missing_output_columns}")
+
+                metadata_columns = [
+                    col
+                    for col in header_columns
+                    if col not in input_data_columns and col not in expected_output_columns
+                ]
+
+                for row in rows:
+                    ds.append(
+                        DatasetRecord(
+                            input_data={col: row[col] for col in input_data_columns},
+                            expected_output={col: row[col] for col in expected_output_columns},
+                            metadata={col: row[col] for col in metadata_columns},
+                            record_id="",
+                        )
+                    )
+
         finally:
             # Always restore the original field size limit
             csv.field_size_limit(original_field_size_limit)

From 54247181ec95428d021c02bd9226bd4fb4cd3dcd Mon Sep 17 00:00:00 2001
From: Kyle Verhoog <kyle@verhoog.ca>
Date: Tue, 22 Jul 2025 09:05:18 -0400
Subject: [PATCH 08/20] fmt

---
 ddtrace/llmobs/_experiment.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py
index a4063c5f473..d6e724492b4 100644
--- a/ddtrace/llmobs/_experiment.py
+++ b/ddtrace/llmobs/_experiment.py
@@ -217,7 +217,8 @@ def run(
         if not self._llmobs_instance.enabled:
             logger.warning(
                 "Skipping experiment as LLMObs is not enabled. "
-                "Ensure LLM Observability is enabled via `LLMObs.enable(...)` or set `DD_LLMOBS_ENABLED=1` and use `ddtrace-run` to run your application."
+                "Ensure LLM Observability is enabled via `LLMObs.enable(...)` "
+                "or set `DD_LLMOBS_ENABLED=1` and use `ddtrace-run` to run your application."
             )
             return []
         try:

From 5763329eb038bf6f5def77f9c5dd6ff8edae3d3f Mon Sep 17 00:00:00 2001
From: gary-huang <garyhuang@hotmail.ca>
Date: Tue, 22 Jul 2025 13:30:23 -0400
Subject: [PATCH 09/20] revert create_dataset push, this will be done in
 another PR

---
 ddtrace/llmobs/_llmobs.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
index 1e3d516624c..580036d5546 100644
--- a/ddtrace/llmobs/_llmobs.py
+++ b/ddtrace/llmobs/_llmobs.py
@@ -594,6 +594,8 @@ def create_dataset(cls, name: str, description: str, records: List[DatasetRecord
         ds = cls._instance._dne_client.dataset_create(name, description)
         for r in records:
             ds.append(r)
+        if len(records) > 0:
+            ds.push()
         return ds
 
     @classmethod

From dc389eab57cca66f8cf2cc1ea044048fb7e8a98a Mon Sep 17 00:00:00 2001
From: gary-huang <garyhuang@hotmail.ca>
Date: Tue, 22 Jul 2025 19:25:57 -0400
Subject: [PATCH 10/20] add more tests

---
 ddtrace/llmobs/_llmobs.py                     |  9 +-
 tests/llmobs/static_files/empty.csv           |  0
 .../good_dataset_pipe_separated.csv           |  3 +
 tests/llmobs/test_experiments.py              | 92 ++++++++++++++++++-
 4 files changed, 97 insertions(+), 7 deletions(-)
 create mode 100644 tests/llmobs/static_files/empty.csv
 create mode 100644 tests/llmobs/static_files/good_dataset_pipe_separated.csv

diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
index 580036d5546..fa190c182c7 100644
--- a/ddtrace/llmobs/_llmobs.py
+++ b/ddtrace/llmobs/_llmobs.py
@@ -632,17 +632,14 @@ def create_dataset_from_csv(
                 header_columns = rows.fieldnames
                 missing_input_columns = [col for col in input_data_columns if col not in header_columns]
                 missing_output_columns = [col for col in expected_output_columns if col not in header_columns]
+                missing_metadata_columns = [col for col in metadata_columns if col not in header_columns]
 
                 if any(col not in header_columns for col in input_data_columns):
                     raise ValueError(f"Input columns not found in CSV header: {missing_input_columns}")
                 if any(col not in header_columns for col in expected_output_columns):
                     raise ValueError(f"Expected output columns not found in CSV header: {missing_output_columns}")
-
-                metadata_columns = [
-                    col
-                    for col in header_columns
-                    if col not in input_data_columns and col not in expected_output_columns
-                ]
+                if any(col not in header_columns for col in metadata_columns):
+                    raise ValueError(f"Metadata columns not found in CSV header: {missing_metadata_columns}")
 
                 for row in rows:
                     ds.append(
diff --git a/tests/llmobs/static_files/empty.csv b/tests/llmobs/static_files/empty.csv
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/llmobs/static_files/good_dataset_pipe_separated.csv b/tests/llmobs/static_files/good_dataset_pipe_separated.csv
new file mode 100644
index 00000000000..7a5a698efe1
--- /dev/null
+++ b/tests/llmobs/static_files/good_dataset_pipe_separated.csv
@@ -0,0 +1,3 @@
+in0|in1|in2|out0|out1|m0
+r0v1|r0v2|r0v3|r0v4|r0v5|r0v6
+r1v1|r1v2|r1v3|r1v4|r1v5|r1v6
diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py
index 518503e889d..24b84145d98 100644
--- a/tests/llmobs/test_experiments.py
+++ b/tests/llmobs/test_experiments.py
@@ -95,8 +95,65 @@ def test_dataset_as_dataframe(llmobs):
         )
     ]
     df = dataset.as_dataframe()
+    assert len(df.columns) == 2
+    assert df.size == 2 # size is the total number of elements (rows * columns) in a DataFrame
     llmobs._delete_dataset(dataset_id=dataset._id)
 
+def test_csv_dataset_as_dataframe(llmobs):
+    test_path = os.path.dirname(__file__)
+    csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
+    dataset = llmobs.create_dataset_from_csv(
+        csv_path=csv_path,
+        dataset_name="test-dataset-good-csv",
+        description="A good csv dataset",
+        input_data_columns=["in0", "in1", "in2"],
+        expected_output_columns=["out0", "out1"],
+        metadata_columns=["m0"]
+    )
+    assert len(dataset) == 2
+
+    df = dataset.as_dataframe()
+    assert len(df.columns) == 6
+    assert sorted(df.columns) == [("expected_output", "out0"), ("expected_output", "out1"), ("input_data", "in0"), ("input_data", "in1"), ("input_data", "in2"), ("metadata", "m0")]
+
+    llmobs._delete_dataset(dataset_id=dataset._id)
+
+
+def test_dataset_csv_missing_input_col(llmobs):
+    test_path = os.path.dirname(__file__)
+    csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
+    with pytest.raises(ValueError, match=re.escape("Input columns not found in CSV header: [\'in998\', \'in999\']")):
+        llmobs.create_dataset_from_csv(
+            csv_path=csv_path,
+            dataset_name="test-dataset-good-csv",
+            description="A good csv dataset",
+            input_data_columns=["in998", "in999"],
+            expected_output_columns=["out0", "out1"],
+        )
+
+def test_dataset_csv_missing_output_col(llmobs):
+    test_path = os.path.dirname(__file__)
+    csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
+    with pytest.raises(ValueError, match=re.escape("Expected output columns not found in CSV header: [\'out999\']")):
+        llmobs.create_dataset_from_csv(
+            csv_path=csv_path,
+            dataset_name="test-dataset-good-csv",
+            description="A good csv dataset",
+            input_data_columns=["in0", "in1", "in2"],
+            expected_output_columns=["out999"],
+        )
+
+def test_dataset_csv_empty_csv(llmobs):
+    test_path = os.path.dirname(__file__)
+    csv_path = os.path.join(test_path, "static_files/empty.csv")
+    with pytest.raises(ValueError, match=re.escape("CSV file appears to be empty or header is missing.")):
+        llmobs.create_dataset_from_csv(
+            csv_path=csv_path,
+            dataset_name="test-dataset-bad-csv",
+            description="not a real csv dataset",
+            input_data_columns=["in0", "in1", "in2"],
+            expected_output_columns=["out0"],
+        )
 
 def test_dataset_csv(llmobs):
     test_path = os.path.dirname(__file__)
@@ -123,12 +180,45 @@ def test_dataset_csv(llmobs):
     assert dataset[1]["expected_output"]["out0"] == "r1v4"
     assert dataset[1]["expected_output"]["out1"] == "r1v5"
 
+    assert dataset.description == "A good csv dataset"
+
+    llmobs._delete_dataset(dataset_id=dataset._id)
+
+
+def test_dataset_csv_pipe_separated(llmobs):
+    test_path = os.path.dirname(__file__)
+    csv_path = os.path.join(test_path, "static_files/good_dataset_pipe_separated.csv")
+    dataset = llmobs.create_dataset_from_csv(
+        csv_path=csv_path,
+        dataset_name="test-dataset-good-csv-pipe",
+        description="A good pipe separated csv dataset",
+        input_data_columns=["in0", "in1", "in2"],
+        expected_output_columns=["out0", "out1"],
+        metadata_columns=["m0"],
+        csv_delimiter="|",
+    )
+    assert len(dataset) == 2
+    assert len(dataset[0]["input_data"]) == 3
+    assert dataset[0]["input_data"]["in0"] == "r0v1"
+    assert dataset[0]["input_data"]["in1"] == "r0v2"
+    assert dataset[0]["input_data"]["in2"] == "r0v3"
+    assert dataset[1]["input_data"]["in0"] == "r1v1"
+    assert dataset[1]["input_data"]["in1"] == "r1v2"
+    assert dataset[1]["input_data"]["in2"] == "r1v3"
+
+    assert len(dataset[0]["expected_output"]) == 2
+    assert dataset[0]["expected_output"]["out0"] == "r0v4"
+    assert dataset[0]["expected_output"]["out1"] == "r0v5"
+    assert dataset[1]["expected_output"]["out0"] == "r1v4"
+    assert dataset[1]["expected_output"]["out1"] == "r1v5"
+
     assert len(dataset[0]["metadata"]) == 1
     assert dataset[0]["metadata"]["m0"] == "r0v6"
     assert dataset[1]["metadata"]["m0"] == "r1v6"
 
-    llmobs._delete_dataset(dataset_id=dataset._id)
+    assert dataset.description == "A good pipe separated csv dataset"
 
+    llmobs._delete_dataset(dataset_id=dataset._id)
 
 def test_dataset_pull_non_existent(llmobs):
     with pytest.raises(ValueError):

From ebc060ecf67f96e00c07d577cb5272e42f4e72b7 Mon Sep 17 00:00:00 2001
From: gary-huang <garyhuang@hotmail.ca>
Date: Tue, 22 Jul 2025 22:34:32 -0400
Subject: [PATCH 11/20] wip push on create csv

---
 ddtrace/llmobs/_llmobs.py        |  2 ++
 tests/llmobs/test_experiments.py | 10 ++++++++++
 2 files changed, 12 insertions(+)

diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
index a30cbb07110..4043fabbd40 100644
--- a/ddtrace/llmobs/_llmobs.py
+++ b/ddtrace/llmobs/_llmobs.py
@@ -656,6 +656,8 @@ def create_dataset_from_csv(
             # Always restore the original field size limit
             csv.field_size_limit(original_field_size_limit)
 
+        if len(ds) > 0:
+            ds.push()
         return ds
 
     @classmethod
diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py
index 7466f15635b..f36cdf32166 100644
--- a/tests/llmobs/test_experiments.py
+++ b/tests/llmobs/test_experiments.py
@@ -182,6 +182,16 @@ def test_dataset_csv(llmobs):
 
     assert dataset.description == "A good csv dataset"
 
+    assert dataset._id is not None
+
+    wait_for_backend()
+    ds = llmobs.pull_dataset(name=dataset.name)
+
+    assert len(ds) == len(dataset)
+    assert ds.name == dataset.name
+    assert ds.description == dataset.description
+    assert ds._version == 1
+
     llmobs._delete_dataset(dataset_id=dataset._id)
 
 

From 4c9087289f3ce366774c434006c16d64a5484340 Mon Sep 17 00:00:00 2001
From: gary-huang <garyhuang@hotmail.ca>
Date: Tue, 22 Jul 2025 22:34:32 -0400
Subject: [PATCH 12/20] push on create csv

---
 ...9cb46e5ed3_batch_update_post_466da69c.yaml | 49 +++++++++++++++++++
 ...05e-1d9cb46e5ed3_records_get_46238641.yaml | 45 +++++++++++++++++
 ...b33f55d51d_batch_update_post_b8c84073.yaml | 49 +++++++++++++++++++
 ...bbf-95b33f55d51d_records_get_d7e731ad.yaml | 45 +++++++++++++++++
 ...-obs_v1_datasets_delete_post_d7976483.yaml | 46 +++++++++++++++++
 ...-obs_v1_datasets_delete_post_f8dc510e.yaml | 46 +++++++++++++++++
 ...st-dataset-good-csv-pipe_get_bcb704ce.yaml | 46 +++++++++++++++++
 ...e__test-dataset-good-csv_get_989b2028.yaml | 46 +++++++++++++++++
 ...ble_llm-obs_v1_datasets_post_027be704.yaml | 46 +++++++++++++++++
 ...ble_llm-obs_v1_datasets_post_c1d4ae31.yaml | 46 +++++++++++++++++
 ...ble_llm-obs_v1_datasets_post_d59b5313.yaml | 46 +++++++++++++++++
 tests/llmobs/test_experiments.py              | 10 ++++
 12 files changed, 520 insertions(+)
 create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_466da69c.yaml
 create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_records_get_46238641.yaml
 create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_batch_update_post_b8c84073.yaml
 create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_records_get_d7e731ad.yaml
 create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_d7976483.yaml
 create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_f8dc510e.yaml
 create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv-pipe_get_bcb704ce.yaml
 create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv_get_989b2028.yaml
 create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_027be704.yaml
 create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_c1d4ae31.yaml
 create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_d59b5313.yaml

diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_466da69c.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_466da69c.yaml
new file mode 100644
index 00000000000..81adf342dcc
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_466da69c.yaml
@@ -0,0 +1,49 @@
+interactions:
+- request:
+    body: '{"data": {"type": "datasets", "id": "8dbba503-cf48-4e82-805e-1d9cb46e5ed3",
+      "attributes": {"insert_records": [{"input": {"in0": "r0v1", "in1": "r0v2", "in2":
+      "r0v3"}, "expected_output": {"out0": "r0v4", "out1": "r0v5"}, "metadata": {}},
+      {"input": {"in0": "r1v1", "in1": "r1v2", "in2": "r1v3"}, "expected_output":
+      {"out0": "r1v4", "out1": "r1v5"}, "metadata": {}}], "update_records": [], "delete_records":
+      []}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '410'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/8dbba503-cf48-4e82-805e-1d9cb46e5ed3/batch_update
+  response:
+    body:
+      string: '{"data":[{"id":"79dc4c72-e4fd-4de7-b46e-419b8072a207","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:48.152449869Z","dataset_id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","expected_output":{"out0":"r0v4","out1":"r0v5"},"input":{"in0":"r0v1","in1":"r0v2","in2":"r0v3"},"metadata":{},"updated_at":"2025-07-23T02:50:48.152449869Z","version":1}},{"id":"f0f3fe26-1060-452f-a292-30b1293dd5a1","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:48.152449869Z","dataset_id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","expected_output":{"out0":"r1v4","out1":"r1v5"},"input":{"in0":"r1v1","in1":"r1v2","in2":"r1v3"},"metadata":{},"updated_at":"2025-07-23T02:50:48.152449869Z","version":1}}]}'
+    headers:
+      content-length:
+      - '812'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:48 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_records_get_46238641.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_records_get_46238641.yaml
new file mode 100644
index 00000000000..f8b4fdefe78
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_records_get_46238641.yaml
@@ -0,0 +1,45 @@
+interactions:
+- request:
+    body: null
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Length
+      : - '0'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: GET
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/8dbba503-cf48-4e82-805e-1d9cb46e5ed3/records
+  response:
+    body:
+      string: '{"data":[{"id":"79dc4c72-e4fd-4de7-b46e-419b8072a207","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:48.152449Z","dataset_id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","expected_output":{"out0":"r0v4","out1":"r0v5"},"input":{"in0":"r0v1","in1":"r0v2","in2":"r0v3"},"metadata":{},"updated_at":"2025-07-23T02:50:48.152449Z"}},{"id":"f0f3fe26-1060-452f-a292-30b1293dd5a1","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:48.152449Z","dataset_id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","expected_output":{"out0":"r1v4","out1":"r1v5"},"input":{"in0":"r1v1","in1":"r1v2","in2":"r1v3"},"metadata":{},"updated_at":"2025-07-23T02:50:48.152449Z"}}],"meta":{"after":""}}'
+    headers:
+      content-length:
+      - '796'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:52 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_batch_update_post_b8c84073.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_batch_update_post_b8c84073.yaml
new file mode 100644
index 00000000000..eb8a50f0d92
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_batch_update_post_b8c84073.yaml
@@ -0,0 +1,49 @@
+interactions:
+- request:
+    body: '{"data": {"type": "datasets", "id": "acf19ca4-8062-4548-abbf-95b33f55d51d",
+      "attributes": {"insert_records": [{"input": {"in0": "r0v1", "in1": "r0v2", "in2":
+      "r0v3"}, "expected_output": {"out0": "r0v4", "out1": "r0v5"}, "metadata": {"m0":
+      "r0v6"}}, {"input": {"in0": "r1v1", "in1": "r1v2", "in2": "r1v3"}, "expected_output":
+      {"out0": "r1v4", "out1": "r1v5"}, "metadata": {"m0": "r1v6"}}], "update_records":
+      [], "delete_records": []}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '434'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/acf19ca4-8062-4548-abbf-95b33f55d51d/batch_update
+  response:
+    body:
+      string: '{"data":[{"id":"016350fb-8b44-4d5d-af57-0c8408bb5d25","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.795124332Z","dataset_id":"acf19ca4-8062-4548-abbf-95b33f55d51d","expected_output":{"out0":"r0v4","out1":"r0v5"},"input":{"in0":"r0v1","in1":"r0v2","in2":"r0v3"},"metadata":{"m0":"r0v6"},"updated_at":"2025-07-23T02:50:52.795124332Z","version":1}},{"id":"91fcfd23-040c-40ad-a0b0-f497f10b41e3","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.795124332Z","dataset_id":"acf19ca4-8062-4548-abbf-95b33f55d51d","expected_output":{"out0":"r1v4","out1":"r1v5"},"input":{"in0":"r1v1","in1":"r1v2","in2":"r1v3"},"metadata":{"m0":"r1v6"},"updated_at":"2025-07-23T02:50:52.795124332Z","version":1}}]}'
+    headers:
+      content-length:
+      - '834'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:52 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_records_get_d7e731ad.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_records_get_d7e731ad.yaml
new file mode 100644
index 00000000000..51d0723acfa
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_records_get_d7e731ad.yaml
@@ -0,0 +1,45 @@
+interactions:
+- request:
+    body: null
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Length
+      : - '0'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: GET
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/acf19ca4-8062-4548-abbf-95b33f55d51d/records
+  response:
+    body:
+      string: '{"data":[{"id":"016350fb-8b44-4d5d-af57-0c8408bb5d25","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.795124Z","dataset_id":"acf19ca4-8062-4548-abbf-95b33f55d51d","expected_output":{"out0":"r0v4","out1":"r0v5"},"input":{"in0":"r0v1","in1":"r0v2","in2":"r0v3"},"metadata":{"m0":"r0v6"},"updated_at":"2025-07-23T02:50:52.795124Z"}},{"id":"91fcfd23-040c-40ad-a0b0-f497f10b41e3","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.795124Z","dataset_id":"acf19ca4-8062-4548-abbf-95b33f55d51d","expected_output":{"out0":"r1v4","out1":"r1v5"},"input":{"in0":"r1v1","in1":"r1v2","in2":"r1v3"},"metadata":{"m0":"r1v6"},"updated_at":"2025-07-23T02:50:52.795124Z"}}],"meta":{"after":""}}'
+    headers:
+      content-length:
+      - '818'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:55 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_d7976483.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_d7976483.yaml
new file mode 100644
index 00000000000..698ca4baa2d
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_d7976483.yaml
@@ -0,0 +1,46 @@
+interactions:
+- request:
+    body: '{"data": {"type": "datasets", "attributes": {"type": "soft", "dataset_ids":
+      ["8dbba503-cf48-4e82-805e-1d9cb46e5ed3"]}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '119'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/delete
+  response:
+    body:
+      string: '{"data":[{"id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:46.680898Z","current_version":1,"deleted_at":"2025-07-23T02:50:52.35158Z","description":"A
+        good csv dataset","name":"test-dataset-good-csv","updated_at":"2025-07-23T02:50:48.30675Z"}}]}'
+    headers:
+      content-length:
+      - '357'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:52 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_f8dc510e.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_f8dc510e.yaml
new file mode 100644
index 00000000000..10725a15b17
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_f8dc510e.yaml
@@ -0,0 +1,46 @@
+interactions:
+- request:
+    body: '{"data": {"type": "datasets", "attributes": {"type": "soft", "dataset_ids":
+      ["acf19ca4-8062-4548-abbf-95b33f55d51d"]}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '119'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/delete
+  response:
+    body:
+      string: '{"data":[{"id":"acf19ca4-8062-4548-abbf-95b33f55d51d","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.663543Z","current_version":1,"deleted_at":"2025-07-23T02:50:55.361839Z","description":"A
+        good pipe separated csv dataset","name":"test-dataset-good-csv-pipe","updated_at":"2025-07-23T02:50:52.943101Z"}}]}'
+    headers:
+      content-length:
+      - '379'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:55 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv-pipe_get_bcb704ce.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv-pipe_get_bcb704ce.yaml
new file mode 100644
index 00000000000..032e3c997f3
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv-pipe_get_bcb704ce.yaml
@@ -0,0 +1,46 @@
+interactions:
+- request:
+    body: null
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Length
+      : - '0'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: GET
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets?filter%5Bname%5D=test-dataset-good-csv-pipe
+  response:
+    body:
+      string: '{"data":[{"id":"acf19ca4-8062-4548-abbf-95b33f55d51d","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.663543Z","current_version":1,"description":"A
+        good pipe separated csv dataset","name":"test-dataset-good-csv-pipe","updated_at":"2025-07-23T02:50:52.943101Z"}}],"meta":{"after":""}}'
+    headers:
+      content-length:
+      - '356'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:55 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv_get_989b2028.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv_get_989b2028.yaml
new file mode 100644
index 00000000000..02e7ba102ae
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv_get_989b2028.yaml
@@ -0,0 +1,46 @@
+interactions:
+- request:
+    body: null
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Length
+      : - '0'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: GET
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets?filter%5Bname%5D=test-dataset-good-csv
+  response:
+    body:
+      string: '{"data":[{"id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:46.680898Z","current_version":1,"description":"A
+        good csv dataset","name":"test-dataset-good-csv","updated_at":"2025-07-23T02:50:48.30675Z"}}],"meta":{"after":""}}'
+    headers:
+      content-length:
+      - '335'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:51 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_027be704.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_027be704.yaml
new file mode 100644
index 00000000000..232be655b93
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_027be704.yaml
@@ -0,0 +1,46 @@
+interactions:
+- request:
+    body: '{"data": {"type": "datasets", "attributes": {"name": "test-dataset-good-csv-pipe",
+      "description": "A good pipe separated csv dataset"}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '136'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets
+  response:
+    body:
+      string: '{"data":{"id":"acf19ca4-8062-4548-abbf-95b33f55d51d","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.663543594Z","current_version":0,"description":"A
+        good pipe separated csv dataset","name":"test-dataset-good-csv-pipe","updated_at":"2025-07-23T02:50:52.663543594Z"}}}'
+    headers:
+      content-length:
+      - '340'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:52 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_c1d4ae31.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_c1d4ae31.yaml
new file mode 100644
index 00000000000..de032063c0c
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_c1d4ae31.yaml
@@ -0,0 +1,46 @@
+interactions:
+- request:
+    body: '{"data": {"type": "datasets", "attributes": {"name": "test-dataset-good-csv",
+      "description": "A good csv dataset"}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '116'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets
+  response:
+    body:
+      string: '{"data":{"id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:46.680898186Z","current_version":0,"description":"A
+        good csv dataset","name":"test-dataset-good-csv","updated_at":"2025-07-23T02:50:46.680898186Z"}}}'
+    headers:
+      content-length:
+      - '320'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:46 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_d59b5313.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_d59b5313.yaml
new file mode 100644
index 00000000000..07a85cd5c60
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_d59b5313.yaml
@@ -0,0 +1,46 @@
+interactions:
+- request:
+    body: '{"data": {"type": "datasets", "attributes": {"name": "test-dataset-bad-csv",
+      "description": "not a real csv dataset"}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '119'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets
+  response:
+    body:
+      string: '{"data":{"id":"6890ba6e-8023-414c-bdb1-662c63f9f489","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-22T23:10:06.524865Z","current_version":0,"description":"not
+        a real csv dataset","name":"test-dataset-bad-csv","updated_at":"2025-07-22T23:10:06.524865Z"}}}'
+    headers:
+      content-length:
+      - '317'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:47 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py
index f36cdf32166..74b774ad30d 100644
--- a/tests/llmobs/test_experiments.py
+++ b/tests/llmobs/test_experiments.py
@@ -228,6 +228,16 @@ def test_dataset_csv_pipe_separated(llmobs):
 
     assert dataset.description == "A good pipe separated csv dataset"
 
+    assert dataset._id is not None
+
+    wait_for_backend()
+    ds = llmobs.pull_dataset(name=dataset.name)
+
+    assert len(ds) == len(dataset)
+    assert ds.name == dataset.name
+    assert ds.description == dataset.description
+    assert ds._version == 1
+
     llmobs._delete_dataset(dataset_id=dataset._id)
 
 def test_dataset_pull_non_existent(llmobs):

From 772c544fa117df691a63f0273b71c9f4684633a8 Mon Sep 17 00:00:00 2001
From: gary-huang <garyhuang@hotmail.ca>
Date: Tue, 22 Jul 2025 22:53:40 -0400
Subject: [PATCH 13/20] restore setup

---
 setup.cfg | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/setup.cfg b/setup.cfg
index 501d7d10115..a62b967bb25 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -6,6 +6,19 @@ skip = *.json,*.h,*.cpp,*.c,.riot,.tox,.mypy_cache,.git,*ddtrace/vendor,tests/co
 exclude-file = .codespellignorelines
 ignore-words-list = asend,dne,fo,medias,ment,nin,ot,setttings,statics,ba,spawnve,doas
 
+# DEV: We use `conftest.py` as a local pytest plugin to configure hooks for collection
+[tool:pytest]
+# --cov-report is intentionally empty else pytest-cov will default to generating a report
+addopts =
+  --cov=ddtrace/
+  --cov=tests/
+  --cov-append
+  --cov-report=
+  --durations=10
+  --junitxml=test-results/junit.xml
+# DEV: The default is `test_*\.py` which will miss `test.py` files
+python_files = test*\.py
+asyncio_mode = auto
 
 [flake8]
 max-line-length = 120

From 59b993779ae718220149ff33687238fd5ad543e4 Mon Sep 17 00:00:00 2001
From: gary-huang <garyhuang@hotmail.ca>
Date: Tue, 22 Jul 2025 22:54:34 -0400
Subject: [PATCH 14/20] black

---
 tests/llmobs/test_experiments.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py
index 74b774ad30d..27cba20bc7a 100644
--- a/tests/llmobs/test_experiments.py
+++ b/tests/llmobs/test_experiments.py
@@ -96,9 +96,10 @@ def test_dataset_as_dataframe(llmobs):
     ]
     df = dataset.as_dataframe()
     assert len(df.columns) == 2
-    assert df.size == 2 # size is num elements in a series
+    assert df.size == 2  # size is num elements in a series
     llmobs._delete_dataset(dataset_id=dataset._id)
 
+
 def test_csv_dataset_as_dataframe(llmobs):
     test_path = os.path.dirname(__file__)
     csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
@@ -108,13 +109,20 @@ def test_csv_dataset_as_dataframe(llmobs):
         description="A good csv dataset",
         input_data_columns=["in0", "in1", "in2"],
         expected_output_columns=["out0", "out1"],
-        metadata_columns=["m0"]
+        metadata_columns=["m0"],
     )
     assert len(dataset) == 2
 
     df = dataset.as_dataframe()
     assert len(df.columns) == 6
-    assert sorted(df.columns) == [("expected_output", "out0"), ("expected_output", "out1"), ("input_data", "in0"), ("input_data", "in1"), ("input_data", "in2"), ("metadata", "m0")]
+    assert sorted(df.columns) == [
+        ("expected_output", "out0"),
+        ("expected_output", "out1"),
+        ("input_data", "in0"),
+        ("input_data", "in1"),
+        ("input_data", "in2"),
+        ("metadata", "m0"),
+    ]
 
     llmobs._delete_dataset(dataset_id=dataset._id)
 
@@ -122,7 +130,7 @@ def test_csv_dataset_as_dataframe(llmobs):
 def test_dataset_csv_missing_input_col(llmobs):
     test_path = os.path.dirname(__file__)
     csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
-    with pytest.raises(ValueError, match=re.escape("Input columns not found in CSV header: [\'in998\', \'in999\']")):
+    with pytest.raises(ValueError, match=re.escape("Input columns not found in CSV header: ['in998', 'in999']")):
         llmobs.create_dataset_from_csv(
             csv_path=csv_path,
             dataset_name="test-dataset-good-csv",
@@ -131,10 +139,11 @@ def test_dataset_csv_missing_input_col(llmobs):
             expected_output_columns=["out0", "out1"],
         )
 
+
 def test_dataset_csv_missing_output_col(llmobs):
     test_path = os.path.dirname(__file__)
     csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
-    with pytest.raises(ValueError, match=re.escape("Expected output columns not found in CSV header: [\'out999\']")):
+    with pytest.raises(ValueError, match=re.escape("Expected output columns not found in CSV header: ['out999']")):
         llmobs.create_dataset_from_csv(
             csv_path=csv_path,
             dataset_name="test-dataset-good-csv",
@@ -143,6 +152,7 @@ def test_dataset_csv_missing_output_col(llmobs):
             expected_output_columns=["out999"],
         )
 
+
 def test_dataset_csv_empty_csv(llmobs):
     test_path = os.path.dirname(__file__)
     csv_path = os.path.join(test_path, "static_files/empty.csv")
@@ -155,6 +165,7 @@ def test_dataset_csv_empty_csv(llmobs):
             expected_output_columns=["out0"],
         )
 
+
 def test_dataset_csv(llmobs):
     test_path = os.path.dirname(__file__)
     csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
@@ -240,6 +251,7 @@ def test_dataset_csv_pipe_separated(llmobs):
 
     llmobs._delete_dataset(dataset_id=dataset._id)
 
+
 def test_dataset_pull_non_existent(llmobs):
     with pytest.raises(ValueError):
         llmobs.pull_dataset(name="test-dataset-non-existent")

From 2ca92fe551113cb29b3e19d7a3f5e3bc6b088157 Mon Sep 17 00:00:00 2001
From: gary-huang <garyhuang@hotmail.ca>
Date: Wed, 23 Jul 2025 09:26:36 -0400
Subject: [PATCH 15/20] address comments

---
 ddtrace/llmobs/_experiment.py    |   2 +-
 tests/llmobs/test_experiments.py | 219 ++++++++++++++++---------------
 2 files changed, 113 insertions(+), 108 deletions(-)

diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py
index 6c0781e0491..3eda6192a61 100644
--- a/ddtrace/llmobs/_experiment.py
+++ b/ddtrace/llmobs/_experiment.py
@@ -171,7 +171,7 @@ def __len__(self) -> int:
     def __iter__(self) -> Iterator[DatasetRecord]:
         return iter(self._records)
 
-    def as_dataframe(self):
+    def as_dataframe(self) -> None:
         try:
             import pandas as pd
         except ImportError as e:
diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py
index 27cba20bc7a..163e50a27fe 100644
--- a/tests/llmobs/test_experiments.py
+++ b/tests/llmobs/test_experiments.py
@@ -86,45 +86,42 @@ def test_dataset_create_delete(llmobs):
     llmobs._delete_dataset(dataset_id=dataset._id)
 
 
-def test_dataset_as_dataframe(llmobs):
-    dataset = llmobs.create_dataset(name="test-dataset-3", description="A third test dataset")
-    dataset._records = [
-        DatasetRecord(
-            input_data=[{"role": "system", "content": "i am machine"}, {"role": "user", "content": "hello"}],
-            expected_output="label",
-        )
-    ]
+def test_dataset_as_dataframe(llmobs, test_dataset_one_record):
+    dataset = test_dataset_one_record
     df = dataset.as_dataframe()
     assert len(df.columns) == 2
     assert df.size == 2  # size is num elements in a series
-    llmobs._delete_dataset(dataset_id=dataset._id)
 
 
 def test_csv_dataset_as_dataframe(llmobs):
     test_path = os.path.dirname(__file__)
     csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
-    dataset = llmobs.create_dataset_from_csv(
-        csv_path=csv_path,
-        dataset_name="test-dataset-good-csv",
-        description="A good csv dataset",
-        input_data_columns=["in0", "in1", "in2"],
-        expected_output_columns=["out0", "out1"],
-        metadata_columns=["m0"],
-    )
-    assert len(dataset) == 2
-
-    df = dataset.as_dataframe()
-    assert len(df.columns) == 6
-    assert sorted(df.columns) == [
-        ("expected_output", "out0"),
-        ("expected_output", "out1"),
-        ("input_data", "in0"),
-        ("input_data", "in1"),
-        ("input_data", "in2"),
-        ("metadata", "m0"),
-    ]
-
-    llmobs._delete_dataset(dataset_id=dataset._id)
+    dataset_id = None
+    try:
+        dataset = llmobs.create_dataset_from_csv(
+            csv_path=csv_path,
+            dataset_name="test-dataset-good-csv",
+            description="A good csv dataset",
+            input_data_columns=["in0", "in1", "in2"],
+            expected_output_columns=["out0", "out1"],
+            metadata_columns=["m0"],
+        )
+        dataset_id = dataset._id
+        assert len(dataset) == 2
+
+        df = dataset.as_dataframe()
+        assert len(df.columns) == 6
+        assert sorted(df.columns) == [
+            ("expected_output", "out0"),
+            ("expected_output", "out1"),
+            ("input_data", "in0"),
+            ("input_data", "in1"),
+            ("input_data", "in2"),
+            ("metadata", "m0"),
+        ]
+    finally:
+        if dataset_id:
+            llmobs._delete_dataset(dataset_id=dataset_id)
 
 
 def test_dataset_csv_missing_input_col(llmobs):
@@ -169,87 +166,95 @@ def test_dataset_csv_empty_csv(llmobs):
 def test_dataset_csv(llmobs):
     test_path = os.path.dirname(__file__)
     csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
-    dataset = llmobs.create_dataset_from_csv(
-        csv_path=csv_path,
-        dataset_name="test-dataset-good-csv",
-        description="A good csv dataset",
-        input_data_columns=["in0", "in1", "in2"],
-        expected_output_columns=["out0", "out1"],
-    )
-    assert len(dataset) == 2
-    assert len(dataset[0]["input_data"]) == 3
-    assert dataset[0]["input_data"]["in0"] == "r0v1"
-    assert dataset[0]["input_data"]["in1"] == "r0v2"
-    assert dataset[0]["input_data"]["in2"] == "r0v3"
-    assert dataset[1]["input_data"]["in0"] == "r1v1"
-    assert dataset[1]["input_data"]["in1"] == "r1v2"
-    assert dataset[1]["input_data"]["in2"] == "r1v3"
-
-    assert len(dataset[0]["expected_output"]) == 2
-    assert dataset[0]["expected_output"]["out0"] == "r0v4"
-    assert dataset[0]["expected_output"]["out1"] == "r0v5"
-    assert dataset[1]["expected_output"]["out0"] == "r1v4"
-    assert dataset[1]["expected_output"]["out1"] == "r1v5"
-
-    assert dataset.description == "A good csv dataset"
-
-    assert dataset._id is not None
-
-    wait_for_backend()
-    ds = llmobs.pull_dataset(name=dataset.name)
-
-    assert len(ds) == len(dataset)
-    assert ds.name == dataset.name
-    assert ds.description == dataset.description
-    assert ds._version == 1
-
-    llmobs._delete_dataset(dataset_id=dataset._id)
+    dataset_id = None
+    try:
+        dataset = llmobs.create_dataset_from_csv(
+            csv_path=csv_path,
+            dataset_name="test-dataset-good-csv",
+            description="A good csv dataset",
+            input_data_columns=["in0", "in1", "in2"],
+            expected_output_columns=["out0", "out1"],
+        )
+        dataset_id = dataset._id
+        assert len(dataset) == 2
+        assert len(dataset[0]["input_data"]) == 3
+        assert dataset[0]["input_data"]["in0"] == "r0v1"
+        assert dataset[0]["input_data"]["in1"] == "r0v2"
+        assert dataset[0]["input_data"]["in2"] == "r0v3"
+        assert dataset[1]["input_data"]["in0"] == "r1v1"
+        assert dataset[1]["input_data"]["in1"] == "r1v2"
+        assert dataset[1]["input_data"]["in2"] == "r1v3"
+
+        assert len(dataset[0]["expected_output"]) == 2
+        assert dataset[0]["expected_output"]["out0"] == "r0v4"
+        assert dataset[0]["expected_output"]["out1"] == "r0v5"
+        assert dataset[1]["expected_output"]["out0"] == "r1v4"
+        assert dataset[1]["expected_output"]["out1"] == "r1v5"
+
+        assert dataset.description == "A good csv dataset"
+
+        assert dataset._id is not None
+
+        wait_for_backend()
+        ds = llmobs.pull_dataset(name=dataset.name)
+
+        assert len(ds) == len(dataset)
+        assert ds.name == dataset.name
+        assert ds.description == dataset.description
+        assert ds._version == 1
+    finally:
+        if dataset_id:
+            llmobs._delete_dataset(dataset_id=dataset_id)
 
 
 def test_dataset_csv_pipe_separated(llmobs):
     test_path = os.path.dirname(__file__)
     csv_path = os.path.join(test_path, "static_files/good_dataset_pipe_separated.csv")
-    dataset = llmobs.create_dataset_from_csv(
-        csv_path=csv_path,
-        dataset_name="test-dataset-good-csv-pipe",
-        description="A good pipe separated csv dataset",
-        input_data_columns=["in0", "in1", "in2"],
-        expected_output_columns=["out0", "out1"],
-        metadata_columns=["m0"],
-        csv_delimiter="|",
-    )
-    assert len(dataset) == 2
-    assert len(dataset[0]["input_data"]) == 3
-    assert dataset[0]["input_data"]["in0"] == "r0v1"
-    assert dataset[0]["input_data"]["in1"] == "r0v2"
-    assert dataset[0]["input_data"]["in2"] == "r0v3"
-    assert dataset[1]["input_data"]["in0"] == "r1v1"
-    assert dataset[1]["input_data"]["in1"] == "r1v2"
-    assert dataset[1]["input_data"]["in2"] == "r1v3"
-
-    assert len(dataset[0]["expected_output"]) == 2
-    assert dataset[0]["expected_output"]["out0"] == "r0v4"
-    assert dataset[0]["expected_output"]["out1"] == "r0v5"
-    assert dataset[1]["expected_output"]["out0"] == "r1v4"
-    assert dataset[1]["expected_output"]["out1"] == "r1v5"
-
-    assert len(dataset[0]["metadata"]) == 1
-    assert dataset[0]["metadata"]["m0"] == "r0v6"
-    assert dataset[1]["metadata"]["m0"] == "r1v6"
-
-    assert dataset.description == "A good pipe separated csv dataset"
-
-    assert dataset._id is not None
-
-    wait_for_backend()
-    ds = llmobs.pull_dataset(name=dataset.name)
-
-    assert len(ds) == len(dataset)
-    assert ds.name == dataset.name
-    assert ds.description == dataset.description
-    assert ds._version == 1
-
-    llmobs._delete_dataset(dataset_id=dataset._id)
+    dataset_id = None
+    try:
+        dataset = llmobs.create_dataset_from_csv(
+            csv_path=csv_path,
+            dataset_name="test-dataset-good-csv-pipe",
+            description="A good pipe separated csv dataset",
+            input_data_columns=["in0", "in1", "in2"],
+            expected_output_columns=["out0", "out1"],
+            metadata_columns=["m0"],
+            csv_delimiter="|",
+        )
+        dataset_id = dataset._id
+        assert len(dataset) == 2
+        assert len(dataset[0]["input_data"]) == 3
+        assert dataset[0]["input_data"]["in0"] == "r0v1"
+        assert dataset[0]["input_data"]["in1"] == "r0v2"
+        assert dataset[0]["input_data"]["in2"] == "r0v3"
+        assert dataset[1]["input_data"]["in0"] == "r1v1"
+        assert dataset[1]["input_data"]["in1"] == "r1v2"
+        assert dataset[1]["input_data"]["in2"] == "r1v3"
+
+        assert len(dataset[0]["expected_output"]) == 2
+        assert dataset[0]["expected_output"]["out0"] == "r0v4"
+        assert dataset[0]["expected_output"]["out1"] == "r0v5"
+        assert dataset[1]["expected_output"]["out0"] == "r1v4"
+        assert dataset[1]["expected_output"]["out1"] == "r1v5"
+
+        assert len(dataset[0]["metadata"]) == 1
+        assert dataset[0]["metadata"]["m0"] == "r0v6"
+        assert dataset[1]["metadata"]["m0"] == "r1v6"
+
+        assert dataset.description == "A good pipe separated csv dataset"
+
+        assert dataset._id is not None
+
+        wait_for_backend()
+        ds = llmobs.pull_dataset(name=dataset.name)
+
+        assert len(ds) == len(dataset)
+        assert ds.name == dataset.name
+        assert ds.description == dataset.description
+        assert ds._version == 1
+    finally:
+        if dataset_id:
+            llmobs._delete_dataset(dataset_id=dataset._id)
 
 
 def test_dataset_pull_non_existent(llmobs):

From 0d94cbbd803f23ba6f67b06ad608cb1a20a2a944 Mon Sep 17 00:00:00 2001
From: Kyle Verhoog <kyle@verhoog.ca>
Date: Wed, 23 Jul 2025 11:12:55 -0400
Subject: [PATCH 16/20] riotfiles

---
 .../requirements/{1fe8dd2.txt => 1900591.txt} | 80 +++++++++---------
 .../requirements/{1687eab.txt => 46e9996.txt} | 80 +++++++++---------
 .../requirements/{771848b.txt => 5908834.txt} | 80 +++++++++---------
 .../requirements/{146f2d8.txt => 97f1328.txt} | 38 +++++----
 .../requirements/{12c5529.txt => f37741b.txt} | 82 ++++++++++---------
 .../requirements/{4102ef5.txt => ffd66c1.txt} | 80 +++++++++---------
 6 files changed, 231 insertions(+), 209 deletions(-)
 rename .riot/requirements/{1fe8dd2.txt => 1900591.txt} (54%)
 rename .riot/requirements/{1687eab.txt => 46e9996.txt} (50%)
 rename .riot/requirements/{771848b.txt => 5908834.txt} (52%)
 rename .riot/requirements/{146f2d8.txt => 97f1328.txt} (74%)
 rename .riot/requirements/{12c5529.txt => f37741b.txt} (54%)
 rename .riot/requirements/{4102ef5.txt => ffd66c1.txt} (54%)

diff --git a/.riot/requirements/1fe8dd2.txt b/.riot/requirements/1900591.txt
similarity index 54%
rename from .riot/requirements/1fe8dd2.txt
rename to .riot/requirements/1900591.txt
index 72ea102ea03..ed309c651a9 100644
--- a/.riot/requirements/1fe8dd2.txt
+++ b/.riot/requirements/1900591.txt
@@ -2,33 +2,35 @@
 # This file is autogenerated by pip-compile with Python 3.13
 # by the following command:
 #
-#    pip-compile --allow-unsafe --no-annotate .riot/requirements/1fe8dd2.in
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/1900591.in
 #
-aiohappyeyeballs==2.4.8
-aiohttp==3.11.13
-aiosignal==1.3.2
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.14
+aiosignal==1.4.0
 annotated-types==0.7.0
-anyio==4.8.0
+anyio==4.9.0
 appdirs==1.4.4
-attrs==25.1.0
-certifi==2025.1.31
-charset-normalizer==3.4.1
-coverage[toml]==7.6.12
+attrs==25.3.0
+certifi==2025.7.14
+charset-normalizer==3.4.2
+coverage[toml]==7.9.2
 dataclasses-json==0.6.7
-datasets==3.3.2
+datasets==4.0.0
 dill==0.3.8
 distro==1.9.0
-filelock==3.17.0
-frozenlist==1.5.0
-fsspec[http]==2024.12.0
-h11==0.14.0
-httpcore==1.0.7
+filelock==3.18.0
+frozenlist==1.7.0
+fsspec[http]==2025.3.0
+greenlet==3.2.3
+h11==0.16.0
+hf-xet==1.1.5
+httpcore==1.0.9
 httpx==0.28.1
-huggingface-hub==0.29.2
+huggingface-hub==0.33.4
 hypothesis==6.45.0
 idna==3.10
-iniconfig==2.0.0
-jiter==0.8.2
+iniconfig==2.1.0
+jiter==0.10.0
 jsonpatch==1.33
 jsonpointer==3.0.0
 langchain==0.2.17
@@ -39,45 +41,47 @@ langchain-text-splitters==0.2.4
 langsmith==0.1.147
 marshmallow==3.26.1
 mock==5.2.0
-multidict==6.1.0
+multidict==6.6.3
 multiprocess==0.70.16
-mypy-extensions==1.0.0
+mypy-extensions==1.1.0
 nest-asyncio==1.6.0
 numpy==1.26.4
-openai==1.65.3
+openai==1.97.1
 opentracing==2.4.0
-orjson==3.10.15
+orjson==3.11.0
 packaging==24.2
-pandas==2.2.3
-pluggy==1.5.0
-propcache==0.3.0
-pyarrow==19.0.1
-pydantic==2.10.6
-pydantic-core==2.27.2
+pandas==2.3.1
+pluggy==1.6.0
+propcache==0.3.2
+pyarrow==21.0.0
+pydantic==2.11.7
+pydantic-core==2.33.2
+pygments==2.19.2
 pysbd==0.3.4
-pytest==8.3.5
+pytest==8.4.1
 pytest-asyncio==0.21.1
-pytest-cov==6.0.0
-pytest-mock==3.14.0
+pytest-cov==6.2.1
+pytest-mock==3.14.1
 python-dateutil==2.9.0.post0
-pytz==2025.1
+pytz==2025.2
 pyyaml==6.0.2
 ragas==0.1.21
 regex==2024.11.6
-requests==2.32.3
+requests==2.32.4
 requests-toolbelt==1.0.0
 six==1.17.0
 sniffio==1.3.1
 sortedcontainers==2.4.0
-sqlalchemy==2.0.38
+sqlalchemy==2.0.41
 tenacity==8.5.0
 tiktoken==0.9.0
 tqdm==4.67.1
-typing-extensions==4.12.2
+typing-extensions==4.14.1
 typing-inspect==0.9.0
-tzdata==2025.1
-urllib3==2.3.0
+typing-inspection==0.4.1
+tzdata==2025.2
+urllib3==2.5.0
 vcrpy==7.0.0
 wrapt==1.17.2
 xxhash==3.5.0
-yarl==1.18.3
+yarl==1.20.1
diff --git a/.riot/requirements/1687eab.txt b/.riot/requirements/46e9996.txt
similarity index 50%
rename from .riot/requirements/1687eab.txt
rename to .riot/requirements/46e9996.txt
index 009a5ede488..6c8fc1bdc4c 100644
--- a/.riot/requirements/1687eab.txt
+++ b/.riot/requirements/46e9996.txt
@@ -2,33 +2,35 @@
 # This file is autogenerated by pip-compile with Python 3.11
 # by the following command:
 #
-#    pip-compile --allow-unsafe --no-annotate --resolver=backtracking .riot/requirements/1687eab.in
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/46e9996.in
 #
-aiohappyeyeballs==2.4.8
-aiohttp==3.11.13
-aiosignal==1.3.2
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.14
+aiosignal==1.4.0
 annotated-types==0.7.0
-anyio==4.8.0
+anyio==4.9.0
 appdirs==1.4.4
-attrs==25.1.0
-certifi==2025.1.31
-charset-normalizer==3.4.1
-coverage[toml]==7.6.12
+attrs==25.3.0
+certifi==2025.7.14
+charset-normalizer==3.4.2
+coverage[toml]==7.9.2
 dataclasses-json==0.6.7
-datasets==3.3.2
+datasets==4.0.0
 dill==0.3.8
 distro==1.9.0
-filelock==3.17.0
-frozenlist==1.5.0
-fsspec[http]==2024.12.0
-h11==0.14.0
-httpcore==1.0.7
+filelock==3.18.0
+frozenlist==1.7.0
+fsspec[http]==2025.3.0
+greenlet==3.2.3
+h11==0.16.0
+hf-xet==1.1.5
+httpcore==1.0.9
 httpx==0.28.1
-huggingface-hub==0.29.2
+huggingface-hub==0.33.4
 hypothesis==6.45.0
 idna==3.10
-iniconfig==2.0.0
-jiter==0.8.2
+iniconfig==2.1.0
+jiter==0.10.0
 jsonpatch==1.33
 jsonpointer==3.0.0
 langchain==0.2.17
@@ -39,45 +41,47 @@ langchain-text-splitters==0.2.4
 langsmith==0.1.147
 marshmallow==3.26.1
 mock==5.2.0
-multidict==6.1.0
+multidict==6.6.3
 multiprocess==0.70.16
-mypy-extensions==1.0.0
+mypy-extensions==1.1.0
 nest-asyncio==1.6.0
 numpy==1.26.4
-openai==1.65.3
+openai==1.97.1
 opentracing==2.4.0
-orjson==3.10.15
+orjson==3.11.0
 packaging==24.2
-pandas==2.2.3
-pluggy==1.5.0
-propcache==0.3.0
-pyarrow==19.0.1
-pydantic==2.10.6
-pydantic-core==2.27.2
+pandas==2.3.1
+pluggy==1.6.0
+propcache==0.3.2
+pyarrow==21.0.0
+pydantic==2.11.7
+pydantic-core==2.33.2
+pygments==2.19.2
 pysbd==0.3.4
-pytest==8.3.5
+pytest==8.4.1
 pytest-asyncio==0.21.1
-pytest-cov==6.0.0
-pytest-mock==3.14.0
+pytest-cov==6.2.1
+pytest-mock==3.14.1
 python-dateutil==2.9.0.post0
-pytz==2025.1
+pytz==2025.2
 pyyaml==6.0.2
 ragas==0.1.21
 regex==2024.11.6
-requests==2.32.3
+requests==2.32.4
 requests-toolbelt==1.0.0
 six==1.17.0
 sniffio==1.3.1
 sortedcontainers==2.4.0
-sqlalchemy==2.0.38
+sqlalchemy==2.0.41
 tenacity==8.5.0
 tiktoken==0.9.0
 tqdm==4.67.1
-typing-extensions==4.12.2
+typing-extensions==4.14.1
 typing-inspect==0.9.0
-tzdata==2025.1
-urllib3==2.3.0
+typing-inspection==0.4.1
+tzdata==2025.2
+urllib3==2.5.0
 vcrpy==7.0.0
 wrapt==1.17.2
 xxhash==3.5.0
-yarl==1.18.3
+yarl==1.20.1
diff --git a/.riot/requirements/771848b.txt b/.riot/requirements/5908834.txt
similarity index 52%
rename from .riot/requirements/771848b.txt
rename to .riot/requirements/5908834.txt
index cd804c107c2..443cdf7b385 100644
--- a/.riot/requirements/771848b.txt
+++ b/.riot/requirements/5908834.txt
@@ -2,35 +2,37 @@
 # This file is autogenerated by pip-compile with Python 3.9
 # by the following command:
 #
-#    pip-compile --allow-unsafe --no-annotate --resolver=backtracking .riot/requirements/771848b.in
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/5908834.in
 #
-aiohappyeyeballs==2.4.8
-aiohttp==3.11.13
-aiosignal==1.3.2
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.14
+aiosignal==1.4.0
 annotated-types==0.7.0
-anyio==4.8.0
+anyio==4.9.0
 appdirs==1.4.4
 async-timeout==4.0.3
-attrs==25.1.0
-certifi==2025.1.31
-charset-normalizer==3.4.1
-coverage[toml]==7.6.12
+attrs==25.3.0
+certifi==2025.7.14
+charset-normalizer==3.4.2
+coverage[toml]==7.9.2
 dataclasses-json==0.6.7
-datasets==3.3.2
+datasets==4.0.0
 dill==0.3.8
 distro==1.9.0
-exceptiongroup==1.2.2
-filelock==3.17.0
-frozenlist==1.5.0
-fsspec[http]==2024.12.0
-h11==0.14.0
-httpcore==1.0.7
+exceptiongroup==1.3.0
+filelock==3.18.0
+frozenlist==1.7.0
+fsspec[http]==2025.3.0
+greenlet==3.2.3
+h11==0.16.0
+hf-xet==1.1.5
+httpcore==1.0.9
 httpx==0.28.1
-huggingface-hub==0.29.2
+huggingface-hub==0.33.4
 hypothesis==6.45.0
 idna==3.10
-iniconfig==2.0.0
-jiter==0.8.2
+iniconfig==2.1.0
+jiter==0.10.0
 jsonpatch==1.33
 jsonpointer==3.0.0
 langchain==0.2.17
@@ -41,46 +43,48 @@ langchain-text-splitters==0.2.4
 langsmith==0.1.147
 marshmallow==3.26.1
 mock==5.2.0
-multidict==6.1.0
+multidict==6.6.3
 multiprocess==0.70.16
-mypy-extensions==1.0.0
+mypy-extensions==1.1.0
 nest-asyncio==1.6.0
 numpy==1.26.4
-openai==1.65.3
+openai==1.97.1
 opentracing==2.4.0
-orjson==3.10.15
+orjson==3.11.0
 packaging==24.2
-pandas==2.2.3
-pluggy==1.5.0
-propcache==0.3.0
-pyarrow==19.0.1
-pydantic==2.10.6
-pydantic-core==2.27.2
+pandas==2.3.1
+pluggy==1.6.0
+propcache==0.3.2
+pyarrow==21.0.0
+pydantic==2.11.7
+pydantic-core==2.33.2
+pygments==2.19.2
 pysbd==0.3.4
-pytest==8.3.5
+pytest==8.4.1
 pytest-asyncio==0.21.1
-pytest-cov==6.0.0
-pytest-mock==3.14.0
+pytest-cov==6.2.1
+pytest-mock==3.14.1
 python-dateutil==2.9.0.post0
-pytz==2025.1
+pytz==2025.2
 pyyaml==6.0.2
 ragas==0.1.21
 regex==2024.11.6
-requests==2.32.3
+requests==2.32.4
 requests-toolbelt==1.0.0
 six==1.17.0
 sniffio==1.3.1
 sortedcontainers==2.4.0
-sqlalchemy==2.0.38
+sqlalchemy==2.0.41
 tenacity==8.5.0
 tiktoken==0.9.0
 tomli==2.2.1
 tqdm==4.67.1
-typing-extensions==4.12.2
+typing-extensions==4.14.1
 typing-inspect==0.9.0
-tzdata==2025.1
+typing-inspection==0.4.1
+tzdata==2025.2
 urllib3==1.26.20
 vcrpy==7.0.0
 wrapt==1.17.2
 xxhash==3.5.0
-yarl==1.18.3
+yarl==1.20.1
diff --git a/.riot/requirements/146f2d8.txt b/.riot/requirements/97f1328.txt
similarity index 74%
rename from .riot/requirements/146f2d8.txt
rename to .riot/requirements/97f1328.txt
index e552a5620a3..68875c403e1 100644
--- a/.riot/requirements/146f2d8.txt
+++ b/.riot/requirements/97f1328.txt
@@ -2,7 +2,7 @@
 # This file is autogenerated by pip-compile with Python 3.8
 # by the following command:
 #
-#    pip-compile --allow-unsafe --no-annotate --resolver=backtracking .riot/requirements/146f2d8.in
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/97f1328.in
 #
 aiohappyeyeballs==2.4.4
 aiohttp==3.10.11
@@ -11,26 +11,28 @@ annotated-types==0.7.0
 anyio==4.5.2
 appdirs==1.4.4
 async-timeout==4.0.3
-attrs==25.1.0
-certifi==2025.1.31
-charset-normalizer==3.4.1
+attrs==25.3.0
+certifi==2025.7.14
+charset-normalizer==3.4.2
 coverage[toml]==7.6.1
 dataclasses-json==0.6.7
 datasets==3.1.0
 dill==0.3.8
 distro==1.9.0
-exceptiongroup==1.2.2
+exceptiongroup==1.3.0
 filelock==3.16.1
 frozenlist==1.5.0
 fsspec[http]==2024.9.0
-h11==0.14.0
-httpcore==1.0.7
+greenlet==3.1.1
+h11==0.16.0
+hf-xet==1.1.5
+httpcore==1.0.9
 httpx==0.28.1
-huggingface-hub==0.29.2
+huggingface-hub==0.33.4
 hypothesis==6.45.0
 idna==3.10
-iniconfig==2.0.0
-jiter==0.8.2
+iniconfig==2.1.0
+jiter==0.9.1
 jsonpatch==1.33
 jsonpointer==3.0.0
 langchain==0.2.17
@@ -43,10 +45,10 @@ marshmallow==3.22.0
 mock==5.2.0
 multidict==6.1.0
 multiprocess==0.70.16
-mypy-extensions==1.0.0
+mypy-extensions==1.1.0
 nest-asyncio==1.6.0
 numpy==1.24.4
-openai==1.65.3
+openai==1.97.1
 opentracing==2.4.0
 orjson==3.10.15
 packaging==24.2
@@ -60,25 +62,25 @@ pysbd==0.3.4
 pytest==8.3.5
 pytest-asyncio==0.21.1
 pytest-cov==5.0.0
-pytest-mock==3.14.0
+pytest-mock==3.14.1
 python-dateutil==2.9.0.post0
-pytz==2025.1
+pytz==2025.2
 pyyaml==6.0.2
 ragas==0.1.21
 regex==2024.11.6
-requests==2.32.3
+requests==2.32.4
 requests-toolbelt==1.0.0
 six==1.17.0
 sniffio==1.3.1
 sortedcontainers==2.4.0
-sqlalchemy==2.0.38
+sqlalchemy==2.0.41
 tenacity==8.5.0
 tiktoken==0.7.0
 tomli==2.2.1
 tqdm==4.67.1
-typing-extensions==4.12.2
+typing-extensions==4.13.2
 typing-inspect==0.9.0
-tzdata==2025.1
+tzdata==2025.2
 urllib3==1.26.20
 vcrpy==6.0.2
 wrapt==1.17.2
diff --git a/.riot/requirements/12c5529.txt b/.riot/requirements/f37741b.txt
similarity index 54%
rename from .riot/requirements/12c5529.txt
rename to .riot/requirements/f37741b.txt
index 40afeea9f40..f258baee359 100644
--- a/.riot/requirements/12c5529.txt
+++ b/.riot/requirements/f37741b.txt
@@ -2,35 +2,37 @@
 # This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
-#    pip-compile --allow-unsafe --no-annotate .riot/requirements/12c5529.in
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/f37741b.in
 #
-aiohappyeyeballs==2.4.8
-aiohttp==3.11.13
-aiosignal==1.3.2
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.14
+aiosignal==1.4.0
 annotated-types==0.7.0
-anyio==4.8.0
+anyio==4.9.0
 appdirs==1.4.4
 async-timeout==4.0.3
-attrs==25.1.0
-certifi==2025.1.31
-charset-normalizer==3.4.1
-coverage[toml]==7.6.12
+attrs==25.3.0
+certifi==2025.7.14
+charset-normalizer==3.4.2
+coverage[toml]==7.9.2
 dataclasses-json==0.6.7
-datasets==3.3.2
+datasets==4.0.0
 dill==0.3.8
 distro==1.9.0
-exceptiongroup==1.2.2
-filelock==3.17.0
-frozenlist==1.5.0
-fsspec[http]==2024.12.0
-h11==0.14.0
-httpcore==1.0.7
+exceptiongroup==1.3.0
+filelock==3.18.0
+frozenlist==1.7.0
+fsspec[http]==2025.3.0
+greenlet==3.2.3
+h11==0.16.0
+hf-xet==1.1.5
+httpcore==1.0.9
 httpx==0.28.1
-huggingface-hub==0.29.2
+huggingface-hub==0.33.4
 hypothesis==6.45.0
 idna==3.10
-iniconfig==2.0.0
-jiter==0.8.2
+iniconfig==2.1.0
+jiter==0.10.0
 jsonpatch==1.33
 jsonpointer==3.0.0
 langchain==0.2.17
@@ -41,46 +43,48 @@ langchain-text-splitters==0.2.4
 langsmith==0.1.147
 marshmallow==3.26.1
 mock==5.2.0
-multidict==6.1.0
+multidict==6.6.3
 multiprocess==0.70.16
-mypy-extensions==1.0.0
+mypy-extensions==1.1.0
 nest-asyncio==1.6.0
 numpy==1.26.4
-openai==1.65.3
+openai==1.97.1
 opentracing==2.4.0
-orjson==3.10.15
+orjson==3.11.0
 packaging==24.2
-pandas==2.2.3
-pluggy==1.5.0
-propcache==0.3.0
-pyarrow==19.0.1
-pydantic==2.10.6
-pydantic-core==2.27.2
+pandas==2.3.1
+pluggy==1.6.0
+propcache==0.3.2
+pyarrow==21.0.0
+pydantic==2.11.7
+pydantic-core==2.33.2
+pygments==2.19.2
 pysbd==0.3.4
-pytest==8.3.5
+pytest==8.4.1
 pytest-asyncio==0.21.1
-pytest-cov==6.0.0
-pytest-mock==3.14.0
+pytest-cov==6.2.1
+pytest-mock==3.14.1
 python-dateutil==2.9.0.post0
-pytz==2025.1
+pytz==2025.2
 pyyaml==6.0.2
 ragas==0.1.21
 regex==2024.11.6
-requests==2.32.3
+requests==2.32.4
 requests-toolbelt==1.0.0
 six==1.17.0
 sniffio==1.3.1
 sortedcontainers==2.4.0
-sqlalchemy==2.0.38
+sqlalchemy==2.0.41
 tenacity==8.5.0
 tiktoken==0.9.0
 tomli==2.2.1
 tqdm==4.67.1
-typing-extensions==4.12.2
+typing-extensions==4.14.1
 typing-inspect==0.9.0
-tzdata==2025.1
-urllib3==2.3.0
+typing-inspection==0.4.1
+tzdata==2025.2
+urllib3==2.5.0
 vcrpy==7.0.0
 wrapt==1.17.2
 xxhash==3.5.0
-yarl==1.18.3
+yarl==1.20.1
diff --git a/.riot/requirements/4102ef5.txt b/.riot/requirements/ffd66c1.txt
similarity index 54%
rename from .riot/requirements/4102ef5.txt
rename to .riot/requirements/ffd66c1.txt
index d0518848787..89c4f02b050 100644
--- a/.riot/requirements/4102ef5.txt
+++ b/.riot/requirements/ffd66c1.txt
@@ -2,33 +2,35 @@
 # This file is autogenerated by pip-compile with Python 3.12
 # by the following command:
 #
-#    pip-compile --allow-unsafe --no-annotate .riot/requirements/4102ef5.in
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/ffd66c1.in
 #
-aiohappyeyeballs==2.4.8
-aiohttp==3.11.13
-aiosignal==1.3.2
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.14
+aiosignal==1.4.0
 annotated-types==0.7.0
-anyio==4.8.0
+anyio==4.9.0
 appdirs==1.4.4
-attrs==25.1.0
-certifi==2025.1.31
-charset-normalizer==3.4.1
-coverage[toml]==7.6.12
+attrs==25.3.0
+certifi==2025.7.14
+charset-normalizer==3.4.2
+coverage[toml]==7.9.2
 dataclasses-json==0.6.7
-datasets==3.3.2
+datasets==4.0.0
 dill==0.3.8
 distro==1.9.0
-filelock==3.17.0
-frozenlist==1.5.0
-fsspec[http]==2024.12.0
-h11==0.14.0
-httpcore==1.0.7
+filelock==3.18.0
+frozenlist==1.7.0
+fsspec[http]==2025.3.0
+greenlet==3.2.3
+h11==0.16.0
+hf-xet==1.1.5
+httpcore==1.0.9
 httpx==0.28.1
-huggingface-hub==0.29.2
+huggingface-hub==0.33.4
 hypothesis==6.45.0
 idna==3.10
-iniconfig==2.0.0
-jiter==0.8.2
+iniconfig==2.1.0
+jiter==0.10.0
 jsonpatch==1.33
 jsonpointer==3.0.0
 langchain==0.2.17
@@ -39,45 +41,47 @@ langchain-text-splitters==0.2.4
 langsmith==0.1.147
 marshmallow==3.26.1
 mock==5.2.0
-multidict==6.1.0
+multidict==6.6.3
 multiprocess==0.70.16
-mypy-extensions==1.0.0
+mypy-extensions==1.1.0
 nest-asyncio==1.6.0
 numpy==1.26.4
-openai==1.65.3
+openai==1.97.1
 opentracing==2.4.0
-orjson==3.10.15
+orjson==3.11.0
 packaging==24.2
-pandas==2.2.3
-pluggy==1.5.0
-propcache==0.3.0
-pyarrow==19.0.1
-pydantic==2.10.6
-pydantic-core==2.27.2
+pandas==2.3.1
+pluggy==1.6.0
+propcache==0.3.2
+pyarrow==21.0.0
+pydantic==2.11.7
+pydantic-core==2.33.2
+pygments==2.19.2
 pysbd==0.3.4
-pytest==8.3.5
+pytest==8.4.1
 pytest-asyncio==0.21.1
-pytest-cov==6.0.0
-pytest-mock==3.14.0
+pytest-cov==6.2.1
+pytest-mock==3.14.1
 python-dateutil==2.9.0.post0
-pytz==2025.1
+pytz==2025.2
 pyyaml==6.0.2
 ragas==0.1.21
 regex==2024.11.6
-requests==2.32.3
+requests==2.32.4
 requests-toolbelt==1.0.0
 six==1.17.0
 sniffio==1.3.1
 sortedcontainers==2.4.0
-sqlalchemy==2.0.38
+sqlalchemy==2.0.41
 tenacity==8.5.0
 tiktoken==0.9.0
 tqdm==4.67.1
-typing-extensions==4.12.2
+typing-extensions==4.14.1
 typing-inspect==0.9.0
-tzdata==2025.1
-urllib3==2.3.0
+typing-inspection==0.4.1
+tzdata==2025.2
+urllib3==2.5.0
 vcrpy==7.0.0
 wrapt==1.17.2
 xxhash==3.5.0
-yarl==1.18.3
+yarl==1.20.1

From f53d1496c3139d7633efe382ab989d1e581d0624 Mon Sep 17 00:00:00 2001
From: gary-huang <garyhuang@hotmail.ca>
Date: Wed, 23 Jul 2025 11:38:16 -0400
Subject: [PATCH 17/20] Fix ruff import ordering in _llmobs.py

---
 ddtrace/llmobs/_llmobs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
index 4043fabbd40..f0f4e246166 100644
--- a/ddtrace/llmobs/_llmobs.py
+++ b/ddtrace/llmobs/_llmobs.py
@@ -80,8 +80,8 @@
 from ddtrace.llmobs._context import LLMObsContextProvider
 from ddtrace.llmobs._evaluators.runner import EvaluatorRunner
 from ddtrace.llmobs._experiment import Dataset
-from ddtrace.llmobs._experiment import DatasetRecordInputType
 from ddtrace.llmobs._experiment import DatasetRecord
+from ddtrace.llmobs._experiment import DatasetRecordInputType
 from ddtrace.llmobs._experiment import Experiment
 from ddtrace.llmobs._experiment import ExperimentConfigType
 from ddtrace.llmobs._experiment import JSONType

From e9705d9895f9fa2a61c2fdd2e2a17c3ddb62e318 Mon Sep 17 00:00:00 2001
From: gary-huang <garyhuang@hotmail.ca>
Date: Wed, 23 Jul 2025 11:47:02 -0400
Subject: [PATCH 18/20] Fix type check errors in as_dataframe

---
 ddtrace/llmobs/_experiment.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py
index 3eda6192a61..3f61e617a4b 100644
--- a/ddtrace/llmobs/_experiment.py
+++ b/ddtrace/llmobs/_experiment.py
@@ -182,29 +182,29 @@ def as_dataframe(self) -> None:
         column_tuples = set()
         data_rows = []
         for record in self._records:
-            flat_record = {}
+            flat_record = {}  # type: Dict[Union[str, Tuple[str, str]], Any]
 
             input_data = record.get("input_data", {})
             if isinstance(input_data, dict):
-                for k, v in input_data.items():
-                    flat_record[("input_data", k)] = v
-                    column_tuples.add(("input_data", k))
+                for input_data_col, input_data_val in input_data.items():
+                    flat_record[("input_data", input_data_col)] = input_data_val
+                    column_tuples.add(("input_data", input_data_col))
             else:
-                flat_record[("input_data", "")] = input_data  # Use empty string for single input
+                flat_record[("input_data", "")] = input_data
                 column_tuples.add(("input_data", ""))
 
             expected_output = record.get("expected_output", {})
             if isinstance(expected_output, dict):
-                for k, v in expected_output.items():
-                    flat_record[("expected_output", k)] = v
-                    column_tuples.add(("expected_output", k))
+                for expected_output_col, expected_output_val in expected_output.items():
+                    flat_record[("expected_output", expected_output_col)] = expected_output_val
+                    column_tuples.add(("expected_output", expected_output_col))
             else:
-                flat_record[("expected_output", "")] = expected_output  # Use empty string for single output
+                flat_record[("expected_output", "")] = expected_output
                 column_tuples.add(("expected_output", ""))
 
-            for k, v in record.get("metadata", {}).items():
-                flat_record[("metadata", k)] = v
-                column_tuples.add(("metadata", k))
+            for metadata_col, metadata_val in record.get("metadata", {}).items():
+                flat_record[("metadata", metadata_col)] = metadata_val
+                column_tuples.add(("metadata", metadata_col))
 
             data_rows.append(flat_record)
 

From 6f625e3975711947547eca99260cf514eaa4893e Mon Sep 17 00:00:00 2001
From: gary-huang <garyhuang@hotmail.ca>
Date: Wed, 23 Jul 2025 12:28:14 -0400
Subject: [PATCH 19/20] Add missing datasets batch_update cassette

---
 ...9cb46e5ed3_batch_update_post_34c161e3.yaml | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_34c161e3.yaml

diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_34c161e3.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_34c161e3.yaml
new file mode 100644
index 00000000000..3e4ea62376f
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_34c161e3.yaml
@@ -0,0 +1,49 @@
+interactions:
+- request:
+    body: '{"data": {"type": "datasets", "id": "8dbba503-cf48-4e82-805e-1d9cb46e5ed3",
+      "attributes": {"insert_records": [{"input": {"in0": "r0v1", "in1": "r0v2", "in2":
+      "r0v3"}, "expected_output": {"out0": "r0v4", "out1": "r0v5"}, "metadata": {"m0":
+      "r0v6"}}, {"input": {"in0": "r1v1", "in1": "r1v2", "in2": "r1v3"}, "expected_output":
+      {"out0": "r1v4", "out1": "r1v5"}, "metadata": {"m0": "r1v6"}}], "update_records":
+      [], "delete_records": []}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '434'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/8dbba503-cf48-4e82-805e-1d9cb46e5ed3/batch_update
+  response:
+    body:
+      string: '{"data":[]}'
+    headers:
+      content-length:
+      - '11'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 16:27:16 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1

From 10fea87e8034bd4152a10a3575cd3228ccf67e71 Mon Sep 17 00:00:00 2001
From: Yun Kim <yun.kim@datadoghq.com>
Date: Wed, 23 Jul 2025 14:57:49 -0400
Subject: [PATCH 20/20] Revert earlier change removing config from tasks

---
 ddtrace/llmobs/_experiment.py    |  4 ++--
 ddtrace/llmobs/_llmobs.py        |  8 ++++----
 tests/llmobs/test_experiments.py | 12 +++++++++---
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py
index 078b01145ae..996af3cd00d 100644
--- a/ddtrace/llmobs/_experiment.py
+++ b/ddtrace/llmobs/_experiment.py
@@ -226,7 +226,7 @@ class Experiment:
     def __init__(
         self,
         name: str,
-        task: Callable[[DatasetRecordInputType], JSONType],
+        task: Callable[[DatasetRecordInputType, Optional[ExperimentConfigType]], JSONType],
         dataset: Dataset,
         evaluators: List[Callable[[DatasetRecordInputType, JSONType, JSONType], JSONType]],
         project_name: str,
@@ -318,7 +318,7 @@ def _process_record(self, idx_record: Tuple[int, DatasetRecord]) -> Optional[Tas
             }
             output_data = None
             try:
-                output_data = self._task(input_data)
+                output_data = self._task(input_data, self._config)
             except Exception:
                 span.set_exc_info(*sys.exc_info())
             self._llmobs_instance.annotate(span, input_data=input_data, output_data=output_data, tags=tags)
diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
index 1e4cfa3a1f2..74fdd935944 100644
--- a/ddtrace/llmobs/_llmobs.py
+++ b/ddtrace/llmobs/_llmobs.py
@@ -672,7 +672,7 @@ def _delete_dataset(cls, dataset_id: str) -> None:
     def experiment(
         cls,
         name: str,
-        task: Callable[[DatasetRecordInputType], JSONType],
+        task: Callable[[DatasetRecordInputType, Optional[ExperimentConfigType]], JSONType],
         dataset: Dataset,
         evaluators: List[Callable[[DatasetRecordInputType, JSONType, JSONType], JSONType]],
         description: str = "",
@@ -682,7 +682,7 @@ def experiment(
         """Initializes an Experiment to run a task on a Dataset and evaluators.
 
         :param name: The name of the experiment.
-        :param task: The task function to run. Must accept a parameter ``input_data`` and optionally ``config``.
+        :param task: The task function to run. Must accept parameters ``input_data`` and ``config``.
         :param dataset: The dataset to run the experiment on, created with LLMObs.pull/create_dataset().
         :param evaluators: A list of evaluator functions to evaluate the task output.
                            Must accept parameters ``input_data``, ``output_data``, and ``expected_output``.
@@ -694,8 +694,8 @@ def experiment(
             raise TypeError("task must be a callable function.")
         sig = inspect.signature(task)
         params = sig.parameters
-        if "input_data" not in params:
-            raise TypeError("Task function must accept 'input_data' parameters.")
+        if "input_data" not in params or "config" not in params:
+            raise TypeError("Task function must have 'input_data' and 'config' parameters.")
         if not isinstance(dataset, Dataset):
             raise TypeError("Dataset must be an LLMObs Dataset object.")
         if not evaluators or not all(callable(evaluator) for evaluator in evaluators):
diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py
index 163e50a27fe..57409feae5e 100644
--- a/tests/llmobs/test_experiments.py
+++ b/tests/llmobs/test_experiments.py
@@ -28,11 +28,11 @@ def wait_for_backend():
         time.sleep(2)
 
 
-def dummy_task(input_data):
+def dummy_task(input_data, config):
     return input_data
 
 
-def faulty_task(input_data):
+def faulty_task(input_data, config):
     raise ValueError("This is a test error")
 
 
@@ -389,12 +389,18 @@ def test_experiment_invalid_task_type_raises(llmobs, test_dataset_one_record):
 
 
 def test_experiment_invalid_task_signature_raises(llmobs, test_dataset_one_record):
-    with pytest.raises(TypeError, match="Task function must accept 'input_data' parameters."):
+    with pytest.raises(TypeError, match="Task function must have 'input_data' and 'config' parameters."):
 
         def my_task(not_input):
             pass
 
         llmobs.experiment("test_experiment", my_task, test_dataset_one_record, [dummy_evaluator])
+    with pytest.raises(TypeError, match="Task function must have 'input_data' and 'config' parameters."):
+
+        def my_task(input_data, not_config):
+            pass
+
+        llmobs.experiment("test_experiment", my_task, test_dataset_one_record, [dummy_evaluator])
 
 
 def test_experiment_invalid_dataset_raises(llmobs):