From 9e3ae0dfc04b0a6a6e2fa3edfdcae4e465fe4071 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Mon, 14 Jul 2025 13:10:26 -0400 Subject: [PATCH 01/20] dataframe and csv support also create_dataset does not push immediately, it is more natural to push manually --- ddtrace/llmobs/_experiment.py | 43 +++++++++++++ ddtrace/llmobs/_llmobs.py | 75 +++++++++++++++++++++- riotfile.py | 1 + setup.cfg | 13 ---- tests/llmobs/static_files/good_dataset.csv | 3 + tests/llmobs/test_experiments.py | 32 +++++++++ 6 files changed, 151 insertions(+), 16 deletions(-) create mode 100644 tests/llmobs/static_files/good_dataset.csv diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index e5a3959a67a..832e1d6e2ad 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -161,6 +161,49 @@ def __len__(self) -> int: def __iter__(self) -> Iterator[DatasetRecord]: return iter(self._records) + def as_dataframe(self): + try: + import pandas as pd + except ImportError as e: + raise ImportError( + "pandas is required to convert dataset to DataFrame. " "Please install it with `pip install pandas`" + ) from e + + column_tuples = set() + data_rows = [] + for record in self._records: + flat_record = {} + + input_data = record.get("input_data", {}) + if isinstance(input_data, dict): + for k, v in input_data.items(): + flat_record[("input_data", k)] = v + column_tuples.add(("input_data", k)) + else: + flat_record[("input_data", "")] = input_data # Use empty string for single input + column_tuples.add(("input_data", "")) + + expected_output = record.get("expected_output", {}) + if isinstance(expected_output, dict): + for k, v in expected_output.items(): + flat_record[("expected_output", k)] = v + column_tuples.add(("expected_output", k)) + else: + flat_record[("expected_output", "")] = expected_output # Use empty string for single output + column_tuples.add(("expected_output", "")) + + for k, v in record.get("metadata", {}): + flat_record[("metadata", k)] = v + column_tuples.add(("metadata", k)) + + data_rows.append(flat_record) + + records_list = [] + for flat_record in data_rows: + row = [flat_record.get(col, None) for col in column_tuples] + records_list.append(row) + + return pd.DataFrame(data=records_list, columns=pd.MultiIndex.from_tuples(column_tuples)) class Experiment: def __init__( diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index d09851fae4a..d299fc146c0 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -1,3 +1,4 @@ +import csv from dataclasses import dataclass from dataclasses import field import inspect @@ -41,7 +42,7 @@ from ddtrace.internal.utils.formats import asbool from ddtrace.internal.utils.formats import format_trace_id from ddtrace.internal.utils.formats import parse_tags_str -from ddtrace.llmobs import _constants as constants +from ddtrace.llmobs import _constants as constants, DatasetRecord from ddtrace.llmobs import _telemetry as telemetry from ddtrace.llmobs._constants import ANNOTATIONS_CONTEXT_ID from ddtrace.llmobs._constants import DECORATOR @@ -583,8 +584,76 @@ def create_dataset(cls, name: str, description: str, records: List[DatasetRecord ds = cls._instance._dne_client.dataset_create(name, description) for r in records: ds.append(r) - if len(records) > 0: - ds.push() + return ds + + @classmethod + def create_dataset_from_csv(cls, csv_path: str, dataset_name: str, input_data_columns: List[str], expected_output_columns: List[str], metadata_columns: List[str] = [], csv_delimiter: str = ",", description="") -> Dataset: + ds = cls._instance._dne_client.dataset_create(dataset_name, description) + + # Store the original field size limit to restore it later + original_field_size_limit = csv.field_size_limit() + + csv.field_size_limit(10 * 1024 * 1024) # 10mb + + records = [] + try: + # First check if the file exists and is not empty before parsing + with open(csv_path, mode="r") as csvfile: + content = csvfile.readline().strip() + if not content: + raise ValueError("CSV file appears to be empty or header is missing.") + + with open(csv_path, mode="r") as csvfile: + try: + rows = csv.DictReader(csvfile, delimiter=csv_delimiter) + + # Check header presence before trying to read rows + if rows.fieldnames is None: + # Treat files with no header at all as effectively empty + raise ValueError("CSV file appears to be empty or header is missing.") + + header_columns = rows.fieldnames + missing_input_columns = [col for col in input_data_columns if col not in header_columns] + missing_output_columns = [col for col in expected_output_columns if col not in header_columns] + + if missing_input_columns: + raise ValueError(f"Input columns not found in CSV header: {missing_input_columns}") + if missing_output_columns: + raise ValueError(f"Expected output columns not found in CSV header: {missing_output_columns}") + + # Determine metadata columns (all columns not used for input or expected output) + metadata_columns = [ + col for col in header_columns if col not in input_data_columns and col not in expected_output_columns + ] + + for row in rows: + try: + ds.append( + DatasetRecord( + input_data={col: row[col] for col in input_data_columns}, + expected_output={col: row[col] for col in expected_output_columns}, + metadata={col: row[col] for col in metadata_columns}, + )) + + except KeyError as ke: + # Missing columns in a data row indicates malformed CSV + raise KeyError(f"Error parsing CSV file: missing column {ke} in a row") + except Exception as e: + # Other errors during row processing also indicate CSV issues + raise ValueError(f"Error parsing CSV file (row processing): {e}") + + except csv.Error as e: + # Catch CSV-specific parsing errors + raise ValueError(f"Error parsing CSV file: {e}") + + except FileNotFoundError as e: + raise FileNotFoundError(f"CSV file not found: {csv_path}") from e + except PermissionError as e: + raise PermissionError(f"Permission denied when reading CSV file: {csvfile}") from e + finally: + # Always restore the original field size limit + csv.field_size_limit(original_field_size_limit) + return ds @classmethod diff --git a/riotfile.py b/riotfile.py index b13e4fe7db8..828ceaea242 100644 --- a/riotfile.py +++ b/riotfile.py @@ -3138,6 +3138,7 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT "pytest-asyncio": "==0.21.1", "ragas": "==0.1.21", "langchain": latest, + "pandas": latest, }, pys=select_pys(min_version="3.8"), ), diff --git a/setup.cfg b/setup.cfg index a62b967bb25..501d7d10115 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,19 +6,6 @@ skip = *.json,*.h,*.cpp,*.c,.riot,.tox,.mypy_cache,.git,*ddtrace/vendor,tests/co exclude-file = .codespellignorelines ignore-words-list = asend,dne,fo,medias,ment,nin,ot,setttings,statics,ba,spawnve,doas -# DEV: We use `conftest.py` as a local pytest plugin to configure hooks for collection -[tool:pytest] -# --cov-report is intentionally empty else pytest-cov will default to generating a report -addopts = - --cov=ddtrace/ - --cov=tests/ - --cov-append - --cov-report= - --durations=10 - --junitxml=test-results/junit.xml -# DEV: The default is `test_*\.py` which will miss `test.py` files -python_files = test*\.py -asyncio_mode = auto [flake8] max-line-length = 120 diff --git a/tests/llmobs/static_files/good_dataset.csv b/tests/llmobs/static_files/good_dataset.csv new file mode 100644 index 00000000000..be4f03d552c --- /dev/null +++ b/tests/llmobs/static_files/good_dataset.csv @@ -0,0 +1,3 @@ +in0,in1,in2,out0,out1,m0 +r0v1,r0v2,r0v3,r0v4,r0v5,r0v6 +r1v1,r1v2,r1v3,r1v4,r1v5,r1v6 diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index e1a3f925ff1..4370b9cd8c4 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -89,6 +89,38 @@ def test_dataset_create_delete(llmobs): llmobs._delete_dataset(dataset_id=dataset._id) +def test_dataset_as_dataframe(llmobs): + dataset = llmobs.create_dataset(name="test-dataset-3", description="A third test dataset") + dataset._records = [ + DatasetRecord(input_data=[{"role" : "system", "content": "i am machine"}, {"role" : "user", "content": "hello"}], expected_output="label") + ] + df = dataset.as_dataframe() + llmobs._delete_dataset(dataset_id=dataset._id) + +def test_dataset_csv(llmobs): + test_path = os.path.dirname(__file__) + csv_path = os.path.join(test_path, "static_files/good_dataset.csv") + dataset = llmobs.create_dataset_from_csv(csv_path=csv_path, dataset_name="test-dataset-good-csv", description="A good csv dataset", input_data_columns=["in0", "in1", "in2"], expected_output_columns=["out0", "out1"]) + assert len(dataset) == 2 + assert len(dataset[0]["input_data"]) == 3 + assert dataset[0]["input_data"]["in0"] == "r0v1" + assert dataset[0]["input_data"]["in1"] == "r0v2" + assert dataset[0]["input_data"]["in2"] == "r0v3" + assert dataset[1]["input_data"]["in0"] == "r1v1" + assert dataset[1]["input_data"]["in1"] == "r1v2" + assert dataset[1]["input_data"]["in2"] == "r1v3" + + assert len(dataset[0]["expected_output"]) == 2 + assert dataset[0]["expected_output"]["out0"] == "r0v4" + assert dataset[0]["expected_output"]["out1"] == "r0v5" + assert dataset[1]["expected_output"]["out0"] == "r1v4" + assert dataset[1]["expected_output"]["out1"] == "r1v5" + + assert len(dataset[0]["metadata"]) == 1 + assert dataset[0]["metadata"]["m0"] == "r0v6" + assert dataset[1]["metadata"]["m0"] == "r1v6" + + llmobs._delete_dataset(dataset_id=dataset._id) def test_dataset_pull_non_existent(llmobs): with pytest.raises(ValueError): From e9da8e2326ed8fa2ce969ae38684790b438462ae Mon Sep 17 00:00:00 2001 From: gary-huang Date: Thu, 17 Jul 2025 03:58:54 -0400 Subject: [PATCH 02/20] black and hatch --- ddtrace/llmobs/_experiment.py | 7 +++---- ddtrace/llmobs/_llmobs.py | 24 ++++++++++++++++++------ tests/llmobs/test_experiments.py | 16 ++++++++++++++-- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index 832e1d6e2ad..8d7c37869b3 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -145,12 +145,10 @@ def delete(self, index: int) -> None: del self._records[index] @overload - def __getitem__(self, index: int) -> DatasetRecord: - ... + def __getitem__(self, index: int) -> DatasetRecord: ... @overload - def __getitem__(self, index: slice) -> List[DatasetRecord]: - ... + def __getitem__(self, index: slice) -> List[DatasetRecord]: ... def __getitem__(self, index: Union[int, slice]) -> Union[DatasetRecord, List[DatasetRecord]]: return self._records.__getitem__(index) @@ -205,6 +203,7 @@ def as_dataframe(self): return pd.DataFrame(data=records_list, columns=pd.MultiIndex.from_tuples(column_tuples)) + class Experiment: def __init__( self, diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index d299fc146c0..00a86699dac 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -78,7 +78,7 @@ from ddtrace.llmobs._evaluators.runner import EvaluatorRunner from ddtrace.llmobs._experiment import Dataset from ddtrace.llmobs._experiment import DatasetRecordInputType -from ddtrace.llmobs._experiment import DatasetRecordRaw as DatasetRecord +from ddtrace.llmobs._experiment import DatasetRecord from ddtrace.llmobs._experiment import Experiment from ddtrace.llmobs._experiment import ExperimentConfigType from ddtrace.llmobs._experiment import JSONType @@ -587,15 +587,23 @@ def create_dataset(cls, name: str, description: str, records: List[DatasetRecord return ds @classmethod - def create_dataset_from_csv(cls, csv_path: str, dataset_name: str, input_data_columns: List[str], expected_output_columns: List[str], metadata_columns: List[str] = [], csv_delimiter: str = ",", description="") -> Dataset: + def create_dataset_from_csv( + cls, + csv_path: str, + dataset_name: str, + input_data_columns: List[str], + expected_output_columns: List[str], + metadata_columns: List[str] = [], + csv_delimiter: str = ",", + description="", + ) -> Dataset: ds = cls._instance._dne_client.dataset_create(dataset_name, description) # Store the original field size limit to restore it later original_field_size_limit = csv.field_size_limit() - csv.field_size_limit(10 * 1024 * 1024) # 10mb + csv.field_size_limit(10 * 1024 * 1024) # 10mb - records = [] try: # First check if the file exists and is not empty before parsing with open(csv_path, mode="r") as csvfile: @@ -623,7 +631,9 @@ def create_dataset_from_csv(cls, csv_path: str, dataset_name: str, input_data_co # Determine metadata columns (all columns not used for input or expected output) metadata_columns = [ - col for col in header_columns if col not in input_data_columns and col not in expected_output_columns + col + for col in header_columns + if col not in input_data_columns and col not in expected_output_columns ] for row in rows: @@ -633,7 +643,9 @@ def create_dataset_from_csv(cls, csv_path: str, dataset_name: str, input_data_co input_data={col: row[col] for col in input_data_columns}, expected_output={col: row[col] for col in expected_output_columns}, metadata={col: row[col] for col in metadata_columns}, - )) + record_id="", + ) + ) except KeyError as ke: # Missing columns in a data row indicates malformed CSV diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index 4370b9cd8c4..defad5959cd 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -89,18 +89,29 @@ def test_dataset_create_delete(llmobs): llmobs._delete_dataset(dataset_id=dataset._id) + def test_dataset_as_dataframe(llmobs): dataset = llmobs.create_dataset(name="test-dataset-3", description="A third test dataset") dataset._records = [ - DatasetRecord(input_data=[{"role" : "system", "content": "i am machine"}, {"role" : "user", "content": "hello"}], expected_output="label") + DatasetRecord( + input_data=[{"role": "system", "content": "i am machine"}, {"role": "user", "content": "hello"}], + expected_output="label", + ) ] df = dataset.as_dataframe() llmobs._delete_dataset(dataset_id=dataset._id) + def test_dataset_csv(llmobs): test_path = os.path.dirname(__file__) csv_path = os.path.join(test_path, "static_files/good_dataset.csv") - dataset = llmobs.create_dataset_from_csv(csv_path=csv_path, dataset_name="test-dataset-good-csv", description="A good csv dataset", input_data_columns=["in0", "in1", "in2"], expected_output_columns=["out0", "out1"]) + dataset = llmobs.create_dataset_from_csv( + csv_path=csv_path, + dataset_name="test-dataset-good-csv", + description="A good csv dataset", + input_data_columns=["in0", "in1", "in2"], + expected_output_columns=["out0", "out1"], + ) assert len(dataset) == 2 assert len(dataset[0]["input_data"]) == 3 assert dataset[0]["input_data"]["in0"] == "r0v1" @@ -122,6 +133,7 @@ def test_dataset_csv(llmobs): llmobs._delete_dataset(dataset_id=dataset._id) + def test_dataset_pull_non_existent(llmobs): with pytest.raises(ValueError): llmobs.pull_dataset(name="test-dataset-non-existent") From f7fbfb6ffab7bcbdca85eb53ec7212268770f8a8 Mon Sep 17 00:00:00 2001 From: Kyle Verhoog Date: Tue, 22 Jul 2025 08:34:28 -0400 Subject: [PATCH 03/20] chore(llmobs): fix misc experiments stuff --- ddtrace/llmobs/_experiment.py | 8 +++++--- ddtrace/llmobs/_writer.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index 055dc319d8f..a4063c5f473 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -217,12 +217,14 @@ def run( if not self._llmobs_instance.enabled: logger.warning( "Skipping experiment as LLMObs is not enabled. " - "Ensure LLM Observability is enabled via `LLMObs.enable(...)` or set `DD_LLMOBS_ENABLED=1`." + "Ensure LLM Observability is enabled via `LLMObs.enable(...)` or set `DD_LLMOBS_ENABLED=1` and use `ddtrace-run` to run your application." ) return [] - project_id = self._llmobs_instance._dne_client.project_get(self._project_name) - if not project_id: + try: + project_id = self._llmobs_instance._dne_client.project_get(self._project_name) + except FileNotFoundError: project_id = self._llmobs_instance._dne_client.project_create(self._project_name) + self._project_id = project_id experiment_id, experiment_run_name = self._llmobs_instance._dne_client.experiment_create( self.name, diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index af27a6efdba..884f9c861b0 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -454,7 +454,7 @@ def project_get(self, name: str) -> str: response_data = resp.get_json() data = response_data["data"] if not data: - raise ValueError(f"Project {name} not found") + raise FileNotFoundError(f"Project {name} not found") return data[0]["id"] def experiment_create( From ce17651c8866e95633517b54753ccb14b9334a64 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Tue, 22 Jul 2025 08:40:04 -0400 Subject: [PATCH 04/20] black --- ddtrace/llmobs/_experiment.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index 8d7c37869b3..4a8877846b6 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -145,10 +145,12 @@ def delete(self, index: int) -> None: del self._records[index] @overload - def __getitem__(self, index: int) -> DatasetRecord: ... + def __getitem__(self, index: int) -> DatasetRecord: + ... @overload - def __getitem__(self, index: slice) -> List[DatasetRecord]: ... + def __getitem__(self, index: slice) -> List[DatasetRecord]: + ... def __getitem__(self, index: Union[int, slice]) -> Union[DatasetRecord, List[DatasetRecord]]: return self._records.__getitem__(index) From 03314734e0cf5d9d68e24a5ff1780be6cc5613d6 Mon Sep 17 00:00:00 2001 From: Gary Huang Date: Tue, 22 Jul 2025 08:40:47 -0400 Subject: [PATCH 05/20] Update ddtrace/llmobs/_experiment.py Co-authored-by: Yun Kim <35776586+Yun-Kim@users.noreply.github.com> --- ddtrace/llmobs/_experiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index 4a8877846b6..c051d9efe0e 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -166,7 +166,7 @@ def as_dataframe(self): import pandas as pd except ImportError as e: raise ImportError( - "pandas is required to convert dataset to DataFrame. " "Please install it with `pip install pandas`" + "pandas is required to convert dataset to DataFrame. Please install via `pip install pandas`" ) from e column_tuples = set() From 7bb80ed1fc020ff888dd09963a51735187b7625d Mon Sep 17 00:00:00 2001 From: Gary Huang Date: Tue, 22 Jul 2025 08:43:41 -0400 Subject: [PATCH 06/20] Update ddtrace/llmobs/_experiment.py Co-authored-by: Yun Kim <35776586+Yun-Kim@users.noreply.github.com> --- ddtrace/llmobs/_experiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index c051d9efe0e..8aa7fe0e019 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -192,7 +192,7 @@ def as_dataframe(self): flat_record[("expected_output", "")] = expected_output # Use empty string for single output column_tuples.add(("expected_output", "")) - for k, v in record.get("metadata", {}): + for k, v in record.get("metadata", {}).items(): flat_record[("metadata", k)] = v column_tuples.add(("metadata", k)) From 319e799258f4b84a4fff7c87e71d2d4508097067 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Tue, 22 Jul 2025 09:04:36 -0400 Subject: [PATCH 07/20] address comments --- ddtrace/llmobs/_constants.py | 1 + ddtrace/llmobs/_llmobs.py | 89 ++++++++++++++---------------------- 2 files changed, 36 insertions(+), 54 deletions(-) diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py index 5e511d4694b..217b53cf347 100644 --- a/ddtrace/llmobs/_constants.py +++ b/ddtrace/llmobs/_constants.py @@ -56,6 +56,7 @@ EVP_PAYLOAD_SIZE_LIMIT = 5 << 20 # 5MB (actual limit is 5.1MB) EVP_EVENT_SIZE_LIMIT = (1 << 20) - 1024 # 999KB (actual limit is 1MB) +EXPERIMENT_CSV_FIELD_MAX_SIZE = 10 * 1024 * 1024 DROPPED_IO_COLLECTION_ERROR = "dropped_io" DROPPED_VALUE_TEXT = "[This value has been dropped because this span's size exceeds the 1MB size limit.]" diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 00a86699dac..4ac09031ad5 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -42,13 +42,14 @@ from ddtrace.internal.utils.formats import asbool from ddtrace.internal.utils.formats import format_trace_id from ddtrace.internal.utils.formats import parse_tags_str -from ddtrace.llmobs import _constants as constants, DatasetRecord +from ddtrace.llmobs import _constants as constants from ddtrace.llmobs import _telemetry as telemetry from ddtrace.llmobs._constants import ANNOTATIONS_CONTEXT_ID from ddtrace.llmobs._constants import DECORATOR from ddtrace.llmobs._constants import DISPATCH_ON_LLM_TOOL_CHOICE from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL_OUTPUT_USED +from ddtrace.llmobs._constants import EXPERIMENT_CSV_FIELD_MAX_SIZE from ddtrace.llmobs._constants import EXPERIMENT_ID_KEY from ddtrace.llmobs._constants import INPUT_DOCUMENTS from ddtrace.llmobs._constants import INPUT_MESSAGES @@ -602,66 +603,46 @@ def create_dataset_from_csv( # Store the original field size limit to restore it later original_field_size_limit = csv.field_size_limit() - csv.field_size_limit(10 * 1024 * 1024) # 10mb + csv.field_size_limit(EXPERIMENT_CSV_FIELD_MAX_SIZE) # 10mb try: - # First check if the file exists and is not empty before parsing with open(csv_path, mode="r") as csvfile: content = csvfile.readline().strip() if not content: raise ValueError("CSV file appears to be empty or header is missing.") - with open(csv_path, mode="r") as csvfile: - try: - rows = csv.DictReader(csvfile, delimiter=csv_delimiter) - - # Check header presence before trying to read rows - if rows.fieldnames is None: - # Treat files with no header at all as effectively empty - raise ValueError("CSV file appears to be empty or header is missing.") - - header_columns = rows.fieldnames - missing_input_columns = [col for col in input_data_columns if col not in header_columns] - missing_output_columns = [col for col in expected_output_columns if col not in header_columns] - - if missing_input_columns: - raise ValueError(f"Input columns not found in CSV header: {missing_input_columns}") - if missing_output_columns: - raise ValueError(f"Expected output columns not found in CSV header: {missing_output_columns}") - - # Determine metadata columns (all columns not used for input or expected output) - metadata_columns = [ - col - for col in header_columns - if col not in input_data_columns and col not in expected_output_columns - ] - - for row in rows: - try: - ds.append( - DatasetRecord( - input_data={col: row[col] for col in input_data_columns}, - expected_output={col: row[col] for col in expected_output_columns}, - metadata={col: row[col] for col in metadata_columns}, - record_id="", - ) - ) - - except KeyError as ke: - # Missing columns in a data row indicates malformed CSV - raise KeyError(f"Error parsing CSV file: missing column {ke} in a row") - except Exception as e: - # Other errors during row processing also indicate CSV issues - raise ValueError(f"Error parsing CSV file (row processing): {e}") - - except csv.Error as e: - # Catch CSV-specific parsing errors - raise ValueError(f"Error parsing CSV file: {e}") - - except FileNotFoundError as e: - raise FileNotFoundError(f"CSV file not found: {csv_path}") from e - except PermissionError as e: - raise PermissionError(f"Permission denied when reading CSV file: {csvfile}") from e + csvfile.seek(0) + + rows = csv.DictReader(csvfile, delimiter=csv_delimiter) + + if rows.fieldnames is None: + raise ValueError("CSV file appears to be empty or header is missing.") + + header_columns = rows.fieldnames + missing_input_columns = [col for col in input_data_columns if col not in header_columns] + missing_output_columns = [col for col in expected_output_columns if col not in header_columns] + + if any(col not in header_columns for col in input_data_columns): + raise ValueError(f"Input columns not found in CSV header: {missing_input_columns}") + if any(col not in header_columns for col in expected_output_columns): + raise ValueError(f"Expected output columns not found in CSV header: {missing_output_columns}") + + metadata_columns = [ + col + for col in header_columns + if col not in input_data_columns and col not in expected_output_columns + ] + + for row in rows: + ds.append( + DatasetRecord( + input_data={col: row[col] for col in input_data_columns}, + expected_output={col: row[col] for col in expected_output_columns}, + metadata={col: row[col] for col in metadata_columns}, + record_id="", + ) + ) + finally: # Always restore the original field size limit csv.field_size_limit(original_field_size_limit) From 54247181ec95428d021c02bd9226bd4fb4cd3dcd Mon Sep 17 00:00:00 2001 From: Kyle Verhoog Date: Tue, 22 Jul 2025 09:05:18 -0400 Subject: [PATCH 08/20] fmt --- ddtrace/llmobs/_experiment.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index a4063c5f473..d6e724492b4 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -217,7 +217,8 @@ def run( if not self._llmobs_instance.enabled: logger.warning( "Skipping experiment as LLMObs is not enabled. " - "Ensure LLM Observability is enabled via `LLMObs.enable(...)` or set `DD_LLMOBS_ENABLED=1` and use `ddtrace-run` to run your application." + "Ensure LLM Observability is enabled via `LLMObs.enable(...)` " + "or set `DD_LLMOBS_ENABLED=1` and use `ddtrace-run` to run your application." ) return [] try: From 5763329eb038bf6f5def77f9c5dd6ff8edae3d3f Mon Sep 17 00:00:00 2001 From: gary-huang Date: Tue, 22 Jul 2025 13:30:23 -0400 Subject: [PATCH 09/20] revert create_dataset push, this will be done in another PR --- ddtrace/llmobs/_llmobs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 1e3d516624c..580036d5546 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -594,6 +594,8 @@ def create_dataset(cls, name: str, description: str, records: List[DatasetRecord ds = cls._instance._dne_client.dataset_create(name, description) for r in records: ds.append(r) + if len(records) > 0: + ds.push() return ds @classmethod From dc389eab57cca66f8cf2cc1ea044048fb7e8a98a Mon Sep 17 00:00:00 2001 From: gary-huang Date: Tue, 22 Jul 2025 19:25:57 -0400 Subject: [PATCH 10/20] add more tests --- ddtrace/llmobs/_llmobs.py | 9 +- tests/llmobs/static_files/empty.csv | 0 .../good_dataset_pipe_separated.csv | 3 + tests/llmobs/test_experiments.py | 92 ++++++++++++++++++- 4 files changed, 97 insertions(+), 7 deletions(-) create mode 100644 tests/llmobs/static_files/empty.csv create mode 100644 tests/llmobs/static_files/good_dataset_pipe_separated.csv diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 580036d5546..fa190c182c7 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -632,17 +632,14 @@ def create_dataset_from_csv( header_columns = rows.fieldnames missing_input_columns = [col for col in input_data_columns if col not in header_columns] missing_output_columns = [col for col in expected_output_columns if col not in header_columns] + missing_metadata_columns = [col for col in metadata_columns if col not in metadata_columns] if any(col not in header_columns for col in input_data_columns): raise ValueError(f"Input columns not found in CSV header: {missing_input_columns}") if any(col not in header_columns for col in expected_output_columns): raise ValueError(f"Expected output columns not found in CSV header: {missing_output_columns}") - - metadata_columns = [ - col - for col in header_columns - if col not in input_data_columns and col not in expected_output_columns - ] + if any(col not in header_columns for col in metadata_columns): + raise ValueError(f"Metadata columns not found in CSV header: {missing_metadata_columns}") for row in rows: ds.append( diff --git a/tests/llmobs/static_files/empty.csv b/tests/llmobs/static_files/empty.csv new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/llmobs/static_files/good_dataset_pipe_separated.csv b/tests/llmobs/static_files/good_dataset_pipe_separated.csv new file mode 100644 index 00000000000..7a5a698efe1 --- /dev/null +++ b/tests/llmobs/static_files/good_dataset_pipe_separated.csv @@ -0,0 +1,3 @@ +in0|in1|in2|out0|out1|m0 +r0v1|r0v2|r0v3|r0v4|r0v5|r0v6 +r1v1|r1v2|r1v3|r1v4|r1v5|r1v6 diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index 518503e889d..24b84145d98 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -95,8 +95,65 @@ def test_dataset_as_dataframe(llmobs): ) ] df = dataset.as_dataframe() + assert len(df.columns) == 2 + assert df.size == 2 # size is num elements in a series llmobs._delete_dataset(dataset_id=dataset._id) +def test_csv_dataset_as_dataframe(llmobs): + test_path = os.path.dirname(__file__) + csv_path = os.path.join(test_path, "static_files/good_dataset.csv") + dataset = llmobs.create_dataset_from_csv( + csv_path=csv_path, + dataset_name="test-dataset-good-csv", + description="A good csv dataset", + input_data_columns=["in0", "in1", "in2"], + expected_output_columns=["out0", "out1"], + metadata_columns=["m0"] + ) + assert len(dataset) == 2 + + df = dataset.as_dataframe() + assert len(df.columns) == 6 + assert sorted(df.columns) == [("expected_output", "out0"), ("expected_output", "out1"), ("input_data", "in0"), ("input_data", "in1"), ("input_data", "in2"), ("metadata", "m0")] + + llmobs._delete_dataset(dataset_id=dataset._id) + + +def test_dataset_csv_missing_input_col(llmobs): + test_path = os.path.dirname(__file__) + csv_path = os.path.join(test_path, "static_files/good_dataset.csv") + with pytest.raises(ValueError, match=re.escape("Input columns not found in CSV header: [\'in998\', \'in999\']")): + llmobs.create_dataset_from_csv( + csv_path=csv_path, + dataset_name="test-dataset-good-csv", + description="A good csv dataset", + input_data_columns=["in998", "in999"], + expected_output_columns=["out0", "out1"], + ) + +def test_dataset_csv_missing_output_col(llmobs): + test_path = os.path.dirname(__file__) + csv_path = os.path.join(test_path, "static_files/good_dataset.csv") + with pytest.raises(ValueError, match=re.escape("Expected output columns not found in CSV header: [\'out999\']")): + llmobs.create_dataset_from_csv( + csv_path=csv_path, + dataset_name="test-dataset-good-csv", + description="A good csv dataset", + input_data_columns=["in0", "in1", "in2"], + expected_output_columns=["out999"], + ) + +def test_dataset_csv_empty_csv(llmobs): + test_path = os.path.dirname(__file__) + csv_path = os.path.join(test_path, "static_files/empty.csv") + with pytest.raises(ValueError, match=re.escape("CSV file appears to be empty or header is missing.")): + llmobs.create_dataset_from_csv( + csv_path=csv_path, + dataset_name="test-dataset-bad-csv", + description="not a real csv dataset", + input_data_columns=["in0", "in1", "in2"], + expected_output_columns=["out0"], + ) def test_dataset_csv(llmobs): test_path = os.path.dirname(__file__) @@ -123,12 +180,45 @@ def test_dataset_csv(llmobs): assert dataset[1]["expected_output"]["out0"] == "r1v4" assert dataset[1]["expected_output"]["out1"] == "r1v5" + assert dataset.description == "A good csv dataset" + + llmobs._delete_dataset(dataset_id=dataset._id) + + +def test_dataset_csv_pipe_separated(llmobs): + test_path = os.path.dirname(__file__) + csv_path = os.path.join(test_path, "static_files/good_dataset_pipe_separated.csv") + dataset = llmobs.create_dataset_from_csv( + csv_path=csv_path, + dataset_name="test-dataset-good-csv-pipe", + description="A good pipe separated csv dataset", + input_data_columns=["in0", "in1", "in2"], + expected_output_columns=["out0", "out1"], + metadata_columns=["m0"], + csv_delimiter="|", + ) + assert len(dataset) == 2 + assert len(dataset[0]["input_data"]) == 3 + assert dataset[0]["input_data"]["in0"] == "r0v1" + assert dataset[0]["input_data"]["in1"] == "r0v2" + assert dataset[0]["input_data"]["in2"] == "r0v3" + assert dataset[1]["input_data"]["in0"] == "r1v1" + assert dataset[1]["input_data"]["in1"] == "r1v2" + assert dataset[1]["input_data"]["in2"] == "r1v3" + + assert len(dataset[0]["expected_output"]) == 2 + assert dataset[0]["expected_output"]["out0"] == "r0v4" + assert dataset[0]["expected_output"]["out1"] == "r0v5" + assert dataset[1]["expected_output"]["out0"] == "r1v4" + assert dataset[1]["expected_output"]["out1"] == "r1v5" + assert len(dataset[0]["metadata"]) == 1 assert dataset[0]["metadata"]["m0"] == "r0v6" assert dataset[1]["metadata"]["m0"] == "r1v6" - llmobs._delete_dataset(dataset_id=dataset._id) + assert dataset.description == "A good pipe separated csv dataset" + llmobs._delete_dataset(dataset_id=dataset._id) def test_dataset_pull_non_existent(llmobs): with pytest.raises(ValueError): From ebc060ecf67f96e00c07d577cb5272e42f4e72b7 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Tue, 22 Jul 2025 22:34:32 -0400 Subject: [PATCH 11/20] wip push on create csV --- ddtrace/llmobs/_llmobs.py | 2 ++ tests/llmobs/test_experiments.py | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index a30cbb07110..4043fabbd40 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -656,6 +656,8 @@ def create_dataset_from_csv( # Always restore the original field size limit csv.field_size_limit(original_field_size_limit) + if len(ds) > 0: + ds.push() return ds @classmethod diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index 7466f15635b..f36cdf32166 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -182,6 +182,16 @@ def test_dataset_csv(llmobs): assert dataset.description == "A good csv dataset" + assert dataset._id is not None + + wait_for_backend() + ds = llmobs.pull_dataset(name=dataset.name) + + assert len(ds) == len(dataset) + assert ds.name == dataset.name + assert ds.description == dataset.description + assert ds._version == 1 + llmobs._delete_dataset(dataset_id=dataset._id) From 4c9087289f3ce366774c434006c16d64a5484340 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Tue, 22 Jul 2025 22:34:32 -0400 Subject: [PATCH 12/20] push on create csv --- ...9cb46e5ed3_batch_update_post_466da69c.yaml | 49 +++++++++++++++++++ ...05e-1d9cb46e5ed3_records_get_46238641.yaml | 45 +++++++++++++++++ ...b33f55d51d_batch_update_post_b8c84073.yaml | 49 +++++++++++++++++++ ...bbf-95b33f55d51d_records_get_d7e731ad.yaml | 45 +++++++++++++++++ ...-obs_v1_datasets_delete_post_d7976483.yaml | 46 +++++++++++++++++ ...-obs_v1_datasets_delete_post_f8dc510e.yaml | 46 +++++++++++++++++ ...st-dataset-good-csv-pipe_get_bcb704ce.yaml | 46 +++++++++++++++++ ...e__test-dataset-good-csv_get_989b2028.yaml | 46 +++++++++++++++++ ...ble_llm-obs_v1_datasets_post_027be704.yaml | 46 +++++++++++++++++ ...ble_llm-obs_v1_datasets_post_c1d4ae31.yaml | 46 +++++++++++++++++ ...ble_llm-obs_v1_datasets_post_d59b5313.yaml | 46 +++++++++++++++++ tests/llmobs/test_experiments.py | 10 ++++ 12 files changed, 520 insertions(+) create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_466da69c.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_records_get_46238641.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_batch_update_post_b8c84073.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_records_get_d7e731ad.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_d7976483.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_f8dc510e.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv-pipe_get_bcb704ce.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv_get_989b2028.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_027be704.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_c1d4ae31.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_d59b5313.yaml diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_466da69c.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_466da69c.yaml new file mode 100644 index 00000000000..81adf342dcc --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_466da69c.yaml @@ -0,0 +1,49 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "id": "8dbba503-cf48-4e82-805e-1d9cb46e5ed3", + "attributes": {"insert_records": [{"input": {"in0": "r0v1", "in1": "r0v2", "in2": + "r0v3"}, "expected_output": {"out0": "r0v4", "out1": "r0v5"}, "metadata": {}}, + {"input": {"in0": "r1v1", "in1": "r1v2", "in2": "r1v3"}, "expected_output": + {"out0": "r1v4", "out1": "r1v5"}, "metadata": {}}], "update_records": [], "delete_records": + []}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '410' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/8dbba503-cf48-4e82-805e-1d9cb46e5ed3/batch_update + response: + body: + string: '{"data":[{"id":"79dc4c72-e4fd-4de7-b46e-419b8072a207","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:48.152449869Z","dataset_id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","expected_output":{"out0":"r0v4","out1":"r0v5"},"input":{"in0":"r0v1","in1":"r0v2","in2":"r0v3"},"metadata":{},"updated_at":"2025-07-23T02:50:48.152449869Z","version":1}},{"id":"f0f3fe26-1060-452f-a292-30b1293dd5a1","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:48.152449869Z","dataset_id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","expected_output":{"out0":"r1v4","out1":"r1v5"},"input":{"in0":"r1v1","in1":"r1v2","in2":"r1v3"},"metadata":{},"updated_at":"2025-07-23T02:50:48.152449869Z","version":1}}]}' + headers: + content-length: + - '812' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Wed, 23 Jul 2025 02:50:48 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_records_get_46238641.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_records_get_46238641.yaml new file mode 100644 index 00000000000..f8b4fdefe78 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_records_get_46238641.yaml @@ -0,0 +1,45 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/8dbba503-cf48-4e82-805e-1d9cb46e5ed3/records + response: + body: + string: '{"data":[{"id":"79dc4c72-e4fd-4de7-b46e-419b8072a207","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:48.152449Z","dataset_id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","expected_output":{"out0":"r0v4","out1":"r0v5"},"input":{"in0":"r0v1","in1":"r0v2","in2":"r0v3"},"metadata":{},"updated_at":"2025-07-23T02:50:48.152449Z"}},{"id":"f0f3fe26-1060-452f-a292-30b1293dd5a1","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:48.152449Z","dataset_id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","expected_output":{"out0":"r1v4","out1":"r1v5"},"input":{"in0":"r1v1","in1":"r1v2","in2":"r1v3"},"metadata":{},"updated_at":"2025-07-23T02:50:48.152449Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '796' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Wed, 23 Jul 2025 02:50:52 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_batch_update_post_b8c84073.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_batch_update_post_b8c84073.yaml new file mode 100644 index 00000000000..eb8a50f0d92 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_batch_update_post_b8c84073.yaml @@ -0,0 +1,49 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "id": "acf19ca4-8062-4548-abbf-95b33f55d51d", + "attributes": {"insert_records": [{"input": {"in0": "r0v1", "in1": "r0v2", "in2": + "r0v3"}, "expected_output": {"out0": "r0v4", "out1": "r0v5"}, "metadata": {"m0": + "r0v6"}}, {"input": {"in0": "r1v1", "in1": "r1v2", "in2": "r1v3"}, "expected_output": + {"out0": "r1v4", "out1": "r1v5"}, "metadata": {"m0": "r1v6"}}], "update_records": + [], "delete_records": []}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '434' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/acf19ca4-8062-4548-abbf-95b33f55d51d/batch_update + response: + body: + string: '{"data":[{"id":"016350fb-8b44-4d5d-af57-0c8408bb5d25","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.795124332Z","dataset_id":"acf19ca4-8062-4548-abbf-95b33f55d51d","expected_output":{"out0":"r0v4","out1":"r0v5"},"input":{"in0":"r0v1","in1":"r0v2","in2":"r0v3"},"metadata":{"m0":"r0v6"},"updated_at":"2025-07-23T02:50:52.795124332Z","version":1}},{"id":"91fcfd23-040c-40ad-a0b0-f497f10b41e3","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.795124332Z","dataset_id":"acf19ca4-8062-4548-abbf-95b33f55d51d","expected_output":{"out0":"r1v4","out1":"r1v5"},"input":{"in0":"r1v1","in1":"r1v2","in2":"r1v3"},"metadata":{"m0":"r1v6"},"updated_at":"2025-07-23T02:50:52.795124332Z","version":1}}]}' + headers: + content-length: + - '834' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Wed, 23 Jul 2025 02:50:52 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_records_get_d7e731ad.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_records_get_d7e731ad.yaml new file mode 100644 index 00000000000..51d0723acfa --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_records_get_d7e731ad.yaml @@ -0,0 +1,45 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/acf19ca4-8062-4548-abbf-95b33f55d51d/records + response: + body: + string: '{"data":[{"id":"016350fb-8b44-4d5d-af57-0c8408bb5d25","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.795124Z","dataset_id":"acf19ca4-8062-4548-abbf-95b33f55d51d","expected_output":{"out0":"r0v4","out1":"r0v5"},"input":{"in0":"r0v1","in1":"r0v2","in2":"r0v3"},"metadata":{"m0":"r0v6"},"updated_at":"2025-07-23T02:50:52.795124Z"}},{"id":"91fcfd23-040c-40ad-a0b0-f497f10b41e3","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.795124Z","dataset_id":"acf19ca4-8062-4548-abbf-95b33f55d51d","expected_output":{"out0":"r1v4","out1":"r1v5"},"input":{"in0":"r1v1","in1":"r1v2","in2":"r1v3"},"metadata":{"m0":"r1v6"},"updated_at":"2025-07-23T02:50:52.795124Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '818' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Wed, 23 Jul 2025 02:50:55 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_d7976483.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_d7976483.yaml new file mode 100644 index 00000000000..698ca4baa2d --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_d7976483.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "attributes": {"type": "soft", "dataset_ids": + ["8dbba503-cf48-4e82-805e-1d9cb46e5ed3"]}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '119' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/delete + response: + body: + string: '{"data":[{"id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:46.680898Z","current_version":1,"deleted_at":"2025-07-23T02:50:52.35158Z","description":"A + good csv dataset","name":"test-dataset-good-csv","updated_at":"2025-07-23T02:50:48.30675Z"}}]}' + headers: + content-length: + - '357' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Wed, 23 Jul 2025 02:50:52 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_f8dc510e.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_f8dc510e.yaml new file mode 100644 index 00000000000..10725a15b17 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_f8dc510e.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "attributes": {"type": "soft", "dataset_ids": + ["acf19ca4-8062-4548-abbf-95b33f55d51d"]}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '119' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/delete + response: + body: + string: '{"data":[{"id":"acf19ca4-8062-4548-abbf-95b33f55d51d","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.663543Z","current_version":1,"deleted_at":"2025-07-23T02:50:55.361839Z","description":"A + good pipe separated csv dataset","name":"test-dataset-good-csv-pipe","updated_at":"2025-07-23T02:50:52.943101Z"}}]}' + headers: + content-length: + - '379' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Wed, 23 Jul 2025 02:50:55 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv-pipe_get_bcb704ce.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv-pipe_get_bcb704ce.yaml new file mode 100644 index 00000000000..032e3c997f3 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv-pipe_get_bcb704ce.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets?filter%5Bname%5D=test-dataset-good-csv-pipe + response: + body: + string: '{"data":[{"id":"acf19ca4-8062-4548-abbf-95b33f55d51d","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.663543Z","current_version":1,"description":"A + good pipe separated csv dataset","name":"test-dataset-good-csv-pipe","updated_at":"2025-07-23T02:50:52.943101Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '356' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Wed, 23 Jul 2025 02:50:55 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv_get_989b2028.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv_get_989b2028.yaml new file mode 100644 index 00000000000..02e7ba102ae --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv_get_989b2028.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets?filter%5Bname%5D=test-dataset-good-csv + response: + body: + string: '{"data":[{"id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:46.680898Z","current_version":1,"description":"A + good csv dataset","name":"test-dataset-good-csv","updated_at":"2025-07-23T02:50:48.30675Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '335' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Wed, 23 Jul 2025 02:50:51 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_027be704.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_027be704.yaml new file mode 100644 index 00000000000..232be655b93 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_027be704.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "attributes": {"name": "test-dataset-good-csv-pipe", + "description": "A good pipe separated csv dataset"}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '136' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets + response: + body: + string: '{"data":{"id":"acf19ca4-8062-4548-abbf-95b33f55d51d","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.663543594Z","current_version":0,"description":"A + good pipe separated csv dataset","name":"test-dataset-good-csv-pipe","updated_at":"2025-07-23T02:50:52.663543594Z"}}}' + headers: + content-length: + - '340' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Wed, 23 Jul 2025 02:50:52 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_c1d4ae31.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_c1d4ae31.yaml new file mode 100644 index 00000000000..de032063c0c --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_c1d4ae31.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "attributes": {"name": "test-dataset-good-csv", + "description": "A good csv dataset"}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '116' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets + response: + body: + string: '{"data":{"id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:46.680898186Z","current_version":0,"description":"A + good csv dataset","name":"test-dataset-good-csv","updated_at":"2025-07-23T02:50:46.680898186Z"}}}' + headers: + content-length: + - '320' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Wed, 23 Jul 2025 02:50:46 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_d59b5313.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_d59b5313.yaml new file mode 100644 index 00000000000..07a85cd5c60 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_d59b5313.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "attributes": {"name": "test-dataset-bad-csv", + "description": "not a real csv dataset"}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '119' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets + response: + body: + string: '{"data":{"id":"6890ba6e-8023-414c-bdb1-662c63f9f489","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-22T23:10:06.524865Z","current_version":0,"description":"not + a real csv dataset","name":"test-dataset-bad-csv","updated_at":"2025-07-22T23:10:06.524865Z"}}}' + headers: + content-length: + - '317' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Wed, 23 Jul 2025 02:50:47 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index f36cdf32166..74b774ad30d 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -228,6 +228,16 @@ def test_dataset_csv_pipe_separated(llmobs): assert dataset.description == "A good pipe separated csv dataset" + assert dataset._id is not None + + wait_for_backend() + ds = llmobs.pull_dataset(name=dataset.name) + + assert len(ds) == len(dataset) + assert ds.name == dataset.name + assert ds.description == dataset.description + assert ds._version == 1 + llmobs._delete_dataset(dataset_id=dataset._id) def test_dataset_pull_non_existent(llmobs): From 772c544fa117df691a63f0273b71c9f4684633a8 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Tue, 22 Jul 2025 22:53:40 -0400 Subject: [PATCH 13/20] restore setup --- setup.cfg | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/setup.cfg b/setup.cfg index 501d7d10115..a62b967bb25 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,6 +6,19 @@ skip = *.json,*.h,*.cpp,*.c,.riot,.tox,.mypy_cache,.git,*ddtrace/vendor,tests/co exclude-file = .codespellignorelines ignore-words-list = asend,dne,fo,medias,ment,nin,ot,setttings,statics,ba,spawnve,doas +# DEV: We use `conftest.py` as a local pytest plugin to configure hooks for collection +[tool:pytest] +# --cov-report is intentionally empty else pytest-cov will default to generating a report +addopts = + --cov=ddtrace/ + --cov=tests/ + --cov-append + --cov-report= + --durations=10 + --junitxml=test-results/junit.xml +# DEV: The default is `test_*\.py` which will miss `test.py` files +python_files = test*\.py +asyncio_mode = auto [flake8] max-line-length = 120 From 59b993779ae718220149ff33687238fd5ad543e4 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Tue, 22 Jul 2025 22:54:34 -0400 Subject: [PATCH 14/20] black --- tests/llmobs/test_experiments.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index 74b774ad30d..27cba20bc7a 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -96,9 +96,10 @@ def test_dataset_as_dataframe(llmobs): ] df = dataset.as_dataframe() assert len(df.columns) == 2 - assert df.size == 2 # size is num elements in a series + assert df.size == 2 # size is num elements in a series llmobs._delete_dataset(dataset_id=dataset._id) + def test_csv_dataset_as_dataframe(llmobs): test_path = os.path.dirname(__file__) csv_path = os.path.join(test_path, "static_files/good_dataset.csv") @@ -108,13 +109,20 @@ def test_csv_dataset_as_dataframe(llmobs): description="A good csv dataset", input_data_columns=["in0", "in1", "in2"], expected_output_columns=["out0", "out1"], - metadata_columns=["m0"] + metadata_columns=["m0"], ) assert len(dataset) == 2 df = dataset.as_dataframe() assert len(df.columns) == 6 - assert sorted(df.columns) == [("expected_output", "out0"), ("expected_output", "out1"), ("input_data", "in0"), ("input_data", "in1"), ("input_data", "in2"), ("metadata", "m0")] + assert sorted(df.columns) == [ + ("expected_output", "out0"), + ("expected_output", "out1"), + ("input_data", "in0"), + ("input_data", "in1"), + ("input_data", "in2"), + ("metadata", "m0"), + ] llmobs._delete_dataset(dataset_id=dataset._id) @@ -122,7 +130,7 @@ def test_csv_dataset_as_dataframe(llmobs): def test_dataset_csv_missing_input_col(llmobs): test_path = os.path.dirname(__file__) csv_path = os.path.join(test_path, "static_files/good_dataset.csv") - with pytest.raises(ValueError, match=re.escape("Input columns not found in CSV header: [\'in998\', \'in999\']")): + with pytest.raises(ValueError, match=re.escape("Input columns not found in CSV header: ['in998', 'in999']")): llmobs.create_dataset_from_csv( csv_path=csv_path, dataset_name="test-dataset-good-csv", @@ -131,10 +139,11 @@ def test_dataset_csv_missing_input_col(llmobs): expected_output_columns=["out0", "out1"], ) + def test_dataset_csv_missing_output_col(llmobs): test_path = os.path.dirname(__file__) csv_path = os.path.join(test_path, "static_files/good_dataset.csv") - with pytest.raises(ValueError, match=re.escape("Expected output columns not found in CSV header: [\'out999\']")): + with pytest.raises(ValueError, match=re.escape("Expected output columns not found in CSV header: ['out999']")): llmobs.create_dataset_from_csv( csv_path=csv_path, dataset_name="test-dataset-good-csv", @@ -143,6 +152,7 @@ def test_dataset_csv_missing_output_col(llmobs): expected_output_columns=["out999"], ) + def test_dataset_csv_empty_csv(llmobs): test_path = os.path.dirname(__file__) csv_path = os.path.join(test_path, "static_files/empty.csv") @@ -155,6 +165,7 @@ def test_dataset_csv_empty_csv(llmobs): expected_output_columns=["out0"], ) + def test_dataset_csv(llmobs): test_path = os.path.dirname(__file__) csv_path = os.path.join(test_path, "static_files/good_dataset.csv") @@ -240,6 +251,7 @@ def test_dataset_csv_pipe_separated(llmobs): llmobs._delete_dataset(dataset_id=dataset._id) + def test_dataset_pull_non_existent(llmobs): with pytest.raises(ValueError): llmobs.pull_dataset(name="test-dataset-non-existent") From 2ca92fe551113cb29b3e19d7a3f5e3bc6b088157 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Wed, 23 Jul 2025 09:26:36 -0400 Subject: [PATCH 15/20] address comments --- ddtrace/llmobs/_experiment.py | 2 +- tests/llmobs/test_experiments.py | 219 ++++++++++++++++--------------- 2 files changed, 113 insertions(+), 108 deletions(-) diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index 6c0781e0491..3eda6192a61 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -171,7 +171,7 @@ def __len__(self) -> int: def __iter__(self) -> Iterator[DatasetRecord]: return iter(self._records) - def as_dataframe(self): + def as_dataframe(self) -> None: try: import pandas as pd except ImportError as e: diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index 27cba20bc7a..163e50a27fe 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -86,45 +86,42 @@ def test_dataset_create_delete(llmobs): llmobs._delete_dataset(dataset_id=dataset._id) -def test_dataset_as_dataframe(llmobs): - dataset = llmobs.create_dataset(name="test-dataset-3", description="A third test dataset") - dataset._records = [ - DatasetRecord( - input_data=[{"role": "system", "content": "i am machine"}, {"role": "user", "content": "hello"}], - expected_output="label", - ) - ] +def test_dataset_as_dataframe(llmobs, test_dataset_one_record): + dataset = test_dataset_one_record df = dataset.as_dataframe() assert len(df.columns) == 2 assert df.size == 2 # size is num elements in a series - llmobs._delete_dataset(dataset_id=dataset._id) def test_csv_dataset_as_dataframe(llmobs): test_path = os.path.dirname(__file__) csv_path = os.path.join(test_path, "static_files/good_dataset.csv") - dataset = llmobs.create_dataset_from_csv( - csv_path=csv_path, - dataset_name="test-dataset-good-csv", - description="A good csv dataset", - input_data_columns=["in0", "in1", "in2"], - expected_output_columns=["out0", "out1"], - metadata_columns=["m0"], - ) - assert len(dataset) == 2 - - df = dataset.as_dataframe() - assert len(df.columns) == 6 - assert sorted(df.columns) == [ - ("expected_output", "out0"), - ("expected_output", "out1"), - ("input_data", "in0"), - ("input_data", "in1"), - ("input_data", "in2"), - ("metadata", "m0"), - ] - - llmobs._delete_dataset(dataset_id=dataset._id) + dataset_id = None + try: + dataset = llmobs.create_dataset_from_csv( + csv_path=csv_path, + dataset_name="test-dataset-good-csv", + description="A good csv dataset", + input_data_columns=["in0", "in1", "in2"], + expected_output_columns=["out0", "out1"], + metadata_columns=["m0"], + ) + dataset_id = dataset._id + assert len(dataset) == 2 + + df = dataset.as_dataframe() + assert len(df.columns) == 6 + assert sorted(df.columns) == [ + ("expected_output", "out0"), + ("expected_output", "out1"), + ("input_data", "in0"), + ("input_data", "in1"), + ("input_data", "in2"), + ("metadata", "m0"), + ] + finally: + if dataset_id: + llmobs._delete_dataset(dataset_id=dataset_id) def test_dataset_csv_missing_input_col(llmobs): @@ -169,87 +166,95 @@ def test_dataset_csv_empty_csv(llmobs): def test_dataset_csv(llmobs): test_path = os.path.dirname(__file__) csv_path = os.path.join(test_path, "static_files/good_dataset.csv") - dataset = llmobs.create_dataset_from_csv( - csv_path=csv_path, - dataset_name="test-dataset-good-csv", - description="A good csv dataset", - input_data_columns=["in0", "in1", "in2"], - expected_output_columns=["out0", "out1"], - ) - assert len(dataset) == 2 - assert len(dataset[0]["input_data"]) == 3 - assert dataset[0]["input_data"]["in0"] == "r0v1" - assert dataset[0]["input_data"]["in1"] == "r0v2" - assert dataset[0]["input_data"]["in2"] == "r0v3" - assert dataset[1]["input_data"]["in0"] == "r1v1" - assert dataset[1]["input_data"]["in1"] == "r1v2" - assert dataset[1]["input_data"]["in2"] == "r1v3" - - assert len(dataset[0]["expected_output"]) == 2 - assert dataset[0]["expected_output"]["out0"] == "r0v4" - assert dataset[0]["expected_output"]["out1"] == "r0v5" - assert dataset[1]["expected_output"]["out0"] == "r1v4" - assert dataset[1]["expected_output"]["out1"] == "r1v5" - - assert dataset.description == "A good csv dataset" - - assert dataset._id is not None - - wait_for_backend() - ds = llmobs.pull_dataset(name=dataset.name) - - assert len(ds) == len(dataset) - assert ds.name == dataset.name - assert ds.description == dataset.description - assert ds._version == 1 - - llmobs._delete_dataset(dataset_id=dataset._id) + dataset_id = None + try: + dataset = llmobs.create_dataset_from_csv( + csv_path=csv_path, + dataset_name="test-dataset-good-csv", + description="A good csv dataset", + input_data_columns=["in0", "in1", "in2"], + expected_output_columns=["out0", "out1"], + ) + dataset_id = dataset._id + assert len(dataset) == 2 + assert len(dataset[0]["input_data"]) == 3 + assert dataset[0]["input_data"]["in0"] == "r0v1" + assert dataset[0]["input_data"]["in1"] == "r0v2" + assert dataset[0]["input_data"]["in2"] == "r0v3" + assert dataset[1]["input_data"]["in0"] == "r1v1" + assert dataset[1]["input_data"]["in1"] == "r1v2" + assert dataset[1]["input_data"]["in2"] == "r1v3" + + assert len(dataset[0]["expected_output"]) == 2 + assert dataset[0]["expected_output"]["out0"] == "r0v4" + assert dataset[0]["expected_output"]["out1"] == "r0v5" + assert dataset[1]["expected_output"]["out0"] == "r1v4" + assert dataset[1]["expected_output"]["out1"] == "r1v5" + + assert dataset.description == "A good csv dataset" + + assert dataset._id is not None + + wait_for_backend() + ds = llmobs.pull_dataset(name=dataset.name) + + assert len(ds) == len(dataset) + assert ds.name == dataset.name + assert ds.description == dataset.description + assert ds._version == 1 + finally: + if dataset_id: + llmobs._delete_dataset(dataset_id=dataset_id) def test_dataset_csv_pipe_separated(llmobs): test_path = os.path.dirname(__file__) csv_path = os.path.join(test_path, "static_files/good_dataset_pipe_separated.csv") - dataset = llmobs.create_dataset_from_csv( - csv_path=csv_path, - dataset_name="test-dataset-good-csv-pipe", - description="A good pipe separated csv dataset", - input_data_columns=["in0", "in1", "in2"], - expected_output_columns=["out0", "out1"], - metadata_columns=["m0"], - csv_delimiter="|", - ) - assert len(dataset) == 2 - assert len(dataset[0]["input_data"]) == 3 - assert dataset[0]["input_data"]["in0"] == "r0v1" - assert dataset[0]["input_data"]["in1"] == "r0v2" - assert dataset[0]["input_data"]["in2"] == "r0v3" - assert dataset[1]["input_data"]["in0"] == "r1v1" - assert dataset[1]["input_data"]["in1"] == "r1v2" - assert dataset[1]["input_data"]["in2"] == "r1v3" - - assert len(dataset[0]["expected_output"]) == 2 - assert dataset[0]["expected_output"]["out0"] == "r0v4" - assert dataset[0]["expected_output"]["out1"] == "r0v5" - assert dataset[1]["expected_output"]["out0"] == "r1v4" - assert dataset[1]["expected_output"]["out1"] == "r1v5" - - assert len(dataset[0]["metadata"]) == 1 - assert dataset[0]["metadata"]["m0"] == "r0v6" - assert dataset[1]["metadata"]["m0"] == "r1v6" - - assert dataset.description == "A good pipe separated csv dataset" - - assert dataset._id is not None - - wait_for_backend() - ds = llmobs.pull_dataset(name=dataset.name) - - assert len(ds) == len(dataset) - assert ds.name == dataset.name - assert ds.description == dataset.description - assert ds._version == 1 - - llmobs._delete_dataset(dataset_id=dataset._id) + dataset_id = None + try: + dataset = llmobs.create_dataset_from_csv( + csv_path=csv_path, + dataset_name="test-dataset-good-csv-pipe", + description="A good pipe separated csv dataset", + input_data_columns=["in0", "in1", "in2"], + expected_output_columns=["out0", "out1"], + metadata_columns=["m0"], + csv_delimiter="|", + ) + dataset_id = dataset._id + assert len(dataset) == 2 + assert len(dataset[0]["input_data"]) == 3 + assert dataset[0]["input_data"]["in0"] == "r0v1" + assert dataset[0]["input_data"]["in1"] == "r0v2" + assert dataset[0]["input_data"]["in2"] == "r0v3" + assert dataset[1]["input_data"]["in0"] == "r1v1" + assert dataset[1]["input_data"]["in1"] == "r1v2" + assert dataset[1]["input_data"]["in2"] == "r1v3" + + assert len(dataset[0]["expected_output"]) == 2 + assert dataset[0]["expected_output"]["out0"] == "r0v4" + assert dataset[0]["expected_output"]["out1"] == "r0v5" + assert dataset[1]["expected_output"]["out0"] == "r1v4" + assert dataset[1]["expected_output"]["out1"] == "r1v5" + + assert len(dataset[0]["metadata"]) == 1 + assert dataset[0]["metadata"]["m0"] == "r0v6" + assert dataset[1]["metadata"]["m0"] == "r1v6" + + assert dataset.description == "A good pipe separated csv dataset" + + assert dataset._id is not None + + wait_for_backend() + ds = llmobs.pull_dataset(name=dataset.name) + + assert len(ds) == len(dataset) + assert ds.name == dataset.name + assert ds.description == dataset.description + assert ds._version == 1 + finally: + if dataset_id: + llmobs._delete_dataset(dataset_id=dataset._id) def test_dataset_pull_non_existent(llmobs): From 0d94cbbd803f23ba6f67b06ad608cb1a20a2a944 Mon Sep 17 00:00:00 2001 From: Kyle Verhoog Date: Wed, 23 Jul 2025 11:12:55 -0400 Subject: [PATCH 16/20] riotfiles --- .../requirements/{1fe8dd2.txt => 1900591.txt} | 80 +++++++++--------- .../requirements/{1687eab.txt => 46e9996.txt} | 80 +++++++++--------- .../requirements/{771848b.txt => 5908834.txt} | 80 +++++++++--------- .../requirements/{146f2d8.txt => 97f1328.txt} | 38 +++++---- .../requirements/{12c5529.txt => f37741b.txt} | 82 ++++++++++--------- .../requirements/{4102ef5.txt => ffd66c1.txt} | 80 +++++++++--------- 6 files changed, 231 insertions(+), 209 deletions(-) rename .riot/requirements/{1fe8dd2.txt => 1900591.txt} (54%) rename .riot/requirements/{1687eab.txt => 46e9996.txt} (50%) rename .riot/requirements/{771848b.txt => 5908834.txt} (52%) rename .riot/requirements/{146f2d8.txt => 97f1328.txt} (74%) rename .riot/requirements/{12c5529.txt => f37741b.txt} (54%) rename .riot/requirements/{4102ef5.txt => ffd66c1.txt} (54%) diff --git a/.riot/requirements/1fe8dd2.txt b/.riot/requirements/1900591.txt similarity index 54% rename from .riot/requirements/1fe8dd2.txt rename to .riot/requirements/1900591.txt index 72ea102ea03..ed309c651a9 100644 --- a/.riot/requirements/1fe8dd2.txt +++ b/.riot/requirements/1900591.txt @@ -2,33 +2,35 @@ # This file is autogenerated by pip-compile with Python 3.13 # by the following command: # -# pip-compile --allow-unsafe --no-annotate .riot/requirements/1fe8dd2.in +# pip-compile --allow-unsafe --no-annotate .riot/requirements/1900591.in # -aiohappyeyeballs==2.4.8 -aiohttp==3.11.13 -aiosignal==1.3.2 +aiohappyeyeballs==2.6.1 +aiohttp==3.12.14 +aiosignal==1.4.0 annotated-types==0.7.0 -anyio==4.8.0 +anyio==4.9.0 appdirs==1.4.4 -attrs==25.1.0 -certifi==2025.1.31 -charset-normalizer==3.4.1 -coverage[toml]==7.6.12 +attrs==25.3.0 +certifi==2025.7.14 +charset-normalizer==3.4.2 +coverage[toml]==7.9.2 dataclasses-json==0.6.7 -datasets==3.3.2 +datasets==4.0.0 dill==0.3.8 distro==1.9.0 -filelock==3.17.0 -frozenlist==1.5.0 -fsspec[http]==2024.12.0 -h11==0.14.0 -httpcore==1.0.7 +filelock==3.18.0 +frozenlist==1.7.0 +fsspec[http]==2025.3.0 +greenlet==3.2.3 +h11==0.16.0 +hf-xet==1.1.5 +httpcore==1.0.9 httpx==0.28.1 -huggingface-hub==0.29.2 +huggingface-hub==0.33.4 hypothesis==6.45.0 idna==3.10 -iniconfig==2.0.0 -jiter==0.8.2 +iniconfig==2.1.0 +jiter==0.10.0 jsonpatch==1.33 jsonpointer==3.0.0 langchain==0.2.17 @@ -39,45 +41,47 @@ langchain-text-splitters==0.2.4 langsmith==0.1.147 marshmallow==3.26.1 mock==5.2.0 -multidict==6.1.0 +multidict==6.6.3 multiprocess==0.70.16 -mypy-extensions==1.0.0 +mypy-extensions==1.1.0 nest-asyncio==1.6.0 numpy==1.26.4 -openai==1.65.3 +openai==1.97.1 opentracing==2.4.0 -orjson==3.10.15 +orjson==3.11.0 packaging==24.2 -pandas==2.2.3 -pluggy==1.5.0 -propcache==0.3.0 -pyarrow==19.0.1 -pydantic==2.10.6 -pydantic-core==2.27.2 +pandas==2.3.1 +pluggy==1.6.0 +propcache==0.3.2 +pyarrow==21.0.0 +pydantic==2.11.7 +pydantic-core==2.33.2 +pygments==2.19.2 pysbd==0.3.4 -pytest==8.3.5 +pytest==8.4.1 pytest-asyncio==0.21.1 -pytest-cov==6.0.0 -pytest-mock==3.14.0 +pytest-cov==6.2.1 +pytest-mock==3.14.1 python-dateutil==2.9.0.post0 -pytz==2025.1 +pytz==2025.2 pyyaml==6.0.2 ragas==0.1.21 regex==2024.11.6 -requests==2.32.3 +requests==2.32.4 requests-toolbelt==1.0.0 six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 -sqlalchemy==2.0.38 +sqlalchemy==2.0.41 tenacity==8.5.0 tiktoken==0.9.0 tqdm==4.67.1 -typing-extensions==4.12.2 +typing-extensions==4.14.1 typing-inspect==0.9.0 -tzdata==2025.1 -urllib3==2.3.0 +typing-inspection==0.4.1 +tzdata==2025.2 +urllib3==2.5.0 vcrpy==7.0.0 wrapt==1.17.2 xxhash==3.5.0 -yarl==1.18.3 +yarl==1.20.1 diff --git a/.riot/requirements/1687eab.txt b/.riot/requirements/46e9996.txt similarity index 50% rename from .riot/requirements/1687eab.txt rename to .riot/requirements/46e9996.txt index 009a5ede488..6c8fc1bdc4c 100644 --- a/.riot/requirements/1687eab.txt +++ b/.riot/requirements/46e9996.txt @@ -2,33 +2,35 @@ # This file is autogenerated by pip-compile with Python 3.11 # by the following command: # -# pip-compile --allow-unsafe --no-annotate --resolver=backtracking .riot/requirements/1687eab.in +# pip-compile --allow-unsafe --no-annotate .riot/requirements/46e9996.in # -aiohappyeyeballs==2.4.8 -aiohttp==3.11.13 -aiosignal==1.3.2 +aiohappyeyeballs==2.6.1 +aiohttp==3.12.14 +aiosignal==1.4.0 annotated-types==0.7.0 -anyio==4.8.0 +anyio==4.9.0 appdirs==1.4.4 -attrs==25.1.0 -certifi==2025.1.31 -charset-normalizer==3.4.1 -coverage[toml]==7.6.12 +attrs==25.3.0 +certifi==2025.7.14 +charset-normalizer==3.4.2 +coverage[toml]==7.9.2 dataclasses-json==0.6.7 -datasets==3.3.2 +datasets==4.0.0 dill==0.3.8 distro==1.9.0 -filelock==3.17.0 -frozenlist==1.5.0 -fsspec[http]==2024.12.0 -h11==0.14.0 -httpcore==1.0.7 +filelock==3.18.0 +frozenlist==1.7.0 +fsspec[http]==2025.3.0 +greenlet==3.2.3 +h11==0.16.0 +hf-xet==1.1.5 +httpcore==1.0.9 httpx==0.28.1 -huggingface-hub==0.29.2 +huggingface-hub==0.33.4 hypothesis==6.45.0 idna==3.10 -iniconfig==2.0.0 -jiter==0.8.2 +iniconfig==2.1.0 +jiter==0.10.0 jsonpatch==1.33 jsonpointer==3.0.0 langchain==0.2.17 @@ -39,45 +41,47 @@ langchain-text-splitters==0.2.4 langsmith==0.1.147 marshmallow==3.26.1 mock==5.2.0 -multidict==6.1.0 +multidict==6.6.3 multiprocess==0.70.16 -mypy-extensions==1.0.0 +mypy-extensions==1.1.0 nest-asyncio==1.6.0 numpy==1.26.4 -openai==1.65.3 +openai==1.97.1 opentracing==2.4.0 -orjson==3.10.15 +orjson==3.11.0 packaging==24.2 -pandas==2.2.3 -pluggy==1.5.0 -propcache==0.3.0 -pyarrow==19.0.1 -pydantic==2.10.6 -pydantic-core==2.27.2 +pandas==2.3.1 +pluggy==1.6.0 +propcache==0.3.2 +pyarrow==21.0.0 +pydantic==2.11.7 +pydantic-core==2.33.2 +pygments==2.19.2 pysbd==0.3.4 -pytest==8.3.5 +pytest==8.4.1 pytest-asyncio==0.21.1 -pytest-cov==6.0.0 -pytest-mock==3.14.0 +pytest-cov==6.2.1 +pytest-mock==3.14.1 python-dateutil==2.9.0.post0 -pytz==2025.1 +pytz==2025.2 pyyaml==6.0.2 ragas==0.1.21 regex==2024.11.6 -requests==2.32.3 +requests==2.32.4 requests-toolbelt==1.0.0 six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 -sqlalchemy==2.0.38 +sqlalchemy==2.0.41 tenacity==8.5.0 tiktoken==0.9.0 tqdm==4.67.1 -typing-extensions==4.12.2 +typing-extensions==4.14.1 typing-inspect==0.9.0 -tzdata==2025.1 -urllib3==2.3.0 +typing-inspection==0.4.1 +tzdata==2025.2 +urllib3==2.5.0 vcrpy==7.0.0 wrapt==1.17.2 xxhash==3.5.0 -yarl==1.18.3 +yarl==1.20.1 diff --git a/.riot/requirements/771848b.txt b/.riot/requirements/5908834.txt similarity index 52% rename from .riot/requirements/771848b.txt rename to .riot/requirements/5908834.txt index cd804c107c2..443cdf7b385 100644 --- a/.riot/requirements/771848b.txt +++ b/.riot/requirements/5908834.txt @@ -2,35 +2,37 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile --allow-unsafe --no-annotate --resolver=backtracking .riot/requirements/771848b.in +# pip-compile --allow-unsafe --no-annotate .riot/requirements/5908834.in # -aiohappyeyeballs==2.4.8 -aiohttp==3.11.13 -aiosignal==1.3.2 +aiohappyeyeballs==2.6.1 +aiohttp==3.12.14 +aiosignal==1.4.0 annotated-types==0.7.0 -anyio==4.8.0 +anyio==4.9.0 appdirs==1.4.4 async-timeout==4.0.3 -attrs==25.1.0 -certifi==2025.1.31 -charset-normalizer==3.4.1 -coverage[toml]==7.6.12 +attrs==25.3.0 +certifi==2025.7.14 +charset-normalizer==3.4.2 +coverage[toml]==7.9.2 dataclasses-json==0.6.7 -datasets==3.3.2 +datasets==4.0.0 dill==0.3.8 distro==1.9.0 -exceptiongroup==1.2.2 -filelock==3.17.0 -frozenlist==1.5.0 -fsspec[http]==2024.12.0 -h11==0.14.0 -httpcore==1.0.7 +exceptiongroup==1.3.0 +filelock==3.18.0 +frozenlist==1.7.0 +fsspec[http]==2025.3.0 +greenlet==3.2.3 +h11==0.16.0 +hf-xet==1.1.5 +httpcore==1.0.9 httpx==0.28.1 -huggingface-hub==0.29.2 +huggingface-hub==0.33.4 hypothesis==6.45.0 idna==3.10 -iniconfig==2.0.0 -jiter==0.8.2 +iniconfig==2.1.0 +jiter==0.10.0 jsonpatch==1.33 jsonpointer==3.0.0 langchain==0.2.17 @@ -41,46 +43,48 @@ langchain-text-splitters==0.2.4 langsmith==0.1.147 marshmallow==3.26.1 mock==5.2.0 -multidict==6.1.0 +multidict==6.6.3 multiprocess==0.70.16 -mypy-extensions==1.0.0 +mypy-extensions==1.1.0 nest-asyncio==1.6.0 numpy==1.26.4 -openai==1.65.3 +openai==1.97.1 opentracing==2.4.0 -orjson==3.10.15 +orjson==3.11.0 packaging==24.2 -pandas==2.2.3 -pluggy==1.5.0 -propcache==0.3.0 -pyarrow==19.0.1 -pydantic==2.10.6 -pydantic-core==2.27.2 +pandas==2.3.1 +pluggy==1.6.0 +propcache==0.3.2 +pyarrow==21.0.0 +pydantic==2.11.7 +pydantic-core==2.33.2 +pygments==2.19.2 pysbd==0.3.4 -pytest==8.3.5 +pytest==8.4.1 pytest-asyncio==0.21.1 -pytest-cov==6.0.0 -pytest-mock==3.14.0 +pytest-cov==6.2.1 +pytest-mock==3.14.1 python-dateutil==2.9.0.post0 -pytz==2025.1 +pytz==2025.2 pyyaml==6.0.2 ragas==0.1.21 regex==2024.11.6 -requests==2.32.3 +requests==2.32.4 requests-toolbelt==1.0.0 six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 -sqlalchemy==2.0.38 +sqlalchemy==2.0.41 tenacity==8.5.0 tiktoken==0.9.0 tomli==2.2.1 tqdm==4.67.1 -typing-extensions==4.12.2 +typing-extensions==4.14.1 typing-inspect==0.9.0 -tzdata==2025.1 +typing-inspection==0.4.1 +tzdata==2025.2 urllib3==1.26.20 vcrpy==7.0.0 wrapt==1.17.2 xxhash==3.5.0 -yarl==1.18.3 +yarl==1.20.1 diff --git a/.riot/requirements/146f2d8.txt b/.riot/requirements/97f1328.txt similarity index 74% rename from .riot/requirements/146f2d8.txt rename to .riot/requirements/97f1328.txt index e552a5620a3..68875c403e1 100644 --- a/.riot/requirements/146f2d8.txt +++ b/.riot/requirements/97f1328.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile --allow-unsafe --no-annotate --resolver=backtracking .riot/requirements/146f2d8.in +# pip-compile --allow-unsafe --no-annotate .riot/requirements/97f1328.in # aiohappyeyeballs==2.4.4 aiohttp==3.10.11 @@ -11,26 +11,28 @@ annotated-types==0.7.0 anyio==4.5.2 appdirs==1.4.4 async-timeout==4.0.3 -attrs==25.1.0 -certifi==2025.1.31 -charset-normalizer==3.4.1 +attrs==25.3.0 +certifi==2025.7.14 +charset-normalizer==3.4.2 coverage[toml]==7.6.1 dataclasses-json==0.6.7 datasets==3.1.0 dill==0.3.8 distro==1.9.0 -exceptiongroup==1.2.2 +exceptiongroup==1.3.0 filelock==3.16.1 frozenlist==1.5.0 fsspec[http]==2024.9.0 -h11==0.14.0 -httpcore==1.0.7 +greenlet==3.1.1 +h11==0.16.0 +hf-xet==1.1.5 +httpcore==1.0.9 httpx==0.28.1 -huggingface-hub==0.29.2 +huggingface-hub==0.33.4 hypothesis==6.45.0 idna==3.10 -iniconfig==2.0.0 -jiter==0.8.2 +iniconfig==2.1.0 +jiter==0.9.1 jsonpatch==1.33 jsonpointer==3.0.0 langchain==0.2.17 @@ -43,10 +45,10 @@ marshmallow==3.22.0 mock==5.2.0 multidict==6.1.0 multiprocess==0.70.16 -mypy-extensions==1.0.0 +mypy-extensions==1.1.0 nest-asyncio==1.6.0 numpy==1.24.4 -openai==1.65.3 +openai==1.97.1 opentracing==2.4.0 orjson==3.10.15 packaging==24.2 @@ -60,25 +62,25 @@ pysbd==0.3.4 pytest==8.3.5 pytest-asyncio==0.21.1 pytest-cov==5.0.0 -pytest-mock==3.14.0 +pytest-mock==3.14.1 python-dateutil==2.9.0.post0 -pytz==2025.1 +pytz==2025.2 pyyaml==6.0.2 ragas==0.1.21 regex==2024.11.6 -requests==2.32.3 +requests==2.32.4 requests-toolbelt==1.0.0 six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 -sqlalchemy==2.0.38 +sqlalchemy==2.0.41 tenacity==8.5.0 tiktoken==0.7.0 tomli==2.2.1 tqdm==4.67.1 -typing-extensions==4.12.2 +typing-extensions==4.13.2 typing-inspect==0.9.0 -tzdata==2025.1 +tzdata==2025.2 urllib3==1.26.20 vcrpy==6.0.2 wrapt==1.17.2 diff --git a/.riot/requirements/12c5529.txt b/.riot/requirements/f37741b.txt similarity index 54% rename from .riot/requirements/12c5529.txt rename to .riot/requirements/f37741b.txt index 40afeea9f40..f258baee359 100644 --- a/.riot/requirements/12c5529.txt +++ b/.riot/requirements/f37741b.txt @@ -2,35 +2,37 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --allow-unsafe --no-annotate .riot/requirements/12c5529.in +# pip-compile --allow-unsafe --no-annotate .riot/requirements/f37741b.in # -aiohappyeyeballs==2.4.8 -aiohttp==3.11.13 -aiosignal==1.3.2 +aiohappyeyeballs==2.6.1 +aiohttp==3.12.14 +aiosignal==1.4.0 annotated-types==0.7.0 -anyio==4.8.0 +anyio==4.9.0 appdirs==1.4.4 async-timeout==4.0.3 -attrs==25.1.0 -certifi==2025.1.31 -charset-normalizer==3.4.1 -coverage[toml]==7.6.12 +attrs==25.3.0 +certifi==2025.7.14 +charset-normalizer==3.4.2 +coverage[toml]==7.9.2 dataclasses-json==0.6.7 -datasets==3.3.2 +datasets==4.0.0 dill==0.3.8 distro==1.9.0 -exceptiongroup==1.2.2 -filelock==3.17.0 -frozenlist==1.5.0 -fsspec[http]==2024.12.0 -h11==0.14.0 -httpcore==1.0.7 +exceptiongroup==1.3.0 +filelock==3.18.0 +frozenlist==1.7.0 +fsspec[http]==2025.3.0 +greenlet==3.2.3 +h11==0.16.0 +hf-xet==1.1.5 +httpcore==1.0.9 httpx==0.28.1 -huggingface-hub==0.29.2 +huggingface-hub==0.33.4 hypothesis==6.45.0 idna==3.10 -iniconfig==2.0.0 -jiter==0.8.2 +iniconfig==2.1.0 +jiter==0.10.0 jsonpatch==1.33 jsonpointer==3.0.0 langchain==0.2.17 @@ -41,46 +43,48 @@ langchain-text-splitters==0.2.4 langsmith==0.1.147 marshmallow==3.26.1 mock==5.2.0 -multidict==6.1.0 +multidict==6.6.3 multiprocess==0.70.16 -mypy-extensions==1.0.0 +mypy-extensions==1.1.0 nest-asyncio==1.6.0 numpy==1.26.4 -openai==1.65.3 +openai==1.97.1 opentracing==2.4.0 -orjson==3.10.15 +orjson==3.11.0 packaging==24.2 -pandas==2.2.3 -pluggy==1.5.0 -propcache==0.3.0 -pyarrow==19.0.1 -pydantic==2.10.6 -pydantic-core==2.27.2 +pandas==2.3.1 +pluggy==1.6.0 +propcache==0.3.2 +pyarrow==21.0.0 +pydantic==2.11.7 +pydantic-core==2.33.2 +pygments==2.19.2 pysbd==0.3.4 -pytest==8.3.5 +pytest==8.4.1 pytest-asyncio==0.21.1 -pytest-cov==6.0.0 -pytest-mock==3.14.0 +pytest-cov==6.2.1 +pytest-mock==3.14.1 python-dateutil==2.9.0.post0 -pytz==2025.1 +pytz==2025.2 pyyaml==6.0.2 ragas==0.1.21 regex==2024.11.6 -requests==2.32.3 +requests==2.32.4 requests-toolbelt==1.0.0 six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 -sqlalchemy==2.0.38 +sqlalchemy==2.0.41 tenacity==8.5.0 tiktoken==0.9.0 tomli==2.2.1 tqdm==4.67.1 -typing-extensions==4.12.2 +typing-extensions==4.14.1 typing-inspect==0.9.0 -tzdata==2025.1 -urllib3==2.3.0 +typing-inspection==0.4.1 +tzdata==2025.2 +urllib3==2.5.0 vcrpy==7.0.0 wrapt==1.17.2 xxhash==3.5.0 -yarl==1.18.3 +yarl==1.20.1 diff --git a/.riot/requirements/4102ef5.txt b/.riot/requirements/ffd66c1.txt similarity index 54% rename from .riot/requirements/4102ef5.txt rename to .riot/requirements/ffd66c1.txt index d0518848787..89c4f02b050 100644 --- a/.riot/requirements/4102ef5.txt +++ b/.riot/requirements/ffd66c1.txt @@ -2,33 +2,35 @@ # This file is autogenerated by pip-compile with Python 3.12 # by the following command: # -# pip-compile --allow-unsafe --no-annotate .riot/requirements/4102ef5.in +# pip-compile --allow-unsafe --no-annotate .riot/requirements/ffd66c1.in # -aiohappyeyeballs==2.4.8 -aiohttp==3.11.13 -aiosignal==1.3.2 +aiohappyeyeballs==2.6.1 +aiohttp==3.12.14 +aiosignal==1.4.0 annotated-types==0.7.0 -anyio==4.8.0 +anyio==4.9.0 appdirs==1.4.4 -attrs==25.1.0 -certifi==2025.1.31 -charset-normalizer==3.4.1 -coverage[toml]==7.6.12 +attrs==25.3.0 +certifi==2025.7.14 +charset-normalizer==3.4.2 +coverage[toml]==7.9.2 dataclasses-json==0.6.7 -datasets==3.3.2 +datasets==4.0.0 dill==0.3.8 distro==1.9.0 -filelock==3.17.0 -frozenlist==1.5.0 -fsspec[http]==2024.12.0 -h11==0.14.0 -httpcore==1.0.7 +filelock==3.18.0 +frozenlist==1.7.0 +fsspec[http]==2025.3.0 +greenlet==3.2.3 +h11==0.16.0 +hf-xet==1.1.5 +httpcore==1.0.9 httpx==0.28.1 -huggingface-hub==0.29.2 +huggingface-hub==0.33.4 hypothesis==6.45.0 idna==3.10 -iniconfig==2.0.0 -jiter==0.8.2 +iniconfig==2.1.0 +jiter==0.10.0 jsonpatch==1.33 jsonpointer==3.0.0 langchain==0.2.17 @@ -39,45 +41,47 @@ langchain-text-splitters==0.2.4 langsmith==0.1.147 marshmallow==3.26.1 mock==5.2.0 -multidict==6.1.0 +multidict==6.6.3 multiprocess==0.70.16 -mypy-extensions==1.0.0 +mypy-extensions==1.1.0 nest-asyncio==1.6.0 numpy==1.26.4 -openai==1.65.3 +openai==1.97.1 opentracing==2.4.0 -orjson==3.10.15 +orjson==3.11.0 packaging==24.2 -pandas==2.2.3 -pluggy==1.5.0 -propcache==0.3.0 -pyarrow==19.0.1 -pydantic==2.10.6 -pydantic-core==2.27.2 +pandas==2.3.1 +pluggy==1.6.0 +propcache==0.3.2 +pyarrow==21.0.0 +pydantic==2.11.7 +pydantic-core==2.33.2 +pygments==2.19.2 pysbd==0.3.4 -pytest==8.3.5 +pytest==8.4.1 pytest-asyncio==0.21.1 -pytest-cov==6.0.0 -pytest-mock==3.14.0 +pytest-cov==6.2.1 +pytest-mock==3.14.1 python-dateutil==2.9.0.post0 -pytz==2025.1 +pytz==2025.2 pyyaml==6.0.2 ragas==0.1.21 regex==2024.11.6 -requests==2.32.3 +requests==2.32.4 requests-toolbelt==1.0.0 six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 -sqlalchemy==2.0.38 +sqlalchemy==2.0.41 tenacity==8.5.0 tiktoken==0.9.0 tqdm==4.67.1 -typing-extensions==4.12.2 +typing-extensions==4.14.1 typing-inspect==0.9.0 -tzdata==2025.1 -urllib3==2.3.0 +typing-inspection==0.4.1 +tzdata==2025.2 +urllib3==2.5.0 vcrpy==7.0.0 wrapt==1.17.2 xxhash==3.5.0 -yarl==1.18.3 +yarl==1.20.1 From f53d1496c3139d7633efe382ab989d1e581d0624 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Wed, 23 Jul 2025 11:38:16 -0400 Subject: [PATCH 17/20] ruff --- ddtrace/llmobs/_llmobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 4043fabbd40..f0f4e246166 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -80,8 +80,8 @@ from ddtrace.llmobs._context import LLMObsContextProvider from ddtrace.llmobs._evaluators.runner import EvaluatorRunner from ddtrace.llmobs._experiment import Dataset -from ddtrace.llmobs._experiment import DatasetRecordInputType from ddtrace.llmobs._experiment import DatasetRecord +from ddtrace.llmobs._experiment import DatasetRecordInputType from ddtrace.llmobs._experiment import Experiment from ddtrace.llmobs._experiment import ExperimentConfigType from ddtrace.llmobs._experiment import JSONType From e9705d9895f9fa2a61c2fdd2e2a17c3ddb62e318 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Wed, 23 Jul 2025 11:47:02 -0400 Subject: [PATCH 18/20] type check --- ddtrace/llmobs/_experiment.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index 3eda6192a61..3f61e617a4b 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -182,29 +182,29 @@ def as_dataframe(self) -> None: column_tuples = set() data_rows = [] for record in self._records: - flat_record = {} + flat_record = {} # type: Dict[Union[str, Tuple[str, str]], Any] input_data = record.get("input_data", {}) if isinstance(input_data, dict): - for k, v in input_data.items(): - flat_record[("input_data", k)] = v - column_tuples.add(("input_data", k)) + for input_data_col, input_data_val in input_data.items(): + flat_record[("input_data", input_data_col)] = input_data_val + column_tuples.add(("input_data", input_data_col)) else: - flat_record[("input_data", "")] = input_data # Use empty string for single input + flat_record[("input_data", "")] = input_data column_tuples.add(("input_data", "")) expected_output = record.get("expected_output", {}) if isinstance(expected_output, dict): - for k, v in expected_output.items(): - flat_record[("expected_output", k)] = v - column_tuples.add(("expected_output", k)) + for expected_output_col, expected_output_val in expected_output.items(): + flat_record[("expected_output", expected_output_col)] = expected_output_val + column_tuples.add(("expected_output", expected_output_col)) else: - flat_record[("expected_output", "")] = expected_output # Use empty string for single output + flat_record[("expected_output", "")] = expected_output column_tuples.add(("expected_output", "")) - for k, v in record.get("metadata", {}).items(): - flat_record[("metadata", k)] = v - column_tuples.add(("metadata", k)) + for metadata_col, metadata_val in record.get("metadata", {}).items(): + flat_record[("metadata", metadata_col)] = metadata_val + column_tuples.add(("metadata", metadata_col)) data_rows.append(flat_record) From 6f625e3975711947547eca99260cf514eaa4893e Mon Sep 17 00:00:00 2001 From: gary-huang Date: Wed, 23 Jul 2025 12:28:14 -0400 Subject: [PATCH 19/20] missing cassette? --- ...9cb46e5ed3_batch_update_post_34c161e3.yaml | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_34c161e3.yaml diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_34c161e3.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_34c161e3.yaml new file mode 100644 index 00000000000..3e4ea62376f --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_34c161e3.yaml @@ -0,0 +1,49 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "id": "8dbba503-cf48-4e82-805e-1d9cb46e5ed3", + "attributes": {"insert_records": [{"input": {"in0": "r0v1", "in1": "r0v2", "in2": + "r0v3"}, "expected_output": {"out0": "r0v4", "out1": "r0v5"}, "metadata": {"m0": + "r0v6"}}, {"input": {"in0": "r1v1", "in1": "r1v2", "in2": "r1v3"}, "expected_output": + {"out0": "r1v4", "out1": "r1v5"}, "metadata": {"m0": "r1v6"}}], "update_records": + [], "delete_records": []}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '434' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/8dbba503-cf48-4e82-805e-1d9cb46e5ed3/batch_update + response: + body: + string: '{"data":[]}' + headers: + content-length: + - '11' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Wed, 23 Jul 2025 16:27:16 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 From 10fea87e8034bd4152a10a3575cd3228ccf67e71 Mon Sep 17 00:00:00 2001 From: Yun Kim Date: Wed, 23 Jul 2025 14:57:49 -0400 Subject: [PATCH 20/20] Revert earlier change removing config from tasks --- ddtrace/llmobs/_experiment.py | 4 ++-- ddtrace/llmobs/_llmobs.py | 8 ++++---- tests/llmobs/test_experiments.py | 12 +++++++++--- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index 078b01145ae..996af3cd00d 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -226,7 +226,7 @@ class Experiment: def __init__( self, name: str, - task: Callable[[DatasetRecordInputType], JSONType], + task: Callable[[DatasetRecordInputType, Optional[ExperimentConfigType]], JSONType], dataset: Dataset, evaluators: List[Callable[[DatasetRecordInputType, JSONType, JSONType], JSONType]], project_name: str, @@ -318,7 +318,7 @@ def _process_record(self, idx_record: Tuple[int, DatasetRecord]) -> Optional[Tas } output_data = None try: - output_data = self._task(input_data) + output_data = self._task(input_data, self._config) except Exception: span.set_exc_info(*sys.exc_info()) self._llmobs_instance.annotate(span, input_data=input_data, output_data=output_data, tags=tags) diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 1e4cfa3a1f2..74fdd935944 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -672,7 +672,7 @@ def _delete_dataset(cls, dataset_id: str) -> None: def experiment( cls, name: str, - task: Callable[[DatasetRecordInputType], JSONType], + task: Callable[[DatasetRecordInputType, Optional[ExperimentConfigType]], JSONType], dataset: Dataset, evaluators: List[Callable[[DatasetRecordInputType, JSONType, JSONType], JSONType]], description: str = "", @@ -682,7 +682,7 @@ def experiment( """Initializes an Experiment to run a task on a Dataset and evaluators. :param name: The name of the experiment. - :param task: The task function to run. Must accept a parameter ``input_data`` and optionally ``config``. + :param task: The task function to run. Must accept parameters ``input_data`` and ``config``. :param dataset: The dataset to run the experiment on, created with LLMObs.pull/create_dataset(). :param evaluators: A list of evaluator functions to evaluate the task output. Must accept parameters ``input_data``, ``output_data``, and ``expected_output``. @@ -694,8 +694,8 @@ def experiment( raise TypeError("task must be a callable function.") sig = inspect.signature(task) params = sig.parameters - if "input_data" not in params: - raise TypeError("Task function must accept 'input_data' parameters.") + if "input_data" not in params or "config" not in params: + raise TypeError("Task function must have 'input_data' and 'config' parameters.") if not isinstance(dataset, Dataset): raise TypeError("Dataset must be an LLMObs Dataset object.") if not evaluators or not all(callable(evaluator) for evaluator in evaluators): diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index 163e50a27fe..57409feae5e 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -28,11 +28,11 @@ def wait_for_backend(): time.sleep(2) -def dummy_task(input_data): +def dummy_task(input_data, config): return input_data -def faulty_task(input_data): +def faulty_task(input_data, config): raise ValueError("This is a test error") @@ -389,12 +389,18 @@ def test_experiment_invalid_task_type_raises(llmobs, test_dataset_one_record): def test_experiment_invalid_task_signature_raises(llmobs, test_dataset_one_record): - with pytest.raises(TypeError, match="Task function must accept 'input_data' parameters."): + with pytest.raises(TypeError, match="Task function must have 'input_data' and 'config' parameters."): def my_task(not_input): pass llmobs.experiment("test_experiment", my_task, test_dataset_one_record, [dummy_evaluator]) + with pytest.raises(TypeError, match="Task function must have 'input_data' and 'config' parameters."): + + def my_task(input_data, not_config): + pass + + llmobs.experiment("test_experiment", my_task, test_dataset_one_record, [dummy_evaluator]) def test_experiment_invalid_dataset_raises(llmobs):