From 5848fd8be3847f760afd4eb194f5366d842ecfef Mon Sep 17 00:00:00 2001 From: Arjun Dinesh Jagdale <142811259+ArjunJagdale@users.noreply.github.com> Date: Fri, 27 Jun 2025 10:51:14 +0530 Subject: [PATCH 1/3] Fix misleading add_column() usage example in docstring This PR fixes the usage example in the Dataset.add_column() docstring, which previously implied that add_column() modifies the dataset in-place. Why: The method returns a new dataset with the additional column, and users must assign the result to a variable to preserve the change. Fixes #7611 --- src/datasets/arrow_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index e499fb6f9ba..be1ead56a30 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -6005,7 +6005,8 @@ def add_column( >>> from datasets import load_dataset >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") >>> more_text = ds["text"] - >>> ds.add_column(name="text_2", column=more_text) + >>> ds = ds.add_column(name="text_2", column=more_text) + >>> ds Dataset({ features: ['text', 'label', 'text_2'], num_rows: 1066 From 5927192b3710cd33ed73097bdfaf7b98a75af318 Mon Sep 17 00:00:00 2001 From: Arjun Dinesh Jagdale <142811259+ArjunJagdale@users.noreply.github.com> Date: Mon, 7 Jul 2025 23:40:34 +0530 Subject: [PATCH 2/3] Fix misleading docstring examples for select_columns, select, filter, shard, and flatten MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix misleading docstring examples for select_columns, select, filter, shard, and flatten - Updated usage examples to show correct behavior (methods return new datasets) - Added inline comments to clarify that methods do not modify in-place - Fixes follow-up from issue #7611 and @lhoestq’s review on PR #7648 --- src/datasets/arrow_dataset.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index be1ead56a30..637b4c904ec 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -2041,7 +2041,9 @@ def flatten(self, new_fingerprint: Optional[str] = None, max_depth=16) -> "Datas 'question': Value('string'), 'answers': {'text': List(Value('string')), 'answer_start': List(Value('int32'))}} - >>> ds.flatten() + >>> # Note: this method returns a new dataset and does not modify in-place + >>> ds = ds.flatten() + >>> ds Dataset({ features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'], num_rows: 87599 @@ -2399,7 +2401,9 @@ def select_columns(self, column_names: Union[str, list[str]], new_fingerprint: O ```py >>> from datasets import load_dataset >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") - >>> ds.select_columns(['text']) + >>> # Note: this method returns a new dataset and does not modify in-place + >>> ds = ds.select_columns(['text']) + >>> ds Dataset({ features: ['text'], num_rows: 1066 @@ -3869,12 +3873,15 @@ def filter( ```py >>> from datasets import load_dataset >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") - >>> ds.filter(lambda x: x["label"] == 1) + >>> # Note: this method returns a new dataset and does not modify in-place + >>> ds = ds.filter(lambda x: x["label"] == 1) + >>> ds Dataset({ features: ['text', 'label'], num_rows: 533 }) ``` + """ if len(self.list_indexes()) > 0: raise DatasetTransformationNotAllowedError( @@ -4041,7 +4048,9 @@ def select( ```py >>> from datasets import load_dataset >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") - >>> ds.select(range(4)) + >>> # Note: this method returns a new dataset and does not modify in-place + >>> ds = ds.select(range(4)) + >>> ds Dataset({ features: ['text', 'label'], num_rows: 4 @@ -4936,7 +4945,9 @@ def shard( features: ['text', 'label'], num_rows: 1066 }) - >>> ds.shard(num_shards=2, index=0) + >>> # Note: this method returns a new dataset and does not modify in-place + >>> ds = ds.shard(num_shards=2, index=0) + >>> ds Dataset({ features: ['text', 'label'], num_rows: 533 From 60fd9566cc587d18112d1526f85378e2ca48f47d Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 17 Jul 2025 15:14:05 +0200 Subject: [PATCH 3/3] Apply suggestions from code review --- src/datasets/arrow_dataset.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 637b4c904ec..81dbe7634b8 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -2041,7 +2041,6 @@ def flatten(self, new_fingerprint: Optional[str] = None, max_depth=16) -> "Datas 'question': Value('string'), 'answers': {'text': List(Value('string')), 'answer_start': List(Value('int32'))}} - >>> # Note: this method returns a new dataset and does not modify in-place >>> ds = ds.flatten() >>> ds Dataset({ @@ -2401,7 +2400,6 @@ def select_columns(self, column_names: Union[str, list[str]], new_fingerprint: O ```py >>> from datasets import load_dataset >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") - >>> # Note: this method returns a new dataset and does not modify in-place >>> ds = ds.select_columns(['text']) >>> ds Dataset({ @@ -3873,7 +3871,6 @@ def filter( ```py >>> from datasets import load_dataset >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") - >>> # Note: this method returns a new dataset and does not modify in-place >>> ds = ds.filter(lambda x: x["label"] == 1) >>> ds Dataset({ @@ -4048,7 +4045,6 @@ def select( ```py >>> from datasets import load_dataset >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") - >>> # Note: this method returns a new dataset and does not modify in-place >>> ds = ds.select(range(4)) >>> ds Dataset({ @@ -4945,7 +4941,6 @@ def shard( features: ['text', 'label'], num_rows: 1066 }) - >>> # Note: this method returns a new dataset and does not modify in-place >>> ds = ds.shard(num_shards=2, index=0) >>> ds Dataset({