From 5848fd8be3847f760afd4eb194f5366d842ecfef Mon Sep 17 00:00:00 2001
From: Arjun Dinesh Jagdale <142811259+ArjunJagdale@users.noreply.github.com>
Date: Fri, 27 Jun 2025 10:51:14 +0530
Subject: [PATCH 1/3] Fix misleading add_column() usage example in docstring

This PR fixes the usage example in the Dataset.add_column() docstring, which previously implied that add_column() modifies the dataset in-place.

Why:
The method returns a new dataset with the additional column, and users must assign the result to a variable to preserve the change.

Fixes #7611
---
 src/datasets/arrow_dataset.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index e499fb6f9ba..be1ead56a30 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -6005,7 +6005,8 @@ def add_column(
         >>> from datasets import load_dataset
         >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
         >>> more_text = ds["text"]
-        >>> ds.add_column(name="text_2", column=more_text)
+        >>> ds = ds.add_column(name="text_2", column=more_text)
+        >>> ds
         Dataset({
             features: ['text', 'label', 'text_2'],
             num_rows: 1066

From 5927192b3710cd33ed73097bdfaf7b98a75af318 Mon Sep 17 00:00:00 2001
From: Arjun Dinesh Jagdale <142811259+ArjunJagdale@users.noreply.github.com>
Date: Mon, 7 Jul 2025 23:40:34 +0530
Subject: [PATCH 2/3] Fix misleading docstring examples for select_columns,
 select, filter, shard, and flatten
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix misleading docstring examples for select_columns, select, filter, shard, and flatten

- Updated usage examples to show correct behavior (methods return new datasets)
- Added inline comments to clarify that methods do not modify in-place
- Follows up on issue #7611 and addresses @lhoestq’s review on PR #7648
---
 src/datasets/arrow_dataset.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index be1ead56a30..637b4c904ec 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -2041,7 +2041,9 @@ def flatten(self, new_fingerprint: Optional[str] = None, max_depth=16) -> "Datas
          'question': Value('string'),
          'answers': {'text': List(Value('string')),
          'answer_start': List(Value('int32'))}}
-        >>> ds.flatten()
+        >>> # Note: this method returns a new dataset and does not modify in-place
+        >>> ds = ds.flatten()
+        >>> ds
         Dataset({
             features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],
             num_rows: 87599
@@ -2399,7 +2401,9 @@ def select_columns(self, column_names: Union[str, list[str]], new_fingerprint: O
         ```py
         >>> from datasets import load_dataset
         >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
-        >>> ds.select_columns(['text'])
+        >>> # Note: this method returns a new dataset and does not modify in-place
+        >>> ds = ds.select_columns(['text'])
+        >>> ds
         Dataset({
             features: ['text'],
             num_rows: 1066
@@ -3869,12 +3873,15 @@ def filter(
         ```py
         >>> from datasets import load_dataset
         >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
-        >>> ds.filter(lambda x: x["label"] == 1)
+        >>> # Note: this method returns a new dataset and does not modify in-place
+        >>> ds = ds.filter(lambda x: x["label"] == 1)
+        >>> ds
         Dataset({
             features: ['text', 'label'],
             num_rows: 533
         })
         ```
+
         """
         if len(self.list_indexes()) > 0:
             raise DatasetTransformationNotAllowedError(
@@ -4041,7 +4048,9 @@ def select(
         ```py
         >>> from datasets import load_dataset
         >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
-        >>> ds.select(range(4))
+        >>> # Note: this method returns a new dataset and does not modify in-place
+        >>> ds = ds.select(range(4))
+        >>> ds
         Dataset({
             features: ['text', 'label'],
             num_rows: 4
@@ -4936,7 +4945,9 @@ def shard(
             features: ['text', 'label'],
             num_rows: 1066
         })
-        >>> ds.shard(num_shards=2, index=0)
+        >>> # Note: this method returns a new dataset and does not modify in-place
+        >>> ds = ds.shard(num_shards=2, index=0)
+        >>> ds
         Dataset({
             features: ['text', 'label'],
             num_rows: 533

From 60fd9566cc587d18112d1526f85378e2ca48f47d Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
Date: Thu, 17 Jul 2025 15:14:05 +0200
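Subject: [PATCH 3/3] Apply suggestions from code review

Drop the "Note: this method returns a new dataset and does not modify
in-place" comment lines from the flatten, select_columns, filter, select,
and shard docstring examples, keeping the `ds = ds.<method>(...)`
reassignment in each example.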
---
 src/datasets/arrow_dataset.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 637b4c904ec..81dbe7634b8 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -2041,7 +2041,6 @@ def flatten(self, new_fingerprint: Optional[str] = None, max_depth=16) -> "Datas
          'question': Value('string'),
          'answers': {'text': List(Value('string')),
          'answer_start': List(Value('int32'))}}
-        >>> # Note: this method returns a new dataset and does not modify in-place
         >>> ds = ds.flatten()
         >>> ds
         Dataset({
@@ -2401,7 +2400,6 @@ def select_columns(self, column_names: Union[str, list[str]], new_fingerprint: O
         ```py
         >>> from datasets import load_dataset
         >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
-        >>> # Note: this method returns a new dataset and does not modify in-place
         >>> ds = ds.select_columns(['text'])
         >>> ds
         Dataset({
@@ -3873,7 +3871,6 @@ def filter(
         ```py
         >>> from datasets import load_dataset
         >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
-        >>> # Note: this method returns a new dataset and does not modify in-place
         >>> ds = ds.filter(lambda x: x["label"] == 1)
         >>> ds
         Dataset({
@@ -4048,7 +4045,6 @@ def select(
         ```py
         >>> from datasets import load_dataset
         >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
-        >>> # Note: this method returns a new dataset and does not modify in-place
         >>> ds = ds.select(range(4))
         >>> ds
         Dataset({
@@ -4945,7 +4941,6 @@ def shard(
             features: ['text', 'label'],
             num_rows: 1066
         })
-        >>> # Note: this method returns a new dataset and does not modify in-place
         >>> ds = ds.shard(num_shards=2, index=0)
         >>> ds
         Dataset({