
Commit 1dfcd08 (parent: ef71346)

refactor(data): Some refactorings and additional tests for packed data filtering.

3 files changed (+46, -13 lines)

src/modalities/dataloader/create_packed_data.py

Lines changed: 3 additions & 3 deletions
@@ -258,7 +258,7 @@ def _write_batch(
         # write index
         f.write(pickle.dumps(index_list))

-        _update_data_length_in_pre_allocated_header(dst_path, index_list)
+        update_data_length_in_pre_allocated_header(dst_path, index_list)

     return writer

@@ -324,10 +324,10 @@ def _process_line(self, line: str, process_id: int) -> bytes:
         return token_byte_string


-def _update_data_length_in_pre_allocated_header(dst_path: Path, index_list: list[tuple[int, int]]):
+def update_data_length_in_pre_allocated_header(dst_path: Path, index_list: list[tuple[int, int]]):
     # Update the length of the data section in the pre-allocated header of the destination file.
     # The data segment length is sum of the starting position and the length of the last document.
-    length_of_byte_encoded_data_section = index_list[-1][0] + index_list[-1][1]
+    length_of_byte_encoded_data_section = index_list[-1][0] + index_list[-1][1] if len(index_list) > 0 else 0
     data_section_length_in_bytes = length_of_byte_encoded_data_section.to_bytes(
         EmbeddedStreamData.DATA_SECTION_LENGTH_IN_BYTES, byteorder="little"
     )
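Worth noting on the new guard: `index_list[-1]` raises an IndexError on an empty index, while a zero-length data section still encodes cleanly. A minimal sketch of the header arithmetic, assuming an 8-byte field width and a hypothetical helper name in place of the library's internals:

# Minimal sketch; the 8-byte width and the helper name are illustrative
# stand-ins for EmbeddedStreamData.DATA_SECTION_LENGTH_IN_BYTES.
DATA_SECTION_LENGTH_IN_BYTES = 8

def encode_data_section_length(index_list: list[tuple[int, int]]) -> bytes:
    # Each index entry is (start_offset, length_in_bytes); the data section
    # ends where the last document ends. An empty index encodes as zero.
    length = index_list[-1][0] + index_list[-1][1] if len(index_list) > 0 else 0
    return length.to_bytes(DATA_SECTION_LENGTH_IN_BYTES, byteorder="little")

assert encode_data_section_length([]) == bytes(8)
assert encode_data_section_length([(0, 10), (10, 5)]) == (15).to_bytes(8, "little")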

src/modalities/dataloader/filter_packed_data.py

Lines changed: 5 additions & 4 deletions
@@ -6,13 +6,13 @@
 from numpy.typing import NDArray
 from tqdm import tqdm

-from modalities.dataloader.create_packed_data import EmbeddedStreamData, _update_data_length_in_pre_allocated_header
+from modalities.dataloader.create_packed_data import EmbeddedStreamData, update_data_length_in_pre_allocated_header
 from modalities.dataloader.dataset import PackedMemMapDatasetBase


 def filter_dataset(
-    dst_path: Path,
     src_path: Path,
+    dst_path: Path,
     filter_func: Callable[[tuple[int, dict[str, NDArray[np.int_]]]], bool],
     sample_key: str = "input_ids",
 ) -> None:
@@ -41,6 +41,7 @@ def filter_dataset(
     # When we load the file, we add the header size to the offset
     curr_offset = 0

+    # Provide sample and its index (via enumerate) to the filter function.
     for _, entry in filter(filter_func, enumerate(tqdm(source_data, desc="Filtering samples"))):
         tokens: NDArray[np.int_] = entry[sample_key].astype(tok_type)
         tokens = tokens.astype(tokens.dtype.newbyteorder("<"))
@@ -49,7 +50,7 @@ def filter_dataset(
         segment_length = len(tokens_as_bytes)
         index_list.append((curr_offset, segment_length))
         curr_offset += segment_length
-    # write index
+    # Write index at end of the file.
     f_out.write(pickle.dumps(index_list))

-    _update_data_length_in_pre_allocated_header(dst_path, index_list)
+    update_data_length_in_pre_allocated_header(dst_path, index_list)
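With the argument reorder, callers now pass the source before the destination, matching the data flow. A usage sketch, with illustrative paths and a hypothetical length-based filter function:

from pathlib import Path

from modalities.dataloader.filter_packed_data import filter_dataset

def accept_long_documents(idx_content) -> bool:
    # The filter receives (index, sample_dict), matching the enumerate() output.
    _, entry = idx_content
    return len(entry["input_ids"]) >= 16

filter_dataset(
    src_path=Path("data/train.pbin"),       # illustrative input path
    dst_path=Path("data/train_long.pbin"),  # illustrative output path
    filter_func=accept_long_documents,
    sample_key="input_ids",
)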
Lines changed: 38 additions & 6 deletions
@@ -1,39 +1,65 @@
+import hashlib
 from pathlib import Path

 import numpy as np
+import pytest
 from numpy.typing import NDArray

 from modalities.dataloader.dataset import PackedMemMapDatasetBase
 from modalities.dataloader.filter_packed_data import filter_dataset


-def test_creates_output_file(tmp_path: Path, dummy_packed_data_path: Path):
+def test_creates_output_file(tmp_path: Path, packed_data_paths: Path):
     output_path = Path(tmp_path, "output.pbin")
     filter_dataset(
-        dst_path=output_path, src_path=dummy_packed_data_path, filter_func=accept_even_indices, sample_key="input_ids"
+        src_path=packed_data_paths, dst_path=output_path, filter_func=accept_even_indices, sample_key="input_ids"
     )
     assert output_path.exists()


-def test_filtered_data_has_expected_length(tmp_path: Path, dummy_packed_data_path: Path):
+def test_filtered_data_has_expected_length(tmp_path: Path, packed_data_paths: Path):
     output_path = Path(tmp_path, "output.pbin")
     filter_dataset(
-        dst_path=output_path, src_path=dummy_packed_data_path, filter_func=accept_even_indices, sample_key="input_ids"
+        src_path=packed_data_paths, dst_path=output_path, filter_func=accept_even_indices, sample_key="input_ids"
     )
+    original_data = PackedMemMapDatasetBase(packed_data_paths, sample_key="input_ids")
     filtered_data = PackedMemMapDatasetBase(output_path, sample_key="input_ids")
-    assert len(filtered_data) == 2
+    assert (
+        len(filtered_data) == len(original_data) // 2 + len(original_data) % 2
+    ), "Filtered data length should be half of the original data length (rounded up)."


 def test_filtered_data_has_expected_content(tmp_path: Path, dummy_packed_data_path: Path):
     output_path = Path(tmp_path, "output.pbin")
     filter_dataset(
-        dst_path=output_path, src_path=dummy_packed_data_path, filter_func=accept_even_indices, sample_key="input_ids"
+        src_path=dummy_packed_data_path, dst_path=output_path, filter_func=accept_even_indices, sample_key="input_ids"
     )
     filtered_data = PackedMemMapDatasetBase(output_path, sample_key="input_ids")
     assert filtered_data[0]["input_ids"].tolist() == list(range(24 // 4))
     assert filtered_data[1]["input_ids"].tolist() == list(range(64 // 4, (64 + 12) // 4))


+def test_always_true_filtered_data_has_identical_file_hash(tmp_path: Path, packed_data_paths: Path):
+    output_path = Path(tmp_path, "output.pbin")
+    filter_dataset(src_path=packed_data_paths, dst_path=output_path, filter_func=lambda x: True, sample_key="input_ids")
+    with open(packed_data_paths, "rb") as f_in, open(output_path, "rb") as f_out:
+        original_hash = hashlib.sha256(f_in.read()).hexdigest()
+        filtered_hash = hashlib.sha256(f_out.read()).hexdigest()
+    assert (
+        original_hash == filtered_hash
+    ), "Filtered data should have the same hash as the original data when no filtering is applied."
+
+
+def test_always_false_filtered_data_produces_valid_file(tmp_path: Path, packed_data_paths: Path):
+    output_path = Path(tmp_path, "output.pbin")
+    filter_dataset(
+        src_path=packed_data_paths, dst_path=output_path, filter_func=lambda x: False, sample_key="input_ids"
+    )
+    filtered_data = PackedMemMapDatasetBase(output_path, sample_key="input_ids")
+    assert len(filtered_data) == 0, "Filtered data should be empty when all samples are filtered out."
+    assert output_path.stat().st_size > 0, "Output file should not be empty even if no samples are included."
+
+
 def accept_even_indices(idx_content: tuple[int, dict[str, NDArray[np.int_]]]) -> bool:
     """
     Filter function that accepts only even indices.
@@ -45,3 +71,9 @@ def accept_even_indices(idx_content: tuple[int, dict[str, NDArray[np.int_]]]) ->
     """
     idx, _ = idx_content
     return idx % 2 == 0
+
+
+@pytest.fixture(params=[0, 1])
+def packed_data_paths(dummy_packed_data_path: Path, request: pytest.FixtureRequest) -> Path:
+    path_options = [dummy_packed_data_path, Path("tests", "data", "datasets", "lorem_ipsum_long.pbin")]
+    return path_options[request.param]
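A quick check on the reworked length assertion above: accepting only even indices keeps ceil(n / 2) of n samples, which is exactly n // 2 + n % 2:

# Even indices 0, 2, 4, ... of n samples leave ceil(n / 2) survivors.
for n in range(10):
    kept = sum(1 for i in range(n) if i % 2 == 0)
    assert kept == n // 2 + n % 2 == -(-n // 2)  # three ways to write ceil(n / 2)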
