Commit 5ae2833

refactor(data): Minor refactorings to address PR comments.
1 parent: 1dfcd08

File tree: 2 files changed (+20 lines, −15 lines)


src/modalities/dataloader/create_packed_data.py

Lines changed: 8 additions & 1 deletion
@@ -327,7 +327,14 @@ def _process_line(self, line: str, process_id: int) -> bytes:
 def update_data_length_in_pre_allocated_header(dst_path: Path, index_list: list[tuple[int, int]]):
     # Update the length of the data section in the pre-allocated header of the destination file.
     # The data segment length is sum of the starting position and the length of the last document.
-    length_of_byte_encoded_data_section = index_list[-1][0] + index_list[-1][1] if len(index_list) > 0 else 0
+    if len(index_list) > 0:
+        length_of_byte_encoded_data_section = index_list[-1][0] + index_list[-1][1]
+    else:
+        length_of_byte_encoded_data_section = 0
+        logger.warning(
+            f'No data was written to the file "{dst_path}". '
+            "This can happen if the input file is empty or all samples were filtered out."
+        )
     data_section_length_in_bytes = length_of_byte_encoded_data_section.to_bytes(
         EmbeddedStreamData.DATA_SECTION_LENGTH_IN_BYTES, byteorder="little"
     )
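The refactoring above replaces the one-line conditional with an explicit if/else so that an empty index_list additionally logs a warning. For context, here is a minimal sketch of writing such a fixed-width, little-endian length field into a pre-allocated header; the offset and width constants (HEADER_FIELD_OFFSET, HEADER_FIELD_WIDTH) are hypothetical stand-ins for the project's header layout and EmbeddedStreamData.DATA_SECTION_LENGTH_IN_BYTES, so this is not the actual implementation.

import logging
from pathlib import Path

logger = logging.getLogger(__name__)

HEADER_FIELD_OFFSET = 0  # hypothetical byte offset of the length field in the header
HEADER_FIELD_WIDTH = 8   # hypothetical width, standing in for DATA_SECTION_LENGTH_IN_BYTES


def write_data_section_length(dst_path: Path, index_list: list[tuple[int, int]]) -> None:
    # The data section ends where the last document ends: start offset + length.
    if len(index_list) > 0:
        length = index_list[-1][0] + index_list[-1][1]
    else:
        length = 0
        logger.warning(f'No data was written to the file "{dst_path}".')
    # Overwrite the pre-allocated header slot in place with the little-endian length.
    with dst_path.open("r+b") as f:
        f.seek(HEADER_FIELD_OFFSET)
        f.write(length.to_bytes(HEADER_FIELD_WIDTH, byteorder="little"))


def read_data_section_length(dst_path: Path) -> int:
    # Round-trip check: read the length field back from the header.
    with dst_path.open("rb") as f:
        f.seek(HEADER_FIELD_OFFSET)
        return int.from_bytes(f.read(HEADER_FIELD_WIDTH), byteorder="little")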

tests/dataloader/test_filter_packed_data.py

Lines changed: 12 additions & 14 deletions
@@ -9,20 +9,20 @@
 from modalities.dataloader.filter_packed_data import filter_dataset


-def test_creates_output_file(tmp_path: Path, packed_data_paths: Path):
+def test_creates_output_file(tmp_path: Path, packed_data_path: Path):
     output_path = Path(tmp_path, "output.pbin")
     filter_dataset(
-        src_path=packed_data_paths, dst_path=output_path, filter_func=accept_even_indices, sample_key="input_ids"
+        src_path=packed_data_path, dst_path=output_path, filter_func=accept_even_indices, sample_key="input_ids"
     )
     assert output_path.exists()


-def test_filtered_data_has_expected_length(tmp_path: Path, packed_data_paths: Path):
+def test_filtered_data_has_expected_length(tmp_path: Path, packed_data_path: Path):
     output_path = Path(tmp_path, "output.pbin")
     filter_dataset(
-        src_path=packed_data_paths, dst_path=output_path, filter_func=accept_even_indices, sample_key="input_ids"
+        src_path=packed_data_path, dst_path=output_path, filter_func=accept_even_indices, sample_key="input_ids"
     )
-    original_data = PackedMemMapDatasetBase(packed_data_paths, sample_key="input_ids")
+    original_data = PackedMemMapDatasetBase(packed_data_path, sample_key="input_ids")
     filtered_data = PackedMemMapDatasetBase(output_path, sample_key="input_ids")
     assert (
         len(filtered_data) == len(original_data) // 2 + len(original_data) % 2
@@ -39,22 +39,20 @@ def test_filtered_data_has_expected_content(tmp_path: Path, dummy_packed_data_pa
     assert filtered_data[1]["input_ids"].tolist() == list(range(64 // 4, (64 + 12) // 4))


-def test_always_true_filtered_data_has_identical_file_hash(tmp_path: Path, packed_data_paths: Path):
+def test_always_true_filtered_data_has_identical_file_hash(tmp_path: Path, packed_data_path: Path):
     output_path = Path(tmp_path, "output.pbin")
-    filter_dataset(src_path=packed_data_paths, dst_path=output_path, filter_func=lambda x: True, sample_key="input_ids")
-    with open(packed_data_paths, "rb") as f_in, open(output_path, "rb") as f_out:
+    filter_dataset(src_path=packed_data_path, dst_path=output_path, filter_func=lambda x: True, sample_key="input_ids")
+    with open(packed_data_path, "rb") as f_in, open(output_path, "rb") as f_out:
         original_hash = hashlib.sha256(f_in.read()).hexdigest()
         filtered_hash = hashlib.sha256(f_out.read()).hexdigest()
     assert (
         original_hash == filtered_hash
     ), "Filtered data should have the same hash as the original data when no filtering is applied."


-def test_always_false_filtered_data_produces_valid_file(tmp_path: Path, packed_data_paths: Path):
+def test_always_false_filtered_data_produces_valid_file(tmp_path: Path, packed_data_path: Path):
     output_path = Path(tmp_path, "output.pbin")
-    filter_dataset(
-        src_path=packed_data_paths, dst_path=output_path, filter_func=lambda x: False, sample_key="input_ids"
-    )
+    filter_dataset(src_path=packed_data_path, dst_path=output_path, filter_func=lambda x: False, sample_key="input_ids")
     filtered_data = PackedMemMapDatasetBase(output_path, sample_key="input_ids")
     assert len(filtered_data) == 0, "Filtered data should be empty when all samples are filtered out."
     assert output_path.stat().st_size > 0, "Output file should not be empty even if no samples are included."
@@ -74,6 +72,6 @@ def accept_even_indices(idx_content: tuple[int, dict[str, NDArray[np.int_]]]) ->


 @pytest.fixture(params=[0, 1])
-def packed_data_paths(dummy_packed_data_path: Path, request: pytest.FixtureRequest) -> Path:
-    path_options = [dummy_packed_data_path, Path("tests", "data", "datasets", "lorem_ipsum_long.pbin")]
+def packed_data_path(dummy_packed_data_path: Path, request: pytest.FixtureRequest) -> Path:
+    path_options = [dummy_packed_data_path, Path("tests/data/datasets/lorem_ipsum_long.pbin")]
     return path_options[request.param]
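The last hunk header above shows the signature of accept_even_indices; its body is not touched by this commit. A plausible sketch, assuming the filter receives an (index, sample) tuple and keeps samples at even positions, which is consistent with the expected length len(original_data) // 2 + len(original_data) % 2 asserted in the tests:

import numpy as np
from numpy.typing import NDArray


def accept_even_indices(idx_content: tuple[int, dict[str, NDArray[np.int_]]]) -> bool:
    # Keep every sample whose dataset position is even (0, 2, 4, ...).
    idx, _content = idx_content
    return idx % 2 == 0

Passed as filter_func to filter_dataset, such a predicate keeps indices 0, 2, 4, ..., so the filtered dataset ends up with ceil(len/2) samples.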
