Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 24 additions & 17 deletions dataset/build_arc_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,11 +146,8 @@ def _map_grid(grid: np.ndarray):


def load_puzzles_arcagi(results: dict, dataset_path: str, config: DataProcessConfig):
train_examples_dest = ("train", "all")
test_examples_map = {
"evaluation": [(1.0, ("test", "all"))],
"_default": [(1.0, ("train", "all"))]
}
# Changed: Split puzzles completely, not examples within puzzles
test_puzzle_fraction = 0.2 # Reserve 20% of puzzles for testing

total_puzzles = 0
for subdir in os.scandir(dataset_path):
Expand All @@ -164,19 +161,29 @@ def load_puzzles_arcagi(results: dict, dataset_path: str, config: DataProcessCon
# Shuffle puzzles
np.random.shuffle(puzzles)

# Assign by fraction
for idx, (default_name, puzzle) in enumerate(puzzles):
fraction = idx / len(puzzles)
test_examples_dest = None
for f, dest in test_examples_map.get(subdir.name, test_examples_map["_default"]):
if fraction < f:
test_examples_dest = dest
break

assert test_examples_dest is not None
# Split puzzles at puzzle level to avoid data leakage
if subdir.name == "evaluation":
# For evaluation set, reserve some puzzles completely for testing
num_test_puzzles = int(len(puzzles) * test_puzzle_fraction)
train_puzzles = puzzles[num_test_puzzles:]
test_puzzles = puzzles[:num_test_puzzles]

# Process train puzzles - both train and test examples go to training
for default_name, puzzle in train_puzzles:
convert_single_arc_puzzle(results, default_name, puzzle, config.num_aug,
{"train": ("train", "all"), "test": ("train", "all")})

# Process test puzzles - both train and test examples go to testing
for default_name, puzzle in test_puzzles:
convert_single_arc_puzzle(results, default_name, puzzle, 0, # No augmentation for test
{"train": ("test", "all"), "test": ("test", "all")})
else:
# For other directories, all puzzles go to training
for default_name, puzzle in puzzles:
convert_single_arc_puzzle(results, default_name, puzzle, config.num_aug,
{"train": ("train", "all"), "test": ("train", "all")})

convert_single_arc_puzzle(results, default_name, puzzle, config.num_aug, {"train": train_examples_dest, "test": test_examples_dest})
total_puzzles += 1
total_puzzles += len(puzzles)

print (f"[{dataset_path}] total puzzles: {total_puzzles}")

Expand Down
24 changes: 24 additions & 0 deletions dataset/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,27 @@ def dihedral_transform(arr: np.ndarray, tid: int) -> np.ndarray:

def inverse_dihedral_transform(arr: np.ndarray, tid: int) -> np.ndarray:
    """Undo ``dihedral_transform``: apply the inverse of transform *tid* to *arr*.

    Looks up the inverse transform id in ``DIHEDRAL_INVERSE`` and delegates to
    ``dihedral_transform``, so ``inverse_dihedral_transform(dihedral_transform(a, t), t)``
    recovers ``a``.
    """
    return dihedral_transform(arr, DIHEDRAL_INVERSE[tid])


def split_puzzles_by_id(puzzle_ids: list[str], test_fraction: float = 0.2, seed: int = 42) -> tuple[set[str], set[str]]:
    """Split puzzle IDs into disjoint train and test sets to avoid data leakage.

    Uses a private ``random.Random(seed)`` instance instead of seeding the
    module-level generator: the resulting shuffle is byte-for-byte identical
    (same Mersenne Twister, same seed), but the global random state — which
    other dataset-building code may depend on — is left untouched.

    Args:
        puzzle_ids: Sequence of puzzle identifiers (not mutated).
        test_fraction: Fraction of puzzles to reserve for testing; the test
            set gets ``int(len(puzzle_ids) * test_fraction)`` puzzles (floor).
        seed: Random seed for reproducible splits.

    Returns:
        Tuple of ``(train_puzzle_ids, test_puzzle_ids)`` — disjoint sets whose
        union covers all unique IDs in ``puzzle_ids``.
    """
    import random

    # Dedicated RNG: equivalent shuffle to random.seed(seed) + random.shuffle,
    # without clobbering the shared module-level generator as a side effect.
    rng = random.Random(seed)

    shuffled_ids = list(puzzle_ids)  # copy so the caller's list is not reordered
    rng.shuffle(shuffled_ids)

    num_test = int(len(shuffled_ids) * test_fraction)
    test_ids = set(shuffled_ids[:num_test])
    train_ids = set(shuffled_ids[num_test:])

    return train_ids, test_ids