diff --git a/configs/dataset/graph/GraphUniverse_CD.yaml b/configs/dataset/graph/GraphUniverse_CD.yaml new file mode 100644 index 000000000..e77924e76 --- /dev/null +++ b/configs/dataset/graph/GraphUniverse_CD.yaml @@ -0,0 +1,54 @@ + +loader: + _target_: topobench.data.loaders.GraphUniverseDatasetLoader + parameters: + data_domain: graph + data_type: GraphUniverse + data_name: GraphUniverse + data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} + generation_parameters: + task: community_detection + universe_parameters: + K: 10 + feature_dim: 15 + center_variance: 0.2 + cluster_variance: 0.5 + edge_propensity_variance: 0.5 + seed: 42 + family_parameters: + n_graphs: 100 + min_n_nodes: 50 + max_n_nodes: 200 + min_communities: 4 + max_communities: 6 + homophily_range: [0.4, 0.6] + avg_degree_range: [1.0, 5.0] + degree_separation_range: [0.5, 0.8] + degree_distribution: power_law + power_law_exponent_range: [2.0, 2.5] + seed: ${dataset.loader.parameters.generation_parameters.universe_parameters.seed} + + +# Dataset parameters +parameters: + num_features: ${dataset.loader.parameters.generation_parameters.universe_parameters.feature_dim} + num_classes: ${dataset.loader.parameters.generation_parameters.universe_parameters.K} + task: classification + loss_type: cross_entropy + monitor_metric: accuracy + task_level: node + +#splits +split_params: + learning_setting: inductive + data_split_dir: ${dataset.loader.parameters.data_dir}/data_splits + data_seed: 0 + split_type: random #'k-fold' # either "k-fold" or "random" strategies + k: 10 # for "k-fold" Cross-Validation + train_prop: 0.7 # for "random" strategy splitting + +# Dataloader parameters +dataloader_params: + batch_size: 16 + num_workers: 0 + pin_memory: False \ No newline at end of file diff --git a/configs/dataset/graph/GraphUniverse_TC.yaml b/configs/dataset/graph/GraphUniverse_TC.yaml new file mode 100644 index 000000000..24ef6f2d5 --- /dev/null +++ 
b/configs/dataset/graph/GraphUniverse_TC.yaml @@ -0,0 +1,54 @@ + +loader: + _target_: topobench.data.loaders.GraphUniverseDatasetLoader + parameters: + data_domain: graph + data_type: GraphUniverse + data_name: GraphUniverse + data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} + generation_parameters: + task: triangle_counting + universe_parameters: + K: 10 + feature_dim: 15 + center_variance: 0.2 + cluster_variance: 0.5 + edge_propensity_variance: 0.5 + seed: 42 + family_parameters: + n_graphs: 100 + min_n_nodes: 50 + max_n_nodes: 200 + min_communities: 4 + max_communities: 6 + homophily_range: [0.4, 0.6] + avg_degree_range: [2.0, 10.0] + degree_separation_range: [0.9, 1.0] + degree_distribution: power_law + power_law_exponent_range: [2.0, 2.5] + seed: ${dataset.loader.parameters.generation_parameters.universe_parameters.seed} + + +# Dataset parameters +parameters: + num_features: ${dataset.loader.parameters.generation_parameters.universe_parameters.feature_dim} + num_classes: 1 + task: regression + loss_type: mae + monitor_metric: mae + task_level: graph + +#splits +split_params: + learning_setting: inductive + data_split_dir: ${dataset.loader.parameters.data_dir}/data_splits + data_seed: 0 + split_type: random #'k-fold' # either "k-fold" or "random" strategies + k: 10 # for "k-fold" Cross-Validation + train_prop: 0.7 # for "random" strategy splitting + +# Dataloader parameters +dataloader_params: + batch_size: 16 + num_workers: 0 + pin_memory: False \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 3234ea9e6..a68eff33a 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,7 @@ dependencies=[ "rootutils", "topomodelx @ git+https://github.com/pyt-team/TopoModelX.git", "toponetx @ git+https://github.com/pyt-team/TopoNetX.git", + "graphuniverse @ git+https://github.com/LouisVanLangendonck/GraphUniverse.git", "lightning==2.4.0", ] diff --git 
a/test/_utils/simplified_pipeline.py b/test/_utils/simplified_pipeline.py index 39c949210..f07f7037d 100644 --- a/test/_utils/simplified_pipeline.py +++ b/test/_utils/simplified_pipeline.py @@ -62,8 +62,9 @@ def run(cfg: DictConfig) -> DictConfig: # Preprocess dataset and load the splits transform_config = cfg.get("transforms", None) preprocessor = PreProcessor(dataset, dataset_dir, transform_config) + task_level = cfg.dataset.parameters.get("task_level", None) dataset_train, dataset_val, dataset_test = ( - preprocessor.load_dataset_splits(cfg.dataset.split_params) + preprocessor.load_dataset_splits(cfg.dataset.split_params, task_level=task_level) ) # Prepare datamodule if cfg.dataset.parameters.task_level in ["node", "graph"]: diff --git a/test/data/preprocess/test_preprocessor.py b/test/data/preprocess/test_preprocessor.py index 89a60c321..c8ceb3c3e 100644 --- a/test/data/preprocess/test_preprocessor.py +++ b/test/data/preprocess/test_preprocessor.py @@ -131,7 +131,7 @@ def test_load_dataset_splits_inductive(self, mock_load_inductive_splits): preprocessor.load_dataset_splits(split_params) mock_load_inductive_splits.assert_called_once_with( - preprocessor, split_params + preprocessor, split_params, task_level=None ) @patch("topobench.data.preprocessor.preprocessor.load_transductive_splits") @@ -160,6 +160,32 @@ def test_load_dataset_splits_transductive(self, mock_load_transductive_splits): mock_load_transductive_splits.assert_called_once_with( preprocessor, split_params ) + @patch("topobench.data.preprocessor.preprocessor.load_inductive_splits") + def test_load_dataset_splits_with_task_level(self, mock_load_inductive_splits): + """Test loading dataset splits with task_level parameter. + + Parameters + ---------- + mock_load_inductive_splits : MagicMock + Mock of the load_inductive_splits function. 
+ """ + mock_dataset = MagicMock(spec=torch_geometric.data.Dataset) + mock_dataset.transform = None + mock_dataset._data = torch_geometric.data.Data() + mock_dataset.slices = {} + mock_dataset.__iter__ = MagicMock(return_value=iter([])) + + with tempfile.TemporaryDirectory() as tmpdir: + with patch("torch_geometric.data.InMemoryDataset.__init__"): + with patch.object(PreProcessor, "load"): + preprocessor = PreProcessor(mock_dataset, tmpdir, None) + + split_params = DictConfig({"learning_setting": "inductive"}) + preprocessor.load_dataset_splits(split_params, task_level="node") + + mock_load_inductive_splits.assert_called_once_with( + preprocessor, split_params, task_level="node" + ) def test_invalid_learning_setting(self): """Test error with invalid learning setting.""" diff --git a/test/data/utils/test_split_utils.py b/test/data/utils/test_split_utils.py index 1453aafbb..188db6258 100644 --- a/test/data/utils/test_split_utils.py +++ b/test/data/utils/test_split_utils.py @@ -335,6 +335,69 @@ def test_multidimensional_ragged_labels(self): assert len(test_ds) > 0 assert len(train_ds) + len(val_ds) + len(test_ds) == n_graphs + def test_node_task_level_random_split(self): + """Test with task_level='node' using random split.""" + n_graphs = 20 + label_shapes = [()] * n_graphs + mock_dataset = self.create_mock_dataset(n_graphs, label_shapes) + + parameters = DictConfig({ + "split_type": "random", + "data_seed": 0, + "train_prop": 0.6, + "data_split_dir": os.path.join(self.test_dir, "data_splits") + }) + + # Call with task_level='node' + train_ds, val_ds, test_ds = load_inductive_splits( + mock_dataset, parameters, task_level="node" + ) + + # Verify splits exist and are non-empty + assert len(train_ds) > 0 + assert len(val_ds) > 0 + assert len(test_ds) > 0 + assert len(train_ds) + len(val_ds) + len(test_ds) == n_graphs + + def test_node_task_level_kfold_raises_error(self): + """Test that task_level='node' with k-fold raises NotImplementedError.""" + n_graphs = 20 + 
label_shapes = [()] * n_graphs + mock_dataset = self.create_mock_dataset(n_graphs, label_shapes) + + parameters = DictConfig({ + "split_type": "k-fold", + "data_seed": 0, + "k": 5, + "data_split_dir": os.path.join(self.test_dir, "data_splits") + }) + + # K-fold should not be supported for node-level tasks in inductive setting + with pytest.raises(NotImplementedError, match="K-Fold splitting is not supported for node-level tasks"): + load_inductive_splits(mock_dataset, parameters, task_level="node") + + def test_graph_task_level_explicit(self): + """Test with task_level='graph' explicitly set.""" + n_graphs = 20 + label_shapes = [()] * n_graphs + mock_dataset = self.create_mock_dataset(n_graphs, label_shapes) + + parameters = DictConfig({ + "split_type": "random", + "data_seed": 0, + "train_prop": 0.6, + "data_split_dir": os.path.join(self.test_dir, "data_splits") + }) + + # Call with task_level='graph' + train_ds, val_ds, test_ds = load_inductive_splits( + mock_dataset, parameters, task_level="graph" + ) + + assert len(train_ds) > 0 + assert len(val_ds) > 0 + assert len(test_ds) > 0 + assert len(train_ds) + len(val_ds) + len(test_ds) == n_graphs class TestKFoldSplit: """Test k_fold_split function.""" diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py index 785987159..45a53f3db 100644 --- a/test/pipeline/test_pipeline.py +++ b/test/pipeline/test_pipeline.py @@ -4,9 +4,8 @@ from test._utils.simplified_pipeline import run -DATASET = "graph/MUTAG" # ADD YOUR DATASET HERE -MODELS = ["graph/gcn", "cell/topotune", "simplicial/topotune"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE - +DATASET = "graph/GraphUniverse_CD" # ADD YOUR DATASET HERE +MODELS = ["graph/gcn", "graph/gps"] class TestPipeline: """Test pipeline for a particular dataset and model.""" diff --git a/topobench/data/loaders/graph/graph_universe_loader.py b/topobench/data/loaders/graph/graph_universe_loader.py new file mode 100644 index 000000000..fbe4e16d4 --- /dev/null +++ 
b/topobench/data/loaders/graph/graph_universe_loader.py @@ -0,0 +1,69 @@ +"""Loaders for GraphUniverse [1] datasets. + +[1] Anonymous (2025). GraphUniverse: Enabling Systematic Evaluation of Inductive Generalization. In Submitted to The Fourteenth International Conference on Learning Representations. +(github: https://github.com/LouisVanLangendonck/GraphUniverse) +""" + +from graph_universe import GraphUniverseDataset +from omegaconf import DictConfig +from torch_geometric.data import Data, Dataset + +from topobench.data.loaders.base import AbstractLoader + + +class GraphUniverseDatasetLoader(AbstractLoader): + """Load Graph Universe datasets. + + Parameters + ---------- + parameters : DictConfig + Configuration parameters containing: + - data_dir: Root directory for data + - data_name: Name of the dataset + - data_type: Type of the dataset (e.g., "graph_classification") + """ + + def __init__(self, parameters: DictConfig) -> None: + super().__init__(parameters) + if self.parameters.get("num_nodes_range") is not None: + self.parameters["generation_parameters"]["family_parameters"]["min_n_nodes"] = self.parameters.get("num_nodes_range")[0] + self.parameters["generation_parameters"]["family_parameters"]["max_n_nodes"] = self.parameters.get("num_nodes_range")[1] + + def load_dataset(self) -> Dataset: + """Load Graph Universe dataset. + + Returns + ------- + Dataset + The loaded Graph Universe dataset. + + Raises + ------ + RuntimeError + If dataset loading fails. + """ + + dataset = GraphUniverseDataset( + root=str(self.root_data_dir), + parameters=self.parameters["generation_parameters"] + ) + + return dataset + + def load(self, **kwargs) -> tuple[Data, str]: + """Load data. + + Parameters + ---------- + **kwargs : dict + Additional keyword arguments. + + Returns + ------- + tuple[torch_geometric.data.Data, str] + Tuple containing the loaded data and the data directory. 
+ """ + dataset = self.load_dataset(**kwargs) + data_dir = dataset.raw_dir + + return dataset, data_dir diff --git a/topobench/data/preprocessor/preprocessor.py b/topobench/data/preprocessor/preprocessor.py index e5c4a913a..3c4dc69f8 100644 --- a/topobench/data/preprocessor/preprocessor.py +++ b/topobench/data/preprocessor/preprocessor.py @@ -224,7 +224,7 @@ def load(self, path: str) -> None: self.data = data_cls.from_dict(data) def load_dataset_splits( - self, split_params + self, split_params, task_level=None ) -> tuple[ DataloadDataset, DataloadDataset | None, DataloadDataset | None ]: @@ -234,6 +234,8 @@ def load_dataset_splits( ---------- split_params : dict Parameters for loading the dataset splits. + task_level : str, optional + Task level ('node' or 'graph'). Used to determine split strategy. Returns ------- @@ -244,7 +246,7 @@ def load_dataset_splits( raise ValueError("No learning setting specified in split_params") if split_params.learning_setting == "inductive": - return load_inductive_splits(self, split_params) + return load_inductive_splits(self, split_params, task_level=task_level) elif split_params.learning_setting == "transductive": return load_transductive_splits(self, split_params) else: diff --git a/topobench/data/utils/split_utils.py b/topobench/data/utils/split_utils.py index f78994222..22957773c 100644 --- a/topobench/data/utils/split_utils.py +++ b/topobench/data/utils/split_utils.py @@ -287,7 +287,7 @@ def load_transductive_splits(dataset, parameters): return DataloadDataset([data]), None, None -def load_inductive_splits(dataset, parameters): +def load_inductive_splits(dataset, parameters, task_level=None): r"""Load multiple-graph datasets with the specified split. Parameters @@ -296,6 +296,9 @@ def load_inductive_splits(dataset, parameters): Graph dataset. parameters : DictConfig Configuration parameters. + task_level : str, optional + Task level ('node' or 'graph'). If 'node', uses graph indices for splitting. 
+ If None or 'graph', uses graph-level labels for stratification. Returns ------- @@ -306,15 +309,20 @@ def load_inductive_splits(dataset, parameters): assert len(dataset) > 1, ( "Datasets should have more than one graph in an inductive setting." ) - # Check if labels are ragged (different sizes across graphs) - label_list = [data.y.squeeze(0).numpy() for data in dataset] - label_shapes = [label.shape for label in label_list] - # Use dtype=object only if labels have different shapes (ragged) - labels = ( - np.array(label_list, dtype=object) - if len(set(label_shapes)) > 1 - else np.array(label_list) - ) + + if task_level == "node": + # Use graph indices as pseudo-labels for stratification + labels = np.arange(len(dataset)) + else: + # For graph classification, extract graph-level labels for stratification + label_list = [data.y.squeeze(0).numpy() for data in dataset] + label_shapes = [label.shape for label in label_list] + # Use dtype=object only if labels have different shapes (ragged) + labels = ( + np.array(label_list, dtype=object) + if len(set(label_shapes)) > 1 + else np.array(label_list) + ) root = ( dataset.dataset.get_data_dir() @@ -329,6 +337,12 @@ def load_inductive_splits(dataset, parameters): assert type(labels) is not object, ( "K-Fold splitting not supported for ragged labels." ) + if task_level == "node": + raise NotImplementedError( + "K-Fold splitting is not supported for node-level tasks in inductive setting. " + "Each graph has unique node labels, making stratification impossible. " + "Please use 'random' split_type instead." + ) split_idx = k_fold_split(labels, parameters, root=root) elif parameters.split_type == "fixed" and hasattr(dataset, "split_idx"): diff --git a/topobench/model/model.py b/topobench/model/model.py index a7c688b47..4c89e1c14 100755 --- a/topobench/model/model.py +++ b/topobench/model/model.py @@ -23,6 +23,8 @@ class TBModel(LightningModule): The backbone wrapper class (default: None). 
feature_encoder : torch.nn.Module, optional The feature encoder (default: None). + learning_setting : str, optional + The learning setting (default: None). evaluator : Any, optional The evaluator class (default: None). optimizer : Any, optional @@ -38,6 +40,7 @@ def __init__( loss: torch.nn.Module, backbone_wrapper: torch.nn.Module | None = None, feature_encoder: torch.nn.Module | None = None, + learning_setting: str | None = None, evaluator: Any = None, optimizer: Any = None, **kwargs, @@ -72,6 +75,9 @@ def __init__( self.loss = loss self.task_level = self.readout.task_level + # Learning setting + self.learning_setting = learning_setting + # Tracking best so far validation accuracy self.val_acc_best = MeanMetric() self.metric_collector_val = [] @@ -155,6 +161,9 @@ def training_step(self, batch: Data, batch_idx: int) -> torch.Tensor: self.state_str = "Training" model_out = self.model_step(batch) + # Get actual batch size for logging (num_graphs when present, else fall back to 1) + actual_batch_size = batch.num_graphs if hasattr(batch, "num_graphs") else 1 + # Update and log metrics loss_value = model_out["loss"].item() self.log( @@ -163,7 +172,7 @@ on_step=False, on_epoch=True, prog_bar=True, - batch_size=1, + batch_size=actual_batch_size, ) # Return loss for backpropagation step @@ -182,6 +191,9 @@ def validation_step(self, batch: Data, batch_idx: int) -> None: self.state_str = "Validation" model_out = self.model_step(batch) + # Get actual batch size for logging (num_graphs when present, else fall back to 1) + actual_batch_size = batch.num_graphs if hasattr(batch, "num_graphs") else 1 + # Log Loss loss_value = model_out["loss"].item() self.log( @@ -190,7 +202,7 @@ on_step=False, on_epoch=True, prog_bar=True, - batch_size=1, + batch_size=actual_batch_size, ) def test_step(self, batch: Data, batch_idx: int) -> None: @@ -206,6 +218,9 @@ def test_step(self, batch:
Data, batch_idx: int) -> None: self.state_str = "Test" model_out = self.model_step(batch) + # Get actual batch size for logging (num_graphs when present, else fall back to 1) + actual_batch_size = batch.num_graphs if hasattr(batch, "num_graphs") else 1 + # Log loss loss_value = model_out["loss"].item() self.log( @@ -214,7 +229,7 @@ def test_step(self, batch: Data, batch_idx: int) -> None: on_step=False, on_epoch=True, prog_bar=True, - batch_size=1, + batch_size=actual_batch_size, ) def process_outputs(self, model_out: dict, batch: Data) -> dict: @@ -233,17 +248,17 @@ def process_outputs(self, model_out: dict, batch: Data) -> dict: Dictionary containing the updated model output. """ # Get the correct mask - if self.state_str == "Training": - mask = batch.train_mask - elif self.state_str == "Validation": - mask = batch.val_mask - elif self.state_str == "Test": - mask = batch.test_mask - else: - raise ValueError("Invalid state_str") - - if self.task_level == "node": - # Keep only train data points + if self.learning_setting == "transductive": + if self.state_str == "Training": + mask = batch.train_mask + elif self.state_str == "Validation": + mask = batch.val_mask + elif self.state_str == "Test": + mask = batch.test_mask + else: + raise ValueError("Invalid state_str") + + # Filter outputs and labels according to the split mask for key, val in model_out.items(): if key in ["logits", "labels"]: model_out[key] = val[mask] diff --git a/topobench/run.py b/topobench/run.py index ab6f8602a..71879e093 100755 --- a/topobench/run.py +++ b/topobench/run.py @@ -166,8 +166,9 @@ def run(cfg: DictConfig) -> tuple[dict[str, Any], dict[str, Any]]: log.info("Instantiating preprocessor...") transform_config = cfg.get("transforms", None) preprocessor = PreProcessor(dataset, dataset_dir, transform_config) + task_level = cfg.dataset.parameters.get("task_level", None) dataset_train, dataset_val, dataset_test = ( - preprocessor.load_dataset_splits(cfg.dataset.split_params) + 
preprocessor.load_dataset_splits(cfg.dataset.split_params, task_level=task_level) ) # Prepare datamodule log.info("Instantiating datamodule...") @@ -186,6 +187,7 @@ def run(cfg: DictConfig) -> tuple[dict[str, Any], dict[str, Any]]: model: LightningModule = hydra.utils.instantiate( cfg.model, evaluator=cfg.evaluator, + learning_setting=cfg.dataset.split_params.learning_setting, optimizer=cfg.optimizer, loss=cfg.loss, )