Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
6f16d4d
initial commit for OC20/22
theosaulus Nov 17, 2025
42e7ad8
preprocessing still not fully working
theosaulus Nov 18, 2025
2d40f9d
preprocessing seems to work now although slow
theosaulus Nov 18, 2025
869894d
code cleaning and separating different functions in different files t…
theosaulus Nov 21, 2025
b290b42
IS2RE works
theosaulus Nov 21, 2025
bf3d279
format
theosaulus Nov 21, 2025
42adabf
renaming and tests
theosaulus Nov 22, 2025
01f082e
keep some files untouched
theosaulus Nov 22, 2025
de9f3e7
Merge pull request #1 from theosaulus/oc20
theosaulus Nov 22, 2025
e7c1e10
ruff fix
theosaulus Nov 22, 2025
c9bc65f
fixed data splits, tests, and code running
theosaulus Nov 22, 2025
6e845b5
Merge pull request #2 from theosaulus/oc20
theosaulus Nov 22, 2025
2f50eea
configs and tests
theosaulus Nov 23, 2025
e76187f
Merge pull request #3 from theosaulus/oc20
theosaulus Nov 23, 2025
0da47c7
mock config and avoid testing the other configs
theosaulus Nov 25, 2025
2f589c3
Merge pull request #4 from theosaulus/oc20
theosaulus Nov 25, 2025
c9baf78
remove heavy tests on the larger datasets
theosaulus Nov 25, 2025
08d57ab
Merge pull request #5 from theosaulus/oc20
theosaulus Nov 25, 2025
554e397
removing again unnecessary tests and bug fix on number of loaded mole…
theosaulus Nov 25, 2025
5b13f85
ruff
theosaulus Nov 25, 2025
3c1ac1a
Merge pull request #6 from theosaulus/oc20
theosaulus Nov 25, 2025
48f5b65
fixed tests cleanly
theosaulus Nov 26, 2025
97f0a85
Merge pull request #7 from theosaulus/oc20
theosaulus Nov 26, 2025
6dde111
ase package
theosaulus Nov 26, 2025
7fd4b08
Merge pull request #8 from theosaulus/oc20
theosaulus Nov 26, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions configs/dataset/graph/OC20_IS2RE.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# OC20 IS2RE task
# Train/val/test splits are precomputed in the LMDB archive

loader:
  _target_: topobench.data.loaders.graph.oc20_is2re_dataset_loader.IS2REDatasetLoader
  parameters:
    data_domain: graph
    data_type: oc20
    data_name: OC20_IS2RE
    data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}
    task: is2re
    download: true
    legacy_format: false
    dtype: float32
    # NOTE(review): 10 caps this non-mock config at 10 samples — confirm this should not be null for full-dataset runs
    max_samples: 10 # Set to integer (e.g., 1000) to limit dataset size for fast experiments, or null for full dataset

parameters:
  num_features: 6 # Will be determined by the actual data
  num_classes: 1
  task: regression
  loss_type: mse
  monitor_metric: mae
  task_level: graph

split_params:
  learning_setting: inductive
  split_type: fixed # splits are precomputed in the dataset
  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
  data_seed: 0

dataloader_params:
  batch_size: 32
  num_workers: 0
  pin_memory: true
  persistent_workers: false
42 changes: 42 additions & 0 deletions configs/dataset/graph/OC20_S2EF_200K.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# OC20 S2EF Dataset Configuration
# Structure to Energy and Forces prediction for catalyst discovery
# Dataset: 200K training samples with multiple validation splits
# Validation: all 4 validation splits aggregated (val_id, val_ood_ads, val_ood_cat, val_ood_both)
# Test: not downloaded here (include_test: false — S2EF test data is LMDB-only)

loader:
  _target_: topobench.data.loaders.graph.oc20_dataset_loader.OC20DatasetLoader
  parameters:
    data_domain: graph
    data_type: oc20
    data_name: OC20_S2EF_200K
    task: s2ef
    train_split: "200K"
    val_splits: null # null means use all 4 validation splits (val_id, val_ood_ads, val_ood_cat, val_ood_both)
    include_test: false # S2EF test data is LMDB format (incompatible with .extxyz/ASE DB train/val)
    download: true
    dtype: float32
    legacy_format: false
    # NOTE(review): 10 caps this non-mock config at 10 samples — confirm this should not be null for full-dataset runs
    max_samples: 10 # Set to integer (e.g., 1000) to limit dataset size for fast experiments, or null for full dataset
    data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}

parameters:
  num_features: 1 # Number of node features (atomic numbers)
  num_classes: 1 # Regression task (energy prediction)
  task: regression
  loss_type: mse
  monitor_metric: mae
  task_level: graph # Graph-level prediction

split_params:
  learning_setting: inductive
  split_type: fixed # Splits are provided by the dataset
  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
  data_seed: 0

dataloader_params:
  batch_size: 32
  num_workers: 0
  pin_memory: true
  persistent_workers: false
42 changes: 42 additions & 0 deletions configs/dataset/graph/OC20_S2EF_200K_mock.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# OC20 S2EF Mock Dataset Configuration
# Mock configuration for testing purposes using the 200K training samples (350MB)
# This configuration is designed to be used for CI/CD testing without requiring large dataset downloads
# It downloads only the 200K training split and uses it for train/val/test

loader:
  _target_: topobench.data.loaders.graph.oc20_dataset_loader.OC20DatasetLoader
  parameters:
    data_domain: graph
    data_type: oc20
    data_name: OC20_S2EF_200K_mock
    task: s2ef
    train_split: "200K"
    val_splits: [] # Empty list to avoid downloading validation splits
    include_test: false # Don't download test data to keep size minimal
    download: true
    dtype: float32
    legacy_format: false
    max_samples: 10 # Limit to 10 samples for fast testing
    data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}

parameters:
  num_features: 1 # Number of node features (atomic numbers)
  num_classes: 1 # Regression task (energy prediction)
  task: regression
  loss_type: mse
  monitor_metric: mae
  task_level: graph # Graph-level prediction

split_params:
  learning_setting: inductive
  split_type: random # Use random splitting since we only download train split
  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
  data_seed: 0
  train_prop: 0.6 # 60% train; remainder presumably split evenly into 20% val / 20% test — confirm in the splitter

dataloader_params:
  batch_size: 8 # Smaller batch size for testing
  num_workers: 0
  pin_memory: true
  persistent_workers: false
38 changes: 38 additions & 0 deletions configs/dataset/graph/OC20_S2EF_20M.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# OC20 S2EF dataset with 20M training samples
# Validation: all 4 validation splits aggregated (val_id, val_ood_ads, val_ood_cat, val_ood_both)
# Test: official test split

loader:
  _target_: topobench.data.loaders.graph.oc20_dataset_loader.OC20DatasetLoader
  parameters:
    data_domain: graph
    data_type: oc20
    data_name: OC20_S2EF_20M
    data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}
    task: s2ef
    train_split: "20M"
    val_splits: null # null means use all 4 validation splits
    # NOTE(review): the 200K config uses include_test instead of test_split — confirm the loader accepts test_split
    test_split: "test"
    download: true
    legacy_format: false
    dtype: float32

parameters:
  num_features: 6 # Will be determined by the actual data
  num_classes: 1
  task: regression
  loss_type: mse
  monitor_metric: mae
  task_level: graph

split_params:
  learning_setting: inductive
  split_type: fixed # splits are provided by the dataset
  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
  data_seed: 0

dataloader_params:
  batch_size: 32
  num_workers: 0
  pin_memory: true
  persistent_workers: false
38 changes: 38 additions & 0 deletions configs/dataset/graph/OC20_S2EF_2M.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# OC20 S2EF dataset with 2M training samples
# Validation: all 4 validation splits aggregated (val_id, val_ood_ads, val_ood_cat, val_ood_both)
# Test: official test split

loader:
  _target_: topobench.data.loaders.graph.oc20_dataset_loader.OC20DatasetLoader
  parameters:
    data_domain: graph
    data_type: oc20
    data_name: OC20_S2EF_2M
    data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}
    task: s2ef
    train_split: "2M"
    val_splits: null # null means use all 4 validation splits
    # NOTE(review): the 200K config uses include_test instead of test_split — confirm the loader accepts test_split
    test_split: "test"
    download: true
    legacy_format: false
    dtype: float32

parameters:
  num_features: 6 # Will be determined by the actual data
  num_classes: 1
  task: regression
  loss_type: mse
  monitor_metric: mae
  task_level: graph

split_params:
  learning_setting: inductive
  split_type: fixed # splits are provided by the dataset
  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
  data_seed: 0

dataloader_params:
  batch_size: 32
  num_workers: 0
  pin_memory: true
  persistent_workers: false
38 changes: 38 additions & 0 deletions configs/dataset/graph/OC20_S2EF_all.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# OC20 S2EF dataset with all training samples (~134M)
# Validation: all 4 validation splits aggregated (val_id, val_ood_ads, val_ood_cat, val_ood_both)
# Test: official test split

loader:
  _target_: topobench.data.loaders.graph.oc20_dataset_loader.OC20DatasetLoader
  parameters:
    data_domain: graph
    data_type: oc20
    data_name: OC20_S2EF_all
    data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}
    task: s2ef
    train_split: "all"
    val_splits: null # null means use all 4 validation splits
    # NOTE(review): the 200K config uses include_test instead of test_split — confirm the loader accepts test_split
    test_split: "test"
    download: true
    legacy_format: false
    dtype: float32

parameters:
  num_features: 6 # Will be determined by the actual data
  num_classes: 1
  task: regression
  loss_type: mse
  monitor_metric: mae
  task_level: graph

split_params:
  learning_setting: inductive
  split_type: fixed # splits are provided by the dataset
  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
  data_seed: 0

dataloader_params:
  batch_size: 32
  num_workers: 0
  pin_memory: true
  persistent_workers: false
35 changes: 35 additions & 0 deletions configs/dataset/graph/OC22_IS2RE.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# OC22 IS2RE task
# Train/val/test splits are precomputed in the LMDB archive

loader:
  _target_: topobench.data.loaders.graph.oc22_is2re_dataset_loader.OC22IS2REDatasetLoader
  parameters:
    data_domain: graph
    # NOTE(review): data_type is 'oc20' even though this is an OC22 dataset — confirm sharing the oc20 data directory is intended
    data_type: oc20
    data_name: OC22_IS2RE
    data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}
    task: oc22_is2re
    download: true
    legacy_format: false
    dtype: float32
    # NOTE(review): 10 caps this non-mock config at 10 samples — confirm this should not be null for full-dataset runs
    max_samples: 10 # Set to integer (e.g., 1000) to limit dataset size for fast experiments, or null for full dataset

parameters:
  num_features: 6 # Will be determined by the actual data
  num_classes: 1
  task: regression
  loss_type: mse
  monitor_metric: mae
  task_level: graph

split_params:
  learning_setting: inductive
  split_type: fixed # splits are precomputed in the dataset
  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
  data_seed: 0

dataloader_params:
  batch_size: 32
  num_workers: 0
  pin_memory: true
  persistent_workers: false
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ dependencies=[
"topomodelx @ git+https://github.com/pyt-team/TopoModelX.git",
"toponetx @ git+https://github.com/pyt-team/TopoNetX.git",
"lightning==2.4.0",
"ase", # Required for OC20/OC22 S2EF dataset tests
]

[project.optional-dependencies]
Expand Down
9 changes: 9 additions & 0 deletions test/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
"""Configuration file for pytest."""
import os
from pathlib import Path
import networkx as nx
import pytest
import torch
Expand All @@ -11,6 +13,13 @@
)


# Ensure PROJECT_ROOT points at the repository root (the parent of the test/
# directory) so path interpolations resolve during tests. A value already
# present in the environment is respected and left untouched.
os.environ.setdefault(
    "PROJECT_ROOT", str(Path(__file__).parent.parent.absolute())
)


@pytest.fixture
def mocker_fixture(mocker):
"""Return pytest mocker, used when one want to use mocker in setup_method.
Expand Down
5 changes: 4 additions & 1 deletion test/data/load/test_datasetloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,10 @@ def _gather_config_files(self, base_dir: Path) -> List[str]:
# Below are the datasets that have some default transforms, which we manually override with no_transform
# due to the lack of a default transform for domain2domain
"REDDIT-BINARY.yaml", "IMDB-MULTI.yaml", "IMDB-BINARY.yaml", #"ZINC.yaml"
"ogbg-molpcba.yaml", "manual_dataset.yaml" # "ogbg-molhiv.yaml"
"ogbg-molpcba.yaml", "manual_dataset.yaml", # "ogbg-molhiv.yaml"
# OC20/OC22 datasets that require large downloads (excluded from tests)
"OC20_S2EF_200K.yaml", "OC20_S2EF_2M.yaml", "OC20_S2EF_20M.yaml",
"OC20_S2EF_all.yaml", "OC20_IS2RE.yaml", "OC22_IS2RE.yaml"
}

# Below the datasets that takes quite some time to load and process
Expand Down
Loading
Loading