Add alternate dataframe split implementations (#608)

cthoyt · web-flow · commit b5077e7f7e3b · 2025-09-22T16:25:26.000+02:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -14,7 +14,7 @@ requires-python = ">=3.10,<4.0.0"
 
 dependencies = [
     "click>=8.1.6",
-    "curies>=0.10.18",
+    "curies>=0.10.23",
     "linkml-runtime>=1.7.5",
     "linkml>1.7.10",
     "pandas>1.0.3",
diff --git a/src/sssom/cli.py b/src/sssom/cli.py
@@ -5,6 +5,7 @@
 import logging as _logging
 import os
 import sys
+import typing
 from operator import itemgetter
 from pathlib import Path
 from typing import Any, Callable, List, Optional, TextIO, Tuple, TypeVar, get_args
@@ -35,7 +36,7 @@
     split_file,
     validate_file,
 )
-from .parsers import PARSING_FUNCTIONS, parse_sssom_table
+from .parsers import PARSING_FUNCTIONS, SplitMethod, parse_sssom_table
 from .rdf_util import rewire_graph
 from .sparql_util import EndpointConfig, query_mappings
 from .util import (
@@ -235,9 +236,10 @@ def validate(input: str, validation_types: List[SchemaValidationType]) -> None:
 @main.command()
 @input_argument
 @output_directory_option
-def split(input: str, output_directory: str) -> None:
+@click.option("--method", type=click.Choice(typing.get_args(SplitMethod)))  # None when omitted
+def split(input: str, output_directory: str, method: Optional[SplitMethod]) -> None:
     """Split input file into multiple output broken down by prefixes."""
-    split_file(input_path=input, output_directory=output_directory)
+    split_file(input_path=input, output_directory=output_directory, method=method)
 
 
 @main.command()
diff --git a/src/sssom/io.py b/src/sssom/io.py
@@ -31,7 +31,7 @@
     get_default_metadata,
 )
 from .context import get_converter
-from .parsers import get_parsing_function, parse_sssom_table, split_dataframe
+from .parsers import SplitMethod, get_parsing_function, parse_sssom_table, split_dataframe
 from .util import MappingSetDataFrame, are_params_slots, augment_metadata, raise_for_bad_path
 from .writers import get_writer_function, write_table, write_tables
 
@@ -130,7 +130,9 @@ def validate_file(
     return validate(msdf=msdf, validation_types=validation_types, fail_on_error=fail_on_error)
 
 
-def split_file(input_path: str, output_directory: Union[str, Path]) -> None:
+def split_file(
+    input_path: str, output_directory: Union[str, Path], *, method: SplitMethod | None = None
+) -> None:
     """Split an SSSOM TSV by prefixes and relations.
 
     :param input_path: The path to the input file in one of the legal formats, eg obographs,
@@ -139,7 +141,7 @@ def split_file(input_path: str, output_directory: Union[str, Path]) -> None:
     """
     raise_for_bad_path(input_path)
     msdf = parse_sssom_table(input_path)
-    splitted = split_dataframe(msdf)
+    splitted = split_dataframe(msdf, method=method)
     write_tables(splitted, output_directory)
 
 
diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
@@ -11,7 +11,18 @@
 import typing
 from collections import ChainMap, Counter
 from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, List, Optional, TextIO, Tuple, Union, cast
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    TextIO,
+    Tuple,
+    Union,
+    cast,
+)
 from xml.dom import Node, minidom
 from xml.dom.minidom import Document, Element
 
@@ -20,12 +31,19 @@
 import pandas as pd
 import requests
 import yaml
-from curies import Converter
+from curies import Converter, ReferenceTuple
+from curies.dataframe import (
+    get_df_curies_index,
+    get_df_prefixes_index,
+    get_filter_df_by_curies_index,
+    get_filter_df_by_prefixes_index,
+)
 from linkml_runtime.loaders.json_loader import JSONLoader
 from linkml_runtime.loaders.rdflib_loader import RDFLibLoader
 from pandas.errors import EmptyDataError
 from rdflib import Graph
 from sssom_schema import Mapping, MappingSet
+from typing_extensions import Literal, TypeAlias
 
 from sssom.constants import (
     CONFIDENCE,
@@ -976,8 +994,12 @@ def _get_mapping_set_from_df(df: pd.DataFrame, meta: Optional[MetadataType] = No
     return mapping_set
 
 
+SplitMethod: TypeAlias = Literal["disjoint-indexes", "dense-indexes"]  # split strategy names
+
+
 def split_dataframe(
     msdf: MappingSetDataFrame,
+    method: SplitMethod | None = None,
 ) -> Dict[str, MappingSetDataFrame]:
     """Group the mapping set dataframe into several subdataframes by prefix.
 
@@ -995,6 +1017,7 @@ def split_dataframe(
         subject_prefixes=subject_prefixes,
         object_prefixes=object_prefixes,
         relations=relations,
+        method=method,
     )
 
 
@@ -1008,16 +1031,57 @@ def split_dataframe_by_prefix(
     subject_prefixes: Iterable[str],
     object_prefixes: Iterable[str],
     relations: Iterable[str],
+    *,
+    method: SplitMethod | None = None,
 ) -> Dict[str, MappingSetDataFrame]:
     """Split a mapping set dataframe by prefix.
 
     :param msdf: An SSSOM MappingSetDataFrame
     :param subject_prefixes: a list of prefixes pertaining to the subject
     :param object_prefixes: a list of prefixes pertaining to the object
     :param relations: a list of relations of interest
+    :param method: The method for calculating splits
 
     :returns: a dict of SSSOM data frame names to MappingSetDataFrame
     """
+    if method is None:
+        return _split_dataframe_by_prefix_old(
+            msdf,
+            subject_prefixes=subject_prefixes,
+            object_prefixes=object_prefixes,
+            relations=relations,
+        )
+
+    predicates: List[ReferenceTuple] = []
+    for relation in relations:
+        if reference_tuple := msdf.converter.parse_curie(relation):
+            predicates.append(reference_tuple)
+        else:
+            logging.warning("invalid relation CURIE for dataframe split: %s", relation)
+
+    rr = _help_split_dataframe_by_prefix(
+        msdf.df,
+        subject_prefixes=subject_prefixes,
+        predicates=predicates,
+        object_prefixes=object_prefixes,
+        method=method,
+    )
+    rv = {}
+    for (subject_prefix, relation_t, object_prefix), df in rr:
+        subconverter = msdf.converter.get_subconverter(
+            [subject_prefix, object_prefix, relation_t.prefix]
+        )
+        split = _get_split_key(subject_prefix, relation_t.identifier, object_prefix)
+        rv[split] = from_sssom_dataframe(df, prefix_map=subconverter, meta=msdf.metadata)
+    return rv
+
+
+def _split_dataframe_by_prefix_old(
+    msdf: MappingSetDataFrame,
+    subject_prefixes: Iterable[str],
+    object_prefixes: Iterable[str],
+    relations: Iterable[str],
+) -> Dict[str, MappingSetDataFrame]:
     df = msdf.df
     meta = msdf.metadata
     split_to_msdf: Dict[str, MappingSetDataFrame] = {}
@@ -1049,6 +1113,69 @@ def split_dataframe_by_prefix(
     return split_to_msdf
 
 
+def _help_split_dataframe_by_prefix(
+    df: pd.DataFrame,
+    subject_prefixes: str | Iterable[str],
+    predicates: curies.ReferenceTuple | Iterable[curies.ReferenceTuple],
+    object_prefixes: str | Iterable[str],
+    *,
+    method: SplitMethod,
+) -> Iterable[tuple[tuple[str, curies.ReferenceTuple, str], pd.DataFrame]]:
+    """Yield each non-empty (subject prefix, predicate, object prefix) slice of the dataframe.
+
+    :param df: A dataframe with ``subject_id``, ``predicate_id``, and ``object_id`` columns
+    :param subject_prefixes: One or more subject prefixes to split on
+    :param predicates: One or more predicates to split on
+    :param object_prefixes: One or more object prefixes to split on
+    :param method: The method for calculating splits
+
+    :yields: Pairs of a split triple and its matching rows; empty splits are skipped
+
+    :raises ValueError: If the method is invalid (lazily, as this is a generator)
+    """
+    subject_prefixes = _clean_list(subject_prefixes)
+    predicates = [predicates] if isinstance(predicates, curies.ReferenceTuple) else list(predicates)
+    object_prefixes = _clean_list(object_prefixes)
+
+    if method == "disjoint-indexes":
+        # one boolean row mask per requested value; a split is the AND of its three masks
+        s_masks: dict[str, pd.Series[bool]] = {
+            s: get_filter_df_by_prefixes_index(df, column="subject_id", prefixes=s)
+            for s in subject_prefixes
+        }
+        p_masks: dict[ReferenceTuple, pd.Series[bool]] = {
+            p: get_filter_df_by_curies_index(df, column="predicate_id", curies=p.curie)
+            for p in predicates
+        }
+        o_masks: dict[str, pd.Series[bool]] = {
+            o: get_filter_df_by_prefixes_index(df, column="object_id", prefixes=o)
+            for o in object_prefixes
+        }
+        for s, p, o in itt.product(subject_prefixes, predicates, object_prefixes):
+            mask = s_masks[s] & p_masks[p] & o_masks[o]
+            if mask.any():
+                yield (s, p, o), df[mask]
+
+    elif method == "dense-indexes":
+        # materialize each set of row indexes once rather than once per triple
+        s_index = get_df_prefixes_index(df, column="subject_id")
+        p_index = get_df_curies_index(df, column="predicate_id")
+        o_index = get_df_prefixes_index(df, column="object_id")
+        s_rows = {s: set(s_index.get(s, ())) for s in subject_prefixes}
+        p_rows = {p: set(p_index.get(p.curie, ())) for p in predicates}
+        o_rows = {o: set(o_index.get(o, ())) for o in object_prefixes}
+        for s, p, o in itt.product(subject_prefixes, predicates, object_prefixes):
+            if rows := sorted(s_rows[s] & p_rows[p] & o_rows[o]):
+                yield (s, p, o), df.iloc[rows]
+
+    else:
+        raise ValueError(f"invalid split method: {method}")
+
+
+def _clean_list(item: str | Iterable[str]) -> list[str]:
+    return [item] if isinstance(item, str) else list(item)  # a lone str is one entry, not chars
+
+
 def _ensure_valid_mapping_from_dict(mdict: Dict[str, Any]) -> Optional[Mapping]:
     """Return a valid mapping object if it can be constructed, else None.
 
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
@@ -4,6 +4,7 @@
 import json
 import math
 import os
+import typing
 import unittest
 from collections import ChainMap
 from pathlib import Path
@@ -22,6 +23,7 @@
 from sssom.io import parse_file
 from sssom.parsers import (
     PARSING_FUNCTIONS,
+    SplitMethod,
     from_alignment_minidom,
     from_obographs,
     from_sssom_dataframe,
@@ -551,25 +553,46 @@ def test_split_df(self) -> None:
         df = pd.DataFrame(rows, columns=columns)
         msdf = from_sssom_dataframe(df, converter)
 
+        sdf = pd.DataFrame(subrows, columns=columns)
+
+        for method in [None, *typing.get_args(SplitMethod)]:
+            with self.subTest(method=method):
+                self.assert_msdf(msdf, sdf, method)
+
+    def assert_msdf(
+        self, msdf: MappingSetDataFrame, sdf: pd.DataFrame, method: SplitMethod | None
+    ) -> None:
+        """Test the dataframe."""
         # test that if there's ever an empty list, then it returns an empty dict
-        self.assertFalse(split_dataframe_by_prefix(msdf, [], ["p2"], ["skos:exactMatch"]))
-        self.assertFalse(split_dataframe_by_prefix(msdf, ["p1"], ["p2"], []))
-        self.assertFalse(split_dataframe_by_prefix(msdf, ["p1"], [], ["skos:exactMatch"]))
+        self.assertFalse(
+            split_dataframe_by_prefix(msdf, [], ["p2"], ["skos:exactMatch"], method=method)
+        )
+        self.assertFalse(split_dataframe_by_prefix(msdf, ["p1"], ["p2"], [], method=method))
+        self.assertFalse(
+            split_dataframe_by_prefix(msdf, ["p1"], [], ["skos:exactMatch"], method=method)
+        )
 
         # test that missing prefixes don't result in anything
-        self.assertFalse(split_dataframe_by_prefix(msdf, ["nope"], ["p2"], ["skos:exactMatch"]))
-        self.assertFalse(split_dataframe_by_prefix(msdf, ["p1"], ["nope"], ["skos:exactMatch"]))
-        self.assertFalse(split_dataframe_by_prefix(msdf, ["p1"], ["p2"], ["nope:nope"]))
+        self.assertFalse(
+            split_dataframe_by_prefix(msdf, ["nope"], ["p2"], ["skos:exactMatch"], method=method)
+        )
+        self.assertFalse(
+            split_dataframe_by_prefix(msdf, ["p1"], ["nope"], ["skos:exactMatch"], method=method)
+        )
+        self.assertFalse(
+            split_dataframe_by_prefix(msdf, ["p1"], ["p2"], ["nope:nope"], method=method)
+        )
 
-        sdf = pd.DataFrame(subrows, columns=columns)
         # test an explicit return with only single entries
-        rv = split_dataframe_by_prefix(msdf, ["p1"], ["p2"], ["skos:exactMatch"])
+        rv = split_dataframe_by_prefix(msdf, ["p1"], ["p2"], ["skos:exactMatch"], method=method)
         self.assertEqual(1, len(rv), msg="nothing was indexed")
         self.assertIn("p1_exactmatch_p2", rv)
         self.assertEqual(sdf.values.tolist(), rv["p1_exactmatch_p2"].df.values.tolist())
 
         # test an explicit return with multiple entries
-        rv = split_dataframe_by_prefix(msdf, ["p1"], ["p2", "p3"], ["skos:exactMatch"])
+        rv = split_dataframe_by_prefix(
+            msdf, ["p1"], ["p2", "p3"], ["skos:exactMatch"], method=method
+        )
         self.assertEqual(2, len(rv), msg="nothing was indexed")
         self.assertIn("p1_exactmatch_p2", rv)
         self.assertIn("p1_exactmatch_p3", rv)