Skip to content

Commit b5077e7

Browse files
authored
Add alternate dataframe split implementations (#608)
1 parent 4c90486 commit b5077e7

File tree

5 files changed

+172
-18
lines changed

5 files changed

+172
-18
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ requires-python = ">=3.10,<4.0.0"
1414

1515
dependencies = [
1616
"click>=8.1.6",
17-
"curies>=0.10.18",
17+
"curies>=0.10.23",
1818
"linkml-runtime>=1.7.5",
1919
"linkml>1.7.10",
2020
"pandas>1.0.3",

src/sssom/cli.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import logging as _logging
66
import os
77
import sys
8+
import typing
89
from operator import itemgetter
910
from pathlib import Path
1011
from typing import Any, Callable, List, Optional, TextIO, Tuple, TypeVar, get_args
@@ -35,7 +36,7 @@
3536
split_file,
3637
validate_file,
3738
)
38-
from .parsers import PARSING_FUNCTIONS, parse_sssom_table
39+
from .parsers import PARSING_FUNCTIONS, SplitMethod, parse_sssom_table
3940
from .rdf_util import rewire_graph
4041
from .sparql_util import EndpointConfig, query_mappings
4142
from .util import (
@@ -235,9 +236,10 @@ def validate(input: str, validation_types: List[SchemaValidationType]) -> None:
235236
@main.command()
@input_argument
@output_directory_option
@click.option(
    "--method",
    type=click.Choice(typing.get_args(SplitMethod)),
    help="Implementation used to calculate the splits. "
    "If omitted, the original splitting implementation is used.",
)
def split(input: str, output_directory: str, method: Optional[SplitMethod]) -> None:
    """Split input file into multiple output broken down by prefixes."""
    # ``method`` is None whenever ``--method`` is not given on the command line;
    # it is forwarded unchanged so split_file can fall back to its default.
    split_file(input_path=input, output_directory=output_directory, method=method)
241243

242244

243245
@main.command()

src/sssom/io.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
get_default_metadata,
3232
)
3333
from .context import get_converter
34-
from .parsers import get_parsing_function, parse_sssom_table, split_dataframe
34+
from .parsers import SplitMethod, get_parsing_function, parse_sssom_table, split_dataframe
3535
from .util import MappingSetDataFrame, are_params_slots, augment_metadata, raise_for_bad_path
3636
from .writers import get_writer_function, write_table, write_tables
3737

@@ -130,7 +130,9 @@ def validate_file(
130130
return validate(msdf=msdf, validation_types=validation_types, fail_on_error=fail_on_error)
131131

132132

133-
def split_file(input_path: str, output_directory: Union[str, Path]) -> None:
133+
def split_file(
134+
input_path: str, output_directory: Union[str, Path], *, method: SplitMethod | None = None
135+
) -> None:
134136
"""Split an SSSOM TSV by prefixes and relations.
135137
136138
:param input_path: The path to the input file in one of the legal formats, eg obographs,
@@ -139,7 +141,7 @@ def split_file(input_path: str, output_directory: Union[str, Path]) -> None:
139141
"""
140142
raise_for_bad_path(input_path)
141143
msdf = parse_sssom_table(input_path)
142-
splitted = split_dataframe(msdf)
144+
splitted = split_dataframe(msdf, method=method)
143145
write_tables(splitted, output_directory)
144146

145147

src/sssom/parsers.py

Lines changed: 129 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,18 @@
1111
import typing
1212
from collections import ChainMap, Counter
1313
from pathlib import Path
14-
from typing import Any, Callable, Dict, Iterable, List, Optional, TextIO, Tuple, Union, cast
14+
from typing import (
15+
Any,
16+
Callable,
17+
Dict,
18+
Iterable,
19+
List,
20+
Optional,
21+
TextIO,
22+
Tuple,
23+
Union,
24+
cast,
25+
)
1526
from xml.dom import Node, minidom
1627
from xml.dom.minidom import Document, Element
1728

@@ -20,12 +31,19 @@
2031
import pandas as pd
2132
import requests
2233
import yaml
23-
from curies import Converter
34+
from curies import Converter, ReferenceTuple
35+
from curies.dataframe import (
36+
get_df_curies_index,
37+
get_df_prefixes_index,
38+
get_filter_df_by_curies_index,
39+
get_filter_df_by_prefixes_index,
40+
)
2441
from linkml_runtime.loaders.json_loader import JSONLoader
2542
from linkml_runtime.loaders.rdflib_loader import RDFLibLoader
2643
from pandas.errors import EmptyDataError
2744
from rdflib import Graph
2845
from sssom_schema import Mapping, MappingSet
46+
from typing_extensions import Literal, TypeAlias
2947

3048
from sssom.constants import (
3149
CONFIDENCE,
@@ -976,8 +994,12 @@ def _get_mapping_set_from_df(df: pd.DataFrame, meta: Optional[MetadataType] = No
976994
return mapping_set
977995

978996

997+
SplitMethod: TypeAlias = Literal["disjoint-indexes", "dense-indexes"]
998+
999+
9791000
def split_dataframe(
9801001
msdf: MappingSetDataFrame,
1002+
method: SplitMethod | None = None,
9811003
) -> Dict[str, MappingSetDataFrame]:
9821004
"""Group the mapping set dataframe into several subdataframes by prefix.
9831005
@@ -995,6 +1017,7 @@ def split_dataframe(
9951017
subject_prefixes=subject_prefixes,
9961018
object_prefixes=object_prefixes,
9971019
relations=relations,
1020+
method=method,
9981021
)
9991022

10001023

@@ -1008,16 +1031,57 @@ def split_dataframe_by_prefix(
10081031
subject_prefixes: Iterable[str],
10091032
object_prefixes: Iterable[str],
10101033
relations: Iterable[str],
1034+
*,
1035+
method: SplitMethod | None = None,
10111036
) -> Dict[str, MappingSetDataFrame]:
10121037
"""Split a mapping set dataframe by prefix.
10131038
10141039
:param msdf: An SSSOM MappingSetDataFrame
10151040
:param subject_prefixes: a list of prefixes pertaining to the subject
10161041
:param object_prefixes: a list of prefixes pertaining to the object
10171042
:param relations: a list of relations of interest
1043+
:param method: The method for calculating splits
10181044
10191045
:returns: a dict of SSSOM data frame names to MappingSetDataFrame
10201046
"""
1047+
if method is None:
1048+
return _split_dataframe_by_prefix_old(
1049+
msdf,
1050+
subject_prefixes=subject_prefixes,
1051+
object_prefixes=object_prefixes,
1052+
relations=relations,
1053+
)
1054+
1055+
predicates: List[ReferenceTuple] = []
1056+
for relation in relations:
1057+
if reference_tuple := msdf.converter.parse_curie(relation):
1058+
predicates.append(reference_tuple)
1059+
else:
1060+
logging.warning("invalid relation CURIE for dataframe split: %s", relation)
1061+
1062+
rr = _help_split_dataframe_by_prefix(
1063+
msdf.df,
1064+
subject_prefixes=subject_prefixes,
1065+
predicates=predicates,
1066+
object_prefixes=object_prefixes,
1067+
method=method,
1068+
)
1069+
rv = {}
1070+
for (subject_prefix, relation_t, object_prefix), df in rr:
1071+
subconverter = msdf.converter.get_subconverter(
1072+
[subject_prefix, object_prefix, relation_t.prefix]
1073+
)
1074+
split = _get_split_key(subject_prefix, relation_t.identifier, object_prefix)
1075+
rv[split] = from_sssom_dataframe(df, prefix_map=subconverter, meta=msdf.metadata)
1076+
return rv
1077+
1078+
1079+
def _split_dataframe_by_prefix_old(
1080+
msdf: MappingSetDataFrame,
1081+
subject_prefixes: Iterable[str],
1082+
object_prefixes: Iterable[str],
1083+
relations: Iterable[str],
1084+
) -> Dict[str, MappingSetDataFrame]:
10211085
df = msdf.df
10221086
meta = msdf.metadata
10231087
split_to_msdf: Dict[str, MappingSetDataFrame] = {}
@@ -1049,6 +1113,69 @@ def split_dataframe_by_prefix(
10491113
return split_to_msdf
10501114

10511115

1116+
def _help_split_dataframe_by_prefix(
    df: pd.DataFrame,
    subject_prefixes: str | Iterable[str],
    predicates: curies.ReferenceTuple | Iterable[curies.ReferenceTuple],
    object_prefixes: str | Iterable[str],
    *,
    method: SplitMethod,
) -> Iterable[tuple[tuple[str, curies.ReferenceTuple, str], pd.DataFrame]]:
    """Yield the non-empty sub-dataframes for each subject/predicate/object combination.

    :param df: The dataframe to split; its ``subject_id``, ``predicate_id``,
        and ``object_id`` columns are used
    :param subject_prefixes: One subject prefix, or an iterable of them
    :param predicates: One predicate reference tuple, or an iterable of them
    :param object_prefixes: One object prefix, or an iterable of them
    :param method: Which indexing strategy to use for selecting rows

    :yields: Pairs of a ``(subject prefix, predicate, object prefix)`` key and
        the rows of ``df`` matching all three parts. Combinations that match no
        rows are not yielded.

    :raises ValueError: if ``method`` is not a known split method. Because this
        is a generator, the error is raised on first iteration, not at call time.
    """
    # Accept both a single value and an iterable for every part of the key.
    subjects = _clean_list(subject_prefixes)
    if isinstance(predicates, curies.ReferenceTuple):
        predicate_list = [predicates]
    else:
        predicate_list = list(predicates)
    objects = _clean_list(object_prefixes)

    if method == "disjoint-indexes":
        # One boolean row mask per requested subject prefix / predicate / object
        # prefix, computed up front and combined per combination below.
        subject_masks: dict[str, pd.Series[bool]] = {
            prefix: get_filter_df_by_prefixes_index(df, column="subject_id", prefixes=prefix)
            for prefix in subjects
        }
        predicate_masks: dict[ReferenceTuple, pd.Series[bool]] = {
            reference: get_filter_df_by_curies_index(
                df, column="predicate_id", curies=reference.curie
            )
            for reference in predicate_list
        }
        object_masks: dict[str, pd.Series[bool]] = {
            prefix: get_filter_df_by_prefixes_index(df, column="object_id", prefixes=prefix)
            for prefix in objects
        }
        for subject_prefix in subjects:
            for predicate in predicate_list:
                for object_prefix in objects:
                    mask = (
                        subject_masks[subject_prefix]
                        & predicate_masks[predicate]
                        & object_masks[object_prefix]
                    )
                    if mask.any():
                        yield (subject_prefix, predicate, object_prefix), df[mask]

    elif method == "dense-indexes":
        # Each index maps a prefix (or CURIE) to row positions, which are
        # intersected per combination and used for positional selection.
        subject_rows = get_df_prefixes_index(df, column="subject_id")
        predicate_rows = get_df_curies_index(df, column="predicate_id")
        object_rows = get_df_prefixes_index(df, column="object_id")
        for subject_prefix in subjects:
            for predicate in predicate_list:
                for object_prefix in objects:
                    shared = set(subject_rows.get(subject_prefix, []))
                    shared = shared.intersection(predicate_rows.get(predicate.curie, []))
                    shared = shared.intersection(object_rows.get(object_prefix, []))
                    positions: list[int] = sorted(shared)
                    if positions:
                        yield (subject_prefix, predicate, object_prefix), df.iloc[positions]

    else:
        raise ValueError(f"invalid split method: {method}")
1174+
1175+
def _clean_list(item: str | Iterable[str]) -> list[str]:
1176+
return [item] if isinstance(item, str) else list(item)
1177+
1178+
10521179
def _ensure_valid_mapping_from_dict(mdict: Dict[str, Any]) -> Optional[Mapping]:
10531180
"""Return a valid mapping object if it can be constructed, else None.
10541181

tests/test_parsers.py

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import json
55
import math
66
import os
7+
import typing
78
import unittest
89
from collections import ChainMap
910
from pathlib import Path
@@ -22,6 +23,7 @@
2223
from sssom.io import parse_file
2324
from sssom.parsers import (
2425
PARSING_FUNCTIONS,
26+
SplitMethod,
2527
from_alignment_minidom,
2628
from_obographs,
2729
from_sssom_dataframe,
@@ -551,25 +553,46 @@ def test_split_df(self) -> None:
551553
df = pd.DataFrame(rows, columns=columns)
552554
msdf = from_sssom_dataframe(df, converter)
553555

556+
sdf = pd.DataFrame(subrows, columns=columns)
557+
558+
for method in [None, *typing.get_args(SplitMethod)]:
559+
with self.subTest(method=method):
560+
self.assert_msdf(msdf, sdf, method)
561+
562+
def assert_msdf(
563+
self, msdf: MappingSetDataFrame, sdf: pd.DataFrame, method: SplitMethod | None
564+
) -> None:
565+
"""Test the dataframe."""
554566
# test that if there's ever an empty list, then it returns an empty dict
555-
self.assertFalse(split_dataframe_by_prefix(msdf, [], ["p2"], ["skos:exactMatch"]))
556-
self.assertFalse(split_dataframe_by_prefix(msdf, ["p1"], ["p2"], []))
557-
self.assertFalse(split_dataframe_by_prefix(msdf, ["p1"], [], ["skos:exactMatch"]))
567+
self.assertFalse(
568+
split_dataframe_by_prefix(msdf, [], ["p2"], ["skos:exactMatch"], method=method)
569+
)
570+
self.assertFalse(split_dataframe_by_prefix(msdf, ["p1"], ["p2"], [], method=method))
571+
self.assertFalse(
572+
split_dataframe_by_prefix(msdf, ["p1"], [], ["skos:exactMatch"], method=method)
573+
)
558574

559575
# test that missing prefixes don't result in anything
560-
self.assertFalse(split_dataframe_by_prefix(msdf, ["nope"], ["p2"], ["skos:exactMatch"]))
561-
self.assertFalse(split_dataframe_by_prefix(msdf, ["p1"], ["nope"], ["skos:exactMatch"]))
562-
self.assertFalse(split_dataframe_by_prefix(msdf, ["p1"], ["p2"], ["nope:nope"]))
576+
self.assertFalse(
577+
split_dataframe_by_prefix(msdf, ["nope"], ["p2"], ["skos:exactMatch"], method=method)
578+
)
579+
self.assertFalse(
580+
split_dataframe_by_prefix(msdf, ["p1"], ["nope"], ["skos:exactMatch"], method=method)
581+
)
582+
self.assertFalse(
583+
split_dataframe_by_prefix(msdf, ["p1"], ["p2"], ["nope:nope"], method=method)
584+
)
563585

564-
sdf = pd.DataFrame(subrows, columns=columns)
565586
# test an explicit return with only single entries
566-
rv = split_dataframe_by_prefix(msdf, ["p1"], ["p2"], ["skos:exactMatch"])
587+
rv = split_dataframe_by_prefix(msdf, ["p1"], ["p2"], ["skos:exactMatch"], method=method)
567588
self.assertEqual(1, len(rv), msg="nothing was indexed")
568589
self.assertIn("p1_exactmatch_p2", rv)
569590
self.assertEqual(sdf.values.tolist(), rv["p1_exactmatch_p2"].df.values.tolist())
570591

571592
# test an explicit return with multiple entries
572-
rv = split_dataframe_by_prefix(msdf, ["p1"], ["p2", "p3"], ["skos:exactMatch"])
593+
rv = split_dataframe_by_prefix(
594+
msdf, ["p1"], ["p2", "p3"], ["skos:exactMatch"], method=method
595+
)
573596
self.assertEqual(2, len(rv), msg="nothing was indexed")
574597
self.assertIn("p1_exactmatch_p2", rv)
575598
self.assertIn("p1_exactmatch_p3", rv)

0 commit comments

Comments
 (0)