1111import typing
1212from collections import ChainMap , Counter
1313from pathlib import Path
14- from typing import Any , Callable , Dict , Iterable , List , Optional , TextIO , Tuple , Union , cast
14+ from typing import (
15+ Any ,
16+ Callable ,
17+ Dict ,
18+ Iterable ,
19+ List ,
20+ Optional ,
21+ TextIO ,
22+ Tuple ,
23+ Union ,
24+ cast ,
25+ )
1526from xml .dom import Node , minidom
1627from xml .dom .minidom import Document , Element
1728
2031import pandas as pd
2132import requests
2233import yaml
23- from curies import Converter
34+ from curies import Converter , ReferenceTuple
35+ from curies .dataframe import (
36+ get_df_curies_index ,
37+ get_df_prefixes_index ,
38+ get_filter_df_by_curies_index ,
39+ get_filter_df_by_prefixes_index ,
40+ )
2441from linkml_runtime .loaders .json_loader import JSONLoader
2542from linkml_runtime .loaders .rdflib_loader import RDFLibLoader
2643from pandas .errors import EmptyDataError
2744from rdflib import Graph
2845from sssom_schema import Mapping , MappingSet
46+ from typing_extensions import Literal , TypeAlias
2947
3048from sssom .constants import (
3149 CONFIDENCE ,
@@ -976,8 +994,12 @@ def _get_mapping_set_from_df(df: pd.DataFrame, meta: Optional[MetadataType] = No
976994 return mapping_set
977995
978996
# Names of the split strategies accepted through the ``method`` argument of the
# dataframe-splitting functions in this module. "disjoint-indexes" combines one
# boolean row mask per subject prefix / predicate / object prefix, while
# "dense-indexes" intersects lists of row positions grouped by prefix or CURIE.
# Passing ``None`` instead of one of these names makes split_dataframe_by_prefix
# delegate to _split_dataframe_by_prefix_old.
997+ SplitMethod : TypeAlias = Literal ["disjoint-indexes" , "dense-indexes" ]
998+
999+
9791000def split_dataframe (
9801001 msdf : MappingSetDataFrame ,
1002+ method : SplitMethod | None = None ,
9811003) -> Dict [str , MappingSetDataFrame ]:
9821004 """Group the mapping set dataframe into several subdataframes by prefix.
9831005
@@ -995,6 +1017,7 @@ def split_dataframe(
9951017 subject_prefixes = subject_prefixes ,
9961018 object_prefixes = object_prefixes ,
9971019 relations = relations ,
1020+ method = method ,
9981021 )
9991022
10001023
@@ -1008,16 +1031,57 @@ def split_dataframe_by_prefix(
10081031 subject_prefixes : Iterable [str ],
10091032 object_prefixes : Iterable [str ],
10101033 relations : Iterable [str ],
1034+ * ,
1035+ method : SplitMethod | None = None ,
10111036) -> Dict [str , MappingSetDataFrame ]:
10121037 """Split a mapping set dataframe by prefix.
10131038
10141039 :param msdf: An SSSOM MappingSetDataFrame
10151040 :param subject_prefixes: a list of prefixes pertaining to the subject
10161041 :param object_prefixes: a list of prefixes pertaining to the object
10171042 :param relations: a list of relations of interest
1043+ :param method: The method for calculating splits
10181044
10191045 :returns: a dict of SSSOM data frame names to MappingSetDataFrame
10201046 """
1047+ if method is None :
1048+ return _split_dataframe_by_prefix_old (
1049+ msdf ,
1050+ subject_prefixes = subject_prefixes ,
1051+ object_prefixes = object_prefixes ,
1052+ relations = relations ,
1053+ )
1054+
1055+ predicates : List [ReferenceTuple ] = []
1056+ for relation in relations :
1057+ if reference_tuple := msdf .converter .parse_curie (relation ):
1058+ predicates .append (reference_tuple )
1059+ else :
1060+ logging .warning ("invalid relation CURIE for dataframe split: %s" , relation )
1061+
1062+ rr = _help_split_dataframe_by_prefix (
1063+ msdf .df ,
1064+ subject_prefixes = subject_prefixes ,
1065+ predicates = predicates ,
1066+ object_prefixes = object_prefixes ,
1067+ method = method ,
1068+ )
1069+ rv = {}
1070+ for (subject_prefix , relation_t , object_prefix ), df in rr :
1071+ subconverter = msdf .converter .get_subconverter (
1072+ [subject_prefix , object_prefix , relation_t .prefix ]
1073+ )
1074+ split = _get_split_key (subject_prefix , relation_t .identifier , object_prefix )
1075+ rv [split ] = from_sssom_dataframe (df , prefix_map = subconverter , meta = msdf .metadata )
1076+ return rv
1077+
1078+
1079+ def _split_dataframe_by_prefix_old (
1080+ msdf : MappingSetDataFrame ,
1081+ subject_prefixes : Iterable [str ],
1082+ object_prefixes : Iterable [str ],
1083+ relations : Iterable [str ],
1084+ ) -> Dict [str , MappingSetDataFrame ]:
10211085 df = msdf .df
10221086 meta = msdf .metadata
10231087 split_to_msdf : Dict [str , MappingSetDataFrame ] = {}
@@ -1049,6 +1113,69 @@ def split_dataframe_by_prefix(
10491113 return split_to_msdf
10501114
10511115
def _help_split_dataframe_by_prefix(
    df: pd.DataFrame,
    subject_prefixes: str | Iterable[str],
    predicates: curies.ReferenceTuple | Iterable[curies.ReferenceTuple],
    object_prefixes: str | Iterable[str],
    *,
    method: SplitMethod,
) -> Iterable[tuple[tuple[str, curies.ReferenceTuple, str], pd.DataFrame]]:
    """Yield the non-empty slice of a dataframe for every subject/predicate/object combination.

    Each of the three selectors may be given as a single value or as an iterable of
    values. Every combination from their Cartesian product is tried, and combinations
    that select no rows are skipped.

    :param df: The dataframe to slice; its ``subject_id``, ``predicate_id`` and
        ``object_id`` columns are the ones consulted
    :param subject_prefixes: One prefix or several prefixes to look for in ``subject_id``
    :param predicates: One reference tuple or several reference tuples whose ``curie``
        is looked for in ``predicate_id``
    :param object_prefixes: One prefix or several prefixes to look for in ``object_id``
    :param method: The strategy used to find rows. ``"disjoint-indexes"`` builds one
        boolean mask per selector value and combines them with ``&``;
        ``"dense-indexes"`` builds one index per column and intersects the row
        positions it lists for each selector value

    :returns: an iterable of ``((subject_prefix, predicate, object_prefix), sub_df)`` pairs
    :raises ValueError: if ``method`` is not one of the two known strategies. Since this
        is a generator function, the error only surfaces once iteration starts.
    """
    # Normalize all three selectors to lists up front, in the same order as they are declared.
    subjects = _clean_list(subject_prefixes)
    if isinstance(predicates, curies.ReferenceTuple):
        predicate_list = [predicates]
    else:
        predicate_list = list(predicates)
    objects = _clean_list(object_prefixes)

    if method == "disjoint-indexes":
        # One row mask per individual selector value, computed once and reused
        # across every combination below.
        subject_masks = {}
        for prefix in subjects:
            subject_masks[prefix] = get_filter_df_by_prefixes_index(
                df, column="subject_id", prefixes=prefix
            )
        predicate_masks = {}
        for reference in predicate_list:
            predicate_masks[reference] = get_filter_df_by_curies_index(
                df, column="predicate_id", curies=reference.curie
            )
        object_masks = {}
        for prefix in objects:
            object_masks[prefix] = get_filter_df_by_prefixes_index(
                df, column="object_id", prefixes=prefix
            )

        for s_prefix, reference, o_prefix in itt.product(subjects, predicate_list, objects):
            mask = subject_masks[s_prefix] & predicate_masks[reference] & object_masks[o_prefix]
            if mask.any():
                yield (s_prefix, reference, o_prefix), df[mask]
        return

    if method == "dense-indexes":
        # One lookup per column, mapping a prefix (or CURIE) to the row positions
        # carrying it; a combination's rows are the positions common to all three.
        rows_by_subject = get_df_prefixes_index(df, column="subject_id")
        rows_by_predicate = get_df_curies_index(df, column="predicate_id")
        rows_by_object = get_df_prefixes_index(df, column="object_id")

        for s_prefix, reference, o_prefix in itt.product(subjects, predicate_list, objects):
            shared = set(rows_by_subject.get(s_prefix, [])).intersection(
                rows_by_predicate.get(reference.curie, []),
                rows_by_object.get(o_prefix, []),
            )
            if shared:
                # Sort so the selected rows keep their original relative order.
                yield (s_prefix, reference, o_prefix), df.iloc[sorted(shared)]
        return

    raise ValueError(f"invalid split method: { method } ")
1173+
1174+
1175+ def _clean_list (item : str | Iterable [str ]) -> list [str ]:
1176+ return [item ] if isinstance (item , str ) else list (item )
1177+
1178+
10521179def _ensure_valid_mapping_from_dict (mdict : Dict [str , Any ]) -> Optional [Mapping ]:
10531180 """Return a valid mapping object if it can be constructed, else None.
10541181
0 commit comments