@@ -444,33 +444,32 @@ def filter_redundant_rows(df: pd.DataFrame, ignore_predicate: bool = False) -> p
     else:
         key = [SUBJECT_ID, OBJECT_ID, PREDICATE_ID]
     dfmax: pd.DataFrame
-    dfmax = df.groupby(key, as_index=False)[CONFIDENCE].apply(max).drop_duplicates()
-    max_conf: Dict[Tuple[str, ...], float] = {}
-    for _, row in dfmax.iterrows():
+    if not df.empty:
+        dfmax = df.groupby(key, as_index=False)[CONFIDENCE].apply(max).drop_duplicates()
+        max_conf: Dict[Tuple[str, ...], float] = {}
+        for _, row in dfmax.iterrows():
+            if ignore_predicate:
+                max_conf[(row[SUBJECT_ID], row[OBJECT_ID])] = row[CONFIDENCE]
+            else:
+                max_conf[(row[SUBJECT_ID], row[OBJECT_ID], row[PREDICATE_ID])] = row[CONFIDENCE]
         if ignore_predicate:
-            max_conf[(row[SUBJECT_ID], row[OBJECT_ID])] = row[CONFIDENCE]
+            df = df[
+                df.apply(
+                    lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID])],
+                    axis=1,
+                )
+            ]
         else:
-            max_conf[(row[SUBJECT_ID], row[OBJECT_ID], row[PREDICATE_ID])] = row[CONFIDENCE]
-    if ignore_predicate:
-        df = df[
-            df.apply(
-                lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID])],
-                axis=1,
-            )
-        ]
-    else:
-        df = df[
-            df.apply(
-                lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID], x[PREDICATE_ID])],
-                axis=1,
-            )
-        ]
+            df = df[
+                df.apply(
+                    lambda x: x[CONFIDENCE]
+                    >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID], x[PREDICATE_ID])],
+                    axis=1,
+                )
+            ]
     # We are preserving confidence = NaN rows without making assumptions.
     # This means that there are potential duplicate mappings
-    # FutureWarning: The frame.append method is deprecated and
-    # will be removed from pandas in a future version.
-    # Use pandas.concat instead.
-    # return_df = df.append(nan_df).drop_duplicates()
+
     confidence_reconciled_df = pd.concat([df, nan_df]).drop_duplicates()
 
     # Reconciling dataframe rows based on the predicates with equal confidence.
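For context, here is a minimal, self-contained sketch of the two behaviours this hunk touches: skipping the per-key max-confidence filtering when the frame is empty (the new `if not df.empty:` guard), and stitching the NaN-confidence rows back on with `pd.concat` rather than the deprecated `DataFrame.append`. The toy data and the literal column names are illustrative assumptions, not taken from the repository; the real code uses the SUBJECT_ID / OBJECT_ID / PREDICATE_ID / CONFIDENCE constants.

```python
import pandas as pd

# Toy mapping frame; column names are assumed stand-ins for the SSSOM constants.
df = pd.DataFrame(
    {
        "subject_id": ["A:1", "A:1", "A:2"],
        "object_id": ["B:1", "B:1", "B:2"],
        "predicate_id": ["skos:exactMatch", "skos:exactMatch", "skos:closeMatch"],
        "confidence": [0.7, 0.9, 0.8],
    }
)

key = ["subject_id", "object_id", "predicate_id"]

# The guard added in this hunk: only compute per-key maxima and filter when there
# are rows, since group-wise aggregation on an empty frame gives nothing to filter on.
if not df.empty:
    dfmax = df.groupby(key, as_index=False)["confidence"].max()
    max_conf = {tuple(row[key]): row["confidence"] for _, row in dfmax.iterrows()}
    df = df[df.apply(lambda x: x["confidence"] >= max_conf[tuple(x[key])], axis=1)]

# DataFrame.append was deprecated and removed in pandas 2.0; pd.concat is the
# supported replacement for re-attaching the rows whose confidence was NaN.
nan_df = pd.DataFrame(columns=df.columns)  # placeholder for the NaN-confidence rows
result = pd.concat([df, nan_df]).drop_duplicates()
print(result)
```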