
Commit e0dfcb3

Addressed the situation where assign_default_confidence() returns a dataframe whose confidence values are all NaN (#548)
Ok, so here was the problem: when the dataframe whose redundant rows had to be filtered out had all `NaN` values for confidence, the line

https://github.com/mapping-commons/sssom-py/blob/550206721911f711ee678eb1a8da50591649bd04/src/sssom/util.py#L441

returned an empty dataframe as `df` and the entire source dataframe as `nan_df`. Due to this, the following line

https://github.com/mapping-commons/sssom-py/blob/550206721911f711ee678eb1a8da50591649bd04/src/sssom/util.py#L447

resulted in `dfmax = {}`, which is of type `pandas.Series` rather than the expected `pandas.DataFrame`. Hence the confusion. The correct way to handle this is simply adding an `if` statement:

https://github.com/mapping-commons/sssom-py/blob/ffa2109616020f994196cbb827d71bca17192014/src/sssom/util.py#L447-L469

I've added an explicit test and it passes. Fixes #546
1 parent edf432a commit e0dfcb3
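For illustration, here is a minimal standalone sketch of the failure mode described above. It is an approximation rather than the library code: the split performed at util.py#L441 is paraphrased with `isna`/`dropna`, the data is invented, and lowercase literal column names stand in for the library's `SUBJECT_ID`, `OBJECT_ID`, `PREDICATE_ID`, and `CONFIDENCE` constants.

```python
# Approximate repro of the pre-fix failure mode; the data is invented.
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "subject_id": ["a:1", "a:2"],
        "object_id": ["b:1", "b:2"],
        "predicate_id": ["skos:exactMatch", "skos:exactMatch"],
        "confidence": [np.nan, np.nan],  # every confidence value is NaN
    }
)

# Paraphrase of the split at util.py#L441: every row lands in nan_df...
nan_df = df[df["confidence"].isna()]
df = df.dropna(subset=["confidence"])  # ...so df is now empty.

# Pre-fix util.py#L447: per the report above, on the empty frame this
# came back as an empty pandas.Series instead of the DataFrame that the
# downstream filtering code expects.
key = ["subject_id", "object_id", "predicate_id"]
dfmax = df.groupby(key, as_index=False)["confidence"].apply(max)
print(type(dfmax), dfmax.empty)
```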

2 files changed: +29 -23 lines

src/sssom/util.py

Lines changed: 22 additions & 23 deletions
```diff
@@ -444,33 +444,32 @@ def filter_redundant_rows(df: pd.DataFrame, ignore_predicate: bool = False) -> pd.DataFrame:
     else:
         key = [SUBJECT_ID, OBJECT_ID, PREDICATE_ID]
     dfmax: pd.DataFrame
-    dfmax = df.groupby(key, as_index=False)[CONFIDENCE].apply(max).drop_duplicates()
-    max_conf: Dict[Tuple[str, ...], float] = {}
-    for _, row in dfmax.iterrows():
+    if not df.empty:
+        dfmax = df.groupby(key, as_index=False)[CONFIDENCE].apply(max).drop_duplicates()
+        max_conf: Dict[Tuple[str, ...], float] = {}
+        for _, row in dfmax.iterrows():
+            if ignore_predicate:
+                max_conf[(row[SUBJECT_ID], row[OBJECT_ID])] = row[CONFIDENCE]
+            else:
+                max_conf[(row[SUBJECT_ID], row[OBJECT_ID], row[PREDICATE_ID])] = row[CONFIDENCE]
         if ignore_predicate:
-            max_conf[(row[SUBJECT_ID], row[OBJECT_ID])] = row[CONFIDENCE]
+            df = df[
+                df.apply(
+                    lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID])],
+                    axis=1,
+                )
+            ]
         else:
-            max_conf[(row[SUBJECT_ID], row[OBJECT_ID], row[PREDICATE_ID])] = row[CONFIDENCE]
-    if ignore_predicate:
-        df = df[
-            df.apply(
-                lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID])],
-                axis=1,
-            )
-        ]
-    else:
-        df = df[
-            df.apply(
-                lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID], x[PREDICATE_ID])],
-                axis=1,
-            )
-        ]
+            df = df[
+                df.apply(
+                    lambda x: x[CONFIDENCE]
+                    >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID], x[PREDICATE_ID])],
+                    axis=1,
+                )
+            ]
     # We are preserving confidence = NaN rows without making assumptions.
     # This means that there are potential duplicate mappings
-    # FutureWarning: The frame.append method is deprecated and
-    # will be removed from pandas in a future version.
-    # Use pandas.concat instead.
-    # return_df = df.append(nan_df).drop_duplicates()
+
     confidence_reconciled_df = pd.concat([df, nan_df]).drop_duplicates()
 
     # Reconciling dataframe rows based on the predicates with equal confidence.
```
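For context, the post-fix behavior can be sketched in isolation as follows. This is a condensed sketch under stated assumptions, not the library implementation: the dict-plus-row-wise-`apply` filter from util.py is swapped for a `groupby` plus `merge`, and hypothetical literal column names replace the `SUBJECT_ID` etc. constants.

```python
# Condensed sketch of the patched control flow; hypothetical helper name
# and column names. The real logic lives in filter_redundant_rows().
import pandas as pd

def keep_max_confidence(df: pd.DataFrame) -> pd.DataFrame:
    key = ["subject_id", "object_id", "predicate_id"]
    nan_df = df[df["confidence"].isna()]  # NaN rows are preserved as-is
    df = df.dropna(subset=["confidence"])
    if not df.empty:  # the guard added by this commit: nothing to filter
        dfmax = df.groupby(key, as_index=False)["confidence"].max()
        df = df.merge(dfmax, on=key, suffixes=("", "_max"))
        df = df[df["confidence"] >= df["confidence_max"]]
        df = df.drop(columns="confidence_max")
    # With an all-NaN input, df stays empty and the concat simply restores
    # every original row, which is what the new test below asserts.
    return pd.concat([df, nan_df]).drop_duplicates()
```

With an all-NaN `confidence` column this now passes the input rows through unchanged instead of tripping over the empty groupby result, which matches the new assertion in tests/test_reconcile.py below.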

tests/test_reconcile.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -22,6 +22,13 @@ def test_filter(self):
         df2 = filter_redundant_rows(self.msdf2.df)
         self.assertEqual(18, len(df2.index))
 
+        # Create a new dataframe with the confidence column having NaN values
+        import numpy as np
+
+        self.msdf1.df["confidence"] = np.NAN
+        df3 = filter_redundant_rows(self.msdf1.df)
+        self.assertEqual(11, len(df3.index))
+
     def test_deal_with_negation(self):
         """Test handling negating returns the right number of rows."""
         df1 = deal_with_negation(self.msdf1.df)
```
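To run just the new regression test locally, something along these lines should work, assuming a development install of sssom-py with pytest available (an assumption; the project may also wrap its test runs in tox):

```python
# Run only test_filter from the test file touched by this commit.
import pytest

pytest.main(["tests/test_reconcile.py", "-k", "test_filter"])
```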
