
Commit e0dfcb3

Addressed the situation where assign_default_confidence() returns a dataframe whose confidence values are all NaN (#548)
Ok, so here was the problem: when the dataframe whose redundant rows had to be filtered out had all `NaN` values for confidence, the line

https://github.com/mapping-commons/sssom-py/blob/550206721911f711ee678eb1a8da50591649bd04/src/sssom/util.py#L441

returned an empty dataframe as `df` and the entire source dataframe as `nan_df`. Due to this, the following line

https://github.com/mapping-commons/sssom-py/blob/550206721911f711ee678eb1a8da50591649bd04/src/sssom/util.py#L447

resulted in `dfmax = {}`, which is of type `pandas.Series` rather than the expected `pandas.DataFrame`. Hence the confusion. The correct way to handle this is simply adding an `if` statement:

https://github.com/mapping-commons/sssom-py/blob/ffa2109616020f994196cbb827d71bca17192014/src/sssom/util.py#L447-L469

I've added an explicit test and it passes. Fixes #546
1 parent edf432a commit e0dfcb3
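For illustration, here is a minimal standalone sketch of the failure mode described above. It is an approximation rather than the library code: the split performed at util.py#L441 is paraphrased with `isna`/`dropna`, the data is invented, and lowercase literal column names stand in for the library's `SUBJECT_ID`, `OBJECT_ID`, `PREDICATE_ID`, and `CONFIDENCE` constants.

```python
# Approximate repro of the pre-fix failure mode; the data is invented.
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "subject_id": ["a:1", "a:2"],
        "object_id": ["b:1", "b:2"],
        "predicate_id": ["skos:exactMatch", "skos:exactMatch"],
        "confidence": [np.nan, np.nan],  # every confidence value is NaN
    }
)

# Paraphrase of the split at util.py#L441: every row lands in nan_df...
nan_df = df[df["confidence"].isna()]
df = df.dropna(subset=["confidence"])  # ...so df is now empty.

# Pre-fix util.py#L447: per the report above, on the empty frame this
# came back as an empty pandas.Series instead of the DataFrame that the
# downstream filtering code expects.
key = ["subject_id", "object_id", "predicate_id"]
dfmax = df.groupby(key, as_index=False)["confidence"].apply(max)
print(type(dfmax), dfmax.empty)
```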

2 files changed: +29 -23 lines

src/sssom/util.py

Lines changed: 22 additions & 23 deletions
```diff
@@ -444,33 +444,32 @@ def filter_redundant_rows(df: pd.DataFrame, ignore_predicate: bool = False) -> pd.DataFrame:
     else:
         key = [SUBJECT_ID, OBJECT_ID, PREDICATE_ID]
     dfmax: pd.DataFrame
-    dfmax = df.groupby(key, as_index=False)[CONFIDENCE].apply(max).drop_duplicates()
-    max_conf: Dict[Tuple[str, ...], float] = {}
-    for _, row in dfmax.iterrows():
+    if not df.empty:
+        dfmax = df.groupby(key, as_index=False)[CONFIDENCE].apply(max).drop_duplicates()
+        max_conf: Dict[Tuple[str, ...], float] = {}
+        for _, row in dfmax.iterrows():
+            if ignore_predicate:
+                max_conf[(row[SUBJECT_ID], row[OBJECT_ID])] = row[CONFIDENCE]
+            else:
+                max_conf[(row[SUBJECT_ID], row[OBJECT_ID], row[PREDICATE_ID])] = row[CONFIDENCE]
         if ignore_predicate:
-            max_conf[(row[SUBJECT_ID], row[OBJECT_ID])] = row[CONFIDENCE]
+            df = df[
+                df.apply(
+                    lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID])],
+                    axis=1,
+                )
+            ]
         else:
-            max_conf[(row[SUBJECT_ID], row[OBJECT_ID], row[PREDICATE_ID])] = row[CONFIDENCE]
-    if ignore_predicate:
-        df = df[
-            df.apply(
-                lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID])],
-                axis=1,
-            )
-        ]
-    else:
-        df = df[
-            df.apply(
-                lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID], x[PREDICATE_ID])],
-                axis=1,
-            )
-        ]
+            df = df[
+                df.apply(
+                    lambda x: x[CONFIDENCE]
+                    >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID], x[PREDICATE_ID])],
+                    axis=1,
+                )
+            ]
     # We are preserving confidence = NaN rows without making assumptions.
     # This means that there are potential duplicate mappings
-    # FutureWarning: The frame.append method is deprecated and
-    # will be removed from pandas in a future version.
-    # Use pandas.concat instead.
-    # return_df = df.append(nan_df).drop_duplicates()
+
     confidence_reconciled_df = pd.concat([df, nan_df]).drop_duplicates()
 
     # Reconciling dataframe rows based on the predicates with equal confidence.
```
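For context, the post-fix behavior can be sketched in isolation as follows. This is a condensed sketch under stated assumptions, not the library implementation: the dict-plus-row-wise-`apply` filter from util.py is swapped for a `groupby` plus `merge`, and hypothetical literal column names replace the `SUBJECT_ID` etc. constants.

```python
# Condensed sketch of the patched control flow; hypothetical helper name
# and column names. The real logic lives in filter_redundant_rows().
import pandas as pd

def keep_max_confidence(df: pd.DataFrame) -> pd.DataFrame:
    key = ["subject_id", "object_id", "predicate_id"]
    nan_df = df[df["confidence"].isna()]  # NaN rows are preserved as-is
    df = df.dropna(subset=["confidence"])
    if not df.empty:  # the guard added by this commit: nothing to filter
        dfmax = df.groupby(key, as_index=False)["confidence"].max()
        df = df.merge(dfmax, on=key, suffixes=("", "_max"))
        df = df[df["confidence"] >= df["confidence_max"]]
        df = df.drop(columns="confidence_max")
    # With an all-NaN input, df stays empty and the concat simply restores
    # every original row, which is what the new test below asserts.
    return pd.concat([df, nan_df]).drop_duplicates()
```

With an all-NaN `confidence` column this now passes the input rows through unchanged instead of tripping over the empty groupby result, which matches the new assertion in tests/test_reconcile.py below.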

tests/test_reconcile.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -22,6 +22,13 @@ def test_filter(self):
         df2 = filter_redundant_rows(self.msdf2.df)
         self.assertEqual(18, len(df2.index))
 
+        # Create a new dataframe with the confidence column having NaN values
+        import numpy as np
+
+        self.msdf1.df["confidence"] = np.NAN
+        df3 = filter_redundant_rows(self.msdf1.df)
+        self.assertEqual(11, len(df3.index))
+
     def test_deal_with_negation(self):
         """Test handling negating returns the right number of rows."""
         df1 = deal_with_negation(self.msdf1.df)
```
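To run just the new regression test locally, something along these lines should work, assuming a development install of sssom-py with pytest available (an assumption; the project may also wrap its test runs in tox):

```python
# Run only test_filter from the test file touched by this commit.
import pytest

pytest.main(["tests/test_reconcile.py", "-k", "test_filter"])
```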
