From 75cb6a16ab858802d091285411e718604590ad81 Mon Sep 17 00:00:00 2001 From: Jochen Sieg Date: Fri, 4 Jul 2025 13:31:19 +0200 Subject: [PATCH 1/2] bug unittest for transform/fit_transform with error handling --- tests/test_pipeline.py | 93 +++++++++++++++++++++++++++++++----------- 1 file changed, 70 insertions(+), 23 deletions(-) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 84eb6ae4..33ed17e0 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -54,7 +54,7 @@ def test_fit_transform_single_core(self) -> None: [ ("smi2mol", smi2mol), ("morgan", mol2morgan), - ] + ], ) # Run pipeline @@ -73,11 +73,11 @@ def test_sklearn_pipeline(self) -> None: ("smi2mol", smi2mol), ("morgan", mol2morgan), ("decision_tree", d_tree), - ] + ], ) s_pipeline.fit(TEST_SMILES, CONTAINS_OX) predicted_value_array = s_pipeline.predict(TEST_SMILES) - for pred_val, true_val in zip(predicted_value_array, CONTAINS_OX): + for pred_val, true_val in zip(predicted_value_array, CONTAINS_OX, strict=False): self.assertEqual(pred_val, true_val) def test_sklearn_pipeline_parallel(self) -> None: @@ -96,7 +96,7 @@ def test_sklearn_pipeline_parallel(self) -> None: s_pipeline.fit(TEST_SMILES, CONTAINS_OX) out = s_pipeline.predict(TEST_SMILES) self.assertEqual(len(out), len(CONTAINS_OX)) - for pred_val, true_val in zip(out, CONTAINS_OX): + for pred_val, true_val in zip(out, CONTAINS_OX, strict=False): self.assertEqual(pred_val, true_val) def test_salt_removal(self) -> None: @@ -119,16 +119,18 @@ def test_salt_removal(self) -> None: ("empty_mol_filter", empty_mol_filter), ("remove_charge", remove_charge), ("mol2smi", mol2smi), - ] + ], ) - generated_smiles = salt_remover_pipeline.transform(smiles_with_salt_list) + generated_smiles_list = salt_remover_pipeline.transform(smiles_with_salt_list) for generated_smiles, smiles_without_salt in zip( - generated_smiles, smiles_without_salt_list + generated_smiles_list, + smiles_without_salt_list, + strict=False, ): self.assertEqual(generated_smiles, smiles_without_salt) def test_json_generation(self) -> None: - """Test that the json representation of a pipeline can be loaded back into a pipeline.""" + """Test json representation of a pipeline can be loaded back into a pipeline.""" # Create pipeline smi2mol = SmilesToMol() metal_disconnector = MetalDisconnector() @@ -146,7 +148,7 @@ def test_json_generation(self) -> None: ("metal_disconnector", metal_disconnector), ("salt_remover", salt_remover), ("physchem", physchem), - ] + ], ) # Convert pipeline to json @@ -156,7 +158,9 @@ def test_json_generation(self) -> None: self.assertTrue(isinstance(loaded_pipeline, Pipeline)) # Compare pipeline elements for loaded_element, original_element in zip( - loaded_pipeline.steps, pipeline_element_list + loaded_pipeline.steps, + pipeline_element_list, + strict=False, ): if loaded_element[1] == "passthrough": self.assertEqual(loaded_element[1], original_element) @@ -176,7 +180,7 @@ def test_fit_transform_record_remove_nones(self) -> None: mol2morgan = MolToMorganFP(radius=FP_RADIUS, n_bits=FP_SIZE) empty_mol_filter = EmptyMoleculeFilter() remove_none = ErrorFilter.from_element_list( - [smi2mol, salt_remover, mol2morgan, empty_mol_filter] + [smi2mol, salt_remover, mol2morgan, empty_mol_filter], ) # Create pipeline pipeline = Pipeline( @@ -191,13 +195,16 @@ def test_fit_transform_record_remove_nones(self) -> None: # Run pipeline matrix = pipeline.fit_transform(TEST_SMILES + FAULTY_TEST_SMILES) - # Compare with expected output (Which is the same as the output without the faulty smiles) + # Compare with expected output (Which is the same as the output without the + # faulty smiles) self.assertTrue(are_equal(EXPECTED_OUTPUT, matrix)) def test_caching(self) -> None: - """Test if the caching gives the same results and is faster on the second run.""" + """Test caching gives the same results and is faster on the second run.""" molecule_net_logd_df = pd.read_csv( - TEST_DATA_DIR / "molecule_net_logd.tsv.gz", sep="\t", nrows=20 + TEST_DATA_DIR / "molecule_net_logd.tsv.gz", + sep="\t", + nrows=20, ) prediction_list = [] for cache_activated in [False, True]: @@ -235,7 +242,8 @@ def test_caching(self) -> None: n_transformations = pipeline.named_steps["mol2concat"].n_transformations if cache_activated: - # Fit is called twice, but the transform is only called once, since the second run is cached + # Fit is called twice, but the transform is only called once, + # since the second run is cached self.assertEqual(n_transformations, 1) else: self.assertEqual(n_transformations, 2) @@ -244,6 +252,39 @@ def test_caching(self) -> None: for pred1, pred2 in combinations(prediction_list, 2): self.assertTrue(np.allclose(pred1, pred2)) + def test_transform_and_fit_transform_produce_same_results_with_error_handling( + self, + ) -> None: + """Test transform/fit_transform produce the same results with error handling. + + There was an issue when the descriptor calculation fails for a molecule, then + the fit_transform method would be executed correctly, but the transform method + fails, because the Reinserter will be applied before the assembly step, leading + to shape mismatches in the numpy matrix construction. + """ + error_filter = ErrorFilter(filter_everything=True) + error_replacer = FilterReinserter.from_error_filter(error_filter, np.nan) + + pipeline = Pipeline( + [ + ("smi2mol", SmilesToMol()), + ("physchem", MolToRDKitPhysChem(standardizer=None)), + ("error_filter", error_filter), + ("error_replacer", error_replacer), + ], + ) + + smiles = [ + "CCC", + "C1=NC(N)=[Se]=C1", # fails physchem calculation + ] + + # Run pipeline + matrix_fit_transform = pipeline.fit_transform(smiles) + matrix_transform = pipeline.transform(smiles) + + self.assertTrue(are_equal(matrix_fit_transform, matrix_transform)) + class PipelineCompatibilityTest(unittest.TestCase): """Test if the pipeline is compatible with other sklearn functionalities.""" @@ -263,7 +304,7 @@ def test_gridsearchcv(self) -> None: "physchem__descriptor_list": [ ["HeavyAtomMolWt"], ["HeavyAtomMolWt", "HeavyAtomCount"], - ] + ], }, }, ] @@ -273,7 +314,8 @@ def test_gridsearchcv(self) -> None: element = test_data_dict["element"] param_grid = test_data_dict["param_grid"] - # set up a pipeline that trains a random forest classifier on morgan fingerprints + # set up a pipeline that trains a random forest classifier on morgan + # fingerprints pipeline = Pipeline( [ ("auto2mol", AutoToMol()), @@ -307,13 +349,15 @@ def test_gridsearchcv(self) -> None: self.assertIn(grid_search_cv.best_params_[k], value) def test_gridsearch_cache(self) -> None: - """Run a short GridSearchCV and check if the caching and not caching gives the same results.""" + """Check GridSearchCV gives same results with caching and without caching.""" h_params = { "rf__n_estimators": [1, 2], } # First without caching data_df = pd.read_csv( - TEST_DATA_DIR / "molecule_net_logd.tsv.gz", sep="\t", nrows=20 + TEST_DATA_DIR / "molecule_net_logd.tsv.gz", + sep="\t", + nrows=20, ) best_param_dict = {} prediction_dict = {} @@ -339,7 +383,7 @@ def test_gridsearch_cache(self) -> None: grid_search_cv.fit(data_df["smiles"].tolist(), data_df["exp"].tolist()) best_param_dict[cache_activated] = grid_search_cv.best_params_ prediction_dict[cache_activated] = grid_search_cv.predict( - data_df["smiles"].tolist() + data_df["smiles"].tolist(), ) mem.clear(warn=False) self.assertEqual(best_param_dict[True], best_param_dict[False]) @@ -360,13 +404,16 @@ def test_calibrated_classifier(self) -> None: ( "error_replacer", PostPredictionWrapper( - FilterReinserter.from_error_filter(error_filter, np.nan) + FilterReinserter.from_error_filter(error_filter, np.nan), ), ), - ] + ], ) calibrated_pipeline = CalibratedClassifierCV( - s_pipeline, cv=2, ensemble=True, method="isotonic" + s_pipeline, + cv=2, + ensemble=True, + method="isotonic", ) calibrated_pipeline.fit(TEST_SMILES, CONTAINS_OX) predicted_value_array = calibrated_pipeline.predict(TEST_SMILES) From d898d5e2903b12fa7c01d4a204e1f1d0f8c49342 Mon Sep 17 00:00:00 2001 From: Jochen Sieg Date: Fri, 4 Jul 2025 13:34:37 +0200 Subject: [PATCH 2/2] change zip's strict to True --- tests/test_pipeline.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 33ed17e0..27e3466e 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -77,7 +77,7 @@ def test_sklearn_pipeline(self) -> None: ) s_pipeline.fit(TEST_SMILES, CONTAINS_OX) predicted_value_array = s_pipeline.predict(TEST_SMILES) - for pred_val, true_val in zip(predicted_value_array, CONTAINS_OX, strict=False): + for pred_val, true_val in zip(predicted_value_array, CONTAINS_OX, strict=True): self.assertEqual(pred_val, true_val) def test_sklearn_pipeline_parallel(self) -> None: @@ -96,7 +96,7 @@ def test_sklearn_pipeline_parallel(self) -> None: s_pipeline.fit(TEST_SMILES, CONTAINS_OX) out = s_pipeline.predict(TEST_SMILES) self.assertEqual(len(out), len(CONTAINS_OX)) - for pred_val, true_val in zip(out, CONTAINS_OX, strict=False): + for pred_val, true_val in zip(out, CONTAINS_OX, strict=True): self.assertEqual(pred_val, true_val) def test_salt_removal(self) -> None: @@ -125,7 +125,7 @@ def test_salt_removal(self) -> None: for generated_smiles, smiles_without_salt in zip( generated_smiles_list, smiles_without_salt_list, - strict=False, + strict=True, ): self.assertEqual(generated_smiles, smiles_without_salt) @@ -160,7 +160,7 @@ def test_json_generation(self) -> None: for loaded_element, original_element in zip( loaded_pipeline.steps, pipeline_element_list, - strict=False, + strict=True, ): if loaded_element[1] == "passthrough": self.assertEqual(loaded_element[1], original_element)