From 5121800c66c20b3b2a4c2620773d00ae551845bf Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Tue, 24 Oct 2023 10:40:45 +0900 Subject: [PATCH 01/16] Fixed an error when there were many missing bool columns in the input data Signed-off-by: tashiro akira --- .../preprocessing_templates/fillna-type-string.py.jinja | 2 ++ .../preprocessing_templates/fillna-type-string_predict.py.jinja | 1 + .../preprocessing_templates/fillna-type-string_train.py.jinja | 1 + 3 files changed, 4 insertions(+) diff --git a/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja b/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja index 5a8da92..5d0787b 100644 --- a/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja +++ b/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja @@ -9,6 +9,8 @@ simple_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent') {% endif %} {% if cols_almost_missing_string %} STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }} +{{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str) +{{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str) {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('') {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('') {% endif %} \ No newline at end of file diff --git a/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja b/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja index ef58ba7..472ec11 100644 --- a/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja +++ b/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja @@ -7,5 +7,6 @@ STRING_COLS_WITH_MISSING_VALUES = {{ columns }} {% endif %} {% if cols_almost_missing_string %} STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }} +{{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str) {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('') {% endif %} \ No newline at end of file diff --git a/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja b/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja index 404804b..81d5621 100644 --- a/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja +++ b/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja @@ -11,5 +11,6 @@ with open('simpleimputer-string.pkl', 'wb') as f: {% endif %} {% if cols_almost_missing_string %} STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }} +{{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str) {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('') {% endif %} \ No newline at end of file From b684d5c5ade4d1b8dd8c3492a46c4293f518d96c Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Tue, 7 Nov 2023 18:10:21 +0900 Subject: [PATCH 02/16] fix:Add action to return column names Signed-off-by: tashiro akira --- sapientml_core/generator.py | 10 +++++++++- .../other_templates/return_column_name.py.jinja | 6 ++++++ sapientml_core/templates/pipeline_test.py.jinja | 9 ++++++--- 3 files changed, 21 insertions(+), 4 deletions(-) create mode 100644 sapientml_core/templates/other_templates/return_column_name.py.jinja diff --git a/sapientml_core/generator.py b/sapientml_core/generator.py index 1a141e5..ec9488b 100644 --- a/sapientml_core/generator.py +++ b/sapientml_core/generator.py @@ -32,6 +32,7 @@ from sapientml.params import Code, Dataset, PipelineResult, RunningResult, Task from sapientml.util.json_util import JSONEncoder from sapientml.util.logging import setup_logger +from jinja2 import Environment, FileSystemLoader from tqdm import tqdm from . import internal_path @@ -42,7 +43,7 @@ from .training import project_corpus logger = setup_logger() - +env = Environment(loader=FileSystemLoader(f"{os.path.dirname(__file__)}/templates"), trim_blocks=True) def add_prefix(filename, prefix): """Add prefix to filename if prefix exists. @@ -216,11 +217,18 @@ def generate_pipeline(self, dataset: Dataset, task: Task): dataset, preprocess_block = self.preprocess.generate_code(dataset, task) code_block = loaddata_block + preprocess_block dataset, sapientml_results = self.generate_code(dataset, task) + tpl_return_column_name = env.get_template("other_templates/return_column_name.py.jinja") + code_return_column_name = tpl_return_column_name.render() + result_pipelines: list[Code] = [] for pipeline in sapientml_results: pipeline.validation = code_block.validation + pipeline.validation pipeline.test = code_block.test + pipeline.test + if "cols_has_symbols" in pipeline.test: + addindex = pipeline.test.index("# OUTPUT PREDICTION") + pipeline.test = pipeline.test[:addindex-1] + code_return_column_name + pipeline.test[addindex-1:] + pipeline.train = code_block.train + pipeline.train pipeline.predict = code_block.predict + pipeline.predict result_pipelines.append(pipeline) diff --git a/sapientml_core/templates/other_templates/return_column_name.py.jinja b/sapientml_core/templates/other_templates/return_column_name.py.jinja new file mode 100644 index 0000000..39fb405 --- /dev/null +++ b/sapientml_core/templates/other_templates/return_column_name.py.jinja @@ -0,0 +1,6 @@ +# Undo special symbols for column names +feature_train = feature_train.rename(columns={TARGET_COLUMNS[0]: cols_has_symbols}) +target_train = target_train.rename(columns={TARGET_COLUMNS[0]: cols_has_symbols}) +TARGET_COLUMNS = cols_has_symbols + + diff --git a/sapientml_core/templates/pipeline_test.py.jinja b/sapientml_core/templates/pipeline_test.py.jinja index e2c6cbb..8be348f 100644 --- a/sapientml_core/templates/pipeline_test.py.jinja +++ b/sapientml_core/templates/pipeline_test.py.jinja @@ -64,14 +64,17 @@ if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()): {{ pipeline_json['confusion_matrix']['code'] }} {% endfilter %} {% endif %} +{% if 'permutation_importance' in pipeline_json %} + +{{ pipeline_json['permutation_importance']['code'] }} +{% endif %} + + {% if 'output_prediction' in pipeline_json %} {{ pipeline_json['output_prediction']['code'] }} {% endif %} -{% if 'permutation_importance' in pipeline_json %} -{{ pipeline_json['permutation_importance']['code'] }} -{% endif %} {% if 'shap' in pipeline_json %} From 5572a5484dd6f616cd3bd75f3843df82ffb1fd19 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Tue, 14 Nov 2023 16:25:29 +0900 Subject: [PATCH 03/16] fix:Reflects the findings of the review Signed-off-by: tashiro akira --- sapientml_core/generator.py | 6 +++--- .../templates/other_templates/return_column_name.py.jinja | 4 +--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/sapientml_core/generator.py b/sapientml_core/generator.py index ec9488b..ca5557f 100644 --- a/sapientml_core/generator.py +++ b/sapientml_core/generator.py @@ -26,13 +26,13 @@ from shutil import copyfile from typing import Tuple, Union +from jinja2 import Environment, FileSystemLoader from sapientml.executor import PipelineExecutor from sapientml.generator import CodeBlockGenerator, PipelineGenerator from sapientml.macros import metric_lower_is_better from sapientml.params import Code, Dataset, PipelineResult, RunningResult, Task from sapientml.util.json_util import JSONEncoder from sapientml.util.logging import setup_logger -from jinja2 import Environment, FileSystemLoader from tqdm import tqdm from . import internal_path @@ -45,6 +45,7 @@ logger = setup_logger() env = Environment(loader=FileSystemLoader(f"{os.path.dirname(__file__)}/templates"), trim_blocks=True) + def add_prefix(filename, prefix): """Add prefix to filename if prefix exists. @@ -220,14 +221,13 @@ def generate_pipeline(self, dataset: Dataset, task: Task): tpl_return_column_name = env.get_template("other_templates/return_column_name.py.jinja") code_return_column_name = tpl_return_column_name.render() - result_pipelines: list[Code] = [] for pipeline in sapientml_results: pipeline.validation = code_block.validation + pipeline.validation pipeline.test = code_block.test + pipeline.test if "cols_has_symbols" in pipeline.test: addindex = pipeline.test.index("# OUTPUT PREDICTION") - pipeline.test = pipeline.test[:addindex-1] + code_return_column_name + pipeline.test[addindex-1:] + pipeline.test = pipeline.test[: addindex - 1] + code_return_column_name + pipeline.test[addindex - 1 :] pipeline.train = code_block.train + pipeline.train pipeline.predict = code_block.predict + pipeline.predict diff --git a/sapientml_core/templates/other_templates/return_column_name.py.jinja b/sapientml_core/templates/other_templates/return_column_name.py.jinja index 39fb405..4212954 100644 --- a/sapientml_core/templates/other_templates/return_column_name.py.jinja +++ b/sapientml_core/templates/other_templates/return_column_name.py.jinja @@ -1,6 +1,4 @@ -# Undo special symbols for column names -feature_train = feature_train.rename(columns={TARGET_COLUMNS[0]: cols_has_symbols}) -target_train = target_train.rename(columns={TARGET_COLUMNS[0]: cols_has_symbols}) +# Undo special symbols for column names TARGET_COLUMNS = cols_has_symbols From 334f69031cf14f3ebfe3fb844cc5765b4318b52a Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Mon, 20 Nov 2023 16:37:01 +0900 Subject: [PATCH 04/16] fix:Fixed an error with mixed target column types. Signed-off-by: tashiro akira --- .../templates/other_templates/target_separation_test.py.jinja | 2 +- .../other_templates/target_separation_train.py.jinja | 2 +- .../other_templates/target_separation_validation.py.jinja | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sapientml_core/templates/other_templates/target_separation_test.py.jinja b/sapientml_core/templates/other_templates/target_separation_test.py.jinja index 55e7101..caae186 100644 --- a/sapientml_core/templates/other_templates/target_separation_test.py.jinja +++ b/sapientml_core/templates/other_templates/target_separation_test.py.jinja @@ -1,7 +1,7 @@ # DETACH TARGET TARGET_COLUMNS = {{ pipeline.task.target_columns }} feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1) -target_train = train_dataset[TARGET_COLUMNS].copy() +target_train = train_dataset[TARGET_COLUMNS].copy().astype(str) if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()): feature_test = test_dataset.drop(TARGET_COLUMNS, axis=1) target_test = test_dataset[TARGET_COLUMNS].copy() diff --git a/sapientml_core/templates/other_templates/target_separation_train.py.jinja b/sapientml_core/templates/other_templates/target_separation_train.py.jinja index 525f82f..6a24da1 100644 --- a/sapientml_core/templates/other_templates/target_separation_train.py.jinja +++ b/sapientml_core/templates/other_templates/target_separation_train.py.jinja @@ -1,4 +1,4 @@ # DETACH TARGET TARGET_COLUMNS = {{ pipeline.task.target_columns }} feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1) -target_train = train_dataset[TARGET_COLUMNS].copy() \ No newline at end of file +target_train = train_dataset[TARGET_COLUMNS].copy().astype(str) \ No newline at end of file diff --git a/sapientml_core/templates/other_templates/target_separation_validation.py.jinja b/sapientml_core/templates/other_templates/target_separation_validation.py.jinja index 7b68025..1e0750f 100644 --- a/sapientml_core/templates/other_templates/target_separation_validation.py.jinja +++ b/sapientml_core/templates/other_templates/target_separation_validation.py.jinja @@ -1,7 +1,7 @@ # DETACH TARGET TARGET_COLUMNS = {{ pipeline.task.target_columns }} -feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1) -target_train = train_dataset[TARGET_COLUMNS].copy() +feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1).astype(str) +target_train = train_dataset[TARGET_COLUMNS].copy().astype(str) feature_test = test_dataset.drop(TARGET_COLUMNS, axis=1) target_test = test_dataset[TARGET_COLUMNS].copy() From 83438c0304d1fa7efdd239ab7d6402f9c3a864f9 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Tue, 21 Nov 2023 09:32:26 +0900 Subject: [PATCH 05/16] fix: Return changes to move modifications to another branch. Signed-off-by: tashiro akira --- .../templates/other_templates/target_separation_test.py.jinja | 2 +- .../other_templates/target_separation_train.py.jinja | 2 +- .../other_templates/target_separation_validation.py.jinja | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sapientml_core/templates/other_templates/target_separation_test.py.jinja b/sapientml_core/templates/other_templates/target_separation_test.py.jinja index caae186..55e7101 100644 --- a/sapientml_core/templates/other_templates/target_separation_test.py.jinja +++ b/sapientml_core/templates/other_templates/target_separation_test.py.jinja @@ -1,7 +1,7 @@ # DETACH TARGET TARGET_COLUMNS = {{ pipeline.task.target_columns }} feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1) -target_train = train_dataset[TARGET_COLUMNS].copy().astype(str) +target_train = train_dataset[TARGET_COLUMNS].copy() if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()): feature_test = test_dataset.drop(TARGET_COLUMNS, axis=1) target_test = test_dataset[TARGET_COLUMNS].copy() diff --git a/sapientml_core/templates/other_templates/target_separation_train.py.jinja b/sapientml_core/templates/other_templates/target_separation_train.py.jinja index 6a24da1..525f82f 100644 --- a/sapientml_core/templates/other_templates/target_separation_train.py.jinja +++ b/sapientml_core/templates/other_templates/target_separation_train.py.jinja @@ -1,4 +1,4 @@ # DETACH TARGET TARGET_COLUMNS = {{ pipeline.task.target_columns }} feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1) -target_train = train_dataset[TARGET_COLUMNS].copy().astype(str) \ No newline at end of file +target_train = train_dataset[TARGET_COLUMNS].copy() \ No newline at end of file diff --git a/sapientml_core/templates/other_templates/target_separation_validation.py.jinja b/sapientml_core/templates/other_templates/target_separation_validation.py.jinja index 1e0750f..7b68025 100644 --- a/sapientml_core/templates/other_templates/target_separation_validation.py.jinja +++ b/sapientml_core/templates/other_templates/target_separation_validation.py.jinja @@ -1,7 +1,7 @@ # DETACH TARGET TARGET_COLUMNS = {{ pipeline.task.target_columns }} -feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1).astype(str) -target_train = train_dataset[TARGET_COLUMNS].copy().astype(str) +feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1) +target_train = train_dataset[TARGET_COLUMNS].copy() feature_test = test_dataset.drop(TARGET_COLUMNS, axis=1) target_test = test_dataset[TARGET_COLUMNS].copy() From 2f43a68e8b6617dbad61f1c95dfc2e3d9ef851f0 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Tue, 12 Dec 2023 15:17:48 +0900 Subject: [PATCH 06/16] fix:Reflected review results Signed-off-by: tashiro akira --- .../templates/other_templates/return_column_name.py.jinja | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sapientml_core/templates/other_templates/return_column_name.py.jinja b/sapientml_core/templates/other_templates/return_column_name.py.jinja index 4212954..317f25c 100644 --- a/sapientml_core/templates/other_templates/return_column_name.py.jinja +++ b/sapientml_core/templates/other_templates/return_column_name.py.jinja @@ -1,4 +1,5 @@ -# Undo special symbols for column names -TARGET_COLUMNS = cols_has_symbols - +# Undo special symbols for column names +target_train[cols_has_symbols_target] = target_train[TARGET_COLUMNS].copy() +target_train = target_train.drop(TARGET_COLUMNS, axis=1) +TARGET_COLUMNS = cols_has_symbols_target From e67846181a237f74d6bf40be3b37be6884d367e9 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Tue, 12 Dec 2023 15:50:49 +0900 Subject: [PATCH 07/16] style:Removed unnecessary blank lines. Signed-off-by: tashiro akira --- sapientml_core/templates/pipeline_test.py.jinja | 1 - 1 file changed, 1 deletion(-) diff --git a/sapientml_core/templates/pipeline_test.py.jinja b/sapientml_core/templates/pipeline_test.py.jinja index ee4574a..5850baa 100644 --- a/sapientml_core/templates/pipeline_test.py.jinja +++ b/sapientml_core/templates/pipeline_test.py.jinja @@ -73,7 +73,6 @@ if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()): {{ pipeline_json['permutation_importance']['code'] }} {% endif %} - {% if 'shap' in pipeline_json and not pipeline.task.is_multiclass %} {{ pipeline_json['shap']['code'] }} From 5254cfe7cd44da7f1264b8f9134d000c4c9e5a6f Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Fri, 22 Dec 2023 10:11:45 +0900 Subject: [PATCH 08/16] fix:Fixed to return column names in csv file Signed-off-by: tashiro akira --- sapientml_core/generator.py | 27 ++++++++++++++----- .../permutation_importance.py.jinja | 4 +-- .../prediction_result.py.jinja | 8 +++--- .../return_column_name.py.jinja | 5 ---- 4 files changed, 27 insertions(+), 17 deletions(-) delete mode 100644 sapientml_core/templates/other_templates/return_column_name.py.jinja diff --git a/sapientml_core/generator.py b/sapientml_core/generator.py index ca5557f..8f8146e 100644 --- a/sapientml_core/generator.py +++ b/sapientml_core/generator.py @@ -26,7 +26,6 @@ from shutil import copyfile from typing import Tuple, Union -from jinja2 import Environment, FileSystemLoader from sapientml.executor import PipelineExecutor from sapientml.generator import CodeBlockGenerator, PipelineGenerator from sapientml.macros import metric_lower_is_better @@ -43,7 +42,6 @@ from .training import project_corpus logger = setup_logger() -env = Environment(loader=FileSystemLoader(f"{os.path.dirname(__file__)}/templates"), trim_blocks=True) def add_prefix(filename, prefix): @@ -218,16 +216,33 @@ def generate_pipeline(self, dataset: Dataset, task: Task): dataset, preprocess_block = self.preprocess.generate_code(dataset, task) code_block = loaddata_block + preprocess_block dataset, sapientml_results = self.generate_code(dataset, task) - tpl_return_column_name = env.get_template("other_templates/return_column_name.py.jinja") - code_return_column_name = tpl_return_column_name.render() result_pipelines: list[Code] = [] for pipeline in sapientml_results: pipeline.validation = code_block.validation + pipeline.validation pipeline.test = code_block.test + pipeline.test if "cols_has_symbols" in pipeline.test: - addindex = pipeline.test.index("# OUTPUT PREDICTION") - pipeline.test = pipeline.test[: addindex - 1] + code_return_column_name + pipeline.test[addindex - 1 :] + addindex = pipeline.test.index("perm_df = pd.DataFrame") + pipeline.test = ( + pipeline.test[:addindex] + + "feature_train_csv = feature_train.rename(columns=rename_symbol_cols)\n " + + pipeline.test[addindex:] + ) + addindex = pipeline.test.index("prediction = pd.DataFrame") + pipeline.test = ( + pipeline.test[:addindex] + + "TARGET_COLUMNS_csv = [rename_symbol_cols[TARGET_COLUMNS[0]]]\n" + + pipeline.test[addindex:] + ) + else: + addindex = pipeline.test.index("perm_df = pd.DataFrame") + pipeline.test = ( + pipeline.test[:addindex] + "feature_train_csv = feature_train\n " + pipeline.test[addindex:] + ) + addindex = pipeline.test.index("prediction = pd.DataFrame") + pipeline.test = ( + pipeline.test[:addindex] + "TARGET_COLUMNS_csv = [TARGET_COLUMNS[0]]\n" + pipeline.test[addindex:] + ) pipeline.train = code_block.train + pipeline.train pipeline.predict = code_block.predict + pipeline.predict diff --git a/sapientml_core/templates/other_templates/permutation_importance.py.jinja b/sapientml_core/templates/other_templates/permutation_importance.py.jinja index 9d10b79..89da8dd 100644 --- a/sapientml_core/templates/other_templates/permutation_importance.py.jinja +++ b/sapientml_core/templates/other_templates/permutation_importance.py.jinja @@ -13,13 +13,13 @@ if len(feature_train.columns) <= 100: perm = permutation_importance(model, feature_train.sparse.to_dense(), {{ TARGET_TRAIN }}, n_repeats=5, random_state=0) - perm_df = pd.DataFrame({"feature": feature_train.columns, "importance": perm.importances_mean}) + perm_df = pd.DataFrame({"feature": feature_train_csv.columns, "importance": perm.importances_mean}) perm_df.to_csv("./permutation_importance.csv", index=False) {% else %} if len(feature_train.columns) <= 100: perm = permutation_importance(model, feature_train, {{ TARGET_TRAIN }}, n_repeats=5, random_state=0) - perm_df = pd.DataFrame({"feature": feature_train.columns, "importance": perm.importances_mean}) + perm_df = pd.DataFrame({"feature": feature_train_csv.columns, "importance": perm.importances_mean}) perm_df.to_csv("./permutation_importance.csv", index=False) {% endif %} diff --git a/sapientml_core/templates/other_templates/prediction_result.py.jinja b/sapientml_core/templates/other_templates/prediction_result.py.jinja index b7d2c1d..eadde69 100644 --- a/sapientml_core/templates/other_templates/prediction_result.py.jinja +++ b/sapientml_core/templates/other_templates/prediction_result.py.jinja @@ -5,17 +5,17 @@ prediction = pd.DataFrame(y_prob, columns=label_encoder.inverse_transform(model. {% elif pipeline.config.predict_option == macros.PRED_PROBABILITY and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metrics_for_classification) and (not pipeline.adaptation_metric.startswith("MAP_"))%} prediction = pd.DataFrame(y_prob, columns=model.classes_, index=feature_test.index) {% elif pipeline.config.predict_option == macros.PRED_PROBABILITY and (pipeline.adaptation_metric in macros.metrics_for_classification) and (not pipeline.adaptation_metric.startswith("MAP_"))%} -prediction = pd.DataFrame(y_prob, columns=TARGET_COLUMNS, index=feature_test.index) +prediction = pd.DataFrame(y_prob, columns=TARGET_COLUMNS_csv, index=feature_test.index) {% elif pipeline.config.predict_option is none and model_name == xgbclassifier and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%} prediction = pd.DataFrame(y_prob, columns=label_encoder.inverse_transform(model.classes_), index=feature_test.index) {% elif pipeline.config.predict_option is none and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%} prediction = pd.DataFrame(y_prob, columns=model.classes_, index=feature_test.index) {% elif pipeline.config.predict_option is none and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%} -prediction = pd.DataFrame(y_prob, columns=TARGET_COLUMNS, index=feature_test.index) +prediction = pd.DataFrame(y_prob, columns=TARGET_COLUMNS_csv, index=feature_test.index) {% elif pipeline.adaptation_metric.startswith("MAP_") %} {% set k = pipeline.adaptation_metric.split("_")[1] %} -prediction = pd.DataFrame(y_prob, columns=[TARGET_COLUMNS[0] + "_" +str(i) for i in range(1, y_prob.shape[1] + 1)], index=feature_test.index) +prediction = pd.DataFrame(y_prob, columns=[TARGET_COLUMNS_csv[0] + "_" +str(i) for i in range(1, y_prob.shape[1] + 1)], index=feature_test.index) {% else %} -prediction = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index) +prediction = pd.DataFrame(y_pred, columns=TARGET_COLUMNS_csv, index=feature_test.index) {% endif %} prediction.to_csv("./prediction_result.csv") diff --git a/sapientml_core/templates/other_templates/return_column_name.py.jinja b/sapientml_core/templates/other_templates/return_column_name.py.jinja deleted file mode 100644 index 317f25c..0000000 --- a/sapientml_core/templates/other_templates/return_column_name.py.jinja +++ /dev/null @@ -1,5 +0,0 @@ - -# Undo special symbols for column names -target_train[cols_has_symbols_target] = target_train[TARGET_COLUMNS].copy() -target_train = target_train.drop(TARGET_COLUMNS, axis=1) -TARGET_COLUMNS = cols_has_symbols_target From 83817045c9b9eb82f8951d78f30c8397c8bd77da Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Thu, 11 Jan 2024 15:40:18 +0900 Subject: [PATCH 09/16] fix:Reflected the content of the review Signed-off-by: tashiro akira --- sapientml_core/generator.py | 33 ++++++++----------- .../permutation_importance.py.jinja | 4 +-- .../prediction_result.py.jinja | 8 ++--- 3 files changed, 20 insertions(+), 25 deletions(-) diff --git a/sapientml_core/generator.py b/sapientml_core/generator.py index 8f8146e..d91bae0 100644 --- a/sapientml_core/generator.py +++ b/sapientml_core/generator.py @@ -222,28 +222,23 @@ def generate_pipeline(self, dataset: Dataset, task: Task): pipeline.validation = code_block.validation + pipeline.validation pipeline.test = code_block.test + pipeline.test if "cols_has_symbols" in pipeline.test: - addindex = pipeline.test.index("perm_df = pd.DataFrame") - pipeline.test = ( - pipeline.test[:addindex] - + "feature_train_csv = feature_train.rename(columns=rename_symbol_cols)\n " - + pipeline.test[addindex:] + pipeline.test = pipeline.test.replace( + '"feature": feature_train.columns', + '"feature": feature_train.rename(columns=rename_symbol_cols).columns', ) - addindex = pipeline.test.index("prediction = pd.DataFrame") - pipeline.test = ( - pipeline.test[:addindex] - + "TARGET_COLUMNS_csv = [rename_symbol_cols[TARGET_COLUMNS[0]]]\n" - + pipeline.test[addindex:] - ) - else: - addindex = pipeline.test.index("perm_df = pd.DataFrame") - pipeline.test = ( - pipeline.test[:addindex] + "feature_train_csv = feature_train\n " + pipeline.test[addindex:] - ) - addindex = pipeline.test.index("prediction = pd.DataFrame") - pipeline.test = ( - pipeline.test[:addindex] + "TARGET_COLUMNS_csv = [TARGET_COLUMNS[0]]\n" + pipeline.test[addindex:] + pipeline.test = pipeline.test.replace( + "prediction.to_csv", "prediction.rename(columns=rename_symbol_cols).to_csv" ) + def replace_targets(match_obj): + return match_obj[0].replace( + "TARGET_COLUMNS", "[rename_symbol_cols.get(v, v) for v in TARGET_COLUMNS]" + ) + + pat = r"prediction = pd.DataFrame\(y_prob, columns=.?TARGET_COLUMNS.*, index=feature_test.index\)" + pipeline.test = re.sub(pat, replace_targets, pipeline.test) + pipeline.predict = re.sub(pat, replace_targets, pipeline.predict) + pipeline.train = code_block.train + pipeline.train pipeline.predict = code_block.predict + pipeline.predict result_pipelines.append(pipeline) diff --git a/sapientml_core/templates/other_templates/permutation_importance.py.jinja b/sapientml_core/templates/other_templates/permutation_importance.py.jinja index 89da8dd..9d10b79 100644 --- a/sapientml_core/templates/other_templates/permutation_importance.py.jinja +++ b/sapientml_core/templates/other_templates/permutation_importance.py.jinja @@ -13,13 +13,13 @@ if len(feature_train.columns) <= 100: perm = permutation_importance(model, feature_train.sparse.to_dense(), {{ TARGET_TRAIN }}, n_repeats=5, random_state=0) - perm_df = pd.DataFrame({"feature": feature_train_csv.columns, "importance": perm.importances_mean}) + perm_df = pd.DataFrame({"feature": feature_train.columns, "importance": perm.importances_mean}) perm_df.to_csv("./permutation_importance.csv", index=False) {% else %} if len(feature_train.columns) <= 100: perm = permutation_importance(model, feature_train, {{ TARGET_TRAIN }}, n_repeats=5, random_state=0) - perm_df = pd.DataFrame({"feature": feature_train_csv.columns, "importance": perm.importances_mean}) + perm_df = pd.DataFrame({"feature": feature_train.columns, "importance": perm.importances_mean}) perm_df.to_csv("./permutation_importance.csv", index=False) {% endif %} diff --git a/sapientml_core/templates/other_templates/prediction_result.py.jinja b/sapientml_core/templates/other_templates/prediction_result.py.jinja index eadde69..b7d2c1d 100644 --- a/sapientml_core/templates/other_templates/prediction_result.py.jinja +++ b/sapientml_core/templates/other_templates/prediction_result.py.jinja @@ -5,17 +5,17 @@ prediction = pd.DataFrame(y_prob, columns=label_encoder.inverse_transform(model. {% elif pipeline.config.predict_option == macros.PRED_PROBABILITY and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metrics_for_classification) and (not pipeline.adaptation_metric.startswith("MAP_"))%} prediction = pd.DataFrame(y_prob, columns=model.classes_, index=feature_test.index) {% elif pipeline.config.predict_option == macros.PRED_PROBABILITY and (pipeline.adaptation_metric in macros.metrics_for_classification) and (not pipeline.adaptation_metric.startswith("MAP_"))%} -prediction = pd.DataFrame(y_prob, columns=TARGET_COLUMNS_csv, index=feature_test.index) +prediction = pd.DataFrame(y_prob, columns=TARGET_COLUMNS, index=feature_test.index) {% elif pipeline.config.predict_option is none and model_name == xgbclassifier and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%} prediction = pd.DataFrame(y_prob, columns=label_encoder.inverse_transform(model.classes_), index=feature_test.index) {% elif pipeline.config.predict_option is none and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%} prediction = pd.DataFrame(y_prob, columns=model.classes_, index=feature_test.index) {% elif pipeline.config.predict_option is none and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%} -prediction = pd.DataFrame(y_prob, columns=TARGET_COLUMNS_csv, index=feature_test.index) +prediction = pd.DataFrame(y_prob, columns=TARGET_COLUMNS, index=feature_test.index) {% elif pipeline.adaptation_metric.startswith("MAP_") %} {% set k = pipeline.adaptation_metric.split("_")[1] %} -prediction = pd.DataFrame(y_prob, columns=[TARGET_COLUMNS_csv[0] + "_" +str(i) for i in range(1, y_prob.shape[1] + 1)], index=feature_test.index) +prediction = pd.DataFrame(y_prob, columns=[TARGET_COLUMNS[0] + "_" +str(i) for i in range(1, y_prob.shape[1] + 1)], index=feature_test.index) {% else %} -prediction = pd.DataFrame(y_pred, columns=TARGET_COLUMNS_csv, index=feature_test.index) +prediction = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index) {% endif %} prediction.to_csv("./prediction_result.csv") From e1e4644b0cee36640cadb820f8139325637c35f7 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Tue, 27 Feb 2024 11:41:39 +0900 Subject: [PATCH 10/16] fix:Save Changes Temporarily Signed-off-by: tashiro akira --- sapientml_core/explain/AutoVisualization.py | 6 ++++++ sapientml_core/params.py | 21 +++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/sapientml_core/explain/AutoVisualization.py b/sapientml_core/explain/AutoVisualization.py index ef72c06..4fa671c 100644 --- a/sapientml_core/explain/AutoVisualization.py +++ b/sapientml_core/explain/AutoVisualization.py @@ -18,6 +18,7 @@ import numpy as np import pandas as pd +import re from sapientml.util.logging import setup_logger warnings.filterwarnings("ignore") @@ -56,6 +57,11 @@ def AutoVisualization( """ problem_type = problem_type.lower() + # cols_has_symbols = df.columns.tolist() + # inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\\+]+") + # rename_symbol_cols = {inhibited_symbol_pattern.sub("", col): col if col in cols_has_symbols else col in cols_has_symbols for col in cols_has_symbols } + # target_columns = {rename_symbol_cols[k] for k in target_columns} + if not ignore_columns: ignore_columns = [] diff --git a/sapientml_core/params.py b/sapientml_core/params.py index aacf73d..8cf3107 100644 --- a/sapientml_core/params.py +++ b/sapientml_core/params.py @@ -18,6 +18,7 @@ import numpy as np import pandas as pd +import collections from pandas.core.dtypes.common import is_numeric_dtype from pydantic import BaseModel, Field, field_validator from sapientml.params import Code, Config, Task @@ -410,6 +411,26 @@ def summarize_dataset(df_train: pd.DataFrame, task: Task) -> DatasetSummary: DatasetSummary """ is_multi_classes: list[bool] = [] + + #kakunin + # target_columns={} + # print(df_train.columns) + # print(task.target_columns) + # sametargetcolumn = collections.Counter(task.target_columns) + # for target in sametargetcolumn.keys(): + # if sametargetcolumn[target] > 1: + # for i in range(sametargetcolumn[target]): + # if target + str(i+1) not in df_train.columns : + # target_columns[target+str(i+1)] = target + # else: + # target_columns.append(target) + + # for target in target_columns.keys(): + # df_train = df_train.rename({target_columns[target]:target}, axis=1) + + # print(target_columns) + # print(df_train.columns) + for target in task.target_columns: is_multi_classes.append(len(df_train[target].unique()) > 1) has_multi_class_targets = all(is_multi_classes) From 1619b10ad7d499e1072ef9c4f04eb2f929af36b9 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Mon, 11 Mar 2024 10:48:42 +0900 Subject: [PATCH 11/16] fix:Reflect Modifications Signed-off-by: tashiro akira --- sapientml_core/generator.py | 3 ++- sapientml_core/params.py | 19 ------------------- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/sapientml_core/generator.py b/sapientml_core/generator.py index d91bae0..cc9744c 100644 --- a/sapientml_core/generator.py +++ b/sapientml_core/generator.py @@ -221,6 +221,7 @@ def generate_pipeline(self, dataset: Dataset, task: Task): for pipeline in sapientml_results: pipeline.validation = code_block.validation + pipeline.validation pipeline.test = code_block.test + pipeline.test + pipeline.predict = code_block.predict + pipeline.predict if "cols_has_symbols" in pipeline.test: pipeline.test = pipeline.test.replace( '"feature": feature_train.columns', @@ -240,7 +241,7 @@ def replace_targets(match_obj): pipeline.predict = re.sub(pat, replace_targets, pipeline.predict) pipeline.train = code_block.train + pipeline.train - pipeline.predict = code_block.predict + pipeline.predict + # pipeline.predict = code_block.predict + pipeline.predict result_pipelines.append(pipeline) logger.info("Executing generated pipelines...") diff --git a/sapientml_core/params.py b/sapientml_core/params.py index 8cf3107..5150145 100644 --- a/sapientml_core/params.py +++ b/sapientml_core/params.py @@ -412,25 +412,6 @@ def summarize_dataset(df_train: pd.DataFrame, task: Task) -> DatasetSummary: """ is_multi_classes: list[bool] = [] - #kakunin - # target_columns={} - # print(df_train.columns) - # print(task.target_columns) - # sametargetcolumn = collections.Counter(task.target_columns) - # for target in sametargetcolumn.keys(): - # if sametargetcolumn[target] > 1: - # for i in range(sametargetcolumn[target]): - # if target + str(i+1) not in df_train.columns : - # target_columns[target+str(i+1)] = target - # else: - # target_columns.append(target) - - # for target in target_columns.keys(): - # df_train = df_train.rename({target_columns[target]:target}, axis=1) - - # print(target_columns) - # print(df_train.columns) - for target in task.target_columns: is_multi_classes.append(len(df_train[target].unique()) > 1) has_multi_class_targets = all(is_multi_classes) From 1cee4514011ac84c7a349243d62466fba0018cd2 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Fri, 22 Mar 2024 16:11:33 +0900 Subject: [PATCH 12/16] fix:Reflected review results Signed-off-by: tashiro akira --- sapientml_core/explain/AutoVisualization.py | 6 --- sapientml_core/explain/main.py | 52 ++++++++++++++++++--- sapientml_core/generator.py | 18 ++++++- sapientml_core/params.py | 2 +- 4 files changed, 64 insertions(+), 14 deletions(-) diff --git a/sapientml_core/explain/AutoVisualization.py b/sapientml_core/explain/AutoVisualization.py index 4fa671c..ef72c06 100644 --- a/sapientml_core/explain/AutoVisualization.py +++ b/sapientml_core/explain/AutoVisualization.py @@ -18,7 +18,6 @@ import numpy as np import pandas as pd -import re from sapientml.util.logging import setup_logger warnings.filterwarnings("ignore") @@ -57,11 +56,6 @@ def AutoVisualization( """ problem_type = problem_type.lower() - # cols_has_symbols = df.columns.tolist() - # inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\\+]+") - # rename_symbol_cols = {inhibited_symbol_pattern.sub("", col): col if col in cols_has_symbols else col in cols_has_symbols for col in cols_has_symbols } - # target_columns = {rename_symbol_cols[k] for k in target_columns} - if not ignore_columns: ignore_columns = [] diff --git a/sapientml_core/explain/main.py b/sapientml_core/explain/main.py index 0b17761..14ffb34 100644 --- a/sapientml_core/explain/main.py +++ b/sapientml_core/explain/main.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import collections from typing import Literal, Optional import pandas as pd from sapientml.params import CancellationToken from sapientml.util.logging import setup_logger +from sapientml_preprocess.generator import check_cols_has_symbols, remove_symbols from .AutoEDA import EDA from .AutoVisualization import AutoVisualization_Class @@ -81,12 +83,50 @@ def process( if visualization: # Call AutoVisualization to generate visualization codes AV = AutoVisualization_Class() - visualization_code = AV.AutoVisualization( - df=dataframe, - target_columns=target_columns, - problem_type=problem_type, - ignore_columns=ignore_columns, - ) + cols_has_symbols = check_cols_has_symbols(dataframe.columns.to_list()) + rename_col = [] + rename_dict = {} + if cols_has_symbols: + df = list( + dataframe.rename(columns=lambda col: remove_symbols(col) if col in cols_has_symbols else col).columns + ) + same_column = collections.Counter(df) + for target in same_column.keys(): + rename_col = [] + rename_dict = {} + i = 1 + for col in df: + if target in col and same_column[target] > 1: + rename_col.append(str(col + str(i))) + rename_dict[str(col + str(i))] = dataframe.columns[len(rename_dict)] + i = i + 1 + else: + rename_col.append(str(col)) + rename_dict[col] = dataframe.columns[len(rename_dict)] + df = rename_col + + if len(rename_dict) != 0: + col_has_target = [rename_dict[col] for col in target_columns] + visualization_code = AV.AutoVisualization( + df=dataframe, + target_columns=col_has_target, + problem_type=problem_type, + ignore_columns=ignore_columns, + ) + else: + visualization_code = AV.AutoVisualization( + df=dataframe, + target_columns=target_columns, + problem_type=problem_type, + ignore_columns=ignore_columns, + ) + else: + visualization_code = AV.AutoVisualization( + df=dataframe, + target_columns=target_columns, + problem_type=problem_type, + ignore_columns=ignore_columns, + ) else: visualization_code = None diff --git a/sapientml_core/generator.py b/sapientml_core/generator.py index 50dbb21..a6a1a17 100644 --- a/sapientml_core/generator.py +++ b/sapientml_core/generator.py @@ -231,6 +231,22 @@ def generate_pipeline(self, dataset: Dataset, task: Task): "prediction.to_csv", "prediction.rename(columns=rename_symbol_cols).to_csv" ) + pipeline.predict = pipeline.predict.replace( + '"feature": feature_train.columns', + '"feature": feature_train.rename(columns=rename_symbol_cols).columns', + ) + pipeline.predict = pipeline.predict.replace( + "prediction.to_csv", "prediction.rename(columns=rename_symbol_cols).to_csv" + ) + + pipeline.validation = pipeline.validation.replace( + '"feature": feature_train.columns', + '"feature": feature_train.rename(columns=rename_symbol_cols).columns', + ) + pipeline.validation = pipeline.validation.replace( + "prediction.to_csv", "prediction.rename(columns=rename_symbol_cols).to_csv" + ) + def replace_targets(match_obj): return match_obj[0].replace( "TARGET_COLUMNS", "[rename_symbol_cols.get(v, v) for v in TARGET_COLUMNS]" @@ -239,9 +255,9 @@ def replace_targets(match_obj): pat = r"prediction = pd.DataFrame\(y_prob, columns=.?TARGET_COLUMNS.*, index=feature_test.index\)" pipeline.test = re.sub(pat, replace_targets, pipeline.test) pipeline.predict = re.sub(pat, replace_targets, pipeline.predict) + pipeline.validation = re.sub(pat, replace_targets, pipeline.validation) pipeline.train = code_block.train + pipeline.train - # pipeline.predict = code_block.predict + pipeline.predict result_pipelines.append(pipeline) logger.info("Executing generated pipelines...") diff --git a/sapientml_core/params.py b/sapientml_core/params.py index e2c2bf5..58a8375 100644 --- a/sapientml_core/params.py +++ b/sapientml_core/params.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import collections import re from collections import defaultdict from typing import Any, Literal, Optional, Union import numpy as np import pandas as pd -import collections from pandas.core.dtypes.common import is_numeric_dtype from pydantic import BaseModel, Field, field_validator from sapientml.params import Code, Config, Task From 78d1aadc1822b5f7932a5b84c470710565e9e1c5 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Fri, 22 Mar 2024 16:24:02 +0900 Subject: [PATCH 13/16] fix:Remove Unnecessary Modifications Signed-off-by: tashiro akira --- sapientml_core/params.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sapientml_core/params.py b/sapientml_core/params.py index 58a8375..3a71eb0 100644 --- a/sapientml_core/params.py +++ b/sapientml_core/params.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import collections import re from collections import defaultdict from typing import Any, Literal, Optional, Union @@ -415,7 +414,6 @@ def summarize_dataset(df_train: pd.DataFrame, task: Task) -> DatasetSummary: DatasetSummary """ is_multi_classes: list[bool] = [] - for target in task.target_columns: is_multi_classes.append(len(df_train[target].unique()) > 1) has_multi_class_targets = all(is_multi_classes) From 7724cd190a7d7e096168b01e58d19f694f4bc223 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Fri, 22 Mar 2024 16:56:00 +0900 Subject: [PATCH 14/16] fix:The format has been modified. Signed-off-by: tashiro akira --- sapientml_core/generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sapientml_core/generator.py b/sapientml_core/generator.py index a6a1a17..2b089d9 100644 --- a/sapientml_core/generator.py +++ b/sapientml_core/generator.py @@ -255,7 +255,7 @@ def replace_targets(match_obj): pat = r"prediction = pd.DataFrame\(y_prob, columns=.?TARGET_COLUMNS.*, index=feature_test.index\)" pipeline.test = re.sub(pat, replace_targets, pipeline.test) pipeline.predict = re.sub(pat, replace_targets, pipeline.predict) - pipeline.validation = re.sub(pat, replace_targets, pipeline.validation) + pipeline.validation = re.sub(pat, replace_targets, pipeline.validation) pipeline.train = code_block.train + pipeline.train result_pipelines.append(pipeline) From 969ded3d3be7d1ac7be106607f7e1d561276d5aa Mon Sep 17 00:00:00 2001 From: tashiro-akira Date: Thu, 25 Apr 2024 09:41:24 +0900 Subject: [PATCH 15/16] fix:Reflect Review Results Signed-off-by: tashiro-akira --- sapientml_core/explain/main.py | 43 ++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/sapientml_core/explain/main.py b/sapientml_core/explain/main.py index 14ffb34..f893278 100644 --- a/sapientml_core/explain/main.py +++ b/sapientml_core/explain/main.py @@ -84,29 +84,42 @@ def process( # Call AutoVisualization to generate visualization codes AV = AutoVisualization_Class() cols_has_symbols = check_cols_has_symbols(dataframe.columns.to_list()) - rename_col = [] + no_symbol_columns = [ col for col in dataframe.columns.values if col not in cols_has_symbols ] rename_dict = {} if cols_has_symbols: df = list( dataframe.rename(columns=lambda col: remove_symbols(col) if col in cols_has_symbols else col).columns ) - same_column = collections.Counter(df) - for target in same_column.keys(): - rename_col = [] - rename_dict = {} - i = 1 - for col in df: - if target in col and same_column[target] > 1: - rename_col.append(str(col + str(i))) - rename_dict[str(col + str(i))] = dataframe.columns[len(rename_dict)] - i = i + 1 + rename_dict = {} + if len(df) != len(set(df)) : + same_column = {k: v for k, v in collections.Counter(df).items() if v > 1 and k in no_symbol_columns} + for target, org_column in zip(df, dataframe.columns.tolist()): + if target in same_column.keys(): + # rename_dict[org_column] = target + str(same_column[target] - 1) + rename_dict[target + str(same_column[target] - 1)] = org_column + same_column[target] = same_column[target] - 1 else: - rename_col.append(str(col)) - rename_dict[col] = dataframe.columns[len(rename_dict)] - df = rename_col + rename_dict[target] = org_column + + df = list(rename_dict.values()) + + # same_column = collections.Counter(df) + # for target in same_column.keys(): + # rename_col = [] + # rename_dict = {} + # i = 1 + # for col in df: + # if target in col and same_column[target] > 1: + # rename_col.append(str(col + str(i))) + # rename_dict[str(col + str(i))] = dataframe.columns[len(rename_dict)] + # i = i + 1 + # else: + # rename_col.append(str(col)) + # rename_dict[col] = dataframe.columns[len(rename_dict)] + # df = rename_col if len(rename_dict) != 0: - col_has_target = [rename_dict[col] for col in target_columns] + col_has_target = [target for target in rename_dict.keys() if rename_dict.values() == target_columns ] visualization_code = AV.AutoVisualization( df=dataframe, target_columns=col_has_target, From 426a32d85906e22c4036234ccb7c76f4ca99098d Mon Sep 17 00:00:00 2001 From: tashiro-akira Date: Thu, 25 Apr 2024 11:01:51 +0900 Subject: [PATCH 16/16] fix:Fixed error in running lint Signed-off-by: tashiro-akira --- sapientml_core/explain/main.py | 39 ++++++++++------------------------ 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/sapientml_core/explain/main.py b/sapientml_core/explain/main.py index f893278..57783b5 100644 --- a/sapientml_core/explain/main.py +++ b/sapientml_core/explain/main.py @@ -84,42 +84,25 @@ def process( # Call AutoVisualization to generate visualization codes AV = AutoVisualization_Class() cols_has_symbols = check_cols_has_symbols(dataframe.columns.to_list()) - no_symbol_columns = [ col for col in dataframe.columns.values if col not in cols_has_symbols ] + no_symbol_columns = [col for col in dataframe.columns.values if col not in cols_has_symbols] rename_dict = {} if cols_has_symbols: df = list( dataframe.rename(columns=lambda col: remove_symbols(col) if col in cols_has_symbols else col).columns ) rename_dict = {} - if len(df) != len(set(df)) : - same_column = {k: v for k, v in collections.Counter(df).items() if v > 1 and k in no_symbol_columns} - for target, org_column in zip(df, dataframe.columns.tolist()): - if target in same_column.keys(): - # rename_dict[org_column] = target + str(same_column[target] - 1) - rename_dict[target + str(same_column[target] - 1)] = org_column - same_column[target] = same_column[target] - 1 - else: - rename_dict[target] = org_column - - df = list(rename_dict.values()) - - # same_column = collections.Counter(df) - # for target in same_column.keys(): - # rename_col = [] - # rename_dict = {} - # i = 1 - # for col in df: - # if target in col and same_column[target] > 1: - # rename_col.append(str(col + str(i))) - # rename_dict[str(col + str(i))] = dataframe.columns[len(rename_dict)] - # i = i + 1 - # else: - # rename_col.append(str(col)) - # rename_dict[col] = dataframe.columns[len(rename_dict)] - # df = rename_col + same_column = {k: v for k, v in collections.Counter(df).items() if v > 1 and k in no_symbol_columns} + for target, org_column in zip(df, dataframe.columns.tolist()): + if target in same_column.keys(): + rename_dict[target + str(same_column[target] - 1)] = org_column + same_column[target] = same_column[target] - 1 + else: + rename_dict[target] = org_column + + df = list(rename_dict.values()) if len(rename_dict) != 0: - col_has_target = [target for target in rename_dict.keys() if rename_dict.values() == target_columns ] + col_has_target = [target for target in rename_dict.keys() if rename_dict.values() == target_columns] visualization_code = AV.AutoVisualization( df=dataframe, target_columns=col_has_target,