From 55786bc50f92999e514b47f9df55bd36a9405550 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Wed, 25 Oct 2023 14:48:46 +0900 Subject: [PATCH 01/16] fix: Fix error caused when input data is mixed with datetime and string types Signed-off-by: tashiro akira --- sapientml_preprocess/generator.py | 15 ++++++++++----- .../templates/handle_mixed_typed_columns.py.jinja | 5 +++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py index 92815f8..9634c76 100644 --- a/sapientml_preprocess/generator.py +++ b/sapientml_preprocess/generator.py @@ -24,8 +24,9 @@ import pandas as pd import requests from jinja2 import Environment, FileSystemLoader +from pandas.api.types import infer_dtype from sapientml.generator import CodeBlockGenerator -from sapientml.params import Code, Dataset, Task +from sapientml.params import Code, Dataset, Task, _is_date_colum from sapientml.util.logging import setup_logger from sapientml_preprocess.params import PreprocessConfig @@ -259,8 +260,12 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]: # split a columns into 2 columns, one column has only numeric, another columns has only string # this operation should be done before calculating meta features mix_typed_cols = _confirm_mixed_type(df.drop(task.target_columns, axis=1)) + datetime_astype_str = [] cols_numeric_and_string = [] for col in mix_typed_cols: + if(str(df[col].dtype) in "object" and infer_dtype(df[col],skipna=True) in "mixed" and not _is_date_column(df[col])): + datetime_astype_str.append(col) + cols_numeric_and_string.append(col) cols_numeric_and_string.append(col) only_str = col + "__str" only_num = col + "__num" @@ -270,10 +275,10 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]: df = df.drop(col, axis=1) if cols_numeric_and_string: tpl = template_env.get_template("handle_mixed_typed_columns.py.jinja") - code.validation += _render(tpl, training=True, test=True, cols_numeric_and_string=cols_numeric_and_string) - code.test += _render(tpl, training=True, test=True, cols_numeric_and_string=cols_numeric_and_string) - code.train += _render(tpl, training=True, test=False, cols_numeric_and_string=cols_numeric_and_string) - code.predict += _render(tpl, training=False, test=True, cols_numeric_and_string=cols_numeric_and_string) + code.validation += _render(tpl, training=True, test=True, cols_numeric_and_string=cols_numeric_and_string,datetime_astype_str=datetime_astype_str) + code.test += _render(tpl, training=True, test=True, cols_numeric_and_string=cols_numeric_and_string,datetime_astype_str=datetime_astype_str) + code.train += _render(tpl, training=True, test=False, cols_numeric_and_string=cols_numeric_and_string,datetime_astype_str=datetime_astype_str) + code.predict += _render(tpl, training=False, test=True, cols_numeric_and_string=cols_numeric_and_string,datetime_astype_str=datetime_astype_str) # meta features must be calculated after replacing inf with nan, # becuase the replaced nan must be preprocessed in the generated code. diff --git a/sapientml_preprocess/templates/handle_mixed_typed_columns.py.jinja b/sapientml_preprocess/templates/handle_mixed_typed_columns.py.jinja index 5e9b3d3..d5a4d06 100644 --- a/sapientml_preprocess/templates/handle_mixed_typed_columns.py.jinja +++ b/sapientml_preprocess/templates/handle_mixed_typed_columns.py.jinja @@ -1,14 +1,19 @@ # HANDLE MIXED TYPE import numpy as np cols_numeric_and_string = {{ cols_numeric_and_string}} +datetime_astype_str = {{ datetime_astype_str }} for col in cols_numeric_and_string: {% if training %} train_dataset[col + '__str'] = np.where(pd.to_numeric(train_dataset[col], errors='coerce').isnull(), train_dataset[col], np.nan) train_dataset[col + '__num'] = np.where(pd.to_numeric(train_dataset[col], errors='coerce').isnull(), np.nan, train_dataset[col]).astype(float) + if(col in datetime_astype_str): + train_dataset[col + '__str'] = train_dataset[col + '__str'].astype(str) train_dataset = train_dataset.drop(col, axis=1) {% endif %} {% if test %} test_dataset[col + '__str'] = np.where(pd.to_numeric(test_dataset[col], errors='coerce').isnull(), test_dataset[col], np.nan) test_dataset[col + '__num'] = np.where(pd.to_numeric(test_dataset[col], errors='coerce').isnull(), np.nan, test_dataset[col]).astype(float) + if(col in datetime_astype_str): + test_dataset[col + '__str'] = test_dataset[col + '__str'].astype(str) test_dataset = test_dataset.drop(col, axis=1) {% endif %} From 5466bed34029498911227009004d57c9f2933243 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Wed, 25 Oct 2023 15:09:09 +0900 Subject: [PATCH 02/16] fix: Fix error caused when input data is mixed with datetime and string types Signed-off-by: tashiro akira --- sapientml_preprocess/generator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py index 9634c76..821fb5a 100644 --- a/sapientml_preprocess/generator.py +++ b/sapientml_preprocess/generator.py @@ -266,7 +266,6 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]: if(str(df[col].dtype) in "object" and infer_dtype(df[col],skipna=True) in "mixed" and not _is_date_column(df[col])): datetime_astype_str.append(col) cols_numeric_and_string.append(col) - cols_numeric_and_string.append(col) only_str = col + "__str" only_num = col + "__num" df[only_str] = np.where(pd.to_numeric(df[col], errors="coerce").isnull(), df[col], np.nan) From f4807c2b9033018beddc31a414354707c188bc35 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Thu, 26 Oct 2023 09:50:10 +0900 Subject: [PATCH 03/16] fix: Fix error caused when input data is mixed with datetime and string types Signed-off-by: tashiro akira --- sapientml_preprocess/generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py index 821fb5a..c5a50ab 100644 --- a/sapientml_preprocess/generator.py +++ b/sapientml_preprocess/generator.py @@ -26,7 +26,7 @@ from jinja2 import Environment, FileSystemLoader from pandas.api.types import infer_dtype from sapientml.generator import CodeBlockGenerator -from sapientml.params import Code, Dataset, Task, _is_date_colum +from sapientml.params import Code, Dataset, Task, _is_date_column from sapientml.util.logging import setup_logger from sapientml_preprocess.params import PreprocessConfig From 99a9d7ed2576a0d1545bdc36aaa47281940d91fb Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Wed, 8 Nov 2023 13:59:49 +0900 Subject: [PATCH 04/16] fix:Modifying Source Code Formatting Signed-off-by: tashiro akira --- sapientml_preprocess/generator.py | 38 +++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py index c5a50ab..638f381 100644 --- a/sapientml_preprocess/generator.py +++ b/sapientml_preprocess/generator.py @@ -263,7 +263,11 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]: datetime_astype_str = [] cols_numeric_and_string = [] for col in mix_typed_cols: - if(str(df[col].dtype) in "object" and infer_dtype(df[col],skipna=True) in "mixed" and not _is_date_column(df[col])): + if ( + str(df[col].dtype) in "object" + and infer_dtype(df[col], skipna=True) in "mixed" + and not _is_date_column(df[col]) + ): datetime_astype_str.append(col) cols_numeric_and_string.append(col) only_str = col + "__str" @@ -274,10 +278,34 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]: df = df.drop(col, axis=1) if cols_numeric_and_string: tpl = template_env.get_template("handle_mixed_typed_columns.py.jinja") - code.validation += _render(tpl, training=True, test=True, cols_numeric_and_string=cols_numeric_and_string,datetime_astype_str=datetime_astype_str) - code.test += _render(tpl, training=True, test=True, cols_numeric_and_string=cols_numeric_and_string,datetime_astype_str=datetime_astype_str) - code.train += _render(tpl, training=True, test=False, cols_numeric_and_string=cols_numeric_and_string,datetime_astype_str=datetime_astype_str) - code.predict += _render(tpl, training=False, test=True, cols_numeric_and_string=cols_numeric_and_string,datetime_astype_str=datetime_astype_str) + code.validation += _render( + tpl, + training=True, + test=True, + cols_numeric_and_string=cols_numeric_and_string, + datetime_astype_str=datetime_astype_str, + ) + code.test += _render( + tpl, + training=True, + test=True, + cols_numeric_and_string=cols_numeric_and_string, + datetime_astype_str=datetime_astype_str, + ) + code.train += _render( + tpl, + training=True, + test=False, + cols_numeric_and_string=cols_numeric_and_string, + datetime_astype_str=datetime_astype_str, + ) + code.predict += _render( + tpl, + training=False, + test=True, + cols_numeric_and_string=cols_numeric_and_string, + datetime_astype_str=datetime_astype_str, + ) # meta features must be calculated after replacing inf with nan, # becuase the replaced nan must be preprocessed in the generated code. From 812da985199f4c767b79ec7d80c87e70478b0650 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Mon, 4 Dec 2023 14:07:08 +0900 Subject: [PATCH 05/16] fix:Reflect Review Signed-off-by: tashiro akira --- sapientml_preprocess/generator.py | 19 ++++--------------- .../handle_mixed_typed_columns.py.jinja | 7 ++----- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py index 638f381..55a7eb6 100644 --- a/sapientml_preprocess/generator.py +++ b/sapientml_preprocess/generator.py @@ -260,15 +260,8 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]: # split a columns into 2 columns, one column has only numeric, another columns has only string # this operation should be done before calculating meta features mix_typed_cols = _confirm_mixed_type(df.drop(task.target_columns, axis=1)) - datetime_astype_str = [] cols_numeric_and_string = [] for col in mix_typed_cols: - if ( - str(df[col].dtype) in "object" - and infer_dtype(df[col], skipna=True) in "mixed" - and not _is_date_column(df[col]) - ): - datetime_astype_str.append(col) cols_numeric_and_string.append(col) only_str = col + "__str" only_num = col + "__num" @@ -282,29 +275,25 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]: tpl, training=True, test=True, - cols_numeric_and_string=cols_numeric_and_string, - datetime_astype_str=datetime_astype_str, + cols_numeric_and_string=cols_numeric_and_string ) code.test += _render( tpl, training=True, test=True, - cols_numeric_and_string=cols_numeric_and_string, - datetime_astype_str=datetime_astype_str, + cols_numeric_and_string=cols_numeric_and_string ) code.train += _render( tpl, training=True, test=False, - cols_numeric_and_string=cols_numeric_and_string, - datetime_astype_str=datetime_astype_str, + cols_numeric_and_string=cols_numeric_and_string ) code.predict += _render( tpl, training=False, test=True, - cols_numeric_and_string=cols_numeric_and_string, - datetime_astype_str=datetime_astype_str, + cols_numeric_and_string=cols_numeric_and_string ) # meta features must be calculated after replacing inf with nan, diff --git a/sapientml_preprocess/templates/handle_mixed_typed_columns.py.jinja b/sapientml_preprocess/templates/handle_mixed_typed_columns.py.jinja index d5a4d06..54d807d 100644 --- a/sapientml_preprocess/templates/handle_mixed_typed_columns.py.jinja +++ b/sapientml_preprocess/templates/handle_mixed_typed_columns.py.jinja @@ -1,19 +1,16 @@ # HANDLE MIXED TYPE import numpy as np cols_numeric_and_string = {{ cols_numeric_and_string}} -datetime_astype_str = {{ datetime_astype_str }} for col in cols_numeric_and_string: {% if training %} train_dataset[col + '__str'] = np.where(pd.to_numeric(train_dataset[col], errors='coerce').isnull(), train_dataset[col], np.nan) + train_dataset[col + '__str'] = np.where(train_dataset[col + '__str'].notnull(), train_dataset[col + '__str'].astype(str), np.nan) train_dataset[col + '__num'] = np.where(pd.to_numeric(train_dataset[col], errors='coerce').isnull(), np.nan, train_dataset[col]).astype(float) - if(col in datetime_astype_str): - train_dataset[col + '__str'] = train_dataset[col + '__str'].astype(str) train_dataset = train_dataset.drop(col, axis=1) {% endif %} {% if test %} test_dataset[col + '__str'] = np.where(pd.to_numeric(test_dataset[col], errors='coerce').isnull(), test_dataset[col], np.nan) + test_dataset[col + '__str'] = np.where(test_dataset[col + '__str'].notnull(), test_dataset[col + '__str'].astype(str), np.nan) test_dataset[col + '__num'] = np.where(pd.to_numeric(test_dataset[col], errors='coerce').isnull(), np.nan, test_dataset[col]).astype(float) - if(col in datetime_astype_str): - test_dataset[col + '__str'] = test_dataset[col + '__str'].astype(str) test_dataset = test_dataset.drop(col, axis=1) {% endif %} From 937880cc8c160c66c83a75f291bc956f4a862a9e Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Mon, 4 Dec 2023 14:11:44 +0900 Subject: [PATCH 06/16] fix: Remove Unnecessary Imports Signed-off-by: tashiro akira --- sapientml_preprocess/generator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py index 55a7eb6..3ae14d4 100644 --- a/sapientml_preprocess/generator.py +++ b/sapientml_preprocess/generator.py @@ -24,9 +24,8 @@ import pandas as pd import requests from jinja2 import Environment, FileSystemLoader -from pandas.api.types import infer_dtype from sapientml.generator import CodeBlockGenerator -from sapientml.params import Code, Dataset, Task, _is_date_column +from sapientml.params import Code, Dataset, Task from sapientml.util.logging import setup_logger from sapientml_preprocess.params import PreprocessConfig From 79e39e37746bd70c62e1c0c1e9b078e61d8d7274 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Fri, 8 Dec 2023 09:59:12 +0900 Subject: [PATCH 07/16] fix: Reflect the point Signed-off-by: tashiro akira --- sapientml_preprocess/generator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py index 3ae14d4..1d9042f 100644 --- a/sapientml_preprocess/generator.py +++ b/sapientml_preprocess/generator.py @@ -265,6 +265,7 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]: only_str = col + "__str" only_num = col + "__num" df[only_str] = np.where(pd.to_numeric(df[col], errors="coerce").isnull(), df[col], np.nan) + df[only_str] = np.where(df[only_str].notnull(), df[only_str].astype(str), np.nan) # without .astype(float), cannot recongnize as `int` or `float`, leading to generate inappropriate code snippet df[only_num] = np.where(pd.to_numeric(df[col], errors="coerce").isnull(), np.nan, df[col]).astype(float) df = df.drop(col, axis=1) From edd4d49a690f597b9bb74b08a9a81c6fc84e2556 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Tue, 12 Dec 2023 15:02:36 +0900 Subject: [PATCH 08/16] fix:Reflected review results Signed-off-by: tashiro akira --- sapientml_preprocess/generator.py | 9 +++++---- sapientml_preprocess/templates/rename_columns.py.jinja | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py index 1d9042f..ddc3ced 100644 --- a/sapientml_preprocess/generator.py +++ b/sapientml_preprocess/generator.py @@ -225,6 +225,7 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]: # Remove special symbols that interfere with visualization and model training cols_has_symbols = [] cols_has_symbols = check_cols_has_symbols(df.columns.to_list()) + cols_has_symbols_target = task.target_columns if cols_has_symbols: logger.warning( f"Symbols that inhibit training and visualization will be removed from column name {str(cols_has_symbols)}." @@ -234,10 +235,10 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]: remove_symbols(col) if col in cols_has_symbols else col for col in task.target_columns ] tpl = template_env.get_template("rename_columns.py.jinja") - code.validation += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols) - code.test += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols) - code.train += _render(tpl, training=True, test=False, cols_has_symbols=cols_has_symbols) - code.predict += _render(tpl, training=False, test=True, cols_has_symbols=cols_has_symbols) + code.validation += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, cols_has_symbols_target=cols_has_symbols_target) + code.test += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, cols_has_symbols_target=cols_has_symbols_target) + code.train += _render(tpl, training=True, test=False, cols_has_symbols=cols_has_symbols, cols_has_symbols_target=cols_has_symbols_target) + code.predict += _render(tpl, training=False, test=True, cols_has_symbols=cols_has_symbols, cols_has_symbols_target=cols_has_symbols_target) # handle list(tuple, dict) value in dataframe. # in generated scripts, visualisation will be executed before pre-processing such as handle mixed-type. diff --git a/sapientml_preprocess/templates/rename_columns.py.jinja b/sapientml_preprocess/templates/rename_columns.py.jinja index 7e21706..d0fd06c 100644 --- a/sapientml_preprocess/templates/rename_columns.py.jinja +++ b/sapientml_preprocess/templates/rename_columns.py.jinja @@ -1,6 +1,7 @@ # Remove special symbols that interfere with visualization and model training import re cols_has_symbols = {{ cols_has_symbols }} +cols_has_symbols_target = {{ cols_has_symbols_target }} inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+") {% if training %} train_dataset = train_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col) From e063795badef4c9da186ffe250b44f830690dd34 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Tue, 12 Dec 2023 15:30:21 +0900 Subject: [PATCH 09/16] style:Modified to fit the format Signed-off-by: tashiro akira --- sapientml_preprocess/generator.py | 60 ++++++++++++++++--------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py index ddc3ced..3bebf6f 100644 --- a/sapientml_preprocess/generator.py +++ b/sapientml_preprocess/generator.py @@ -235,10 +235,34 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]: remove_symbols(col) if col in cols_has_symbols else col for col in task.target_columns ] tpl = template_env.get_template("rename_columns.py.jinja") - code.validation += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, cols_has_symbols_target=cols_has_symbols_target) - code.test += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, cols_has_symbols_target=cols_has_symbols_target) - code.train += _render(tpl, training=True, test=False, cols_has_symbols=cols_has_symbols, cols_has_symbols_target=cols_has_symbols_target) - code.predict += _render(tpl, training=False, test=True, cols_has_symbols=cols_has_symbols, cols_has_symbols_target=cols_has_symbols_target) + code.validation += _render( + tpl, + training=True, + test=True, + cols_has_symbols=cols_has_symbols, + cols_has_symbols_target=cols_has_symbols_target, + ) + code.test += _render( + tpl, + training=True, + test=True, + cols_has_symbols=cols_has_symbols, + cols_has_symbols_target=cols_has_symbols_target, + ) + code.train += _render( + tpl, + training=True, + test=False, + cols_has_symbols=cols_has_symbols, + cols_has_symbols_target=cols_has_symbols_target, + ) + code.predict += _render( + tpl, + training=False, + test=True, + cols_has_symbols=cols_has_symbols, + cols_has_symbols_target=cols_has_symbols_target, + ) # handle list(tuple, dict) value in dataframe. # in generated scripts, visualisation will be executed before pre-processing such as handle mixed-type. @@ -272,30 +296,10 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]: df = df.drop(col, axis=1) if cols_numeric_and_string: tpl = template_env.get_template("handle_mixed_typed_columns.py.jinja") - code.validation += _render( - tpl, - training=True, - test=True, - cols_numeric_and_string=cols_numeric_and_string - ) - code.test += _render( - tpl, - training=True, - test=True, - cols_numeric_and_string=cols_numeric_and_string - ) - code.train += _render( - tpl, - training=True, - test=False, - cols_numeric_and_string=cols_numeric_and_string - ) - code.predict += _render( - tpl, - training=False, - test=True, - cols_numeric_and_string=cols_numeric_and_string - ) + code.validation += _render(tpl, training=True, test=True, cols_numeric_and_string=cols_numeric_and_string) + code.test += _render(tpl, training=True, test=True, cols_numeric_and_string=cols_numeric_and_string) + code.train += _render(tpl, training=True, test=False, cols_numeric_and_string=cols_numeric_and_string) + code.predict += _render(tpl, training=False, test=True, cols_numeric_and_string=cols_numeric_and_string) # meta features must be calculated after replacing inf with nan, # becuase the replaced nan must be preprocessed in the generated code. From 22be366c5b23f695d934301d2e8304bcc7d11724 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Fri, 22 Dec 2023 09:50:16 +0900 Subject: [PATCH 10/16] fix:Fixed to return column names in csv file Signed-off-by: tashiro akira --- sapientml_preprocess/generator.py | 33 +++---------------- .../templates/rename_columns.py.jinja | 3 +- 2 files changed, 6 insertions(+), 30 deletions(-) diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py index 3bebf6f..fe01dc6 100644 --- a/sapientml_preprocess/generator.py +++ b/sapientml_preprocess/generator.py @@ -225,7 +225,6 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]: # Remove special symbols that interfere with visualization and model training cols_has_symbols = [] cols_has_symbols = check_cols_has_symbols(df.columns.to_list()) - cols_has_symbols_target = task.target_columns if cols_has_symbols: logger.warning( f"Symbols that inhibit training and visualization will be removed from column name {str(cols_has_symbols)}." @@ -235,34 +234,10 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]: remove_symbols(col) if col in cols_has_symbols else col for col in task.target_columns ] tpl = template_env.get_template("rename_columns.py.jinja") - code.validation += _render( - tpl, - training=True, - test=True, - cols_has_symbols=cols_has_symbols, - cols_has_symbols_target=cols_has_symbols_target, - ) - code.test += _render( - tpl, - training=True, - test=True, - cols_has_symbols=cols_has_symbols, - cols_has_symbols_target=cols_has_symbols_target, - ) - code.train += _render( - tpl, - training=True, - test=False, - cols_has_symbols=cols_has_symbols, - cols_has_symbols_target=cols_has_symbols_target, - ) - code.predict += _render( - tpl, - training=False, - test=True, - cols_has_symbols=cols_has_symbols, - cols_has_symbols_target=cols_has_symbols_target, - ) + code.validation += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols) + code.test += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols) + code.train += _render(tpl, training=True, test=False, cols_has_symbols=cols_has_symbols) + code.predict += _render(tpl, training=False, test=True, cols_has_symbols=cols_has_symbols) # handle list(tuple, dict) value in dataframe. # in generated scripts, visualisation will be executed before pre-processing such as handle mixed-type. diff --git a/sapientml_preprocess/templates/rename_columns.py.jinja b/sapientml_preprocess/templates/rename_columns.py.jinja index d0fd06c..c3b056e 100644 --- a/sapientml_preprocess/templates/rename_columns.py.jinja +++ b/sapientml_preprocess/templates/rename_columns.py.jinja @@ -1,9 +1,10 @@ # Remove special symbols that interfere with visualization and model training import re cols_has_symbols = {{ cols_has_symbols }} -cols_has_symbols_target = {{ cols_has_symbols_target }} inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+") {% if training %} +rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col in cols_has_symbols for col in cols_has_symbols } +rename_symbol_cols = {v: k for k, v in rename_symbol_cols.items()} train_dataset = train_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col) {% endif %} {% if test %} From 3b62abbb81d1670db9e6c5de01745accfa75ab95 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Thu, 11 Jan 2024 15:38:29 +0900 Subject: [PATCH 11/16] fix:Reflected the content of the review Signed-off-by: tashiro akira --- sapientml_preprocess/templates/rename_columns.py.jinja | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sapientml_preprocess/templates/rename_columns.py.jinja b/sapientml_preprocess/templates/rename_columns.py.jinja index c3b056e..a1bd71e 100644 --- a/sapientml_preprocess/templates/rename_columns.py.jinja +++ b/sapientml_preprocess/templates/rename_columns.py.jinja @@ -2,11 +2,11 @@ import re cols_has_symbols = {{ cols_has_symbols }} inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+") -{% if training %} rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col in cols_has_symbols for col in cols_has_symbols } -rename_symbol_cols = {v: k for k, v in rename_symbol_cols.items()} -train_dataset = train_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col) +{% if training %} +train_dataset = train_dataset.rename(columns=rename_symbol_cols) {% endif %} {% if test %} -test_dataset = test_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col) -{% endif %} \ No newline at end of file +test_dataset = test_dataset.rename(columns=rename_symbol_cols) +{% endif %} +rename_symbol_cols = {v: k for k, v in rename_symbol_cols.items()} \ No newline at end of file From 6433d99bb36292a2f04d8884c25a4accb77a0ec3 Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Fri, 22 Mar 2024 16:11:15 +0900 Subject: [PATCH 12/16] fix:Reflected review results Signed-off-by: tashiro akira --- sapientml_preprocess/generator.py | 42 ++++++++++++++++--- .../templates/rename_columns.py.jinja | 8 +++- 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py index fe01dc6..ab5aa6b 100644 --- a/sapientml_preprocess/generator.py +++ b/sapientml_preprocess/generator.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import collections import os import re from pathlib import Path @@ -31,7 +32,7 @@ logger = setup_logger() -INHIBITED_SYMBOL_PATTERN = re.compile(r"[\{\}\[\]\",:<'\\]+") +INHIBITED_SYMBOL_PATTERN = re.compile(r"[\{\}\[\]\",:<'\\\+]+") template_env = Environment(loader=FileSystemLoader(f"{os.path.dirname(__file__)}/templates"), trim_blocks=True) @@ -229,15 +230,46 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]: logger.warning( f"Symbols that inhibit training and visualization will be removed from column name {str(cols_has_symbols)}." ) + org_df_column = df.columns.values + org_target_column = task.target_columns df = df.rename(columns=lambda col: remove_symbols(col) if col in cols_has_symbols else col) task.target_columns = [ remove_symbols(col) if col in cols_has_symbols else col for col in task.target_columns ] + same_column = {k: v for k, v in collections.Counter(list(df.columns.values)).items() if v > 1} + rename_dict = {} + if len(same_column) != 0: + for target in same_column.keys(): + rename_dict = {} + rename_target_col = [] + df_cols = list(df.columns.values) + i = 1 + for col in df_cols: + if target in col: + rename_dict[org_df_column[len(rename_dict)]] = str(col + str(i)) + i = i + 1 + else: + rename_dict[org_df_column[len(rename_dict)]] = col + df = df.set_axis(list(rename_dict.values()), axis=1) + i = 1 + for col in org_target_column: + rename_target_col.append(rename_dict[col]) + + task.target_columns = rename_target_col + tpl = template_env.get_template("rename_columns.py.jinja") - code.validation += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols) - code.test += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols) - code.train += _render(tpl, training=True, test=False, cols_has_symbols=cols_has_symbols) - code.predict += _render(tpl, training=False, test=True, cols_has_symbols=cols_has_symbols) + code.validation += _render( + tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict + ) + code.test += _render( + tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict + ) + code.train += _render( + tpl, training=True, test=False, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict + ) + code.predict += _render( + tpl, training=False, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict + ) # handle list(tuple, dict) value in dataframe. # in generated scripts, visualisation will be executed before pre-processing such as handle mixed-type. diff --git a/sapientml_preprocess/templates/rename_columns.py.jinja b/sapientml_preprocess/templates/rename_columns.py.jinja index a1bd71e..4f32c5d 100644 --- a/sapientml_preprocess/templates/rename_columns.py.jinja +++ b/sapientml_preprocess/templates/rename_columns.py.jinja @@ -1,8 +1,12 @@ # Remove special symbols that interfere with visualization and model training import re cols_has_symbols = {{ cols_has_symbols }} -inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+") -rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col in cols_has_symbols for col in cols_has_symbols } +rename_dict = {{ rename_dict }} +inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\\+]+") +if len(rename_dict) == 0 : + rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col in cols_has_symbols for col in cols_has_symbols } +else: + rename_symbol_cols = rename_dict {% if training %} train_dataset = train_dataset.rename(columns=rename_symbol_cols) {% endif %} From e1da54cda2cc03cd1fd448dd64a32b3e461e910a Mon Sep 17 00:00:00 2001 From: tashiro-akira Date: Mon, 22 Apr 2024 16:51:22 +0900 Subject: [PATCH 13/16] fix:Reflect Review Results Signed-off-by: tashiro-akira --- sapientml_preprocess/generator.py | 34 ++++++++----------- .../templates/rename_columns.py.jinja | 9 ++--- 2 files changed, 19 insertions(+), 24 deletions(-) diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py index ab5aa6b..9c654a4 100644 --- a/sapientml_preprocess/generator.py +++ b/sapientml_preprocess/generator.py @@ -231,31 +231,25 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]: f"Symbols that inhibit training and visualization will be removed from column name {str(cols_has_symbols)}." ) org_df_column = df.columns.values - org_target_column = task.target_columns + org_target_columns = task.target_columns + no_symbol_columns = [ col for col in df.columns.values if col not in cols_has_symbols ] df = df.rename(columns=lambda col: remove_symbols(col) if col in cols_has_symbols else col) task.target_columns = [ remove_symbols(col) if col in cols_has_symbols else col for col in task.target_columns ] - same_column = {k: v for k, v in collections.Counter(list(df.columns.values)).items() if v > 1} rename_dict = {} - if len(same_column) != 0: - for target in same_column.keys(): - rename_dict = {} - rename_target_col = [] - df_cols = list(df.columns.values) - i = 1 - for col in df_cols: - if target in col: - rename_dict[org_df_column[len(rename_dict)]] = str(col + str(i)) - i = i + 1 - else: - rename_dict[org_df_column[len(rename_dict)]] = col - df = df.set_axis(list(rename_dict.values()), axis=1) - i = 1 - for col in org_target_column: - rename_target_col.append(rename_dict[col]) - - task.target_columns = rename_target_col + if df.columns.duplicated().any() : + same_column = {k: v for k, v in collections.Counter(list(df.columns.values)).items() if v > 1 and k in no_symbol_columns} + for target, org_column in zip(df.columns.to_list(), org_df_column.tolist()): + if target in same_column.keys(): + rename_dict[org_column] = target + str(same_column[target] - 1) + same_column[target] = same_column[target] - 1 + else: + rename_dict[org_column] = target + + df = df.set_axis(list(rename_dict.values()), axis=1) + # task.target_columns = [col for col in list(rename_dict.values()) if task.target_columns in list(rename_dict.values())] + task.target_columns = [rename_dict[col] for col in org_target_columns] tpl = template_env.get_template("rename_columns.py.jinja") code.validation += _render( diff --git a/sapientml_preprocess/templates/rename_columns.py.jinja b/sapientml_preprocess/templates/rename_columns.py.jinja index 4f32c5d..62d45a8 100644 --- a/sapientml_preprocess/templates/rename_columns.py.jinja +++ b/sapientml_preprocess/templates/rename_columns.py.jinja @@ -3,10 +3,11 @@ import re cols_has_symbols = {{ cols_has_symbols }} rename_dict = {{ rename_dict }} inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\\+]+") -if len(rename_dict) == 0 : - rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col in cols_has_symbols for col in cols_has_symbols } -else: - rename_symbol_cols = rename_dict +{% if rename_dict %} +rename_symbol_cols = rename_dict +{% else %} +rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col in cols_has_symbols for col in cols_has_symbols } +{% endif %} {% if training %} train_dataset = train_dataset.rename(columns=rename_symbol_cols) {% endif %} From bde5ad92232711eb3e30aa063f8a59c19c9e2bb5 Mon Sep 17 00:00:00 2001 From: tashiro-akira Date: Thu, 25 Apr 2024 11:00:26 +0900 Subject: [PATCH 14/16] fix:Fixed error in running lint Signed-off-by: tashiro-akira --- sapientml_preprocess/generator.py | 13 ++++++++----- .../templates/rename_columns.py.jinja | 2 ++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py index 52399bc..efc4897 100644 --- a/sapientml_preprocess/generator.py +++ b/sapientml_preprocess/generator.py @@ -232,23 +232,26 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]: ) org_df_column = df.columns.values org_target_columns = task.target_columns - no_symbol_columns = [ col for col in df.columns.values if col not in cols_has_symbols ] + no_symbol_columns = [col for col in df.columns.values if col not in cols_has_symbols] df = df.rename(columns=lambda col: remove_symbols(col) if col in cols_has_symbols else col) task.target_columns = [ remove_symbols(col) if col in cols_has_symbols else col for col in task.target_columns ] rename_dict = {} - if df.columns.duplicated().any() : - same_column = {k: v for k, v in collections.Counter(list(df.columns.values)).items() if v > 1 and k in no_symbol_columns} + if df.columns.duplicated().any(): + same_column = { + k: v + for k, v in collections.Counter(list(df.columns.values)).items() + if v > 1 and k in no_symbol_columns + } for target, org_column in zip(df.columns.to_list(), org_df_column.tolist()): if target in same_column.keys(): rename_dict[org_column] = target + str(same_column[target] - 1) same_column[target] = same_column[target] - 1 else: rename_dict[org_column] = target - + df = df.set_axis(list(rename_dict.values()), axis=1) - # task.target_columns = [col for col in list(rename_dict.values()) if task.target_columns in list(rename_dict.values())] task.target_columns = [rename_dict[col] for col in org_target_columns] tpl = template_env.get_template("rename_columns.py.jinja") diff --git a/sapientml_preprocess/templates/rename_columns.py.jinja b/sapientml_preprocess/templates/rename_columns.py.jinja index 62d45a8..d6b540e 100644 --- a/sapientml_preprocess/templates/rename_columns.py.jinja +++ b/sapientml_preprocess/templates/rename_columns.py.jinja @@ -1,7 +1,9 @@ # Remove special symbols that interfere with visualization and model training import re cols_has_symbols = {{ cols_has_symbols }} +{% if rename_dict %} rename_dict = {{ rename_dict }} +{% endif %} inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\\+]+") {% if rename_dict %} rename_symbol_cols = rename_dict From 8c15e2151073132a461494e4983f6b8131accbb6 Mon Sep 17 00:00:00 2001 From: tashiro-akira Date: Thu, 25 Apr 2024 11:06:19 +0900 Subject: [PATCH 15/16] fix:Reflect Review Results Signed-off-by: tashiro-akira --- sapientml_preprocess/templates/rename_columns.py.jinja | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sapientml_preprocess/templates/rename_columns.py.jinja b/sapientml_preprocess/templates/rename_columns.py.jinja index d6b540e..45a5f62 100644 --- a/sapientml_preprocess/templates/rename_columns.py.jinja +++ b/sapientml_preprocess/templates/rename_columns.py.jinja @@ -1,12 +1,9 @@ # Remove special symbols that interfere with visualization and model training import re cols_has_symbols = {{ cols_has_symbols }} -{% if rename_dict %} -rename_dict = {{ rename_dict }} -{% endif %} inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\\+]+") {% if rename_dict %} -rename_symbol_cols = rename_dict +rename_symbol_cols = {{ rename_dict }} {% else %} rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col in cols_has_symbols for col in cols_has_symbols } {% endif %} From 1f62efd62a3222a72bd2741c283a813c51c64717 Mon Sep 17 00:00:00 2001 From: tashiro-akira Date: Thu, 25 Apr 2024 13:42:51 +0900 Subject: [PATCH 16/16] fix:Reflect Review Results Signed-off-by: tashiro-akira --- sapientml_preprocess/generator.py | 2 +- sapientml_preprocess/templates/rename_columns.py.jinja | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py index efc4897..41dc4b0 100644 --- a/sapientml_preprocess/generator.py +++ b/sapientml_preprocess/generator.py @@ -32,7 +32,7 @@ logger = setup_logger() -INHIBITED_SYMBOL_PATTERN = re.compile(r"[\{\}\[\]\",:<'\\\+]+") +INHIBITED_SYMBOL_PATTERN = re.compile(r"[\{\}\[\]\",:<'\\]+") template_env = Environment(loader=FileSystemLoader(f"{os.path.dirname(__file__)}/templates"), trim_blocks=True) diff --git a/sapientml_preprocess/templates/rename_columns.py.jinja b/sapientml_preprocess/templates/rename_columns.py.jinja index 45a5f62..846e637 100644 --- a/sapientml_preprocess/templates/rename_columns.py.jinja +++ b/sapientml_preprocess/templates/rename_columns.py.jinja @@ -1,7 +1,7 @@ # Remove special symbols that interfere with visualization and model training import re cols_has_symbols = {{ cols_has_symbols }} -inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\\+]+") +inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+") {% if rename_dict %} rename_symbol_cols = {{ rename_dict }} {% else %}