From 55786bc50f92999e514b47f9df55bd36a9405550 Mon Sep 17 00:00:00 2001
From: tashiro akira <fj1755jk@fujitsu.com>
Date: Wed, 25 Oct 2023 14:48:46 +0900
Subject: [PATCH 01/16] fix: Fix error caused when input data is mixed with
 datetime and string types

Signed-off-by: tashiro akira <fj1755jk@fujitsu.com>
---
 sapientml_preprocess/generator.py                 | 15 ++++++++++-----
 .../templates/handle_mixed_typed_columns.py.jinja |  5 +++++
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py
index 92815f8..9634c76 100644
--- a/sapientml_preprocess/generator.py
+++ b/sapientml_preprocess/generator.py
@@ -24,8 +24,9 @@
 import pandas as pd
 import requests
 from jinja2 import Environment, FileSystemLoader
+from pandas.api.types import infer_dtype
 from sapientml.generator import CodeBlockGenerator
-from sapientml.params import Code, Dataset, Task
+from sapientml.params import Code, Dataset, Task, _is_date_colum
 from sapientml.util.logging import setup_logger
 from sapientml_preprocess.params import PreprocessConfig
 
@@ -259,8 +260,12 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
         # split a columns into 2 columns, one column has only numeric, another columns has only string
         # this operation should be done before calculating meta features
         mix_typed_cols = _confirm_mixed_type(df.drop(task.target_columns, axis=1))
+        datetime_astype_str = []
         cols_numeric_and_string = []
         for col in mix_typed_cols:
+            if(str(df[col].dtype) in "object" and infer_dtype(df[col],skipna=True) in "mixed" and not _is_date_column(df[col])):
+                datetime_astype_str.append(col)
+            cols_numeric_and_string.append(col)
             cols_numeric_and_string.append(col)
             only_str = col + "__str"
             only_num = col + "__num"
@@ -270,10 +275,10 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
             df = df.drop(col, axis=1)
         if cols_numeric_and_string:
             tpl = template_env.get_template("handle_mixed_typed_columns.py.jinja")
-            code.validation += _render(tpl, training=True, test=True, cols_numeric_and_string=cols_numeric_and_string)
-            code.test += _render(tpl, training=True, test=True, cols_numeric_and_string=cols_numeric_and_string)
-            code.train += _render(tpl, training=True, test=False, cols_numeric_and_string=cols_numeric_and_string)
-            code.predict += _render(tpl, training=False, test=True, cols_numeric_and_string=cols_numeric_and_string)
+            code.validation += _render(tpl, training=True, test=True, cols_numeric_and_string=cols_numeric_and_string,datetime_astype_str=datetime_astype_str)
+            code.test += _render(tpl, training=True, test=True, cols_numeric_and_string=cols_numeric_and_string,datetime_astype_str=datetime_astype_str)
+            code.train += _render(tpl, training=True, test=False, cols_numeric_and_string=cols_numeric_and_string,datetime_astype_str=datetime_astype_str)
+            code.predict += _render(tpl, training=False, test=True, cols_numeric_and_string=cols_numeric_and_string,datetime_astype_str=datetime_astype_str)
 
         # meta features must be calculated after replacing inf with nan,
         # becuase the replaced nan must be preprocessed in the generated code.
diff --git a/sapientml_preprocess/templates/handle_mixed_typed_columns.py.jinja b/sapientml_preprocess/templates/handle_mixed_typed_columns.py.jinja
index 5e9b3d3..d5a4d06 100644
--- a/sapientml_preprocess/templates/handle_mixed_typed_columns.py.jinja
+++ b/sapientml_preprocess/templates/handle_mixed_typed_columns.py.jinja
@@ -1,14 +1,19 @@
 # HANDLE MIXED TYPE
 import numpy as np
 cols_numeric_and_string = {{ cols_numeric_and_string}}
+datetime_astype_str = {{ datetime_astype_str }}
 for col in cols_numeric_and_string:
 {% if training %}
     train_dataset[col + '__str'] = np.where(pd.to_numeric(train_dataset[col], errors='coerce').isnull(), train_dataset[col], np.nan)
     train_dataset[col + '__num'] = np.where(pd.to_numeric(train_dataset[col], errors='coerce').isnull(), np.nan, train_dataset[col]).astype(float)
+    if(col in datetime_astype_str):
+        train_dataset[col + '__str'] = train_dataset[col + '__str'].astype(str)
     train_dataset = train_dataset.drop(col, axis=1)
 {% endif %}
 {% if test %}
     test_dataset[col + '__str'] = np.where(pd.to_numeric(test_dataset[col], errors='coerce').isnull(), test_dataset[col], np.nan)
     test_dataset[col + '__num'] = np.where(pd.to_numeric(test_dataset[col], errors='coerce').isnull(), np.nan, test_dataset[col]).astype(float)
+    if(col in datetime_astype_str):
+        test_dataset[col + '__str'] = test_dataset[col + '__str'].astype(str)
     test_dataset = test_dataset.drop(col, axis=1)
 {% endif %}

From 5466bed34029498911227009004d57c9f2933243 Mon Sep 17 00:00:00 2001
From: tashiro akira <fj1755jk@fujitsu.com>
Date: Wed, 25 Oct 2023 15:09:09 +0900
Subject: [PATCH 02/16] fix: Fix error caused when input data is mixed with
 datetime and string types

Signed-off-by: tashiro akira <fj1755jk@fujitsu.com>
---
 sapientml_preprocess/generator.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py
index 9634c76..821fb5a 100644
--- a/sapientml_preprocess/generator.py
+++ b/sapientml_preprocess/generator.py
@@ -266,7 +266,6 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
             if(str(df[col].dtype) in "object" and infer_dtype(df[col],skipna=True) in "mixed" and not _is_date_column(df[col])):
                 datetime_astype_str.append(col)
             cols_numeric_and_string.append(col)
-            cols_numeric_and_string.append(col)
             only_str = col + "__str"
             only_num = col + "__num"
             df[only_str] = np.where(pd.to_numeric(df[col], errors="coerce").isnull(), df[col], np.nan)

From f4807c2b9033018beddc31a414354707c188bc35 Mon Sep 17 00:00:00 2001
From: tashiro akira <fj1755jk@fujitsu.com>
Date: Thu, 26 Oct 2023 09:50:10 +0900
Subject: [PATCH 03/16] fix: Fix error caused when input data is mixed with
 datetime and string types

Signed-off-by: tashiro akira <fj1755jk@fujitsu.com>
---
 sapientml_preprocess/generator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py
index 821fb5a..c5a50ab 100644
--- a/sapientml_preprocess/generator.py
+++ b/sapientml_preprocess/generator.py
@@ -26,7 +26,7 @@
 from jinja2 import Environment, FileSystemLoader
 from pandas.api.types import infer_dtype
 from sapientml.generator import CodeBlockGenerator
-from sapientml.params import Code, Dataset, Task, _is_date_colum
+from sapientml.params import Code, Dataset, Task, _is_date_column
 from sapientml.util.logging import setup_logger
 from sapientml_preprocess.params import PreprocessConfig
 

From 99a9d7ed2576a0d1545bdc36aaa47281940d91fb Mon Sep 17 00:00:00 2001
From: tashiro akira <fj1755jk@fujitsu.com>
Date: Wed, 8 Nov 2023 13:59:49 +0900
Subject: [PATCH 04/16] fix: Modify source code formatting

Signed-off-by: tashiro akira <fj1755jk@fujitsu.com>
---
 sapientml_preprocess/generator.py | 38 +++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py
index c5a50ab..638f381 100644
--- a/sapientml_preprocess/generator.py
+++ b/sapientml_preprocess/generator.py
@@ -263,7 +263,11 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
         datetime_astype_str = []
         cols_numeric_and_string = []
         for col in mix_typed_cols:
-            if(str(df[col].dtype) in "object" and infer_dtype(df[col],skipna=True) in "mixed" and not _is_date_column(df[col])):
+            if (
+                str(df[col].dtype) in "object"
+                and infer_dtype(df[col], skipna=True) in "mixed"
+                and not _is_date_column(df[col])
+            ):
                 datetime_astype_str.append(col)
             cols_numeric_and_string.append(col)
             only_str = col + "__str"
@@ -274,10 +278,34 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
             df = df.drop(col, axis=1)
         if cols_numeric_and_string:
             tpl = template_env.get_template("handle_mixed_typed_columns.py.jinja")
-            code.validation += _render(tpl, training=True, test=True, cols_numeric_and_string=cols_numeric_and_string,datetime_astype_str=datetime_astype_str)
-            code.test += _render(tpl, training=True, test=True, cols_numeric_and_string=cols_numeric_and_string,datetime_astype_str=datetime_astype_str)
-            code.train += _render(tpl, training=True, test=False, cols_numeric_and_string=cols_numeric_and_string,datetime_astype_str=datetime_astype_str)
-            code.predict += _render(tpl, training=False, test=True, cols_numeric_and_string=cols_numeric_and_string,datetime_astype_str=datetime_astype_str)
+            code.validation += _render(
+                tpl,
+                training=True,
+                test=True,
+                cols_numeric_and_string=cols_numeric_and_string,
+                datetime_astype_str=datetime_astype_str,
+            )
+            code.test += _render(
+                tpl,
+                training=True,
+                test=True,
+                cols_numeric_and_string=cols_numeric_and_string,
+                datetime_astype_str=datetime_astype_str,
+            )
+            code.train += _render(
+                tpl,
+                training=True,
+                test=False,
+                cols_numeric_and_string=cols_numeric_and_string,
+                datetime_astype_str=datetime_astype_str,
+            )
+            code.predict += _render(
+                tpl,
+                training=False,
+                test=True,
+                cols_numeric_and_string=cols_numeric_and_string,
+                datetime_astype_str=datetime_astype_str,
+            )
 
         # meta features must be calculated after replacing inf with nan,
         # becuase the replaced nan must be preprocessed in the generated code.

From 812da985199f4c767b79ec7d80c87e70478b0650 Mon Sep 17 00:00:00 2001
From: tashiro akira <fj1755jk@fujitsu.com>
Date: Mon, 4 Dec 2023 14:07:08 +0900
Subject: [PATCH 05/16] fix: Reflect review feedback

Signed-off-by: tashiro akira <fj1755jk@fujitsu.com>
---
 sapientml_preprocess/generator.py             | 19 ++++---------------
 .../handle_mixed_typed_columns.py.jinja       |  7 ++-----
 2 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py
index 638f381..55a7eb6 100644
--- a/sapientml_preprocess/generator.py
+++ b/sapientml_preprocess/generator.py
@@ -260,15 +260,8 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
         # split a columns into 2 columns, one column has only numeric, another columns has only string
         # this operation should be done before calculating meta features
         mix_typed_cols = _confirm_mixed_type(df.drop(task.target_columns, axis=1))
-        datetime_astype_str = []
         cols_numeric_and_string = []
         for col in mix_typed_cols:
-            if (
-                str(df[col].dtype) in "object"
-                and infer_dtype(df[col], skipna=True) in "mixed"
-                and not _is_date_column(df[col])
-            ):
-                datetime_astype_str.append(col)
             cols_numeric_and_string.append(col)
             only_str = col + "__str"
             only_num = col + "__num"
@@ -282,29 +275,25 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
                 tpl,
                 training=True,
                 test=True,
-                cols_numeric_and_string=cols_numeric_and_string,
-                datetime_astype_str=datetime_astype_str,
+                cols_numeric_and_string=cols_numeric_and_string
             )
             code.test += _render(
                 tpl,
                 training=True,
                 test=True,
-                cols_numeric_and_string=cols_numeric_and_string,
-                datetime_astype_str=datetime_astype_str,
+                cols_numeric_and_string=cols_numeric_and_string
             )
             code.train += _render(
                 tpl,
                 training=True,
                 test=False,
-                cols_numeric_and_string=cols_numeric_and_string,
-                datetime_astype_str=datetime_astype_str,
+                cols_numeric_and_string=cols_numeric_and_string
             )
             code.predict += _render(
                 tpl,
                 training=False,
                 test=True,
-                cols_numeric_and_string=cols_numeric_and_string,
-                datetime_astype_str=datetime_astype_str,
+                cols_numeric_and_string=cols_numeric_and_string
             )
 
         # meta features must be calculated after replacing inf with nan,
diff --git a/sapientml_preprocess/templates/handle_mixed_typed_columns.py.jinja b/sapientml_preprocess/templates/handle_mixed_typed_columns.py.jinja
index d5a4d06..54d807d 100644
--- a/sapientml_preprocess/templates/handle_mixed_typed_columns.py.jinja
+++ b/sapientml_preprocess/templates/handle_mixed_typed_columns.py.jinja
@@ -1,19 +1,16 @@
 # HANDLE MIXED TYPE
 import numpy as np
 cols_numeric_and_string = {{ cols_numeric_and_string}}
-datetime_astype_str = {{ datetime_astype_str }}
 for col in cols_numeric_and_string:
 {% if training %}
     train_dataset[col + '__str'] = np.where(pd.to_numeric(train_dataset[col], errors='coerce').isnull(), train_dataset[col], np.nan)
+    train_dataset[col + '__str'] = np.where(train_dataset[col + '__str'].notnull(), train_dataset[col + '__str'].astype(str), np.nan)
     train_dataset[col + '__num'] = np.where(pd.to_numeric(train_dataset[col], errors='coerce').isnull(), np.nan, train_dataset[col]).astype(float)
-    if(col in datetime_astype_str):
-        train_dataset[col + '__str'] = train_dataset[col + '__str'].astype(str)
     train_dataset = train_dataset.drop(col, axis=1)
 {% endif %}
 {% if test %}
     test_dataset[col + '__str'] = np.where(pd.to_numeric(test_dataset[col], errors='coerce').isnull(), test_dataset[col], np.nan)
+    test_dataset[col + '__str'] = np.where(test_dataset[col + '__str'].notnull(), test_dataset[col + '__str'].astype(str), np.nan)
     test_dataset[col + '__num'] = np.where(pd.to_numeric(test_dataset[col], errors='coerce').isnull(), np.nan, test_dataset[col]).astype(float)
-    if(col in datetime_astype_str):
-        test_dataset[col + '__str'] = test_dataset[col + '__str'].astype(str)
     test_dataset = test_dataset.drop(col, axis=1)
 {% endif %}

From 937880cc8c160c66c83a75f291bc956f4a862a9e Mon Sep 17 00:00:00 2001
From: tashiro akira <fj1755jk@fujitsu.com>
Date: Mon, 4 Dec 2023 14:11:44 +0900
Subject: [PATCH 06/16] fix: Remove Unnecessary Imports

Signed-off-by: tashiro akira <fj1755jk@fujitsu.com>
---
 sapientml_preprocess/generator.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py
index 55a7eb6..3ae14d4 100644
--- a/sapientml_preprocess/generator.py
+++ b/sapientml_preprocess/generator.py
@@ -24,9 +24,8 @@
 import pandas as pd
 import requests
 from jinja2 import Environment, FileSystemLoader
-from pandas.api.types import infer_dtype
 from sapientml.generator import CodeBlockGenerator
-from sapientml.params import Code, Dataset, Task, _is_date_column
+from sapientml.params import Code, Dataset, Task
 from sapientml.util.logging import setup_logger
 from sapientml_preprocess.params import PreprocessConfig
 

From 79e39e37746bd70c62e1c0c1e9b078e61d8d7274 Mon Sep 17 00:00:00 2001
From: tashiro akira <fj1755jk@fujitsu.com>
Date: Fri, 8 Dec 2023 09:59:12 +0900
Subject: [PATCH 07/16] fix: Reflect the point

Signed-off-by: tashiro akira <fj1755jk@fujitsu.com>
---
 sapientml_preprocess/generator.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py
index 3ae14d4..1d9042f 100644
--- a/sapientml_preprocess/generator.py
+++ b/sapientml_preprocess/generator.py
@@ -265,6 +265,7 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
             only_str = col + "__str"
             only_num = col + "__num"
             df[only_str] = np.where(pd.to_numeric(df[col], errors="coerce").isnull(), df[col], np.nan)
+            df[only_str] = np.where(df[only_str].notnull(), df[only_str].astype(str), np.nan)
             # without .astype(float), cannot recongnize as `int` or `float`, leading to generate inappropriate code snippet
             df[only_num] = np.where(pd.to_numeric(df[col], errors="coerce").isnull(), np.nan, df[col]).astype(float)
             df = df.drop(col, axis=1)

From edd4d49a690f597b9bb74b08a9a81c6fc84e2556 Mon Sep 17 00:00:00 2001
From: tashiro akira <fj1755jk@fujitsu.com>
Date: Tue, 12 Dec 2023 15:02:36 +0900
Subject: [PATCH 08/16] fix: Reflect review results

Signed-off-by: tashiro akira <fj1755jk@fujitsu.com>
---
 sapientml_preprocess/generator.py                      | 9 +++++----
 sapientml_preprocess/templates/rename_columns.py.jinja | 1 +
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py
index 1d9042f..ddc3ced 100644
--- a/sapientml_preprocess/generator.py
+++ b/sapientml_preprocess/generator.py
@@ -225,6 +225,7 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
         # Remove special symbols that interfere with visualization and model training
         cols_has_symbols = []
         cols_has_symbols = check_cols_has_symbols(df.columns.to_list())
+        cols_has_symbols_target = task.target_columns
         if cols_has_symbols:
             logger.warning(
                 f"Symbols that inhibit training and visualization will be removed from column name {str(cols_has_symbols)}."
@@ -234,10 +235,10 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
                 remove_symbols(col) if col in cols_has_symbols else col for col in task.target_columns
             ]
             tpl = template_env.get_template("rename_columns.py.jinja")
-            code.validation += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols)
-            code.test += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols)
-            code.train += _render(tpl, training=True, test=False, cols_has_symbols=cols_has_symbols)
-            code.predict += _render(tpl, training=False, test=True, cols_has_symbols=cols_has_symbols)
+            code.validation += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, cols_has_symbols_target=cols_has_symbols_target)
+            code.test += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, cols_has_symbols_target=cols_has_symbols_target)
+            code.train += _render(tpl, training=True, test=False, cols_has_symbols=cols_has_symbols, cols_has_symbols_target=cols_has_symbols_target)
+            code.predict += _render(tpl, training=False, test=True, cols_has_symbols=cols_has_symbols, cols_has_symbols_target=cols_has_symbols_target)
 
         # handle list(tuple, dict) value in dataframe.
         # in generated scripts, visualisation will be executed before pre-processing such as handle mixed-type.
diff --git a/sapientml_preprocess/templates/rename_columns.py.jinja b/sapientml_preprocess/templates/rename_columns.py.jinja
index 7e21706..d0fd06c 100644
--- a/sapientml_preprocess/templates/rename_columns.py.jinja
+++ b/sapientml_preprocess/templates/rename_columns.py.jinja
@@ -1,6 +1,7 @@
 # Remove special symbols that interfere with visualization and model training
 import re
 cols_has_symbols = {{ cols_has_symbols }}
+cols_has_symbols_target = {{ cols_has_symbols_target }}
 inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+")
 {% if training %}
 train_dataset = train_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col)

From e063795badef4c9da186ffe250b44f830690dd34 Mon Sep 17 00:00:00 2001
From: tashiro akira <fj1755jk@fujitsu.com>
Date: Tue, 12 Dec 2023 15:30:21 +0900
Subject: [PATCH 09/16] style: Modify code to fit the format

Signed-off-by: tashiro akira <fj1755jk@fujitsu.com>
---
 sapientml_preprocess/generator.py | 60 ++++++++++++++++---------------
 1 file changed, 32 insertions(+), 28 deletions(-)

diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py
index ddc3ced..3bebf6f 100644
--- a/sapientml_preprocess/generator.py
+++ b/sapientml_preprocess/generator.py
@@ -235,10 +235,34 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
                 remove_symbols(col) if col in cols_has_symbols else col for col in task.target_columns
             ]
             tpl = template_env.get_template("rename_columns.py.jinja")
-            code.validation += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, cols_has_symbols_target=cols_has_symbols_target)
-            code.test += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, cols_has_symbols_target=cols_has_symbols_target)
-            code.train += _render(tpl, training=True, test=False, cols_has_symbols=cols_has_symbols, cols_has_symbols_target=cols_has_symbols_target)
-            code.predict += _render(tpl, training=False, test=True, cols_has_symbols=cols_has_symbols, cols_has_symbols_target=cols_has_symbols_target)
+            code.validation += _render(
+                tpl,
+                training=True,
+                test=True,
+                cols_has_symbols=cols_has_symbols,
+                cols_has_symbols_target=cols_has_symbols_target,
+            )
+            code.test += _render(
+                tpl,
+                training=True,
+                test=True,
+                cols_has_symbols=cols_has_symbols,
+                cols_has_symbols_target=cols_has_symbols_target,
+            )
+            code.train += _render(
+                tpl,
+                training=True,
+                test=False,
+                cols_has_symbols=cols_has_symbols,
+                cols_has_symbols_target=cols_has_symbols_target,
+            )
+            code.predict += _render(
+                tpl,
+                training=False,
+                test=True,
+                cols_has_symbols=cols_has_symbols,
+                cols_has_symbols_target=cols_has_symbols_target,
+            )
 
         # handle list(tuple, dict) value in dataframe.
         # in generated scripts, visualisation will be executed before pre-processing such as handle mixed-type.
@@ -272,30 +296,10 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
             df = df.drop(col, axis=1)
         if cols_numeric_and_string:
             tpl = template_env.get_template("handle_mixed_typed_columns.py.jinja")
-            code.validation += _render(
-                tpl,
-                training=True,
-                test=True,
-                cols_numeric_and_string=cols_numeric_and_string
-            )
-            code.test += _render(
-                tpl,
-                training=True,
-                test=True,
-                cols_numeric_and_string=cols_numeric_and_string
-            )
-            code.train += _render(
-                tpl,
-                training=True,
-                test=False,
-                cols_numeric_and_string=cols_numeric_and_string
-            )
-            code.predict += _render(
-                tpl,
-                training=False,
-                test=True,
-                cols_numeric_and_string=cols_numeric_and_string
-            )
+            code.validation += _render(tpl, training=True, test=True, cols_numeric_and_string=cols_numeric_and_string)
+            code.test += _render(tpl, training=True, test=True, cols_numeric_and_string=cols_numeric_and_string)
+            code.train += _render(tpl, training=True, test=False, cols_numeric_and_string=cols_numeric_and_string)
+            code.predict += _render(tpl, training=False, test=True, cols_numeric_and_string=cols_numeric_and_string)
 
         # meta features must be calculated after replacing inf with nan,
         # becuase the replaced nan must be preprocessed in the generated code.

From 22be366c5b23f695d934301d2e8304bcc7d11724 Mon Sep 17 00:00:00 2001
From: tashiro akira <fj1755jk@fujitsu.com>
Date: Fri, 22 Dec 2023 09:50:16 +0900
Subject: [PATCH 10/16] fix: Return column names in CSV file

Signed-off-by: tashiro akira <fj1755jk@fujitsu.com>
---
 sapientml_preprocess/generator.py             | 33 +++----------------
 .../templates/rename_columns.py.jinja         |  3 +-
 2 files changed, 6 insertions(+), 30 deletions(-)

diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py
index 3bebf6f..fe01dc6 100644
--- a/sapientml_preprocess/generator.py
+++ b/sapientml_preprocess/generator.py
@@ -225,7 +225,6 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
         # Remove special symbols that interfere with visualization and model training
         cols_has_symbols = []
         cols_has_symbols = check_cols_has_symbols(df.columns.to_list())
-        cols_has_symbols_target = task.target_columns
         if cols_has_symbols:
             logger.warning(
                 f"Symbols that inhibit training and visualization will be removed from column name {str(cols_has_symbols)}."
@@ -235,34 +234,10 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
                 remove_symbols(col) if col in cols_has_symbols else col for col in task.target_columns
             ]
             tpl = template_env.get_template("rename_columns.py.jinja")
-            code.validation += _render(
-                tpl,
-                training=True,
-                test=True,
-                cols_has_symbols=cols_has_symbols,
-                cols_has_symbols_target=cols_has_symbols_target,
-            )
-            code.test += _render(
-                tpl,
-                training=True,
-                test=True,
-                cols_has_symbols=cols_has_symbols,
-                cols_has_symbols_target=cols_has_symbols_target,
-            )
-            code.train += _render(
-                tpl,
-                training=True,
-                test=False,
-                cols_has_symbols=cols_has_symbols,
-                cols_has_symbols_target=cols_has_symbols_target,
-            )
-            code.predict += _render(
-                tpl,
-                training=False,
-                test=True,
-                cols_has_symbols=cols_has_symbols,
-                cols_has_symbols_target=cols_has_symbols_target,
-            )
+            code.validation += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols)
+            code.test += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols)
+            code.train += _render(tpl, training=True, test=False, cols_has_symbols=cols_has_symbols)
+            code.predict += _render(tpl, training=False, test=True, cols_has_symbols=cols_has_symbols)
 
         # handle list(tuple, dict) value in dataframe.
         # in generated scripts, visualisation will be executed before pre-processing such as handle mixed-type.
diff --git a/sapientml_preprocess/templates/rename_columns.py.jinja b/sapientml_preprocess/templates/rename_columns.py.jinja
index d0fd06c..c3b056e 100644
--- a/sapientml_preprocess/templates/rename_columns.py.jinja
+++ b/sapientml_preprocess/templates/rename_columns.py.jinja
@@ -1,9 +1,10 @@
 # Remove special symbols that interfere with visualization and model training
 import re
 cols_has_symbols = {{ cols_has_symbols }}
-cols_has_symbols_target = {{ cols_has_symbols_target }}
 inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+")
 {% if training %}
+rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col in cols_has_symbols for col in cols_has_symbols }
+rename_symbol_cols = {v: k for k, v in rename_symbol_cols.items()}
 train_dataset = train_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col)
 {% endif %}
 {% if test %}

From 3b62abbb81d1670db9e6c5de01745accfa75ab95 Mon Sep 17 00:00:00 2001
From: tashiro akira <fj1755jk@fujitsu.com>
Date: Thu, 11 Jan 2024 15:38:29 +0900
Subject: [PATCH 11/16] fix: Reflect the content of the review

Signed-off-by: tashiro akira <fj1755jk@fujitsu.com>
---
 sapientml_preprocess/templates/rename_columns.py.jinja | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sapientml_preprocess/templates/rename_columns.py.jinja b/sapientml_preprocess/templates/rename_columns.py.jinja
index c3b056e..a1bd71e 100644
--- a/sapientml_preprocess/templates/rename_columns.py.jinja
+++ b/sapientml_preprocess/templates/rename_columns.py.jinja
@@ -2,11 +2,11 @@
 import re
 cols_has_symbols = {{ cols_has_symbols }}
 inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+")
-{% if training %}
 rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col in cols_has_symbols for col in cols_has_symbols }
-rename_symbol_cols = {v: k for k, v in rename_symbol_cols.items()}
-train_dataset = train_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col)
+{% if training %}
+train_dataset = train_dataset.rename(columns=rename_symbol_cols)
 {% endif %}
 {% if test %}
-test_dataset = test_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col)
-{% endif %}
\ No newline at end of file
+test_dataset = test_dataset.rename(columns=rename_symbol_cols)
+{% endif %}
+rename_symbol_cols = {v: k for k, v in rename_symbol_cols.items()}
\ No newline at end of file

From 6433d99bb36292a2f04d8884c25a4accb77a0ec3 Mon Sep 17 00:00:00 2001
From: tashiro akira <fj1755jk@fujitsu.com>
Date: Fri, 22 Mar 2024 16:11:15 +0900
Subject: [PATCH 12/16] fix: Reflect review results

Signed-off-by: tashiro akira <fj1755jk@fujitsu.com>
---
 sapientml_preprocess/generator.py             | 42 ++++++++++++++++---
 .../templates/rename_columns.py.jinja         |  8 +++-
 2 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py
index fe01dc6..ab5aa6b 100644
--- a/sapientml_preprocess/generator.py
+++ b/sapientml_preprocess/generator.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import collections
 import os
 import re
 from pathlib import Path
@@ -31,7 +32,7 @@
 
 logger = setup_logger()
 
-INHIBITED_SYMBOL_PATTERN = re.compile(r"[\{\}\[\]\",:<'\\]+")
+INHIBITED_SYMBOL_PATTERN = re.compile(r"[\{\}\[\]\",:<'\\\+]+")
 
 
 template_env = Environment(loader=FileSystemLoader(f"{os.path.dirname(__file__)}/templates"), trim_blocks=True)
@@ -229,15 +230,46 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
             logger.warning(
                 f"Symbols that inhibit training and visualization will be removed from column name {str(cols_has_symbols)}."
             )
+            org_df_column = df.columns.values
+            org_target_column = task.target_columns
             df = df.rename(columns=lambda col: remove_symbols(col) if col in cols_has_symbols else col)
             task.target_columns = [
                 remove_symbols(col) if col in cols_has_symbols else col for col in task.target_columns
             ]
+            same_column = {k: v for k, v in collections.Counter(list(df.columns.values)).items() if v > 1}
+            rename_dict = {}
+            if len(same_column) != 0:
+                for target in same_column.keys():
+                    rename_dict = {}
+                    rename_target_col = []
+                    df_cols = list(df.columns.values)
+                    i = 1
+                    for col in df_cols:
+                        if target in col:
+                            rename_dict[org_df_column[len(rename_dict)]] = str(col + str(i))
+                            i = i + 1
+                        else:
+                            rename_dict[org_df_column[len(rename_dict)]] = col
+                    df = df.set_axis(list(rename_dict.values()), axis=1)
+                    i = 1
+                    for col in org_target_column:
+                        rename_target_col.append(rename_dict[col])
+
+                    task.target_columns = rename_target_col
+
             tpl = template_env.get_template("rename_columns.py.jinja")
-            code.validation += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols)
-            code.test += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols)
-            code.train += _render(tpl, training=True, test=False, cols_has_symbols=cols_has_symbols)
-            code.predict += _render(tpl, training=False, test=True, cols_has_symbols=cols_has_symbols)
+            code.validation += _render(
+                tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict
+            )
+            code.test += _render(
+                tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict
+            )
+            code.train += _render(
+                tpl, training=True, test=False, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict
+            )
+            code.predict += _render(
+                tpl, training=False, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict
+            )
 
         # handle list(tuple, dict) value in dataframe.
         # in generated scripts, visualisation will be executed before pre-processing such as handle mixed-type.
diff --git a/sapientml_preprocess/templates/rename_columns.py.jinja b/sapientml_preprocess/templates/rename_columns.py.jinja
index a1bd71e..4f32c5d 100644
--- a/sapientml_preprocess/templates/rename_columns.py.jinja
+++ b/sapientml_preprocess/templates/rename_columns.py.jinja
@@ -1,8 +1,12 @@
 # Remove special symbols that interfere with visualization and model training
 import re
 cols_has_symbols = {{ cols_has_symbols }}
-inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+")
-rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col in cols_has_symbols for col in cols_has_symbols }
+rename_dict = {{ rename_dict }}
+inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\\+]+")
+if len(rename_dict) == 0 :
+    rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col in cols_has_symbols for col in cols_has_symbols }
+else:
+    rename_symbol_cols = rename_dict
 {% if training %}
 train_dataset = train_dataset.rename(columns=rename_symbol_cols)
 {% endif %}

From e1da54cda2cc03cd1fd448dd64a32b3e461e910a Mon Sep 17 00:00:00 2001
From: tashiro-akira <fj0822cr@fujitsu.com>
Date: Mon, 22 Apr 2024 16:51:22 +0900
Subject: [PATCH 13/16] fix: Reflect review results (simplify
 duplicate-column renaming)

Signed-off-by: tashiro-akira <fj0822cr@fujitsu.com>
---
 sapientml_preprocess/generator.py             | 34 ++++++++-----------
 .../templates/rename_columns.py.jinja         |  9 ++---
 2 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py
index ab5aa6b..9c654a4 100644
--- a/sapientml_preprocess/generator.py
+++ b/sapientml_preprocess/generator.py
@@ -231,31 +231,25 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
                 f"Symbols that inhibit training and visualization will be removed from column name {str(cols_has_symbols)}."
             )
             org_df_column = df.columns.values
-            org_target_column = task.target_columns
+            org_target_columns = task.target_columns
+            no_symbol_columns = [ col for col in df.columns.values if col not in cols_has_symbols ]
             df = df.rename(columns=lambda col: remove_symbols(col) if col in cols_has_symbols else col)
             task.target_columns = [
                 remove_symbols(col) if col in cols_has_symbols else col for col in task.target_columns
             ]
-            same_column = {k: v for k, v in collections.Counter(list(df.columns.values)).items() if v > 1}
             rename_dict = {}
-            if len(same_column) != 0:
-                for target in same_column.keys():
-                    rename_dict = {}
-                    rename_target_col = []
-                    df_cols = list(df.columns.values)
-                    i = 1
-                    for col in df_cols:
-                        if target in col:
-                            rename_dict[org_df_column[len(rename_dict)]] = str(col + str(i))
-                            i = i + 1
-                        else:
-                            rename_dict[org_df_column[len(rename_dict)]] = col
-                    df = df.set_axis(list(rename_dict.values()), axis=1)
-                    i = 1
-                    for col in org_target_column:
-                        rename_target_col.append(rename_dict[col])
-
-                    task.target_columns = rename_target_col
+            if df.columns.duplicated().any() :
+                same_column = {k: v for k, v in collections.Counter(list(df.columns.values)).items() if v > 1 and k in no_symbol_columns}
+                for target, org_column in zip(df.columns.to_list(), org_df_column.tolist()):
+                    if target in same_column.keys():
+                        rename_dict[org_column] = target + str(same_column[target] - 1)
+                        same_column[target] = same_column[target] - 1
+                    else:
+                        rename_dict[org_column] = target
+                    
+                df = df.set_axis(list(rename_dict.values()), axis=1)
+                # task.target_columns = [col for col in list(rename_dict.values()) if task.target_columns in list(rename_dict.values())]
+                task.target_columns = [rename_dict[col] for col in org_target_columns]
 
             tpl = template_env.get_template("rename_columns.py.jinja")
             code.validation += _render(
diff --git a/sapientml_preprocess/templates/rename_columns.py.jinja b/sapientml_preprocess/templates/rename_columns.py.jinja
index 4f32c5d..62d45a8 100644
--- a/sapientml_preprocess/templates/rename_columns.py.jinja
+++ b/sapientml_preprocess/templates/rename_columns.py.jinja
@@ -3,10 +3,11 @@ import re
 cols_has_symbols = {{ cols_has_symbols }}
 rename_dict = {{ rename_dict }}
 inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\\+]+")
-if len(rename_dict) == 0 :
-    rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col in cols_has_symbols for col in cols_has_symbols }
-else:
-    rename_symbol_cols = rename_dict
+{% if rename_dict %}
+rename_symbol_cols = rename_dict
+{% else %}
+rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col in cols_has_symbols for col in cols_has_symbols }
+{% endif %}
 {% if training %}
 train_dataset = train_dataset.rename(columns=rename_symbol_cols)
 {% endif %}

From bde5ad92232711eb3e30aa063f8a59c19c9e2bb5 Mon Sep 17 00:00:00 2001
From: tashiro-akira <fj0822cr@fujitsu.com>
Date: Thu, 25 Apr 2024 11:00:26 +0900
Subject: [PATCH 14/16] fix: Fix errors reported by lint

Signed-off-by: tashiro-akira <fj0822cr@fujitsu.com>
---
 sapientml_preprocess/generator.py                   | 13 ++++++++-----
 .../templates/rename_columns.py.jinja               |  2 ++
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py
index 52399bc..efc4897 100644
--- a/sapientml_preprocess/generator.py
+++ b/sapientml_preprocess/generator.py
@@ -232,23 +232,26 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
             )
             org_df_column = df.columns.values
             org_target_columns = task.target_columns
-            no_symbol_columns = [ col for col in df.columns.values if col not in cols_has_symbols ]
+            no_symbol_columns = [col for col in df.columns.values if col not in cols_has_symbols]
             df = df.rename(columns=lambda col: remove_symbols(col) if col in cols_has_symbols else col)
             task.target_columns = [
                 remove_symbols(col) if col in cols_has_symbols else col for col in task.target_columns
             ]
             rename_dict = {}
-            if df.columns.duplicated().any() :
-                same_column = {k: v for k, v in collections.Counter(list(df.columns.values)).items() if v > 1 and k in no_symbol_columns}
+            if df.columns.duplicated().any():
+                same_column = {
+                    k: v
+                    for k, v in collections.Counter(list(df.columns.values)).items()
+                    if v > 1 and k in no_symbol_columns
+                }
                 for target, org_column in zip(df.columns.to_list(), org_df_column.tolist()):
                     if target in same_column.keys():
                         rename_dict[org_column] = target + str(same_column[target] - 1)
                         same_column[target] = same_column[target] - 1
                     else:
                         rename_dict[org_column] = target
-                    
+
                 df = df.set_axis(list(rename_dict.values()), axis=1)
-                # task.target_columns = [col for col in list(rename_dict.values()) if task.target_columns in list(rename_dict.values())]
                 task.target_columns = [rename_dict[col] for col in org_target_columns]
 
             tpl = template_env.get_template("rename_columns.py.jinja")
diff --git a/sapientml_preprocess/templates/rename_columns.py.jinja b/sapientml_preprocess/templates/rename_columns.py.jinja
index 62d45a8..d6b540e 100644
--- a/sapientml_preprocess/templates/rename_columns.py.jinja
+++ b/sapientml_preprocess/templates/rename_columns.py.jinja
@@ -1,7 +1,9 @@
 # Remove special symbols that interfere with visualization and model training
 import re
 cols_has_symbols = {{ cols_has_symbols }}
+{% if rename_dict %}
 rename_dict = {{ rename_dict }}
+{% endif %}
 inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\\+]+")
 {% if rename_dict %}
 rename_symbol_cols = rename_dict

From 8c15e2151073132a461494e4983f6b8131accbb6 Mon Sep 17 00:00:00 2001
From: tashiro-akira <fj0822cr@fujitsu.com>
Date: Thu, 25 Apr 2024 11:06:19 +0900
Subject: [PATCH 15/16] fix: Reflect review results (inline rename_dict
 into rename_symbol_cols)

Signed-off-by: tashiro-akira <fj0822cr@fujitsu.com>
---
 sapientml_preprocess/templates/rename_columns.py.jinja | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/sapientml_preprocess/templates/rename_columns.py.jinja b/sapientml_preprocess/templates/rename_columns.py.jinja
index d6b540e..45a5f62 100644
--- a/sapientml_preprocess/templates/rename_columns.py.jinja
+++ b/sapientml_preprocess/templates/rename_columns.py.jinja
@@ -1,12 +1,9 @@
 # Remove special symbols that interfere with visualization and model training
 import re
 cols_has_symbols = {{ cols_has_symbols }}
-{% if rename_dict %}
-rename_dict = {{ rename_dict }}
-{% endif %}
 inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\\+]+")
 {% if rename_dict %}
-rename_symbol_cols = rename_dict
+rename_symbol_cols = {{ rename_dict }}
 {% else %}
 rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col in cols_has_symbols for col in cols_has_symbols }
 {% endif %}

From 1f62efd62a3222a72bd2741c283a813c51c64717 Mon Sep 17 00:00:00 2001
From: tashiro-akira <fj0822cr@fujitsu.com>
Date: Thu, 25 Apr 2024 13:42:51 +0900
Subject: [PATCH 16/16] fix: Reflect review results (drop '+' from the
 inhibited symbol pattern)

Signed-off-by: tashiro-akira <fj0822cr@fujitsu.com>
---
 sapientml_preprocess/generator.py                      | 2 +-
 sapientml_preprocess/templates/rename_columns.py.jinja | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/sapientml_preprocess/generator.py b/sapientml_preprocess/generator.py
index efc4897..41dc4b0 100644
--- a/sapientml_preprocess/generator.py
+++ b/sapientml_preprocess/generator.py
@@ -32,7 +32,7 @@
 
 logger = setup_logger()
 
-INHIBITED_SYMBOL_PATTERN = re.compile(r"[\{\}\[\]\",:<'\\\+]+")
+INHIBITED_SYMBOL_PATTERN = re.compile(r"[\{\}\[\]\",:<'\\]+")
 
 
 template_env = Environment(loader=FileSystemLoader(f"{os.path.dirname(__file__)}/templates"), trim_blocks=True)
diff --git a/sapientml_preprocess/templates/rename_columns.py.jinja b/sapientml_preprocess/templates/rename_columns.py.jinja
index 45a5f62..846e637 100644
--- a/sapientml_preprocess/templates/rename_columns.py.jinja
+++ b/sapientml_preprocess/templates/rename_columns.py.jinja
@@ -1,7 +1,7 @@
 # Remove special symbols that interfere with visualization and model training
 import re
 cols_has_symbols = {{ cols_has_symbols }}
-inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\\+]+")
+inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+")
 {% if rename_dict %}
 rename_symbol_cols = {{ rename_dict }}
 {% else %}