@@ -46,6 +46,11 @@ def synthesize_column(col, dtype, n_rows):
         synth = np.random.normal(mu, sigma if sigma > 0 else 1, size=n_rows)
         return np.clip(synth, data.min(), data.max()).round(4)
 
+    elif dtype == "int":
+        min_val, max_val = int(col.min()), int(col.max())
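+        # np.random.randint's high bound is exclusive, so max_val + 1 keeps max_val reachable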
+        values = np.random.randint(min_val, max_val + 1, n_rows, dtype=np.int64)
+        return pd.Series(values, dtype="int64")
+
     elif dtype == "category":
         freqs = col.value_counts(normalize=True)
         return np.random.choice(freqs.index, size=n_rows, p=freqs.values)
@@ -86,7 +91,9 @@ def synthesize_table(df, table_meta, fk_maps, n_rows=None, schema_table_names=No
         dtype = col_meta["dtype"]
 
         if dtype == "primary_key":
-            new_ids = [str(uuid.uuid4()) for _ in range(n_rows)]
+            # new_ids = [str(uuid.uuid4()) for _ in range(n_rows)]
+            new_ids = np.random.randint(int(df[col].values.min()), int(df[col].values.max()),
+                                        n_rows, dtype=np.int64)
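+            # Caveat: randint samples with replacement and its high bound is exclusive,
+            # so these IDs can collide and the original max ID is never produced.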
             synthetic_df[col] = new_ids
             # Map original IDs (cycled if needed)
             orig_ids = np.resize(df[col].values, n_rows)
@@ -98,7 +105,10 @@ def synthesize_table(df, table_meta, fk_maps, n_rows=None, schema_table_names=No
 
             if (ref_table not in schema_table_names) or (ref_key not in fk_maps):
                 # If no mapping is available, synthesize fresh IDs
-                synthetic_df[col] = [str(uuid.uuid4()) for _ in range(n_rows)]
+                # synthetic_df[col] = [str(uuid.uuid4()) for _ in range(n_rows)]
+                synthetic_df[col] = np.random.randint(int(df[col].values.min()),
+                                                      int(df[col].values.max()), n_rows,
+                                                      dtype=np.int64)
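+                # (same caveat as the primary-key branch: random integer IDs may repeat)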
             else:
                 ref_map = fk_maps[ref_key]
                 orig_vals = np.resize(df[col].values, n_rows)
@@ -126,7 +136,6 @@ def synthesize_database(datapath, meta_file, size_config=None):
     for table_name, table_meta in list(remaining.items()):
         print("Processing table", table_name)
         df = load_raw_table(datapath, table_meta["source"], table_meta["format"])
-        # print("\t", df)
         n_rows = size_config.get(table_name, len(df)) if size_config else len(df)
         synth_df, id_map = synthesize_table(df, table_meta, fk_maps, n_rows, remaining)
         synthetic_dfs[table_name] = (synth_df, table_meta["source"])
@@ -141,15 +150,12 @@ def synthesize_database(datapath, meta_file, size_config=None):
         for split in ["train", "validation", "test"]:
             source = task_meta["source"].replace("{split}", split)
             if os.path.exists(os.path.join(datapath, source)):
-                print("Processing tasks", source)
                 df = load_raw_table(datapath, source, task_meta["format"])
-                # print("\t", df)
                 task_name = f"{task_meta['name']}_{split}"
                 n_rows = size_config.get(task_name, len(df)) if size_config else len(df)
                 synth_df, _ = synthesize_table(df, task_meta, fk_maps, n_rows, remaining)
                 synthetic_dfs[task_name] = (synth_df, source)
 
-    # print(">", synthetic_dfs)
     return synthetic_dfs
 
 
@@ -176,11 +182,13 @@ def main_avs(args):
     # copy metadata.yaml file
     shutil.copyfile(os.path.join(input_path, "metadata.yaml"),
                     os.path.join(output_path, "metadata.yaml"))
+    shutil.copyfile(os.path.join(input_path, "information.txt"),
+                    os.path.join(output_path, "information.txt"))
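+    # assumes information.txt exists in input_path; shutil.copyfile raises
+    # FileNotFoundError if it does not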
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument("--data-path", type=str, default='/data/datasets/avs/',
+    parser.add_argument("--data-path", type=str, default='./data/datasets/avs/',
                         help=("The path to the root of the avs dataset, and where the "
                               "synthetic avs data will be saved."))
     args = parser.parse_args()
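
One caveat with the randint-based IDs above: they are sampled with replacement, so duplicates are possible. If unique synthetic primary keys are needed, a minimal sketch is below, assuming the observed ID range is at least n_rows wide; make_unique_ids is a hypothetical helper, not part of this diff.

import numpy as np

def make_unique_ids(min_val, max_val, n_rows, seed=None):
    """Draw n_rows distinct int64 IDs from the inclusive range [min_val, max_val]."""
    span = max_val - min_val + 1
    if n_rows > span:
        raise ValueError("observed ID range is too small to produce unique IDs")
    rng = np.random.default_rng(seed)
    # sampling without replacement guarantees uniqueness (may allocate the
    # full range internally, so this is a sketch for moderate spans only)
    return (rng.choice(span, size=n_rows, replace=False) + min_val).astype(np.int64)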