
Commit 94aa5bb

Merge pull request #17 from Oxfordblue7/han_pipeline
Update PK/FK generation for data synthesizing.
2 parents 7a8552b + 293a13a

2 files changed (+35 additions, -13 deletions)


synthesize_data/example_data_avs.py

Lines changed: 15 additions & 7 deletions
```diff
@@ -46,6 +46,11 @@ def synthesize_column(col, dtype, n_rows):
         synth = np.random.normal(mu, sigma if sigma > 0 else 1, size=n_rows)
         return np.clip(synth, data.min(), data.max()).round(4)
 
+    elif dtype == "int":
+        min_val, max_val = int(col.min()), int(col.max())
+        values = np.random.randint(min_val, max_val + 1, n_rows, dtype=np.int64)
+        return pd.Series(values, dtype="int64")
+
     elif dtype == "category":
         freqs = col.value_counts(normalize=True)
         return np.random.choice(freqs.index, size=n_rows, p=freqs.values)
```
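For reference, the new `int` branch simply draws uniformly over the observed value range. A self-contained sketch of the same logic, with an invented sample column:

```python
import numpy as np
import pandas as pd

# Invented example column; only the sampling logic mirrors the commit.
col = pd.Series([3, 7, 7, 12, 5], dtype="int64")
n_rows = 8

min_val, max_val = int(col.min()), int(col.max())
# randint's upper bound is exclusive, hence max_val + 1 to include the max.
values = np.random.randint(min_val, max_val + 1, n_rows, dtype=np.int64)
print(pd.Series(values, dtype="int64"))  # eight draws, each in [3, 12]
```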
```diff
@@ -86,7 +91,9 @@ def synthesize_table(df, table_meta, fk_maps, n_rows=None, schema_table_names=None):
         dtype = col_meta["dtype"]
 
         if dtype == "primary_key":
-            new_ids = [str(uuid.uuid4()) for _ in range(n_rows)]
+            # new_ids = [str(uuid.uuid4()) for _ in range(n_rows)]
+            new_ids = np.random.randint(int(df[col].values.min()), int(df[col].values.max()),
+                                        n_rows, dtype=np.int64)
             synthetic_df[col] = new_ids
             # Map original IDs (cycled if needed)
             orig_ids = np.resize(df[col].values, n_rows)
```
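One behavioral change worth flagging: `uuid.uuid4()` produced unique IDs by construction, whereas `np.random.randint(min, max, n_rows)` can repeat values, and its exclusive upper bound means the original maximum ID is never drawn. If downstream code relies on distinct primary keys, a collision-free variant could look like this sketch (the helper is illustrative, not part of the commit):

```python
import numpy as np

def unique_int_ids(lo: int, hi: int, n_rows: int) -> np.ndarray:
    """Draw n_rows distinct int64 IDs from the inclusive range [lo, hi]."""
    if hi - lo + 1 < n_rows:
        raise ValueError("ID range too narrow to avoid collisions")
    # Sampling without replacement guarantees uniqueness; materializing
    # the full range is acceptable for moderate ID spans.
    pool = np.arange(lo, hi + 1, dtype=np.int64)
    return np.random.choice(pool, size=n_rows, replace=False)
```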
```diff
@@ -98,7 +105,10 @@ def synthesize_table(df, table_meta, fk_maps, n_rows=None, schema_table_names=None):
 
         if (ref_table not in schema_table_names) or (ref_key not in fk_maps):
             # If no mapping available, synthesize fresh IDs
-            synthetic_df[col] = [str(uuid.uuid4()) for _ in range(n_rows)]
+            # synthetic_df[col] = [str(uuid.uuid4()) for _ in range(n_rows)]
+            synthetic_df[col] = np.random.randint(int(df[col].values.min()),
+                                                  int(df[col].values.max()), n_rows,
+                                                  dtype=np.int64)
         else:
             ref_map = fk_maps[ref_key]
             orig_vals = np.resize(df[col].values, n_rows)
```
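The hunk ends just inside the `else` branch; presumably the following (unchanged) lines push `orig_vals` through `ref_map` so synthesized foreign keys point at synthesized primary keys. A toy illustration of that remapping pattern (all names and values invented):

```python
import numpy as np

ref_map = {101: 5001, 102: 5002, 103: 5003}          # original PK -> synthetic PK
orig_vals = np.resize(np.array([101, 102, 103]), 5)  # cycle originals up to n_rows=5
remapped = np.array([ref_map[v] for v in orig_vals])
print(remapped)  # [5001 5002 5003 5001 5002]
```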
```diff
@@ -126,7 +136,6 @@ def synthesize_database(datapath, meta_file, size_config=None):
     for table_name, table_meta in list(remaining.items()):
         print("Processing table", table_name)
         df = load_raw_table(datapath, table_meta["source"], table_meta["format"])
-        # print("\t", df)
         n_rows = size_config.get(table_name, len(df)) if size_config else len(df)
         synth_df, id_map = synthesize_table(df, table_meta, fk_maps, n_rows, remaining)
         synthetic_dfs[table_name] = (synth_df, table_meta["source"])
```
```diff
@@ -141,15 +150,12 @@ def synthesize_database(datapath, meta_file, size_config=None):
     for split in ["train", "validation", "test"]:
         source = task_meta["source"].replace("{split}", split)
         if os.path.exists(os.path.join(datapath, source)):
-            print("Processing tasks", source)
             df = load_raw_table(datapath, source, task_meta["format"])
-            # print("\t", df)
             task_name = f"{task_meta['name']}_{split}"
             n_rows = size_config.get(task_name, len(df)) if size_config else len(df)
             synth_df, _ = synthesize_table(df, task_meta, fk_maps, n_rows, remaining)
             synthetic_dfs[task_name] = (synth_df, source)
 
-    # print(">", synthetic_dfs)
     return synthetic_dfs
 
 
```

```diff
@@ -176,11 +182,13 @@ def main_avs(args):
     # copy metadata.yaml file
     shutil.copyfile(os.path.join(input_path, "metadata.yaml"),
                     os.path.join(output_path, "metadata.yaml"))
+    shutil.copyfile(os.path.join(input_path, "information.txt"),
+                    os.path.join(output_path, "information.txt"))
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument("--data-path", type=str, default='/data/datasets/avs/',
+    parser.add_argument("--data-path", type=str, default='./data/datasets/avs/',
                         help=("The path to the root of avs dataset, and where the "
                               "synthetic avs data to be saved."))
     args = parser.parse_args()
```
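With the new relative default, the script can be run from a repository checkout without editing paths, e.g. `python synthesize_data/example_data_avs.py --data-path ./data/datasets/avs/` (assuming the working directory is the repo root; the invocation itself is not part of the commit).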

synthesize_data/example_data_mag.py

Lines changed: 20 additions & 6 deletions
```diff
@@ -63,11 +63,18 @@ def synthesize_column(col, dtype, n_rows):
             return np.zeros(n_rows)
         mu, sigma = np.mean(data), np.std(data)
         synth = np.random.normal(mu, sigma if sigma > 0 else 1, size=n_rows)
-        return np.clip(synth, data.min(), data.max()).round(4)
+        values = np.clip(synth, data.min(), data.max()).round(4)
+        return pd.Series(values, dtype="float32")
+
+    elif dtype == "int":
+        min_val, max_val = int(col.min()), int(col.max())
+        values = np.random.randint(min_val, max_val + 1, n_rows, dtype=np.int64)
+        return pd.Series(values, dtype="int64")
 
     elif dtype == "category":
         freqs = col.value_counts(normalize=True)
-        return np.random.choice(freqs.index, size=n_rows, p=freqs.values)
+        values = np.random.choice(freqs.index, size=n_rows, p=freqs.values)
+        return pd.Series(values, dtype="category")
 
     elif dtype == "datetime":
         col = pd.to_datetime(col, errors="coerce").dropna()
```
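Throughout this file the change is the same: return a typed `pd.Series` instead of a raw array or list, so the column dtype is pinned at the source rather than left to pandas' inference when the caller assigns into `synthetic_df`. A small illustration for the category branch (example data invented):

```python
import numpy as np
import pandas as pd

freqs = pd.Series(["a", "a", "b"]).value_counts(normalize=True)
values = np.random.choice(freqs.index, size=4, p=freqs.values)

# Raw array: pandas infers a generic object column.
inferred = pd.DataFrame({"c": values})["c"].dtype
# Typed Series: the category dtype survives assignment.
pinned = pd.DataFrame({"c": pd.Series(values, dtype="category")})["c"].dtype
print(inferred, pinned)  # object category
```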
```diff
@@ -76,10 +83,15 @@ def synthesize_column(col, dtype, n_rows):
         min_date, max_date = col.min(), col.max()
         delta = (max_date - min_date).days
         random_days = np.random.randint(0, delta + 1, size=n_rows)
-        return [min_date + pd.Timedelta(days=int(d)) for d in random_days]
+        values = [min_date + pd.Timedelta(days=int(d)) for d in random_days]
+        return pd.Series(values, dtype="datetime64[ns]")
 
     else:
-        return np.random.choice(col.dropna(), size=n_rows)
+        uniq = col.astype(str).unique()
+        if len(uniq) == 0:
+            uniq = ["value0"]
+        values = np.random.choice(uniq, size=n_rows, replace=True)
+        return pd.Series(values, dtype="object")
 
 def create_missing_table(col_name, fk_values, n_rows=100):
     """ Create a synthetic table for missing foreign-key references. """
```
```diff
@@ -103,7 +115,8 @@ def synthesize_table(df, table_meta, fk_maps, n_rows=None, schema_table_names=None):
         dtype = col_meta["dtype"]
 
         if dtype == "primary_key":
-            new_ids = [str(uuid.uuid4()) for _ in range(n_rows)]
+            # new_ids = [str(uuid.uuid4()) for _ in range(n_rows)]
+            new_ids = np.random.randint(int(df[col].values.min()), int(df[col].values.max()), n_rows, dtype=np.int64)
             synthetic_df[col] = new_ids
             # Map original IDs (cycled if needed)
             orig_ids = np.resize(df[col].values, n_rows)
```
```diff
@@ -115,7 +128,8 @@ def synthesize_table(df, table_meta, fk_maps, n_rows=None, schema_table_names=None):
 
         if (ref_table not in schema_table_names) or (ref_key not in fk_maps):
             # If no mapping available, synthesize fresh IDs
-            synthetic_df[col] = [str(uuid.uuid4()) for _ in range(n_rows)]
+            # synthetic_df[col] = [str(uuid.uuid4()) for _ in range(n_rows)]
+            synthetic_df[col] = np.random.randint(int(df[col].values.min()), int(df[col].values.max()), n_rows, dtype=np.int64)
         else:
             ref_map = fk_maps[ref_key]
             orig_vals = np.resize(df[col].values, n_rows)
```
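A closing note that applies to both files: the fallback branch fabricates foreign keys independently of the primary keys synthesized elsewhere, so those random IDs match actual referenced rows only by coincidence; exact PK-to-FK consistency is preserved only on the `else` path, where `ref_map` remaps the resized original values.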
