From 38c3aab8ecd41963a2d973291a581009efe2c275 Mon Sep 17 00:00:00 2001 From: dmikester1 Date: Fri, 18 Apr 2025 11:45:52 -0500 Subject: [PATCH] Added Windows support, multi-line insert import, multiple csv file creation, csv headers, other bug fixes --- README.md | 3 + mysqldump_to_csv.py | 170 +++++++++++++++++++++++++++++--------------- 2 files changed, 115 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 576baac..d0331a5 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,9 @@ is turned into the following CSV: 1,0,April,1,0,0,0.778582929065,20140312223924,20140312223929,4657771,20236,0 2,0,August,0,0,0,0.123830928525,20140312221818,20140312221822,4360163,11466,0 + +## UPDATES (4-18-25) +I've added support for Windows so it won't crash when trying to load in Windows. I added a fix to read multi-line inserts. I fixed an overflow error I was getting when setting the field_size_limit. I added functionality to create a single CSV for each table in the SQL file, CSV names being read dynamically from the SQL file. Also fixed it so it creates a separate row for each record in the CSV instead of all on one row. And finally added in headers to the CSV files also pulled dynamically from the SQL file. ## License The code is strung together from other public repos, I'm pretty sure the license is standard MIT License. 
diff --git a/mysqldump_to_csv.py b/mysqldump_to_csv.py index 6d53366..afcc5cb 100644 --- a/mysqldump_to_csv.py +++ b/mysqldump_to_csv.py @@ -2,20 +2,49 @@ import fileinput import csv import sys +import re # This prevents prematurely closed pipes from raising # an exception in Python -from signal import signal, SIGPIPE, SIG_DFL -signal(SIGPIPE, SIG_DFL) +import signal +import sys + +if hasattr(signal, 'SIGPIPE'): + signal.signal(signal.SIGPIPE, signal.SIG_DFL) + # allow large content in the dump -csv.field_size_limit(sys.maxsize) +# csv.field_size_limit(sys.maxsize) +csv.field_size_limit(2**31 - 1) + + +def extract_columns(line): + """ + Extracts column names from the INSERT INTO statement. + Returns a list of column names, or None if not found. + """ + match = re.search(r'INSERT INTO\s+`?\w+`?\s*\(([^)]+)\)', line, re.IGNORECASE) + if match: + cols = match.group(1) + return [col.strip().strip('`') for col in cols.split(',')] + return None + + +def extract_table_name(line): + """ + Extracts the table name from an INSERT INTO statement. + Handles backticks and spacing. + """ + match = re.search(r'INSERT INTO\s+`?(\w+)`?\s', line, re.IGNORECASE) + return match.group(1) if match else None + def is_insert(line): """ Returns true if the line begins a SQL insert statement. + Ignores leading whitespace and case. """ - return line.startswith('INSERT INTO') + return line.lstrip().upper().startswith('INSERT INTO') def get_values(line): @@ -37,73 +66,98 @@ def values_sanity_check(values): def parse_values(values, outfile): """ - Given a file handle and the raw values from a MySQL INSERT - statement, write the equivalent CSV to the file + Parses SQL INSERT values and writes clean CSV rows, removing single quotes around strings. 
""" - latest_row = [] + values = values.rstrip(';') + tuples = re.findall(r'\([^\)]*\)', values) + writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL) - reader = csv.reader([values], delimiter=',', - doublequote=False, - escapechar='\\', - quotechar="'", - strict=True - ) + for val in tuples: + # Remove outer parentheses + val = val.strip()[1:-1] - writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL) - for reader_row in reader: - for column in reader_row: - # If our current string is empty... - if len(column) == 0 or column == 'NULL': - latest_row.append(chr(0)) - continue - # If our string starts with an open paren - if column[0] == "(": - # If we've been filling out a row - if len(latest_row) > 0: - # Check if the previous entry ended in - # a close paren. If so, the row we've - # been filling out has been COMPLETED - # as: - # 1) the previous entry ended in a ) - # 2) the current entry starts with a ( - if latest_row[-1][-1] == ")": - # Remove the close paren. - latest_row[-1] = latest_row[-1][:-1] - writer.writerow(latest_row) - latest_row = [] - # If we're beginning a new row, eliminate the - # opening parentheses. - if len(latest_row) == 0: - column = column[1:] - # Add our column to the row we're working on. - latest_row.append(column) - # At the end of an INSERT statement, we'll - # have the semicolon. - # Make sure to remove the semicolon and - # the close paren. 
- if latest_row[-1][-2:] == ");": - latest_row[-1] = latest_row[-1][:-2] - writer.writerow(latest_row) + # Split respecting commas inside quotes + parts = [] + current = '' + in_quote = False + escape = False + + for char in val: + if escape: + current += char + escape = False + elif char == '\\': + escape = True + elif char == "'": + in_quote = not in_quote + current += char + elif char == ',' and not in_quote: + parts.append(current.strip()) + current = '' + else: + current += char + if current: + parts.append(current.strip()) + + # Clean each part: remove quotes, handle NULL + clean_row = [] + for col in parts: + if col.upper() == 'NULL': + clean_row.append('') + elif col.startswith("'") and col.endswith("'"): + # Strip surrounding quotes and unescape inner quotes + unquoted = col[1:-1].replace("\\'", "'").replace('\\\\', '\\') + clean_row.append(unquoted) + else: + clean_row.append(col) + + writer.writerow(clean_row) def main(): """ Parse arguments and start the program """ - # Iterate over all lines in all files - # listed in sys.argv[1:] - # or stdin if no args given. try: + written_tables = set() # keep track of tables we've written headers for + + buffer = '' for line in fileinput.input(): - # Look for an INSERT statement and parse it. 
- if not is_insert(line): - raise Exception("SQL INSERT statement could not be found!") - values = get_values(line) - if not values_sanity_check(values): - raise Exception("Getting substring of SQL INSERT statement after ' VALUES ' failed!") - parse_values(values, sys.stdout) + line = line.strip() + if not line or line.startswith('--') or line.startswith('/*'): + continue # skip comments and empty lines + + buffer += ' ' + line # accumulate SQL statement lines + + if line.endswith(';'): + if is_insert(buffer): + table_name = extract_table_name(buffer) + if not table_name: + raise Exception("Could not extract table name from INSERT statement!") + + columns = extract_columns(buffer) + if not columns: + raise Exception("Could not extract column names from INSERT statement!") + + values = get_values(buffer) + if not values_sanity_check(values): + raise Exception("Getting substring of SQL INSERT statement after ' VALUES ' failed!") + + # Open the CSV file and write header if needed + write_header = table_name not in written_tables + with open(f"{table_name}.csv", "a", newline='', encoding='utf-8') as outfile: + writer = csv.writer(outfile) + if write_header: + writer.writerow(columns) + written_tables.add(table_name) + + parse_values(values, outfile) + + + buffer = '' # clear buffer for next statement except KeyboardInterrupt: sys.exit(0) + if __name__ == "__main__": main()