From 941f9589907ce936ccb3081e28504b4153cb2984 Mon Sep 17 00:00:00 2001 From: Wolf Behrenhoff Date: Wed, 7 Nov 2018 15:34:59 +0100 Subject: [PATCH 1/2] read tables with umlauts correctly --- pandas_access/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas_access/__init__.py b/pandas_access/__init__.py index 9fffccb..07bd20e 100644 --- a/pandas_access/__init__.py +++ b/pandas_access/__init__.py @@ -14,16 +14,16 @@ DEF_RE = re.compile("\s*\[(\w+)\]\s*(.*?),") -def list_tables(rdb_file, encoding="latin-1"): +def list_tables(rdb_file, encoding="utf-8"): """ :param rdb_file: The MS Access database file. - :param encoding: The content encoding of the output. I assume `latin-1` - because so many of MS files have that encoding. But, MDBTools may - actually be UTF-8. + :param encoding: The content encoding of the output. MDBTools + print the output in UTF-8. :return: A list of the tables in a given database. """ - tables = subprocess.check_output(['mdb-tables', rdb_file]).decode(encoding) - return tables.strip().split(" ") + # We use -1 (one table name per line) to support stange table names + tables = subprocess.check_output(['mdb-tables', '-1', rdb_file]).decode(encoding) + return tables.split("\n") def _extract_dtype(data_type): From 4db95e8ebeb9a9ee1e786377a355a2e861312e98 Mon Sep 17 00:00:00 2001 From: Wolf Behrenhoff Date: Fri, 9 Nov 2018 13:53:18 +0100 Subject: [PATCH 2/2] Introduce MdbTable and MdbColumn classes & add tests When dealing with data types where no 1:1 mapping to np.type is possible, the code needs to have more possibilities to modify column properties. In particular, this commit addresses - DateTime columns (exported as ISO 8601 string, column index put into parse_csv's parse_dates argument) - Integer & Long Integer columns which may be NULL and promotion rules into np.float_ (either all int columns or the ones not marked "NOT NULL") - There were no tests at all. 
This commit introduces a few basic tests for column parsing. --- pandas_access/__init__.py | 224 ++++++++++++++++++++++++---------- pandas_access/test_parsing.py | 54 ++++++++ setup.py | 4 +- 3 files changed, 218 insertions(+), 64 deletions(-) create mode 100644 pandas_access/test_parsing.py diff --git a/pandas_access/__init__.py b/pandas_access/__init__.py index 07bd20e..1a6b3dc 100644 --- a/pandas_access/__init__.py +++ b/pandas_access/__init__.py @@ -11,7 +11,131 @@ TABLE_RE = re.compile("CREATE TABLE \[(\w+)\]\s+\((.*?\));", re.MULTILINE | re.DOTALL) -DEF_RE = re.compile("\s*\[(\w+)\]\s*(.*?),") + +class MdbTable: + """ A MdbTable is basically a list of MdbColumns with some added + functionality. + :param name: Name of the table + """ + def __init__(self, name): + self._name = name + # array instead of dict to preserve the order + self._columns = [] + + def update_dtypes(self, newDtypes): + """ sets the dtype manually to the given types + :param newDtypes: a dictionary {columnName: newDtype} + """ + for c in self._columns: + if c.get_name() in newDtypes: + c.set_dtype(newDtypes[c.get_name()]) + + def get_dtypes(self, promote=None): + """ return a dictionary of {columnName: dataType} + :param promote: see MdbColumn.get_dtype + """ + return {c.get_name(): c.get_dtype(promote) for c in self._columns} + + def date_field_indices(self): + """ returns the column indices of all datetime fields """ + result = [] + for idx, col in enumerate(self._columns): + if col.is_datetime(): + result.append(idx) + return result + + def parse_columns(self, defs_str, implicit_string=True): + """ + Initialize the columns of the table from a schema definition string + created by mdb-schema. The defs_str needs to look like: + [FieldA] Text (100) NOT NULL, + [FieldB] DateTime NOT NULL + ... + Even though the table name can be included in the defs_str, the table + name will NOT be altered by this function. 
+ """ + defs = [] + lines = defs_str.splitlines() + for line in lines: + col = MdbColumn.try_parse_schema_line(line) + if col is None: + continue + if col.get_dtype() is None and implicit_string: + col.set_dtype(np.str_) + defs.append(col) + self._columns = defs + + def get_columns(self): + return self._columns + + def get_name(self): + return self._name + + +class MdbColumn: + __type_conversions = { + 'single': np.float32, + 'double': np.float64, + 'long integer': np.int64, + 'integer': np.int_, + 'text': np.str_, + 'long text': np.str_, + 'boolean': np.bool_, + 'datetime': np.str_, # additional special handling + } + __schema_line_regex = re.compile( + "^\s*\[(\w+)\]\s*(.*?)(?:\s+(NOT NULL))?,?\s*$", re.IGNORECASE) + + @staticmethod + def try_parse_schema_line(line): + """ Create a new MdbColumn object from the given line if possible. + If the format doesn't fit, return None. """ + m = MdbColumn.__schema_line_regex.match(line) + if m: + return MdbColumn(m.group(1), m.group(2), m.group(3) == 'NOT NULL') + return None + + def __init__(self, name, mdb_type_name, not_null): + self._name = name + self._data_type_name = mdb_type_name + self._dtype = self.__get_numpy_type(mdb_type_name) + self._not_null = not_null + + def is_datetime(self): + return self._data_type_name.lower().startswith('datetime') + + def __get_numpy_type(self, mdb_type_name): + mdb_name_lc = mdb_type_name.lower() + for mdbstart, nptype in MdbColumn.__type_conversions.items(): + if mdb_name_lc.startswith(mdbstart): + return nptype + # print("Unknown type:", mdb_type_name) + return None + + def get_name(self): + return self._name + + def get_dtype(self, promote=None): + """ + Returns the data type of a column, possibly promoted to a different + type - promotions are useful for NAN values where no NAN is supported + in pandas. 
+ :param promote: Valid values: 'int_to_float', 'nullable_int_to_float' + """ + if self._dtype in [np.int_, np.int64]: + if (promote == 'nullable_int_to_float' and self.maybe_null()) or \ + (promote == 'int_to_float'): + return np.float_ + return self._dtype + + def set_dtype(self, newtype): + self._dtype = newtype + + def is_not_null(self): + return self._not_null + + def maybe_null(self): + return not self.is_not_null() def list_tables(rdb_file, encoding="utf-8"): @@ -22,71 +146,30 @@ def list_tables(rdb_file, encoding="utf-8"): :return: A list of the tables in a given database. """ # We use -1 (one table name per line) to support stange table names - tables = subprocess.check_output(['mdb-tables', '-1', rdb_file]).decode(encoding) - return tables.split("\n") - - -def _extract_dtype(data_type): - # Note, this list is surely incomplete. But, I only had one .mdb file - # at the time of creation. If you see a new data-type, patch-pull or just - # open an issue. - data_type = data_type.lower() - if data_type.startswith('double'): - return np.float_ - elif data_type.startswith('long'): - return np.int_ - else: - return None - - -def _extract_defs(defs_str): - defs = {} - lines = defs_str.splitlines() - for line in lines: - m = DEF_RE.match(line) - if m: - defs[m.group(1)] = m.group(2) - return defs + tables = subprocess.check_output(['mdb-tables', '-1', rdb_file]) + return tables.decode(encoding).splitlines() -def read_schema(rdb_file, encoding='utf8'): +def read_schema(rdb_file, encoding='utf8', implicit_string=True): """ :param rdb_file: The MS Access database file. :param encoding: The schema encoding. I'm almost positive that MDBTools spits out UTF-8, exclusively. 
- :return: a dictionary of table -> column -> access_data_type + :return: a dictionary of tablename -> MdbTable object """ output = subprocess.check_output(['mdb-schema', rdb_file]) lines = output.decode(encoding).splitlines() schema_ddl = "\n".join(l for l in lines if l and not l.startswith('-')) schema = {} - for table, defs in TABLE_RE.findall(schema_ddl): - schema[table] = _extract_defs(defs) + for tablename, defs in TABLE_RE.findall(schema_ddl): + table = MdbTable(tablename) + table.parse_columns(defs, implicit_string) + schema[tablename] = table return schema -def to_pandas_schema(schema, implicit_string=True): - """ - :param schema: the output of `read_schema` - :param implicit_string: mark strings and unknown dtypes as `np.str_`. - :return: a dictionary of table -> column -> np.dtype - """ - pd_schema = {} - for tbl, defs in schema.items(): - pd_schema[tbl] = None - sub_schema = {} - for column, data_type in defs.items(): - dtype = _extract_dtype(data_type) - if dtype is not None: - sub_schema[column] = dtype - elif implicit_string: - sub_schema[column] = np.str_ - pd_schema[tbl] = sub_schema - return pd_schema - - def read_table(rdb_file, table_name, *args, **kwargs): """ Read a MS Access database as a Pandas DataFrame. @@ -95,9 +178,15 @@ def read_table(rdb_file, table_name, *args, **kwargs): want to infer the schema from the Access database's schema. This sets the `dtype` argument of `read_csv`, which makes things much faster, in most cases. If you set the `dtype` keyword argument also, it overrides - inferences. The `schema_encoding keyword argument passes through to - `read_schema`. The `implicit_string` argument passes through to - `to_pandas_schema`. + inferences. The `schema_encoding` and `implicit_string` keyword arguments are + passed through to `read_schema`. + + In case you have integer columns with NaNs (not supported by pandas), you + can either manually set the corresponding columns to float by passing the + `dtype` argument. 
By passing `promote='int_to_float'`, all ints are + automatically converted to float64. For NOT NULL int columns, it is safe + to keep them as int. To promote only int columns that aren't marked NOT + NULL, pass `promote='nullable_int_to_float'` to `read_table`. I recommend setting `chunksize=k`, where k is some reasonable number of rows. This is a simple interface, that doesn't do basic things like @@ -115,13 +204,24 @@ def read_table(rdb_file, table_name, *args, **kwargs): if kwargs.pop('converters_from_schema', True): specified_dtypes = kwargs.pop('dtype', {}) schema_encoding = kwargs.pop('schema_encoding', 'utf8') - schemas = to_pandas_schema(read_schema(rdb_file, schema_encoding), - kwargs.pop('implicit_string', True)) - dtypes = schemas[table_name] - dtypes.update(specified_dtypes) - if dtypes != {}: - kwargs['dtype'] = dtypes - - cmd = ['mdb-export', rdb_file, table_name] + promote = kwargs.pop('promote', None) + schemas = read_schema(rdb_file, schema_encoding, + kwargs.pop('implicit_string', True)) + table = schemas[table_name] + table.update_dtypes(specified_dtypes) + kwargs['dtype'] = table.get_dtypes(promote) + kwargs['parse_dates'] = table.date_field_indices() + + cmd = ['mdb-export', '-D', '%Y-%m-%d %H:%M:%S', rdb_file, table_name] proc = subprocess.Popen(cmd, stdout=subprocess.PIPE) - return pd.read_csv(proc.stdout, *args, **kwargs) + try: + return pd.read_csv(proc.stdout, *args, **kwargs) + except ValueError as ve: + if 'Integer column has NA values' in str(ve): + msg = str(ve).splitlines()[-1] + raise ValueError("\n".join(( + msg, + "Consider passing promote='nullable_int_to_float' or", + "passing promote='int_to_float' to read_table"))) + else: + raise ve diff --git a/pandas_access/test_parsing.py b/pandas_access/test_parsing.py new file mode 100644 index 0000000..bea69dd --- /dev/null +++ b/pandas_access/test_parsing.py @@ -0,0 +1,54 @@ +from . 
import MdbColumn, MdbTable +import numpy as np +# import pytest + + +def test_column_parsing(): + col = MdbColumn.try_parse_schema_line(" [myName] Integer NOT NULL,") + assert col is not None + assert col.get_dtype() == np.int_ + assert col.get_dtype(promote='int_to_float') == np.float_ + assert col.get_dtype(promote='nullable_int_to_float') == np.int_ + assert col.is_not_null() is True + assert col.maybe_null() is False + + col = MdbColumn.try_parse_schema_line(" [myName] Integer,") + assert col is not None + assert col.get_dtype() == np.int_ + assert col.get_dtype(promote='int_to_float') == np.float_ + assert col.get_dtype(promote='nullable_int_to_float') == np.float_ + assert col.is_not_null() is False + assert col.maybe_null() is True + + col = MdbColumn.try_parse_schema_line(" [myName] DateTime") + assert col is not None + assert col.get_dtype() == np.str_ + assert col.get_dtype(promote='int_to_float') == np.str_ + assert col.get_dtype(promote='nullable_int_to_float') == np.str_ + assert col.is_not_null() is False + assert col.maybe_null() is True + + +def test_table_parsing(): + t = MdbTable("GreatTable") + t.parse_columns( + "CREATE TABLE [ThisNameIsIgnored]\n" + " (\n" + "\t[SomeDate]\t\t\tDateTime, \n" + "\t[SomeTime]\t\t\tDateTime NOT NULL, \n" + "\t[UserName]\t\t\tText (100), \n" + "\t[IsTested]\t\t\tBoolean NOT NULL, \n" + "\t[Value]\t\t\tDouble, \n" + "\t[Number]\t\t\tLong Integer \n" + ");") + assert t.get_name() == 'GreatTable' + cols = t.get_columns() + assert len(cols) == 6 + assert cols[0].get_name() == 'SomeDate' + assert cols[0].get_dtype() == np.str_ + assert cols[0].maybe_null() is True + assert cols[1].maybe_null() is False + assert cols[4].get_name() == 'Value' + assert cols[4].get_dtype() == np.float_ + assert cols[5].get_name() == 'Number' + assert cols[5].get_dtype() == np.int64 diff --git a/setup.py b/setup.py index 4fa5074..b1eeb33 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name="pandas_access", version="0.0.1", - 
packages=["pandas_access"], # Basically, reserve that namespace. + packages=["pandas_access"], # Basically, reserve that namespace. license="License :: OSI Approved :: MIT License", author="John Bjorn Nelson", author_email="jbn@abreka.com", @@ -15,4 +15,4 @@ long_description=open(README_FILE).read(), data_files=['README.md'], url="https://github.com/jbn/pandas_access" -) \ No newline at end of file +)