jbn · behrenhoff · Nov 7, 2018 · Nov 9, 2018
diff --git a/pandas_access/__init__.py b/pandas_access/__init__.py
@@ -11,82 +11,165 @@
 TABLE_RE = re.compile("CREATE TABLE \[(\w+)\]\s+\((.*?\));",
                       re.MULTILINE | re.DOTALL)
 
-DEF_RE = re.compile("\s*\[(\w+)\]\s*(.*?),")
 
+class MdbTable:
+    """ A MdbTable is basically a list of MdbColumns with some added
+    functionality.
+    :param name: Name of the table
+    """
+    def __init__(self, name):
+        self._name = name
+        # array instead of dict to preserve the order
+        self._columns = []
+
+    def update_dtypes(self, newDtypes):
+        """ sets the dtype manually to the given types
+        :param newDtypes: a dictionary {columnName: newDtype}
+        """
+        for c in self._columns:
+            if c.get_name() in newDtypes:
+                c.set_dtype(newDtypes[c.get_name()])
+
+    def get_dtypes(self, promote=None):
+        """ return a dictionary of {columnName: dataType}
+        :param promote: see MdbColumn.get_dtype
+        """
+        return {c.get_name(): c.get_dtype(promote) for c in self._columns}
+
+    def date_field_indices(self):
+        """ returns the column indices of all datetime fields """
+        result = []
+        for idx, col in enumerate(self._columns):
+            if col.is_datetime():
+                result.append(idx)
+        return result
+
+    def parse_columns(self, defs_str, implicit_string=True):
+        """
+        Initialize the columns of the table from a schema definition string
+        created by mdb-schema. The defs_str needs to look like:
+            [FieldA]   Text (100) NOT NULL,
+            [FieldB]   DateTime NOT NULL
+            ...
+        Even though the table name can be included in the defs_str, the table
+        name will NOT be altered by this function.
+        """
+        defs = []
+        lines = defs_str.splitlines()
+        for line in lines:
+            col = MdbColumn.try_parse_schema_line(line)
+            if col is None:
+                continue
+            if col.get_dtype() is None and implicit_string:
+                col.set_dtype(np.str_)
+            defs.append(col)
+        self._columns = defs
+
+    def get_columns(self):
+        return self._columns
+
+    def get_name(self):
+        return self._name
+
+
+class MdbColumn:
+    __type_conversions = {
+        'single': np.float32,
+        'double': np.float64,
+        'long integer': np.int64,
+        'integer': np.int_,
+        'text': np.str_,
+        'long text': np.str_,
+        'boolean': np.bool_,
+        'datetime': np.str_,  # additional special handling
+    }
+    __schema_line_regex = re.compile(
+        "^\s*\[(\w+)\]\s*(.*?)(?:\s+(NOT NULL))?,?\s*$", re.IGNORECASE)
+
+    @staticmethod
+    def try_parse_schema_line(line):
+        """ Create a new MdbColumn object from the given line if possible.
+        If the format doesn't fit, return None. """
+        m = MdbColumn.__schema_line_regex.match(line)
+        if m:
+            return MdbColumn(m.group(1), m.group(2), m.group(3) == 'NOT NULL')
+        return None
+
+    def __init__(self, name, mdb_type_name, not_null):
+        self._name = name
+        self._data_type_name = mdb_type_name
+        self._dtype = self.__get_numpy_type(mdb_type_name)
+        self._not_null = not_null
+
+    def is_datetime(self):
+        return self._data_type_name.lower().startswith('datetime')
+
+    def __get_numpy_type(self, mdb_type_name):
+        mdb_name_lc = mdb_type_name.lower()
+        for mdbstart, nptype in MdbColumn.__type_conversions.items():
+            if mdb_name_lc.startswith(mdbstart):
+                return nptype
+        # print("Unknown type:", mdb_type_name)
+        return None
+
+    def get_name(self):
+        return self._name
+
+    def get_dtype(self, promote=None):
+        """
+        Returns the data type of a column, possibly promoted to a different
+        type - promotions are useful for NAN values where no NAN is supported
+        in pandas.
+        :param promote: Valid values: 'int_to_float', 'nullable_int_to_float'
+        """
+        if self._dtype in [np.int_, np.int64]:
+            if (promote == 'nullable_int_to_float' and self.maybe_null()) or \
+               (promote == 'int_to_float'):
+                return np.float_
+        return self._dtype
+
+    def set_dtype(self, newtype):
+        self._dtype = newtype
+
+    def is_not_null(self):
+        return self._not_null
+
+    def maybe_null(self):
+        return not self.is_not_null()
 
-def list_tables(rdb_file, encoding="latin-1"):
+
+def list_tables(rdb_file, encoding="utf-8"):
     """
     :param rdb_file: The MS Access database file.
-    :param encoding: The content encoding of the output. I assume `latin-1`
-        because so many of MS files have that encoding. But, MDBTools may
-        actually be UTF-8.
+    :param encoding: The content encoding of the output. MDBTools
+        print the output in UTF-8.
     :return: A list of the tables in a given database.
     """
-    tables = subprocess.check_output(['mdb-tables', rdb_file]).decode(encoding)
-    return tables.strip().split(" ")
-
-
-def _extract_dtype(data_type):
-    # Note, this list is surely incomplete. But, I only had one .mdb file
-    # at the time of creation. If you see a new data-type, patch-pull or just
-    # open an issue.
-    data_type = data_type.lower()
-    if data_type.startswith('double'):
-        return np.float_
-    elif data_type.startswith('long'):
-        return np.int_
-    else:
-        return None
-
-
-def _extract_defs(defs_str):
-    defs = {}
-    lines = defs_str.splitlines()
-    for line in lines:
-        m = DEF_RE.match(line)
-        if m:
-            defs[m.group(1)] = m.group(2)
-    return defs
+    # We use -1 (one table name per line) to support stange table names
+    tables = subprocess.check_output(['mdb-tables', '-1', rdb_file])
+    return tables.decode(encoding).splitlines()
 
 
-def read_schema(rdb_file, encoding='utf8'):
+def read_schema(rdb_file, encoding='utf8', implicit_string=True):
     """
     :param rdb_file: The MS Access database file.
     :param encoding: The schema encoding. I'm almost positive that MDBTools
         spits out UTF-8, exclusively.
-    :return: a dictionary of table -> column -> access_data_type
+    :return: a dictionary of tablename -> MdbTable object
     """
     output = subprocess.check_output(['mdb-schema', rdb_file])
     lines = output.decode(encoding).splitlines()
     schema_ddl = "\n".join(l for l in lines if l and not l.startswith('-'))
 
     schema = {}
-    for table, defs in TABLE_RE.findall(schema_ddl):
-        schema[table] = _extract_defs(defs)
+    for tablename, defs in TABLE_RE.findall(schema_ddl):
+        table = MdbTable(tablename)
+        table.parse_columns(defs, implicit_string)
+        schema[tablename] = table
 
     return schema
 
 
-def to_pandas_schema(schema, implicit_string=True):
-    """
-    :param schema: the output of `read_schema`
-    :param implicit_string: mark strings and unknown dtypes as `np.str_`.
-    :return: a dictionary of table -> column -> np.dtype
-    """
-    pd_schema = {}
-    for tbl, defs in schema.items():
-        pd_schema[tbl] = None
-        sub_schema = {}
-        for column, data_type in defs.items():
-            dtype = _extract_dtype(data_type)
-            if dtype is not None:
-                sub_schema[column] = dtype
-            elif implicit_string:
-                sub_schema[column] = np.str_
-        pd_schema[tbl] = sub_schema
-    return pd_schema
-
-
 def read_table(rdb_file, table_name, *args, **kwargs):
     """
     Read a MS Access database as a Pandas DataFrame.
@@ -95,9 +178,15 @@ def read_table(rdb_file, table_name, *args, **kwargs):
     want to infer the schema from the Access database's schema. This sets the
     `dtype` argument of `read_csv`, which makes things much faster, in most
     cases. If you set the `dtype` keyword argument also, it overrides
-    inferences. The `schema_encoding keyword argument passes through to
-    `read_schema`. The `implicit_string` argument passes through to
-    `to_pandas_schema`.
+    inferences. The `schema_encoding and implicit_string keyword arguments are
+    passed through to `read_schema`.
+
+    In case you have integer columns with NaNs (not supported by pandas), you
+    can either manually set the corresponding columns to float by passing the
+    `dtype` argument. By passing `promote='int_to_float'`, all ints are
+    automatically converted to float64. For NOT NULL int columns, it is safe
+    to keep them as int. To promote only int columns that aren't marked NOT
+    NULL, pass `promote='nullable_int_to_float'`to `read_table`.
 
     I recommend setting `chunksize=k`, where k is some reasonable number of
     rows. This is a simple interface, that doesn't do basic things like
@@ -115,13 +204,24 @@ def read_table(rdb_file, table_name, *args, **kwargs):
     if kwargs.pop('converters_from_schema', True):
         specified_dtypes = kwargs.pop('dtype', {})
         schema_encoding = kwargs.pop('schema_encoding', 'utf8')
-        schemas = to_pandas_schema(read_schema(rdb_file, schema_encoding),
-                                   kwargs.pop('implicit_string', True))
-        dtypes = schemas[table_name]
-        dtypes.update(specified_dtypes)
-        if dtypes != {}:
-            kwargs['dtype'] = dtypes
-
-    cmd = ['mdb-export', rdb_file, table_name]
+        promote = kwargs.pop('promote', None)
+        schemas = read_schema(rdb_file, schema_encoding,
+                              kwargs.pop('implicit_string', True))
+        table = schemas[table_name]
+        table.update_dtypes(specified_dtypes)
+        kwargs['dtype'] = table.get_dtypes(promote)
+        kwargs['parse_dates'] = table.date_field_indices()
+
+    cmd = ['mdb-export', '-D', '%Y-%m-%d %H:%M:%S', rdb_file, table_name]
     proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
-    return pd.read_csv(proc.stdout, *args, **kwargs)
+    try:
+        return pd.read_csv(proc.stdout, *args, **kwargs)
+    except ValueError as ve:
+        if 'Integer column has NA values' in str(ve):
+            msg = str(ve).splitlines()[-1]
+            raise ValueError("\n".join((
+                msg,
+                "Consider passing promote='nullable_int_to_float' or",
+                "passing promote='int_to_float' to read_table")))
+        else:
+            raise ve
diff --git a/pandas_access/test_parsing.py b/pandas_access/test_parsing.py
@@ -0,0 +1,54 @@
+from . import MdbColumn, MdbTable
+import numpy as np
+# import pytest
+
+
+def test_column_parsing():
+    col = MdbColumn.try_parse_schema_line(" [myName]  Integer NOT NULL,")
+    assert col is not None
+    assert col.get_dtype() == np.int_
+    assert col.get_dtype(promote='int_to_float') == np.float_
+    assert col.get_dtype(promote='nullable_int_to_float') == np.int_
+    assert col.is_not_null() is True
+    assert col.maybe_null() is False
+
+    col = MdbColumn.try_parse_schema_line(" [myName]  Integer,")
+    assert col is not None
+    assert col.get_dtype() == np.int_
+    assert col.get_dtype(promote='int_to_float') == np.float_
+    assert col.get_dtype(promote='nullable_int_to_float') == np.float_
+    assert col.is_not_null() is False
+    assert col.maybe_null() is True
+
+    col = MdbColumn.try_parse_schema_line(" [myName]  DateTime")
+    assert col is not None
+    assert col.get_dtype() == np.str_
+    assert col.get_dtype(promote='int_to_float') == np.str_
+    assert col.get_dtype(promote='nullable_int_to_float') == np.str_
+    assert col.is_not_null() is False
+    assert col.maybe_null() is True
+
+
+def test_table_parsing():
+    t = MdbTable("GreatTable")
+    t.parse_columns(
+        "CREATE TABLE [ThisNameIsIgnored]\n"
+        " (\n"
+        "\t[SomeDate]\t\t\tDateTime, \n"
+        "\t[SomeTime]\t\t\tDateTime NOT NULL, \n"
+        "\t[UserName]\t\t\tText (100), \n"
+        "\t[IsTested]\t\t\tBoolean NOT NULL, \n"
+        "\t[Value]\t\t\tDouble, \n"
+        "\t[Number]\t\t\tLong Integer \n"
+        ");")
+    assert t.get_name() == 'GreatTable'
+    cols = t.get_columns()
+    assert len(cols) == 6
+    assert cols[0].get_name() == 'SomeDate'
+    assert cols[0].get_dtype() == np.str_
+    assert cols[0].maybe_null() is True
+    assert cols[1].maybe_null() is False
+    assert cols[4].get_name() == 'Value'
+    assert cols[4].get_dtype() == np.float_
+    assert cols[5].get_name() == 'Number'
+    assert cols[5].get_dtype() == np.int64
diff --git a/setup.py b/setup.py
@@ -7,12 +7,12 @@
 setup(
     name="pandas_access",
     version="0.0.1",
-    packages=["pandas_access"], # Basically, reserve that namespace.
+    packages=["pandas_access"],  # Basically, reserve that namespace.
     license="License :: OSI Approved :: MIT License",
     author="John Bjorn Nelson",
     author_email="[email protected]",
     description="A tiny, subprocess-based tool for reading a MS Access database(.rdb) as a Pandas DataFrame.",
     long_description=open(README_FILE).read(),
     data_files=['README.md'],
     url="https://github.com/jbn/pandas_access"
-)
+)