From aaf88180978066cb35ae77e2050e63c6bb62a6a8 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 17 Oct 2024 22:00:29 +0200 Subject: [PATCH 01/59] ENH: deal properly with naive datetimes with arrow --- pyogrio/tests/test_geopandas_io.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 0675c197..953d259a 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1,5 +1,6 @@ import contextlib import locale +import time import warnings from datetime import datetime from io import BytesIO @@ -351,6 +352,29 @@ def test_read_write_datetime_tz_with_nulls(tmp_path, use_arrow): assert_geodataframe_equal(df, result) +@pytest.mark.filterwarnings( + "ignore: Non-conformant content for record 1 in column dates" +) +@pytest.mark.requires_arrow_write_api +def test_read_write_datetime_no_tz(tmp_path, use_arrow): + dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"] + if PANDAS_GE_20: + dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") + else: + dates = pd.to_datetime(dates_raw) + df = gp.GeoDataFrame( + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, + crs="EPSG:4326", + ) + fpath = tmp_path / "test.gpkg" + write_dataframe(df, fpath, use_arrow=use_arrow) + result = read_dataframe(fpath, use_arrow=use_arrow) + if use_arrow: + # with Arrow, the datetimes are always read as UTC + df["dates"] = df["dates"].dt.tz_localize(time.timezone).dt.tz_convert("UTC") + assert_geodataframe_equal(df, result) + + def test_read_null_values(tmp_path, use_arrow): filename = tmp_path / "test_null_values_no_geometry.gpkg" From 3e463a19e383319f3148ad6f176854b68b858a74 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 18 Oct 2024 18:38:43 +0200 Subject: [PATCH 02/59] Add more testcases, also for tz datetimes --- pyogrio/tests/test_geopandas_io.py | 52 ++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 
14 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 953d259a..91e2c416 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -7,6 +7,7 @@ from zipfile import ZipFile import numpy as np +import pytz from pyogrio import ( __gdal_version__, @@ -299,7 +300,7 @@ def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow): write_dataframe(df, fpath, use_arrow=use_arrow) df_read = read_dataframe(fpath, use_arrow=use_arrow) if use_arrow: - # with Arrow, the datetimes are always read as UTC + # with Arrow, the datetimes are always read as UTC for .gpkg expected = expected.dt.tz_convert("UTC") assert_series_equal(df_read.datetime_col, expected) @@ -329,49 +330,72 @@ def test_write_datetime_mixed_offset(tmp_path, use_arrow): assert_series_equal(result["dates"], utc_col) +@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_read_write_datetime_tz_with_nulls(tmp_path, use_arrow): - dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", pd.NaT] +def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): + dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: dates = pd.to_datetime(dates_raw) df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326", ) - fpath = tmp_path / "test.gpkg" + fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - if use_arrow: - # with Arrow, the datetimes are always read as UTC - df["dates"] = df["dates"].dt.tz_convert("UTC") + if use_arrow and ext == ".gpkg": + # for GPKG with Arrow, 
the datetime is written as naive datetime with the + # correct times, but when read the naive time is assumed to be UTC, which + # changes the effective time so this seems wrong. + df["dates"] = df["dates"].dt.tz_localize(time.timezone).dt.tz_convert("UTC") assert_geodataframe_equal(df, result) +@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_read_write_datetime_no_tz(tmp_path, use_arrow): - dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"] +def test_read_write_datetime_tz_with_nulls(tmp_path, ext, use_arrow): + dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", pd.NaT] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: dates = pd.to_datetime(dates_raw) df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, crs="EPSG:4326", ) - fpath = tmp_path / "test.gpkg" + fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) if use_arrow: - # with Arrow, the datetimes are always read as UTC - df["dates"] = df["dates"].dt.tz_localize(time.timezone).dt.tz_convert("UTC") + if ext == ".fgb": + # when FlatGeoBuffer is read with Arrow, for datetimes with equal timezone, + # a column type with the appropriate minutes offset is returned. + # REMARK: For .fgb, the timezone is just dropped when reading or writing!!! + # -> 2020-01-01T09:00:00.123-05:00 becomes 2020-01-01T09:00:00.123 + df["dates"] = df["dates"].dt.tz_localize(tz=None) + elif ext in (".geojson", ".geojsonl"): + # when GeoJSON is read with Arrow, for datetimes with equal timezone, a + # column type with the appropriate minutes offset is returned. 
+ # REMARK: for .geojson, the data is written fine, but when reading it goes + # wrong: 2020-01-01T09:00:00.123-05:00 becomes 2020-01-01T04:00:00.123-05:00 + df["dates"] = ( + df["dates"] + .dt.tz_localize(tz=None) + .dt.tz_localize(tz="UTC") + .dt.tz_convert(pytz.FixedOffset(-300)) + ) + elif ext == ".gpkg": + # when GPKG is read with Arrow, datetimes with timezone are converted to + # UTC. + df["dates"] = df["dates"].dt.tz_convert("UTC") assert_geodataframe_equal(df, result) From c18ab22b5e16bf43777a706f720131ad3857f4fc Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 17 Jan 2025 09:09:25 +0100 Subject: [PATCH 03/59] Use datetime_as_string for reading with arrow --- pyogrio/_io.pyx | 10 +++++++- pyogrio/geopandas.py | 14 +++++++---- pyogrio/raw.py | 8 ++++++ pyogrio/tests/test_geopandas_io.py | 39 ++---------------------------- 4 files changed, 28 insertions(+), 43 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index d7334838..574c88fa 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1430,6 +1430,7 @@ def ogr_open_arrow( int return_fids=False, int batch_size=0, use_pyarrow=False, + datetime_as_string=False, ): cdef int err = 0 @@ -1624,6 +1625,12 @@ def ogr_open_arrow( "GEOARROW".encode('UTF-8') ) + # Read DateTime fields as strings, as the Arrow DateTime column type is + # quite limited regarding support for mixed timezones,... 
+ IF CTE_GDAL_VERSION >= (3, 11, 0): + if datetime_as_string: + options = CSLSetNameValue(options, "DATETIME_AS_STRING", "YES") + # make sure layer is read from beginning OGR_L_ResetReading(ogr_layer) @@ -1649,6 +1656,7 @@ def ogr_open_arrow( 'crs': crs, 'encoding': encoding, 'fields': fields[:,2], # return only names + "dtypes": fields[:,3], 'geometry_type': geometry_type, 'geometry_name': geometry_name, 'fid_column': fid_column, @@ -2552,7 +2560,7 @@ def ogr_write_arrow( object path_or_fp, str layer, str driver, - object arrow_obj, + obje ct arrow_obj, str crs, str geometry_type, str geometry_name, diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 11672b25..62f2b532 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -257,11 +257,9 @@ def read_dataframe( read_func = read_arrow if use_arrow else read gdal_force_2d = False if use_arrow else force_2d - if not use_arrow: - # For arrow, datetimes are read as is. - # For numpy IO, datetimes are read as string values to preserve timezone info - # as numpy does not directly support timezones. - kwargs["datetime_as_string"] = True + + # Always read datetimes are as string values to preserve (mixed) timezone info + # as numpy does not directly support timezones and arrow support is also limited. 
result = read_func( path_or_buffer, layer=layer, @@ -278,6 +276,7 @@ def read_dataframe( sql=sql, sql_dialect=sql_dialect, return_fids=fid_as_index, + datetime_as_string=True, **kwargs, ) @@ -292,6 +291,11 @@ def read_dataframe( df = table.to_pandas(**kwargs) del table + # convert datetime columns that were read as string to datetime + for dtype, column in zip(meta["dtypes"], meta["fields"]): + if dtype is not None and dtype.startswith("datetime"): + df[column] = _try_parse_datetime(df[column]) + if fid_as_index: df = df.set_index(meta["fid_column"]) df.index.names = ["fid"] diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 0f0c3063..09bd5aa2 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -233,6 +233,7 @@ def read_arrow( sql=None, sql_dialect=None, return_fids=False, + datetime_as_string=False, **kwargs, ): """Read OGR data source into a pyarrow Table. @@ -303,6 +304,7 @@ def read_arrow( skip_features=gdal_skip_features, batch_size=batch_size, use_pyarrow=True, + datetime_as_string=datetime_as_string, **kwargs, ) as source: meta, reader = source @@ -358,6 +360,7 @@ def open_arrow( return_fids=False, batch_size=65_536, use_pyarrow=False, + datetime_as_string=False, **kwargs, ): """Open OGR data source as a stream of Arrow record batches. @@ -386,6 +389,9 @@ def open_arrow( ArrowStream object. In the default case, this stream object needs to be passed to another library supporting the Arrow PyCapsule Protocol to consume the stream of data. + datetime_as_string : bool, optional (default: False) + If True, will return datetime dtypes as detected by GDAL as strings, + as arrow doesn't support e.g. mixed timezones. 
Examples -------- @@ -423,6 +429,7 @@ def open_arrow( Meta is: { "crs": "", "fields": , + "dtypes": "encoding": "", "geometry_type": "", "geometry_name": "", @@ -453,6 +460,7 @@ def open_arrow( dataset_kwargs=dataset_kwargs, batch_size=batch_size, use_pyarrow=use_pyarrow, + datetime_as_string=datetime_as_string, ) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index f2373526..543370c3 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1,13 +1,11 @@ import contextlib import locale -import time import warnings from datetime import datetime from io import BytesIO from zipfile import ZipFile import numpy as np -import pytz from pyogrio import ( __gdal_version__, @@ -316,9 +314,6 @@ def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow): fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath, use_arrow=use_arrow) df_read = read_dataframe(fpath, use_arrow=use_arrow) - if use_arrow: - # with Arrow, the datetimes are always read as UTC for .gpkg - expected = expected.dt.tz_convert("UTC") assert_series_equal(df_read.datetime_col, expected) @@ -348,9 +343,6 @@ def test_write_datetime_mixed_offset(tmp_path, use_arrow): @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) -@pytest.mark.filterwarnings( - "ignore: Non-conformant content for record 1 in column dates" -) @pytest.mark.requires_arrow_write_api def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"] @@ -359,17 +351,11 @@ def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): else: dates = pd.to_datetime(dates_raw) df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, - crs="EPSG:4326", + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326" ) fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - if 
use_arrow and ext == ".gpkg": - # for GPKG with Arrow, the datetime is written as naive datetime with the - # correct times, but when read the naive time is assumed to be UTC, which - # changes the effective time so this seems wrong. - df["dates"] = df["dates"].dt.tz_localize(time.timezone).dt.tz_convert("UTC") assert_geodataframe_equal(df, result) @@ -391,28 +377,7 @@ def test_read_write_datetime_tz_with_nulls(tmp_path, ext, use_arrow): fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - if use_arrow: - if ext == ".fgb": - # when FlatGeoBuffer is read with Arrow, for datetimes with equal timezone, - # a column type with the appropriate minutes offset is returned. - # REMARK: For .fgb, the timezone is just dropped when reading or writing!!! - # -> 2020-01-01T09:00:00.123-05:00 becomes 2020-01-01T09:00:00.123 - df["dates"] = df["dates"].dt.tz_localize(tz=None) - elif ext in (".geojson", ".geojsonl"): - # when GeoJSON is read with Arrow, for datetimes with equal timezone, a - # column type with the appropriate minutes offset is returned. - # REMARK: for .geojson, the data is written fine, but when reading it goes - # wrong: 2020-01-01T09:00:00.123-05:00 becomes 2020-01-01T04:00:00.123-05:00 - df["dates"] = ( - df["dates"] - .dt.tz_localize(tz=None) - .dt.tz_localize(tz="UTC") - .dt.tz_convert(pytz.FixedOffset(-300)) - ) - elif ext == ".gpkg": - # when GPKG is read with Arrow, datetimes with timezone are converted to - # UTC. 
- df["dates"] = df["dates"].dt.tz_convert("UTC") + assert_geodataframe_equal(df, result) From 597855f72936d421fd3606cc0e3d541584261cd0 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 17 Jan 2025 09:23:59 +0100 Subject: [PATCH 04/59] Update _io.pyx --- pyogrio/_io.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 574c88fa..cd8e17e2 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -2560,7 +2560,7 @@ def ogr_write_arrow( object path_or_fp, str layer, str driver, - obje ct arrow_obj, + object arrow_obj, str crs, str geometry_type, str geometry_name, From fa4b86e489d895ccfe68116c45a440d481f2b83b Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 17 Jan 2025 09:37:09 +0100 Subject: [PATCH 05/59] Skip tests where appropriate --- pyogrio/tests/test_geopandas_io.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 543370c3..750c8ca5 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -298,6 +298,9 @@ def test_read_datetime(datetime_file, use_arrow): @pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ") @pytest.mark.requires_arrow_write_api def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow): + if use_arrow and __gdal_version__ < (3, 11, 0): + pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") + df = read_dataframe(datetime_tz_file) # Make the index non-consecutive to test this case as well. 
Added for issue # https://github.com/geopandas/pyogrio/issues/324 @@ -345,6 +348,9 @@ def test_write_datetime_mixed_offset(tmp_path, use_arrow): @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.requires_arrow_write_api def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): + if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0): + pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") + dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") @@ -365,6 +371,9 @@ def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): ) @pytest.mark.requires_arrow_write_api def test_read_write_datetime_tz_with_nulls(tmp_path, ext, use_arrow): + if use_arrow and __gdal_version__ < (3, 11, 0): + pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") + dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", pd.NaT] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") From 0e41ae4f7f5ef1a13cca443929d90825b1c86199 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 17 Jan 2025 21:32:24 +0100 Subject: [PATCH 06/59] Improve support for mixed and naive datetimes --- pyogrio/geopandas.py | 94 +++++++++++++++++++++++++++--- pyogrio/tests/test_geopandas_io.py | 49 +++++++++++----- 2 files changed, 123 insertions(+), 20 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 62f2b532..f209c191 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -39,6 +39,7 @@ def _try_parse_datetime(ser): datetime_kwargs = {"format": "ISO8601", "errors": "ignore"} else: datetime_kwargs = {"yearfirst": True} + with warnings.catch_warnings(): warnings.filterwarnings( "ignore", @@ -51,12 +52,6 @@ def _try_parse_datetime(ser): res = pd.to_datetime(ser, **datetime_kwargs) except Exception: res = ser - # if object dtype, try parse as utc instead - if res.dtype 
== "object": - try: - res = pd.to_datetime(ser, utc=True, **datetime_kwargs) - except Exception: - pass if res.dtype != "object": # GDAL only supports ms precision, convert outputs to match. @@ -66,6 +61,7 @@ def _try_parse_datetime(ser): res = res.dt.as_unit("ms") else: res = res.dt.round(freq="ms") + return res @@ -486,6 +482,8 @@ def write_dataframe( gdal_tz_offsets = {} for name in fields: col = df[name] + values = None + if isinstance(col.dtype, pd.DatetimeTZDtype): # Deal with datetimes with timezones by passing down timezone separately # pass down naive datetime @@ -500,8 +498,20 @@ def write_dataframe( # Convert each row offset to a signed multiple of 15m and add to GMT value gdal_offset_representation = tz_offset // pd.Timedelta("15m") + 100 gdal_tz_offsets[name] = gdal_offset_representation.values - else: + + elif col.dtype == "object": + # Column of Timestamp objects, also split in naive datetime and tz offset + col_na = df[col.notna()][name] + if len(col_na) and all(isinstance(x, pd.Timestamp) for x in col_na): + tz_offset = col.apply(lambda x: None if pd.isna(x) else x.utcoffset()) + gdal_offset_repr = tz_offset // pd.Timedelta("15m") + 100 + gdal_tz_offsets[name] = gdal_offset_repr.values + naive = col.apply(lambda x: None if pd.isna(x) else x.tz_localize(None)) + values = naive.values + + if values is None: values = col.values + if isinstance(values, pd.api.extensions.ExtensionArray): from pandas.arrays import BooleanArray, FloatingArray, IntegerArray @@ -624,8 +634,33 @@ def write_dataframe( df = pd.DataFrame(df, copy=False) df[geometry_column] = geometry + # Convert all datetime columns to isoformat strings, to avoid mixed timezone + # information getting lost. 
+ datetime_cols = [] + for name, dtype in df.dtypes.items(): + col = df[name] + if dtype == "object": + # When all non-NA values are Timestamps, treat as datetime column + col_na = df[col.notna()][name] + if len(col_na) and all(isinstance(x, pd.Timestamp) for x in col_na): + df[name] = col.apply( + lambda x: None if pd.isna(x) else x.isoformat() + ) + datetime_cols.append(name) + elif isinstance(dtype, pd.DatetimeTZDtype): + # Also for regular datetime columns with timezone mixed timezones are + # possible when thera is a difference between summer and winter time. + df[name] = col.apply(lambda x: None if pd.isna(x) else x.isoformat()) + datetime_cols.append(name) + table = pa.Table.from_pandas(df, preserve_index=False) + # Add metadata to datetime columns so GDAL knows they are datetimes. + for datetime_col in datetime_cols: + table = _add_column_metadata( + table, column_metadata={datetime_col: {"GDAL:OGR:type": "DateTime"}} + ) + if geometry_column is not None: # ensure that the geometry column is binary (for all-null geometries, # this could be a wrong type) @@ -685,3 +720,48 @@ def write_dataframe( gdal_tz_offsets=gdal_tz_offsets, **kwargs, ) + + +def _add_column_metadata(table, column_metadata: dict = {}): + """Add or update column-level metadata to an arrow table. + + Parameters + ---------- + table : pyarrow.Table + The table to add the column metadata to. + column_metadata : dict + A dictionary with column metadata in the form + { + "column_1": {"some": "data"}, + "column_2": {"more": "stuff"}, + } + + Returns + ------- + pyarrow.Table: table with the updated column metadata. 
+ """ + import pyarrow as pa + + if not column_metadata: + return table + + # Create updated column fields with new metadata + fields = [] + for col in table.schema.names: + if col in column_metadata: + # Add/update column metadata + metadata = table.field(col).metadata or {} + for key, value in column_metadata[col].items(): + metadata[key] = value + # Update field with updated metadata + fields.append(table.field(col).with_metadata(metadata)) + else: + fields.append(table.field(col)) + + # Create new schema with the updated field metadata + schema = pa.schema(fields, metadata=table.schema.metadata) + + # Build new table with updated schema (shouldn't copy data) + table = table.cast(schema) + + return table diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 750c8ca5..5a517cf5 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -324,14 +324,13 @@ def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow): "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_datetime_mixed_offset(tmp_path, use_arrow): +def test_write_datetime_localized_mixed_offset(tmp_path, use_arrow): + """Test with localized dates across a different summer/winter timezone offset.""" # Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10) dates = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"] naive_col = pd.Series(pd.to_datetime(dates), name="dates") localised_col = naive_col.dt.tz_localize("Australia/Sydney") - utc_col = localised_col.dt.tz_convert("UTC") - if PANDAS_GE_20: - utc_col = utc_col.dt.as_unit("ms") + localised_ts_col = localised_col.map(pd.Timestamp.isoformat).map(pd.Timestamp) df = gp.GeoDataFrame( {"dates": localised_col, "geometry": [Point(1, 1), Point(1, 1)]}, @@ -340,9 +339,30 @@ def test_write_datetime_mixed_offset(tmp_path, use_arrow): fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath, 
use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) + # GDAL tz only encodes offsets, not timezones - # check multiple offsets are read as utc datetime instead of string values - assert_series_equal(result["dates"], utc_col) + assert_series_equal(result["dates"], localised_ts_col) + + +@pytest.mark.filterwarnings( + "ignore: Non-conformant content for record 1 in column dates" +) +@pytest.mark.requires_arrow_write_api +def test_write_datetime_mixed_offsets(tmp_path, use_arrow): + """Test with dates with mixed timezone offsets.""" + # Pandas datetime64 column types doesn't support mixed timezone offsets, so this + # list converts to pandas.Timestamp objects instead. + dates = ["2023-01-01 11:00:01.111+01:00", "2023-06-01 10:00:01.111+05:00"] + offset_col = pd.Series(pd.to_datetime(dates), name="dates") + df = gp.GeoDataFrame( + {"dates": offset_col, "geometry": [Point(1, 1), Point(1, 1)]}, + crs="EPSG:4326", + ) + fpath = tmp_path / "test.gpkg" + write_dataframe(df, fpath, use_arrow=use_arrow) + result = read_dataframe(fpath, use_arrow=use_arrow) + + assert_series_equal(result["dates"], offset_col) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @@ -370,15 +390,18 @@ def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_read_write_datetime_tz_with_nulls(tmp_path, ext, use_arrow): +def test_read_write_datetime_timestamp_with_nulls(tmp_path, ext, use_arrow): if use_arrow and __gdal_version__ < (3, 11, 0): pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") - dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", pd.NaT] - if PANDAS_GE_20: - dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") - else: - dates = pd.to_datetime(dates_raw) + dates_raw = [ + pd.Timestamp("2020-01-01T09:00:00.123-05:00"), + pd.Timestamp("2020-01-01T10:00:00-05:00"), + None, + ] + 
dates = pd.Series(dates_raw, dtype="O") + dates_expected = pd.Series(pd.to_datetime(dates_raw).as_unit("ms"), name="dates") + df = gp.GeoDataFrame( {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, crs="EPSG:4326", @@ -387,7 +410,7 @@ def test_read_write_datetime_tz_with_nulls(tmp_path, ext, use_arrow): write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - assert_geodataframe_equal(df, result) + assert_series_equal(result.dates, dates_expected) def test_read_null_values(tmp_path, use_arrow): From 1378ace45939f35d56973a8539d4ccb7d55a6ebc Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 17 Jan 2025 22:42:35 +0100 Subject: [PATCH 07/59] Skip use_arrow tests with old gdal versions --- pyogrio/tests/test_geopandas_io.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 5a517cf5..0c94bf7a 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -326,6 +326,9 @@ def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow): @pytest.mark.requires_arrow_write_api def test_write_datetime_localized_mixed_offset(tmp_path, use_arrow): """Test with localized dates across a different summer/winter timezone offset.""" + if use_arrow and __gdal_version__ < (3, 11, 0): + pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") + # Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10) dates = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"] naive_col = pd.Series(pd.to_datetime(dates), name="dates") @@ -350,6 +353,9 @@ def test_write_datetime_localized_mixed_offset(tmp_path, use_arrow): @pytest.mark.requires_arrow_write_api def test_write_datetime_mixed_offsets(tmp_path, use_arrow): """Test with dates with mixed timezone offsets.""" + if use_arrow and __gdal_version__ < (3, 11, 0): + pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") + # Pandas 
datetime64 column types doesn't support mixed timezone offsets, so this # list converts to pandas.Timestamp objects instead. dates = ["2023-01-01 11:00:01.111+01:00", "2023-06-01 10:00:01.111+05:00"] From 0f1ab272aac3a436e3052fe939ada857bc8c023c Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 17 Jan 2025 23:06:14 +0100 Subject: [PATCH 08/59] Take in account pandas version --- pyogrio/tests/test_geopandas_io.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 0c94bf7a..6630ed80 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -406,7 +406,12 @@ def test_read_write_datetime_timestamp_with_nulls(tmp_path, ext, use_arrow): None, ] dates = pd.Series(dates_raw, dtype="O") - dates_expected = pd.Series(pd.to_datetime(dates_raw).as_unit("ms"), name="dates") + + if PANDAS_GE_20: + expected = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") + else: + expected = pd.to_datetime(dates_raw) + expected = pd.Series(expected, name="dates") df = gp.GeoDataFrame( {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, @@ -416,7 +421,7 @@ def test_read_write_datetime_timestamp_with_nulls(tmp_path, ext, use_arrow): write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - assert_series_equal(result.dates, dates_expected) + assert_series_equal(result.dates, expected) def test_read_null_values(tmp_path, use_arrow): From 6f78c68f5d65a95643e2316cbdf578884ea71ffa Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sat, 18 Jan 2025 00:45:31 +0100 Subject: [PATCH 09/59] Update test_geopandas_io.py --- pyogrio/tests/test_geopandas_io.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 6630ed80..3684156d 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ 
-368,6 +368,7 @@ def test_write_datetime_mixed_offsets(tmp_path, use_arrow): write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) + assert result["dates"][0] == offset_col[0] assert_series_equal(result["dates"], offset_col) From 336d0d80476f64489d70cee6b8191ea50b3886db Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sat, 18 Jan 2025 02:08:51 +0100 Subject: [PATCH 10/59] Also support columns with datetime objects --- pyogrio/geopandas.py | 13 ++++++++++++- pyogrio/tests/test_geopandas_io.py | 25 +++++++++++++++++++------ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index f209c191..5b2151e3 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -2,6 +2,7 @@ import os import warnings +from datetime import datetime import numpy as np @@ -508,6 +509,14 @@ def write_dataframe( gdal_tz_offsets[name] = gdal_offset_repr.values naive = col.apply(lambda x: None if pd.isna(x) else x.tz_localize(None)) values = naive.values + elif len(col_na) and all(isinstance(x, datetime) for x in col_na): + tz_offset = col.apply(lambda x: None if pd.isna(x) else x.utcoffset()) + gdal_offset_repr = tz_offset // pd.Timedelta("15m") + 100 + gdal_tz_offsets[name] = gdal_offset_repr.values + naive = col.apply( + lambda x: None if pd.isna(x) else x.replace(tzinfo=None) + ) + values = naive.values if values is None: values = col.values @@ -642,7 +651,9 @@ def write_dataframe( if dtype == "object": # When all non-NA values are Timestamps, treat as datetime column col_na = df[col.notna()][name] - if len(col_na) and all(isinstance(x, pd.Timestamp) for x in col_na): + if len(col_na) and all( + isinstance(x, (pd.Timestamp, datetime)) for x in col_na + ): df[name] = col.apply( lambda x: None if pd.isna(x) else x.isoformat() ) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 3684156d..d81cec3d 100644 --- a/pyogrio/tests/test_geopandas_io.py 
+++ b/pyogrio/tests/test_geopandas_io.py @@ -393,19 +393,29 @@ def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) +@pytest.mark.parametrize( + "dates_raw", + [ + ( + pd.Timestamp("2020-01-01T09:00:00.123-05:00"), + pd.Timestamp("2020-01-01T10:00:00-05:00"), + None, + ), + ( + datetime.fromisoformat("2020-01-01T09:00:00.123-05:00"), + datetime.fromisoformat("2020-01-01T10:00:00-05:00"), + None, + ), + ], +) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_read_write_datetime_timestamp_with_nulls(tmp_path, ext, use_arrow): +def test_read_write_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_arrow): if use_arrow and __gdal_version__ < (3, 11, 0): pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") - dates_raw = [ - pd.Timestamp("2020-01-01T09:00:00.123-05:00"), - pd.Timestamp("2020-01-01T10:00:00-05:00"), - None, - ] dates = pd.Series(dates_raw, dtype="O") if PANDAS_GE_20: @@ -422,6 +432,9 @@ def test_read_write_datetime_timestamp_with_nulls(tmp_path, ext, use_arrow): write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) + # With some older versions, the offset is represented slightly differently + if str(result.dates.dtype) == "datetime64[ns, pytz.FixedOffset(-300)]": + result.dates = result.dates.astype(expected.dtype) assert_series_equal(result.dates, expected) From 3035a1166061e91dd10474d8bdca76186dcadc7f Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sat, 18 Jan 2025 09:00:05 +0100 Subject: [PATCH 11/59] Rename some test functions for consistency --- pyogrio/tests/test_geopandas_io.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index d81cec3d..98bc7657 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ 
b/pyogrio/tests/test_geopandas_io.py @@ -297,7 +297,7 @@ def test_read_datetime(datetime_file, use_arrow): @pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ") @pytest.mark.requires_arrow_write_api -def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow): +def test_write_datetime_tz(datetime_tz_file, tmp_path, use_arrow): if use_arrow and __gdal_version__ < (3, 11, 0): pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") @@ -374,7 +374,8 @@ def test_write_datetime_mixed_offsets(tmp_path, use_arrow): @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.requires_arrow_write_api -def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): +def test_write_datetime_no_tz(tmp_path, ext, use_arrow): + """Test writing/reading a datetime column without timezone information.""" if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0): pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") @@ -412,7 +413,7 @@ def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow): "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_read_write_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_arrow): +def test_write_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_arrow): if use_arrow and __gdal_version__ < (3, 11, 0): pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") From 9efdc091b915a7c4cbf348a87e7063d79d898d37 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sat, 18 Jan 2025 09:18:15 +0100 Subject: [PATCH 12/59] Avoid warning in test --- pyogrio/tests/test_geopandas_io.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 98bc7657..50fc758e 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -356,20 +356,23 @@ def 
test_write_datetime_mixed_offsets(tmp_path, use_arrow): if use_arrow and __gdal_version__ < (3, 11, 0): pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") - # Pandas datetime64 column types doesn't support mixed timezone offsets, so this - # list converts to pandas.Timestamp objects instead. - dates = ["2023-01-01 11:00:01.111+01:00", "2023-06-01 10:00:01.111+05:00"] - offset_col = pd.Series(pd.to_datetime(dates), name="dates") + # Pandas datetime64 column types doesn't support mixed timezone offsets, so + # it needs to be a list of pandas.Timestamp objects instead. + dates = [ + pd.Timestamp("2023-01-01 11:00:01.111+01:00"), + pd.Timestamp("2023-06-01 10:00:01.111+05:00"), + ] + expected = pd.Series(dates, name="dates") + df = gp.GeoDataFrame( - {"dates": offset_col, "geometry": [Point(1, 1), Point(1, 1)]}, + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326", ) fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - assert result["dates"][0] == offset_col[0] - assert_series_equal(result["dates"], offset_col) + assert_series_equal(result["dates"], expected) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) From eb80e0872bd4d15a60b2397108bd4850275ab05e Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sat, 18 Jan 2025 09:35:47 +0100 Subject: [PATCH 13/59] Improve inline comment --- pyogrio/geopandas.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 5b2151e3..e708b65f 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -255,8 +255,9 @@ def read_dataframe( read_func = read_arrow if use_arrow else read gdal_force_2d = False if use_arrow else force_2d - # Always read datetimes are as string values to preserve (mixed) timezone info - # as numpy does not directly support timezones and arrow support is also limited. 
+ # Always read datetimes as string values to preserve (mixed) timezone info + # as numpy does not directly support timezones and arrow datetime columns + # don't support mixed timezones. result = read_func( path_or_buffer, layer=layer, From d50b2d04da7b14e1e20792788549c1dea8561f52 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sat, 18 Jan 2025 12:25:22 +0100 Subject: [PATCH 14/59] Update CHANGES.md --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 117751cd..4af6563a 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,6 +5,7 @@ ### Improvements - Capture all errors logged by gdal when opening a file fails (#495). +- Improve support for datetime columns with mixed or naive times (#486). ### Bug fixes From 1efa5bfb1d09e1e20eac6d9e9e2983e64152715f Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 20 Jan 2025 11:12:49 +0100 Subject: [PATCH 15/59] Simplify code --- pyogrio/geopandas.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 19793113..4728b4c4 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -600,7 +600,7 @@ def write_dataframe( datetime_cols.append(name) elif isinstance(dtype, pd.DatetimeTZDtype): # Also for regular datetime columns with timezone mixed timezones are - # possible when thera is a difference between summer and winter time. + # possible when there is a difference between summer and winter time. df[name] = col.apply(lambda x: None if pd.isna(x) else x.isoformat()) datetime_cols.append(name) @@ -678,15 +678,11 @@ def write_dataframe( gdal_tz_offsets[name] = gdal_offset_representation.values elif col.dtype == "object": - # Column of Timestamp objects, also split in naive datetime and tz offset + # Column of Timestamp/datetime objects, split in naive datetime and tz.
col_na = df[col.notna()][name] - if len(col_na) and all(isinstance(x, pd.Timestamp) for x in col_na): - tz_offset = col.apply(lambda x: None if pd.isna(x) else x.utcoffset()) - gdal_offset_repr = tz_offset // pd.Timedelta("15m") + 100 - gdal_tz_offsets[name] = gdal_offset_repr.values - naive = col.apply(lambda x: None if pd.isna(x) else x.tz_localize(None)) - values = naive.values - elif len(col_na) and all(isinstance(x, datetime) for x in col_na): + if len(col_na) and all( + isinstance(x, (pd.Timestamp, datetime)) for x in col_na + ): tz_offset = col.apply(lambda x: None if pd.isna(x) else x.utcoffset()) gdal_offset_repr = tz_offset // pd.Timedelta("15m") + 100 gdal_tz_offsets[name] = gdal_offset_repr.values From 0032839d22ba7d4157809e39d31ec7fe77ecd0af Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 20 Jan 2025 17:25:48 +0100 Subject: [PATCH 16/59] Don't cast UTC data to string when writing --- pyogrio/geopandas.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 4728b4c4..d3cf011e 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -598,9 +598,9 @@ def write_dataframe( lambda x: None if pd.isna(x) else x.isoformat() ) datetime_cols.append(name) - elif isinstance(dtype, pd.DatetimeTZDtype): - # Also for regular datetime columns with timezone mixed timezones are - # possible when there is a difference between summer and winter time. + elif isinstance(dtype, pd.DatetimeTZDtype) and str(dtype.tz) != "UTC": + # When a timezone has daylight saving time the offsets can also be + # different. UTC doesn't have this issue. df[name] = col.apply(lambda x: None if pd.isna(x) else x.isoformat()) datetime_cols.append(name) From 9d2bfce2bafdc9bc31d1fbfc32ef01a2436bf1b0 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 20 Jan 2025 17:26:58 +0100 Subject: [PATCH 17/59] Various improvements to tests - Test result < GDAL 3.11 instead of skipping - Add UTC test - ... 
--- pyogrio/tests/test_geopandas_io.py | 183 ++++++++++++++++++++--------- 1 file changed, 126 insertions(+), 57 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 50fc758e..be191a4c 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -6,6 +6,7 @@ from zipfile import ZipFile import numpy as np +from pandas.api.types import is_datetime64_dtype from pyogrio import ( __gdal_version__, @@ -295,93 +296,117 @@ def test_read_datetime(datetime_file, use_arrow): assert df.col.dtype.name == "datetime64[ns]" +@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ") @pytest.mark.requires_arrow_write_api -def test_write_datetime_tz(datetime_tz_file, tmp_path, use_arrow): - if use_arrow and __gdal_version__ < (3, 11, 0): - pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") +def test_write_read_datetime_tz(tmp_path, ext, use_arrow): + """Write and read file with all equal timezones. + + This should result in the result being in pandas datetime64 dtype column. + """ + dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] + if PANDAS_GE_20: + dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") + else: + dates = pd.to_datetime(dates_raw) - df = read_dataframe(datetime_tz_file) # Make the index non-consecutive to test this case as well. 
Added for issue # https://github.com/geopandas/pyogrio/issues/324 - df = df.set_index(np.array([0, 2])) - raw_expected = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] + df = gp.GeoDataFrame( + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, + index=[0, 2], + crs="EPSG:4326", + ) + assert isinstance(df.dates.dtype, pd.DatetimeTZDtype) - if PANDAS_GE_20: - expected = pd.to_datetime(raw_expected, format="ISO8601").as_unit("ms") - else: - expected = pd.to_datetime(raw_expected) - expected = pd.Series(expected, name="datetime_col") - assert_series_equal(df.datetime_col, expected, check_index=False) - # test write and read round trips - fpath = tmp_path / "test.gpkg" + fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) - df_read = read_dataframe(fpath, use_arrow=use_arrow) - assert_series_equal(df_read.datetime_col, expected) + result = read_dataframe(fpath, use_arrow=use_arrow) + + # With some older versions, the offset is represented slightly differently + if str(result.dates.dtype).endswith(", pytz.FixedOffset(-300)]"): + result.dates = result.dates.astype(df.dates.dtype) + + if use_arrow and __gdal_version__ < (3, 11, 0): + if ext in (".fgb", ".gpkg"): + # With GDAL < 3.11 with arrow, datetime columns are written as string type + # columns + df.dates = df.dates.map( + lambda x: x.isoformat() if x is not pd.NaT else pd.NaT + ) + + assert_series_equal(result.dates, df.dates, check_index=False) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_datetime_localized_mixed_offset(tmp_path, use_arrow): +def test_write_read_datetime_localized_mixed_offset(tmp_path, use_arrow): """Test with localized dates across a different summer/winter timezone offset.""" - if use_arrow and __gdal_version__ < (3, 11, 0): - pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") - # Australian Summer Time AEDT (GMT+11), Standard Time 
AEST (GMT+10) - dates = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"] - naive_col = pd.Series(pd.to_datetime(dates), name="dates") - localised_col = naive_col.dt.tz_localize("Australia/Sydney") - localised_ts_col = localised_col.map(pd.Timestamp.isoformat).map(pd.Timestamp) + dates_raw = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"] + dates_naive = pd.Series(pd.to_datetime(dates_raw), name="dates") + dates_local = dates_naive.dt.tz_localize("Australia/Sydney") + dates_local_offsets_str = dates_local.map(pd.Timestamp.isoformat) + dates_exp = dates_local_offsets_str.map(pd.Timestamp) df = gp.GeoDataFrame( - {"dates": localised_col, "geometry": [Point(1, 1), Point(1, 1)]}, + {"dates": dates_local, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326", ) fpath = tmp_path / "test.gpkg" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) + if use_arrow and __gdal_version__ < (3, 11, 0): + # With GDAL < 3.11 with arrow, datetime columns written as string type columns + dates_exp = dates_local_offsets_str + # GDAL tz only encodes offsets, not timezones - assert_series_equal(result["dates"], localised_ts_col) + assert_series_equal(result["dates"], dates_exp) +@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_datetime_mixed_offsets(tmp_path, use_arrow): +def test_write_read_datetime_mixed_offsets(tmp_path, ext, use_arrow): """Test with dates with mixed timezone offsets.""" - if use_arrow and __gdal_version__ < (3, 11, 0): - pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") - # Pandas datetime64 column types doesn't support mixed timezone offsets, so # it needs to be a list of pandas.Timestamp objects instead. 
- dates = [ - pd.Timestamp("2023-01-01 11:00:01.111+01:00"), - pd.Timestamp("2023-06-01 10:00:01.111+05:00"), - ] - expected = pd.Series(dates, name="dates") + dates_raw = ["2023-01-01 11:00:01.111+01:00", "2023-06-01 10:00:01.111+05:00"] + dates_ts = list(map(pd.Timestamp, dates_raw)) df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, + {"dates": dates_ts, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326", ) - fpath = tmp_path / "test.gpkg" + fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - assert_series_equal(result["dates"], expected) + if use_arrow and __gdal_version__ < (3, 11, 0): + if ext in (".geojson", ".geojsonl"): + # With GDAL < 3.11 with arrow, GDAL converts mixed timezone datetimes to UTC + # when read as the arrow datetime column type does not support mixed tz. + if PANDAS_GE_20: + df.dates = pd.to_datetime(dates_ts, utc=True).as_unit("ms") + else: + df.dates = pd.to_datetime(dates_ts, utc=True) + elif ext in (".gpkg", ".fgb"): + # With arrow and GDAL < 3.11, mixed timezone datetimes are written as string + # type columns, so no proper roundtrip possible. 
+ df.dates = df.dates.map(pd.Timestamp.isoformat) + + assert_geodataframe_equal(result, df) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.requires_arrow_write_api -def test_write_datetime_no_tz(tmp_path, ext, use_arrow): - """Test writing/reading a datetime column without timezone information.""" - if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0): - pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") - +def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): + """Test writing/reading a column with naive datetimes (no timezone information).""" dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") @@ -390,10 +415,21 @@ def test_write_datetime_no_tz(tmp_path, ext, use_arrow): df = gp.GeoDataFrame( {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326" ) + fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - assert_geodataframe_equal(df, result) + + if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0): + # With GDAL < 3.11 with arrow, columns with naive datetimes are written + # correctly, but when read they are wrongly interpreted as being in UTC. + # The reason is complicated, but more info can be found e.g. 
here: + # https://github.com/geopandas/pyogrio/issues/487#issuecomment-2423762807 + assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) + pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow") + + assert is_datetime64_dtype(result.dates.dtype) + assert_geodataframe_equal(result, df) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @@ -416,30 +452,63 @@ def test_write_datetime_no_tz(tmp_path, ext, use_arrow): "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_arrow): +def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_arrow): + """Datetime objects with null values and the equal offset are read as datetime64.""" + dates = pd.Series(dates_raw, dtype="O") + df = gp.GeoDataFrame( + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, + crs="EPSG:4326", + ) + if PANDAS_GE_20: + dates_exp = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") + else: + dates_exp = pd.to_datetime(dates_raw) + exp_df = df.copy() + exp_df.dates = pd.Series(dates_exp, name="dates") + + fpath = tmp_path / f"test{ext}" + write_dataframe(df, fpath, use_arrow=use_arrow) + result = read_dataframe(fpath, use_arrow=use_arrow) + + # With some older versions, the offset is represented slightly differently + if str(result.dates.dtype).endswith(", pytz.FixedOffset(-300)]"): + result.dates = result.dates.astype(exp_df.dates.dtype) + if use_arrow and __gdal_version__ < (3, 11, 0): - pytest.skip("Arrow datetime handling improved in GDAL >= 3.11") + if ext in (".fgb", ".gpkg"): + # With GDAL < 3.11 with arrow, datetime columns are written as string type + # columns + exp_df.dates = exp_df.dates.map( + lambda x: x.isoformat() if x is not pd.NaT else pd.NaT + ) - dates = pd.Series(dates_raw, dtype="O") + assert_geodataframe_equal(result, exp_df) + 
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) +@pytest.mark.requires_arrow_write_api +def test_write_read_datetime_utc(tmp_path, ext, use_arrow): + """Test writing/reading a column with UTC datetimes.""" + dates_raw = ["2020-01-01 09:00:00.123Z", "2020-01-01 10:00:00Z"] if PANDAS_GE_20: - expected = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") + dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: - expected = pd.to_datetime(dates_raw) - expected = pd.Series(expected, name="dates") - + dates = pd.to_datetime(dates_raw) df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, - crs="EPSG:4326", + {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326" ) + fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) - # With some older versions, the offset is represented slightly differently - if str(result.dates.dtype) == "datetime64[ns, pytz.FixedOffset(-300)]": - result.dates = result.dates.astype(expected.dtype) - assert_series_equal(result.dates, expected) + if use_arrow and ext == ".fgb" and __gdal_version__ < (3, 11, 0): + # With GDAL < 3.11 with arrow, timezone information is dropped when reading .fgb + assert_series_equal(result.dates, df.dates.dt.tz_localize(None)) + pytest.xfail("UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow") + + assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) + assert_geodataframe_equal(result, df) def test_read_null_values(tmp_path, use_arrow): From ca9a8ae24003e637a78c7db718c7839275ace597 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 20 Jan 2025 17:58:50 +0100 Subject: [PATCH 18/59] Small fixes to tests --- pyogrio/tests/test_geopandas_io.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index be191a4c..2f0331f2
100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -6,7 +6,6 @@ from zipfile import ZipFile import numpy as np -from pandas.api.types import is_datetime64_dtype from pyogrio import ( __gdal_version__, @@ -40,6 +39,7 @@ import geopandas as gp import pandas as pd from geopandas.array import from_wkt + from pandas.api.types import is_datetime64_dtype import shapely # if geopandas is present, shapely is expected to be present from shapely.geometry import Point @@ -331,9 +331,7 @@ def test_write_read_datetime_tz(tmp_path, ext, use_arrow): if ext in (".fgb", ".gpkg"): # With GDAL < 3.11 with arrow, datetime columns are written as string type # columns - df.dates = df.dates.map( - lambda x: x.isoformat() if x is not pd.NaT else pd.NaT - ) + df.dates = df.dates.map(lambda x: x.isoformat()) assert_series_equal(result.dates, df.dates, check_index=False) @@ -479,7 +477,7 @@ def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_ar # With GDAL < 3.11 with arrow, datetime columns are written as string type # columns exp_df.dates = exp_df.dates.map( - lambda x: x.isoformat() if x is not pd.NaT else pd.NaT + lambda x: x.isoformat() if x is not pd.NaT else None ) assert_geodataframe_equal(result, exp_df) From deb862c2f5dcd6605bc2e972e9bce5bb00ea6b82 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 20 Jan 2025 18:32:53 +0100 Subject: [PATCH 19/59] Xfail some tests where needed --- pyogrio/tests/test_geopandas_io.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 2f0331f2..f3eac4b4 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -304,6 +304,12 @@ def test_write_read_datetime_tz(tmp_path, ext, use_arrow): This should result in the result being in pandas datetime64 dtype column. 
""" + if use_arrow and __gdal_version__ < (3, 10, 0) and ext in (".geojson", ".geojsonl"): + # With GDAL < 3.10 with arrow, the timezone offset was applied to the datetime + # as well as retaining the timezone. + # This was fixed in https://github.com/OSGeo/gdal/pull/11049 + pytest.xfail("Wrong datetimes read in GeoJSON with GDAL < 3.10 via arrow") + dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") @@ -452,6 +458,12 @@ def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): @pytest.mark.requires_arrow_write_api def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_arrow): """Datetime objects with null values and the equal offset are read as datetime64.""" + if use_arrow and __gdal_version__ < (3, 10, 0) and ext in (".geojson", ".geojsonl"): + # With GDAL < 3.10 with arrow, the timezone offset was applied to the datetime + # as well as retaining the timezone. 
+ # This was fixed in https://github.com/OSGeo/gdal/pull/11049 + pytest.xfail("Wrong datetimes read in GeoJSON with GDAL < 3.10 via arrow") + dates = pd.Series(dates_raw, dtype="O") df = gp.GeoDataFrame( {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, From e35c356176c965ab5bfddb35927adca97da7dc9b Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 22 Jan 2025 22:23:39 +0100 Subject: [PATCH 20/59] Make UTC assert more specific --- pyogrio/tests/test_geopandas_io.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index f3eac4b4..0566f59a 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -39,7 +39,6 @@ import geopandas as gp import pandas as pd from geopandas.array import from_wkt - from pandas.api.types import is_datetime64_dtype import shapely # if geopandas is present, shapely is expected to be present from shapely.geometry import Point @@ -432,7 +431,7 @@ def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow") - assert is_datetime64_dtype(result.dates.dtype) + assert str(result.dates.dtype) == "datetime64[ms, UTC]" assert_geodataframe_equal(result, df) From 593b2820c5668e16129d6d856bb5a39e765f0793 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 22 Jan 2025 22:58:01 +0100 Subject: [PATCH 21/59] Revert "Make UTC assert more specific" This reverts commit e35c356176c965ab5bfddb35927adca97da7dc9b. 
--- pyogrio/tests/test_geopandas_io.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 0566f59a..f3eac4b4 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -39,6 +39,7 @@ import geopandas as gp import pandas as pd from geopandas.array import from_wkt + from pandas.api.types import is_datetime64_dtype import shapely # if geopandas is present, shapely is expected to be present from shapely.geometry import Point @@ -431,7 +432,7 @@ def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow") - assert str(result.dates.dtype) == "datetime64[ms, UTC]" + assert is_datetime64_dtype(result.dates.dtype) assert_geodataframe_equal(result, df) From 35d8d87239445249bbe37bf204bedc7773e2a4ac Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 22 Jan 2025 22:58:37 +0100 Subject: [PATCH 22/59] Update test_geopandas_io.py --- pyogrio/tests/test_geopandas_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index f3eac4b4..bf9f70dc 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -517,7 +517,7 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow): assert_series_equal(result.dates, df.dates.dt.tz_localize(None)) pytest.xfail("UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow") - assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) + assert str(result.dates.dtype) == "datetime64[ms, UTC]" assert_geodataframe_equal(result, df) From 41c9da6a5937d8557af10ef354bb605c6844d843 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 23 Jan 2025 02:33:07 +0100 Subject: [PATCH 23/59] Use astype("string") instead of apply Needs to be astype"string") 
instead of astype(str) to support nan values --- pyogrio/geopandas.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index d3cf011e..e2128cd5 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -594,14 +594,12 @@ def write_dataframe( if len(col_na) and all( isinstance(x, (pd.Timestamp, datetime)) for x in col_na ): - df[name] = col.apply( - lambda x: None if pd.isna(x) else x.isoformat() - ) + df[name] = col.astype("string") datetime_cols.append(name) elif isinstance(dtype, pd.DatetimeTZDtype) and str(dtype.tz) != "UTC": - # When a timezone has daylight saving time the offsets can also be - # different. UTC doesn't have this issue. - df[name] = col.apply(lambda x: None if pd.isna(x) else x.isoformat()) + # When it is a datetime column with a timezone different than UTC, it + # needs to be converted to string, otherwise the timezone info is lost. + df[name] = col.astype("string") datetime_cols.append(name) table = pa.Table.from_pandas(df, preserve_index=False) From f53af87dc55e8ee94cc6cd44dd8a7a13d1416b78 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 23 Jan 2025 02:33:23 +0100 Subject: [PATCH 24/59] Improve tests --- pyogrio/tests/test_geopandas_io.py | 161 ++++++++++++++++------------- 1 file changed, 92 insertions(+), 69 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index bf9f70dc..015c99c6 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -39,7 +39,7 @@ import geopandas as gp import pandas as pd from geopandas.array import from_wkt - from pandas.api.types import is_datetime64_dtype + from pandas.api.types import is_datetime64_dtype, is_object_dtype import shapely # if geopandas is present, shapely is expected to be present from shapely.geometry import Point @@ -296,6 +296,35 @@ def test_read_datetime(datetime_file, use_arrow): assert df.col.dtype.name == "datetime64[ns]" 
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) +@pytest.mark.requires_arrow_write_api +def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): + """Test writing/reading a column with naive datetimes (no timezone information).""" + dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00", None] + if PANDAS_GE_20: + dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") + else: + dates = pd.to_datetime(dates_raw) + df = gp.GeoDataFrame( + {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" + ) + + fpath = tmp_path / f"test{ext}" + write_dataframe(df, fpath, use_arrow=use_arrow) + result = read_dataframe(fpath, use_arrow=use_arrow) + + if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0): + # With GDAL < 3.11 with arrow, columns with naive datetimes are written + # correctly, but when read they are wrongly interpreted as being in UTC. + # The reason is complicated, but more info can be found e.g. here: + # https://github.com/geopandas/pyogrio/issues/487#issuecomment-2423762807 + assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) + pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow") + + assert is_datetime64_dtype(result.dates.dtype) + assert_geodataframe_equal(result, df) + + @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ") @pytest.mark.requires_arrow_write_api @@ -310,7 +339,7 @@ def test_write_read_datetime_tz(tmp_path, ext, use_arrow): # This was fixed in https://github.com/OSGeo/gdal/pull/11049 pytest.xfail("Wrong datetimes read in GeoJSON with GDAL < 3.10 via arrow") - dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] + dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", None] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: @@ -319,8 +348,8 @@ def 
test_write_read_datetime_tz(tmp_path, ext, use_arrow): # Make the index non-consecutive to test this case as well. Added for issue # https://github.com/geopandas/pyogrio/issues/324 df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, - index=[0, 2], + {"dates": dates, "geometry": [Point(1, 1)] * 3}, + index=[0, 2, 3], crs="EPSG:4326", ) assert isinstance(df.dates.dtype, pd.DatetimeTZDtype) @@ -330,45 +359,58 @@ def test_write_read_datetime_tz(tmp_path, ext, use_arrow): result = read_dataframe(fpath, use_arrow=use_arrow) # With some older versions, the offset is represented slightly differently - if str(result.dates.dtype).endswith(", pytz.FixedOffset(-300)]"): + if result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"): result.dates = result.dates.astype(df.dates.dtype) - if use_arrow and __gdal_version__ < (3, 11, 0): - if ext in (".fgb", ".gpkg"): - # With GDAL < 3.11 with arrow, datetime columns are written as string type - # columns - df.dates = df.dates.map(lambda x: x.isoformat()) + if use_arrow and ext in (".fgb", ".gpkg") and __gdal_version__ < (3, 11, 0): + # With GDAL < 3.11 with arrow, datetime columns are written as string type + df_exp = df.copy() + df_exp.dates = df_exp[df_exp.dates.notna()].dates.astype(str) + assert_series_equal(result.dates, df_exp.dates, check_index=False) + pytest.xfail("datetime columns written as string with GDAL < 3.11 via arrow") + assert isinstance(df.dates.dtype, pd.DatetimeTZDtype) assert_series_equal(result.dates, df.dates, check_index=False) +@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_read_datetime_localized_mixed_offset(tmp_path, use_arrow): +def test_write_read_datetime_tz_localized_mixed_offset(tmp_path, ext, use_arrow): """Test with localized dates across a different summer/winter timezone offset.""" # 
Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10) - dates_raw = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"] + dates_raw = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111", None] dates_naive = pd.Series(pd.to_datetime(dates_raw), name="dates") dates_local = dates_naive.dt.tz_localize("Australia/Sydney") - dates_local_offsets_str = dates_local.map(pd.Timestamp.isoformat) + dates_local_offsets_str = dates_local.astype("string").astype("O") dates_exp = dates_local_offsets_str.map(pd.Timestamp) df = gp.GeoDataFrame( - {"dates": dates_local, "geometry": [Point(1, 1), Point(1, 1)]}, - crs="EPSG:4326", + {"dates": dates_local, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" ) - fpath = tmp_path / "test.gpkg" + fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) if use_arrow and __gdal_version__ < (3, 11, 0): - # With GDAL < 3.11 with arrow, datetime columns written as string type columns - dates_exp = dates_local_offsets_str + if ext in (".geojson", ".geojsonl"): + # With GDAL < 3.11 with arrow, GDAL converts mixed timezone datetimes to UTC + # when read as the arrow datetime column type does not support mixed tz. 
+ dates_utc = dates_local.dt.tz_convert("UTC") + if PANDAS_GE_20: + dates_utc = dates_utc.dt.as_unit("ms") + assert_series_equal(result.dates, dates_utc) + pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow") + elif ext in (".gpkg", ".fgb"): + # With GDAL < 3.11 with arrow, datetime columns written as string type + assert_series_equal(result.dates, dates_local_offsets_str) + pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow") # GDAL tz only encodes offsets, not timezones - assert_series_equal(result["dates"], dates_exp) + assert is_object_dtype(result.dates.dtype) + assert_series_equal(result.dates, dates_exp) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @@ -376,16 +418,18 @@ def test_write_read_datetime_localized_mixed_offset(tmp_path, use_arrow): "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_read_datetime_mixed_offsets(tmp_path, ext, use_arrow): +def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, use_arrow): """Test with dates with mixed timezone offsets.""" # Pandas datetime64 column types doesn't support mixed timezone offsets, so # it needs to be a list of pandas.Timestamp objects instead. 
- dates_raw = ["2023-01-01 11:00:01.111+01:00", "2023-06-01 10:00:01.111+05:00"] - dates_ts = list(map(pd.Timestamp, dates_raw)) + dates = [ + pd.Timestamp("2023-01-01 11:00:01.111+01:00"), + pd.Timestamp("2023-06-01 10:00:01.111+05:00"), + None, + ] df = gp.GeoDataFrame( - {"dates": dates_ts, "geometry": [Point(1, 1), Point(1, 1)]}, - crs="EPSG:4326", + {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" ) fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) @@ -395,44 +439,21 @@ def test_write_read_datetime_mixed_offsets(tmp_path, ext, use_arrow): if ext in (".geojson", ".geojsonl"): # With GDAL < 3.11 with arrow, GDAL converts mixed timezone datetimes to UTC # when read as the arrow datetime column type does not support mixed tz. + df_exp = df.copy() + df_exp.dates = pd.to_datetime(dates, utc=True) if PANDAS_GE_20: - df.dates = pd.to_datetime(dates_ts, utc=True).as_unit("ms") - else: - df.dates = pd.to_datetime(dates_ts, utc=True) + df_exp.dates = df_exp.dates.dt.as_unit("ms") + assert_geodataframe_equal(result, df_exp) + pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow") elif ext in (".gpkg", ".fgb"): # With arrow and GDAL < 3.11, mixed timezone datetimes are written as string # type columns, so no proper roundtrip possible. 
- df.dates = df.dates.map(pd.Timestamp.isoformat) + df_exp = df.copy() + df_exp.dates = df_exp.dates.astype("string").astype("O") + assert_geodataframe_equal(result, df_exp) + pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow") - assert_geodataframe_equal(result, df) - - -@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) -@pytest.mark.requires_arrow_write_api -def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): - """Test writing/reading a column with naive datetimes (no timezone information).""" - dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"] - if PANDAS_GE_20: - dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") - else: - dates = pd.to_datetime(dates_raw) - df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326" - ) - - fpath = tmp_path / f"test{ext}" - write_dataframe(df, fpath, use_arrow=use_arrow) - result = read_dataframe(fpath, use_arrow=use_arrow) - - if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0): - # With GDAL < 3.11 with arrow, columns with naive datetimes are written - # correctly, but when read they are wrongly interpreted as being in UTC. - # The reason is complicated, but more info can be found e.g. 
here: - # https://github.com/geopandas/pyogrio/issues/487#issuecomment-2423762807 - assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) - pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow") - - assert is_datetime64_dtype(result.dates.dtype) + assert is_object_dtype(result.dates.dtype) assert_geodataframe_equal(result, df) @@ -456,8 +477,8 @@ def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_arrow): - """Datetime objects with null values and the equal offset are read as datetime64.""" +def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow): + """Datetime objects with equal offsets are read as datetime64.""" if use_arrow and __gdal_version__ < (3, 10, 0) and ext in (".geojson", ".geojsonl"): # With GDAL < 3.10 with arrow, the timezone offset was applied to the datetime # as well as retaining the timezone. 
@@ -466,9 +487,9 @@ def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_ar dates = pd.Series(dates_raw, dtype="O") df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, - crs="EPSG:4326", + {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" ) + if PANDAS_GE_20: dates_exp = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: @@ -481,17 +502,18 @@ def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_ar result = read_dataframe(fpath, use_arrow=use_arrow) # With some older versions, the offset is represented slightly differently - if str(result.dates.dtype).endswith(", pytz.FixedOffset(-300)]"): + if result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"): result.dates = result.dates.astype(exp_df.dates.dtype) if use_arrow and __gdal_version__ < (3, 11, 0): if ext in (".fgb", ".gpkg"): # With GDAL < 3.11 with arrow, datetime columns are written as string type - # columns - exp_df.dates = exp_df.dates.map( - lambda x: x.isoformat() if x is not pd.NaT else None - ) + exp2_df = exp_df.copy() + exp2_df.dates = exp2_df.dates.astype("string").astype("O") + assert_geodataframe_equal(result, exp2_df) + pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow") + assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) assert_geodataframe_equal(result, exp_df) @@ -499,14 +521,15 @@ def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_ar @pytest.mark.requires_arrow_write_api def test_write_read_datetime_utc(tmp_path, ext, use_arrow): """Test writing/reading a column with UTC datetimes.""" - dates_raw = ["2020-01-01 09:00:00.123Z", "2020-01-01 10:00:00Z"] + dates_raw = ["2020-01-01 09:00:00.123Z", "2020-01-01 10:00:00Z", None] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: dates = pd.to_datetime(dates_raw) df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), 
Point(1, 1)]}, crs="EPSG:4326" + {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" ) + assert df.dates.dtype.name == "datetime64[ms, UTC]" fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) @@ -517,7 +540,7 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow): assert_series_equal(result.dates, df.dates.dt.tz_localize(None)) pytest.xfail("UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow") - assert str(result.dates.dtype) == "datetime64[ms, UTC]" + assert result.dates.dtype.name == "datetime64[ms, UTC]" assert_geodataframe_equal(result, df) From a8c85b752c27c2132e6ef66a7e78f16bb9e5c023 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 23 Jan 2025 02:46:56 +0100 Subject: [PATCH 25/59] Fix tests for older versions --- pyogrio/tests/test_geopandas_io.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 015c99c6..fbf86cf7 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -385,7 +385,9 @@ def test_write_read_datetime_tz_localized_mixed_offset(tmp_path, ext, use_arrow) dates_naive = pd.Series(pd.to_datetime(dates_raw), name="dates") dates_local = dates_naive.dt.tz_localize("Australia/Sydney") dates_local_offsets_str = dates_local.astype("string").astype("O") - dates_exp = dates_local_offsets_str.map(pd.Timestamp) + dates_exp = dates_local_offsets_str.apply( + lambda x: pd.Timestamp(x) if pd.notna(x) else None + ) df = gp.GeoDataFrame( {"dates": dates_local, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" @@ -540,7 +542,7 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow): assert_series_equal(result.dates, df.dates.dt.tz_localize(None)) pytest.xfail("UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow") - assert result.dates.dtype.name == "datetime64[ms, UTC]" + assert result.dates.dtype.name in ("datetime64[ms, UTC]", "datetime64[ns, UTC]") 
assert_geodataframe_equal(result, df) From 40ca1a53b2fa352e6ccd800fb4fefdc512c30fe2 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 23 Jan 2025 02:50:00 +0100 Subject: [PATCH 26/59] Update test_geopandas_io.py --- pyogrio/tests/test_geopandas_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index fbf86cf7..7891fa10 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -531,7 +531,7 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow): df = gp.GeoDataFrame( {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" ) - assert df.dates.dtype.name == "datetime64[ms, UTC]" + assert df.dates.dtype.name in ("datetime64[ms, UTC]", "datetime64[ns, UTC]") fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) From e0273b5c0b94fdf734b78a95214b27d86c494990 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 3 Aug 2025 00:08:56 +0200 Subject: [PATCH 27/59] Add parameter to specify if dates need to be read as UTC or not --- pyogrio/geopandas.py | 30 +++++++++-- pyogrio/tests/test_geopandas_io.py | 81 ++++++++++++++++++++++-------- 2 files changed, 86 insertions(+), 25 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 4ad5ab9c..1683d26d 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -38,7 +38,7 @@ def _stringify_path(path): return path -def _try_parse_datetime(ser): +def _try_parse_datetime(ser, datetimes="UTC"): import pandas as pd # only called when pandas is known to be installed if PANDAS_GE_22: @@ -48,6 +48,17 @@ def _try_parse_datetime(ser): else: datetime_kwargs = {"yearfirst": True} + datetimes = datetimes.upper() + if datetimes == "UTC": + datetime_kwargs["utc"] = True + elif datetimes == "DATETIME": + datetime_kwargs["utc"] = False + else: + raise ValueError( + f"Invalid value for 'datetimes': {datetimes!r}. 
" + "Must be 'UTC' or 'DATETIME'." + ) + with warnings.catch_warnings(): warnings.filterwarnings( "ignore", @@ -93,6 +104,7 @@ def read_dataframe( use_arrow=None, on_invalid="raise", arrow_to_pandas_kwargs=None, + datetimes="UTC", **kwargs, ): """Read from an OGR data source to a GeoPandas GeoDataFrame or Pandas DataFrame. @@ -220,6 +232,18 @@ def read_dataframe( arrow_to_pandas_kwargs : dict, optional (default: None) When `use_arrow` is True, these kwargs will be passed to the `to_pandas`_ call for the arrow to pandas conversion. + datetimes : str, optional (default: "UTC") + The way datetime columns are returned. Possible values: + + - **"UTC"**: all datetime columns will be returned as pandas datetime64 columns + converted to UTC. Naive datetimes (without timezone information) will be + assumed to be in UTC timezone. + - **"DATETIME"**: datetimes will be returned in the timezone as they were read + from the data source. Columns with values in a single timezone or without + timezone information will be returned as pandas datetime64 columns. + Columns with mixed timezone data are returned as object columns with + pandas.Timestamp values. + **kwargs Additional driver-specific dataset open options passed to OGR. Invalid options will trigger a warning. 
@@ -330,7 +354,7 @@ def read_dataframe( # convert datetime columns that were read as string to datetime for dtype, column in zip(meta["dtypes"], meta["fields"]): if dtype is not None and dtype.startswith("datetime"): - df[column] = _try_parse_datetime(df[column]) + df[column] = _try_parse_datetime(df[column], datetimes=datetimes) if fid_as_index: df = df.set_index(meta["fid_column"]) @@ -363,7 +387,7 @@ def read_dataframe( df = pd.DataFrame(data, columns=columns, index=index) for dtype, c in zip(meta["dtypes"], df.columns): if dtype.startswith("datetime"): - df[c] = _try_parse_datetime(df[c]) + df[c] = _try_parse_datetime(df[c], datetimes=datetimes) if geometry is None or not read_geometry: return df diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 7e254967..f7f8d9a7 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -48,7 +48,7 @@ import geopandas as gp import pandas as pd from geopandas.array import from_wkt - from pandas.api.types import is_datetime64_dtype, is_object_dtype + from pandas.api.types import is_datetime64_any_dtype, is_object_dtype import shapely # if geopandas is present, shapely is expected to be present from shapely.geometry import Point @@ -329,14 +329,20 @@ def test_read_datetime(datetime_file, use_arrow): df = read_dataframe(datetime_file, use_arrow=use_arrow) if PANDAS_GE_20: # starting with pandas 2.0, it preserves the passed datetime resolution - assert df.col.dtype.name == "datetime64[ms]" + assert df.col.dtype.name == "datetime64[ms, UTC]" else: - assert df.col.dtype.name == "datetime64[ns]" + assert df.col.dtype.name == "datetime64[ns, UTC]" + + +def test_read_datetimes_invalid_param(datetime_file, use_arrow): + with pytest.raises(ValueError, match="Invalid value for 'datetimes'"): + read_dataframe(datetime_file, use_arrow=use_arrow, datetimes="INVALID") @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) 
+@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME"]) @pytest.mark.requires_arrow_write_api -def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): +def test_write_read_datetime_no_tz(tmp_path, ext, datetimes, use_arrow): """Test writing/reading a column with naive datetimes (no timezone information).""" dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00", None] if PANDAS_GE_20: @@ -349,7 +355,7 @@ def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) - result = read_dataframe(fpath, use_arrow=use_arrow) + result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes) if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0): # With GDAL < 3.11 with arrow, columns with naive datetimes are written @@ -359,14 +365,18 @@ def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow") - assert is_datetime64_dtype(result.dates.dtype) - assert_geodataframe_equal(result, df) + assert is_datetime64_any_dtype(result.dates.dtype) + if datetimes == "UTC": + assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) + else: + assert_geodataframe_equal(result, df) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) +@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME"]) @pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ") @pytest.mark.requires_arrow_write_api -def test_write_read_datetime_tz(tmp_path, ext, use_arrow): +def test_write_read_datetime_tz(tmp_path, ext, datetimes, use_arrow): """Write and read file with all equal timezones. This should result in the result being in pandas datetime64 dtype column. 
@@ -394,7 +404,7 @@ def test_write_read_datetime_tz(tmp_path, ext, use_arrow): fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) - result = read_dataframe(fpath, use_arrow=use_arrow) + result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes) # With some older versions, the offset is represented slightly differently if result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"): @@ -408,31 +418,44 @@ def test_write_read_datetime_tz(tmp_path, ext, use_arrow): pytest.xfail("datetime columns written as string with GDAL < 3.11 via arrow") assert isinstance(df.dates.dtype, pd.DatetimeTZDtype) - assert_series_equal(result.dates, df.dates, check_index=False) + if datetimes == "UTC": + assert_series_equal( + result.dates, df.dates.dt.tz_convert("UTC"), check_index=False + ) + else: + assert_series_equal(result.dates, df.dates, check_index=False) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) +@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_read_datetime_tz_localized_mixed_offset(tmp_path, ext, use_arrow): +def test_write_read_datetime_tz_localized_mixed_offset( + tmp_path, ext, datetimes, use_arrow +): """Test with localized dates across a different summer/winter timezone offset.""" # Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10) dates_raw = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111", None] dates_naive = pd.Series(pd.to_datetime(dates_raw), name="dates") dates_local = dates_naive.dt.tz_localize("Australia/Sydney") dates_local_offsets_str = dates_local.astype("string").astype("O") - dates_exp = dates_local_offsets_str.apply( - lambda x: pd.Timestamp(x) if pd.notna(x) else None - ) + if datetimes == "UTC": + dates_exp = dates_local.dt.tz_convert("UTC") + if PANDAS_GE_20: + dates_exp = dates_exp.dt.as_unit("ms") + 
else: + dates_exp = dates_local_offsets_str.apply( + lambda x: pd.Timestamp(x) if pd.notna(x) else None + ) df = gp.GeoDataFrame( {"dates": dates_local, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" ) fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) - result = read_dataframe(fpath, use_arrow=use_arrow) + result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes) if use_arrow and __gdal_version__ < (3, 11, 0): if ext in (".geojson", ".geojsonl"): @@ -449,16 +472,20 @@ def test_write_read_datetime_tz_localized_mixed_offset(tmp_path, ext, use_arrow) pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow") # GDAL tz only encodes offsets, not timezones - assert is_object_dtype(result.dates.dtype) + if datetimes == "UTC": + assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) + else: + assert is_object_dtype(result.dates.dtype) assert_series_equal(result.dates, dates_exp) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) +@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, use_arrow): +def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, datetimes, use_arrow): """Test with dates with mixed timezone offsets.""" # Pandas datetime64 column types doesn't support mixed timezone offsets, so # it needs to be a list of pandas.Timestamp objects instead. 
@@ -473,7 +500,7 @@ def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, use_arrow): ) fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) - result = read_dataframe(fpath, use_arrow=use_arrow) + result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes) if use_arrow and __gdal_version__ < (3, 11, 0): if ext in (".geojson", ".geojsonl"): @@ -493,8 +520,15 @@ def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, use_arrow): assert_geodataframe_equal(result, df_exp) pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow") - assert is_object_dtype(result.dates.dtype) - assert_geodataframe_equal(result, df) + if datetimes == "UTC": + assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) + exp_dates = pd.to_datetime(df.dates, utc=True) + if PANDAS_GE_20: + exp_dates = exp_dates.dt.as_unit("ms") + assert_series_equal(result.dates, exp_dates) + else: + assert is_object_dtype(result.dates.dtype) + assert_geodataframe_equal(result, df) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @@ -513,11 +547,12 @@ def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, use_arrow): ), ], ) +@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow): +def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, datetimes): """Datetime objects with equal offsets are read as datetime64.""" if use_arrow and __gdal_version__ < (3, 10, 0) and ext in (".geojson", ".geojsonl"): # With GDAL < 3.10 with arrow, the timezone offset was applied to the datetime @@ -539,7 +574,7 @@ def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow): fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) - result = 
read_dataframe(fpath, use_arrow=use_arrow) + result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes) # With some older versions, the offset is represented slightly differently if result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"): @@ -553,6 +588,8 @@ def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow): assert_geodataframe_equal(result, exp2_df) pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow") + if datetimes == "UTC": + exp_df.dates = exp_df.dates.dt.tz_convert("UTC") assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) assert_geodataframe_equal(result, exp_df) From 91027d1a62f440f78c4c8ceb2a2216e798b4933c Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 3 Aug 2025 11:18:10 +0200 Subject: [PATCH 28/59] Fix tests for old gdal versions --- pyogrio/tests/test_geopandas_io.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index f7f8d9a7..a38bfd5d 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -595,8 +595,9 @@ def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, dat @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) +@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME"]) @pytest.mark.requires_arrow_write_api -def test_write_read_datetime_utc(tmp_path, ext, use_arrow): +def test_write_read_datetime_utc(tmp_path, ext, use_arrow, datetimes): """Test writing/reading a column with UTC datetimes.""" dates_raw = ["2020-01-01 09:00:00.123Z", "2020-01-01 10:00:00Z", None] if PANDAS_GE_20: @@ -610,9 +611,14 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow): fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) - result = read_dataframe(fpath, use_arrow=use_arrow) + result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes) - if use_arrow 
and ext == ".fgb" and __gdal_version__ < (3, 11, 0):
+    if (
+        use_arrow
+        and datetimes != "UTC"
+        and ext == ".fgb"
+        and __gdal_version__ < (3, 11, 0)
+    ):
         # With GDAL < 3.11 with arrow, timezone information is dropped when reading .fgb
         assert_series_equal(result.dates, df.dates.dt.tz_localize(None))
         pytest.xfail("UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow")

From 46f7847670932713d74a0735dbab7417c815d072 Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sun, 3 Aug 2025 12:56:55 +0200
Subject: [PATCH 29/59] Treat Object column as datetime if the first non-null
 value is a datetime

---
 pyogrio/geopandas.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py
index 1683d26d..0d4ae1da 100644
--- a/pyogrio/geopandas.py
+++ b/pyogrio/geopandas.py
@@ -610,6 +610,7 @@ def write_dataframe(
         crs = geometry.crs.to_wkt("WKT1_GDAL")
 
     if use_arrow:
+        import pandas as pd  # only called when pandas is known to be installed
         import pyarrow as pa
 
         from pyogrio.raw import write_arrow
@@ -651,13 +652,13 @@ def write_dataframe(
         for name, dtype in df.dtypes.items():
             col = df[name]
             if dtype == "object":
-                # When all non-NA values are Timestamps, treat as datetime column
-                col_na = df[col.notna()][name]
-                if len(col_na) and all(
-                    isinstance(x, (pd.Timestamp, datetime)) for x in col_na
-                ):
-                    df[name] = col.astype("string")
-                    datetime_cols.append(name)
+                # If first non-NA value is a datetime-like object, treat as datetime
+                # column.
+                first_non_na_index = col.first_valid_index()
+                if first_non_na_index is not None:
+                    if isinstance(col[first_non_na_index], (pd.Timestamp, datetime)):
+                        df[name] = col.astype("string")
+                        datetime_cols.append(name)
             elif isinstance(dtype, pd.DatetimeTZDtype) and str(dtype.tz) != "UTC":
                 # When it is a datetime column with a timezone different than UTC, it
                 # needs to be converted to string, otherwise the timezone info is lost.
From 8f7d8534db5bca6cabd1ef9c4489431c483431e5 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 3 Aug 2025 15:49:14 +0200 Subject: [PATCH 30/59] Add support to return datetimes as string --- pyogrio/_io.pyx | 14 ++++-- pyogrio/_ogr.pxd | 8 ++++ pyogrio/geopandas.py | 26 ++++++---- pyogrio/tests/test_geopandas_io.py | 77 ++++++++++++++++++++++-------- pyogrio/tests/test_raw_io.py | 6 +-- 5 files changed, 95 insertions(+), 36 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index af5a68ea..215289ee 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -936,10 +936,16 @@ cdef process_fields( if datetime_as_string: # defer datetime parsing to user/ pandas layer - # Update to OGR_F_GetFieldAsISO8601DateTime when GDAL 3.7+ only - data[i] = get_string( - OGR_F_GetFieldAsString(ogr_feature, field_index), encoding=encoding - ) + IF CTE_GDAL_VERSION >= (3, 7, 0): + data[i] = get_string( + OGR_F_GetFieldAsISO8601DateTime(ogr_feature, field_index, NULL), + encoding=encoding, + ) + ELSE: + data[i] = get_string( + OGR_F_GetFieldAsString(ogr_feature, field_index), + encoding=encoding, + ) else: success = OGR_F_GetFieldAsDateTimeEx( ogr_feature, diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd index ca400f6a..4d07b2f4 100644 --- a/pyogrio/_ogr.pxd +++ b/pyogrio/_ogr.pxd @@ -415,6 +415,14 @@ IF CTE_GDAL_VERSION >= (3, 6, 0): ) +IF CTE_GDAL_VERSION >= (3, 7, 0): + + cdef extern from "ogr_api.h": + const char* OGR_F_GetFieldAsISO8601DateTime( + OGRFeatureH feature, int n, char** papszOptions + ) + + IF CTE_GDAL_VERSION >= (3, 8, 0): cdef extern from "ogr_api.h": diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 0d4ae1da..358e0182 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -13,6 +13,7 @@ PANDAS_GE_22, PANDAS_GE_30, PYARROW_GE_19, + __gdal_version__, ) from pyogrio.errors import DataSourceError from pyogrio.raw import ( @@ -41,15 +42,15 @@ def _stringify_path(path): def _try_parse_datetime(ser, datetimes="UTC"): import pandas as pd 
# only called when pandas is known to be installed - if PANDAS_GE_22: - datetime_kwargs = {"format": "ISO8601"} - elif PANDAS_GE_20: - datetime_kwargs = {"format": "ISO8601", "errors": "ignore"} - else: - datetime_kwargs = {"yearfirst": True} - datetimes = datetimes.upper() - if datetimes == "UTC": + datetime_kwargs = {} + if datetimes == "STRING": + # do not convert to datetime, return as-is + if __gdal_version__ < (3, 7, 0): + # GDAL < 3.7 doesn't return datetimes in ISO8601 format, so fix that + ser = ser.str.replace(" ", "T").str.replace("/", "-") + return ser + elif datetimes == "UTC": datetime_kwargs["utc"] = True elif datetimes == "DATETIME": datetime_kwargs["utc"] = False @@ -59,6 +60,14 @@ def _try_parse_datetime(ser, datetimes="UTC"): "Must be 'UTC' or 'DATETIME'." ) + if PANDAS_GE_22: + datetime_kwargs["format"] = "ISO8601" + elif PANDAS_GE_20: + datetime_kwargs["format"] = "ISO8601" + datetime_kwargs["errors"] = "ignore" + else: + datetime_kwargs["yearfirst"] = True + with warnings.catch_warnings(): warnings.filterwarnings( "ignore", @@ -243,6 +252,7 @@ def read_dataframe( timezone information will be returned as pandas datetime64 columns. Columns with mixed timezone data are returned as object columns with pandas.Timestamp values. + - **"STRING"**: all datetime columns will be returned as strings. **kwargs Additional driver-specific dataset open options passed to OGR. 
Invalid diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index a38bfd5d..6cb41c7d 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -48,7 +48,12 @@ import geopandas as gp import pandas as pd from geopandas.array import from_wkt - from pandas.api.types import is_datetime64_any_dtype, is_object_dtype + from pandas.api.types import ( + is_datetime64_any_dtype, + is_datetime64_dtype, + is_object_dtype, + is_string_dtype, + ) import shapely # if geopandas is present, shapely is expected to be present from shapely.geometry import Point @@ -340,11 +345,11 @@ def test_read_datetimes_invalid_param(datetime_file, use_arrow): @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) -@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME"]) +@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME", "STRING"]) @pytest.mark.requires_arrow_write_api def test_write_read_datetime_no_tz(tmp_path, ext, datetimes, use_arrow): """Test writing/reading a column with naive datetimes (no timezone information).""" - dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00", None] + dates_raw = ["2020-01-01T09:00:00.123", "2020-01-01T10:00:00", None] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: @@ -365,15 +370,20 @@ def test_write_read_datetime_no_tz(tmp_path, ext, datetimes, use_arrow): assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow") - assert is_datetime64_any_dtype(result.dates.dtype) if datetimes == "UTC": + assert is_datetime64_any_dtype(result.dates.dtype) assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) - else: + elif datetimes == "DATETIME": + assert is_datetime64_dtype(result.dates.dtype) assert_geodataframe_equal(result, df) + elif datetimes == "STRING": + assert is_string_dtype(result.dates.dtype) + dates_str = 
pd.Series(dates_raw, name="dates") + assert_series_equal(result.dates, dates_str, check_dtype=False) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) -@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME"]) +@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME", "STRING"]) @pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ") @pytest.mark.requires_arrow_write_api def test_write_read_datetime_tz(tmp_path, ext, datetimes, use_arrow): @@ -422,12 +432,15 @@ def test_write_read_datetime_tz(tmp_path, ext, datetimes, use_arrow): assert_series_equal( result.dates, df.dates.dt.tz_convert("UTC"), check_index=False ) - else: + elif datetimes == "DATETIME": assert_series_equal(result.dates, df.dates, check_index=False) + elif datetimes == "STRING": + dates_str = pd.Series(dates_raw, name="dates") + assert_series_equal(result.dates, dates_str, check_index=False) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) -@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME"]) +@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME", "STRING"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @@ -445,10 +458,13 @@ def test_write_read_datetime_tz_localized_mixed_offset( dates_exp = dates_local.dt.tz_convert("UTC") if PANDAS_GE_20: dates_exp = dates_exp.dt.as_unit("ms") - else: + elif datetimes == "DATETIME": dates_exp = dates_local_offsets_str.apply( lambda x: pd.Timestamp(x) if pd.notna(x) else None ) + elif datetimes == "STRING": + dates_exp = dates_local_offsets_str.str.replace(" ", "T") + dates_exp = dates_exp.str.replace(".111000", ".111") df = gp.GeoDataFrame( {"dates": dates_local, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" @@ -480,7 +496,7 @@ def test_write_read_datetime_tz_localized_mixed_offset( @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) -@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME"]) 
+@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME", "STRING"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @@ -526,9 +542,15 @@ def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, datetimes, use_arro if PANDAS_GE_20: exp_dates = exp_dates.dt.as_unit("ms") assert_series_equal(result.dates, exp_dates) - else: + elif datetimes == "DATETIME": assert is_object_dtype(result.dates.dtype) assert_geodataframe_equal(result, df) + elif datetimes == "STRING": + assert is_string_dtype(result.dates.dtype) + dates_str = df.dates.map( + lambda x: x.isoformat(timespec="milliseconds") if pd.notna(x) else None + ) + assert_series_equal(result.dates, dates_str) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @@ -547,7 +569,7 @@ def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, datetimes, use_arro ), ], ) -@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME"]) +@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME", "STRING"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @@ -565,6 +587,11 @@ def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, dat {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" ) + fpath = tmp_path / f"test{ext}" + write_dataframe(df, fpath, use_arrow=use_arrow) + result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes) + + # Check result if PANDAS_GE_20: dates_exp = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: @@ -572,10 +599,6 @@ def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, dat exp_df = df.copy() exp_df.dates = pd.Series(dates_exp, name="dates") - fpath = tmp_path / f"test{ext}" - write_dataframe(df, fpath, use_arrow=use_arrow) - result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes) - # With some older versions, the offset is represented slightly differently if 
result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"): result.dates = result.dates.astype(exp_df.dates.dtype) @@ -589,17 +612,24 @@ def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, dat pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow") if datetimes == "UTC": + assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) exp_df.dates = exp_df.dates.dt.tz_convert("UTC") - assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) + elif datetimes == "DATETIME": + assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) + elif datetimes == "STRING": + assert is_string_dtype(result.dates.dtype) + exp_df.dates = df.dates.map( + lambda x: x.isoformat(timespec="milliseconds") if pd.notna(x) else None + ).str.replace(".000", "") assert_geodataframe_equal(result, exp_df) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) -@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME"]) +@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME", "STRING"]) @pytest.mark.requires_arrow_write_api def test_write_read_datetime_utc(tmp_path, ext, use_arrow, datetimes): """Test writing/reading a column with UTC datetimes.""" - dates_raw = ["2020-01-01 09:00:00.123Z", "2020-01-01 10:00:00Z", None] + dates_raw = ["2020-01-01T09:00:00.123Z", "2020-01-01T10:00:00Z", None] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: @@ -623,8 +653,13 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow, datetimes): assert_series_equal(result.dates, df.dates.dt.tz_localize(None)) pytest.xfail("UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow") - assert result.dates.dtype.name in ("datetime64[ms, UTC]", "datetime64[ns, UTC]") - assert_geodataframe_equal(result, df) + if datetimes in ("UTC", "DATETIME"): + assert result.dates.dtype.name in ("datetime64[ms, UTC]", "datetime64[ns, UTC]") + assert_geodataframe_equal(result, df) + elif datetimes == "STRING": + assert 
is_string_dtype(result.dates.dtype) + dates_str = pd.Series(dates_raw, name="dates") + assert_series_equal(result.dates, dates_str, check_dtype=False) def test_read_null_values(tmp_path, use_arrow): diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index e9a6176a..646fbf2a 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -1053,9 +1053,9 @@ def test_read_datetime_as_string(datetime_tz_file): field = read(datetime_tz_file, datetime_as_string=True)[3][0] assert field.dtype == "object" - # GDAL doesn't return strings in ISO format (yet) - assert field[0] == "2020/01/01 09:00:00.123-05" - assert field[1] == "2020/01/01 10:00:00-05" + + assert field[0] == "2020-01-01T09:00:00.123-05:00" + assert field[1] == "2020-01-01T10:00:00-05:00" @pytest.mark.parametrize("ext", ["gpkg", "geojson"]) From 9398ae41d8dcff3b7dad62c590630e0e1017a68c Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 3 Aug 2025 16:08:05 +0200 Subject: [PATCH 31/59] Convert to string for older GDAL versions --- pyogrio/geopandas.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 358e0182..fc8cf591 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -41,14 +41,17 @@ def _stringify_path(path): def _try_parse_datetime(ser, datetimes="UTC"): import pandas as pd # only called when pandas is known to be installed + from pandas.api.types import is_string_dtype datetimes = datetimes.upper() datetime_kwargs = {} if datetimes == "STRING": - # do not convert to datetime, return as-is + if not is_string_dtype(ser.dtype): + res = ser.astype("string").str.replace(" ", "T") + return res if __gdal_version__ < (3, 7, 0): # GDAL < 3.7 doesn't return datetimes in ISO8601 format, so fix that - ser = ser.str.replace(" ", "T").str.replace("/", "-") + return ser.str.replace(" ", "T").str.replace("/", "-") return ser elif datetimes == "UTC": datetime_kwargs["utc"] = True From 
3d3e3dac4fba418b723a2de202c67752bbd19f0f Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 3 Aug 2025 16:22:28 +0200 Subject: [PATCH 32/59] Fixes to tests for old gdal versions --- pyogrio/tests/test_geopandas_io.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 6cb41c7d..b0323da0 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -378,6 +378,7 @@ def test_write_read_datetime_no_tz(tmp_path, ext, datetimes, use_arrow): assert_geodataframe_equal(result, df) elif datetimes == "STRING": assert is_string_dtype(result.dates.dtype) + result.dates = result.dates.str.replace(".000", "") dates_str = pd.Series(dates_raw, name="dates") assert_series_equal(result.dates, dates_str, check_dtype=False) @@ -435,6 +436,7 @@ def test_write_read_datetime_tz(tmp_path, ext, datetimes, use_arrow): elif datetimes == "DATETIME": assert_series_equal(result.dates, df.dates, check_index=False) elif datetimes == "STRING": + result.dates = result.dates.str.replace(".000", "") dates_str = pd.Series(dates_raw, name="dates") assert_series_equal(result.dates, dates_str, check_index=False) @@ -618,6 +620,7 @@ def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, dat assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) elif datetimes == "STRING": assert is_string_dtype(result.dates.dtype) + result.dates = result.dates.str.replace(".000", "") exp_df.dates = df.dates.map( lambda x: x.isoformat(timespec="milliseconds") if pd.notna(x) else None ).str.replace(".000", "") @@ -658,6 +661,7 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow, datetimes): assert_geodataframe_equal(result, df) elif datetimes == "STRING": assert is_string_dtype(result.dates.dtype) + result.dates = result.dates.str.replace(".000", "") dates_str = pd.Series(dates_raw, name="dates") assert_series_equal(result.dates, dates_str, check_dtype=False) From 
097d85a82ef15ac2eaf70129d26018d1e788938a Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 3 Aug 2025 19:13:53 +0200 Subject: [PATCH 33/59] Fix tests for older GDAL versions --- pyogrio/tests/test_geopandas_io.py | 42 ++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index b0323da0..811f7947 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -362,12 +362,21 @@ def test_write_read_datetime_no_tz(tmp_path, ext, datetimes, use_arrow): write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes) - if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0): + if ( + datetimes is not "UTC" + and use_arrow + and ext == ".gpkg" + and __gdal_version__ < (3, 11, 0) + ): # With GDAL < 3.11 with arrow, columns with naive datetimes are written # correctly, but when read they are wrongly interpreted as being in UTC. # The reason is complicated, but more info can be found e.g. 
here: # https://github.com/geopandas/pyogrio/issues/487#issuecomment-2423762807 - assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) + if datetimes == "DATETIME": + assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) + elif datetimes == "STRING": + exp_dates = df.dates.dt.tz_localize("UTC").astype("string") + assert_series_equal(result.dates, exp_dates) pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow") if datetimes == "UTC": @@ -381,6 +390,8 @@ def test_write_read_datetime_no_tz(tmp_path, ext, datetimes, use_arrow): result.dates = result.dates.str.replace(".000", "") dates_str = pd.Series(dates_raw, name="dates") assert_series_equal(result.dates, dates_str, check_dtype=False) + else: + raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @@ -436,9 +447,14 @@ def test_write_read_datetime_tz(tmp_path, ext, datetimes, use_arrow): elif datetimes == "DATETIME": assert_series_equal(result.dates, df.dates, check_index=False) elif datetimes == "STRING": + assert is_string_dtype(result.dates.dtype) result.dates = result.dates.str.replace(".000", "") dates_str = pd.Series(dates_raw, name="dates") - assert_series_equal(result.dates, dates_str, check_index=False) + assert_series_equal( + result.dates, dates_str, check_index=False, check_dtype=False + ) + else: + raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @@ -467,6 +483,8 @@ def test_write_read_datetime_tz_localized_mixed_offset( elif datetimes == "STRING": dates_exp = dates_local_offsets_str.str.replace(" ", "T") dates_exp = dates_exp.str.replace(".111000", ".111") + else: + raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") df = gp.GeoDataFrame( {"dates": dates_local, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" @@ -492,8 +510,11 @@ def 
test_write_read_datetime_tz_localized_mixed_offset( # GDAL tz only encodes offsets, not timezones if datetimes == "UTC": assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) - else: + elif datetimes in ("DATETIME", "STRING"): assert is_object_dtype(result.dates.dtype) + else: + raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") + assert_series_equal(result.dates, dates_exp) @@ -553,6 +574,8 @@ def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, datetimes, use_arro lambda x: x.isoformat(timespec="milliseconds") if pd.notna(x) else None ) assert_series_equal(result.dates, dates_str) + else: + raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @@ -624,6 +647,8 @@ def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, dat exp_df.dates = df.dates.map( lambda x: x.isoformat(timespec="milliseconds") if pd.notna(x) else None ).str.replace(".000", "") + else: + raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") assert_geodataframe_equal(result, exp_df) @@ -653,7 +678,12 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow, datetimes): and __gdal_version__ < (3, 11, 0) ): # With GDAL < 3.11 with arrow, timezone information is dropped when reading .fgb - assert_series_equal(result.dates, df.dates.dt.tz_localize(None)) + if datetimes == "DATETIME": + assert_series_equal(result.dates, df.dates.dt.tz_localize(None)) + elif datetimes == "STRING": + assert is_string_dtype(result.dates.dtype) + exp_dates = df.dates.dt.tz_localize(None).astype("string") + assert_series_equal(result.dates, exp_dates, check_dtype=False) pytest.xfail("UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow") if datetimes in ("UTC", "DATETIME"): @@ -664,6 +694,8 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow, datetimes): result.dates = result.dates.str.replace(".000", "") dates_str = pd.Series(dates_raw, name="dates") 
assert_series_equal(result.dates, dates_str, check_dtype=False) + else: + raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") def test_read_null_values(tmp_path, use_arrow): From 41be45cd4e6ac4a5fa730dcdbd0bbd622c25d5cf Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 3 Aug 2025 20:00:18 +0200 Subject: [PATCH 34/59] Fixes in tests for old GDAL versions --- pyogrio/tests/test_geopandas_io.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 811f7947..c7863e3e 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -387,8 +387,10 @@ def test_write_read_datetime_no_tz(tmp_path, ext, datetimes, use_arrow): assert_geodataframe_equal(result, df) elif datetimes == "STRING": assert is_string_dtype(result.dates.dtype) - result.dates = result.dates.str.replace(".000", "") - dates_str = pd.Series(dates_raw, name="dates") + if use_arrow and __gdal_version__ < (3, 11, 0): + dates_str = df.dates.astype("string").str.replace(" ", "T") + else: + dates_str = pd.Series(dates_raw, name="dates") assert_series_equal(result.dates, dates_str, check_dtype=False) else: raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") @@ -448,8 +450,10 @@ def test_write_read_datetime_tz(tmp_path, ext, datetimes, use_arrow): assert_series_equal(result.dates, df.dates, check_index=False) elif datetimes == "STRING": assert is_string_dtype(result.dates.dtype) - result.dates = result.dates.str.replace(".000", "") - dates_str = pd.Series(dates_raw, name="dates") + if use_arrow and __gdal_version__ < (3, 11, 0): + dates_str = df.dates.astype("string").str.replace(" ", "T") + else: + dates_str = pd.Series(dates_raw, name="dates") assert_series_equal( result.dates, dates_str, check_index=False, check_dtype=False ) @@ -643,13 +647,12 @@ def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, dat assert 
isinstance(result.dates.dtype, pd.DatetimeTZDtype) elif datetimes == "STRING": assert is_string_dtype(result.dates.dtype) - result.dates = result.dates.str.replace(".000", "") exp_df.dates = df.dates.map( lambda x: x.isoformat(timespec="milliseconds") if pd.notna(x) else None ).str.replace(".000", "") else: raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") - assert_geodataframe_equal(result, exp_df) + assert_geodataframe_equal(result, exp_df, check_dtype=False) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @@ -691,8 +694,10 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow, datetimes): assert_geodataframe_equal(result, df) elif datetimes == "STRING": assert is_string_dtype(result.dates.dtype) - result.dates = result.dates.str.replace(".000", "") - dates_str = pd.Series(dates_raw, name="dates") + if use_arrow and __gdal_version__ < (3, 11, 0): + dates_str = df.dates.astype("string").str.replace(" ", "T") + else: + dates_str = pd.Series(dates_raw, name="dates") assert_series_equal(result.dates, dates_str, check_dtype=False) else: raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") From 4f27049e4bd222fa45c2b794e45bbb0a3549bba2 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 3 Aug 2025 20:58:06 +0200 Subject: [PATCH 35/59] Fix tests for older gdal versions --- pyogrio/tests/test_geopandas_io.py | 31 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index c7863e3e..cdec6b73 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -372,10 +372,11 @@ def test_write_read_datetime_no_tz(tmp_path, ext, datetimes, use_arrow): # correctly, but when read they are wrongly interpreted as being in UTC. # The reason is complicated, but more info can be found e.g. 
here: # https://github.com/geopandas/pyogrio/issues/487#issuecomment-2423762807 + exp_dates = df.dates.dt.tz_localize("UTC") if datetimes == "DATETIME": - assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) + assert_series_equal(result.dates, exp_dates) elif datetimes == "STRING": - exp_dates = df.dates.dt.tz_localize("UTC").astype("string") + exp_dates = exp_dates.astype("string").str.replace(" ", "T") assert_series_equal(result.dates, exp_dates) pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow") @@ -477,16 +478,16 @@ def test_write_read_datetime_tz_localized_mixed_offset( dates_local = dates_naive.dt.tz_localize("Australia/Sydney") dates_local_offsets_str = dates_local.astype("string").astype("O") if datetimes == "UTC": - dates_exp = dates_local.dt.tz_convert("UTC") + exp_dates = dates_local.dt.tz_convert("UTC") if PANDAS_GE_20: - dates_exp = dates_exp.dt.as_unit("ms") + exp_dates = exp_dates.dt.as_unit("ms") elif datetimes == "DATETIME": - dates_exp = dates_local_offsets_str.apply( + exp_dates = dates_local_offsets_str.apply( lambda x: pd.Timestamp(x) if pd.notna(x) else None ) elif datetimes == "STRING": - dates_exp = dates_local_offsets_str.str.replace(" ", "T") - dates_exp = dates_exp.str.replace(".111000", ".111") + exp_dates = dates_local_offsets_str.str.replace(" ", "T") + exp_dates = exp_dates.str.replace(".111000", ".111") else: raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") @@ -504,6 +505,8 @@ def test_write_read_datetime_tz_localized_mixed_offset( dates_utc = dates_local.dt.tz_convert("UTC") if PANDAS_GE_20: dates_utc = dates_utc.dt.as_unit("ms") + if datetimes == "STRING": + dates_utc = dates_utc.astype("string").str.replace(" ", "T") assert_series_equal(result.dates, dates_utc) pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow") elif ext in (".gpkg", ".fgb"): @@ -519,7 +522,7 @@ def test_write_read_datetime_tz_localized_mixed_offset( else: raise ValueError(f"Invalid 
value for 'datetimes': {datetimes!r}.") - assert_series_equal(result.dates, dates_exp) + assert_series_equal(result.dates, exp_dates) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @@ -553,6 +556,8 @@ def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, datetimes, use_arro df_exp.dates = pd.to_datetime(dates, utc=True) if PANDAS_GE_20: df_exp.dates = df_exp.dates.dt.as_unit("ms") + if datetimes == "STRING": + df_exp.dates = df_exp.dates.astype("string").str.replace(" ", "T") assert_geodataframe_equal(result, df_exp) pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow") elif ext in (".gpkg", ".fgb"): @@ -622,11 +627,11 @@ def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, dat # Check result if PANDAS_GE_20: - dates_exp = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") + exp_dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: - dates_exp = pd.to_datetime(dates_raw) + exp_dates = pd.to_datetime(dates_raw) exp_df = df.copy() - exp_df.dates = pd.Series(dates_exp, name="dates") + exp_df.dates = pd.Series(exp_dates, name="dates") # With some older versions, the offset is represented slightly differently if result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"): @@ -649,7 +654,9 @@ def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, dat assert is_string_dtype(result.dates.dtype) exp_df.dates = df.dates.map( lambda x: x.isoformat(timespec="milliseconds") if pd.notna(x) else None - ).str.replace(".000", "") + ) + if __gdal_version__ < (3, 11, 0) and use_arrow: + exp_df.dates = exp_df.dates.str.replace(".000", "") else: raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") assert_geodataframe_equal(result, exp_df, check_dtype=False) From cf9eba9f032e1c5dbec3d4dd515582492b545eba Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 3 Aug 2025 22:44:40 +0200 Subject: [PATCH 36/59] Fix tests --- 
pyogrio/tests/test_geopandas_io.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index cdec6b73..ef68dc8d 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -652,11 +652,15 @@ def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, dat assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) elif datetimes == "STRING": assert is_string_dtype(result.dates.dtype) - exp_df.dates = df.dates.map( - lambda x: x.isoformat(timespec="milliseconds") if pd.notna(x) else None - ) - if __gdal_version__ < (3, 11, 0) and use_arrow: - exp_df.dates = exp_df.dates.str.replace(".000", "") + if use_arrow and __gdal_version__ < (3, 11, 0): + # With GDAL < 3.11 with arrow, datetime columns are written as string type + exp_df.dates = exp_df.dates.astype("string").str.replace(" ", "T") + else: + exp_df.dates = df.dates.map( + lambda x: x.isoformat(timespec="milliseconds").replace(".000", "") + if pd.notna(x) + else None + ) else: raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") assert_geodataframe_equal(result, exp_df, check_dtype=False) @@ -692,7 +696,9 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow, datetimes): assert_series_equal(result.dates, df.dates.dt.tz_localize(None)) elif datetimes == "STRING": assert is_string_dtype(result.dates.dtype) - exp_dates = df.dates.dt.tz_localize(None).astype("string") + exp_dates = ( + df.dates.dt.tz_localize(None).astype("string").str.replace(" ", "T") + ) assert_series_equal(result.dates, exp_dates, check_dtype=False) pytest.xfail("UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow") From 86239e2d40e0b908cd8642007b41f09a20f38621 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 3 Aug 2025 23:07:54 +0200 Subject: [PATCH 37/59] Fix tests for gdal < 3.7 --- pyogrio/tests/test_geopandas_io.py | 16 ++++++++++++++++ 
pyogrio/tests/test_raw_io.py | 9 +++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index ef68dc8d..2510775d 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -453,6 +453,10 @@ def test_write_read_datetime_tz(tmp_path, ext, datetimes, use_arrow): assert is_string_dtype(result.dates.dtype) if use_arrow and __gdal_version__ < (3, 11, 0): dates_str = df.dates.astype("string").str.replace(" ", "T") + elif __gdal_version__ < (3, 7, 0): + # With GDAL < 3.7, timezone minutes aren't included in the string + dates_str = [x[:-3] for x in dates_raw if x is not None] + [None] + dates_str = pd.Series(dates_str, name="dates") else: dates_str = pd.Series(dates_raw, name="dates") assert_series_equal( @@ -488,6 +492,9 @@ def test_write_read_datetime_tz_localized_mixed_offset( elif datetimes == "STRING": exp_dates = dates_local_offsets_str.str.replace(" ", "T") exp_dates = exp_dates.str.replace(".111000", ".111") + if __gdal_version__ < (3, 7, 0): + # With GDAL < 3.7, timezone minutes aren't included in the string + exp_dates = exp_dates.str.slice(0, -3) else: raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") @@ -582,6 +589,9 @@ def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, datetimes, use_arro dates_str = df.dates.map( lambda x: x.isoformat(timespec="milliseconds") if pd.notna(x) else None ) + if __gdal_version__ < (3, 7, 0): + # With GDAL < 3.7, timezone minutes aren't included in the string + dates_str = dates_str.str.slice(0, -3) assert_series_equal(result.dates, dates_str) else: raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") @@ -661,6 +671,9 @@ def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, dat if pd.notna(x) else None ) + if __gdal_version__ < (3, 7, 0): + # With GDAL < 3.7, timezone minutes aren't included in the string + exp_df.dates = exp_df.dates.str.slice(0, 
-3) else: raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") assert_geodataframe_equal(result, exp_df, check_dtype=False) @@ -711,6 +724,9 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow, datetimes): dates_str = df.dates.astype("string").str.replace(" ", "T") else: dates_str = pd.Series(dates_raw, name="dates") + if __gdal_version__ < (3, 7, 0): + # With GDAL < 3.7, timezone minutes aren't included in the string + dates_str = dates_str.str.slice(0, -3) assert_series_equal(result.dates, dates_str, check_dtype=False) else: raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 646fbf2a..4702702e 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -1054,8 +1054,13 @@ def test_read_datetime_as_string(datetime_tz_file): field = read(datetime_tz_file, datetime_as_string=True)[3][0] assert field.dtype == "object" - assert field[0] == "2020-01-01T09:00:00.123-05:00" - assert field[1] == "2020-01-01T10:00:00-05:00" + if __gdal_version__ < (3, 7, 0): + # With GDAL < 3.7, timezone minutes aren't included in the string + assert field[0] == "2020-01-01T09:00:00.123-05" + assert field[1] == "2020-01-01T10:00:00-05" + else: + assert field[0] == "2020-01-01T09:00:00.123-05:00" + assert field[1] == "2020-01-01T10:00:00-05:00" @pytest.mark.parametrize("ext", ["gpkg", "geojson"]) From 7d28e3d2745d9624d1ef47dedbd7d44d8521db9b Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 3 Aug 2025 23:19:47 +0200 Subject: [PATCH 38/59] Test fixes for gdal <3.7 --- pyogrio/tests/test_geopandas_io.py | 10 +++++----- pyogrio/tests/test_raw_io.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 2510775d..11036d32 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -671,9 +671,9 @@ def 
test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, dat if pd.notna(x) else None ) - if __gdal_version__ < (3, 7, 0): - # With GDAL < 3.7, timezone minutes aren't included in the string - exp_df.dates = exp_df.dates.str.slice(0, -3) + if __gdal_version__ < (3, 7, 0): + # With GDAL < 3.7, timezone minutes aren't included in the string + exp_df.dates = exp_df.dates.str.slice(0, -3) else: raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") assert_geodataframe_equal(result, exp_df, check_dtype=False) @@ -725,8 +725,8 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow, datetimes): else: dates_str = pd.Series(dates_raw, name="dates") if __gdal_version__ < (3, 7, 0): - # With GDAL < 3.7, timezone minutes aren't included in the string - dates_str = dates_str.str.slice(0, -3) + # With GDAL < 3.7, datetime ends with +00 for UTC, not Z + dates_str = dates_str.str.replace("Z", "+00") assert_series_equal(result.dates, dates_str, check_dtype=False) else: raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 4702702e..86e7d322 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -1056,8 +1056,8 @@ def test_read_datetime_as_string(datetime_tz_file): if __gdal_version__ < (3, 7, 0): # With GDAL < 3.7, timezone minutes aren't included in the string - assert field[0] == "2020-01-01T09:00:00.123-05" - assert field[1] == "2020-01-01T10:00:00-05" + assert field[0] == "2020/01/01 09:00:00.123-05" + assert field[1] == "2020/01/01 10:00:00-05" else: assert field[0] == "2020-01-01T09:00:00.123-05:00" assert field[1] == "2020-01-01T10:00:00-05:00" From 201c01e4e44b72d51f8945d2e015d0cd736467de Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 3 Aug 2025 23:46:51 +0200 Subject: [PATCH 39/59] Fix linter error --- pyogrio/tests/test_geopandas_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 11036d32..682bc395 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -363,7 +363,7 @@ def test_write_read_datetime_no_tz(tmp_path, ext, datetimes, use_arrow): result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes) if ( - datetimes is not "UTC" + datetimes != "UTC" and use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0) From 4923178eeabaf08f833a205366b75230122076d3 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 4 Aug 2025 00:08:14 +0200 Subject: [PATCH 40/59] Several textual improvements --- CHANGES.md | 8 +++++++- pyogrio/geopandas.py | 24 ++++++++++++------------ pyogrio/tests/test_raw_io.py | 2 +- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index d15fcb4a..55f7a28f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,12 @@ # CHANGELOG +## 0.12.0 (????-??-??) + +### Improvements + +- Add `datetimes` parameter to `read_dataframe` to choose the way datetime columns are + returned + several fixes when reading and writing datetimes (#486). + ## 0.11.1 (2025-08-XX) ### Bug fixes @@ -15,7 +22,6 @@ ### Improvements - Capture all errors logged by gdal when opening a file fails (#495). -- Improve support for datetime columns (#486). - Add support to read and write ".gpkg.zip" (GDAL >= 3.7), ".shp.zip", and ".shz" files (#527). - Compatibility with the string dtype in the upcoming pandas 3.0 release (#493). diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index fc8cf591..64b91662 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -245,20 +245,20 @@ def read_dataframe( When `use_arrow` is True, these kwargs will be passed to the `to_pandas`_ call for the arrow to pandas conversion. datetimes : str, optional (default: "UTC") - The way datetime columns are returned.
Possible values: - - - **"UTC"**: all datetime columns will be returned as pandas datetime64 columns - converted to UTC. Naive datetimes (without timezone information) will be - assumed to be in UTC timezone. - - **"DATETIME"**: datetimes will be returned in the timezone as they were read - from the data source. Columns with values in a single timezone or without - timezone information will be returned as pandas datetime64 columns. - Columns with mixed timezone data are returned as object columns with - pandas.Timestamp values. - - **"STRING"**: all datetime columns will be returned as strings. + The way datetime columns should be returned. Possible values: + + - **"UTC"**: return all datetime columns as pandas datetime64 columns + converted to UTC. Naive datetimes (without timezone information) will + be assumed to be in UTC timezone. + - **"DATETIME"**: return datetimes in the timezone as they were read + from the data source. Columns with values in a single timezone or + without timezone information will be returned as pandas datetime64 + columns. Columns with mixed timezone data are returned as object + columns with pandas.Timestamp values. + - **"STRING"**: return all datetimes as ISO8601 strings. **kwargs - Additional driver-specific dataset open options passed to OGR. Invalid + Additional driver-specific dataset open options passed to OGR. Invalid options will trigger a warning. 
Returns diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 86e7d322..dbb49170 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -1055,7 +1055,7 @@ def test_read_datetime_as_string(datetime_tz_file): assert field.dtype == "object" if __gdal_version__ < (3, 7, 0): - # With GDAL < 3.7, timezone minutes aren't included in the string + # With GDAL < 3.7, datetimes are not returned as ISO8601 strings assert field[0] == "2020/01/01 09:00:00.123-05" assert field[1] == "2020/01/01 10:00:00-05" else: From f0e740948352ab9df80731969ceb0cf18ca1011d Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 4 Aug 2025 00:42:55 +0200 Subject: [PATCH 41/59] Add test for dates long ago --- pyogrio/tests/conftest.py | 21 +++++++++++++++++++++ pyogrio/tests/test_geopandas_io.py | 26 ++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/pyogrio/tests/conftest.py b/pyogrio/tests/conftest.py index 63df12c3..95b08923 100644 --- a/pyogrio/tests/conftest.py +++ b/pyogrio/tests/conftest.py @@ -340,6 +340,27 @@ def geojson_bytes(tmp_path): return bytes_buffer +@pytest.fixture(scope="function") +def geojson_datetime_long_ago(tmp_path): + # create GeoJSON file with datetimes from long ago + datetime_tz_geojson = """{ + "type": "FeatureCollection", + "features": [ + { + "type": "Feature", + "properties": { "datetime_col": "1670-01-01T09:00:00" }, + "geometry": { "type": "Point", "coordinates": [1, 1] } + } + ] + }""" + + filename = tmp_path / "test_datetime_long_ago.geojson" + with open(filename, "w") as f: + f.write(datetime_tz_geojson) + + return filename + + @pytest.fixture(scope="function") def geojson_filelike(tmp_path): """Extracts first 3 records from naturalearth_lowres and writes to GeoJSON, diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 682bc395..6957c0f3 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -344,6 +344,32 @@ 
def test_read_datetimes_invalid_param(datetime_file, use_arrow): read_dataframe(datetime_file, use_arrow=use_arrow, datetimes="INVALID") +@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME", "STRING"]) +def test_read_datetime_long_ago(geojson_datetime_long_ago, use_arrow, datetimes): + """Test writing/reading a column with a datetime far in the past.""" + df = read_dataframe( + geojson_datetime_long_ago, use_arrow=use_arrow, datetimes=datetimes + ) + + exp_dates = pd.Series(["1670-01-01T09:00:00"], name="datetime_col") + + if datetimes == "UTC": + pytest.xfail("datetimes of long ago cannot be parsed as UTC") + assert is_datetime64_any_dtype(df.datetime_col.dtype) + assert_series_equal(df.datetime_col, exp_dates) + elif datetimes == "DATETIME": + pytest.xfail("datetimes of long ago cannot be parsed as datetime") + assert is_datetime64_dtype(df.datetime_col.dtype) + if PANDAS_GE_20: + exp_dates = pd.to_datetime(exp_dates, format="ISO8601").as_unit("ms") + else: + exp_dates = pd.to_datetime(exp_dates) + assert_series_equal(df.datetime_col, exp_dates) + elif datetimes == "STRING": + assert is_string_dtype(df.datetime_col.dtype) + assert_series_equal(df.datetime_col, exp_dates) + + @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.parametrize("datetimes", ["UTC", "DATETIME", "STRING"]) @pytest.mark.requires_arrow_write_api From 538b95f22428711e2940f962f44f17037be99adf Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 4 Aug 2025 00:48:51 +0200 Subject: [PATCH 42/59] Improve docstring of test with dates from long ago --- pyogrio/tests/test_geopandas_io.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 6957c0f3..f3c20858 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -346,7 +346,12 @@ def test_read_datetimes_invalid_param(datetime_file, use_arrow): 
@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME", "STRING"]) def test_read_datetime_long_ago(geojson_datetime_long_ago, use_arrow, datetimes): - """Test writing/reading a column with a datetime far in the past.""" + """Test writing/reading a column with a datetime far in the past. + + Dates from before 1678-1-1 aren't parsed correctly by pandas < 3.0, so they + stay strings. + Reported in https://github.com/geopandas/pyogrio/issues/553. + """ df = read_dataframe( geojson_datetime_long_ago, use_arrow=use_arrow, datetimes=datetimes ) From f23bd3820deccc61a08f0452385c9bb0ea868fba Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 4 Aug 2025 00:58:58 +0200 Subject: [PATCH 43/59] Fix long ago test for old versions of gdal --- pyogrio/tests/test_geopandas_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index f3c20858..f708ab46 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -372,7 +372,7 @@ def test_read_datetime_long_ago(geojson_datetime_long_ago, use_arrow, datetimes) assert_series_equal(df.datetime_col, exp_dates) elif datetimes == "STRING": assert is_string_dtype(df.datetime_col.dtype) - assert_series_equal(df.datetime_col, exp_dates) + assert_series_equal(df.datetime_col, exp_dates, check_dtype=False) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) From b55cc2f60846cfdadc74e02180871abdd50d6095 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 4 Aug 2025 01:19:29 +0200 Subject: [PATCH 44/59] xfail dates of long ago with arrow and gdal <3.11 --- pyogrio/tests/test_geopandas_io.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index f708ab46..af073fc4 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -352,12 +352,19 @@ def 
test_read_datetime_long_ago(geojson_datetime_long_ago, use_arrow, datetimes) stay strings. Reported in https://github.com/geopandas/pyogrio/issues/553. """ + if use_arrow and __gdal_version__ < (3, 11, 0): + # With use_arrow and GDAL < 3.11, datetimes are converted to python + # objects in to_pandas. For a datetime far in the past this gives an + # overflow though. + pytest.xfail( + "datetimes far in the past overflow is use_arrow is used with GDAL < 3.11" + ) + df = read_dataframe( geojson_datetime_long_ago, use_arrow=use_arrow, datetimes=datetimes ) exp_dates = pd.Series(["1670-01-01T09:00:00"], name="datetime_col") - if datetimes == "UTC": pytest.xfail("datetimes of long ago cannot be parsed as UTC") assert is_datetime64_any_dtype(df.datetime_col.dtype) From 616a1447f7ea7bdb74bb007a89218eaa428de1d1 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 4 Aug 2025 01:34:33 +0200 Subject: [PATCH 45/59] Fix some errors with pandas 3 --- pyogrio/tests/test_geopandas_io.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index af073fc4..de184f8d 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -562,8 +562,10 @@ def test_write_read_datetime_tz_localized_mixed_offset( # GDAL tz only encodes offsets, not timezones if datetimes == "UTC": assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) - elif datetimes in ("DATETIME", "STRING"): + elif datetimes == "DATETIME": assert is_object_dtype(result.dates.dtype) + elif datetimes == "STRING": + assert is_string_dtype(result.dates.dtype) else: raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") From 93dbc6e52d93d2f6e5e8da4b5117c051c540c5fe Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 4 Aug 2025 11:39:04 +0200 Subject: [PATCH 46/59] Improve error --- pyogrio/tests/test_geopandas_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index de184f8d..2d639a5c 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -357,7 +357,7 @@ def test_read_datetime_long_ago(geojson_datetime_long_ago, use_arrow, datetimes) # objects in to_pandas. For a datetime far in the past this gives an # overflow though. pytest.xfail( - "datetimes far in the past overflow is use_arrow is used with GDAL < 3.11" + "datetimes before 1678-1-1 give overflow if arrow is used with GDAL<3.11" ) df = read_dataframe( From 6af6d63951a5fc2681f381cefaf4da01ed93d040 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 4 Aug 2025 22:55:50 +0200 Subject: [PATCH 47/59] Support for pandas 3.0 --- pyogrio/geopandas.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 64b91662..deafd00c 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -81,8 +81,14 @@ def _try_parse_datetime(ser, datetimes="UTC"): # (can tighten the exception type in future when it does) try: res = pd.to_datetime(ser, **datetime_kwargs) - except Exception: - res = ser + except ValueError as ex: + if "Mixed timezones detected" in str(ex): + # Parsing mixed timezones with to_datetime is not supported anymore in + # pandas>=3.0, so convert to pd.Timestamp objects manually. + # Using 2 times map seems to be the fastest way to do this. + res = res.map(datetime.fromisoformat, na_action="ignore").map( + pd.Timestamp, na_action="ignore" + ) if res.dtype.kind == "M": # any datetime64 # GDAL only supports ms precision, convert outputs to match. 
From 066ec42758b0d6aa4e9a1cfa0a94e26ffc00bea6 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Tue, 5 Aug 2025 00:47:36 +0200 Subject: [PATCH 48/59] Update geopandas.py --- pyogrio/geopandas.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index deafd00c..acc21651 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -77,8 +77,8 @@ def _try_parse_datetime(ser, datetimes="UTC"): ".*parsing datetimes with mixed time zones will raise.*", FutureWarning, ) - # pre-emptive try catch for when pandas will raise - # (can tighten the exception type in future when it does) + + res = ser try: res = pd.to_datetime(ser, **datetime_kwargs) except ValueError as ex: @@ -86,7 +86,7 @@ def _try_parse_datetime(ser, datetimes="UTC"): # Parsing mixed timezones with to_datetime is not supported anymore in # pandas>=3.0, so convert to pd.Timestamp objects manually. # Using 2 times map seems to be the fastest way to do this. - res = res.map(datetime.fromisoformat, na_action="ignore").map( + res = ser.map(datetime.fromisoformat, na_action="ignore").map( pd.Timestamp, na_action="ignore" ) From 5c9efa1f2d069fef3fd12ce89c99090af16529e6 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Tue, 5 Aug 2025 01:27:37 +0200 Subject: [PATCH 49/59] Support pandas 3.0 --- pyogrio/tests/test_geopandas_io.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index de184f8d..c8b84d36 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -387,7 +387,7 @@ def test_read_datetime_long_ago(geojson_datetime_long_ago, use_arrow, datetimes) @pytest.mark.requires_arrow_write_api def test_write_read_datetime_no_tz(tmp_path, ext, datetimes, use_arrow): """Test writing/reading a column with naive datetimes (no timezone information).""" - dates_raw = ["2020-01-01T09:00:00.123", 
"2020-01-01T10:00:00", None] + dates_raw = ["2020-01-01T09:00:00.123", "2020-01-01T10:00:00", np.nan] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: @@ -450,7 +450,7 @@ def test_write_read_datetime_tz(tmp_path, ext, datetimes, use_arrow): # This was fixed in https://github.com/OSGeo/gdal/pull/11049 pytest.xfail("Wrong datetimes read in GeoJSON with GDAL < 3.10 via arrow") - dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", None] + dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", np.nan] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: @@ -493,7 +493,7 @@ def test_write_read_datetime_tz(tmp_path, ext, datetimes, use_arrow): dates_str = df.dates.astype("string").str.replace(" ", "T") elif __gdal_version__ < (3, 7, 0): # With GDAL < 3.7, timezone minutes aren't included in the string - dates_str = [x[:-3] for x in dates_raw if x is not None] + [None] + dates_str = [x[:-3] for x in dates_raw if pd.notna(x)] + [np.nan] dates_str = pd.Series(dates_str, name="dates") else: dates_str = pd.Series(dates_raw, name="dates") @@ -515,7 +515,7 @@ def test_write_read_datetime_tz_localized_mixed_offset( ): """Test with localized dates across a different summer/winter timezone offset.""" # Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10) - dates_raw = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111", None] + dates_raw = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111", np.nan] dates_naive = pd.Series(pd.to_datetime(dates_raw), name="dates") dates_local = dates_naive.dt.tz_localize("Australia/Sydney") dates_local_offsets_str = dates_local.astype("string").astype("O") @@ -525,7 +525,7 @@ def test_write_read_datetime_tz_localized_mixed_offset( exp_dates = exp_dates.dt.as_unit("ms") elif datetimes == "DATETIME": exp_dates = dates_local_offsets_str.apply( - lambda x: pd.Timestamp(x) if pd.notna(x) else None + lambda x: 
pd.Timestamp(x) if pd.notna(x) else np.nan ) elif datetimes == "STRING": exp_dates = dates_local_offsets_str.str.replace(" ", "T") @@ -585,7 +585,7 @@ def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, datetimes, use_arro dates = [ pd.Timestamp("2023-01-01 11:00:01.111+01:00"), pd.Timestamp("2023-06-01 10:00:01.111+05:00"), - None, + np.nan, ] df = gp.GeoDataFrame( @@ -627,7 +627,7 @@ def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, datetimes, use_arro elif datetimes == "STRING": assert is_string_dtype(result.dates.dtype) dates_str = df.dates.map( - lambda x: x.isoformat(timespec="milliseconds") if pd.notna(x) else None + lambda x: x.isoformat(timespec="milliseconds") if pd.notna(x) else np.nan ) if __gdal_version__ < (3, 7, 0): # With GDAL < 3.7, timezone minutes aren't included in the string @@ -644,12 +644,12 @@ def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, datetimes, use_arro ( pd.Timestamp("2020-01-01T09:00:00.123-05:00"), pd.Timestamp("2020-01-01T10:00:00-05:00"), - None, + np.nan, ), ( datetime.fromisoformat("2020-01-01T09:00:00.123-05:00"), datetime.fromisoformat("2020-01-01T10:00:00-05:00"), - None, + np.nan, ), ], ) @@ -709,7 +709,7 @@ def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, dat exp_df.dates = df.dates.map( lambda x: x.isoformat(timespec="milliseconds").replace(".000", "") if pd.notna(x) - else None + else np.nan ) if __gdal_version__ < (3, 7, 0): # With GDAL < 3.7, timezone minutes aren't included in the string @@ -724,7 +724,7 @@ def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, dat @pytest.mark.requires_arrow_write_api def test_write_read_datetime_utc(tmp_path, ext, use_arrow, datetimes): """Test writing/reading a column with UTC datetimes.""" - dates_raw = ["2020-01-01T09:00:00.123Z", "2020-01-01T10:00:00Z", None] + dates_raw = ["2020-01-01T09:00:00.123Z", "2020-01-01T10:00:00Z", np.nan] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, 
format="ISO8601").as_unit("ms") else: From b421a06b53f84882b14bcf6592d8ee5fd201e7c1 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Tue, 5 Aug 2025 01:32:51 +0200 Subject: [PATCH 50/59] Support pandas 3.0 --- pyogrio/tests/test_geopandas_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index c8b84d36..1e6ea688 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -569,7 +569,7 @@ def test_write_read_datetime_tz_localized_mixed_offset( else: raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") - assert_series_equal(result.dates, exp_dates) + assert_series_equal(result.dates, exp_dates, check_dtype=False) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) From 4486c9e35970796779e2a2e6094fdfcb1f39673e Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Tue, 5 Aug 2025 02:18:40 +0200 Subject: [PATCH 51/59] Update test_geopandas_io.py --- pyogrio/tests/test_geopandas_io.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 1e6ea688..ebc1489a 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -518,14 +518,14 @@ def test_write_read_datetime_tz_localized_mixed_offset( dates_raw = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111", np.nan] dates_naive = pd.Series(pd.to_datetime(dates_raw), name="dates") dates_local = dates_naive.dt.tz_localize("Australia/Sydney") - dates_local_offsets_str = dates_local.astype("string").astype("O") + dates_local_offsets_str = dates_local.astype(str) if datetimes == "UTC": exp_dates = dates_local.dt.tz_convert("UTC") if PANDAS_GE_20: exp_dates = exp_dates.dt.as_unit("ms") elif datetimes == "DATETIME": exp_dates = dates_local_offsets_str.apply( - lambda x: pd.Timestamp(x) if pd.notna(x) else np.nan + lambda x: 
pd.Timestamp(x) if pd.notna(x) else None ) elif datetimes == "STRING": exp_dates = dates_local_offsets_str.str.replace(" ", "T") @@ -533,6 +533,7 @@ def test_write_read_datetime_tz_localized_mixed_offset( if __gdal_version__ < (3, 7, 0): # With GDAL < 3.7, timezone minutes aren't included in the string exp_dates = exp_dates.str.slice(0, -3) + exp_dates[2] = None else: raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") @@ -551,11 +552,14 @@ def test_write_read_datetime_tz_localized_mixed_offset( if PANDAS_GE_20: dates_utc = dates_utc.dt.as_unit("ms") if datetimes == "STRING": - dates_utc = dates_utc.astype("string").str.replace(" ", "T") - assert_series_equal(result.dates, dates_utc) + assert is_string_dtype(result.dates.dtype) + dates_utc = dates_utc.astype(str).str.replace(" ", "T") + dates_utc[2] = pd.NA + assert_series_equal(result.dates, dates_utc, check_dtype=False) pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow") elif ext in (".gpkg", ".fgb"): # With GDAL < 3.11 with arrow, datetime columns written as string type + dates_local_offsets_str[2] = None assert_series_equal(result.dates, dates_local_offsets_str) pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow") @@ -605,6 +609,7 @@ def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, datetimes, use_arro df_exp.dates = df_exp.dates.dt.as_unit("ms") if datetimes == "STRING": df_exp.dates = df_exp.dates.astype("string").str.replace(" ", "T") + df_exp.loc[2, "dates"] = pd.NA assert_geodataframe_equal(result, df_exp) pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow") elif ext in (".gpkg", ".fgb"): From a3a039383ff69b99bdfa7da600eba957b13b615d Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Tue, 5 Aug 2025 02:51:51 +0200 Subject: [PATCH 52/59] Update test_geopandas_io.py --- pyogrio/tests/test_geopandas_io.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git 
a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index ebc1489a..058c501c 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -533,7 +533,6 @@ def test_write_read_datetime_tz_localized_mixed_offset( if __gdal_version__ < (3, 7, 0): # With GDAL < 3.7, timezone minutes aren't included in the string exp_dates = exp_dates.str.slice(0, -3) - exp_dates[2] = None else: raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") @@ -554,13 +553,15 @@ def test_write_read_datetime_tz_localized_mixed_offset( if datetimes == "STRING": assert is_string_dtype(result.dates.dtype) dates_utc = dates_utc.astype(str).str.replace(" ", "T") - dates_utc[2] = pd.NA - assert_series_equal(result.dates, dates_utc, check_dtype=False) + assert pd.isna(result.dates[2]) + assert_series_equal( + result.dates.head(2), dates_utc.head(2), check_dtype=False + ) pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow") elif ext in (".gpkg", ".fgb"): # With GDAL < 3.11 with arrow, datetime columns written as string type - dates_local_offsets_str[2] = None - assert_series_equal(result.dates, dates_local_offsets_str) + assert pd.isna(result.dates[2]) + assert_series_equal(result.dates.head(2), dates_local_offsets_str.head(2)) pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow") # GDAL tz only encodes offsets, not timezones @@ -573,7 +574,10 @@ def test_write_read_datetime_tz_localized_mixed_offset( else: raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.") - assert_series_equal(result.dates, exp_dates, check_dtype=False) + # Check isna for the third value seperately as depending on versions this is + # different + pandas 3.0 assert_series_equal becomes strict about this. 
+ assert pd.isna(result.dates[2]) + assert_series_equal(result.dates.head(2), exp_dates.head(2), check_dtype=False) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) From 7248d1be258c1ccb408c4a1b2733c12abee97b5c Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Tue, 5 Aug 2025 09:33:39 +0200 Subject: [PATCH 53/59] Small textual improvements --- CHANGES.md | 2 +- pyogrio/geopandas.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index f2666001..13be2b42 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,7 +4,7 @@ ### Improvements -- Add `datetime` parameter to `read_dateframe` to choose the way datetimecolumns are +- Add `datetime` parameter to `read_dateframe` to choose the way datetime columns are returned + several fixes when reading and writing datetimes (#486). ## 0.11.1 (2025-08-02) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index acc21651..b574f9dd 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -260,7 +260,8 @@ def read_dataframe( from the data source. Columns with values in a single timezone or without timezone information will be returned as pandas datetime64 columns. Columns with mixed timezone data are returned as object - columns with pandas.Timestamp values. + columns with pandas.Timestamp values. If you want to roundtrip + datetimes as good as possible, use this option. - **"STRING"**: return all datetimes as ISO8601 strings. 
**kwargs From 86529a2a92b776385a30508339fba44b0927bf5d Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Tue, 5 Aug 2025 12:15:11 +0200 Subject: [PATCH 54/59] Typo in changelog --- CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 13be2b42..cec437ee 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,7 +4,7 @@ ### Improvements -- Add `datetime` parameter to `read_dateframe` to choose the way datetime columns are +- Add `datetimes` parameter to `read_dateframe` to choose the way datetime columns are returned + several fixes when reading and writing datetimes (#486). ## 0.11.1 (2025-08-02) From 8303c05b27f9e90f7d59f5a3fc7c541f93a8472c Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 7 Aug 2025 03:54:39 +0200 Subject: [PATCH 55/59] Fix UTC mode so it is backwards compatible --- pyogrio/geopandas.py | 105 +++++++++++++++++++---------- pyogrio/tests/test_geopandas_io.py | 31 ++++----- 2 files changed, 82 insertions(+), 54 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index b574f9dd..11490138 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -47,20 +47,20 @@ def _try_parse_datetime(ser, datetimes="UTC"): datetime_kwargs = {} if datetimes == "STRING": if not is_string_dtype(ser.dtype): + # Support to return datetimes as strings using arrow only available for + # GDAL >= 3.11, so convert to string here if needed. res = ser.astype("string").str.replace(" ", "T") return res if __gdal_version__ < (3, 7, 0): # GDAL < 3.7 doesn't return datetimes in ISO8601 format, so fix that return ser.str.replace(" ", "T").str.replace("/", "-") return ser - elif datetimes == "UTC": - datetime_kwargs["utc"] = True - elif datetimes == "DATETIME": - datetime_kwargs["utc"] = False + elif datetimes in ("UTC", "DATETIME"): + pass else: raise ValueError( f"Invalid value for 'datetimes': {datetimes!r}. " - "Must be 'UTC' or 'DATETIME'." + "Must be 'UTC', 'DATETIME' or 'STRING'." 
) if PANDAS_GE_22: @@ -78,17 +78,43 @@ def _try_parse_datetime(ser, datetimes="UTC"): FutureWarning, ) - res = ser + warning = "Error parsing datetimes, original strings are returned: {message}" try: res = pd.to_datetime(ser, **datetime_kwargs) - except ValueError as ex: - if "Mixed timezones detected" in str(ex): - # Parsing mixed timezones with to_datetime is not supported anymore in - # pandas>=3.0, so convert to pd.Timestamp objects manually. - # Using 2 times map seems to be the fastest way to do this. - res = ser.map(datetime.fromisoformat, na_action="ignore").map( - pd.Timestamp, na_action="ignore" - ) + except Exception as ex: + if isinstance(ex, ValueError) and "Mixed timezones detected" in str(ex): + # Parsing mixed timezones with to_datetime is not supported + # anymore in pandas >= 3.0, leading to a ValueError. + if datetimes == "DATETIME": + # Using 2 times map seems to be the fastest way to convert the + # strings to Timestamps. + try: + res = ser.map(datetime.fromisoformat, na_action="ignore").map( + pd.Timestamp, na_action="ignore" + ) + except Exception as ex: + warnings.warn(warning.format(message=str(ex)), stacklevel=1) + return ser + elif datetimes == "UTC": + # Convert mixed timezone datetimes to UTC. + # This happens right after this exception handler. + pass + else: + warnings.warn(warning.format(message=str(ex)), stacklevel=1) + return ser + else: + # If the error is not related to mixed timezones, log it and return + # the original series. + warnings.warn(warning.format(message=str(ex)), stacklevel=1) + return ser + + # For pandas < 3.0, to_datetime converted mixed timezone data to datetime objects. + # For this datetimes option they should be converted to UTC though... 
+ if datetimes == "UTC" and res.dtype in ("object", "string"): + try: + res = pd.to_datetime(ser, utc=True, **datetime_kwargs) + except Exception as ex: + warnings.warn(warning.format(message=str(ex)), stacklevel=1) if res.dtype.kind == "M": # any datetime64 # GDAL only supports ms precision, convert outputs to match. @@ -253,15 +279,24 @@ def read_dataframe( datetimes : str, optional (default: "UTC") The way datetime columns should be returned. Possible values: - - **"UTC"**: return all datetime columns as pandas datetime64 columns - converted to UTC. Naive datetimes (without timezone information) will - be assumed to be in UTC timezone. + - **"UTC"**: return all datetime columns as pandas datetime64 columns. + The data is returned as-is if a column contains only naive datetimes + (without timezone information), only UTC datetimes, or if all datetimes + in the column have the same timezone offset. + Note that in timezones with daylight saving time datetimes will have + different offsets throughout the year! + For columns that don't comply to the above, all datetimes are converted + to UTC. In that case naive datetimes are assumed to be in UTC already. - **"DATETIME"**: return datetimes in the timezone as they were read - from the data source. Columns with values in a single timezone or - without timezone information will be returned as pandas datetime64 - columns. Columns with mixed timezone data are returned as object - columns with pandas.Timestamp values. If you want to roundtrip - datetimes as good as possible, use this option. + from the data source, even if a column contains mixed timezone offsets. + Columns will be returned as pandas datetime64 column if a column contains + only naive datetimes (without timezone information), only UTC datetimes, + or if all datetimes in the column have the same timezone offset. + Note that in timezones with daylight saving time datetimes will have + different offsets throughout the year! 
+ Columns that don't comply to the above are returned as object columns with + pandas.Timestamp values. If you want to roundtrip datetimes without data + loss, this is the recommended option. - **"STRING"**: return all datetimes as ISO8601 strings. **kwargs @@ -666,23 +701,23 @@ def write_dataframe( df = pd.DataFrame(df, copy=False) df[geometry_column] = geometry - # Convert all datetime columns to isoformat strings, to avoid mixed timezone - # information getting lost. + # Arrow doesn't support datetime columns with mixed timezones, and GDAL only + # supports timezone offsets. Hence, to avoid data loss, convert columns that can + # contain datetime values with different offsets to strings. + # Also pass a list of these columns on so GDAL so it can still treat them as + # datetime columns when writing the dataset. datetime_cols = [] for name, dtype in df.dtypes.items(): - col = df[name] if dtype == "object": - # If first non-NA value is a datetime-like object, treat as datetime - # column. - first_non_na_index = col.first_valid_index() - if first_non_na_index is not None: - if isinstance(col[first_non_na_index], (pd.Timestamp, datetime)): - df[name] = col.astype("string") - datetime_cols.append(name) + # An object column with datetimes can contain multiple offsets. + if pd.api.types.infer_dtype(df[name]) == "datetime": + df[name] = df[name].astype("string") + datetime_cols.append(name) + elif isinstance(dtype, pd.DatetimeTZDtype) and str(dtype.tz) != "UTC": - # When it is a datetime column with a timezone different than UTC, it - # needs to be converted to string, otherwise the timezone info is lost. - df[name] = col.astype("string") + # A pd.datetime64 column with a timezone different than UTC can contain + # data with different offsets because of summer/winter time. 
+ df[name] = df[name].astype("string") datetime_cols.append(name) table = pa.Table.from_pandas(df, preserve_index=False) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index b3b191f7..8055de22 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -334,9 +334,9 @@ def test_read_datetime(datetime_file, use_arrow): df = read_dataframe(datetime_file, use_arrow=use_arrow) if PANDAS_GE_20: # starting with pandas 2.0, it preserves the passed datetime resolution - assert df.col.dtype.name == "datetime64[ms, UTC]" + assert df.col.dtype.name == "datetime64[ms]" else: - assert df.col.dtype.name == "datetime64[ns, UTC]" + assert df.col.dtype.name == "datetime64[ns]" def test_read_datetimes_invalid_param(datetime_file, use_arrow): @@ -359,6 +359,11 @@ def test_read_datetime_long_ago(geojson_datetime_long_ago, use_arrow, datetimes) pytest.xfail( "datetimes before 1678-1-1 give overflow if arrow is used with GDAL<3.11" ) + if False and not PANDAS_GE_30 and datetimes != "STRING": + pytest.xfail( + "datetimes before 1678-1-1 are not supported with datetimes='UTC' with " + "pandas < 3.0" + ) df = read_dataframe( geojson_datetime_long_ago, use_arrow=use_arrow, datetimes=datetimes @@ -400,12 +405,7 @@ def test_write_read_datetime_no_tz(tmp_path, ext, datetimes, use_arrow): write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes) - if ( - datetimes != "UTC" - and use_arrow - and ext == ".gpkg" - and __gdal_version__ < (3, 11, 0) - ): + if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0): # With GDAL < 3.11 with arrow, columns with naive datetimes are written # correctly, but when read they are wrongly interpreted as being in UTC. # The reason is complicated, but more info can be found e.g. 
here: @@ -420,7 +420,7 @@ def test_write_read_datetime_no_tz(tmp_path, ext, datetimes, use_arrow): if datetimes == "UTC": assert is_datetime64_any_dtype(result.dates.dtype) - assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) + assert_series_equal(result.dates, df.dates) elif datetimes == "DATETIME": assert is_datetime64_dtype(result.dates.dtype) assert_geodataframe_equal(result, df) @@ -482,9 +482,7 @@ def test_write_read_datetime_tz(tmp_path, ext, datetimes, use_arrow): assert isinstance(df.dates.dtype, pd.DatetimeTZDtype) if datetimes == "UTC": - assert_series_equal( - result.dates, df.dates.dt.tz_convert("UTC"), check_index=False - ) + assert_series_equal(result.dates, df.dates, check_index=False) elif datetimes == "DATETIME": assert_series_equal(result.dates, df.dates, check_index=False) elif datetimes == "STRING": @@ -747,14 +745,9 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow, datetimes): write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes) - if ( - use_arrow - and datetimes != "UTC" - and ext == ".fgb" - and __gdal_version__ < (3, 11, 0) - ): + if use_arrow and ext == ".fgb" and __gdal_version__ < (3, 11, 0): # With GDAL < 3.11 with arrow, timezone information is dropped when reading .fgb - if datetimes == "DATETIME": + if datetimes in ("UTC", "DATETIME"): assert_series_equal(result.dates, df.dates.dt.tz_localize(None)) elif datetimes == "STRING": assert is_string_dtype(result.dates.dtype) From ac5f20fd0f8d3616510acfdcb7f2a4727f4c873a Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 7 Aug 2025 04:06:42 +0200 Subject: [PATCH 56/59] Fix for pandas 3.0 --- CHANGES.md | 2 +- pyogrio/geopandas.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index cec437ee..f7004d08 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,7 +4,7 @@ ### Improvements -- Add `datetimes` parameter to `read_dateframe` to choose the way 
datetime columns are +- Add `datetimes` parameter to `read_dataframe` to choose the way datetime columns are returned + several fixes when reading and writing datetimes (#486). ## 0.11.1 (2025-08-02) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 11490138..2c12955e 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -97,8 +97,11 @@ def _try_parse_datetime(ser, datetimes="UTC"): return ser elif datetimes == "UTC": # Convert mixed timezone datetimes to UTC. - # This happens right after this exception handler. - pass + try: + res = pd.to_datetime(ser, utc=True, **datetime_kwargs) + except Exception as ex: + warnings.warn(warning.format(message=str(ex)), stacklevel=1) + return ser else: warnings.warn(warning.format(message=str(ex)), stacklevel=1) return ser From 23bf3484c82fc9853db9bae7519b08e53d70bc92 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 7 Aug 2025 15:29:48 +0200 Subject: [PATCH 57/59] Improve inline doc. --- pyogrio/geopandas.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 2c12955e..f4706463 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -348,8 +348,11 @@ def read_dataframe( gdal_force_2d = False if use_arrow else force_2d # Always read datetimes as string values to preserve (mixed) timezone info - # as numpy does not directly support timezones and arrow datetime columns - # don't support mixed timezones. + # correctly. If arrow is not used, it is needed because numpy does not + # directly support timezones. If arrow is used, needed because datetime + # columns don't support mixed timezone offsets + e.g. for .fgb files + # timezone info isn't handled correctly even for unique timezone offsets + # if datetimes are not read as string. 
result = read_func( path_or_buffer, layer=layer, From aab9240add008219963f4c5a8d926f6ca486991e Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 27 Aug 2025 22:30:18 +0200 Subject: [PATCH 58/59] Rename datetimes options --- pyogrio/geopandas.py | 39 ++++++++++++--------- pyogrio/tests/test_geopandas_io.py | 54 ++++++++++++++++-------------- 2 files changed, 52 insertions(+), 41 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index f4706463..f7bfe437 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -39,11 +39,16 @@ def _stringify_path(path): return path -def _try_parse_datetime(ser, datetimes="UTC"): +def _try_parse_datetime(ser, datetimes): import pandas as pd # only called when pandas is known to be installed from pandas.api.types import is_string_dtype datetimes = datetimes.upper() + datetimes_values = [ + "MIXED_TO_UTC", + "MIXED_TO_DATETIME", + "STRING", + ] datetime_kwargs = {} if datetimes == "STRING": if not is_string_dtype(ser.dtype): @@ -55,12 +60,12 @@ def _try_parse_datetime(ser, datetimes="UTC"): # GDAL < 3.7 doesn't return datetimes in ISO8601 format, so fix that return ser.str.replace(" ", "T").str.replace("/", "-") return ser - elif datetimes in ("UTC", "DATETIME"): + elif datetimes in datetimes_values: pass else: raise ValueError( f"Invalid value for 'datetimes': {datetimes!r}. " - "Must be 'UTC', 'DATETIME' or 'STRING'." + f"Must be one of {datetimes_values!r}." ) if PANDAS_GE_22: @@ -81,21 +86,25 @@ def _try_parse_datetime(ser, datetimes="UTC"): warning = "Error parsing datetimes, original strings are returned: {message}" try: res = pd.to_datetime(ser, **datetime_kwargs) + + # With pandas < 3.0, mixed timezones were returned as pandas Timestamps, so + # convert them to datetime objects. 
+ if datetimes == "MIXED_TO_DATETIME" and res.dtype == "object": + res = res.map(lambda x: x.to_pydatetime(), na_action="ignore") + except Exception as ex: if isinstance(ex, ValueError) and "Mixed timezones detected" in str(ex): # Parsing mixed timezones with to_datetime is not supported # anymore in pandas >= 3.0, leading to a ValueError. - if datetimes == "DATETIME": - # Using 2 times map seems to be the fastest way to convert the - # strings to Timestamps. + if datetimes == "MIXED_TO_DATETIME": + # Using map seems to be the fastest way to convert the strings to + # datetimes. try: - res = ser.map(datetime.fromisoformat, na_action="ignore").map( - pd.Timestamp, na_action="ignore" - ) + res = ser.map(datetime.fromisoformat, na_action="ignore") except Exception as ex: warnings.warn(warning.format(message=str(ex)), stacklevel=1) return ser - elif datetimes == "UTC": + elif datetimes == "MIXED_TO_UTC": # Convert mixed timezone datetimes to UTC. try: res = pd.to_datetime(ser, utc=True, **datetime_kwargs) @@ -113,7 +122,7 @@ def _try_parse_datetime(ser, datetimes="UTC"): # For pandas < 3.0, to_datetime converted mixed timezone data to datetime objects. # For this datetimes option they should be converted to UTC though... - if datetimes == "UTC" and res.dtype in ("object", "string"): + if datetimes == "MIXED_TO_UTC" and res.dtype in ("object", "string"): try: res = pd.to_datetime(ser, utc=True, **datetime_kwargs) except Exception as ex: @@ -151,7 +160,7 @@ def read_dataframe( use_arrow=None, on_invalid="raise", arrow_to_pandas_kwargs=None, - datetimes="UTC", + datetimes="MIXED_TO_UTC", **kwargs, ): """Read from an OGR data source to a GeoPandas GeoDataFrame or Pandas DataFrame. @@ -282,7 +291,7 @@ def read_dataframe( datetimes : str, optional (default: "UTC") The way datetime columns should be returned. Possible values: - - **"UTC"**: return all datetime columns as pandas datetime64 columns. 
+ - **"MIXED_TO_UTC"**: return all datetime columns as pandas datetime64 columns. The data is returned as-is if a column contains only naive datetimes (without timezone information), only UTC datetimes, or if all datetimes in the column have the same timezone offset. @@ -290,7 +299,7 @@ def read_dataframe( different offsets throughout the year! For columns that don't comply to the above, all datetimes are converted to UTC. In that case naive datetimes are assumed to be in UTC already. - - **"DATETIME"**: return datetimes in the timezone as they were read + - **"MIXED_TO_DATETIME"**: return datetimes in the timezone as they were read from the data source, even if a column contains mixed timezone offsets. Columns will be returned as pandas datetime64 column if a column contains only naive datetimes (without timezone information), only UTC datetimes, @@ -298,7 +307,7 @@ def read_dataframe( Note that in timezones with daylight saving time datetimes will have different offsets throughout the year! Columns that don't comply to the above are returned as object columns with - pandas.Timestamp values. If you want to roundtrip datetimes without data + python datetime values. If you want to roundtrip datetimes without data loss, this is the recommended option. - **"STRING"**: return all datetimes as ISO8601 strings. diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 8055de22..e23e5015 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -344,7 +344,9 @@ def test_read_datetimes_invalid_param(datetime_file, use_arrow): read_dataframe(datetime_file, use_arrow=use_arrow, datetimes="INVALID") -@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME", "STRING"]) +@pytest.mark.parametrize( + "datetimes", ["MIXED_TO_UTC", "MIXED_TO_DATETIME", "STRING"] +) def test_read_datetime_long_ago(geojson_datetime_long_ago, use_arrow, datetimes): """Test writing/reading a column with a datetime far in the past. 
@@ -361,8 +363,8 @@ def test_read_datetime_long_ago(geojson_datetime_long_ago, use_arrow, datetimes) ) if False and not PANDAS_GE_30 and datetimes != "STRING": pytest.xfail( - "datetimes before 1678-1-1 are not supported with datetimes='UTC' with " - "pandas < 3.0" + "datetimes before 1678-1-1 are not supported with datetimes='MIXED_TO_UTC' " + "with pandas < 3.0" ) df = read_dataframe( @@ -370,11 +372,11 @@ def test_read_datetime_long_ago(geojson_datetime_long_ago, use_arrow, datetimes) ) exp_dates = pd.Series(["1670-01-01T09:00:00"], name="datetime_col") - if datetimes == "UTC": + if datetimes == "MIXED_TO_UTC": pytest.xfail("datetimes of long ago cannot be parsed as UTC") assert is_datetime64_any_dtype(df.datetime_col.dtype) assert_series_equal(df.datetime_col, exp_dates) - elif datetimes == "DATETIME": + elif datetimes == "MIXED_TO_DATETIME": pytest.xfail("datetimes of long ago cannot be parsed as datetime") assert is_datetime64_dtype(df.datetime_col.dtype) if PANDAS_GE_20: @@ -388,7 +390,7 @@ def test_read_datetime_long_ago(geojson_datetime_long_ago, use_arrow, datetimes) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) -@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME", "STRING"]) +@pytest.mark.parametrize("datetimes", ["MIXED_TO_UTC", "MIXED_TO_DATETIME", "STRING"]) @pytest.mark.requires_arrow_write_api def test_write_read_datetime_no_tz(tmp_path, ext, datetimes, use_arrow): """Test writing/reading a column with naive datetimes (no timezone information).""" @@ -411,17 +413,17 @@ def test_write_read_datetime_no_tz(tmp_path, ext, datetimes, use_arrow): # The reason is complicated, but more info can be found e.g. 
here: # https://github.com/geopandas/pyogrio/issues/487#issuecomment-2423762807 exp_dates = df.dates.dt.tz_localize("UTC") - if datetimes == "DATETIME": + if datetimes == "MIXED_TO_DATETIME": assert_series_equal(result.dates, exp_dates) elif datetimes == "STRING": exp_dates = exp_dates.astype("string").str.replace(" ", "T") assert_series_equal(result.dates, exp_dates) pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow") - if datetimes == "UTC": + if datetimes == "MIXED_TO_UTC": assert is_datetime64_any_dtype(result.dates.dtype) assert_series_equal(result.dates, df.dates) - elif datetimes == "DATETIME": + elif datetimes == "MIXED_TO_DATETIME": assert is_datetime64_dtype(result.dates.dtype) assert_geodataframe_equal(result, df) elif datetimes == "STRING": @@ -436,7 +438,7 @@ def test_write_read_datetime_no_tz(tmp_path, ext, datetimes, use_arrow): @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) -@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME", "STRING"]) +@pytest.mark.parametrize("datetimes", ["MIXED_TO_UTC", "MIXED_TO_DATETIME", "STRING"]) @pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ") @pytest.mark.requires_arrow_write_api def test_write_read_datetime_tz(tmp_path, ext, datetimes, use_arrow): @@ -481,9 +483,9 @@ def test_write_read_datetime_tz(tmp_path, ext, datetimes, use_arrow): pytest.xfail("datetime columns written as string with GDAL < 3.11 via arrow") assert isinstance(df.dates.dtype, pd.DatetimeTZDtype) - if datetimes == "UTC": + if datetimes == "MIXED_TO_UTC": assert_series_equal(result.dates, df.dates, check_index=False) - elif datetimes == "DATETIME": + elif datetimes == "MIXED_TO_DATETIME": assert_series_equal(result.dates, df.dates, check_index=False) elif datetimes == "STRING": assert is_string_dtype(result.dates.dtype) @@ -503,7 +505,7 @@ def test_write_read_datetime_tz(tmp_path, ext, datetimes, use_arrow): @pytest.mark.parametrize("ext", [ext for ext in 
ALL_EXTS if ext != ".shp"]) -@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME", "STRING"]) +@pytest.mark.parametrize("datetimes", ["MIXED_TO_UTC", "MIXED_TO_DATETIME", "STRING"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @@ -517,11 +519,11 @@ def test_write_read_datetime_tz_localized_mixed_offset( dates_naive = pd.Series(pd.to_datetime(dates_raw), name="dates") dates_local = dates_naive.dt.tz_localize("Australia/Sydney") dates_local_offsets_str = dates_local.astype(str) - if datetimes == "UTC": + if datetimes == "MIXED_TO_UTC": exp_dates = dates_local.dt.tz_convert("UTC") if PANDAS_GE_20: exp_dates = exp_dates.dt.as_unit("ms") - elif datetimes == "DATETIME": + elif datetimes == "MIXED_TO_DATETIME": exp_dates = dates_local_offsets_str.apply( lambda x: pd.Timestamp(x) if pd.notna(x) else None ) @@ -563,9 +565,9 @@ def test_write_read_datetime_tz_localized_mixed_offset( pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow") # GDAL tz only encodes offsets, not timezones - if datetimes == "UTC": + if datetimes == "MIXED_TO_UTC": assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) - elif datetimes == "DATETIME": + elif datetimes == "MIXED_TO_DATETIME": assert is_object_dtype(result.dates.dtype) elif datetimes == "STRING": assert is_string_dtype(result.dates.dtype) @@ -579,7 +581,7 @@ def test_write_read_datetime_tz_localized_mixed_offset( @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) -@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME", "STRING"]) +@pytest.mark.parametrize("datetimes", ["MIXED_TO_UTC", "MIXED_TO_DATETIME", "STRING"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @@ -622,13 +624,13 @@ def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, datetimes, use_arro assert_geodataframe_equal(result, df_exp) pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow") - if 
datetimes == "UTC": + if datetimes == "MIXED_TO_UTC": assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) exp_dates = pd.to_datetime(df.dates, utc=True) if PANDAS_GE_20: exp_dates = exp_dates.dt.as_unit("ms") assert_series_equal(result.dates, exp_dates) - elif datetimes == "DATETIME": + elif datetimes == "MIXED_TO_DATETIME": assert is_object_dtype(result.dates.dtype) assert_geodataframe_equal(result, df) elif datetimes == "STRING": @@ -660,7 +662,7 @@ def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, datetimes, use_arro ), ], ) -@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME", "STRING"]) +@pytest.mark.parametrize("datetimes", ["MIXED_TO_UTC", "MIXED_TO_DATETIME", "STRING"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @@ -702,10 +704,10 @@ def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, dat assert_geodataframe_equal(result, exp2_df) pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow") - if datetimes == "UTC": + if datetimes == "MIXED_TO_UTC": assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) exp_df.dates = exp_df.dates.dt.tz_convert("UTC") - elif datetimes == "DATETIME": + elif datetimes == "MIXED_TO_DATETIME": assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) elif datetimes == "STRING": assert is_string_dtype(result.dates.dtype) @@ -727,7 +729,7 @@ def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, dat @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) -@pytest.mark.parametrize("datetimes", ["UTC", "DATETIME", "STRING"]) +@pytest.mark.parametrize("datetimes", ["MIXED_TO_UTC", "MIXED_TO_DATETIME", "STRING"]) @pytest.mark.requires_arrow_write_api def test_write_read_datetime_utc(tmp_path, ext, use_arrow, datetimes): """Test writing/reading a column with UTC datetimes.""" @@ -747,7 +749,7 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow, datetimes): if use_arrow and 
ext == ".fgb" and __gdal_version__ < (3, 11, 0): # With GDAL < 3.11 with arrow, timezone information is dropped when reading .fgb - if datetimes in ("UTC", "DATETIME"): + if datetimes in ("MIXED_TO_UTC", "MIXED_TO_DATETIME"): assert_series_equal(result.dates, df.dates.dt.tz_localize(None)) elif datetimes == "STRING": assert is_string_dtype(result.dates.dtype) @@ -757,7 +759,7 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow, datetimes): assert_series_equal(result.dates, exp_dates, check_dtype=False) pytest.xfail("UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow") - if datetimes in ("UTC", "DATETIME"): + if datetimes in ("MIXED_TO_UTC", "MIXED_TO_DATETIME"): assert result.dates.dtype.name in ("datetime64[ms, UTC]", "datetime64[ns, UTC]") assert_geodataframe_equal(result, df) elif datetimes == "STRING": From d3a2ae813394fd1b5ad2e4e2c5d25012d1f5b9f5 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 27 Aug 2025 23:39:53 +0200 Subject: [PATCH 59/59] Fix for pandas <2 --- pyogrio/geopandas.py | 10 +++++++--- pyogrio/tests/test_geopandas_io.py | 4 +--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index f7bfe437..17eece5f 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -87,9 +87,13 @@ def _try_parse_datetime(ser, datetimes): try: res = pd.to_datetime(ser, **datetime_kwargs) - # With pandas < 3.0, mixed timezones were returned as pandas Timestamps, so - # convert them to datetime objects. - if datetimes == "MIXED_TO_DATETIME" and res.dtype == "object": + # With pandas >2 and <3, mixed timezones were returned as pandas Timestamps, + # so convert them to datetime objects. 
+ if ( + datetimes == "MIXED_TO_DATETIME" + and PANDAS_GE_20 + and res.dtype == "object" + ): res = res.map(lambda x: x.to_pydatetime(), na_action="ignore") except Exception as ex: diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index e23e5015..45a46ba9 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -344,9 +344,7 @@ def test_read_datetimes_invalid_param(datetime_file, use_arrow): read_dataframe(datetime_file, use_arrow=use_arrow, datetimes="INVALID") -@pytest.mark.parametrize( - "datetimes", ["MIXED_TO_UTC", "MIXED_TO_DATETIME", "STRING"] -) +@pytest.mark.parametrize("datetimes", ["MIXED_TO_UTC", "MIXED_TO_DATETIME", "STRING"]) def test_read_datetime_long_ago(geojson_datetime_long_ago, use_arrow, datetimes): """Test writing/reading a column with a datetime far in the past.