Fix crash during normalization of time zones
Add another check for the tests

Address PR comments and add more test cases

Fix edge case on Windows

Fix broken test on Windows

Recreate the timestamp with timezone

Coerce all timezones to datetime.timezone

Add metadata to compat tests
G-D-Petrov committed Mar 15, 2024
1 parent 3a01362 commit d423bfb
Showing 4 changed files with 70 additions and 15 deletions.
6 changes: 6 additions & 0 deletions docs/mkdocs/docs/faq.md
@@ -155,3 +155,9 @@ The handling of `NaN` in ArcticDB depends on the type of the column under consid
* For string columns, `NaN`, as well as Python `None`, are fully supported.
* For floating-point numeric columns, `NaN` is also fully supported.
* For integer numeric columns `NaN` is not supported. A column that otherwise contains only integers will be treated as a floating point column if a `NaN` is encountered by ArcticDB, at which point [the usual rules](api/arctic.md#LibraryOptions) around type promotion for libraries configured with or without dynamic schema all apply as usual.

### How does ArcticDB handle time zone information?

ArcticDB takes a stricter approach to time zone handling than Pandas. While Pandas supports multiple types for storing time zone information, ArcticDB coerces all time zone information to `datetime.timezone`. This ensures that time zone information is stored consistently and preserved when data is written to and read from ArcticDB.

When data with time zone information is written to ArcticDB, the time zone offset and name are preserved. When that data is read back, the stored offset and name are used to construct a `datetime.timezone` object.
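
For illustration, the sketch below writes a timestamp carrying a `pytz` time zone as metadata and checks that it is read back with an equivalent fixed-offset `datetime.timezone`. The LMDB URI and library name are placeholder values, and the assertions simply restate the behaviour described above rather than an exhaustive guarantee.

```python
import datetime

import pandas as pd
import pytz
from arcticdb import Arctic

# Placeholder storage location and library name for the example.
ac = Arctic("lmdb:///tmp/arcticdb_tz_demo")
lib = ac.get_library("tz_demo", create_if_missing=True)

# Write a timestamp that carries a pytz time zone as metadata.
written = pd.Timestamp("2024-03-04", tz=pytz.timezone("Europe/Amsterdam"))
df = pd.DataFrame({"col1": range(3)}, index=pd.date_range("2024-01-01", periods=3))
lib.write("sym", df, metadata={"index_start": written})

# On read, the zone comes back as a fixed-offset datetime.timezone that keeps
# the original UTC offset and zone name, rather than the original pytz object.
read_back = lib.read("sym").metadata["index_start"]
assert isinstance(read_back.tzinfo, datetime.timezone)
assert read_back.utcoffset() == written.utcoffset()
```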
42 changes: 30 additions & 12 deletions python/arcticdb/version_store/_normalization.py
@@ -7,9 +7,9 @@
"""
import copy
import datetime
from datetime import timedelta
from datetime import timedelta, timezone
import math

import pytz
import numpy as np
import os
import sys
@@ -22,7 +22,12 @@
from arcticc.pb2.storage_pb2 import VersionStoreConfig
from mmap import mmap
from collections import Counter
from arcticdb.exceptions import ArcticNativeException, ArcticDbNotYetImplemented, NormalizationException, SortingException
from arcticdb.exceptions import (
ArcticNativeException,
ArcticDbNotYetImplemented,
NormalizationException,
SortingException,
)
from arcticdb.supported_types import DateRangeInput, time_types as supported_time_types
from arcticdb.util._versions import IS_PANDAS_TWO, IS_PANDAS_ZERO
from arcticdb.version_store.read_result import ReadResult
@@ -740,14 +745,16 @@ def denormalize(self, item, norm_meta):
# We cast it back to "float" so that it matches Pandas 1.0 default for empty series.
# Moreover, we explicitly provide the index otherwise Pandas 0.X overrides it for a RangeIndex
empty_columns_names = (
[] if data is None else
[
name for name, np_array in data.items()
[]
if data is None
else [
name
for name, np_array in data.items()
if np_array.dtype in OBJECT_TOKENS and len(df[name]) == 0
]
)
for column_name in empty_columns_names:
df[column_name] = pd.Series([], index=index, dtype='float64')
df[column_name] = pd.Series([], index=index, dtype="float64")

else:
if index is not None:
@@ -1016,8 +1023,15 @@ def _nested_msgpack_unpackb(buff, raw=False):

def _custom_pack(self, obj):
if isinstance(obj, pd.Timestamp):
tz = obj.tz.zone if obj.tz is not None else None
return ExtType(MsgPackSerialization.PD_TIMESTAMP, MsgPackNormalizer._nested_msgpack_packb((obj.value, tz)))
offset = obj.tzinfo.utcoffset(obj).total_seconds() if obj.tzinfo else 0
if obj.tz and hasattr(obj.tz, "zone"):
tz = obj.tz.zone
else:
tz = obj.tzname()

return ExtType(
MsgPackSerialization.PD_TIMESTAMP, MsgPackNormalizer._nested_msgpack_packb((obj.value, tz, offset))
)

if isinstance(obj, datetime.datetime):
return ExtType(
@@ -1040,7 +1054,12 @@ def _custom_pack(self, obj):
def _ext_hook(self, code, data):
if code == MsgPackSerialization.PD_TIMESTAMP:
data = MsgPackNormalizer._nested_msgpack_unpackb(data)
return pd.Timestamp(data[0], tz=data[1]) if data[1] is not None else pd.Timestamp(data[0])
val, tz, offset = data

if tz is None:
return pd.Timestamp(val)

return pd.Timestamp(val, tz=timezone(timedelta(seconds=offset), tz))

if code == MsgPackSerialization.PY_DATETIME:
data = MsgPackNormalizer._nested_msgpack_unpackb(data)
@@ -1370,8 +1389,7 @@ def _strip_tz(s, e):
if not getattr(data, "timezone", None):
start, end = _strip_tz(start, end)
data = data[
start.to_pydatetime(warn=False)
- timedelta(microseconds=1) : end.to_pydatetime(warn=False)
start.to_pydatetime(warn=False) - timedelta(microseconds=1) : end.to_pydatetime(warn=False)
+ timedelta(microseconds=1)
]
return data
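
The `_custom_pack` / `_ext_hook` changes above serialise a timestamp as a `(value, zone name, UTC offset in seconds)` tuple and rebuild the zone as a fixed-offset `datetime.timezone` on read. The standalone sketch below mirrors that round trip with hypothetical helper names; it illustrates the coercion only and is not ArcticDB's internal API.

```python
from datetime import timedelta, timezone

import pandas as pd
import pytz


def pack_timestamp(ts: pd.Timestamp):
    # Store the raw value, a best-effort zone name, and the UTC offset in
    # seconds, mirroring the tuple packed in the diff above.
    offset = ts.tzinfo.utcoffset(ts).total_seconds() if ts.tzinfo else 0
    name = ts.tz.zone if ts.tz is not None and hasattr(ts.tz, "zone") else ts.tzname()
    return ts.value, name, offset


def unpack_timestamp(value, name, offset):
    # Rebuild the zone as a fixed-offset datetime.timezone carrying the stored name.
    if name is None:
        return pd.Timestamp(value)
    return pd.Timestamp(value, tz=timezone(timedelta(seconds=offset), name))


ts = pd.Timestamp("2024-03-04 12:00", tz=pytz.timezone("Europe/Amsterdam"))
roundtripped = unpack_timestamp(*pack_timestamp(ts))
assert roundtripped == ts                           # same instant in time
assert roundtripped.utcoffset() == ts.utcoffset()   # offset preserved
assert roundtripped.tzname() == "Europe/Amsterdam"  # name preserved
```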
29 changes: 29 additions & 0 deletions python/tests/unit/arcticdb/version_store/test_normalization.py
@@ -6,6 +6,7 @@
As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
"""
import datetime
from datetime import timezone, timedelta
from collections import namedtuple

import numpy as np
@@ -117,6 +118,34 @@ def test_write_tz(lmdb_version_store, sym, tz):
assert str(df_tz) == str(tz)


@pytest.mark.parametrize(
"tz",
[
None,
"UTC",
"Europe/Amsterdam",
"America/New_York",
timezone(timedelta(seconds=0), "UTC"),
timezone(timedelta(seconds=0), "Europe/Amsterdam"),
timezone(timedelta(seconds=0), "America/New_York"),
pytz.UTC,
pytz.timezone("Europe/Amsterdam"),
pytz.timezone("America/New_York"),
du.tz.gettz("UTC"),
du.tz.gettz("Europe/Amsterdam"),
du.tz.gettz("America/New_York"),
],
)
def test_write_metadata_tz(lmdb_version_store, sym, tz):
df = pd.DataFrame(data={"col1": np.arange(10)}, index=pd.date_range(pd.Timestamp(0), periods=10))
meta = pd.Timestamp("2024-03-04", tz=tz)
lmdb_version_store.write(sym, df, metadata={"index_start": meta})
df_tz = lmdb_version_store.read(sym).metadata["index_start"]
assert str(df_tz.tzinfo) == str(meta.tzinfo) or df_tz.tzname() == meta.tzname()
if meta.tzinfo is not None:
assert df_tz.tzinfo.utcoffset(df_tz) == meta.tzinfo.utcoffset(meta)


def get_multiindex_df_with_tz(tz):
dt1 = datetime.datetime(2019, 4, 8, 10, 5, 2, 1)
dt2 = datetime.datetime(2019, 4, 9, 10, 5, 2, 1)
8 changes: 5 additions & 3 deletions python/tests/util/storage_test.py
@@ -134,18 +134,20 @@ def is_strategy_branch_valid_format(input_string):


def write_persistent_library(lib, latest: bool = False):
meta = pd.Timestamp("2024-03-04", tz="Europe/Amsterdam")

for df, sym in get_basic_dfs():
lib.write(sym, df)
lib.write(sym, df, metadata={"meta": meta})
lib.append(sym, df)

series, sym = get_empty_series()
lib.write(sym, series)
lib.write(sym, series, metadata={"meta": meta})

df, sym = get_csv_df()
if not latest:
# 'cast' to str because we only support string index names in past versions
df.index.rename(str(df.index.name), inplace=True)
lib.write(sym, df)
lib.write(sym, df, metadata={"meta": meta})

res = lib.read(sym).data
assert_frame_equal(res, df)
