Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

store DateTime as nanoseconds in doc store #2486

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 28 additions & 2 deletions common/src/datetime.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use std::fmt;
use std::io::{Read, Write};
use std::io::{self, Read, Write};

use serde::{Deserialize, Serialize};
use time::format_description::well_known::Rfc3339;
use time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};

use crate::BinarySerializable;
use crate::{BinarySerializable, BinarySerializableConfig, ConfigurableBinarySerializable};

/// Precision with which datetimes are truncated when stored in fast fields. This setting is only
/// relevant for fast fields. In the docstore, datetimes are always saved with nanosecond precision.
Expand Down Expand Up @@ -174,3 +174,29 @@ impl BinarySerializable for DateTime {
Ok(Self::from_timestamp_micros(timestamp_micros))
}
}

impl ConfigurableBinarySerializable for DateTime {
fn serialize<W: Write + ?Sized>(
&self,
writer: &mut W,
config: &BinarySerializableConfig,
) -> io::Result<()> {
let truncated_val = match config.date_time_precision {
DateTimePrecision::Seconds => self.into_timestamp_secs(),
DateTimePrecision::Milliseconds => self.into_timestamp_millis(),
DateTimePrecision::Microseconds => self.into_timestamp_micros(),
DateTimePrecision::Nanoseconds => self.into_timestamp_nanos(),
};
<i64 as BinarySerializable>::serialize(&truncated_val, writer)
}

fn deserialize<R: Read>(reader: &mut R, config: &BinarySerializableConfig) -> io::Result<Self> {
let timestamp = <i64 as BinarySerializable>::deserialize(reader)?;
match config.date_time_precision {
DateTimePrecision::Seconds => Ok(Self::from_timestamp_secs(timestamp)),
DateTimePrecision::Milliseconds => Ok(Self::from_timestamp_millis(timestamp)),
DateTimePrecision::Microseconds => Ok(Self::from_timestamp_micros(timestamp)),
DateTimePrecision::Nanoseconds => Ok(Self::from_timestamp_nanos(timestamp)),
}
}
}
2 changes: 1 addition & 1 deletion common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ pub use datetime::{DateTime, DateTimePrecision};
pub use group_by::GroupByIteratorExtended;
pub use json_path_writer::JsonPathWriter;
pub use ownedbytes::{OwnedBytes, StableDeref};
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use serialize::*;
pub use vint::{
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt, VIntU128,
};
Expand Down
79 changes: 68 additions & 11 deletions common/src/serialize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use std::{fmt, io};

use byteorder::{ReadBytesExt, WriteBytesExt};

use crate::{Endianness, VInt};
use crate::{DateTimePrecision, Endianness, VInt};

#[derive(Default)]
struct Counter(u64);
Expand All @@ -25,6 +25,63 @@ impl io::Write for Counter {
}
}

/// The configuration for binary serialization.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BinarySerializableConfig {
/// The precision with which DateTime is de/serialized
pub date_time_precision: DateTimePrecision,
}

/// Trait for configurable binary serialization
pub trait ConfigurableBinarySerializable: fmt::Debug + Sized {
fn serialize<W: Write + ?Sized>(
&self,
writer: &mut W,
config: &BinarySerializableConfig,
) -> io::Result<()>;
fn deserialize<R: Read>(reader: &mut R, config: &BinarySerializableConfig) -> io::Result<Self>;
fn num_bytes(&self, config: &BinarySerializableConfig) -> u64 {
let mut counter = Counter::default();
self.serialize(&mut counter, config).unwrap();
counter.0
}
}

/// Implement ConfigurableBinarySerializable trait for types that don't need versioning by
/// forwarding to BinarySerializable trait.
#[macro_export]
macro_rules! impl_configurable_binary_serializable_by_calling_binary_serializable {
($type:ty) => {
impl ConfigurableBinarySerializable for $type {
fn serialize<W: Write + ?Sized>(
&self,
writer: &mut W,
_version: &BinarySerializableConfig,
) -> io::Result<()> {
<Self as BinarySerializable>::serialize(self, writer)
}

fn deserialize<R: Read>(
reader: &mut R,
_version: &BinarySerializableConfig,
) -> io::Result<Self> {
<Self as BinarySerializable>::deserialize(reader)
}
}
};
}

impl_configurable_binary_serializable_by_calling_binary_serializable!(String);
impl_configurable_binary_serializable_by_calling_binary_serializable!(u64);
impl_configurable_binary_serializable_by_calling_binary_serializable!(i64);
impl_configurable_binary_serializable_by_calling_binary_serializable!(f64);
impl_configurable_binary_serializable_by_calling_binary_serializable!(Vec<u8>);
impl_configurable_binary_serializable_by_calling_binary_serializable!(u128);
impl_configurable_binary_serializable_by_calling_binary_serializable!(bool);
impl_configurable_binary_serializable_by_calling_binary_serializable!(());
impl_configurable_binary_serializable_by_calling_binary_serializable!(Cow<'_, str>);
impl_configurable_binary_serializable_by_calling_binary_serializable!(Cow<'_, [u8]>);

/// Trait for a simple binary serialization.
pub trait BinarySerializable: fmt::Debug + Sized {
/// Serialize
Expand Down Expand Up @@ -74,14 +131,14 @@ impl FixedSize for () {

impl<T: BinarySerializable> BinarySerializable for Vec<T> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.len() as u64).serialize(writer)?;
BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
for it in self {
it.serialize(writer)?;
}
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Vec<T>> {
let num_items = VInt::deserialize(reader)?.val();
let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
for _ in 0..num_items {
let item = T::deserialize(reader)?;
Expand Down Expand Up @@ -236,12 +293,12 @@ impl FixedSize for bool {
impl BinarySerializable for String {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes();
VInt(data.len() as u64).serialize(writer)?;
BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
writer.write_all(data)
}

fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
reader
.take(string_length as u64)
Expand All @@ -253,12 +310,12 @@ impl BinarySerializable for String {
impl<'a> BinarySerializable for Cow<'a, str> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes();
VInt(data.len() as u64).serialize(writer)?;
BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
writer.write_all(data)
}

fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
reader
.take(string_length as u64)
Expand All @@ -269,18 +326,18 @@ impl<'a> BinarySerializable for Cow<'a, str> {

impl<'a> BinarySerializable for Cow<'a, [u8]> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.len() as u64).serialize(writer)?;
BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
for it in self.iter() {
it.serialize(writer)?;
BinarySerializable::serialize(it, writer)?;
}
Ok(())
}

fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
let num_items = VInt::deserialize(reader)?.val();
let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
for _ in 0..num_items {
let item = u8::deserialize(reader)?;
let item = <u8 as BinarySerializable>::deserialize(reader)?;
items.push(item);
}
Ok(Cow::Owned(items))
Expand Down
15 changes: 13 additions & 2 deletions src/compat_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,19 @@ fn test_format_6() {
assert_date_time_precision(&index, DateTimePrecision::Microseconds);
}

/// feature flag quickwit uses a different dictionary type
#[test]
#[cfg(not(feature = "quickwit"))]
fn test_format_7() {
let path = path_for_version("7");

let index = Index::open_in_dir(path).expect("Failed to open index");
// dates are not truncated in v7 in the docstore
assert_date_time_precision(&index, DateTimePrecision::Nanoseconds);
}

#[cfg(not(feature = "quickwit"))]
fn assert_date_time_precision(index: &Index, precision: DateTimePrecision) {
fn assert_date_time_precision(index: &Index, doc_store_precision: DateTimePrecision) {
use collector::TopDocs;
let reader = index.reader().expect("Failed to create reader");
let searcher = reader.searcher();
Expand Down Expand Up @@ -75,6 +86,6 @@ fn assert_date_time_precision(index: &Index, precision: DateTimePrecision) {
.as_datetime()
.unwrap();

let expected = DateTime::from_timestamp_nanos(123456).truncate(precision);
let expected = DateTime::from_timestamp_nanos(123456).truncate(doc_store_precision);
assert_eq!(date_value, expected,);
}
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,8 @@
//!
//! - **Searching**: [Searcher] searches the segments with anything that implements
//! [Query](query::Query) and merges the results. The list of [supported
//! queries](query::Query#implementors). Custom Queries are supported by implementing the

Check warning on line 128 in src/lib.rs

View workflow job for this annotation

GitHub Actions / clippy

doc list item missing indentation

warning: doc list item missing indentation --> src/lib.rs:128:5 | 128 | //! queries](query::Query#implementors). Custom Queries are supported by implementing the | ^ | = help: if this is supposed to be its own paragraph, add a blank line = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#doc_lazy_continuation = note: `#[warn(clippy::doc_lazy_continuation)]` on by default help: indent this line | 128 | //! queries](query::Query#implementors). Custom Queries are supported by implementing the | ++
//! [Query](query::Query) trait.

Check warning on line 129 in src/lib.rs

View workflow job for this annotation

GitHub Actions / clippy

doc list item missing indentation

warning: doc list item missing indentation --> src/lib.rs:129:5 | 129 | //! [Query](query::Query) trait. | ^ | = help: if this is supposed to be its own paragraph, add a blank line = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#doc_lazy_continuation help: indent this line | 129 | //! [Query](query::Query) trait. | ++
//!
//! - **[Directory](directory)**: Abstraction over the storage where the index data is stored.
//!
Expand Down Expand Up @@ -232,7 +232,7 @@
pub use crate::schema::{Document, TantivyDocument, Term};

/// Index format version.
pub const INDEX_FORMAT_VERSION: u32 = 6;
pub const INDEX_FORMAT_VERSION: u32 = 7;
/// Oldest index format version this tantivy version can read.
pub const INDEX_FORMAT_OLDEST_SUPPORTED_VERSION: u32 = 4;

Expand Down
Loading
Loading