Skip to content

Commit

Permalink
Clean code and improve docs.
Browse files Browse the repository at this point in the history
  • Loading branch information
fmassot committed Jun 21, 2023
1 parent c9491c0 commit 9ca0913
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 37 deletions.
2 changes: 1 addition & 1 deletion src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ pub use self::split_compound_words::SplitCompoundWords;
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::TextAnalyzer;
pub use self::tokenizer::{BoxTokenFilter, TextAnalyzer};
pub use self::tokenizer_manager::TokenizerManager;
pub use self::whitespace_tokenizer::WhitespaceTokenizer;

Expand Down
72 changes: 37 additions & 35 deletions src/tokenizer/tokenizer.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use std::ops::Deref;

/// The tokenizer module contains all of the tools used to process
/// text in `tantivy`.
use tokenizer_api::{BoxTokenStream, TokenFilter, TokenStream, Tokenizer};
Expand All @@ -12,7 +10,7 @@ pub struct TextAnalyzer {
}

/// A boxable `Tokenizer`, with its `TokenStream` type erased.
pub trait BoxableTokenizer: 'static + Send + Sync {
trait BoxableTokenizer: 'static + Send + Sync {
/// Creates a boxed token stream for a given `str`.
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
/// Clone this tokenizer.
Expand All @@ -28,69 +26,73 @@ impl<T: Tokenizer> BoxableTokenizer for T {
}
}

pub struct BoxedTokenizer(Box<dyn BoxableTokenizer>);
impl Tokenizer for Box<dyn BoxableTokenizer> {
type TokenStream<'a> = Box<dyn TokenStream + 'a>;

impl Clone for BoxedTokenizer {
fn clone(&self) -> BoxedTokenizer {
Self(self.0.box_clone())
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.box_token_stream(text).into()
}
}

impl Tokenizer for BoxedTokenizer {
type TokenStream<'a> = Box<dyn TokenStream + 'a>;

fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.0.box_token_stream(text).into()
impl Clone for Box<dyn BoxableTokenizer> {
fn clone(&self) -> Self {
self.box_clone()
}
}

/// Trait for the pluggable components of `Tokenizer`s.
pub trait BoxableTokenFilter: 'static + Send + Sync {
/// Wraps a Tokenizer and returns a new one.
fn box_transform(&self, tokenizer: BoxedTokenizer) -> Box<dyn BoxableTokenizer>;
/// A boxable `TokenFilter`, with its `Tokenizer` type erased.
trait BoxableTokenFilter: 'static + Send + Sync {
/// Wraps a boxed tokenizer (`Box<dyn BoxableTokenizer>`) and returns a new one.
fn box_transform(&self, tokenizer: Box<dyn BoxableTokenizer>) -> Box<dyn BoxableTokenizer>;
}

impl<T: TokenFilter> BoxableTokenFilter for T {
fn box_transform(&self, tokenizer: BoxedTokenizer) -> Box<dyn BoxableTokenizer> {
fn box_transform(&self, tokenizer: Box<dyn BoxableTokenizer>) -> Box<dyn BoxableTokenizer> {
let tokenizer = self.clone().transform(tokenizer);
tokenizer.box_clone()
Box::new(tokenizer)
}
}

/// A boxed `TokenFilter`, with its `Tokenizer` type erased.
pub struct BoxTokenFilter(Box<dyn BoxableTokenFilter>);

impl Deref for BoxTokenFilter {
type Target = dyn BoxableTokenFilter;

fn deref(&self) -> &dyn BoxableTokenFilter {
&*self.0
}
}

impl<T: TokenFilter> From<T> for BoxTokenFilter {
fn from(tokenizer: T) -> BoxTokenFilter {
BoxTokenFilter(Box::new(tokenizer))
fn from(token_filter: T) -> BoxTokenFilter {
BoxTokenFilter(Box::new(token_filter))
}
}

impl TextAnalyzer {
/// Builds a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
///
/// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
/// `TextAnalyzer::from(tokenizer)`.
/// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
/// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()`.
/// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()`: it is
/// more performant because it creates a single `Box<dyn BoxableTokenizer>` instead of
/// one per `TokenFilter`.
///
/// # Example
///
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let en_stem = TextAnalyzer::build(
/// SimpleTokenizer::default(),
/// vec![
/// BoxTokenFilter::from(RemoveLongFilter::limit(40)),
/// BoxTokenFilter::from(LowerCaser),
/// BoxTokenFilter::from(Stemmer::default()),
/// ]);
/// ```
pub fn build<T: Tokenizer>(
tokenizer: T,
boxed_token_filters: Vec<BoxTokenFilter>,
) -> TextAnalyzer {
let mut boxed_tokenizer = BoxedTokenizer(Box::new(tokenizer));
let mut boxed_tokenizer: Box<dyn BoxableTokenizer> = Box::new(tokenizer);
for filter in boxed_token_filters.into_iter() {
let filtered_boxed_tokenizer = filter.box_transform(boxed_tokenizer);
boxed_tokenizer = BoxedTokenizer(filtered_boxed_tokenizer);
boxed_tokenizer = filter.0.box_transform(boxed_tokenizer);
}
TextAnalyzer {
tokenizer: boxed_tokenizer.0,
tokenizer: boxed_tokenizer,
}
}

Expand Down
2 changes: 1 addition & 1 deletion tokenizer-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ pub trait TokenFilter: 'static + Send + Sync + Clone {
/// The Tokenizer type returned by this filter, typically parametrized by the underlying
/// Tokenizer.
type Tokenizer<T: Tokenizer>: Tokenizer;
/// Wraps a Tokenizer and returns a new onex .
/// Wraps a Tokenizer and returns a new one.
fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
}

Expand Down

0 comments on commit 9ca0913

Please sign in to comment.