From 5ef640b74677f91e3a33dfa396bd839b23b40f81 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 1 May 2019 06:09:48 +0200 Subject: [PATCH 01/11] Add Tokenizer trait to RegexpTokenizer --- src/tokenize/mod.rs | 14 +++++++++++--- src/vectorize/mod.rs | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/tokenize/mod.rs b/src/tokenize/mod.rs index 0e9ed83..eeca244 100644 --- a/src/tokenize/mod.rs +++ b/src/tokenize/mod.rs @@ -19,7 +19,7 @@ let s = "The “brown” fox can't jump 32.3 feet, right?"; Using a regular expression tokenizer we would get, ```rust # let s = "The “brown” fox can't jump 32.3 feet, right?"; -# use vtext::tokenize::RegexpTokenizer; +# use vtext::tokenize::*; let tokenizer = RegexpTokenizer::new(r"\b\w\w+\b".to_string()); let tokens: Vec<&str> = tokenizer.tokenize(s).collect(); assert_eq!(tokens, &["The", "brown", "fox", "can", "jump", "32", "feet", "right"]); @@ -59,6 +59,11 @@ use unicode_segmentation::UnicodeSegmentation; #[cfg(test)] mod tests; +pub trait Tokenizer { + fn tokenize<'a>(&'a self, text: &'a str) -> Box + 'a>; +} + + /// Regular expression tokenizer /// #[derive(Debug)] @@ -77,9 +82,12 @@ impl RegexpTokenizer { regexp: regexp, } } +} + +impl Tokenizer for RegexpTokenizer { /// Tokenize a string - pub fn tokenize<'a>(&'a self, text: &'a str) -> impl Iterator { - self.regexp.find_iter(text).map(|m| m.as_str()) + fn tokenize<'a>(&'a self, text: &'a str) -> Box + 'a> { + Box::new(self.regexp.find_iter(text).map(|m| m.as_str())) } } diff --git a/src/vectorize/mod.rs b/src/vectorize/mod.rs index ba5e114..abd1a35 100644 --- a/src/vectorize/mod.rs +++ b/src/vectorize/mod.rs @@ -12,6 +12,7 @@ This module allows computing a sparse document term matrix from a text corpus. use crate::math::CSRArray; use crate::tokenize; +use crate::tokenize::Tokenizer; use hashbrown::HashMap; use ndarray::Array; use sprs::CsMat; From 2c2ae35d177f8b982023d2730597147c633c2389 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 1 May 2019 06:14:59 +0200 Subject: [PATCH 02/11] Use Tokenizer traits for all tokenizers --- src/lib.rs | 2 +- src/tokenize/mod.rs | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index ca5cede..fe1fe3c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -31,7 +31,7 @@ A simple tokenization example can be found below, ```rust extern crate vtext; -use vtext::tokenize::VTextTokenizer; +use vtext::tokenize::{VTextTokenizer,Tokenizer}; let tok = VTextTokenizer::new("en"); let tokens = tok.tokenize("Flights can't depart after 2:00 pm."); diff --git a/src/tokenize/mod.rs b/src/tokenize/mod.rs index eeca244..1b75c1c 100644 --- a/src/tokenize/mod.rs +++ b/src/tokenize/mod.rs @@ -28,7 +28,7 @@ assert_eq!(tokens, &["The", "brown", "fox", "can", "jump", "32", "feet", "right" which would remove all punctuation. A more general approach is to apply unicode segmentation, ```rust # let s = "The “brown” fox can't jump 32.3 feet, right?"; -# use vtext::tokenize::UnicodeSegmentTokenizer; +# use vtext::tokenize::*; let tokenizer = UnicodeSegmentTokenizer::new(true); let tokens: Vec<&str> = tokenizer.tokenize(s).collect(); assert_eq!(tokens, &["The", "“", "brown", "”", "fox", "can't", "jump", "32.3", "feet", ",", "right", "?"]); @@ -41,7 +41,7 @@ as "ca", "n't" in English. 
To address such issues, we apply several additional r ```rust # let s = "The “brown” fox can't jump 32.3 feet, right?"; -# use vtext::tokenize::VTextTokenizer; +# use vtext::tokenize::*; let tokenizer = VTextTokenizer::new("en"); let tokens: Vec<&str> = tokenizer.tokenize(s).collect(); assert_eq!(tokens, &["The", "“", "brown", "”", "fox", "ca", "n't", "jump", "32.3", "feet", ",", "right", "?"]); @@ -111,8 +111,11 @@ impl UnicodeSegmentTokenizer { word_bounds: word_bounds, } } +} + +impl Tokenizer for UnicodeSegmentTokenizer { /// Tokenize a string - pub fn tokenize<'a>(&self, text: &'a str) -> Box + 'a> { + fn tokenize<'a>(&self, text: &'a str) -> Box + 'a> { if self.word_bounds { let res = text.split_word_bounds().filter(|x| x != &" "); return Box::new(res); @@ -160,8 +163,11 @@ impl VTextTokenizer { lang: lang_valid.to_string(), } } +} + +impl Tokenizer for VTextTokenizer { /// Tokenize a string - pub fn tokenize<'a>(&self, text: &'a str) -> Box + 'a> { + fn tokenize<'a>(&self, text: &'a str) -> Box + 'a> { let tokens = text.split_word_bounds(); let mut res: Vec<&'a str> = Vec::new(); @@ -279,9 +285,11 @@ impl CharacterTokenizer { window_size: window_size, } } +} +impl Tokenizer for CharacterTokenizer { /// Tokenize a string - pub fn tokenize<'a>(&self, text: &'a str) -> Box + 'a> { + fn tokenize<'a>(&self, text: &'a str) -> Box + 'a> { let res = text .char_indices() .zip( From d9ffc9052a68afb59d1b66d1edfd9659546de5f3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 1 May 2019 07:51:50 +0200 Subject: [PATCH 03/11] Implement dynamic dispatch for RegexpTokenizer --- src/main.rs | 64 ------------------------------------------ src/tokenize/mod.rs | 11 ++++++-- src/vectorize/mod.rs | 14 ++++----- src/vectorize/tests.rs | 26 +++++++++++++++-- 4 files changed, 39 insertions(+), 76 deletions(-) delete mode 100644 src/main.rs diff --git a/src/main.rs b/src/main.rs deleted file mode 100644 index 0ac14ee..0000000 --- a/src/main.rs +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright 2019 vtext developers -// -// Licensed under the Apache License, Version 2.0, -// . This file may not be copied, -// modified, or distributed except according to those terms. 
- -#![allow(non_snake_case)] - -extern crate vtext; - -use std::fs; -use std::io::prelude::*; -use std::time::SystemTime; -use vtext::vectorize::{CountVectorizer, HashingVectorizer}; - -fn main() { - let _dirs_list = fs::read_dir("./data/").unwrap(); - - // let mut indices: Vec = Vec::new() - // let mut idptr: Vec = Vec::new() - // let mut values: Vec = Vec::new() - - let mut documents: Vec = Vec::new(); - - for dir_path in _dirs_list { - let dir_path = dir_path.unwrap(); - if dir_path.path().is_dir() { - let _file_list = fs::read_dir(dir_path.path()).unwrap(); - for path in _file_list { - let mut fh = fs::File::open(path.unwrap().path()).expect("file not found"); - let mut contents = String::new(); - fh.read_to_string(&mut contents) - .expect("something went wrong"); - documents.push(contents) - } - } - } - - let t0 = SystemTime::now(); - - let mut vect = CountVectorizer::new(); - let _X = vect.fit_transform(&documents); - - let n_documents = documents.len(); - - let t_end = SystemTime::now(); - let dt = t_end.duration_since(t0).unwrap(); - println!( - "CountVectorizer: vectorized {} documents in {:?}", - n_documents, dt - ); - - let t0 = SystemTime::now(); - - let vect = HashingVectorizer::new(); - let _X = vect.fit_transform(&documents); - - let t_end = SystemTime::now(); - let dt = t_end.duration_since(t0).unwrap(); - println!( - "HashingVectorizer: vectorized {} documents in {:?}", - n_documents, dt - ); -} diff --git a/src/tokenize/mod.rs b/src/tokenize/mod.rs index 1b75c1c..81507a5 100644 --- a/src/tokenize/mod.rs +++ b/src/tokenize/mod.rs @@ -52,6 +52,7 @@ extern crate unicode_segmentation; use itertools::Itertools; use std::cmp; +use std::fmt; use regex::Regex; use unicode_segmentation::UnicodeSegmentation; @@ -59,14 +60,13 @@ use unicode_segmentation::UnicodeSegmentation; #[cfg(test)] mod tests; -pub trait Tokenizer { +pub trait Tokenizer: fmt::Debug { fn tokenize<'a>(&'a self, text: &'a str) -> Box + 'a>; } /// Regular expression tokenizer /// -#[derive(Debug)] pub struct RegexpTokenizer { pub pattern: String, regexp: Regex, @@ -91,6 +91,13 @@ impl Tokenizer for RegexpTokenizer { } } +impl fmt::Debug for RegexpTokenizer { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "RegexpTokenizer {{ pattern: {} }}", self.pattern) + } +} + + /// Unicode Segmentation tokenizer /// /// This implementation is a thin wrapper around the diff --git a/src/vectorize/mod.rs b/src/vectorize/mod.rs index abd1a35..6d91aeb 100644 --- a/src/vectorize/mod.rs +++ b/src/vectorize/mod.rs @@ -65,9 +65,9 @@ fn _sum_duplicates(tf: &mut CSRArray, indices_local: &[u32], nnz: &mut usize) { } #[derive(Debug)] -pub struct HashingVectorizer { +pub struct HashingVectorizer<'b> { lowercase: bool, - token_pattern: String, + tokenizer: &'b Tokenizer, n_features: u64, } @@ -165,12 +165,12 @@ impl CountVectorizer { } } -impl HashingVectorizer { +impl<'b> HashingVectorizer<'b> { /// Create a new HashingVectorizer estimator - pub fn new() -> Self { + pub fn new(tokenizer: &'b Tokenizer) -> Self { HashingVectorizer { lowercase: true, - token_pattern: String::from(TOKEN_PATTERN_DEFAULT), + tokenizer: tokenizer, n_features: 1048576, } } @@ -195,8 +195,6 @@ impl HashingVectorizer { let mut indices_local = Vec::new(); let mut nnz: usize = 0; - let tokenizer = tokenize::RegexpTokenizer::new(TOKEN_PATTERN_DEFAULT.to_string()); - // String.to_lowercase() is very slow // https://www.reddit.com/r/rust/comments/6wbru2/performance_issue_can_i_avoid_of_using_the_slow/ // https://github.com/rust-lang/rust/issues/26244 
@@ -205,7 +203,7 @@ impl HashingVectorizer { let pipe = X.iter().map(|doc| doc.to_ascii_lowercase()); for (_document_id, document) in pipe.enumerate() { - let tokens = tokenizer.tokenize(&document); + let tokens = self.tokenizer.tokenize(&document); indices_local.clear(); for token in tokens { // set the RNG seeds to get reproducible hashing diff --git a/src/vectorize/tests.rs b/src/vectorize/tests.rs index e8f564f..eac5cac 100644 --- a/src/vectorize/tests.rs +++ b/src/vectorize/tests.rs @@ -4,6 +4,7 @@ // . This file may not be copied, // modified, or distributed except according to those terms. +use crate::tokenize::*; use crate::vectorize::*; use crate::*; @@ -47,7 +48,9 @@ fn test_hashing_vectorizer_simple() { String::from("The sky is blue"), ]; - let vect = HashingVectorizer::new(); + let tokenizer = VTextTokenizer::new("en"); + + let vect = HashingVectorizer::new(&tokenizer); let vect = vect.fit(&documents); let X = vect.transform(&documents); assert_eq!(X.indptr(), &[0, 4, 8]); @@ -83,10 +86,29 @@ fn test_empty_dataset() { assert_eq!(X.indices(), &[]); assert_eq!(X.indptr(), &[0]); - let vectorizer = HashingVectorizer::new(); + let tokenizer = VTextTokenizer::new("en"); + + let vectorizer = HashingVectorizer::new(&tokenizer); let X = vectorizer.fit_transform(&documents); assert_eq!(X.data(), &[]); assert_eq!(X.indices(), &[]); assert_eq!(X.indptr(), &[0]); } + +#[test] +fn test_dynamic_dispatch_tokenizer() { + + let tokenizer = VTextTokenizer::new("en"); + HashingVectorizer::new(&tokenizer); + + let tokenizer = UnicodeSegmentTokenizer::new(false); + HashingVectorizer::new(&tokenizer); + + let tokenizer = RegexpTokenizer::new("\\b\\w+\\w\\b".to_string()); + HashingVectorizer::new(&tokenizer); + + let tokenizer = CharacterTokenizer::new(4); + HashingVectorizer::new(&tokenizer); +} + From 1c11025dd38e65d118f85b230dd7a0650f4d05b5 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 1 May 2019 08:01:47 +0200 Subject: [PATCH 04/11] Finalize tokenizer dynamic dispatch for CountVectorizer --- src/vectorize/mod.rs | 10 +++++----- src/vectorize/tests.rs | 17 +++++++++++------ 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/vectorize/mod.rs b/src/vectorize/mod.rs index 6d91aeb..dd85677 100644 --- a/src/vectorize/mod.rs +++ b/src/vectorize/mod.rs @@ -72,20 +72,20 @@ pub struct HashingVectorizer<'b> { } #[derive(Debug)] -pub struct CountVectorizer { +pub struct CountVectorizer<'b> { lowercase: bool, - token_pattern: String, + tokenizer: &'b Tokenizer, pub vocabulary: HashMap, } pub enum Vectorizer {} -impl CountVectorizer { +impl<'b> CountVectorizer<'b> { /// Initialize a CountVectorizer estimator - pub fn new() -> Self { + pub fn new(tokenizer: &'b Tokenizer) -> Self { CountVectorizer { lowercase: true, - token_pattern: String::from(TOKEN_PATTERN_DEFAULT), + tokenizer: tokenizer, vocabulary: HashMap::with_capacity_and_hasher(1000, Default::default()), } } diff --git a/src/vectorize/tests.rs b/src/vectorize/tests.rs index eac5cac..e1daca2 100644 --- a/src/vectorize/tests.rs +++ b/src/vectorize/tests.rs @@ -11,9 +11,10 @@ use crate::*; #[test] fn test_count_vectorizer_simple() { // Example 1 + let tokenizer = RegexpTokenizer::new("\\b\\w+\\w\\b".to_string()); let documents = vec![String::from("cat dog cat")]; - let mut vect = CountVectorizer::new(); + let mut vect = CountVectorizer::new(&tokenizer); let X = vect.fit_transform(&documents); assert_eq!(X.to_dense(), array![[2, 1]]); @@ -23,7 +24,7 @@ fn test_count_vectorizer_simple() { String::from("The sky sky sky is blue"), 
]; - let mut vect = CountVectorizer::new(); + let mut vect = CountVectorizer::new(&tokenizer); vect.fit(&documents); let X = vect.transform(&documents); @@ -79,15 +80,14 @@ fn test_hashing_vectorizer_simple() { fn test_empty_dataset() { let documents: Vec = vec![]; - let mut vectorizer = CountVectorizer::new(); + let tokenizer = VTextTokenizer::new("en"); + let mut vectorizer = CountVectorizer::new(&tokenizer); let X = vectorizer.fit_transform(&documents); assert_eq!(X.data(), &[]); assert_eq!(X.indices(), &[]); assert_eq!(X.indptr(), &[0]); - let tokenizer = VTextTokenizer::new("en"); - let vectorizer = HashingVectorizer::new(&tokenizer); let X = vectorizer.fit_transform(&documents); @@ -100,15 +100,20 @@ fn test_empty_dataset() { fn test_dynamic_dispatch_tokenizer() { let tokenizer = VTextTokenizer::new("en"); + CountVectorizer::new(&tokenizer); HashingVectorizer::new(&tokenizer); let tokenizer = UnicodeSegmentTokenizer::new(false); + CountVectorizer::new(&tokenizer); HashingVectorizer::new(&tokenizer); let tokenizer = RegexpTokenizer::new("\\b\\w+\\w\\b".to_string()); + CountVectorizer::new(&tokenizer); HashingVectorizer::new(&tokenizer); + let tokenizer = CharacterTokenizer::new(4); + CountVectorizer::new(&tokenizer); HashingVectorizer::new(&tokenizer); -} +} From 33304bc38cfe45cc4bc0536770b74c589ebc48c6 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 1 May 2019 08:03:05 +0200 Subject: [PATCH 05/11] Lint --- src/tokenize/mod.rs | 2 -- src/vectorize/tests.rs | 3 --- 2 files changed, 5 deletions(-) diff --git a/src/tokenize/mod.rs b/src/tokenize/mod.rs index 81507a5..c1107f1 100644 --- a/src/tokenize/mod.rs +++ b/src/tokenize/mod.rs @@ -64,7 +64,6 @@ pub trait Tokenizer: fmt::Debug { fn tokenize<'a>(&'a self, text: &'a str) -> Box + 'a>; } - /// Regular expression tokenizer /// pub struct RegexpTokenizer { @@ -97,7 +96,6 @@ impl fmt::Debug for RegexpTokenizer { } } - /// Unicode Segmentation tokenizer /// /// This implementation is a thin wrapper around the diff --git a/src/vectorize/tests.rs b/src/vectorize/tests.rs index e1daca2..8d6b099 100644 --- a/src/vectorize/tests.rs +++ b/src/vectorize/tests.rs @@ -98,7 +98,6 @@ fn test_empty_dataset() { #[test] fn test_dynamic_dispatch_tokenizer() { - let tokenizer = VTextTokenizer::new("en"); CountVectorizer::new(&tokenizer); HashingVectorizer::new(&tokenizer); @@ -111,9 +110,7 @@ fn test_dynamic_dispatch_tokenizer() { CountVectorizer::new(&tokenizer); HashingVectorizer::new(&tokenizer); - let tokenizer = CharacterTokenizer::new(4); CountVectorizer::new(&tokenizer); HashingVectorizer::new(&tokenizer); - } From db5c966d64a5f5f2961f54d0a06b209b8ae4dd56 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 1 May 2019 08:26:51 +0200 Subject: [PATCH 06/11] Add vectorization example --- src/vectorize/mod.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/vectorize/mod.rs b/src/vectorize/mod.rs index dd85677..69ac108 100644 --- a/src/vectorize/mod.rs +++ b/src/vectorize/mod.rs @@ -8,6 +8,23 @@ # Vectorization module This module allows computing a sparse document term matrix from a text corpus. 
+ +```rust +extern crate vtext; + +use vtext::tokenize::{VTextTokenizer,Tokenizer}; +use vtext::vectorize::CountVectorizer; + +let documents = vec![ + String::from("Some text input"), + String::from("Another line"), +]; + +let tokenizer = VTextTokenizer::new("en"); + +let mut vectorizer = CountVectorizer::new(&tokenizer); +let X = vectorizer.fit_transform(&documents); +// returns a sparse CSR matrix with document-terms counts */ use crate::math::CSRArray; From 99fdd17fe12af3be11a82fb29ee32fad2b500058 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 1 May 2019 09:02:41 +0200 Subject: [PATCH 07/11] Use lifetimes in python wrappers --- python/src/lib.rs | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/python/src/lib.rs b/python/src/lib.rs index 1b5e0e2..70cf393 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -25,6 +25,7 @@ use pyo3::wrap_pyfunction; use vtext::metrics; use vtext::tokenize; +use vtext::tokenize::Tokenizer; use vtext::vectorize; type PyCsrArray = (Py>, Py>, Py>); @@ -59,15 +60,16 @@ fn result_to_csr(py: Python, x: CsMat) -> PyResult { } #[pyclass] -pub struct _HashingVectorizerWrapper { - inner: vtext::vectorize::HashingVectorizer, +pub struct _HashingVectorizerWrapper<'b> { + inner: vtext::vectorize::HashingVectorizer<'b>, } #[pymethods] -impl _HashingVectorizerWrapper { +impl<'b> _HashingVectorizerWrapper<'b> { #[new] fn new(obj: &PyRawObject) { - let estimator = vtext::vectorize::HashingVectorizer::new(); + let tokenizer = vtext::tokenize::RegexpTokenizer::new("\\b\\w\\w+\\b".to_string()); + let estimator = vtext::vectorize::HashingVectorizer::new(tokenizer); obj.init(_HashingVectorizerWrapper { inner: estimator }); } @@ -83,15 +85,16 @@ impl _HashingVectorizerWrapper { } #[pyclass] -pub struct _CountVectorizerWrapper { - inner: vtext::vectorize::CountVectorizer, +pub struct _CountVectorizerWrapper<'b> { + inner: vtext::vectorize::CountVectorizer<'b>, } #[pymethods] -impl _CountVectorizerWrapper { +impl<'b> _CountVectorizerWrapper<'b> { #[new] fn new(obj: &PyRawObject) { - let estimator = vtext::vectorize::CountVectorizer::new(); + let tokenizer = vtext::tokenize::RegexpTokenizer::new("\\b\\w\\w+\\b".to_string()); + let estimator = vtext::vectorize::CountVectorizer::new(tokenizer); obj.init(_CountVectorizerWrapper { inner: estimator }); } From 8e0887c421aaa93e8bf3f8ebba8ea550705034e7 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 6 Jun 2019 01:51:07 -0500 Subject: [PATCH 08/11] Fix merge conflicts --- python/src/lib.rs | 2 +- src/vectorize/mod.rs | 14 +++----------- src/vectorize/tests.rs | 14 +++++++++----- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/python/src/lib.rs b/python/src/lib.rs index 148b122..dcb38b8 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -69,7 +69,7 @@ impl<'b> _HashingVectorizerWrapper<'b> { fn new(obj: &PyRawObject, n_jobs: usize) { let tokenizer = vtext::tokenize::RegexpTokenizer::new("\\b\\w\\w+\\b".to_string()); let estimator = vtext::vectorize::HashingVectorizer::new(tokenizer).n_jobs(n_jobs); - + obj.init(_HashingVectorizerWrapper { inner: estimator }); } diff --git a/src/vectorize/mod.rs b/src/vectorize/mod.rs index e60f0a1..6e18810 100644 --- a/src/vectorize/mod.rs +++ b/src/vectorize/mod.rs @@ -90,14 +90,6 @@ fn _sum_duplicates(tf: &mut CSRArray, indices_local: &[i32], nnz: &mut usize) { tf.indptr.push(*nnz); } -#[derive(Debug)] -pub struct HashingVectorizer<'b> { - lowercase: bool, - tokenizer: &'b Tokenizer, - n_features: u64, - _n_jobs: usize -} - 
#[derive(Debug)] pub struct CountVectorizer<'b> { lowercase: bool, @@ -105,7 +97,7 @@ pub struct CountVectorizer<'b> { // vocabulary uses i32 indices, to avoid memory copies when converting // to sparse CSR arrays in Python with scipy.sparse pub vocabulary: HashMap, - _n_jobs: usize + _n_jobs: usize, } pub enum Vectorizer {} @@ -287,9 +279,9 @@ impl<'b> CountVectorizer<'b> { } #[derive(Debug)] -pub struct HashingVectorizer { +pub struct HashingVectorizer<'b> { lowercase: bool, - token_pattern: String, + tokenizer: &'b Tokenizer, n_features: u64, _n_jobs: usize, thread_pool: Option, diff --git a/src/vectorize/tests.rs b/src/vectorize/tests.rs index 662021e..6c7f761 100644 --- a/src/vectorize/tests.rs +++ b/src/vectorize/tests.rs @@ -11,7 +11,7 @@ use crate::vectorize::*; fn test_count_vectorizer_simple() { // Example 1 let tokenizer = RegexpTokenizer::new("\\b\\w+\\w\\b".to_string()); - + let documents = vec!["cat dog cat".to_string()]; let mut vect = CountVectorizer::new(&tokenizer); @@ -40,7 +40,9 @@ fn test_count_vectorizer_simple() { fn test_vectorize_empty_countvectorizer() { let documents = vec!["some tokens".to_string(), "".to_string()]; - let mut vect = CountVectorizer::new(); + let tokenizer = RegexpTokenizer::new("\\b\\w+\\w\\b".to_string()); + + let mut vect = CountVectorizer::new(&tokenizer); vect.fit_transform(&documents); vect.fit(&documents); @@ -50,8 +52,9 @@ fn test_vectorize_empty_countvectorizer() { #[test] fn test_vectorize_empty_hashingvectorizer() { let documents = vec!["some tokens".to_string(), "".to_string()]; + let tokenizer = RegexpTokenizer::new("\\b\\w+\\w\\b".to_string()); - let vect = HashingVectorizer::new(); + let vect = HashingVectorizer::new(&tokenizer); vect.fit_transform(&documents); vect.transform(&documents); @@ -59,12 +62,13 @@ fn test_vectorize_empty_hashingvectorizer() { #[test] fn test_count_vectorizer_fit_transform() { + let tokenizer = RegexpTokenizer::new("\\b\\w+\\w\\b".to_string()); for documents in &[vec!["cat dog cat".to_string()]] { - let mut vect = CountVectorizer::new(); + let mut vect = CountVectorizer::new(&tokenizer); vect.fit(&documents); let X = vect.transform(&documents); - let mut vect2 = CountVectorizer::new(); + let mut vect2 = CountVectorizer::new(&tokenizer); let X2 = vect2.fit_transform(&documents); assert_eq!(vect.vocabulary, vect2.vocabulary); println!("{:?}", vect.vocabulary); From c191b0d302824d7a20cfbf086963741e2b8aae31 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 7 Jun 2019 13:46:47 -0500 Subject: [PATCH 09/11] Try to fix send issues --- src/vectorize/mod.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/vectorize/mod.rs b/src/vectorize/mod.rs index 340481c..a5fb341 100644 --- a/src/vectorize/mod.rs +++ b/src/vectorize/mod.rs @@ -22,7 +22,7 @@ let documents = vec![ let tokenizer = VTextTokenizer::new("en"); -let mut vectorizer = CountVectorizer::new(&tokenizer); +let mut vectorizer = CountVectorizer::new(); let X = vectorizer.fit_transform(&documents); // returns a sparse CSR matrix with document-terms counts */ @@ -91,9 +91,9 @@ fn _sum_duplicates(tf: &mut CSRArray, indices_local: &[i32], nnz: &mut usize) { } #[derive(Debug)] -pub struct CountVectorizer<'b> { +pub struct CountVectorizer { lowercase: bool, - tokenizer: &'b Tokenizer, + //tokenizer: d Tokenizer + Sync, // vocabulary uses i32 indices, to avoid memory copies when converting // to sparse CSR arrays in Python with scipy.sparse pub vocabulary: HashMap, @@ -102,12 +102,11 @@ pub struct 
CountVectorizer<'b> { pub enum Vectorizer {} -impl<'b> CountVectorizer<'b> { +impl CountVectorizer { /// Initialize a CountVectorizer estimator - pub fn new(tokenizer: &'b Tokenizer) -> Self { + pub fn new() -> Self { CountVectorizer { lowercase: true, - tokenizer: tokenizer, vocabulary: HashMap::with_capacity_and_hasher(1000, Default::default()), _n_jobs: 1, } @@ -179,13 +178,14 @@ impl<'b> CountVectorizer<'b> { tf.indptr.push(0); let mut nnz: usize = 0; + let tokenizer = tokenize::RegexpTokenizer::new(TOKEN_PATTERN_DEFAULT.to_string()); let tokenize_map = |doc: &str| -> Vec { // Closure to tokenize a document and returns hash indices for each token let mut indices_local: Vec = Vec::with_capacity(10); - for token in self.tokenizer.tokenize(doc) { + for token in tokenizer.tokenize(doc) { if let Some(_id) = self.vocabulary.get(token) { indices_local.push(*_id) }; @@ -279,17 +279,17 @@ impl<'b> CountVectorizer<'b> { } #[derive(Debug)] -pub struct HashingVectorizer<'b> { +pub struct HashingVectorizer { lowercase: bool, - tokenizer: &'b Tokenizer, + tokenizer: &Tokenizer + Sync, n_features: u64, _n_jobs: usize, thread_pool: Option, } -impl<'b> HashingVectorizer<'b> { +impl HashingVectorizer { /// Create a new HashingVectorizer estimator - pub fn new(tokenizer: &'b Tokenizer) -> Self { + pub fn new(tokenizer: &Tokenizer + Sync) -> Self { HashingVectorizer { lowercase: true, tokenizer: tokenizer, From 45657db540070d37397457d39d7d2ca69e8d0112 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 7 Jun 2019 16:35:00 -0500 Subject: [PATCH 10/11] Static dispatch in rust lib --- src/tokenize/mod.rs | 7 ++++--- src/vectorize/mod.rs | 33 ++++++++++++++------------------- src/vectorize/tests.rs | 36 ++++++++++++++++++------------------ 3 files changed, 36 insertions(+), 40 deletions(-) diff --git a/src/tokenize/mod.rs b/src/tokenize/mod.rs index a074331..fa570cf 100644 --- a/src/tokenize/mod.rs +++ b/src/tokenize/mod.rs @@ -63,6 +63,7 @@ pub trait Tokenizer: fmt::Debug { /// Regular expression tokenizer /// +#[derive(Clone)] pub struct RegexpTokenizer { pub pattern: String, regexp: Regex, @@ -98,7 +99,7 @@ impl fmt::Debug for RegexpTokenizer { /// ## References /// /// * [Unicode® Standard Annex #29](http://www.unicode.org/reports/tr29/) -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct UnicodeSegmentTokenizer { pub word_bounds: bool, } @@ -135,7 +136,7 @@ impl Tokenizer for UnicodeSegmentTokenizer { /// ## References /// /// * [Unicode® Standard Annex #29](http://www.unicode.org/reports/tr29/) -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct VTextTokenizer { pub lang: String, } @@ -270,7 +271,7 @@ impl Tokenizer for VTextTokenizer { } /// Character tokenizer -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct CharacterTokenizer { pub window_size: usize, } diff --git a/src/vectorize/mod.rs b/src/vectorize/mod.rs index a5fb341..afa57fc 100644 --- a/src/vectorize/mod.rs +++ b/src/vectorize/mod.rs @@ -22,13 +22,12 @@ let documents = vec![ let tokenizer = VTextTokenizer::new("en"); -let mut vectorizer = CountVectorizer::new(); +let mut vectorizer = CountVectorizer::new(tokenizer); let X = vectorizer.fit_transform(&documents); // returns a sparse CSR matrix with document-terms counts */ use crate::math::CSRArray; -use crate::tokenize; use crate::tokenize::Tokenizer; use hashbrown::{HashMap, HashSet}; use itertools::sorted; @@ -91,9 +90,9 @@ fn _sum_duplicates(tf: &mut CSRArray, indices_local: &[i32], nnz: &mut usize) { } #[derive(Debug)] -pub struct CountVectorizer { +pub struct CountVectorizer 
{ lowercase: bool, - //tokenizer: d Tokenizer + Sync, + tokenizer: T, // vocabulary uses i32 indices, to avoid memory copies when converting // to sparse CSR arrays in Python with scipy.sparse pub vocabulary: HashMap, @@ -102,13 +101,14 @@ pub struct CountVectorizer { pub enum Vectorizer {} -impl CountVectorizer { +impl CountVectorizer { /// Initialize a CountVectorizer estimator - pub fn new() -> Self { + pub fn new(tokenizer: T) -> Self { CountVectorizer { lowercase: true, vocabulary: HashMap::with_capacity_and_hasher(1000, Default::default()), _n_jobs: 1, + tokenizer, } } @@ -127,14 +127,12 @@ impl CountVectorizer { /// /// This lists the vocabulary pub fn fit(&mut self, X: &[String]) -> () { - let tokenizer = tokenize::RegexpTokenizer::new(TOKEN_PATTERN_DEFAULT.to_string()); - let tokenize = |X: &[String]| -> HashSet { let mut _vocab: HashSet = HashSet::with_capacity(1000); for doc in X { let doc = doc.to_ascii_lowercase(); - let tokens = tokenizer.tokenize(&doc); + let tokens = self.tokenizer.tokenize(&doc); for token in tokens { if !_vocab.contains(token) { @@ -178,14 +176,13 @@ impl CountVectorizer { tf.indptr.push(0); let mut nnz: usize = 0; - let tokenizer = tokenize::RegexpTokenizer::new(TOKEN_PATTERN_DEFAULT.to_string()); let tokenize_map = |doc: &str| -> Vec { // Closure to tokenize a document and returns hash indices for each token let mut indices_local: Vec = Vec::with_capacity(10); - for token in tokenizer.tokenize(doc) { + for token in self.tokenizer.tokenize(doc) { if let Some(_id) = self.vocabulary.get(token) { indices_local.push(*_id) }; @@ -241,14 +238,12 @@ impl CountVectorizer { let mut nnz: usize = 0; let mut indices_local: Vec = Vec::new(); - let tokenizer = tokenize::RegexpTokenizer::new(TOKEN_PATTERN_DEFAULT.to_string()); - let pipe = X.iter().map(|doc| doc.to_ascii_lowercase()); let mut vocabulary_size: i32 = 0; for document in pipe { - let tokens = tokenizer.tokenize(&document); + let tokens = self.tokenizer.tokenize(&document); indices_local.clear(); @@ -279,23 +274,23 @@ impl CountVectorizer { } #[derive(Debug)] -pub struct HashingVectorizer { +pub struct HashingVectorizer { lowercase: bool, - tokenizer: &Tokenizer + Sync, + tokenizer: T, n_features: u64, _n_jobs: usize, thread_pool: Option, } -impl HashingVectorizer { +impl HashingVectorizer { /// Create a new HashingVectorizer estimator - pub fn new(tokenizer: &Tokenizer + Sync) -> Self { + pub fn new(tokenizer: T) -> Self { HashingVectorizer { lowercase: true, - tokenizer: tokenizer, n_features: 1048576, _n_jobs: 1, thread_pool: None, + tokenizer, } } diff --git a/src/vectorize/tests.rs b/src/vectorize/tests.rs index 6c7f761..48ec33c 100644 --- a/src/vectorize/tests.rs +++ b/src/vectorize/tests.rs @@ -13,7 +13,7 @@ fn test_count_vectorizer_simple() { let tokenizer = RegexpTokenizer::new("\\b\\w+\\w\\b".to_string()); let documents = vec!["cat dog cat".to_string()]; - let mut vect = CountVectorizer::new(&tokenizer); + let mut vect = CountVectorizer::new(tokenizer.clone()); let X = vect.fit_transform(&documents); assert_eq!(X.to_dense(), array![[2, 1]]); @@ -24,7 +24,7 @@ fn test_count_vectorizer_simple() { "The sky sky sky is blue".to_string(), ]; let X_ref = array![[0, 1, 0, 1, 1, 2], [1, 0, 1, 0, 3, 1]]; - let mut vect = CountVectorizer::new(&tokenizer); + let mut vect = CountVectorizer::new(tokenizer); let X = vect.fit_transform(&documents); assert_eq!(X.to_dense().shape(), X_ref.shape()); @@ -42,7 +42,7 @@ fn test_vectorize_empty_countvectorizer() { let tokenizer = 
RegexpTokenizer::new("\\b\\w+\\w\\b".to_string()); - let mut vect = CountVectorizer::new(&tokenizer); + let mut vect = CountVectorizer::new(tokenizer); vect.fit_transform(&documents); vect.fit(&documents); @@ -54,7 +54,7 @@ fn test_vectorize_empty_hashingvectorizer() { let documents = vec!["some tokens".to_string(), "".to_string()]; let tokenizer = RegexpTokenizer::new("\\b\\w+\\w\\b".to_string()); - let vect = HashingVectorizer::new(&tokenizer); + let vect = HashingVectorizer::new(tokenizer); vect.fit_transform(&documents); vect.transform(&documents); @@ -64,11 +64,11 @@ fn test_vectorize_empty_hashingvectorizer() { fn test_count_vectorizer_fit_transform() { let tokenizer = RegexpTokenizer::new("\\b\\w+\\w\\b".to_string()); for documents in &[vec!["cat dog cat".to_string()]] { - let mut vect = CountVectorizer::new(&tokenizer); + let mut vect = CountVectorizer::new(tokenizer.clone()); vect.fit(&documents); let X = vect.transform(&documents); - let mut vect2 = CountVectorizer::new(&tokenizer); + let mut vect2 = CountVectorizer::new(tokenizer.clone()); let X2 = vect2.fit_transform(&documents); assert_eq!(vect.vocabulary, vect2.vocabulary); println!("{:?}", vect.vocabulary); @@ -95,7 +95,7 @@ fn test_hashing_vectorizer_simple() { let tokenizer = VTextTokenizer::new("en"); - let vect = HashingVectorizer::new(&tokenizer); + let vect = HashingVectorizer::new(tokenizer); let vect = vect.fit(&documents); let X = vect.transform(&documents); assert_eq!(X.indptr(), &[0, 4, 8]); @@ -125,14 +125,14 @@ fn test_empty_dataset() { let documents: Vec = vec![]; let tokenizer = VTextTokenizer::new("en"); - let mut vectorizer = CountVectorizer::new(&tokenizer); + let mut vectorizer = CountVectorizer::new(tokenizer.clone()); let X = vectorizer.fit_transform(&documents); assert_eq!(X.data(), &[]); assert_eq!(X.indices(), &[]); assert_eq!(X.indptr(), &[0]); - let vectorizer = HashingVectorizer::new(&tokenizer); + let vectorizer = HashingVectorizer::new(tokenizer); let X = vectorizer.fit_transform(&documents); assert_eq!(X.data(), &[]); @@ -141,20 +141,20 @@ fn test_empty_dataset() { } #[test] -fn test_dynamic_dispatch_tokenizer() { +fn test_dispatch_tokenizer() { let tokenizer = VTextTokenizer::new("en"); - CountVectorizer::new(&tokenizer); - HashingVectorizer::new(&tokenizer); + CountVectorizer::new(tokenizer.clone()); + HashingVectorizer::new(tokenizer); let tokenizer = UnicodeSegmentTokenizer::new(false); - CountVectorizer::new(&tokenizer); - HashingVectorizer::new(&tokenizer); + CountVectorizer::new(tokenizer.clone()); + HashingVectorizer::new(tokenizer); let tokenizer = RegexpTokenizer::new("\\b\\w+\\w\\b".to_string()); - CountVectorizer::new(&tokenizer); - HashingVectorizer::new(&tokenizer); + CountVectorizer::new(tokenizer.clone()); + HashingVectorizer::new(tokenizer); let tokenizer = CharacterTokenizer::new(4); - CountVectorizer::new(&tokenizer); - HashingVectorizer::new(&tokenizer); + CountVectorizer::new(tokenizer.clone()); + HashingVectorizer::new(tokenizer); } From a9bc3cdf72ef5e4efd9c6d3c71fd41193df52266 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 7 Jun 2019 17:09:46 -0500 Subject: [PATCH 11/11] Fix python wrappe --- python/src/lib.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/src/lib.rs b/python/src/lib.rs index dcb38b8..55a72ea 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -58,12 +58,12 @@ fn result_to_csr(py: Python, x: CsMat) -> PyResult { } #[pyclass] -pub struct _HashingVectorizerWrapper<'b> { - inner: 
vtext::vectorize::HashingVectorizer<'b>,
+pub struct _HashingVectorizerWrapper {
+    inner: vtext::vectorize::HashingVectorizer<vtext::tokenize::RegexpTokenizer>,
 }

 #[pymethods]
-impl<'b> _HashingVectorizerWrapper<'b> {
+impl _HashingVectorizerWrapper {
     #[new]
     #[args(n_jobs = 1)]
     fn new(obj: &PyRawObject, n_jobs: usize) {
@@ -85,14 +85,14 @@ impl _HashingVectorizerWrapper {
 }

 #[pyclass]
-pub struct _CountVectorizerWrapper<'b> {
-    inner: vtext::vectorize::CountVectorizer<'b>,
+pub struct _CountVectorizerWrapper {
+    inner: vtext::vectorize::CountVectorizer<vtext::tokenize::RegexpTokenizer>,
 }

 #[pymethods]
-impl<'b> _CountVectorizerWrapper<'b> {
+impl _CountVectorizerWrapper {
     #[new]
-
+    #[args(n_jobs = 1)]
     fn new(obj: &PyRawObject, n_jobs: usize) {
         let tokenizer = vtext::tokenize::RegexpTokenizer::new("\\b\\w\\w+\\b".to_string());
         let estimator = vtext::vectorize::CountVectorizer::new(tokenizer).n_jobs(n_jobs);
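
After this series, both `CountVectorizer` and `HashingVectorizer` take the tokenizer as a generic type parameter (static dispatch) instead of building a `RegexpTokenizer` internally, and the Python wrappers pin that parameter to `RegexpTokenizer`. The sketch below shows the resulting Rust API; it is a minimal example assembled from the doc example added in patch 06 and the unit tests updated in patch 10, so the corpus strings come from those tests and the printed non-zero counts are only for illustration.

```rust
extern crate vtext;

use vtext::tokenize::{Tokenizer, VTextTokenizer};
use vtext::vectorize::{CountVectorizer, HashingVectorizer};

fn main() {
    let documents = vec![
        String::from("The Moon is an astronomical body orbiting Earth"),
        String::from("The sky is blue"),
    ];

    // The tokenizer is moved into the vectorizer; the Clone derives added in
    // patch 10 let the same tokenizer be reused for a second estimator.
    let tokenizer = VTextTokenizer::new("en");

    // CountVectorizer learns an explicit vocabulary during fit.
    let mut count_vect = CountVectorizer::new(tokenizer.clone());
    let x_counts = count_vect.fit_transform(&documents);

    // HashingVectorizer is stateless: tokens are hashed into a fixed-size space.
    let hash_vect = HashingVectorizer::new(tokenizer);
    let x_hashed = hash_vect.fit_transform(&documents);

    // Both return sparse CSR matrices with document-term counts.
    println!("count matrix: {} non-zeros", x_counts.nnz());
    println!("hashed matrix: {} non-zeros", x_hashed.nnz());
}
```

Because the tokenizer is now stored by value, this is what the `tokenizer.clone()` calls in the patch 10 tests are for. Judging by the subject of patch 09 ("Try to fix send issues"), the earlier borrowed `&Tokenizer` design was dropped because the trait object did not satisfy the `Send`/`Sync` bounds required once the rayon thread pool is involved.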