From 691afca4d88e789fc4ffd0878a24731a45e09b3a Mon Sep 17 00:00:00 2001 From: Andrew DalPino Date: Mon, 26 Apr 2021 13:02:43 -0500 Subject: [PATCH] Move Token Hashing Vectorizer to main package --- README.md | 2 +- .../TokenHashingVectorizerBench.php | 64 --------- composer.json | 2 +- src/Transformers/TokenHashingVectorizer.php | 124 ------------------ .../TokenHashingVectorizerTest.php | 63 --------- 5 files changed, 2 insertions(+), 253 deletions(-) delete mode 100644 benchmarks/Transformers/TokenHashingVectorizerBench.php delete mode 100644 src/Transformers/TokenHashingVectorizer.php delete mode 100644 tests/Transformers/TokenHashingVectorizerTest.php diff --git a/README.md b/README.md index 26aa23d..a89521e 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ $ composer require rubix/extras ``` ### Requirements -- [PHP](https://php.net/manual/en/install.php) 7.2 or above +- [PHP](https://php.net/manual/en/install.php) 7.4 or above ## License The code is licensed [MIT](LICENSE) and the documentation is licensed [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/). diff --git a/benchmarks/Transformers/TokenHashingVectorizerBench.php b/benchmarks/Transformers/TokenHashingVectorizerBench.php deleted file mode 100644 index 8540b4a..0000000 --- a/benchmarks/Transformers/TokenHashingVectorizerBench.php +++ /dev/null @@ -1,64 +0,0 @@ -dataset = Unlabeled::quick($samples); - - $this->transformer = new TokenHashingVectorizer(1000); - } - - /** - * @Subject - * @Iterations(3) - * @OutputTimeUnit("milliseconds", precision=3) - */ - public function apply() : void - { - $this->dataset->apply($this->transformer); - } -} diff --git a/composer.json b/composer.json index cf7518b..bc0fcdc 100644 --- a/composer.json +++ b/composer.json @@ -31,7 +31,7 @@ "require-dev": { "friendsofphp/php-cs-fixer": "2.18.*", "league/flysystem-memory": "^2.0", - "phpbench/phpbench": "1.0.0-alpha6", + "phpbench/phpbench": "1.0.0-alpha8", "phpstan/extension-installer": "^1.0", "phpstan/phpstan": "0.12.*", "phpstan/phpstan-phpunit": "0.12.*", diff --git a/src/Transformers/TokenHashingVectorizer.php b/src/Transformers/TokenHashingVectorizer.php deleted file mode 100644 index 40c7a64..0000000 --- a/src/Transformers/TokenHashingVectorizer.php +++ /dev/null @@ -1,124 +0,0 @@ - self::MAX_DIMENSIONS) { - throw new InvalidArgumentException('Dimensions must be' - . ' between 0 and ' . self::MAX_DIMENSIONS - . ", $dimensions given."); - } - - $this->dimensions = $dimensions; - $this->tokenizer = $tokenizer ?? new Word(); - } - - /** - * Return the data types that this transformer is compatible with. - * - * @return \Rubix\ML\DataType[] - */ - public function compatibility() : array - { - return DataType::all(); - } - - /** - * Transform the dataset in place. - * - * @param array[] $samples - */ - public function transform(array &$samples) : void - { - $scale = $this->dimensions / self::MAX_DIMENSIONS; - - foreach ($samples as &$sample) { - $vectors = []; - - foreach ($sample as $column => $value) { - if (is_string($value)) { - $template = array_fill(0, $this->dimensions, 0); - - $tokens = $this->tokenizer->tokenize($value); - - $counts = array_count_values($tokens); - - foreach ($counts as $token => $count) { - $offset = (int) floor(crc32($token) * $scale); - - $template[$offset] += $count; - } - - $vectors[] = $template; - - unset($sample[$column]); - } - } - - $sample = array_merge($sample, ...$vectors); - } - } - - /** - * Return the string representation of the object. - * - * @return string - */ - public function __toString() : string - { - return "Token Hashing Vectorizer (dimensions: {$this->dimensions}," - . " tokenizer: {$this->tokenizer})"; - } -} diff --git a/tests/Transformers/TokenHashingVectorizerTest.php b/tests/Transformers/TokenHashingVectorizerTest.php deleted file mode 100644 index e5c7010..0000000 --- a/tests/Transformers/TokenHashingVectorizerTest.php +++ /dev/null @@ -1,63 +0,0 @@ -dataset = Unlabeled::quick([ - ['the quick brown fox jumped over the lazy man sitting at a bus stop drinking a can of coke'], - ['with a dandy umbrella'], - ]); - - $this->transformer = new TokenHashingVectorizer(20, new Word()); - } - - /** - * @test - */ - public function build() : void - { - $this->assertInstanceOf(TokenHashingVectorizer::class, $this->transformer); - $this->assertInstanceOf(Transformer::class, $this->transformer); - } - - /** - * @test - */ - public function transform() : void - { - $this->dataset->apply($this->transformer); - - $outcome = [ - [1, 1, 0, 1, 2, 0, 0, 1, 3, 0, 0, 1, 0, 0, 2, 1, 0, 1, 5, 0], - [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0], - ]; - - $this->assertEquals($outcome, $this->dataset->samples()); - } -}