-
Notifications
You must be signed in to change notification settings - Fork 0
/
wordvec_test.clj
69 lines (62 loc) · 2.56 KB
/
wordvec_test.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
(ns zensols.nlparse.wordvec-test
(:require [clojure.test :refer :all]
[clojure.java.io :as io]
[clojure.string :as s]
[zensols.util.zip :as zip]
[zensols.nlparse.parse :as p]
[zensols.nlparse.config :as conf :refer (with-context)]
[zensols.nlparse.stopword :as st]
[zensols.nlparse.feature.word-count :as wc]
[zensols.nlparse.feature.word-similarity :as ws]
[clojure.string :as s]))
;; Parse context shared by every test in this namespace.  It is built from a
;; parse configuration whose pipeline lists, in order, the tokenize, sentence,
;; part-of-speech, morphology and stopword components.  `defonce` keeps an
;; already created context from being rebuilt when the namespace is reloaded.
(defonce ^:private parse-context
  (conf/create-context
   (conf/create-parse-config
    :pipeline [(conf/tokenize)
               (conf/sentence)
               (conf/part-of-speech)
               (conf/morphology)
               (conf/stopword)])))
;; Word count configuration used by the tests: the value of
;; `wc/*word-count-config*` at load time with two entries overridden.
;; The word form of a token is its lower-cased `:lemma`.
(def ^:private word-context
  (let [lower-lemma (fn [token]
                      (s/lower-case (:lemma token)))]
    (assoc wc/*word-count-config*
           :word-form-fn lower-lemma
           :words-by-label-count 25)))
(defn- parse
  "Return the result of `p/parse` on `utterance`, evaluated with
  `parse-context` as the context given to `with-context`."
  [utterance]
  (conf/with-context parse-context
    (p/parse utterance)))
(defn clean-tokens
  "Parse `utterance` and return the tokens for which `st/go-word?` is truthy.

  The work is done with `wc/*word-count-config*` bound to `word-context`.
  `filter` is lazy, so the result is fully realized with `doall` before it is
  returned; otherwise the predicate would run only when a caller consumed the
  sequence, after the dynamic binding had already been restored."
  [utterance]
  (binding [wc/*word-count-config* word-context]
    (->> utterance
         parse
         p/tokens
         (filter st/go-word?)
         ;; force evaluation while the binding is still in effect
         doall)))
(defn word-count-feature-stats
  "Compute word count feature statistics from the `books.zip` class path
  resource.

  For every zip entry whose name matches the label pattern, the captured part
  of the name becomes the `:class-label`; at most `corpus-lines` lines of the
  entry are joined with single spaces and parsed to form the `:instance`.
  Entries that do not match yield nil and are dropped before the remaining
  results are handed to `wc/calculate-feature-stats`.

  Everything runs with `wc/*word-count-config*` bound to `word-context`."
  [corpus-lines]
  (binding [wc/*word-count-config* word-context]
    (with-open [zip-stream (io/input-stream (io/resource "books.zip"))]
      (let [per-entry
            (zip/doentries [zip-stream entry-is entry]
              (when-let [label (second (re-find #".*\/(.+).txt" (.getName entry)))]
                (with-open [rdr (io/reader entry-is)]
                  (let [text (s/join " " (take corpus-lines (line-seq rdr)))
                        labeled (hash-map :class-label label
                                          :instance (parse text))]
                    ;; NOTE(review): the :continue/:result shape is presumably
                    ;; what `zip/doentries` expects from its body -- confirm
                    ;; against the macro.
                    (hash-map :continue true :result labeled)))))]
        (wc/calculate-feature-stats (remove nil? per-entry))))))
;;; tests
(deftest similarity-test
  (testing "similarity"
    ;; The similarity of the two words, scaled by 100 and floored, must be
    ;; exactly 78.0.
    (is (= 78.0
           (Math/floor (* (ws/similarity "king" "queen") 100))))))
(deftest similarity-distribution-test
  (testing "similarity-distribution-test"
    (binding [wc/*word-count-config* word-context]
      (let [wc-stats (word-count-feature-stats 5000)
            utterances ["Some say the whale can't open his mouth, but that is a fable"
                        "Doth represent with human countenance"
                        "The hero of the poem."]
            ;; First element of the first item of the distribution;
            ;; presumably the best matching label -- confirm against
            ;; `ws/similarity-distribution`.
            top-label (fn [utterance]
                        (-> utterance
                            clean-tokens
                            (ws/similarity-distribution wc-stats)
                            ffirst))
            labels (map top-label utterances)]
        (is (= ["moby-dick" "divine-comedy" "beowulf"] labels))))))