-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathvsm.py
86 lines (76 loc) · 2.95 KB
/
vsm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
In this module we are going to code a very simple Outlier Detection system, in particular Novelty Detection.
For reference, visit http://scikit-learn.org/stable/modules/outlier_detection.html
Author: Nilesh Chaudhari(github:nonlocal)
Date : 28/Jan/18/Sun
Copyright (c) 2018-* Nilesh Chaudhari aka nonlocal.
All Rights Reserved.
"""
from __future__ import division
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models.word2vec import Word2Vec
from data import data_model
import numpy as np
# This module contains the Vector Space Models (VSMs, i.e. feature representations)
# that are relevant to our case.
# It provides the following VSMs:
# 1. Count Vectorizer
# 2. TF-IDF
# 3. word2vec
def bar():
    """Print the placeholder message ``bar..`` to standard output."""
    message = "bar.."
    print(message)
class Tfidf(TfidfVectorizer):
    """TF-IDF vector space model that is fitted at construction time.

    Every document is passed through ``data_model.preprocess`` before it is
    handed to the underlying ``TfidfVectorizer`` (constructed with its
    default settings), both when fitting and when vectorizing.
    """

    def __init__(self, documents):
        """Store ``documents`` and immediately fit the vectorizer on them.

        documents: iterable of raw documents; each one is passed to
            ``data_model.preprocess`` before fitting.
        """
        super(Tfidf, self).__init__()
        self.documents = documents
        self.preprocess = data_model.preprocess
        # Fitting happens eagerly so the instance is usable right away.
        self.train()

    def train(self):
        """Preprocess the stored documents and fit the vectorizer on them."""
        documents = [self.preprocess(doc) for doc in self.documents]
        self.fit(documents)
        # Call form of print: same output on Python 2, valid on Python 3
        # (the bare ``print "..."`` statement is a SyntaxError there).
        print("TFIDF fitted to given documents")

    def vectorize_doc(self, doc):
        """Return the dense TF-IDF array for a single raw document.

        The document is preprocessed, transformed as a one-element batch and
        converted from the sparse result to a dense array via ``toarray()``.
        """
        doc = self.preprocess(doc)
        return self.transform([doc]).toarray()

    def vectorize_docs(self, docs):
        """Return the dense TF-IDF array for an iterable of raw documents.

        Each document is preprocessed before the whole batch is transformed
        and converted to a dense array via ``toarray()``.
        """
        docs = [self.preprocess(doc) for doc in docs]
        return self.transform(docs).toarray()
class CountVect(CountVectorizer):
    """Term-count vector space model that is fitted at construction time.

    Every document is passed through ``data_model.preprocess`` before it is
    handed to the underlying ``CountVectorizer`` (constructed with its
    default settings), both when fitting and when vectorizing.
    """

    def __init__(self, documents):
        """Store ``documents`` and immediately fit the vectorizer on them.

        documents: iterable of raw documents; each one is passed to
            ``data_model.preprocess`` before fitting.
        """
        super(CountVect, self).__init__()
        self.documents = documents
        self.preprocess = data_model.preprocess
        # Fitting happens eagerly so the instance is usable right away.
        self.train()

    def train(self):
        """Preprocess the stored documents and fit the vectorizer on them."""
        documents = [self.preprocess(doc) for doc in self.documents]
        self.fit(documents)
        # Call form of print: same output on Python 2, valid on Python 3
        # (the bare ``print "..."`` statement is a SyntaxError there).
        print("CountVectorizer fitted to given documents")

    def vectorize_doc(self, doc):
        """Return the dense count array for a single raw document.

        The document is preprocessed, transformed as a one-element batch and
        converted from the sparse result to a dense array via ``toarray()``.
        """
        doc = self.preprocess(doc)
        return self.transform([doc]).toarray()

    def vectorize_docs(self, docs):
        """Return the dense count array for an iterable of raw documents.

        Each document is preprocessed before the whole batch is transformed
        and converted to a dense array via ``toarray()``.
        """
        docs = [self.preprocess(doc) for doc in docs]
        return self.transform(docs).toarray()
class W2V(object):
    """Word2Vec-based document representation.

    A gensim ``Word2Vec`` model is trained at construction time on the
    preprocessed, whitespace-tokenized documents. A document is then
    represented by the mean of the vectors of its in-vocabulary tokens.
    """

    def __init__(self, documents):
        """Store ``documents`` and immediately train the Word2Vec model.

        documents: iterable of raw documents; each one is passed to
            ``data_model.preprocess`` and split on whitespace for training.
        """
        super(W2V, self).__init__()
        self.documents = documents
        self.preprocess = data_model.preprocess
        self.model = None
        self.train()

    def train(self):
        """Train a default-configured ``Word2Vec`` model on the documents."""
        # Each whole document is passed to word2vec as a single "sentence";
        # crude, but the documents carry no sentence segmentation here.
        documents = [self.preprocess(doc).split() for doc in self.documents]
        self.model = Word2Vec(sentences=documents)
        # Call form of print: same output on Python 2, valid on Python 3.
        print("Word2Vec Model trained!")

    def vectorize_doc(self, doc):
        """Return the mean word vector of ``doc``.

        Tokens that are not in the model's vocabulary are ignored. When no
        token of the document is known (including an empty document), a zero
        vector of length ``model.vector_size`` is returned instead of NaN,
        so every document maps to a vector of the same shape.
        """
        tokens = self.preprocess(doc).split()
        keyed_vectors = self.model.wv
        # Membership test on the keyed vectors themselves: the ``.vocab``
        # attribute no longer exists in gensim 4.
        known = [keyed_vectors[token] for token in tokens
                 if token in keyed_vectors]
        if not known:
            # float32 matches the dtype gensim uses for word vectors, so a
            # batch containing such a document is not upcast when stacked.
            return np.zeros(self.model.vector_size, dtype=np.float32)
        return np.array(known).mean(axis=0)

    def vectorize_docs(self, docs):
        """Return an array with one mean word vector (row) per document."""
        return np.array([self.vectorize_doc(doc) for doc in docs])