-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathAGG Cluster.py
59 lines (51 loc) · 2.16 KB
/
AGG Cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import collections
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
import csv
import pandas
def word_tokenizer(text):
    """Tokenize ``text`` and return the Porter stems of its non-stopword tokens.

    Args:
        text: The string to split into word tokens.

    Returns:
        A list of stemmed tokens, in their original order, with every token
        that appears in NLTK's English stopword list removed. Stopword
        matching is exact (case-sensitive) and is done before stemming.
    """
    # Build the stopword set once per call. Calling stopwords.words() inside
    # the comprehension condition would rebuild the whole list, and scan it
    # linearly, for every single token.
    english_stopwords = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    return [stemmer.stem(token)
            for token in word_tokenize(text)
            if token not in english_stopwords]
def cluster_sentences(sentences, nb_of_clusters=5):
    """Group sentences into clusters using TF-IDF features.

    Args:
        sentences: Sequence of strings to cluster.
        nb_of_clusters: Number of clusters to produce.

    Returns:
        A dict mapping each cluster label (int) to the list of indices into
        ``sentences`` that were assigned to that cluster.
    """
    # Imported at function scope so this block is self-contained and does not
    # rely on the file-level import list being edited.
    from sklearn.cluster import AgglomerativeClustering

    tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenizer,
                                       stop_words=stopwords.words('english'),
                                       max_df=0.9,
                                       min_df=0.05,
                                       lowercase=True)
    # Build the TF-IDF matrix (one row per sentence) and densify it as a plain
    # ndarray; .todense() would yield a numpy.matrix, which current
    # scikit-learn estimators refuse.
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences).toarray()

    # AgglomerativeClustering groups the rows (sentences). The previously used
    # FeatureAgglomeration groups the columns (vocabulary terms), so its
    # labels_ could not be used as sentence indices.
    model = AgglomerativeClustering(n_clusters=nb_of_clusters)
    model.fit(tfidf_matrix)

    clusters = collections.defaultdict(list)
    for sentence_index, label in enumerate(model.labels_):
        # Cast the NumPy integer label to a plain int so the keys are
        # ordinary Python ints.
        clusters[int(label)].append(sentence_index)
    return dict(clusters)
import csv


def flatten(l):
    """Concatenate the sub-lists of ``l`` into a single flat list."""
    return [item for sublist in l for item in sublist]


# Text to cluster: every cell of the CSV, flattened in row order.
# newline='' is required by the csv module so that quoted fields containing
# line breaks are parsed correctly.
with open(r"PATH", newline='') as f:  # add the path to the CSV file
    reader = csv.reader(f)
    Pre_sentence = list(reader)
sentences = flatten(Pre_sentence)

# Pre-labeled data set, read the same way, to allow comparison against the
# clusters. Presumably holds one label per entry of `sentences`, in the same
# order -- verify against the actual files.
with open(r'Path', newline='') as g:
    reader_cat = csv.reader(g)
    Pre_Cat = list(reader_cat)
Cats = flatten(Pre_Cat)
if __name__ == "__main__":
    # Example data set, uncomment to use
    # sentences = ["Nature is beautiful","I like green apples",
    #         "We should protect the trees","Fruit trees provide fruits",
    #         "Green apples are tasty","My name is Dami"]
    cluster_count = 19
    grouped = cluster_sentences(sentences, cluster_count)
    # Print each cluster in label order, listing every member's pre-assigned
    # category next to its text.
    for cluster_id in range(cluster_count):
        print("Grouped Engagements ", cluster_id, ":")
        for member_index in grouped[cluster_id]:
            print("\tEngagement ", Cats[member_index], ": ",
                  sentences[member_index])