-
Notifications
You must be signed in to change notification settings - Fork 65
/
Copy pathMLlibSentimentAnalyzer.scala
77 lines (70 loc) · 2.95 KB
/
MLlibSentimentAnalyzer.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
package org.p7h.spark.sentiment.mllib
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.mllib.classification.NaiveBayesModel
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.linalg.Vector
/**
  * Helpers for scoring tweet sentiment with a Spark MLlib Naive Bayes model:
  * tweet clean-up/tokenization, term-frequency hashing and label normalization.
  */
object MLlibSentimentAnalyzer {

  /**
    * Predicts the sentiment of a tweet with the supplied Naive Bayes model.
    * The text is first reduced to plain word tokens (see [[getBarebonesTweetText]]),
    * hashed into a feature vector (see [[transformFeatures]]) and the predicted
    * label is then mapped to -1 / 0 / 1 (see [[normalizeMLlibSentiment]]).
    *
    * @param text          -- Complete text of a tweet.
    * @param stopWordsList -- Broadcast variable for list of stop words to be removed from the tweets.
    * @param model         -- Naive Bayes Model of the trained data.
    * @return Int Sentiment of the tweet: -1 (negative), 0 (neutral) or 1 (positive).
    */
  def computeSentiment(text: String, stopWordsList: Broadcast[List[String]], model: NaiveBayesModel): Int = {
    val tweetInWords: Seq[String] = getBarebonesTweetText(text, stopWordsList.value)
    val polarity = model.predict(transformFeatures(tweetInWords))
    normalizeMLlibSentiment(polarity)
  }

  /**
    * Normalize sentiment for visualization perspective.
    * We are normalizing sentiment as we need to be consistent with the polarity value with Core NLP
    * and for visualization.
    *
    * @param sentiment polarity label of the tweet as predicted by the model (0, 2 or 4).
    * @return normalized to either -1, 0 or 1 based on tweet being negative, neutral and positive.
    *         Any other label is treated as neutral (0).
    */
  def normalizeMLlibSentiment(sentiment: Double): Int =
    sentiment match {
      case 0.0 => -1 // negative
      case 2.0 => 0 // neutral
      case 4.0 => 1 // positive
      case _ => 0 // if the sentiment cannot be determined, term it as neutral
    }

  /**
    * Strips URLs, @mentions, #hashtags and the retweet marker from a tweet, splits the rest
    * into purely alphabetic lower-case words and drops the stop words.
    *
    * @param tweetText     -- Complete text of a tweet.
    * @param stopWordsList -- List of stop words to be removed from the tweet; compared against
    *                      the lower-cased tokens.
    * @return Seq[String] of the remaining words, in their original order.
    */
  def getBarebonesTweetText(tweetText: String, stopWordsList: List[String]): Seq[String] = {
    // Hash-based lookup instead of a linear List scan for every token.
    val stopWords = stopWordsList.toSet

    // Every removal substitutes a blank rather than an empty string so that the words on either
    // side of the removed fragment can never fuse into one token; surplus whitespace (including
    // line breaks) is harmless because the text is split on runs of non-word characters below.
    tweetText
      // Locale-independent lower-casing: the default-locale variant can produce non-ASCII
      // letters (e.g. dotless i under a Turkish locale) which the alphabetic filter would drop.
      .toLowerCase(java.util.Locale.ROOT)
      // URLs, with or without the colon after the scheme, up to the next whitespace so that
      // query strings and fragments are removed along with the host and path.
      .replaceAll("(?:https?|htt):?//\\S+", " ")
      // @mentions
      .replaceAll("@\\w+", " ")
      // #hashtags
      .replaceAll("#\\w+", " ")
      // Retweet marker; the word boundary keeps words such as "start" or "art" intact.
      .replaceAll("\\brt\\s+", " ")
      .split("\\W+")
      // Keep purely alphabetic tokens only (drops empty strings, numbers, mixed tokens).
      .filter(_.matches("^[a-zA-Z]+$"))
      .filterNot(stopWords.contains)
      .toSeq
  }

  /** Default-constructed term-frequency hasher used by [[transformFeatures]]. */
  val hashingTF: HashingTF = new HashingTF()

  /**
    * Transforms features to Vectors.
    *
    * @param tweetText -- Words of a tweet, e.g. the output of [[getBarebonesTweetText]].
    * @return Vector produced by `hashingTF.transform` for the given words.
    */
  def transformFeatures(tweetText: Seq[String]): Vector = {
    hashingTF.transform(tweetText)
  }
}