-
Notifications
You must be signed in to change notification settings - Fork 0
/
Sentiment_scoring.R
55 lines (43 loc) · 2.42 KB
/
Sentiment_scoring.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
## Read the serialized review table and pull out its review-text column
aznv_ch <- readRDS(file = "aznv_ch.Rds")
chreviews <- aznv_ch$text
## Opinion lexicon by Bing Liu (cs.uic.edu/~liub/FBS/sentiment-analysis.html).
## Each file is scanned as whitespace-separated character tokens; everything
## after a ';' on a line is treated as a comment and skipped.
pos_words <- scan(
  file = "positive-words.txt",
  what = "character",
  comment.char = ";"
)
neg_words <- scan(
  file = "negative-words.txt",
  what = "character",
  comment.char = ";"
)
## Sentiment analysis function
##
## Scores each text as (number of words found in pos.words) minus
## (number of words found in neg.words).
##
## Arguments:
##   text_vector  character vector of texts to score, one element per text
##   pos.words    character vector of positive lexicon terms
##   neg.words    character vector of negative lexicon terms
##   .progress    progress-bar name forwarded to plyr::llply(); with the
##                default 'none' no bar is shown and plyr is not needed
##
## Returns a data frame with one row per input text and two columns:
##   sentiment_score  integer score of the text
##   text             the original, uncleaned text
score.sentiment <- function(text_vector, pos.words, neg.words, .progress = "none")
{
  # Score a single text; the lexicons are captured from the enclosing call.
  score_one <- function(text) {
    # Control characters include newlines and tabs. Turn them into spaces
    # rather than deleting them, otherwise words separated only by a line
    # break are fused together and can never match the lexicon.
    text <- gsub("[[:cntrl:]]", " ", text)
    # Strip punctuation and digits, then normalise case.
    text <- gsub("[[:punct:]]", "", text)
    text <- gsub("\\d+", "", text)
    text <- tolower(text)
    # Split into words on runs of whitespace.
    words <- unlist(strsplit(text, "\\s+"))
    # %in% yields TRUE/FALSE per word; summing counts the lexicon hits.
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }

  if (identical(.progress, "none")) {
    # USE.NAMES = FALSE keeps the texts from becoming element names, which
    # would otherwise end up as row names of the result.
    scores <- vapply(text_vector, score_one, integer(1), USE.NAMES = FALSE)
  } else {
    # plyr is only required when a progress bar is requested; the qualified
    # call fails loudly if the package is missing and attaches nothing.
    scores <- as.integer(unlist(
      plyr::llply(text_vector, score_one, .progress = .progress),
      use.names = FALSE
    ))
  }

  # scores is a plain vector, so the column is really named sentiment_score.
  data.frame(sentiment_score = scores, text = text_vector)
}
## Score sentiment for all Chinese restaurant reviews from Nevada and Arizona
ch_sentiment <- score.sentiment(chreviews, pos_words, neg_words)
# NOTE(review): columns 4 and 6 of aznv_ch are presumably the star rating and
# the review text (they are renamed to "stars" and "text" below) -- confirm
# against the table's schema. The score is taken by position (first column).
ch_starsentiment <- cbind(aznv_ch[, c(4, 6)], ch_sentiment[, 1])
colnames(ch_starsentiment) <- c("stars", "text", "sentiment_score")
## Compare the distribution of ratings with that of review sentiment.
## Author's observation: ratings are dominated by 4 and 5 stars, whereas
## sentiment looks approximately normally distributed.
# scale() centres on the mean and divides by the standard deviation, so 0 on
# the x axis is the mean sentiment score, not a raw score of zero.
hist(scale(ch_starsentiment$sentiment_score), breaks = 100,
     main = "Sentiment Distribution All Chinese Reviews",
     xlab = "Scaled Sentiment Score (0 = Mean)")
hist(aznv_ch$stars.x, main = "Ratings Distribution for All Chinese Reviews",
     xlab = "Restaurant Ratings")
## Author's observation: the bell shape persists when only 1- and 5-star
## reviews are kept, i.e. sentiment is not predictive of ratings.
# Base subsetting instead of filter(): this script never loads dplyr, so a
# bare filter() would resolve to stats::filter() and fail. %in% is FALSE for
# NA, so rows with a missing rating are dropped.
extreme_stars <- ch_starsentiment[ch_starsentiment$stars %in% c(1, 5), ]
hist(scale(extreme_stars$sentiment_score), breaks = 100,
     main = "Sentiment Distribution (1 or 5 stars)",
     xlab = "Scaled Sentiment Score (0 = Mean)")
```