## Yelp.R
#### Section 1 - Load JSON Datasets and save as .Rds #######################################
## Load packages (install any that are missing first)
pkg <- c("dplyr", "tidyr", "ggplot2", "jsonlite", "stringr", "qdap", "scales", "tm")
new_pkg <- setdiff(pkg, rownames(installed.packages()))
if (length(new_pkg) > 0) install.packages(new_pkg)
sapply(pkg, require, character.only = TRUE)
## Set working directory with the path string in brackets
setwd(...)
## Load JSON datasets using jsonlite::stream_in function
#biz_json <- stream_in(file("yelp_academic_dataset_business.json"))
#review_json <- stream_in(file("yelp_academic_dataset_review.json"))
#user_json <- stream_in(file("yelp_academic_dataset_user.json"))
## Save as .Rds for faster loading going forward
#saveRDS(biz_json, "biz.Rds")
#saveRDS(review_json, "review.Rds")
#saveRDS(user_json, "user.Rds")
## Load .Rds files as data frames and inspect the structure
biz <- readRDS("biz.Rds")
biz_df <- as.data.frame(biz)
str(biz_df, max.level = 1)
summary(biz_df)
review <- readRDS("review.Rds")
review_df <- as.data.frame(review)
str(review_df, max.level = 1)
summary(review_df)
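## The user dataset saved above is not used below; if needed, it can be loaded the same way (optional sketch)
#user <- readRDS("user.Rds")
#user_df <- as.data.frame(user)
#str(user_df, max.level = 1)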
#### Section 2 - Explore Datasets ##################################################
## Do we have enough reviews to analyse?
# Build a daily review count with a running (cumulative) total
ts_reviews <- review_df %>%
  select(date, review_id) %>%
  group_by(date) %>%
  summarise(reviews = n()) %>%
  mutate(dates = as.Date(date)) %>%
  arrange(dates) %>%
  mutate(cumulative = cumsum(reviews))
# Plot time series chart of review count - we have a large sample size (N = 2,225,213)
ggplot(ts_reviews, aes(x = dates, y = cumulative)) +
  geom_line(size = 1, col = "red") +
  scale_y_continuous(labels = comma) +
  labs(x = "Year of Review", y = "Cumulative Review Count") +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 8, angle = 45),
        axis.text.y = element_text(size = 8))
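# Sanity check (a quick sketch): the cumulative total should match the N quoted above
nrow(review_df)
max(ts_reviews$cumulative)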
## Scrub Inactive Users
# Summary statistics - some users posted only 1 review in 10 years!
ratings_stats <- review_df %>%
  select(user_id, stars) %>%
  group_by(user_id) %>%
  summarise(count = n(), mean = mean(stars), median = median(stars)) %>%
  arrange(count) %>%
  ungroup()
head(ratings_stats)
# Plot the proportion of users at or below each review frequency
inactive_ratio <- function(criteria){
  ratio <- rep(0, as.numeric(criteria))
  for (i in 1:as.numeric(criteria)){
    ratio[i] <- nrow(filter(ratings_stats, count <= i)) / nrow(ratings_stats)
  }
  plot(ratio, type = "b", xlim = c(1, i),
       xlab = "Frequency of Reviews", ylab = "Ratio (vs. Total Users)", col = "blue")
}
inactive_ratio(10)
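# Equivalent vectorised sketch (not part of the original analysis): the empirical CDF of
# review counts gives the same cumulative ratios without the loop
ecdf(ratings_stats$count)(1:10)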
## Users posting 1 or 2 reviews make up 70% of the total - we conservatively remove only single-review users
few_lazy_raters <- ratings_stats %>%
  filter(count == 1)
# Remove inactive users from the Review data
review_clean_df <- review_df[!(review_df$user_id %in% few_lazy_raters$user_id),]
intersect(review_clean_df$user_id, few_lazy_raters$user_id) # No overlap
round(nrow(review_clean_df) / nrow(review_df), 2) # We removed 13% of observations
## Merge the Reviews and Businesses datasets and remove extraneous columns
review_biz <- merge(review_clean_df, biz_df, by = "business_id")
rev_biz_tidy <- review_biz %>%
  select(-starts_with("hour"), -starts_with("attribute"), -contains("votes"), -contains("type"))
# Save as .Rds for faster loading
saveRDS(rev_biz_tidy, "rev_biz_tidy.Rds")
## Chinese reviews were 15% of top 5 restaurant categories - sufficient sample size!
cat_count <- rev_biz_tidy %>%
  group_by(as.character(categories)) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count))
head(cat_count[, 1:2])
## Sanity check - 'restaurant' should not be grouped with 'bar' and 'nightlife'
genre_count <- rev_biz_tidy %>%
  select(state, categories) %>%
  filter(str_detect(categories, "Restaurant")) %>%
  unnest(categories) %>%
  group_by(state) %>%
  count(categories) %>%
  arrange(desc(n))
genre_count[1:10,]
## Remove non-essential categories AND keep only categories above the 90th percentile of review counts
state_rest_count <- genre_count %>%
  group_by(state) %>%
  filter(!(categories %in% c("Restaurants", "Nightlife", "Bars"))) %>%
  filter(n > quantile(n, 0.9))
## Plot Share of Reviews by State (90th percentile)
state_table <- state_rest_count %>%
  select(state, n) %>%
  group_by(state) %>%
  summarise(count = sum(n)) %>%
  arrange(desc(count)) %>%
  mutate(proportion = round(count / sum(count), 2))
plot_state_table <- state_table %>%
  ggplot(aes(x = reorder(state, -proportion), y = proportion, fill = state)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = comma) + # Requires 'scales' package
  ggtitle("Share of Reviews by State (Top 10% only)") +
  labs(x = "State", y = "Share of Reviews") +
  theme(legend.position = "none",
        axis.text.x = element_text(face = "bold", size = 8, angle = 45),
        axis.text.y = element_text(face = "bold", size = 8))
plot_state_table # Nevada and Arizona together contribute 83% of total reviews
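# Quick check of the claim above (a sketch): combined share of reviews from NV and AZ
sum(state_table$proportion[state_table$state %in% c("NV", "AZ")])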
## Visualise cuisine review counts in Arizona and Nevada
plot_aznv_cuisine <- state_rest_count %>%
  filter(state == "AZ" | state == "NV") %>%
  ggplot(aes(x = reorder(categories, -n), y = n, fill = categories)) +
  geom_bar(stat = "identity") +
  facet_grid(state ~ .) +
  # Requires 'scales' package to add comma separators to y-axis labels
  scale_y_continuous(labels = comma) +
  ggtitle("Cuisine Review Count (Top 10% only)") +
  labs(x = "Cuisine", y = "Total Reviews (n)") +
  theme(legend.position = "none",
        axis.text.x = element_text(face = "bold", size = 7, angle = 90),
        axis.text.y = element_text(face = "bold", size = 8))
plot_aznv_cuisine # Up to 30K Chinese reviews each in Nevada and Arizona
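# Quick check of the counts quoted above (a sketch): Chinese review totals in AZ and NV
state_rest_count %>%
  filter(state %in% c("AZ", "NV"), categories == "Chinese")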
#### Section 3 - Create Corpus of Reviews ############################################
## Filtering Chinese reviews from Nevada and Arizona
aznv_ch <- rev_biz_tidy %>%
  filter(state == "AZ" | state == "NV") %>%
  filter(str_detect(categories, "Chinese"))
# Save as .Rds for quick loading by Shiny App
saveRDS(aznv_ch, "aznv_ch.Rds")
## Filtering only positive reviews and converting to matrix
aznv_ch_text <- aznv_ch[aznv_ch$stars.x >= 4,]$text
aznv_ch_matrix <- as.matrix(aznv_ch_text)
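# For a reproducible sample, set a seed first (the value 123 is arbitrary, not from the original analysis)
set.seed(123)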
# Randomised sampling
random.rows <- sample(1:nrow(aznv_ch_matrix), 0.3 * nrow(aznv_ch_matrix), replace = FALSE)
aznv_ch_sample <- aznv_ch_matrix[random.rows,]
## Creating the corpus
aznv_ch_corpus <- VCorpus(VectorSource(aznv_ch_sample))
aznv_ch_corpus
#### Section 4 - Extract Popular Dishes from Review Texts ##############################################
## PREPROCESSING CORPUS
# Convert to lower case; remove punctuation, numbers, stopwords and excess whitespace; expand abbreviations
clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, to = "UTF-8-MAC", sub = "byte")), mc.cores = 1)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, content_transformer(replace_abbreviation))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "food"))
  corpus <- tm_map(corpus, stripWhitespace)
  return(corpus)
}
## Count single words or unigrams - too many garbage words to make sense of reviews
unigram_count <- function(cleaned_corpus) {
  tdm <- TermDocumentMatrix(cleaned_corpus)
  tdm_matrix <- as.matrix(tdm)
  term_freq <- rowSums(tdm_matrix)
  term_freq <- sort(term_freq, decreasing = TRUE)
  barplot(term_freq[1:20], col = "turquoise", las = 2, main = "Plot of Top 20 Unigrams")
}
clean_aznv_ch <- clean_corpus(aznv_ch_corpus)
unigram_count(clean_aznv_ch)
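# Spot-check a single review before and after cleaning (document 1 chosen arbitrarily)
as.character(aznv_ch_corpus[[1]])
as.character(clean_aznv_ch[[1]])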
## Expand stopword list in corpus cleaner
clean_corpus2 <- function(corpus){
  corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, to = "UTF-8-MAC", sub = "byte")), mc.cores = 1)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, content_transformer(replace_abbreviation))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "food", "good", "place", "great", "service", "time", "really", "restaurant", "always", "just", "get", "one", "will", "also", "ordered", "can", "try", "ive", "well", "eat", "little", "definitely", "vegas", "back", "amazing", "got", "dont"))
  corpus <- tm_map(corpus, stripWhitespace)
  return(corpus)
}
clean_aznv_ch2 <- clean_corpus2(aznv_ch_corpus)
## Create term document matrix for two-word (bigram) and three-word (trigram) phrases
install.packages("SnowballC")
library(SnowballC) # required by the latest version of the 'tm' package
update.packages(oldPkgs = "tm", checkBuilt = TRUE) # update the 'tm' package before reloading
library(tm)
# Create bigram and trigram tokenizer functions
BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
TrigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
## Create term document matrix (tdm) of bigrams and trigrams
bigram_ch_tdm <- TermDocumentMatrix(clean_aznv_ch2, control = list(tokenize = BigramTokenizer))
trigram_ch_tdm <- TermDocumentMatrix(clean_aznv_ch2, control = list(tokenize = TrigramTokenizer))
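# If the dense conversion below runs out of memory, rare n-grams can be dropped first
# (the 0.999 sparsity threshold is an assumption, not from the original analysis)
#bigram_ch_tdm <- removeSparseTerms(bigram_ch_tdm, 0.999)
#trigram_ch_tdm <- removeSparseTerms(trigram_ch_tdm, 0.999)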
## Convert tdm into data frames of bigram and trigram counts
ngram_freq <- function(tdm){
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  freq_df <- data.frame(word = names(freq), freq = freq)
  return(freq_df)
}
bigram_ch_freq <- ngram_freq(bigram_ch_tdm)
trigram_ch_freq <- ngram_freq(trigram_ch_tdm)
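# Quick look at the most frequent n-grams before plotting
head(bigram_ch_freq)
head(trigram_ch_freq)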
# Plot frequencies of bigrams and trigrams - trigrams capture dish names the most accurately!
plot_bigram_ch <- bigram_ch_freq[1:20,] %>%
  ggplot(aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "green", col = "red") +
  scale_y_continuous(labels = comma) +
  ggtitle("Histogram of 20 Most Frequent Bigrams") +
  labs(x = "Words / Phrases", y = "Frequency") +
  theme(legend.position = "none",
        axis.text.x = element_text(face = "bold", size = 9, angle = 45),
        axis.text.y = element_text(size = 9))
plot_trigram_ch <- trigram_ch_freq[1:20,] %>%
  ggplot(aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "green", col = "red") +
  scale_y_continuous(labels = comma) +
  ggtitle("Histogram of 20 Most Frequent Trigrams") +
  labs(x = "Words / Phrases", y = "Frequency") +
  theme(legend.position = "none",
        axis.text.x = element_text(face = "bold", size = 8, angle = 45),
        axis.text.y = element_text(size = 9))
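# Display the bigram and trigram plots
plot_bigram_ch
plot_trigram_ch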