-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathtweet-chunking.R
67 lines (47 loc) · 2.1 KB
/
tweet-chunking.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# assume that we already have tweets in a data.frame
# named tweet_df
# Load the packages used by this script via pacman::p_load().
pacman::p_load(NLP, openNLP, tidytext, stringr, qdap, qdapRegex)

# The chunker below reads its model file from the openNLPmodels.en package,
# so install that package (as source, from the datacube.wu.ac.at repository)
# when it is not available.
# requireNamespace() yields a single TRUE/FALSE. installed.packages() cannot
# be used as the test here: its first argument is a library path, not a
# package name, and negating its matrix result is an error.
if (!requireNamespace("openNLPmodels.en", quietly = TRUE)) {
  install.packages(
    "openNLPmodels.en",
    repos = "http://datacube.wu.ac.at/",
    type = "source"
  )
}
# Take the text column of tweet_df and mark its encoding as UTF-8.
tweet_text <- tweet_df$text
Encoding(tweet_text) <- "UTF-8"

# Step 1: drop hyperlinks.
tweet_text1 <- stringr::str_replace_all(
  tweet_text,
  pattern = "http\\S*://\\S+\\.\\S+/\\S+",
  replacement = ""
)

# Step 2: drop characters in the U+1F000..U+1F700 range (emojis).
tweet_text2 <- stringr::str_replace_all(
  tweet_text1,
  pattern = "[\U{1f000}-\U{1f700}]",
  replacement = ""
)

# Step 3: drop any remaining non-ASCII symbols.
tweet_text3 <- qdapRegex::rm_non_ascii(tweet_text2)

# Step 4: turn newline characters into spaces.
tweet_text4 <- qdap::mgsub("\n", " ", tweet_text3)

# Optional step (disabled): remove stopwords.
# tweet_text4 <- qdap::mgsub(tidytext::stop_words[[1]], "", tweet_text3)

# Step 5: drop the "RT @user:" prefix that precedes the text of a retweet.
tweet_text5 <- stringr::str_replace_all(
  tweet_text4,
  pattern = "RT @\\S+:",
  replacement = ""
)
# Wrap the cleaned tweet text in an NLP String object for annotation.
s <- NLP::as.String(tweet_text5)

# Annotators for sentence boundaries, word tokens and part-of-speech tags.
sent_token_annotator <- openNLP::Maxent_Sent_Token_Annotator()
word_token_annotator <- openNLP::Maxent_Word_Token_Annotator()
pos_tag_annotator <- openNLP::Maxent_POS_Tag_Annotator()

# Chunk annotator, pointed at the English chunker model file that ships
# inside the openNLPmodels.en package.
chunker <- system.file(
  "models", "en-chunker.bin",
  package = "openNLPmodels.en"
)
chunk_annotator <- openNLP::Maxent_Chunk_Annotator(model = chunker)
# Apply the sentence, word and POS annotators to the text, in that order.
a3 <- NLP::annotate(
  s,
  f = list(sent_token_annotator, word_token_annotator, pos_tag_annotator)
)

# Run the chunk annotator on top of the annotations collected in a3.
s2 <- NLP::annotate(s, f = chunk_annotator, a = a3)
# Sentence annotations and the text each one spans. Subscripting the String
# by the annotation object extracts the spanned substrings, which avoids
# indexing start/end by annotation id and also works when no sentence was
# found.
sentences <- subset(s2, type == "sentence")
sentences2 <- s[sentences]

# Word annotations. The chunk tags are read from these only: the sentence
# annotations in s2 have no "chunk_tag" feature, so extracting it across all
# of s2 would not produce one tag per token.
phrases <- subset(s2, type == "word")
tags <- vapply(phrases$features, `[[`, character(1), "chunk_tag")
tags

# Show the text as space-separated "token/chunk_tag" pairs.
paste(sprintf("%s/%s", s[phrases], tags), collapse = " ")