---
title: "01-public-api"
author: "Ryan Wesslen"
date: "April 3, 2018"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Public API
```{r data, eval=FALSE}
library(ROAuth)
library(streamR)

## Twitter OAuth endpoints
requestURL <- "https://api.twitter.com/oauth/request_token"
accessURL <- "https://api.twitter.com/oauth/access_token"
authURL <- "https://api.twitter.com/oauth/authorize"

## app credentials (placeholders)
consumerKey <- "xxx"
consumerSecret <- "yyy"

my_oauth <- OAuthFactory$new(consumerKey = consumerKey,
                             consumerSecret = consumerSecret,
                             requestURL = requestURL,
                             accessURL = accessURL,
                             authURL = authURL)

## run this line and go to the URL that appears on screen
my_oauth$handshake(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl"))
```
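Once the handshake completes, the token can be saved with base R's `save()` and reloaded in later sessions, so the browser authorization only has to happen once. A minimal sketch; the `./credentials/my_oauth.Rdata` path is an arbitrary choice, not part of the original workflow:
```{r save-oauth, eval=FALSE}
## Optional: persist the token (path is illustrative)
save(my_oauth, file = "./credentials/my_oauth.Rdata")

## In a later session, restore it instead of repeating the handshake
load("./credentials/my_oauth.Rdata")
```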
### Import ID
```{r api, eval=FALSE}
library(tidyverse)

## keep only the profiles judged valid in the qualitative analysis
userlevel <- read_csv("./data/user-stats.csv") %>%
  filter(Valid == "Yes")

## save profile ids
ids <- userlevel$id

## stream in one-minute chunks until the end date (EDT = America/New_York)
endDate <- as.POSIXct("2017-06-07 00:00:00", tz = "America/New_York")
while (Sys.time() < endDate) {
  time <- gsub("[: -]", "", Sys.time(), perl = TRUE)
  file <- paste0("./data/stream", time, ".json")
  filterStream(file.name = file, timeout = 60, follow = ids, oauth = my_oauth)
}
```
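Before leaving the collector running, it can help to parse one captured file and confirm that tweets from the followed accounts are coming through. A quick check using `streamR::parseTweets()`; the filename below is one of the actual stream files noted in the parsing run below:
```{r check-stream, eval=FALSE}
## Parse a single one-minute capture and inspect it
check <- parseTweets("./data/stream20170529123230.json", simplify = TRUE)
nrow(check)                      # tweets captured in that minute
head(unique(check$screen_name))  # accounts the stream matched
```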
## Data Preparation
```{r eval=FALSE}
## the streamed files live in ./data/ and share the "stream" prefix
dir <- "./data/"
files <- list.files(dir, pattern = "^stream.*\\.json$")

## screen names and labels from the user-stats file
user <- read_csv("./data/user-stats.csv")
col <- c("screenName", "LABEL")
ids <- user[, col]

## parse each file and keep tweets from the labelled accounts
tweets <- NULL  # rbind(NULL, t) returns t, so no template row is needed
start.time <- Sys.time()
for (i in files) {
  t <- parseTweets(paste0(dir, i), simplify = TRUE)
  t <- merge(t, ids, by.x = "screen_name", by.y = "screenName")
  tweets <- rbind(tweets, t)
}
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
## files 1-989 (through "stream20170529123230.json"):
##   total 3,766,539; 51,345 parsed
## files 990-2217:
##   total 4,866,263; 113,744 parsed
```
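After the merge, a quick tally shows how the parsed tweets are distributed across labels and accounts. A sketch using `dplyr::count()` on the `LABEL` column carried over from `user-stats.csv`:
```{r check-merge, eval=FALSE}
## Distribution of parsed tweets by label and by account
tweets %>% count(LABEL, sort = TRUE)
tweets %>% count(screen_name, sort = TRUE) %>% head(10)
```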
### Save file
```{r eval=FALSE}
## drop the weekday prefix (e.g. "Wed ") and parse Twitter's timestamp format
tweets$created_at <- parse_datetime(substr(tweets$created_at, 5, 100),
                                    format = "%b %d %H:%M:%S %z %Y")
tweets$user_created_at <- parse_datetime(substr(tweets$user_created_at, 5, 100),
                                         format = "%b %d %H:%M:%S %z %Y")
max(tweets$created_at)
# "2017-06-07 04:06:01 UTC"
min(tweets$created_at)
# "2017-05-22 20:22:07 UTC"
library(lubridate)
## keep 14 complete days: 2017-05-23 through 2017-06-05
date1 <- as.POSIXct("2017-05-23 00:00:00")
date2 <- as.POSIXct("2017-06-06 00:00:00")
int <- interval(date1, date2)
df <- tweets[tweets$created_at %within% int, ]
write_csv(df, "./data/full-tweets-14days.csv")
```
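As a final check, the saved file can be read back and the daily volume inspected to confirm the 14-day window is complete. A sketch; `floor_date()` comes from lubridate, loaded above, and `read_csv()` re-parses the ISO datetime column automatically:
```{r check-window, eval=FALSE}
## Re-read the saved file and count tweets per day
df <- read_csv("./data/full-tweets-14days.csv")
range(df$created_at)
df %>%
  mutate(day = floor_date(created_at, "day")) %>%
  count(day)
```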