-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
53 lines (43 loc) · 1.92 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python
# encoding: utf-8
import tweepy
import pandas as pd
import markov
import os
import sys
# Twitter API credentials.
# Each value may be supplied through an environment variable so secrets do not
# have to be edited into this file; the literals are kept as fallbacks so the
# script behaves exactly as before when no variable is set.
# NOTE(review): the consumer key/secret fallbacks are committed in plain text
# and should be treated as exposed -- rotate them and drop the defaults.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "BB3TrvSEOM9jTC6nqCsTBRz9O")
consumer_secret = os.environ.get(
    "TWITTER_CONSUMER_SECRET",
    "2f6miJPcxmUEbwL6XX93UK9o27Sysq49tqYOBiv1SlPIBCcKd6",
)
# An empty access key disables posting in the __main__ section of this script.
access_key = os.environ.get("TWITTER_ACCESS_KEY", "")
access_secret = os.environ.get("TWITTER_ACCESS_SECRET", "")
def init_tweepy():
    """Build a tweepy API client from the module-level credentials.

    Returns:
        A ``tweepy.API`` object wrapping an OAuth handler configured with
        the consumer key/secret and the access key/secret defined above.
    """
    oauth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    oauth.set_access_token(access_key, access_secret)
    client = tweepy.API(oauth)
    return client
def download_new_tweets(api, screen_name):
    """Fetch recent tweets for ``screen_name`` and merge them into tweets.csv.

    Requests up to 200 timeline entries from ``api``, drops every tweet whose
    text contains ``@`` or ``http``, replaces newlines in the text with
    spaces, merges the result with the rows already stored in ``tweets.csv``
    (resolved against the current working directory), removes rows with
    duplicate text (keeping the most recently added one) and writes the table
    back sorted by tweet id, newest first.

    Args:
        api: object exposing ``user_timeline(screen_name=..., count=...)``
            whose items have ``id_str``, ``created_at`` and ``text``.
        screen_name: account whose timeline is downloaded.

    Returns:
        The merged, de-duplicated and sorted ``pandas.DataFrame`` -- the same
        table that was written to disk.
    """
    columns = ['id', 'created_at', 'text']
    timeline = api.user_timeline(screen_name=screen_name, count=200)
    # Ids are stored as integers so they compare numerically with the ids
    # read back from the CSV; the text stays unicode (not UTF-8 bytes) so the
    # string filters below work on Python 3 as well.
    fresh = pd.DataFrame(
        [[int(tweet.id_str), tweet.created_at, tweet.text] for tweet in timeline],
        columns=columns,
    )

    # Drop anything mentioning a user (replies, retweets) or carrying a link.
    has_mention_or_link = fresh['text'].str.contains('@|http', na=False).astype(bool)
    fresh = fresh[~has_mention_or_link].copy()
    # Keep each tweet on a single line.
    fresh['text'] = fresh['text'].str.replace('\n', ' ')

    csv_path = os.path.realpath('tweets.csv')
    frames = [fresh]
    # On the very first run there is no stored history to merge with.
    if os.path.exists(csv_path):
        frames.insert(0, pd.read_csv(csv_path, encoding='utf-8'))

    # Stored rows come first so that keep='last' prefers the newest download.
    comp = pd.concat(frames, axis=0, ignore_index=True)
    comp_sorted = (
        comp.drop_duplicates(subset='text', keep='last')
        .sort_values('id', ascending=False)
    )
    comp_sorted.to_csv(csv_path, index=False, encoding='utf-8')
    return comp_sorted
def send_tweet(api, tweet):
    """Post ``tweet`` as a status update through ``api``.

    Args:
        api: object exposing ``update_status``.
        tweet: the status text to publish.
    """
    post_status = api.update_status
    post_status(tweet)
if __name__ == '__main__':
    # Build the API client, refresh the stored tweets, then optionally post.
    api = init_tweepy()

    # The first command-line argument selects the account to scrape;
    # without one the default account is used.
    target_account = sys.argv[1] if len(sys.argv) > 1 else 'lilbthebasedgod'
    download_new_tweets(api, target_account)

    # Posting is skipped while no access key is configured.
    can_post = len(access_key) > 0
    if can_post:
        send_tweet(api, markov.build_tweet())