-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_region_words.py
44 lines (36 loc) · 1.33 KB
/
get_region_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#script to get count of region words/tweet in training data
import numpy as np
import pandas as pd
from reg_words_check import NEWordCounter
from AppalachiaWordCounter import AppalachiaWordCounter
from sklearn.base import BaseEstimator, clone  # NOTE(review): unused in this script; kept in case of external reliance
import csv

#import train and test sets
train_data = pd.read_csv('short_clean_train.csv')
test_data = pd.read_csv('short_test.csv')

#force Tweet columns to object dtype so the counters receive raw strings
train_data['Tweet'] = train_data['Tweet'].astype(object)
test_data['Tweet'] = test_data['Tweet'].astype(object)

#define features
tweets = train_data.Tweet
test_tweets = test_data.Tweet

#run tweets through the New England region-word counter
#get array of ratio of region words : total words
ne = NEWordCounter()
custom = ne.transform(tweets)
test_custom = ne.transform(test_tweets)

#reshape from one row with ~50k values to one column with ~50k values
customswap = np.swapaxes(custom, 0, 1)
test_customswap = np.swapaxes(test_custom, 0, 1)

#save new england word ratio as csv
np.savetxt('ne_wordcount.csv', customswap)
np.savetxt('ne_wordcount_test.csv', test_customswap)

#onto Appalachia!
appalachia = AppalachiaWordCounter()
custom = appalachia.transform(tweets)
custom_test = appalachia.transform(test_tweets)

#BUG FIX: csv.writer requires a text-mode handle (mode 'w', newline='') in
#Python 3 — the original 'wb' open would raise TypeError on the first write.
#Use `with` so the file is flushed and closed, and also write the test-set
#counts, which were computed above but never saved.
with open("appalachia_wordcount.csv", 'w', newline='') as my_custom:
    csv.writer(my_custom).writerows(custom)
with open("appalachia_wordcount_test.csv", 'w', newline='') as my_custom_test:
    csv.writer(my_custom_test).writerows(custom_test)