-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
132 lines (116 loc) · 4.18 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from os import listdir
from os.path import isfile, join
import pandas as pd
import traceback
from pprint import pprint
MIN_WORDS = 5
MAX_WORDS = 20
def get_files(training_directory, path):
"""Returns list of file names in directory specified by path.
Parameters
----------
training_directory: string
Directory with training data inside.
path: string
Directory path to get files from.
"""
return [f for f in listdir(path) if isfile(join(training_directory, f))]
def handle_exception(exception, msg=None):
"""Handles exceptions and logs error message. For now, just prints error to stdout.
Parameters
----------
exception: Exception
Exception to be logged.
"""
print("\nIncurred the following error:\n" if msg is None else msg)
if exception is None:
print(traceback.format_exc())
else:
print(exception)
def load_files(training_directory, filenames):
"""Takes a list of file names for .csv files and returns a DataFrame with all entries combined.
Parameters
----------
training_directory: string
Directory with training data inside.
filenames: list of strings
A list of strings containing the names of .csv files for headline data.
"""
clean_headline = lambda headline: headline.strip().lower()
df = pd.DataFrame(columns=["title", "link"])
for filename in filenames:
df = df.append(pd.read_csv(training_directory + filename), ignore_index=True, sort=False)
df = df.drop_duplicates()
df["title"] = [clean_headline(headline) for headline in df["title"]]
return df
def option_mux(message, options):
"""Prompts user with message, compares user input with options to decide which function to execute.
Parameters
----------
message: string
Message prompt shown to user before input is taken.
options: dictionary (string, function)
Maps strings (compared to user's input) to functions with no args.
"""
user_choice = None
while user_choice not in options:
user_choice = input(message).lower().strip()
if user_choice in options:
return options[user_choice]
else:
print("Not a valid option; please enter a valid keyword.\n")
def print_soup(soup, log_file = None):
"""Pretty prints soup to log_file. Used for debugging only.
Parameters
----------
soup: Soup
Soup to be pretty printed.
log_file: file
File for soup to be printed to. Default None indicates printed to stdout.
"""
soup = soup.prettify().encode("utf-8")
if log_file is None:
pprint(soup)
else:
pprint(soup, log_file)
def validate_headline(headline, entries):
"""Checks that headline is valid.
Checks that headline is not identical to any headline in the training data,
that the headline is between 5 and 20 words, and that 'florida man' is in
the headline.
Parameters
----------
headline: str
Language model generated headlines
entries: DataFrame
A DataFrame containing all headline entries.
"""
return headline not in list(entries["title"])\
and len(headline.split()) >= MIN_WORDS\
and len(headline.split()) <= MAX_WORDS\
and "florida man" in headline
def write_to_text(content, filename):
"""Saves content to a .txt file specified by filename.
Parameters
----------
content: list
A list of strings, where each item will be written to a separate line.
filename: string
Name of the file the content is being written to.
"""
f = open(filename, "w")
f.writelines([line + "\n" for line in content])
f.close()
def write_to_csv(content, filename):
"""Writes list to a .csv file specified by filename.
Parameters
----------
content: pandas DataFrame
A DataFrame containing titles and links for all headlines.
filename: string
The name of the file the content is being written into.
"""
content = content.drop_duplicates()
content.to_csv(filename, index=False)
print("Scraped {} sources.".format(len(content.index)))
print("Saving currently scraped results in '{}'.".format(filename))