-
Notifications
You must be signed in to change notification settings - Fork 1
/
sentiment_nltk.py
92 lines (70 loc) · 2.9 KB
/
sentiment_nltk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
""" Luke Abbatessa, Yitian Liang, Naman Razdan, Jasmine Wong, Yu Xiao, & Yuting Zheng
DS3500
Final Project
December 7, 2022
Tokenizes texts into sentences and performs NLTK VADER sentiment analysis on them
Consulted nltk website for documentation on tokenize
https://www.nltk.org/api/nltk.tokenize.html
Consulted nltk website for documentation on vader
https://www.nltk.org/howto/sentiment.html
"""
# Import the necessary libraries/packages
import nltk
# Fetch the NLTK data packages every time this module is imported.
# NOTE(review): presumably "punkt" backs sent_tokenize and "vader_lexicon" backs
# SentimentIntensityAnalyzer - confirm against the NLTK documentation.
nltk.download("punkt")
nltk.download("vader_lexicon")
from nltk.tokenize import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
def tokenize_sent(text):
    """
    Split a text into its sentences using nltk's sent_tokenize

    Parameters:
        text - the full text to split (a string)
    Returns: the sentences produced by sent_tokenize (a list)
    """
    return sent_tokenize(text)
def nltk_score_sent(sentences):
    """
    Take in a list of sentences, implement VADER sentiment scoring on them and return a df
    containing the following columns of information:
        [sentence: the sentence from the text,
         sent_order: the sentence's index in the text,
         sent_pctile: sentence order percentile in the text (0.0 for the first
                      sentence, 1.0 for the last; 0.0 when there is only one sentence),
         neg: negative score,
         neu: neutral score,
         pos: positive score,
         compound: compound sentiment score for sentence,
         cum_score: text's cumulative compound sentiment score]

    Parameters:
        sentences - sentences as strings (a list)
    Returns: a dataframe; it has the columns above and no rows when sentences is empty
    """
    columns = ["sentence", "sent_order", "sent_pctile",
               "neg", "neu", "pos", "compound", "cum_score"]

    # Turn sentences list into a df with each sentence as own row
    df = pd.DataFrame(sentences, columns=["sentence"])

    # Nothing to score: return an empty frame with the full schema instead of
    # failing on max() of an empty sequence further down
    if df.empty:
        return df.reindex(columns=columns)

    # Make sentence order the index
    df["sent_order"] = df.index

    # The last index is the denominator for the percentile; a one-sentence text
    # has last index 0, so guard the division to avoid a 0/0 = NaN percentile
    last_index = len(df) - 1
    if last_index > 0:
        df["sent_pctile"] = df["sent_order"] / last_index
    else:
        df["sent_pctile"] = 0.0

    # Initialize VADER sentiment analyzer
    sia = SentimentIntensityAnalyzer()

    # Score every sentence exactly once, then fan the results out into columns
    scores = [sia.polarity_scores(sentence) for sentence in df["sentence"]]
    for key in ("neg", "neu", "pos", "compound"):
        df[key] = [score[key] for score in scores]

    # Running total of the compound scores across the text
    df["cum_score"] = df["compound"].cumsum()

    return df