Skip to content

Commit

Permalink
Remove stopwords in clean_text
Browse files: browse the repository at this point in the history
  • Loading branch information
edwardcqian committed Mar 14, 2019
1 parent a115bf1 commit 351a97b
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
import numpy as np
from nltk.corpus import stopwords

# clean up given string
def clean_text(raw):
Expand All @@ -14,7 +15,11 @@ def clean_text(raw):
raw = re.sub(r'&', "and", raw)
# remove invalid characters (keep only letters, digits, '#', '@' and spaces)
raw = re.sub('[^A-Za-z0-9#@ ]+', "", raw)
return(raw)
words = raw.split()

stops = set(stopwords.words("english"))
words = [w for w in words if not w in stops]
return( " ".join(words))

# create onehot representation of the label
def get_onehot(arr, num_class):
Expand Down

0 comments on commit 351a97b

Please sign in to comment.