forked from SwastikUdupa/TF-IDF-hadoop
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmapper1.py
27 lines (26 loc) · 882 Bytes
/
mapper1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#!/usr/bin/env python
import os
import sys
# Punctuation that is deleted outright before the line is split into words.
STRIPPED_CHARS = "*.?',:;(&)]["
# Characters that act as word separators and are turned into spaces.
SEPARATOR_CHARS = "\t-!"
# Environment variables that may carry the current input file name; the
# legacy name is checked first so existing deployments behave as before.
INPUT_FILE_ENV_VARS = ("map_input_file", "mapreduce_map_input_file")


def input_filename():
    """Return the name of the input file taken from the environment.

    Raises:
        KeyError: if none of the variables in INPUT_FILE_ENV_VARS is set.
    """
    for name in INPUT_FILE_ENV_VARS:
        value = os.environ.get(name)
        if value is not None:
            return value
    raise KeyError(INPUT_FILE_ENV_VARS[0])


def tokenize(line):
    """Split one line of text into lower-cased words.

    Characters in STRIPPED_CHARS are removed, characters in SEPARATOR_CHARS
    become spaces, and the result is split on whitespace, so empty tokens
    never appear in the returned list.
    """
    for char in STRIPPED_CHARS:
        line = line.replace(char, "")
    for char in SEPARATOR_CHARS:
        # A hyphen separates two words; it must not inject extra letters.
        line = line.replace(char, " ")
    return [word.lower() for word in line.split()]


def map_line(line, filename):
    """Return the "word@filename,1" records (newline-terminated) for a line.

    One record is produced for every word occurrence, duplicates included.
    """
    return [
        "{0}@{1},{2}\n".format(word, filename, 1) for word in tokenize(line)
    ]


def main(stdin=None, stdout=None):
    """Read lines from stdin and write one record per word to stdout.

    Args:
        stdin: iterable of text lines; defaults to sys.stdin.
        stdout: object with a write() method; defaults to sys.stdout.

    Raises:
        KeyError: if there is input but no input-file environment variable.
    """
    source = sys.stdin if stdin is None else stdin
    sink = sys.stdout if stdout is None else stdout
    filename = None
    for line in source:
        if filename is None:
            # Resolved once, and only when there is input, so an empty
            # stream does not require the variable to be set.
            filename = input_filename()
        for record in map_line(line, filename):
            sink.write(record)


if __name__ == "__main__":
    main()