crawler.sh
#!/bin/bash
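# usage: ./crawler.sh [MAX_SITES]  (omit or pass 0 to crawl the default of 150 pages)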
if [ $1 == "" ] || [ $1 == "0" ];
then
MAX_SITES=150
else
MAX_SITES=$1
fi
mkdir -p temp
mkdir -p statistics
mkdir -p sites
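# sites: downloaded HTML pages | temp: intermediate word lists | statistics: final word counts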
echo "WMDD4950 - Security & Cloud & Server Admn"
echo "William Takashi Mimura - id:100284327"
echo
echo "Web crawler from https://en.wikipedia.org/wiki/Cloud_computing"
echo
echo "Max sites to crawl: $MAX_SITES"
echo
START_TIME=$(date +%s)
echo "Process started at $(date)"
echo
echo
echo "Step 1 - BFS Algorithm to crawl the web"
echo "Executing..."
# STEP 1 - BFS ALGORITHM TO CRAWL THE WEB
MAIN_ARRAY=("/wiki/Cloud_computing")
ARRAY_LEN=${#MAIN_ARRAY[@]}
MAIN_IND=0
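# MAIN_ARRAY doubles as the BFS queue: MAIN_IND is the head pointer and
# newly discovered links are appended at the tail, so pages are visited breadth-first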
while [ "$MAIN_IND" -lt "$ARRAY_LEN" ] && [ "$MAIN_IND" -lt "$MAX_SITES" ]
do
# queue entries look like href="/wiki/page"; cut the quoted field to get /wiki/page
THIS_FILE=$(echo "${MAIN_ARRAY[$MAIN_IND]}" | cut -d '"' -f 2)
# strip the leading /wiki/ (6 characters)
THIS_FILE=${THIS_FILE:6}
# download the page in queue
echo "Downloading page from https://en.wikipedia.org/wiki/${THIS_FILE}"
curl -s "https://en.wikipedia.org/wiki/${THIS_FILE}" > "sites/${THIS_FILE}.html"
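# -s keeps curl quiet (no progress meter or error messages); a failed download just leaves an empty .html file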
# skip link extraction once the queue already holds the maximum number of sites
if [ "$ARRAY_LEN" -lt "$MAX_SITES" ];
then
echo "Fetching links..."
# search for href="/wiki/______" and store as array
TEMP_ARRAY=($(grep -o "\(href=\"\/wiki\/\w*\"\)" "sites/${THIS_FILE}.html"))
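# matches look like href="/wiki/Cloud_storage"; since \w only covers [A-Za-z0-9_],
# titles containing (), %xx escapes or namespace colons (e.g. File:) are skipped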
echo "Adding links to the queue..."
# search in MAIN_ARRAY for each item of TEMP_ARRAY
ADDED=0
REPEATED=0
for i in "${TEMP_ARRAY[@]}"
do
FOUND="n"
for j in "${MAIN_ARRAY[@]}"
do
if [ "${i,,}" == "${j,,}" ];
then
FOUND="y"
REPEATED=$((REPEATED + 1))
break
fi
done
if [ "$FOUND" == "n" ];
then
MAIN_ARRAY+=("$i")
ARRAY_LEN=${#MAIN_ARRAY[@]}
ADDED=$((ADDED + 1))
if [ $ARRAY_LEN -ge $MAX_SITES ];
then
echo "Queue limit reached. Downloading the rest of the pages from queue..."
break
fi
fi
done
echo "$ADDED links added | $REPEATED links repeated (discarted)"
fi
MAIN_IND=$((MAIN_IND + 1))
done
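# Note: the nested duplicate scan above is O(n^2). With bash 4+ associative
# arrays the lookup drops to O(1); a minimal commented sketch (SEEN is an
# illustrative name, not used elsewhere in this script):
#   declare -A SEEN
#   for i in "${TEMP_ARRAY[@]}"; do
#     if [ -z "${SEEN[${i,,}]}" ]; then
#       SEEN[${i,,}]=1
#       MAIN_ARRAY+=("$i")
#     fi
#   done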
echo
END_TIME_1=$(date +%s)
echo "Step 1 completed at $(date) - $(($END_TIME_1 - $START_TIME)) seconds"
echo
echo
# STEP 2 - INDEXING WORDS
echo "Step 2 - Indexing words"
echo "Executing..."
ARRAY_LEN=${#MAIN_ARRAY[@]}
MAIN_IND=0
while [ "$MAIN_IND" -lt "$ARRAY_LEN" ] && [ "$MAIN_IND" -lt "$MAX_SITES" ]
do
# queue entries look like href="/wiki/page"; cut the quoted field to get /wiki/page
THIS_FILE=$(echo "${MAIN_ARRAY[$MAIN_IND]}" | cut -d '"' -f 2)
# strip the leading /wiki/ (6 characters)
THIS_FILE=${THIS_FILE:6}
echo "Treating file $THIS_FILE: getting words, treating and counting..."
# transforming html file into text
lynx -dump sites/"$THIS_FILE".html > temp/"$THIS_FILE".txt
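# lynx -dump renders the HTML as plain text and appends a numbered list of the page's link URLs (cleaned out below)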
# keep only letters, spaces, -/_. and line breaks, then lowercase everything
tr -dc "[:alpha:] \-\/\_\.\n\r" < "temp/$THIS_FILE.txt" | tr "[:upper:]" "[:lower:]" > "temp/$THIS_FILE.treated1.txt"
# split on whitespace so each word ends up on its own line
for w in $(cat "temp/$THIS_FILE.treated1.txt"); do echo "$w"; done > "temp/$THIS_FILE.separated.txt"
# copy to a new stage file so each intermediate result stays available for debugging
cp "temp/$THIS_FILE.separated.txt" "temp/$THIS_FILE.treated2.txt"
# clean up link remnants and stray punctuation:
# - drop lines that are URLs (the ':' was already deleted by tr above, hence file//, https// etc.)
# - strip leading hyphens, strip trailing hyphens/dots/slashes, and delete empty lines
sed -i -E "s|^file//.*||; s|^https?//.*||; s|^android-app//.*||; s/^-+//; s/[-./]+$//; /^$/d" "temp/$THIS_FILE.treated2.txt"
# sort so identical words become adjacent for the single-pass count below
sort "temp/$THIS_FILE.treated2.txt" -o "temp/$THIS_FILE.sorted.txt"
lw=""
count=0
first="y"
# single-pass count: in sorted input all occurrences of a word are contiguous
for w in $(cat "temp/$THIS_FILE.sorted.txt");
do
if [ "$first" == "y" ];
then
first="n"
lw=$w
fi
if [ "$w" == "$lw" ];
then
count=$((count + 1))
else
echo "$lw => $count"
count=1
fi
lw=$w
done > "temp/$THIS_FILE.counted.txt"
# the loop only prints a word once the next one differs, so append the final word here
sed -i "\$a$lw => $count" "temp/$THIS_FILE.counted.txt"
# moving the final file to the right directory "statistics"
mv "temp/$THIS_FILE.counted.txt" "statistics/$THIS_FILE.txt"
MAIN_IND=$((MAIN_IND + 1))
done
# final display with the total time spent
echo
END_TIME_2=$(date +%s)
echo "Step 2 completed at $(date) - $(($END_TIME_2 - $END_TIME_1)) seconds"
echo
echo "Process finished!!! $(($END_TIME_2 - $START_TIME)) seconds"
echo "Some temporary files were criated in the process, would you like to remove them? (y/n)"
read input
if [ "$input" == "y" ] || [ "$input" == "Y" ];
then
rm -r temp/*
fi
echo "done :)"