-
Notifications
You must be signed in to change notification settings - Fork 0
/
DasMalwerkCollector.py
67 lines (57 loc) · 2.48 KB
/
DasMalwerkCollector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import requests
from bs4 import BeautifulSoup
from malware_collector import MalwareCollector
class DasMalwerkCollector(MalwareCollector):
    """Collector that downloads new malware samples listed by Das Malwerk.

    Each run fetches the sample table at ``collection_url``, downloads every
    sample whose hash is not yet recorded in the ``.completed`` file under
    ``base_path``, and then rewrites that file with the updated hash list.
    """

    # plan: hit the Das Malwerk listing daily, get the list of malware links,
    # find any that we haven't already downloaded, pull those.
    # NOTE(review): the original note named https://dasmalwerk.eu, but the URL
    # actually requested is the herokuapp host below - confirm which is current.
    collection_url = "https://das-malwerk.herokuapp.com/"

    def __init__(self):
        super().__init__()
        # presumably `configfile` is provided by MalwareCollector.__init__;
        # verify against the base class.
        self.base_path = self.configfile.get("DasMalwerk", "path")

    def _get_file(self, href: str, hash_name: str) -> str:
        """Download *href* into ``<base_path>/<hash_name>.zip``.

        Args:
            href: URL of the sample archive.
            hash_name: Hash of the sample; used as the file stem.

        Returns:
            Path of the saved archive.

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                non-success HTTP status.
            OSError: If the archive cannot be written.
        """
        file_path = os.path.join(self.base_path, hash_name + ".zip")
        # Stream into a sidecar file first so an interrupted transfer never
        # leaves a truncated archive under the final name.
        part_path = file_path + ".part"
        # The context manager releases the streamed connection when done.
        with requests.get(href, stream=True, timeout=60) as response:
            # Without this an HTTP error page would be stored as the sample.
            response.raise_for_status()
            with open(part_path, "wb") as file_handle:
                for chunk in response.iter_content(8192):
                    file_handle.write(chunk)
        os.replace(part_path, file_path)
        return file_path

    def get(self) -> list:
        """Fetch every listed sample that has not been downloaded before.

        Returns:
            Paths of the archives saved during this run (possibly empty).

        Raises:
            requests.RequestException: If the listing page cannot be fetched.
            OSError: If a sample or the ``.completed`` file cannot be written.
        """
        completed_file_path = os.path.join(self.base_path, ".completed")
        already_done = set()
        grabbed_this_run = []

        # read the list of files we've already fetched from them.
        if os.path.exists(completed_file_path):
            with open(completed_file_path) as file_handle:
                # Blank lines are ignored so "" is never treated as a hash.
                already_done = {
                    line.strip() for line in file_handle if line.strip()
                }

        response = requests.get(self.collection_url, timeout=60)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, features="html.parser")
        table = soup.table
        if table is None:
            # Page layout changed or an unexpected page was served.
            print(f"no sample table found at {self.collection_url}")
            return grabbed_this_run

        try:
            # get files if we didn't already have them
            for row in table.find_all("tr"):
                elements = row.find_all("td")
                if len(elements) < 3:
                    continue
                hash_val = elements[2].text.strip()
                if hash_val in already_done:
                    continue
                # The hash comes from a remote page and becomes a file name:
                # refuse anything that could contain path separators or dots.
                if not hash_val.isalnum():
                    print(f"skipping row with unexpected hash {hash_val!r}")
                    continue
                link = elements[1].a
                url = link.get("href") if link is not None else None
                if not url:
                    continue
                print(f"fetching {hash_val}")
                try:
                    saved_path = self._get_file(url, hash_val)
                except requests.RequestException as err:
                    # One bad sample should not abort the whole run; it is
                    # not marked done, so the next run retries it.
                    print(f"failed to fetch {hash_val}: {err}")
                    continue
                already_done.add(hash_val)
                grabbed_this_run.append(saved_path)
        finally:
            # update the list of files we've downloaded - done in `finally`
            # so progress survives an unexpected error part-way through.
            if grabbed_this_run:
                with open(completed_file_path, "w") as file_handle:
                    file_handle.writelines(
                        f"{entry}\n" for entry in sorted(already_done)
                    )
        return grabbed_this_run
if __name__ == "__main__":
    # Script entry point: run one collection pass and report how many new
    # sample archives were saved.
    collector = DasMalwerkCollector()
    fetched_paths = collector.get()
    fetched_count = len(fetched_paths)
    print(f"completed. grabbed {fetched_count} new files")
    print("completed run")