From 509145f315d3b42cd3f262de454d269501bd6602 Mon Sep 17 00:00:00 2001 From: KingAkeem Date: Sun, 5 Aug 2018 17:02:21 -0400 Subject: [PATCH 01/34] Using multithreading and queues for speed increase --- modules/getweblinks.py | 40 +++++++++++++++++++++++++++++----------- modules/net_utils.py | 8 -------- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/modules/getweblinks.py b/modules/getweblinks.py index 79b7883d..fa6b033e 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -1,7 +1,8 @@ from modules.net_utils import get_urls_from_page, get_url_status -from modules import pagereader from bs4 import BeautifulSoup from modules.bcolors import Bcolors +from threading import Thread +from queue import Queue def add_green(link): @@ -34,21 +35,38 @@ def get_links(soup, ext=False, live=False): print('------------------------------------') if live: - for link in websites: - if get_url_status(link) != 0: - coloredlink = add_green(link) - page = pagereader.read_first_page(link)[0] - if page is not None and page.title is not None: - print_row(coloredlink, page.title.string) - else: - coloredlink = add_red(link) - print_row(coloredlink, "Not found") - + display_link_status(websites) return websites else: raise(Exception('Method parameter is not of instance BeautifulSoup')) +def display_links(q): + while True: + link = q.get() + resp = get_url_status(link) + if resp != 0: + title = BeautifulSoup(resp.text, 'html.parser').title + coloredlink = add_green(link) + print_row(coloredlink, title) + else: + coloredlink = add_red(link) + print_row(coloredlink, "Not found") + q.task_done() + + +def display_link_status(websites): + q = Queue(len(websites)*2) + for _ in websites: + t = Thread(target=display_links, args=(q,)) + t.daemon = True + t.start() + + for link in websites: + q.put(link) + q.join() + + def print_row(url, description): print("%-80s %-30s" % (url, description)) diff --git a/modules/net_utils.py b/modules/net_utils.py index be71d46d..09e9f43d 100644 --- a/modules/net_utils.py +++ b/modules/net_utils.py @@ -2,14 +2,6 @@ import requests -def check_connection(url): - print("Attempting to connect to {site}".format(site=url)) - if get_url_status(url) != 0: - return 1 - - return 0 - - def get_url_status(url, headers=False): """ Uses head request because it uses less bandwith than get and timeout is From e068cd75112588a48da3548c11db8d9bc927e21f Mon Sep 17 00:00:00 2001 From: KingAkeem Date: Sun, 5 Aug 2018 18:10:53 -0400 Subject: [PATCH 02/34] Removing daemon threads for safety --- modules/getweblinks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/getweblinks.py b/modules/getweblinks.py index fa6b033e..f2dd89a0 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -60,7 +60,6 @@ def display_link_status(websites): q = Queue(len(websites)*2) for _ in websites: t = Thread(target=display_links, args=(q,)) - t.daemon = True t.start() for link in websites: From 434854a6d24fa8de894bc1c59a6bcefb25fbea90 Mon Sep 17 00:00:00 2001 From: KingAkeem Date: Mon, 6 Aug 2018 01:13:07 -0400 Subject: [PATCH 03/34] Made multi-threading queue more generalized so it can't be used in other areas. 
Added Daemon threads back to remove hanging if blocking occurs once the rest of the program has terminated --- modules/getweblinks.py | 61 +++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/modules/getweblinks.py b/modules/getweblinks.py index f2dd89a0..744604e5 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -17,14 +17,15 @@ def add_red(link): def get_links(soup, ext=False, live=False): """ - Searches through all (hyperlinks) tags and stores them in a - list then validates if the url is formatted correctly. - + Returns list of links listed on the webpage of the soup passed. If live + is set to true then it will also print the status of each of the links + and setting ext to an actual extension such as '.com' will allow those + extensions to be recognized as valid urls and not just '.tor'. Args: - soup: BeautifulSoup instance currently being used. + soup (bs4.BeautifulSoup): webpage to be searched for links. Returns: - websites: List of websites that were found + websites (list(str)): List of websites that were found """ b_colors = Bcolors() if isinstance(soup, BeautifulSoup): @@ -35,35 +36,51 @@ def get_links(soup, ext=False, live=False): print('------------------------------------') if live: - display_link_status(websites) + queue_tasks(websites, display_link) return websites else: raise(Exception('Method parameter is not of instance BeautifulSoup')) -def display_links(q): +def display_link(link): + resp = get_url_status(link) + if resp != 0: + title = BeautifulSoup(resp.text, 'html.parser').title + coloredlink = add_green(link) + print_row(coloredlink, title) + else: + coloredlink = add_red(link) + print_row(coloredlink, "Not found") + + +def execute_tasks(q, task_func, tasks_args=tuple()): while True: - link = q.get() - resp = get_url_status(link) - if resp != 0: - title = BeautifulSoup(resp.text, 'html.parser').title - coloredlink = add_green(link) - print_row(coloredlink, title) + task = q.get() + if tasks_args: + task_func(task, tasks_args) else: - coloredlink = add_red(link) - print_row(coloredlink, "Not found") + task_func(task) q.task_done() -def display_link_status(websites): - q = Queue(len(websites)*2) - for _ in websites: - t = Thread(target=display_links, args=(q,)) - t.start() +def queue_tasks(tasks, task_func, tasks_args=tuple()): + q = Queue(len(tasks)*2) + for _ in tasks: + if tasks_args: + if isinstance(tasks_args, tuple): + t = Thread(target=execute_tasks, args=(q, task_func, tasks_args)) + t.daemon = True + t.start() + else: + raise(Exception('Function arguments must be passed in the form of a tuple.')) + else: + t = Thread(target=execute_tasks, args=(q, task_func)) + t.daemon = True + t.start() - for link in websites: - q.put(link) + for task in tasks: + q.put(task) q.join() From ac2986b9078a1978012c863fd89e6d0ec38ad6ec Mon Sep 17 00:00:00 2001 From: KingAkeem Date: Mon, 6 Aug 2018 01:28:11 -0400 Subject: [PATCH 04/34] Adding comments --- modules/getweblinks.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/modules/getweblinks.py b/modules/getweblinks.py index 744604e5..c5237296 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -21,6 +21,7 @@ def get_links(soup, ext=False, live=False): is set to true then it will also print the status of each of the links and setting ext to an actual extension such as '.com' will allow those extensions to be recognized as valid urls and not just '.tor'. 
+ Args: soup (bs4.BeautifulSoup): webpage to be searched for links. @@ -44,6 +45,16 @@ def get_links(soup, ext=False, live=False): def display_link(link): + """ + Prints the status of a link based on if it can be reached using a GET + request. Link is printed with a color based on status. + Green for a reachable status code and red for not reachable. + + Args: + link (str): url to be printed + Returns: + None + """ resp = get_url_status(link) if resp != 0: title = BeautifulSoup(resp.text, 'html.parser').title @@ -55,6 +66,17 @@ def display_link(link): def execute_tasks(q, task_func, tasks_args=tuple()): + """ + Executes tasks inside of queue using function and arguments passed + inside of threads + + Args: + q (queue.Queue): contains tasks + task_func (function): function to be executed on tasks and args + task_args (tuple): contains arguments for function + Returns: + None + """ while True: task = q.get() if tasks_args: @@ -65,6 +87,17 @@ def execute_tasks(q, task_func, tasks_args=tuple()): def queue_tasks(tasks, task_func, tasks_args=tuple()): + """ + Starts threads with tasks and queue, then queues tasks and spawned threads + begin to pull tasks off queue to execute + + Args: + tasks (list): lists of values that you'd like to operate on + task_func (function): function that you would like to use + tasks_args (tuple): arguments for function + Returns: + None + """ q = Queue(len(tasks)*2) for _ in tasks: if tasks_args: From 7f96121a78a3b754d273e91f3ad8af57837550c5 Mon Sep 17 00:00:00 2001 From: KingAkeem Date: Mon, 6 Aug 2018 01:43:24 -0400 Subject: [PATCH 05/34] Using title string instead of html --- modules/getweblinks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/getweblinks.py b/modules/getweblinks.py index c5237296..add1302d 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -57,7 +57,7 @@ def display_link(link): """ resp = get_url_status(link) if resp != 0: - title = BeautifulSoup(resp.text, 'html.parser').title + title = BeautifulSoup(resp.text, 'html.parser').title.string coloredlink = add_green(link) print_row(coloredlink, title) else: From 148002c4164b39bc8d10a3888124c5d5cb855f06 Mon Sep 17 00:00:00 2001 From: KingAkeem Date: Tue, 7 Aug 2018 21:27:33 -0400 Subject: [PATCH 06/34] Added BFS traveral function for links --- modules/getweblinks.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/modules/getweblinks.py b/modules/getweblinks.py index add1302d..c748214e 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -1,3 +1,5 @@ +import requests + from modules.net_utils import get_urls_from_page, get_url_status from bs4 import BeautifulSoup from modules.bcolors import Bcolors @@ -5,6 +7,44 @@ from queue import Queue +def traverse_links(links, ext, depth=0, stop_depth=None, targetLink=None): + """ + Traverses links passed using Breadth First Search. You can specify stop depth + or specify a target to look for. 
The depth argument is used for recursion + + Args: + links (list): list of urls to traverse + ext (string): string representing extension to use for URLs + depth (int): used for recursion + stop_depth (int): stops traversing at this depth if specified + targetLink (string): stops at this link if specified + + Returns: + depth (int): depth stopped at + """ + + if depth == stop_depth: + return depth + + toVisit = list() + for link in links: + if targetLink == link and targetLink: + return depth + resp = requests.get(link) + soup = BeautifulSoup(resp.text, 'html.parser') + websitesToVisit = get_urls_from_page(soup, extension=ext) + for site in websitesToVisit: + toVisit.append(site) + depth += 1 + traverse_links(toVisit, ext, depth) + + +def search_page(html_text, ext, stop=None): + soup = BeautifulSoup(html_text, 'html.parser') + links = get_urls_from_page(soup, extension=ext) + traverse_links(links, ext, stop=stop) if stop else traverse_links(links, ext) + + def add_green(link): colors = Bcolors() return '\t' + colors.OKGREEN + link + colors.ENDC From 10e3cb918d0cafba26cf56675e8ba966a4a85feb Mon Sep 17 00:00:00 2001 From: KingAkeem Date: Tue, 7 Aug 2018 21:38:13 -0400 Subject: [PATCH 07/34] Fixing for CodeFactor --- modules/getweblinks.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/getweblinks.py b/modules/getweblinks.py index c748214e..902bab7b 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -42,7 +42,10 @@ def traverse_links(links, ext, depth=0, stop_depth=None, targetLink=None): def search_page(html_text, ext, stop=None): soup = BeautifulSoup(html_text, 'html.parser') links = get_urls_from_page(soup, extension=ext) - traverse_links(links, ext, stop=stop) if stop else traverse_links(links, ext) + if stop: + traverse_links(links, ext, stop=stop) + else: + traverse_links(links, ext) def add_green(link): From dd8e6a8572a1ebf9a2d36fbdd14d5904d8e5d7cd Mon Sep 17 00:00:00 2001 From: KingAkeem Date: Tue, 7 Aug 2018 21:55:45 -0400 Subject: [PATCH 08/34] Adding more test coverage --- tests/test_getemails.py | 21 ++++++++++++-- tests/test_getweblinks.py | 60 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/tests/test_getemails.py b/tests/test_getemails.py index 7082a01c..3306b957 100644 --- a/tests/test_getemails.py +++ b/tests/test_getemails.py @@ -1,15 +1,31 @@ import sys sys.path.append('../') -import pytest import modules.getemails as getemails from bs4 import BeautifulSoup from yattag import Doc +def test_get_emails_fail(): + doc, tag, _, line = Doc().ttl() + doc.asis('') + with tag('html'): + with tag('body'): + line('a', 'test_anchor') + + mock_html = doc.getvalue() + + mock_soup = BeautifulSoup(mock_html, 'html.parser') + emails = getemails.getMails(mock_soup) + assert emails == [] + + def test_get_emails(): - test_emails = ['hello@helloaddress.com'] + test_emails = ['hello@helloaddress.com', + 'test@testemail.com', + 'foo@bar.com', + 'lol@me.biz'] doc, tag, _, line = Doc().ttl() doc.asis('') with tag('html'): @@ -26,6 +42,7 @@ def test_get_emails(): def test_run(): test_get_emails() + test_get_emails_fail() if __name__ == '__main__': diff --git a/tests/test_getweblinks.py b/tests/test_getweblinks.py index 61903790..48dab5e2 100644 --- a/tests/test_getweblinks.py +++ b/tests/test_getweblinks.py @@ -10,9 +10,59 @@ @pytest.fixture -def test_get_links(): +def test_get_links_fail(): + test_data = ['ssh://aff.ironsocket.tor', + 'ftp://aff.ironsocket.tor', + 'lol://wsrs.tor', + 
'dial://cmsgear.tor'] + + doc, tag, _, line = Doc().ttl() + doc.asis('') + with tag('html'): + with tag('body'): + for data in test_data: + line('a', 'test_anchor', href=data) + + mock_html = doc.getvalue() + + mock_soup = BeautifulSoup(mock_html, 'html.parser') + with requests_mock.Mocker() as mock_connection: + for data in test_data: + mock_connection.register_uri('GET', data, text='Received') + + result = getweblinks.get_links(mock_soup, ext=['.tor']) + assert result == [] + + +@pytest.fixture +def test_get_links_tor(): + test_data = ['https://aff.ironsocket.tor', + 'https://aff.ironsocket.tor', + 'https://wsrs.tor', + 'https://cmsgear.tor'] + + doc, tag, _, line = Doc().ttl() + doc.asis('') + with tag('html'): + with tag('body'): + for data in test_data: + line('a', 'test_anchor', href=data) + + mock_html = doc.getvalue() + + mock_soup = BeautifulSoup(mock_html, 'html.parser') + with requests_mock.Mocker() as mock_connection: + for data in test_data: + mock_connection.register_uri('GET', data, text='Received') + + result = getweblinks.get_links(mock_soup, ext=['.tor']) + assert result == test_data + + +@pytest.fixture +def test_get_links_ext(): test_data = ['https://aff.ironsocket.com/SH7L', - 'https://aff.ironsocket.com/SH7L', + 'https://aff.ironsocket.gov/SH7L', 'https://wsrs.net/', 'https://cmsgear.com/'] @@ -30,12 +80,14 @@ def test_get_links(): for data in test_data: mock_connection.register_uri('GET', data, text='Received') - result = getweblinks.get_links(mock_soup, ext=['.com', '.net']) + result = getweblinks.get_links(mock_soup, ext=['.com', '.gov', '.net']) assert result == test_data def test_run(): - test_get_links() + test_get_links_fail() + test_get_links_tor() + test_get_links_ext() if __name__ == '__main__': From 86c5aee727c107fd80aa1e8eb85a2db4f0b3156f Mon Sep 17 00:00:00 2001 From: KingAkeem Date: Tue, 7 Aug 2018 22:02:43 -0400 Subject: [PATCH 09/34] Fixing issue with CodeFactor --- tests/test_getweblinks.py | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/tests/test_getweblinks.py b/tests/test_getweblinks.py index 48dab5e2..66c3e4a3 100644 --- a/tests/test_getweblinks.py +++ b/tests/test_getweblinks.py @@ -9,22 +9,25 @@ from yattag import Doc -@pytest.fixture -def test_get_links_fail(): - test_data = ['ssh://aff.ironsocket.tor', - 'ftp://aff.ironsocket.tor', - 'lol://wsrs.tor', - 'dial://cmsgear.tor'] - +def setup_html(test_links): doc, tag, _, line = Doc().ttl() doc.asis('') with tag('html'): with tag('body'): - for data in test_data: + for data in test_links: line('a', 'test_anchor', href=data) - mock_html = doc.getvalue() + return doc.getvalue() + + +@pytest.fixture +def test_get_links_fail(): + test_data = ['ssh://aff.ironsocket.tor', + 'ftp://aff.ironsocket.tor', + 'lol://wsrs.tor', + 'dial://cmsgear.tor'] + mock_html = setup_html(test_data) mock_soup = BeautifulSoup(mock_html, 'html.parser') with requests_mock.Mocker() as mock_connection: for data in test_data: @@ -41,15 +44,7 @@ def test_get_links_tor(): 'https://wsrs.tor', 'https://cmsgear.tor'] - doc, tag, _, line = Doc().ttl() - doc.asis('') - with tag('html'): - with tag('body'): - for data in test_data: - line('a', 'test_anchor', href=data) - - mock_html = doc.getvalue() - + mock_html = setup_html(test_data) mock_soup = BeautifulSoup(mock_html, 'html.parser') with requests_mock.Mocker() as mock_connection: for data in test_data: From 08d58cea9b56fee90e4e1245062c93c113c38574 Mon Sep 17 00:00:00 2001 From: KingAkeem Date: Tue, 7 Aug 2018 22:28:41 -0400 
Subject: [PATCH 10/34] Adding support for optional parameters --- modules/getweblinks.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/getweblinks.py b/modules/getweblinks.py index 902bab7b..51bcde81 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -36,7 +36,14 @@ def traverse_links(links, ext, depth=0, stop_depth=None, targetLink=None): for site in websitesToVisit: toVisit.append(site) depth += 1 - traverse_links(toVisit, ext, depth) + if stop_depth and targetLink: + traverse_links(toVisit, ext, depth, stop_depth, targetLink) + elif stop_depth: + traverse_links(toVisit, ext, depth, stop_depth=stop_depth) + elif targetLink: + traverse_links(toVisit, ext, depth, targetLink=targetLink) + else: + traverse_links(toVisit, ext, depth) def search_page(html_text, ext, stop=None): From b1586d4b9d383487eaa42e2000eee881debb7370 Mon Sep 17 00:00:00 2001 From: KingAkeem Date: Wed, 8 Aug 2018 22:29:18 -0400 Subject: [PATCH 11/34] Adding exception handling for get requests within traversal function --- modules/getweblinks.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/getweblinks.py b/modules/getweblinks.py index 51bcde81..246f78f9 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -30,7 +30,10 @@ def traverse_links(links, ext, depth=0, stop_depth=None, targetLink=None): for link in links: if targetLink == link and targetLink: return depth - resp = requests.get(link) + try: + resp = requests.get(link) + except Exception: + pass soup = BeautifulSoup(resp.text, 'html.parser') websitesToVisit = get_urls_from_page(soup, extension=ext) for site in websitesToVisit: From eb83451a413f31315149fcc5dc372e7843c78965 Mon Sep 17 00:00:00 2001 From: KingAkeem Date: Wed, 8 Aug 2018 22:30:34 -0400 Subject: [PATCH 12/34] Skip to next element if GET requests fail for traversal --- modules/getweblinks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/getweblinks.py b/modules/getweblinks.py index 246f78f9..7837e94f 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -33,7 +33,7 @@ def traverse_links(links, ext, depth=0, stop_depth=None, targetLink=None): try: resp = requests.get(link) except Exception: - pass + continue soup = BeautifulSoup(resp.text, 'html.parser') websitesToVisit = get_urls_from_page(soup, extension=ext) for site in websitesToVisit: From 063a2099c4ce60d9e5966e5c508c85bc781d8c7f Mon Sep 17 00:00:00 2001 From: KingAkeem Date: Wed, 8 Aug 2018 22:40:50 -0400 Subject: [PATCH 13/34] Adding specific exceptions to satisfy CodeFactor --- modules/getweblinks.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/getweblinks.py b/modules/getweblinks.py index 7837e94f..de95425a 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -1,5 +1,6 @@ import requests +from requests import HTTPError, ConnectionError from modules.net_utils import get_urls_from_page, get_url_status from bs4 import BeautifulSoup from modules.bcolors import Bcolors @@ -32,8 +33,8 @@ def traverse_links(links, ext, depth=0, stop_depth=None, targetLink=None): return depth try: resp = requests.get(link) - except Exception: - continue + except (HTTPError, ConnectionError): + continue soup = BeautifulSoup(resp.text, 'html.parser') websitesToVisit = get_urls_from_page(soup, extension=ext) for site in websitesToVisit: From 86ae5f9c18848c2fb297e19dca426ab32b70ee26 Mon Sep 17 00:00:00 2001 From: PS NaraYanan Date: Sat, 11 Aug 2018 20:49:10 +0530 Subject: [PATCH 14/34] 
Updated Req.txt --- __pycache__/settings.cpython-36.pyc | Bin 251 -> 262 bytes modules/savedb.py | 2 +- requirements.txt | 2 ++ 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/__pycache__/settings.cpython-36.pyc b/__pycache__/settings.cpython-36.pyc index 771a3a92c9ec09598f2b6dc06bd30c22f98b2da6..767ab548b3552581077c9353c0df778269e78340 100644 GIT binary patch delta 52 zcmey(*v7bD*0Wr?5!7(BEMNau4F%uuD000JW4TJyy diff --git a/modules/savedb.py b/modules/savedb.py index 8e0b2757..daf95ddc 100644 --- a/modules/savedb.py +++ b/modules/savedb.py @@ -28,7 +28,7 @@ def saveToDatabase(database, user, password, links): cur = db.cursor() except Exception as e: - print "Error '{0}' occurred. Arguments {1}.".format(e.message, e.args): + print("Error '{0}' occurred. Arguments {1}.".format(e.message, e.args)) try: query = """ CREATE TABLE IF NOT EXISTS `tor_url` ( diff --git a/requirements.txt b/requirements.txt index 253ef4ed..4a639d54 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,5 @@ requests==2.18.4 requests_mock==1.4.0 tldextract==2.2.0 yattag==1.10.0 +mysqlclient==1.3.13 +python-dotenv==0.9.1 From 9cef70d73f0cc463b2409588265a325380dfe8a5 Mon Sep 17 00:00:00 2001 From: PS NaraYanan Date: Sat, 11 Aug 2018 20:51:45 +0530 Subject: [PATCH 15/34] CodeFactor Fix --- modules/savedb.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/savedb.py b/modules/savedb.py index daf95ddc..5765ef57 100644 --- a/modules/savedb.py +++ b/modules/savedb.py @@ -48,4 +48,3 @@ def saveToDatabase(database, user, password, links): finally: cur.close() db.close() - From f147ed5d424dc8771ca179e28875673e9fb6f5c8 Mon Sep 17 00:00:00 2001 From: PS NaraYanan Date: Sat, 11 Aug 2018 21:00:15 +0530 Subject: [PATCH 16/34] removing __pycache__ --- .gitignore | 1 + __pycache__/settings.cpython-36.pyc | Bin 262 -> 0 bytes 2 files changed, 1 insertion(+) delete mode 100644 __pycache__/settings.cpython-36.pyc diff --git a/.gitignore b/.gitignore index b82cfc55..be5ff8df 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ tests/.ropeproject/ *.pyc .pytestc* .pytest_cache +__pycache__/ # Misc torBot diff --git a/__pycache__/settings.cpython-36.pyc b/__pycache__/settings.cpython-36.pyc deleted file mode 100644 index 767ab548b3552581077c9353c0df778269e78340..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 262 zcmXr!<>iw4mlxf^z`*brh~a=4$Z`PUVk01t!jQt4!;s4u#mER^GvzSlGDk5pGNdx6 zFgG)%ut+mRv7|DlvZS!~f>f{tGib8E1nOhZWW2?llb@IppORmanpXzovIHcSWZYuZ zt76du3q)~4q~Z&J;(nT}QEU)?5fe~*6gx;XCo`!C Date: Tue, 14 Aug 2018 23:19:34 +0530 Subject: [PATCH 17/34] MySQLdb --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4a639d54..bbeb1834 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,5 +5,5 @@ requests==2.18.4 requests_mock==1.4.0 tldextract==2.2.0 yattag==1.10.0 -mysqlclient==1.3.13 +MySQL-python==1.2.5 python-dotenv==0.9.1 From f65c50c22c6ef10d9deaef56e46737deca35da8e Mon Sep 17 00:00:00 2001 From: PS NaraYanan Date: Tue, 14 Aug 2018 23:35:28 +0530 Subject: [PATCH 18/34] libmysqlclient-dev --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bbeb1834..88ff84f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,5 +5,5 @@ requests==2.18.4 requests_mock==1.4.0 tldextract==2.2.0 yattag==1.10.0 -MySQL-python==1.2.5 +libmysqlclient-dev==5.5.6 python-dotenv==0.9.1 From d2c0aeff9b7eafdd2dc2b933a634505d0cc86ef3 Mon 
Sep 17 00:00:00 2001 From: PS NaraYanan Date: Wed, 15 Aug 2018 23:05:46 +0530 Subject: [PATCH 19/34] Adding FAQ.md --- .gitignore | 1 + FAQ.md | 1 + requirements.txt | 1 - 3 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 FAQ.md diff --git a/.gitignore b/.gitignore index be5ff8df..f3fc5792 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ tests/.ropeproject/ *.pyc .pytestc* .pytest_cache +__pycache* __pycache__/ # Misc diff --git a/FAQ.md b/FAQ.md new file mode 100644 index 00000000..d0511ba4 --- /dev/null +++ b/FAQ.md @@ -0,0 +1 @@ +## FAQ diff --git a/requirements.txt b/requirements.txt index 88ff84f3..0c415d7b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,5 +5,4 @@ requests==2.18.4 requests_mock==1.4.0 tldextract==2.2.0 yattag==1.10.0 -libmysqlclient-dev==5.5.6 python-dotenv==0.9.1 From a5e75d04ac17580d26f2912da13b73720425b2a5 Mon Sep 17 00:00:00 2001 From: KingAkeem Date: Sun, 19 Aug 2018 10:46:52 -0400 Subject: [PATCH 20/34] Revert "Merge pull request #75 from tiagoCMatias/feature-db" This reverts commit e2e7690adad8dfb624bac3a34819003c49354170, reversing changes made to 8c7263bea3fe7c35cfaacf21b728f797896417f0. --- .env | 4 ---- settings.py | 4 ---- torBot.py | 23 +++-------------------- 3 files changed, 3 insertions(+), 28 deletions(-) delete mode 100644 .env delete mode 100644 settings.py diff --git a/.env b/.env deleted file mode 100644 index 95bb29c6..00000000 --- a/.env +++ /dev/null @@ -1,4 +0,0 @@ -URL="YOUR_URL" -DATABASE_NAME="YOUR_DATABASE_NAME" -DATABASE_USERNAME="YOUR_DATABASE_USERNAME" -DATABASE_PASSWORD="YOUR_DATABASE_PASSWORD" \ No newline at end of file diff --git a/settings.py b/settings.py deleted file mode 100644 index cd9e2f91..00000000 --- a/settings.py +++ /dev/null @@ -1,4 +0,0 @@ -from dotenv import load_dotenv -from pathlib import Path # python3 only -env_path = Path('.') / '.env' -load_dotenv(dotenv_path=env_path) diff --git a/torBot.py b/torBot.py index a6185a35..c7edbc52 100644 --- a/torBot.py +++ b/torBot.py @@ -1,10 +1,8 @@ import argparse import socket import socks -import os -import settings from modules import (bcolors, getemails, pagereader, getweblinks, updater, - info, savefile, savedb) + info, savefile) # GLOBAL CONSTS LOCALHOST = "127.0.0.1" @@ -116,10 +114,7 @@ def get_args(): action="store_true", help=' '.join(("Info displays basic info of the", "scanned site, (very slow)"))) - parser.add_argument("-db", "--database", - action="store_true", - help="Specify a database to connect.") - args = parser.parse_args() + return parser.parse_args() def main(conn=False): @@ -135,16 +130,6 @@ def main(conn=False): if args.update: updater.updateTor() exit() - if args.database: - if 'DATABASE_NAME' and 'DATABASE_USERNAME' and 'DATABASE_PASSWORD' in os.environ: - DATABASE_PASSWORD = os.getenv("DATABASE_PASSWORD") - DATABASE_USERNAME = os.getenv("DATABASE_USERNAME") - DATABASE_NAME = os.getenv("DATABASE_NAME") - #print("DB - ", DATABASE_NAME, " - ", DATABASE_USERNAME, " - ", DATABASE_PASSWORD) - else: - print("Wrong Database Configurations") - exit() - if not args.quiet: header() # If url flag is set then check for accompanying flag set. 
Only one @@ -170,11 +155,9 @@ def main(conn=False): links = getweblinks.get_links(soup=html_content, ext=args.extension, live=args.live) if args.save: savefile.saveJson("Links", links) - if(args.database): - savedb.saveToDatabase(DATABASE_NAME, DATABASE_USERNAME, DATABASE_PASSWORD, links) else: print("usage: torBot.py [-h] [-v] [--update] [-q] [-u URL] [-s] [-m]", - "[-e EXTENSION] [-l] [-i] [-db]") + "[-e EXTENSION] [-l] [-i]") print("\n\n") From 6643cc012e0b94a92f7261585b98b6589d6a2807 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Sun, 19 Aug 2018 10:53:44 -0400 Subject: [PATCH 21/34] Delete savedb.py --- modules/savedb.py | 50 ----------------------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 modules/savedb.py diff --git a/modules/savedb.py b/modules/savedb.py deleted file mode 100644 index 5765ef57..00000000 --- a/modules/savedb.py +++ /dev/null @@ -1,50 +0,0 @@ -import MySQLdb - - -def saveToDatabase(database, user, password, links): - """ - Connects to a MYSQL DB - Create MYSQL Table - Add link to the DB - - Args: - data = data that is being stored in the database - user = username to login into MYSQL - password = password of MYSQL - link = URLs from the crawler - """ - if not database and not user and not password: - print("Wrong DB Credentials") - exit() - - #Debug - #print("Database:", database, "\nuser:",user, "\npass:",password) - try: - db = MySQLdb.connect(host="localhost", # your host - user=user, # your username - passwd=password, # your password - db=database) # name of the data base - - cur = db.cursor() - - except Exception as e: - print("Error '{0}' occurred. Arguments {1}.".format(e.message, e.args)) - - try: - query = """ CREATE TABLE IF NOT EXISTS `tor_url` ( - id INT(6) UNSIGNED AUTO_INCREMENT PRIMARY KEY, - link VARCHAR(30) NOT NULL UNIQUE, - reg_date TIMESTAMP)""" - cur.execute(query) - for link in links: - query = "INSERT IGNORE INTO `tor_url` (link) VALUES ('{0}')".format(link) - cur.execute(query) - #print(query) - db.commit() - - except (MySQLdb.Error, MySQLdb.Warning, TypeError, ValueError) as e: - print(e) - return None - finally: - cur.close() - db.close() From 37db2c50c06445f7ef1f47ba80f6dea36f4ae56a Mon Sep 17 00:00:00 2001 From: Akeem King Date: Sun, 19 Aug 2018 13:23:34 -0400 Subject: [PATCH 22/34] Update TESTING.md --- tests/TESTING.md | 59 +----------------------------------------------- 1 file changed, 1 insertion(+), 58 deletions(-) diff --git a/tests/TESTING.md b/tests/TESTING.md index fc267bf4..8088ca43 100755 --- a/tests/TESTING.md +++ b/tests/TESTING.md @@ -1,60 +1,3 @@ # Testing Documentation -We are currently using Pytest as our testing framework so if you want to run the test suite. Run `pytest` from the base directory of TorBot. - -If you are interested in contributing to the GoBot (TorBot written in Golang instead of Python), then you can run tests by navigating to the `/modules/lib/` directory and running `go test` - -### Testing a PR Locally - -1. Make note of the PR number. For example, Rod's latest is PR #99: https://github.com/DedSecInside/TorBot/pull/99 - -2. Fetch the PR's pseudo-branch, and give it a local branch name. Here we'll name it `pr99`: - ``` - $ git fetch origin pull/99/head:pr99 - ``` - -3. Switch to that branch: - ``` - $ git checkout pr99 - ``` - -4. Compile and test. - -If the PR code changes and you want to update: - -``` -# Do this while in the pr99 branch -$ git pull origin pull/99/head -``` - -(I try to avoid `pull` and instead use `fetch`+`merge`, but... 
I don't know how to do it for this.) - -### Merging the PR - -You can use the Github web interface, but there's a [TOCTOU](https://en.wikipedia.org/wiki/Time_of_check_to_time_of_use) problem: If the pull-requester changes their master (or whatever they're PRing from) between the time you test and the time you merge, then you'll be merging code that you haven't reviewed/tested. So let's do it on the command line. - -First, checkout the upstream master code: - -You'll only do this the first time -- it creates the local `new_master` branch, tracks it to `new_master`, and switches to the branch: -``` -$ git checkout -t -b new_master origin/master -``` - -After the first time you'll just do: -``` -$ git checkout new_master -``` - -Now merge the PR: -``` -$ git merge pr99 -``` - -NOTE: You should edit the merge commit message to reference the PR (using, say `#99` in it). - -Now push: -``` -$ git push origin HEAD:master -``` - -(You can't just `git push` because your local branch name is different than the remote.) +We are currently using Pytest as our testing framework so if you want to run the test suite. Run `pytest` from the base directory of TorBot or from the `tests` directory. We're using mock objects to simulate HTTP requests and HTML webpages using `mock_requests` and `yattag` which allows us to have much faster tests that don't rely on network connections. In order to create test using these mocks, the general pattern is to create some HTML using [`yattag`](http://www.yattag.org/) and registering the data to a URL using [`requests_mock`](https://requests-mock.readthedocs.io/en/latest/) which will be used to simulate HTTP requests. From 931382258f936b3a32f6e8670cf7ded662bee543 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Sun, 19 Aug 2018 13:24:52 -0400 Subject: [PATCH 23/34] Update TESTING.md --- tests/TESTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/TESTING.md b/tests/TESTING.md index 8088ca43..02791f81 100755 --- a/tests/TESTING.md +++ b/tests/TESTING.md @@ -1,3 +1,3 @@ # Testing Documentation -We are currently using Pytest as our testing framework so if you want to run the test suite. Run `pytest` from the base directory of TorBot or from the `tests` directory. We're using mock objects to simulate HTTP requests and HTML webpages using `mock_requests` and `yattag` which allows us to have much faster tests that don't rely on network connections. In order to create test using these mocks, the general pattern is to create some HTML using [`yattag`](http://www.yattag.org/) and registering the data to a URL using [`requests_mock`](https://requests-mock.readthedocs.io/en/latest/) which will be used to simulate HTTP requests. +We are currently using [`pytest`](https://docs.pytest.org/en/latest/) as our testing framework so if you want to run the test suite. Run `pytest` from the base directory of TorBot or from the `tests` directory. We're using mock objects to simulate HTTP requests and HTML webpages using `mock_requests` and `yattag` which allows us to have much faster tests that don't rely on network connections. In order to create test using these mocks, the general pattern is to create some HTML using [`yattag`](http://www.yattag.org/) and registering the data to a URL using [`requests_mock`](https://requests-mock.readthedocs.io/en/latest/) which will be used to simulate HTTP requests. 
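The TESTING.md text above outlines the general mock-based pattern; the sketch below is one minimal illustration of it rather than code taken from the repository — the helper name `build_mock_page`, the test URL, and the assertion are assumptions added for clarity.

```python
# Minimal sketch of the yattag + requests_mock pattern described in TESTING.md.
# The helper name, test URL, and assertion are illustrative assumptions.
import requests
import requests_mock
from bs4 import BeautifulSoup
from yattag import Doc


def build_mock_page(links):
    """Builds a tiny HTML page containing one anchor tag per link."""
    doc, tag, _, line = Doc().ttl()
    doc.asis('<!DOCTYPE html>')
    with tag('html'):
        with tag('body'):
            for href in links:
                line('a', 'test_anchor', href=href)
    return doc.getvalue()


def test_mocked_get_request():
    url = 'http://example.com/'  # hypothetical URL used only for the mock
    html = build_mock_page([url])
    with requests_mock.Mocker() as mocker:
        # Register the fake page so requests.get never touches the network
        mocker.register_uri('GET', url, text=html)
        resp = requests.get(url)
        soup = BeautifulSoup(resp.text, 'html.parser')
        assert soup.find('a').get('href') == url
```

This mirrors the fixtures in tests/test_getweblinks.py above, which register each test link with `mock_connection.register_uri` before exercising `getweblinks.get_links`.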
From a9b4cf8a28fe9652e6d5b67363017d3a7cf178e8 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Sun, 19 Aug 2018 13:34:42 -0400 Subject: [PATCH 24/34] Update README.md --- README.md | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index dc3db793..f814ab5b 100755 --- a/README.md +++ b/README.md @@ -62,18 +62,21 @@ If its a new module, it should be put inside the modules directory and imported The branch name should be your new feature name in the format . For example, Feature_FasterCrawl_1.0. Contributor name will be updated to the below list. :D -## Dependencies -1. Tor -2. Python 3.x (Make sure pip3 is installed) -3. requests -4. Beautiful Soup 4 -5. Socket -6. Sock -7. Argparse -8. Git -9. termcolor -10. tldextract -11. Golang +### OS Dependencies +- Tor +- Python 3.x +- Golang 1.x (Not Currently Used) + +### Python Dependencies +- beautifulsoup4 +- PySocks +- termcolor +- requests +- python-dotenv +- tldextract +- requests_mock +- yattag + ## Basic setup Before you run the torBot make sure the following things are done properly: @@ -83,6 +86,9 @@ Before you run the torBot make sure the following things are done properly: * Make sure that your torrc is configured to SOCKS_PORT localhost:9050 +* Install TorBot Python requirements +`pip3 install -r requirements.txt` + On Linux platforms, you can make an executable for TorBot by using the install.sh script. You will need to give the script the correct permissions using `chmod +x install.sh` Now you can run `./install.sh` to create the torBot binary. @@ -116,9 +122,10 @@ Read more about torrc here : [Torrc](https://github.com/DedSecInside/TorBoT/blob ## TO-DO - [ ] Visualization Module -- [ ] Implement A\* Search for webcrawler -- [X] Multithreading -- [ ] Optimization +- [x] Implement BFS Search for webcrawler +- [X] Multithreading for Get Links +- [ ] Improve stability (Handle errors gracefully, expand test coverage and etc.) 
+- [ ] Create a user-friendly GUI - [ ] Randomize Tor Connection (Random Header and Identity) - [ ] Keyword/Phrase search - [ ] Social Media Integration From 17b9902ed238b647873023e9f6963b280bd757ae Mon Sep 17 00:00:00 2001 From: PS Narayanan Date: Sun, 26 Aug 2018 17:32:14 +0530 Subject: [PATCH 25/34] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f814ab5b..e1c8601a 100755 --- a/README.md +++ b/README.md @@ -160,13 +160,13 @@ GNU Public License ## CREDITS - [X] [P5N4PPZ](https://github.com/PSNAppz) - Owner +- [X] [KingAkeem](https://github.com/KingAkeem) - Experienced Contributor,Reviewer,Core Member - [X] [agrepravin](https://github.com/agrepravin) - Contributor,Reviewer -- [X] [KingAkeem](https://github.com/KingAkeem) - Experienced Contributor,Reviewer +- [X] [shivankar-madaan](https://github.com/shivankar-madaan) - Experienced Contributor - [X] [y-mehta](https://github.com/y-mehta) - Contributor - [X] [Manfredi Martorana](https://github.com/Agostinelli) - Contributor - [X] [Evan Sia Wai Suan](https://github.com/waisuan) - New Contributor - [X] [Lean](https://github.com/leaen) - New Contributor -- [X] [shivankar-madaan](https://github.com/shivankar-madaan) - New Contributor - [X] [Gus](https://github.com/HotPushUpGuy420) - New Contributor - [X] [SubaruSama](https://github.com/SubaruSama) - New Contributor - [X] [robly78746](https://github.com/robly78746) - New Contributor From 82d36bba9741a1a3458ef4f238eba646d77a17f5 Mon Sep 17 00:00:00 2001 From: KingAkeem Date: Sun, 26 Aug 2018 16:14:22 -0400 Subject: [PATCH 26/34] Refactoring --- modules/getemails.py | 2 +- modules/getweblinks.py | 175 ++++++++++-------------------------- modules/net_utils.py | 62 ------------- modules/pagereader.py | 42 ++++++++- modules/utils.py | 200 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 285 insertions(+), 196 deletions(-) delete mode 100644 modules/net_utils.py create mode 100644 modules/utils.py diff --git a/modules/getemails.py b/modules/getemails.py index bd9f84ca..0ed30643 100644 --- a/modules/getemails.py +++ b/modules/getemails.py @@ -1,5 +1,5 @@ from modules.bcolors import Bcolors -from modules.net_utils import get_urls_from_page +from modules.pagereader import get_urls_from_page from bs4 import BeautifulSoup diff --git a/modules/getweblinks.py b/modules/getweblinks.py index de95425a..4db3057e 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -1,72 +1,64 @@ -import requests - -from requests import HTTPError, ConnectionError -from modules.net_utils import get_urls_from_page, get_url_status from bs4 import BeautifulSoup from modules.bcolors import Bcolors -from threading import Thread -from queue import Queue +from modules.utils import is_url, is_onion_url, bfs_urls, queue_tasks, display_link -def traverse_links(links, ext, depth=0, stop_depth=None, targetLink=None): +def get_urls_from_page(page_soup, email=False, extension=False): """ - Traverses links passed using Breadth First Search. You can specify stop depth - or specify a target to look for. The depth argument is used for recursion + Searches for urls on page using the anchor tag and href attribute, + also searchs for emails using 'mailto' if specified. 
Args: - links (list): list of urls to traverse - ext (string): string representing extension to use for URLs - depth (int): used for recursion - stop_depth (int): stops traversing at this depth if specified - targetLink (string): stops at this link if specified + page (bs4.BeauitulSoup): html soup to search + email (bool): flag whether to collect emails as well + extension (bool): flag whether to use additional extensions Returns: - depth (int): depth stopped at + urls (list): urls found on page """ + if not isinstance(page_soup, BeautifulSoup): + raise(Exception("First arg must be bs4.BeautifulSoup object")) + + urls = [] + anchors_on_page = page_soup.find_all('a') + for anchor_tag in anchors_on_page: + url = anchor_tag.get('href') + if extension: + if url and is_url(url) == 1: + urls.append(url) + elif email: + if url and 'mailto' in url: + email_addr = url.split(':') + if len(email_addr) > 1: + urls.append(email_addr[1]) + else: + if url and is_onion_url(url) == 1: + urls.append(url) - if depth == stop_depth: - return depth - - toVisit = list() - for link in links: - if targetLink == link and targetLink: - return depth - try: - resp = requests.get(link) - except (HTTPError, ConnectionError): - continue - soup = BeautifulSoup(resp.text, 'html.parser') - websitesToVisit = get_urls_from_page(soup, extension=ext) - for site in websitesToVisit: - toVisit.append(site) - depth += 1 - if stop_depth and targetLink: - traverse_links(toVisit, ext, depth, stop_depth, targetLink) - elif stop_depth: - traverse_links(toVisit, ext, depth, stop_depth=stop_depth) - elif targetLink: - traverse_links(toVisit, ext, depth, targetLink=targetLink) - else: - traverse_links(toVisit, ext, depth) - + return urls -def search_page(html_text, ext, stop=None): - soup = BeautifulSoup(html_text, 'html.parser') - links = get_urls_from_page(soup, extension=ext) - if stop: - traverse_links(links, ext, stop=stop) - else: - traverse_links(links, ext) +def search_page(html, ext, stop_depth=None): + """ + Takes in a pages HTML and searches the links on the page using + BFS. -def add_green(link): - colors = Bcolors() - return '\t' + colors.OKGREEN + link + colors.ENDC + Args: + html (str): HTML with links to search + add_exts (str): additional extension + stop_depth (int): The depth at which to stop + Returns: + links_found (list): links found on page and associated pages + """ + soup = BeautifulSoup(html, 'html.parser') + links = get_urls_from_page(soup, extension=ext) + if stop_depth: + links_found = bfs_urls(links, ext, stop_depth=stop_depth) + else: + links_found = bfs_urls(links, ext) -def add_red(link): - colors = Bcolors() - return '\t' + colors.On_Red + link + colors.ENDC + return links_found def get_links(soup, ext=False, live=False): @@ -96,80 +88,3 @@ def get_links(soup, ext=False, live=False): else: raise(Exception('Method parameter is not of instance BeautifulSoup')) - - -def display_link(link): - """ - Prints the status of a link based on if it can be reached using a GET - request. Link is printed with a color based on status. - Green for a reachable status code and red for not reachable. 
- - Args: - link (str): url to be printed - Returns: - None - """ - resp = get_url_status(link) - if resp != 0: - title = BeautifulSoup(resp.text, 'html.parser').title.string - coloredlink = add_green(link) - print_row(coloredlink, title) - else: - coloredlink = add_red(link) - print_row(coloredlink, "Not found") - - -def execute_tasks(q, task_func, tasks_args=tuple()): - """ - Executes tasks inside of queue using function and arguments passed - inside of threads - - Args: - q (queue.Queue): contains tasks - task_func (function): function to be executed on tasks and args - task_args (tuple): contains arguments for function - Returns: - None - """ - while True: - task = q.get() - if tasks_args: - task_func(task, tasks_args) - else: - task_func(task) - q.task_done() - - -def queue_tasks(tasks, task_func, tasks_args=tuple()): - """ - Starts threads with tasks and queue, then queues tasks and spawned threads - begin to pull tasks off queue to execute - - Args: - tasks (list): lists of values that you'd like to operate on - task_func (function): function that you would like to use - tasks_args (tuple): arguments for function - Returns: - None - """ - q = Queue(len(tasks)*2) - for _ in tasks: - if tasks_args: - if isinstance(tasks_args, tuple): - t = Thread(target=execute_tasks, args=(q, task_func, tasks_args)) - t.daemon = True - t.start() - else: - raise(Exception('Function arguments must be passed in the form of a tuple.')) - else: - t = Thread(target=execute_tasks, args=(q, task_func)) - t.daemon = True - t.start() - - for task in tasks: - q.put(task) - q.join() - - -def print_row(url, description): - print("%-80s %-30s" % (url, description)) diff --git a/modules/net_utils.py b/modules/net_utils.py deleted file mode 100644 index 09e9f43d..00000000 --- a/modules/net_utils.py +++ /dev/null @@ -1,62 +0,0 @@ -import re -import requests - - -def get_url_status(url, headers=False): - """ - Uses head request because it uses less bandwith than get and timeout is - set to 10 seconds and then link is automatically declared as dead. 
- - Args: - link: link to be tested - colors: object containing colors for link - - Return: - something?: either an int or return value of the connection object's - get request if successful & zero is failure - """ - try: - if headers: - resp = requests.get(url, headers=headers) - else: - resp = requests.get(url) - resp.raise_for_status() - return resp - except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError): - return 0 - - -def is_url(url): - pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.([a-z, A-Z]+)(.*)" - regex = re.compile(pattern) - if regex.match(url): - return 1 - return 0 - - -def is_onion_url(url): - pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.onion/(.*)" - regex = re.compile(pattern) - if regex.match(url): - return 1 - return 0 - - -def get_urls_from_page(page, email=False, extension=False): - urls = [] - anchors_on_page = page.find_all('a') - for anchor_tag in anchors_on_page: - url = anchor_tag.get('href') - if extension: - if url and is_url(url) == 1: - urls.append(url) - elif email: - if url and 'mailto' in url: - email_addr = url.split(':') - if len(email_addr) > 1: - urls.append(email_addr[1]) - else: - if url and is_onion_url(url) == 1: - urls.append(url) - - return urls diff --git a/modules/pagereader.py b/modules/pagereader.py index 99e04bc7..51242412 100644 --- a/modules/pagereader.py +++ b/modules/pagereader.py @@ -1,9 +1,44 @@ import sys from bs4 import BeautifulSoup -from modules.net_utils import get_url_status +from modules.utils import get_url_status from modules.bcolors import Bcolors +def display_url(url): + """ + Prints the status of a url based on if it can be reached using a GET + request. url is printed with a color based on status. + Green for a reachable status code and red for not reachable. + + Args: + url (str): url to be printed + Returns: + None + """ + resp = get_url_status(url) + if resp != 0: + title = BeautifulSoup(resp.text, 'html.parser').title.string + coloredurl = add_green(url) + print_row(coloredurl, title) + else: + coloredurl = add_red(url) + print_row(coloredurl, "Not found") + + +def print_row(url, description): + print("%-80s %-30s" % (url, description)) + + +def add_green(link): + colors = Bcolors() + return '\t' + colors.OKGREEN + link + colors.ENDC + + +def add_red(link): + colors = Bcolors() + return '\t' + colors.On_Red + link + colors.ENDC + + def connection_msg(site): yield "Attempting to connect to {site}".format(site=site) @@ -61,5 +96,6 @@ def get_ip(): page = read_first_page('https://check.torproject.org/')[0] pg = page.find('strong') ip_addr = pg.renderContents() - - return b_colors.WARNING + b_colors.BOLD + ip_addr.decode("utf-8") + b_colors.ENDC + COLOR_BEGIN = b_colors.WARNING + b_colors.BOLD + COLOR_END = b_colors.ENDC + return COLOR_BEGIN + ip_addr.decode("utf-8") + COLOR_END diff --git a/modules/utils.py b/modules/utils.py new file mode 100644 index 00000000..1c8ffd55 --- /dev/null +++ b/modules/utils.py @@ -0,0 +1,200 @@ +import re +import requests + +from bs4 import BeautifulSoup +from requests.exceptions import HTTPError, ConnectionError +from queue import Queue +from threading import Thread +from modules.getweblinks import get_urls_from_page + +""" + +ALGORITHM UTILITY FUNCTIONS + +""" + + +def bfs_urls(urls, add_exts, rec_depth=0, stop_depth=None, target_url=None): + """ + Traverses urls passed using Breadth First Search. You can specify stop + depth or specify a target to look for. The rec_depth argument is used + for recursion. 
+ + *NOTE: This function uses a GET request for each url found, this can + be very expensive so avoid if possible try to acquire the urls to + be traversed and use bfs function. + + Args: + urls (list): urls to traverse + add_exts (str): additional extensions to use + rec_depth (int): used for recursion + stop_depth (int): stops traversing at this depth if specified + target_url (str): stops at this url if specified + + Returns: + rec_depth (int): depth stopped at + """ + + if rec_depth == stop_depth: + return rec_depth + + urls_to_visit = list() + for url in urls: + if target_url == url and target_url: + return rec_depth + try: + resp = requests.get(url) + except (HTTPError, ConnectionError): + continue + soup = BeautifulSoup(resp.text, 'html.parser') + page_urls = get_urls_from_page(soup, extension=add_exts) + for url in page_urls: + urls_to_visit.append(url) + rec_depth += 1 + if stop_depth and target_url: + bfs_urls(urls_to_visit, add_exts, rec_depth, stop_depth, target_url) + elif stop_depth: + bfs_urls(urls_to_visit, add_exts, rec_depth, stop_depth=stop_depth) + elif target_url: + bfs_urls(urls_to_visit, add_exts, rec_depth, target_url=target_url) + else: + bfs_urls(urls_to_visit, add_exts, rec_depth=rec_depth) + + +def bfs(nodes, target_node=None, rec_depth=0, stop_depth=None): + """ + Traverses nodes using Breadth First Search. You can specify stop + depth or specify a target to look for. The rec_depth argument is used + for recursion. + + Args: + nodes (list): objects to traverse + target_node (object): object being searched for + rec_depth (int): used for recursion + stop_depth (int): stops traversing at this depth if specified + + Returns: + rec_depth (int): depth stopped at + """ + + if rec_depth == stop_depth: + return rec_depth + + adjacent_nodes = list() + # Checks that nodes is a list or has a Visit method + if not isinstance(nodes, list) and not hasattr(nodes, 'Visit', False): + raise(Exception('nodes must be a list')) + + for node in nodes: + if target_node == node and target_node: + return rec_depth + node.Visit() + adjacent_nodes.append(node) + rec_depth += 1 + if target_node and not stop_depth: + bfs(adjacent_nodes, target_node, rec_depth) + elif not target_node and stop_depth: + bfs(adjacent_nodes, rec_depth=rec_depth, stop_depth=stop_depth) + elif target_node and stop_depth: + bfs(adjacent_nodes, target_node, rec_depth, stop_depth) + else: + bfs(adjacent_nodes, rec_depth) + + +def exec_tasks(q, task_func, tasks_args=tuple()): + """ + Executes tasks inside of queue using function and arguments passed + inside of threads + + Args: + q (queue.Queue): contains tasks + task_func (function): function to be executed on tasks and args + task_args (tuple): contains arguments for function + Returns: + None + """ + while True: + task = q.get() + if tasks_args: + task_func(task, tasks_args) + else: + task_func(task) + q.task_done() + + +def queue_tasks(tasks, task_func, tasks_args=tuple()): + """ + Starts threads with tasks and queue, then queues tasks and spawned + threads begin to pull tasks off queue to execute + + Args: + tasks (list): lists of values that you'd like to operate on + task_func (function): function that you would like to use + tasks_args (tuple): arguments for function + Returns: + None + """ + q = Queue(len(tasks)*2) + for _ in tasks: + if tasks_args: + if isinstance(tasks_args, tuple): + t = Thread(target=exec_tasks, args=(q, task_func, tasks_args)) + t.daemon = True + t.start() + else: + raise(Exception('Arguments must be in the form of a tuple.')) + else: 
+ t = Thread(target=exec_tasks, args=(q, task_func)) + t.daemon = True + t.start() + + for task in tasks: + q.put(task) + q.join() + + +""" + +Networking functions + +""" + + +def get_url_status(url, headers=False): + """ + Uses GET request to check if website exists + + *NOTE: May look into changing this to HEAD requests to improve perf + + Args: + url (str): url to be tested + + Return: + something? (int/Response object): return value of the connection + object's GET request if successful & zero upon failure + """ + try: + if headers: + resp = requests.get(url, headers=headers) + else: + resp = requests.get(url) + resp.raise_for_status() + return resp + except (ConnectionError, HTTPError): + return 0 + + +def is_url(url): + pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.([a-z, A-Z]+)(.*)" + regex = re.compile(pattern) + if regex.match(url): + return 1 + return 0 + + +def is_onion_url(url): + pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.onion/(.*)" + regex = re.compile(pattern) + if regex.match(url): + return 1 + return 0 From 69b237451b68ad5ded84da9e4d8b37f31992f23e Mon Sep 17 00:00:00 2001 From: Akeem King Date: Fri, 14 Sep 2018 07:22:16 -0400 Subject: [PATCH 27/34] Fixing PyLint --- modules/utils.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/modules/utils.py b/modules/utils.py index 1c8ffd55..84b3f34e 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -7,12 +7,7 @@ from threading import Thread from modules.getweblinks import get_urls_from_page -""" - -ALGORITHM UTILITY FUNCTIONS - -""" - +# ALGORITHM UTILITY FUNCTIONS def bfs_urls(urls, add_exts, rec_depth=0, stop_depth=None, target_url=None): """ @@ -153,11 +148,8 @@ def queue_tasks(tasks, task_func, tasks_args=tuple()): q.join() -""" - -Networking functions +# Networking functions -""" def get_url_status(url, headers=False): From 461545527aefc6d0dd3b2cab684c8b5facd1d65c Mon Sep 17 00:00:00 2001 From: Akeem King Date: Fri, 14 Sep 2018 07:59:15 -0400 Subject: [PATCH 28/34] Fixing imports --- modules/getemails.py | 8 ++++---- modules/getweblinks.py | 24 ++++++++++++++++++++---- modules/utils.py | 20 ++------------------ 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/modules/getemails.py b/modules/getemails.py index 0ed30643..e1299ff8 100644 --- a/modules/getemails.py +++ b/modules/getemails.py @@ -1,6 +1,7 @@ -from modules.bcolors import Bcolors -from modules.pagereader import get_urls_from_page +import modules.getweblinks as getweblinks + from bs4 import BeautifulSoup +from modules.bcolors import Bcolors def getMails(soup): @@ -19,8 +20,7 @@ def getMails(soup): b_colors = Bcolors() if isinstance(type(soup), type(BeautifulSoup)): - - emails = get_urls_from_page(soup, email=True) + emails = getweblinks.get_urls_from_page(soup, email=True) # Pretty print output as below print('') diff --git a/modules/getweblinks.py b/modules/getweblinks.py index 4db3057e..57650f17 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -1,7 +1,23 @@ +import re +import modules.utils + from bs4 import BeautifulSoup from modules.bcolors import Bcolors -from modules.utils import is_url, is_onion_url, bfs_urls, queue_tasks, display_link +def is_url(url): + pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.([a-z, A-Z]+)(.*)" + regex = re.compile(pattern) + if regex.match(url): + return 1 + return 0 + + +def is_onion_url(url): + pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.onion/(.*)" + regex = re.compile(pattern) + if regex.match(url): + return 1 + return 0 def 
get_urls_from_page(page_soup, email=False, extension=False): """ @@ -54,9 +70,9 @@ def search_page(html, ext, stop_depth=None): soup = BeautifulSoup(html, 'html.parser') links = get_urls_from_page(soup, extension=ext) if stop_depth: - links_found = bfs_urls(links, ext, stop_depth=stop_depth) + links_found = utils.bfs_urls(links, ext, stop_depth=stop_depth) else: - links_found = bfs_urls(links, ext) + links_found = utils.bfs_urls(links, ext) return links_found @@ -83,7 +99,7 @@ def get_links(soup, ext=False, live=False): print('------------------------------------') if live: - queue_tasks(websites, display_link) + utils.queue_tasks(websites, utils.display_link) return websites else: diff --git a/modules/utils.py b/modules/utils.py index 84b3f34e..d60e96a8 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -1,11 +1,11 @@ import re import requests +import modules.getweblinks from bs4 import BeautifulSoup from requests.exceptions import HTTPError, ConnectionError from queue import Queue from threading import Thread -from modules.getweblinks import get_urls_from_page # ALGORITHM UTILITY FUNCTIONS @@ -42,7 +42,7 @@ def bfs_urls(urls, add_exts, rec_depth=0, stop_depth=None, target_url=None): except (HTTPError, ConnectionError): continue soup = BeautifulSoup(resp.text, 'html.parser') - page_urls = get_urls_from_page(soup, extension=add_exts) + page_urls = getweblinks.get_urls_from_page(soup, extension=add_exts) for url in page_urls: urls_to_visit.append(url) rec_depth += 1 @@ -174,19 +174,3 @@ def get_url_status(url, headers=False): return resp except (ConnectionError, HTTPError): return 0 - - -def is_url(url): - pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.([a-z, A-Z]+)(.*)" - regex = re.compile(pattern) - if regex.match(url): - return 1 - return 0 - - -def is_onion_url(url): - pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.onion/(.*)" - regex = re.compile(pattern) - if regex.match(url): - return 1 - return 0 From 28a3133780a706881d7239b304a1d4c77f385e0d Mon Sep 17 00:00:00 2001 From: Akeem King Date: Fri, 14 Sep 2018 10:06:08 -0400 Subject: [PATCH 29/34] A lot more refactoring --- modules/bcolors.py | 14 ------- modules/colors.py | 51 +++++++++++++++++++++++++ modules/getemails.py | 21 +++++----- modules/getweblinks.py | 76 ++++++++++++++++++++++++------------- modules/pagereader.py | 79 +++++++++++++++++++++----------------- modules/utils.py | 82 ++++++++++++++++++++-------------------- tests/test_getemails.py | 4 +- tests/test_pagereader.py | 2 +- torBot.py | 41 ++++++++++++-------- 9 files changed, 226 insertions(+), 144 deletions(-) delete mode 100644 modules/bcolors.py create mode 100644 modules/colors.py diff --git a/modules/bcolors.py b/modules/bcolors.py deleted file mode 100644 index 78b05842..00000000 --- a/modules/bcolors.py +++ /dev/null @@ -1,14 +0,0 @@ -class Bcolors: - - def __init__(self): - self.HEADER = '\033[95m' - self.OKBLUE = '\033[94m' - self.OKGREEN = '\033[92m' - self.WARNING = '\033[93m' - self.FAIL = '\033[91m' - self.ENDC = '\033[0m' - self.BOLD = '\033[1m' - self.UNDERLINE = '\033[4m' - self.WHITE = '\033[97m' - self.On_Black = '\033[40m' - self.On_Red = '\033[41m' diff --git a/modules/colors.py b/modules/colors.py new file mode 100644 index 00000000..a7f9e5cb --- /dev/null +++ b/modules/colors.py @@ -0,0 +1,51 @@ +""" +Module containing class with colors +""" + +class Colors: + """ + Class that contains colors used for TorBot in terminal and a method + that adds colr to a string + + Attributes: + _colors (dict): A map containing all of the color codes 
needed + """ + def __init__(self): + self._colors = { + 'white': "\033[1;37m", + 'yellow': "\033[1;33m", + 'green': "\033[1;32m", + 'blue': "\033[1;34m", + 'cyan': "\033[1;36m", + 'red': "\033[1;31m", + 'magenta': "\033[1;35m", + 'black': "\033[1;30m", + 'darkwhite': "\033[0;37m", + 'darkyellow': "\033[0;33m", + 'darkgreen': "\033[0;32m", + 'darkblue': "\033[0;34m", + 'darkcyan': "\033[0;36m", + 'darkred': "\033[0;31m", + 'darkmagenta':"\033[0;35m", + 'darkblack': "\033[0;30m", + 'end': "\033[0;0m" + } + + def add(self, string, color): + """ + Method that adds color to a given string + + Args: + string (str): string to add color to + color (str): string of color to add + """ + return self.get(color) + string + self.get('end') + + def get(self, color): + """ + Method that returns the color code of the given color string + + Args: + color (str): color code to be returned + """ + return self._colors[color] diff --git a/modules/getemails.py b/modules/getemails.py index e1299ff8..150c7b3f 100644 --- a/modules/getemails.py +++ b/modules/getemails.py @@ -1,10 +1,14 @@ -import modules.getweblinks as getweblinks - +""" +Module returns emails found on webpage +""" from bs4 import BeautifulSoup -from modules.bcolors import Bcolors +import modules.getweblinks +from modules.colors import Colors + +COLOR = Colors() -def getMails(soup): +def get_mails(soup): """ Searches for tags for links then checks if link contains the substring 'mailto' indicating that it's an email. If it is determined @@ -17,17 +21,16 @@ def getMails(soup): Returns: emails: list of email IDs """ - b_colors = Bcolors() if isinstance(type(soup), type(BeautifulSoup)): - emails = getweblinks.get_urls_from_page(soup, email=True) + emails = modules.getweblinks.get_urls_from_page(soup, email=True) # Pretty print output as below print('') - print(b_colors.OKGREEN+'Mails Found - '+b_colors.ENDC+str(len(emails))) + success_string = 'Mails Found - ' + str(len(emails)) + print(COLOR.add(success_string, 'green')) print('-------------------------------') return emails - else: - raise ValueError('Method parameter is not of instance BeautifulSoup') + raise ValueError('Method parameter is not of instance BeautifulSoup') diff --git a/modules/getweblinks.py b/modules/getweblinks.py index 57650f17..c0c69e5d 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -1,10 +1,26 @@ +""" +Module used to interact with a pages urls +""" import re -import modules.utils from bs4 import BeautifulSoup -from modules.bcolors import Bcolors + +import modules.utils +import modules.pagereader + +from modules.colors import Colors + +COLOR = Colors() def is_url(url): + """ + Returns an integer representing validity of url syntax + + Args: + url (str): url to be verified + Returns + (int): integer representing if url is a valid format + """ pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.([a-z, A-Z]+)(.*)" regex = re.compile(pattern) if regex.match(url): @@ -13,6 +29,14 @@ def is_url(url): def is_onion_url(url): + """ + Returns an integer representing validity of an onion url syntax + + Args: + url (str): url to be verified + Returns + (int): integer representing if url is a valid format + """ pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.onion/(.*)" regex = re.compile(pattern) if regex.match(url): @@ -21,19 +45,19 @@ def is_onion_url(url): def get_urls_from_page(page_soup, email=False, extension=False): """ - Searches for urls on page using the anchor tag and href attribute, - also searchs for emails using 'mailto' if specified. 
+ Searches for urls on page using the anchor tag and href attribute, + also searchs for emails using 'mailto' if specified. - Args: - page (bs4.BeauitulSoup): html soup to search - email (bool): flag whether to collect emails as well - extension (bool): flag whether to use additional extensions + Args: + page (bs4.BeauitulSoup): html soup to search + email (bool): flag whether to collect emails as well + extension (bool): flag whether to use additional extensions - Returns: - urls (list): urls found on page + Returns: + urls (list): urls found on page """ if not isinstance(page_soup, BeautifulSoup): - raise(Exception("First arg must be bs4.BeautifulSoup object")) + raise Exception("First arg must be bs4.BeautifulSoup object") urls = [] anchors_on_page = page_soup.find_all('a') @@ -70,37 +94,35 @@ def search_page(html, ext, stop_depth=None): soup = BeautifulSoup(html, 'html.parser') links = get_urls_from_page(soup, extension=ext) if stop_depth: - links_found = utils.bfs_urls(links, ext, stop_depth=stop_depth) + links_found = modules.utils.bfs_urls(links, ext, stop_depth=stop_depth) else: - links_found = utils.bfs_urls(links, ext) + links_found = modules.utils.bfs_urls(links, ext) return links_found def get_links(soup, ext=False, live=False): """ - Returns list of links listed on the webpage of the soup passed. If live - is set to true then it will also print the status of each of the links - and setting ext to an actual extension such as '.com' will allow those - extensions to be recognized as valid urls and not just '.tor'. + Returns list of links listed on the webpage of the soup passed. If live + is set to true then it will also print the status of each of the links + and setting ext to an actual extension such as '.com' will allow those + extensions to be recognized as valid urls and not just '.tor'. - Args: - soup (bs4.BeautifulSoup): webpage to be searched for links. + Args: + soup (bs4.BeautifulSoup): webpage to be searched for links. 
- Returns: - websites (list(str)): List of websites that were found + Returns: + websites (list(str)): List of websites that were found """ - b_colors = Bcolors() if isinstance(soup, BeautifulSoup): websites = get_urls_from_page(soup, extension=ext) # Pretty print output as below - print(''.join((b_colors.OKGREEN, - 'Websites Found - ', b_colors.ENDC, str(len(websites))))) + success_string = 'Websites Found - ' + str(len(websites)) + print(COLOR.add(success_string, 'green')) print('------------------------------------') if live: - utils.queue_tasks(websites, utils.display_link) + modules.utils.queue_tasks(websites, modules.pagereader.display_url) return websites - else: - raise(Exception('Method parameter is not of instance BeautifulSoup')) + raise Exception('Method parameter is not of instance BeautifulSoup') diff --git a/modules/pagereader.py b/modules/pagereader.py index 51242412..d26d2208 100644 --- a/modules/pagereader.py +++ b/modules/pagereader.py @@ -1,8 +1,13 @@ +""" +This module is used for reading HTML pages using either bs4.BeautifulSoup objects or url strings +""" + import sys from bs4 import BeautifulSoup from modules.utils import get_url_status -from modules.bcolors import Bcolors +from modules.colors import Colors +COLOR = Colors() def display_url(url): """ @@ -18,70 +23,76 @@ def display_url(url): resp = get_url_status(url) if resp != 0: title = BeautifulSoup(resp.text, 'html.parser').title.string - coloredurl = add_green(url) + coloredurl = COLOR.add(url, 'green') print_row(coloredurl, title) else: - coloredurl = add_red(url) + coloredurl = COLOR.add(url, 'red') print_row(coloredurl, "Not found") def print_row(url, description): + """ + Prints row in specified format + """ print("%-80s %-30s" % (url, description)) -def add_green(link): - colors = Bcolors() - return '\t' + colors.OKGREEN + link + colors.ENDC - - -def add_red(link): - colors = Bcolors() - return '\t' + colors.On_Red + link + colors.ENDC - +def connection_msg(url): + """ + Generator used to yield message while waiting for response + """ + yield "Attempting to connect to {url}".format(url=url) -def connection_msg(site): - yield "Attempting to connect to {site}".format(site=site) +def read_page(url): + """ + Attempts to connect to url and returns the HTML from page -def read_first_page(site): + Args: + url (str): url of website to be read + Returns: + page (str): html from page + response (int): indicator of success + """ headers = {'User-Agent': 'XXXX-XXXXX-XXXX'} attempts_left = 3 err = " " while attempts_left: if attempts_left == 3: - response = get_url_status(site, headers) + print(next(connection_msg(url))) + response = get_url_status(url, headers) if response != 0: page = BeautifulSoup(response.text, 'html.parser') return page, response - else: - attempts_left -= 1 - continue + + attempts_left -= 1 + continue if attempts_left == 2: - https_url = 'https://' + site + https_url = 'https://' + url print(next(connection_msg(https_url))) response = get_url_status(https_url, headers) if response != 0: page = BeautifulSoup(response.text, 'html.parser') return page, response - else: - attempts_left -= 1 - continue + + attempts_left -= 1 + continue if attempts_left == 1: - http_url = 'http://' + site + http_url = 'http://' + url print(next(connection_msg(http_url))) response = get_url_status(http_url, headers) if response != 0: page = BeautifulSoup(response.text, 'html.parser') return page, response - else: - attempts_left -= 1 - continue + + attempts_left -= 1 + continue if not attempts_left: msg = 
''.join(("There has been an {err} while attempting to ", - "connect to {site}.")).format(err=err, site=site) + "connect to {url}.")).format(err=err, url=url) sys.exit(msg) @@ -92,10 +103,8 @@ def get_ip(): displays your IP address which we scape and return """ - b_colors = Bcolors() - page = read_first_page('https://check.torproject.org/')[0] - pg = page.find('strong') - ip_addr = pg.renderContents() - COLOR_BEGIN = b_colors.WARNING + b_colors.BOLD - COLOR_END = b_colors.ENDC - return COLOR_BEGIN + ip_addr.decode("utf-8") + COLOR_END + page = read_page('https://check.torproject.org/')[0] + ip_cont = page.find('strong') + ip_addr = ip_cont.renderContents() + ip_string = ip_addr.decode("utf-8") + return COLOR.add(ip_string, 'yellow') diff --git a/modules/utils.py b/modules/utils.py index d60e96a8..c5cee5ec 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -1,11 +1,13 @@ -import re -import requests -import modules.getweblinks - -from bs4 import BeautifulSoup -from requests.exceptions import HTTPError, ConnectionError +""" +Provides essential utilites for the rest of TorBot app +""" from queue import Queue from threading import Thread +from bs4 import BeautifulSoup +from requests.exceptions import HTTPError + +import requests +import modules.getweblinks # ALGORITHM UTILITY FUNCTIONS @@ -42,18 +44,19 @@ def bfs_urls(urls, add_exts, rec_depth=0, stop_depth=None, target_url=None): except (HTTPError, ConnectionError): continue soup = BeautifulSoup(resp.text, 'html.parser') - page_urls = getweblinks.get_urls_from_page(soup, extension=add_exts) - for url in page_urls: - urls_to_visit.append(url) + page_urls = modules.getweblinks.get_urls_from_page(soup, extension=add_exts) + for page_url in page_urls: + urls_to_visit.append(page_url) rec_depth += 1 + if stop_depth and target_url: - bfs_urls(urls_to_visit, add_exts, rec_depth, stop_depth, target_url) - elif stop_depth: - bfs_urls(urls_to_visit, add_exts, rec_depth, stop_depth=stop_depth) - elif target_url: - bfs_urls(urls_to_visit, add_exts, rec_depth, target_url=target_url) - else: - bfs_urls(urls_to_visit, add_exts, rec_depth=rec_depth) + return bfs_urls(urls_to_visit, add_exts, rec_depth, stop_depth, target_url) + if stop_depth: + return bfs_urls(urls_to_visit, add_exts, rec_depth, stop_depth=stop_depth) + if target_url: + return bfs_urls(urls_to_visit, add_exts, rec_depth, target_url=target_url) + + return bfs_urls(urls_to_visit, add_exts, rec_depth=rec_depth) def bfs(nodes, target_node=None, rec_depth=0, stop_depth=None): @@ -78,7 +81,7 @@ def bfs(nodes, target_node=None, rec_depth=0, stop_depth=None): adjacent_nodes = list() # Checks that nodes is a list or has a Visit method if not isinstance(nodes, list) and not hasattr(nodes, 'Visit', False): - raise(Exception('nodes must be a list')) + raise Exception('nodes must be a list') for node in nodes: if target_node == node and target_node: @@ -86,35 +89,36 @@ def bfs(nodes, target_node=None, rec_depth=0, stop_depth=None): node.Visit() adjacent_nodes.append(node) rec_depth += 1 + if target_node and not stop_depth: - bfs(adjacent_nodes, target_node, rec_depth) - elif not target_node and stop_depth: - bfs(adjacent_nodes, rec_depth=rec_depth, stop_depth=stop_depth) - elif target_node and stop_depth: - bfs(adjacent_nodes, target_node, rec_depth, stop_depth) - else: - bfs(adjacent_nodes, rec_depth) + return bfs(adjacent_nodes, target_node, rec_depth) + if not target_node and stop_depth: + return bfs(adjacent_nodes, rec_depth=rec_depth, stop_depth=stop_depth) + if target_node and stop_depth: + 
return bfs(adjacent_nodes, target_node, rec_depth, stop_depth) + return bfs(adjacent_nodes, rec_depth) -def exec_tasks(q, task_func, tasks_args=tuple()): + +def exec_tasks(que, task_func, tasks_args=tuple()): """ Executes tasks inside of queue using function and arguments passed inside of threads Args: - q (queue.Queue): contains tasks + que (queue.Queue): contains tasks task_func (function): function to be executed on tasks and args task_args (tuple): contains arguments for function Returns: None """ while True: - task = q.get() + task = que.get() if tasks_args: task_func(task, tasks_args) else: task_func(task) - q.task_done() + que.task_done() def queue_tasks(tasks, task_func, tasks_args=tuple()): @@ -129,29 +133,27 @@ def queue_tasks(tasks, task_func, tasks_args=tuple()): Returns: None """ - q = Queue(len(tasks)*2) + que = Queue(len(tasks)*2) for _ in tasks: if tasks_args: if isinstance(tasks_args, tuple): - t = Thread(target=exec_tasks, args=(q, task_func, tasks_args)) - t.daemon = True - t.start() + thd = Thread(target=exec_tasks, args=(que, task_func, tasks_args)) + thd.daemon = True + thd.start() else: - raise(Exception('Arguments must be in the form of a tuple.')) + raise Exception('Arguments must be in the form of a tuple.') else: - t = Thread(target=exec_tasks, args=(q, task_func)) - t.daemon = True - t.start() + thd = Thread(target=exec_tasks, args=(que, task_func)) + thd.daemon = True + thd.start() for task in tasks: - q.put(task) - q.join() + que.put(task) + que.join() # Networking functions - - def get_url_status(url, headers=False): """ Uses GET request to check if website exists diff --git a/tests/test_getemails.py b/tests/test_getemails.py index 3306b957..e65a6a38 100644 --- a/tests/test_getemails.py +++ b/tests/test_getemails.py @@ -17,7 +17,7 @@ def test_get_emails_fail(): mock_html = doc.getvalue() mock_soup = BeautifulSoup(mock_html, 'html.parser') - emails = getemails.getMails(mock_soup) + emails = getemails.get_mails(mock_soup) assert emails == [] @@ -36,7 +36,7 @@ def test_get_emails(): mock_html = doc.getvalue() mock_soup = BeautifulSoup(mock_html, 'html.parser') - emails = getemails.getMails(mock_soup) + emails = getemails.get_mails(mock_soup) assert emails == test_emails diff --git a/tests/test_pagereader.py b/tests/test_pagereader.py index 2d38c0f0..0253fdb3 100644 --- a/tests/test_pagereader.py +++ b/tests/test_pagereader.py @@ -33,7 +33,7 @@ def test_read_first_page(): mock_connection.register_uri('GET', test_data[i][0], text=test_data[i][1]) - result = str(pagereader.read_first_page(test_data[i][0])[0]) + result = str(pagereader.read_page(test_data[i][0])[0]) assert result == test_data[i][1] diff --git a/torBot.py b/torBot.py index c7edbc52..42bd73ab 100644 --- a/torBot.py +++ b/torBot.py @@ -1,12 +1,16 @@ +""" +MAIN MODULE +""" import argparse import socket import socks -from modules import (bcolors, getemails, pagereader, getweblinks, updater, +from modules import (colors, getemails, pagereader, getweblinks, updater, info, savefile) # GLOBAL CONSTS LOCALHOST = "127.0.0.1" DEFPORT = 9050 +COLOR = colors.Colors() # TorBot VERSION __VERSION = "1.2" @@ -46,7 +50,7 @@ def getaddrinfo(*args): Last two arguments should be a tuple containing the address and port """ return [(socket.AF_INET, socket.SOCK_STREAM, 6, - '', (args[0], args[1]))] + '', (args[0], args[1]))] socket.getaddrinfo = getaddrinfo @@ -55,11 +59,7 @@ def header(): Prints out header ASCII art """ - b_color = bcolors.Bcolors() - D3DSEC = b_color.FAIL + " D3DSEC " + b_color.WHITE - INS1DE = 
b_color.FAIL + " INS1DE " + b_color.WHITE - - header = r""" + title = r""" __ ____ ____ __ ______ / /_/ __ \/ __ \/ /_ ____/_ __/ / __/ / / / /_/ / __ \/ __ \/ / @@ -71,15 +71,21 @@ def header(): # GitHub : https://github.com/DedsecInside/TorBot # # Help : use -h for help text # ####################################################### - {FAIL} LICENSE: GNU Public License {END}""".format( - D3DSEC=D3DSEC, INS1DE=INS1DE, FAIL=b_color.FAIL, - BOLD=b_color.BOLD, VERSION=__VERSION, END=b_color.ENDC, - On_Black=b_color.On_Black, WHITE=b_color.WHITE - ) - print(header) + {FAIL} LICENSE: GNU Public License {END}""" + + title = title.format( + FAIL=COLOR.get('red'), + VERSION=__VERSION, + END=COLOR.get('end'), + On_Black=COLOR.get('black') + ) + print(title) def get_args(): + """ + Parses user flags passed to TorBot + """ parser = argparse.ArgumentParser() parser.add_argument("-v", "--version", action="store_true", @@ -117,7 +123,10 @@ def get_args(): return parser.parse_args() -def main(conn=False): +def main(): + """ + TorBot's Core + """ args = get_args() connect(args.ip, args.port) link = args.url @@ -136,11 +145,11 @@ def main(conn=False): # additional flag can be set with -u/--url flag if args.url: print("Tor IP Address :", pagereader.get_ip()) - html_content, response = pagereader.read_first_page(link) + html_content, response = pagereader.read_page(link) print("Connection successful.") # -m/--mail if args.mail: - emails = getemails.getMails(html_content) + emails = getemails.get_mails(html_content) print(emails) if args.save: savefile.saveJson('Emails', emails) From 3409666fe9ff0fc4738713fc594317fe8d8cba41 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Fri, 14 Sep 2018 10:09:06 -0400 Subject: [PATCH 30/34] Trying to remove cyclic import error --- modules/colors.py | 1 + modules/getemails.py | 1 + modules/getweblinks.py | 1 + modules/pagereader.py | 1 + modules/utils.py | 1 + 5 files changed, 5 insertions(+) diff --git a/modules/colors.py b/modules/colors.py index a7f9e5cb..4efd3b18 100644 --- a/modules/colors.py +++ b/modules/colors.py @@ -1,3 +1,4 @@ + """ Module containing class with colors """ diff --git a/modules/getemails.py b/modules/getemails.py index 150c7b3f..7e1d5b0b 100644 --- a/modules/getemails.py +++ b/modules/getemails.py @@ -1,3 +1,4 @@ + """ Module returns emails found on webpage """ diff --git a/modules/getweblinks.py b/modules/getweblinks.py index c0c69e5d..9748299c 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -1,3 +1,4 @@ + """ Module used to interact with a pages urls """ diff --git a/modules/pagereader.py b/modules/pagereader.py index d26d2208..fc78ef84 100644 --- a/modules/pagereader.py +++ b/modules/pagereader.py @@ -1,3 +1,4 @@ + """ This module is used for reading HTML pages using either bs4.BeautifulSoup objects or url strings """ diff --git a/modules/utils.py b/modules/utils.py index c5cee5ec..eb3041e5 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -1,3 +1,4 @@ + """ Provides essential utilites for the rest of TorBot app """ From 9e31c338d8119b7b92bbb6d9c30627906c7c4db4 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Fri, 14 Sep 2018 10:25:47 -0400 Subject: [PATCH 31/34] Updating requirements --- requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0c415d7b..47377e35 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,4 @@ PySocks==1.6.7 termcolor==1.1.0 requests==2.18.4 requests_mock==1.4.0 -tldextract==2.2.0 yattag==1.10.0 -python-dotenv==0.9.1 From 
f32fe1272cc7630d3c5d5ab7f1e3108b9f454d96 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Fri, 14 Sep 2018 10:27:34 -0400 Subject: [PATCH 32/34] Updating README --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index e1c8601a..f3f35a83 100755 --- a/README.md +++ b/README.md @@ -72,8 +72,6 @@ Contributor name will be updated to the below list. :D - PySocks - termcolor - requests -- python-dotenv -- tldextract - requests_mock - yattag From b2f9967d354b0218ca6d41019d498106033b00b6 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Fri, 14 Sep 2018 10:29:27 -0400 Subject: [PATCH 33/34] Adding pyinstaller to requirements and to install script --- install.sh | 2 ++ requirements.txt | 1 + 2 files changed, 3 insertions(+) diff --git a/install.sh b/install.sh index cfaaaaf1..80ea9db6 100755 --- a/install.sh +++ b/install.sh @@ -8,6 +8,8 @@ go get golang.org/x/net/html mkdir -p tmp_build mkdir -p tmp_dist +pip install pyinstaller + # Creates executable file and sends dependences to the recently created directories pyinstaller --onefile --workpath ./tmp_build --distpath ./tmp_dist torBot.py diff --git a/requirements.txt b/requirements.txt index 47377e35..a79c45d0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ termcolor==1.1.0 requests==2.18.4 requests_mock==1.4.0 yattag==1.10.0 +pyinstaller==3.4.0 From 551192f6278952de414d257790d6cc045f87e9c5 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Fri, 14 Sep 2018 10:30:01 -0400 Subject: [PATCH 34/34] Updating README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index f3f35a83..f3a0cbb3 100755 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ Contributor name will be updated to the below list. :D ### Python Dependencies - beautifulsoup4 +- pyinstaller - PySocks - termcolor - requests
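
For reference, a minimal self-contained sketch of the worker-queue pattern these patches converge on (exec_tasks/queue_tasks in modules/utils.py): a bounded Queue is filled with tasks, one daemon worker thread per task pulls from it, and queue_tasks blocks on join() until every task is marked done. The task function and the sample links below are illustrative assumptions, not code from the repository.

from queue import Queue
from threading import Thread


def exec_tasks(que, task_func, tasks_args=tuple()):
    # Worker loop: pull a task, run the supplied function on it, mark it done.
    while True:
        task = que.get()
        if tasks_args:
            task_func(task, tasks_args)
        else:
            task_func(task)
        que.task_done()


def queue_tasks(tasks, task_func, tasks_args=tuple()):
    # One daemon worker per task; enqueue everything, then wait for completion.
    que = Queue(len(tasks) * 2)
    for _ in tasks:
        thd = Thread(target=exec_tasks, args=(que, task_func, tasks_args))
        thd.daemon = True
        thd.start()

    for task in tasks:
        que.put(task)
    que.join()


def check_link(link):
    # Stand-in for display_url: a real task function would fetch the page and
    # print its title or "Not found".
    print("checking", link)


if __name__ == '__main__':
    queue_tasks(["http://example.onion/", "http://example.com/"], check_link)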
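
The URL validators that end up in modules/getweblinks.py are plain regular expressions, so their behaviour can be checked in isolation. A small demonstration of the same patterns follows; the sample addresses are made up for illustration and are not taken from the repository.

import re

ONION_PATTERN = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.onion/(.*)"
URL_PATTERN = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.([a-z, A-Z]+)(.*)"


def matches(pattern, url):
    # Returns 1/0 in the same style as is_url/is_onion_url in the patches.
    return 1 if re.compile(pattern).match(url) else 0


print(matches(ONION_PATTERN, "http://examplev3address.onion/"))  # 1
print(matches(ONION_PATTERN, "https://www.torproject.org/"))     # 0
print(matches(URL_PATTERN, "https://www.torproject.org/"))       # 1
print(matches(URL_PATTERN, "not a url"))                         # 0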