diff --git a/src/modules/analyzer.py b/src/modules/analyzer.py
index 4d1828e2..5dd4caa9 100644
--- a/src/modules/analyzer.py
+++ b/src/modules/analyzer.py
@@ -3,14 +3,12 @@
 """
 from requests.exceptions import HTTPError
 
-from ete3 import Tree, TreeStyle, TextFace, add_face_to_node
-from .link import LinkNode
-from .utils import multi_thread
+from ete3 import faces, Tree, TreeStyle, TextFace, add_face_to_node
 
 
 def default_layout(node):
     node_style = TextFace(node.name, tight_text=True)
-    add_face_to_node(node_style, node, column=0, position='branch-bottom')
+    faces.add_face_to_node(node_style, node, column=0, position='branch-bottom')
 
 
 default_style = TreeStyle()
@@ -51,6 +49,7 @@ def save(self, file_name, tree_style=default_style):
         file_name (str): Name of file being saved to
         tree_style (TreeStyle): Styling of downloaded tree
         """
+        self._tree.layout_fn = default_layout
         self._tree.render(file_name, tree_style)
 
     def show(self, tree_style=default_style):
@@ -60,15 +59,16 @@
         Args:
             tree_style (TreeStyle): Styling of downloaded tree
         """
-        self._tree.show(tree_style)
+        self._tree.layout_fn = default_layout
+        self._tree.show(tree_style=tree_style)
 
 
-def build_tree(link, stop=1, rec=0):
+def build_tree(node, stop=1, rec=0):
     """
     Builds link tree by traversing through children nodes.
 
     Args:
-        link (LinkNode): root node of tree
+        node (LinkNode): root node of tree
         stop (int): depth of tree
         rec (int): level of recursion
 
@@ -76,23 +76,18 @@
         tree (ete3.Tree): Built tree.
     """
-    tree = Tree(name=link.name)
+    print('Adding node for: ', node.get_name())
+    tree = Tree(name=node.get_name())
 
     if rec == stop:
         return tree
     else:
         rec += 1
 
-    for child in link.links:
-        try:
-            node = LinkNode(child)
-        except Exception as error:
-            print(f"Failed to create LinkNode for link: {child}.")
-            print(f"Error: {error}")
-            continue
-        if node.links:
-            tree.add_child(build_tree(node, stop, rec))
+    for child in node.get_children():
+        if child.get_children():
+            tree.add_child(build_tree(child, stop, rec))
         else:
-            tree.add_child(Tree(name=node.name))
+            tree.add_child(Tree(name=child.get_name()))
 
     return tree
diff --git a/src/modules/collect_data.py b/src/modules/collect_data.py
index 6866b5c3..b4e04284 100644
--- a/src/modules/collect_data.py
+++ b/src/modules/collect_data.py
@@ -8,12 +8,12 @@
 from bs4 import BeautifulSoup
 from dotenv import load_dotenv
 
-from .link import LinkNode
-from .utils import multi_thread
 from .utils import find_file
 from threadsafe.safe_csv import SafeDictWriter
 from progress.bar import Bar
+from .validators import validate_link
+
 
 dev_file = find_file("torbot_dev.env", "../")
 if not dev_file:
@@ -31,7 +31,7 @@ def parse_links(html):
     """
     soup = BeautifulSoup(html, 'html.parser')
     tags = soup.find_all('a')
-    return [tag['href'] for tag in tags if LinkNode.valid_link(tag['href'])]
+    return [tag['href'] for tag in tags if validate_link(tag['href'])]
 
 
 def parse_meta_tags(soup):
diff --git a/src/modules/info.py b/src/modules/info.py
index 4f3d96b8..304701e1 100644
--- a/src/modules/info.py
+++ b/src/modules/info.py
@@ -9,9 +9,7 @@
 from re import search, findall
 from requests.exceptions import HTTPError
 import requests
-from requests import get
 import re
-from .link_io import LinkIO
 
 
 def execute_all(link, *, display_status=False):
@@ -40,9 +38,8 @@ def execute_all(link, *, display_status=False):
     bad_scripts = set()  # unclean javascript file urls
     datasets = [files, intel, robots, custom, failed, scripts, external, fuzzable, endpoints, keys]
     dataset_names = ['files', 'intel', 'robots', 'custom', 'failed', 'scripts', 'external', 'fuzzable', 'endpoints', 'keys']
-    page, response = LinkIO.read(link, response=True, show_msg=display_status)
-    response = get(link, verify=False).text
-    soup = BeautifulSoup(page, 'html.parser')
+    response = requests.get(link)
+    soup = BeautifulSoup(response.text, 'html.parser')
     validation_functions = [get_robots_txt, get_dot_git, get_dot_svn, get_dot_git, get_intel, get_bitcoin_address]
     for validate_func in validation_functions:
         try:
diff --git a/src/modules/link.py b/src/modules/link.py
index ab836271..c274f316 100644
--- a/src/modules/link.py
+++ b/src/modules/link.py
@@ -2,14 +2,12 @@
 """
 This module is used to create a LinkNode that can be consumued by a LinkTree
 and contains useful Link methods.
 """
-import requests
-import requests.exceptions
-import validators
 import re
+import requests
 from bs4 import BeautifulSoup
-from .utils import multi_thread
+
 from .color import color
-import sys
+from .validators import validate_email, validate_link
 
 
 def get_emails(node):
     """Finds all emails associated with node
@@ -21,29 +19,21 @@
         emails (list): List of emails.
     """
     emails = []
-    response = node.response.text
-    mails = re.findall(r'[\w\.-]+@[\w\.-]+', response)
+    mails = re.findall(r'[\w\.-]+@[\w\.-]+', node._node.get_text())
     for email in mails:
-        if LinkNode.valid_email(email):
+        if validate_email(email):
             emails.append(email)
     return emails
 
 
-def get_links(node):
-    """Finds all links associated with node
-
-    Args:
-        node (LinkNode): Node used to get links from.
-
-    Returns:
-        links (list): List of links.
-    """
-    links = []
-    for child in node.children:
-        link = child.get('href')
-        if link and LinkNode.valid_link(link):
-            links.append(link)
-    return links
+def get_children(node):
+    children = []
+    for anchor_tag in node._node.find_all('a'):
+        link = anchor_tag.get('href')
+        if validate_link(link):
+            child_node = LinkNode(link)
+            children.append(child_node)
+    return children
 
 
 def get_json_data(node):
@@ -56,13 +46,9 @@
         titles (list): List of Titles.
     """
     json = []
-    for child in node.children:
-        link = child.get('href')
-        title = "Not Available"
-        if link and LinkNode.valid_link(link):
-            node = LinkNode(link)
-            title = node.name
-        json.append({"link":link,"title":title})
+    for anchor_tag in node._node.find_all('a'):
+        link = anchor_tag.get('href')
+        json.append({"link":link,"tag":anchor_tag})
     return json
 
 
@@ -73,26 +59,14 @@ def get_images(node):
         node (LinkNode): Node used to get links from.
 
     Returns:
-        links (list): List of links.
+        imageEls (list): A collection of img HTML elements
     """
-    links = []
-    for child in node.children:
-        link = child.get('src')
-        if link and LinkNode.valid_link(link):
-            links.append(link)
-    return links
-
-
-def get_metadata(node):
-    """Collect response headers.
-
-    Args:
-        node (LinkNode): Node used to get metadata from.
-
-    Returns:
-        metadata (dict): Dictionary with metadata.
-    """
-    return node.response.headers
+    imageEls = []
+    for anchor_tag in node._node.find_all('a'):
+        image = anchor_tag.get('src')
+        if validate_link(image):
+            imageEls.append(image)
+    return imageEls
 
 
 class LinkNode:
@@ -105,113 +79,59 @@ def __init__(self, link):
             link (str): URL used to initialise node.
""" # If link has invalid form, throw an error - if not self.valid_link(link): + if not validate_link(link): raise ValueError("Invalid link format.") - self._children = [] - self._emails = [] - self._links = [] - self._images = [] - self._json_data = [] - self._metadata = {} + self._loaded = False + self._name = link + self._link = link - # Attempts to connect to link, throws an error if link is unreachable + def load_data(self): + response = requests.get(self._link) + status = str(response.status_code) try: - self.response = requests.get(link) - except (requests.exceptions.ChunkedEncodingError, - requests.exceptions.HTTPError, - requests.exceptions.ConnectionError, - ConnectionError) as err: - print("Error connecting to Tor:", err) - sys.exit(1) - - self._node = BeautifulSoup(self.response.text, 'html.parser') - self.uri = link - if not self._node.title: - self.name = "TITLE NOT FOUND" - self.status = color(link, 'yellow') - else: - self.name = self._node.title.string - self.status = color(link, 'green') - - @property - def emails(self): - """ - Getter for node emails - """ - if not self._emails: + response.raise_for_status() + self._metadata = response.headers + self._node = BeautifulSoup(response.text, 'html.parser') + self.status = color(status, 'green') + self._name = self._node.title.string self._emails = get_emails(self) - return self._emails - - @property - def json_data(self): - """ - Getter for node titles - """ - if not self._json_data: + self._children = get_children(self) + self._emails = get_emails(self) + self._images = get_images(self) self._json_data = get_json_data(self) - return self._json_data + except Exception: + self._node = None + self.status = color(status, 'yellow') + self._name = 'TITLE NOT FOUND' + finally: + self._loaded = True - @property - def links(self): - """ - Getter for node links - """ - if not self._links: - self._links = get_links(self) - return self._links - @property - def images(self): - """ - Getter for node images - """ - if not self._images: - self._images = get_images(self) - return self._images + def get_link(self): + return self._link - @property - def children(self): - """ - Getter for node children - """ - if not self._children: - self._children = self._node.find_all('a') + def get_name(self): + if not self._loaded: + self.load_data() + return self._name + + def get_children(self): + if not self._loaded: + self.load_data() return self._children - @property - def metadata(self): - """ - Getter for node metadata - """ - if not self._metadata: - self._metadata = get_metadata(self) + def get_emails(self): + if not self._loaded: + self.load_data() + return self._emails + + def get_json(self): + if not self._loaded: + self.load_data() + return self._json_data + + def get_meatadta(self): + if not self._loaded: + self.load_data() return self._metadata - - @staticmethod - def valid_email(email): - """Static method used to validate emails. - - Args: - email (str): Email string to be validated. - - Returns: - (bool): True if email string is valid, else false. - """ - if validators.email(email): - return True - return False - - @staticmethod - def valid_link(link): - """Static method used to validate links - - Args: - link (str): URL string to be validated. - - Returns: - (bool): True if URL string is valid, else false. 
- """ - if validators.url(link): - return True - return False diff --git a/src/modules/link_io.py b/src/modules/link_io.py index ea09f00f..5370c782 100644 --- a/src/modules/link_io.py +++ b/src/modules/link_io.py @@ -2,101 +2,50 @@ This module is used for reading HTML pages using either bs4.BeautifulSoup objects or url strings """ -import requests.exceptions +import requests from bs4 import BeautifulSoup -from .link import LinkNode -from .utils import multi_thread from .color import color -class LinkIO: +def print_tor_ip_address(): """ - Class to interact with and interrogate links. + https://check.torproject.org/ tells you if you are using tor and it + displays your IP address which we scape and display """ - @staticmethod - def display_children(root): - """ - Static method to display status of child nodes. + print('Attempting to connect to https://check.torproject.org/') + response = requests.get('https://check.torproject.org/') + page = BeautifulSoup(response.text, 'html.parser') + ip_cont = page.find('strong') + ip_addr = ip_cont.renderContents() + ip_string = color(ip_addr.decode("utf-8"), 'yellow') + print(f'Tor IP Address: {ip_string}') - Args: - root (LinkNode): root of children to be displayed. - """ - sucess_msg = color(f'Links Found - {len(root.links)}', 'green') - print(sucess_msg + '\n' + '---------------------------------') - multi_thread(root.links, LinkIO.display) - @staticmethod - def read(link, *, response=False, show_msg=False, headers=None, schemes=None): - """ - Attempts to retrieve HTML from link. - - Args: - link (str): Link to read. - response (bool): Determines if response is returned. - show_msg (bool): Determines if message is displayed for connection. - headers (dict): Header for request, defaults to None. - schemes (list): Different schemes to attempt to use. - - Returns: - str: HTML from page. - requests.Response (optional): Response returned from requests. - """ - headers = {'User-Agent': 'XXXX-XXXXX-XXXX'} if not headers else headers - # Attempts to connect directly to site if no scheme is passed - if not schemes: - if show_msg: - print(f'Attempting to connect to {link}') - if LinkNode.valid_link(link): - node = LinkNode(link) - if response: - return node.response.text, node.response - return node.response.text - - schemes = ['https://', 'http://'] if not schemes else schemes - # Attempt to use different schemes until one is successful - for scheme in schemes: - temp_url = scheme + link - if show_msg: - print(f'Attempting to connect to {link}') - if LinkNode.valid_link(temp_url): - node = LinkNode(temp_url) - if response: - return node.response.text, node.response - return node.response.text - raise ConnectionError - - @staticmethod - def display(link): - """ - Prints the status of a link based on it's connection status. - - Args: - link (str): Link to return status of. 
- """ - if LinkNode.valid_link(link): - try: - node = LinkNode(link) - title = node.name - link_status = node.status - except (requests.exceptions.HTTPError, - requests.exceptions.ConnectionError, - ConnectionError): - title = 'Not Found' - link_status = color(link, 'red') +def display_children(node): + """ + Static method to display status of child nodes + Args: + node (LinkNode): root of children to be displayed + """ + children = node.get_children() + sucess_msg = color(f'Links Found - {len(children)}', 'green') + print(sucess_msg + '\n' + '---------------------------------') + for child in children: + display(child) - status_msg = "%-80s %-30s" % (link_status, title) - print(status_msg) - @staticmethod - def display_ip(): - """ - Uses https://check.torproject.org/ to determine if you - are using Tor which is then scraped and displayed. - """ - page = LinkIO.read('https://check.torproject.org/', show_msg=True) - page = BeautifulSoup(page, 'html.parser') - ip_cont = page.find('strong') - ip_addr = ip_cont.renderContents() - ip_string = color(ip_addr.decode("utf-8"), 'yellow') - print(f'Tor IP Address: {ip_string}') +def display(node): + """ + Prints the status of a link based on it's connection status + Args: + link (str): link to get status of + """ + try: + title = node.get_name() + status = node.status + except Exception: + title = "NOT FOUND" + status = color('Unable to reach destination.', 'red') + status_msg = "%-30s %-20s %-70s" % (title, status, node.get_link()) + print(status_msg) \ No newline at end of file diff --git a/src/modules/validators.py b/src/modules/validators.py new file mode 100644 index 00000000..a8fd4dba --- /dev/null +++ b/src/modules/validators.py @@ -0,0 +1,11 @@ +import validators + +def validate_email(email): + if not isinstance(email, str): + return False + return validators.email(email) + +def validate_link(link): + if not isinstance(link, str): + return False + return validators.url(link) \ No newline at end of file diff --git a/src/torBot.py b/src/torBot.py index 516e154f..826395e5 100644 --- a/src/torBot.py +++ b/src/torBot.py @@ -9,7 +9,7 @@ from modules.analyzer import LinkTree from modules.color import color -from modules.link_io import LinkIO +from modules.link_io import print_tor_ip_address, display_children from modules.link import LinkNode from modules.updater import updateTor from modules.savefile import saveJson @@ -36,15 +36,7 @@ def connect(address, port, no_socks): """ if no_socks: return - if address and port: - socks.set_default_proxy(socks.PROXY_TYPE_SOCKS5, address, int(port)) - elif address: - socks.set_default_proxy(socks.PROXY_TYPE_SOCKS5, address, DEFPORT) - elif port: - socks.set_default_proxy(socks.PROXY_TYPE_SOCKS5, LOCALHOST, int(port)) - else: - socks.set_default_proxy(socks.PROXY_TYPE_SOCKS5, LOCALHOST, DEFPORT) - + socks.set_default_proxy(socks.PROXY_TYPE_SOCKS5, address, port) socket.socket = socks.socksocket # Monkey Patch our socket to tor socket def getaddrinfo(*args): @@ -103,8 +95,8 @@ def get_args(): help="Update TorBot to the latest stable version") parser.add_argument("-q", "--quiet", action="store_true") parser.add_argument("-u", "--url", help="Specifiy a website link to crawl") - parser.add_argument("--ip", help="Change default ip of tor") - parser.add_argument("-p", "--port", help="Change default port of tor") + parser.add_argument("--ip", help="Change default ip of tor", default=LOCALHOST) + parser.add_argument("-p", "--port", help="Change default port of tor", default=DEFPORT) parser.add_argument("-s", "--save", 
action="store_true", help="Save results in a file") parser.add_argument("-m", "--mail", action="store_true", @@ -154,19 +146,17 @@ def main(): # If url flag is set then check for accompanying flag set. Only one # additional flag can be set with -u/--url flag if args.url: - try: - node = LinkNode(args.url) - except (ValueError, HTTPError, ConnectionError) as err: - raise err - LinkIO.display_ip() + node = LinkNode(args.url) + print_tor_ip_address() # -m/--mail if args.mail: - print(node.emails) + emails = node.get_emails() + print(emails) if args.save: - saveJson('Emails', node.emails) + saveJson('Emails', emails) # -i/--info if args.info: - execute_all(node.uri) + execute_all(node.get_link()) if args.save: print('Nothing to save.\n') if args.visualize: @@ -180,9 +170,9 @@ def main(): file_name = str(input("File Name (.pdf/.png/.svg): ")) tree.save(file_name) else: - LinkIO.display_children(node) + display_children(node) if args.save: - print(node.json_data) + print(node.get_json()) #saveJson("Links", node.links) else: print("usage: See torBot.py -h for possible arguments.") @@ -222,8 +212,7 @@ def test(args): print("Link Node",LinkNode(url)) except (ValueError, HTTPError, ConnectionError) as err: raise err - LinkIO.display_ip() - print("display_ip()",LinkIO.display_ip()) + print("display_ip()",print_tor_ip_address()) # -m/--mail if args['mail']==True: print(node.emails) @@ -245,7 +234,7 @@ def test(args): file_name = str(input("File Name (.pdf/.png/.svg): ")) tree.save(file_name) else: - LinkIO.display_children(node) + display_children(node) if args['save']==True: saveJson("Links", node.links) else: