Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify LinkNode and add new display #202

Merged
merged 3 commits into from
Mar 25, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 9 additions & 13 deletions src/modules/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,36 +63,32 @@ def show(self, tree_style=default_style):
self._tree.show(tree_style)


def build_tree(link, stop=1, rec=0):
def build_tree(node, stop=1, rec=0):
"""
Builds link tree by traversing through children nodes.

Args:
link (LinkNode): root node of tree
node (LinkNode): root node of tree
stop (int): depth of tree
rec (int): level of recursion

Returns:
tree (ete3.Tree): Built tree.
"""

tree = Tree(name=link.name)
tree = Tree(name=node.get_name())

if rec == stop:
return tree
else:
rec += 1

for child in link.links:
try:
node = LinkNode(child)
except Exception as error:
print(f"Failed to create LinkNode for link: {child}.")
print(f"Error: {error}")
continue
if node.links:
tree.add_child(build_tree(node, stop, rec))
node.load_data()
for child in node.get_children():
child.load_data()
if child.get_children():
tree.add_child(build_tree(child, stop, rec))
else:
tree.add_child(Tree(name=node.name))
tree.add_child(Tree(name=child.get_name()))

return tree
5 changes: 3 additions & 2 deletions src/modules/collect_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@

from bs4 import BeautifulSoup
from dotenv import load_dotenv
from .link import LinkNode
from .utils import multi_thread
from .utils import find_file
from threadsafe.safe_csv import SafeDictWriter
from progress.bar import Bar

from .validators import validate_link


dev_file = find_file("torbot_dev.env", "../")
if not dev_file:
Expand All @@ -31,7 +32,7 @@ def parse_links(html):
"""
soup = BeautifulSoup(html, 'html.parser')
tags = soup.find_all('a')
return [tag['href'] for tag in tags if LinkNode.valid_link(tag['href'])]
return [tag['href'] for tag in tags if validate_link(tag['href'])]


def parse_meta_tags(soup):
Expand Down
7 changes: 2 additions & 5 deletions src/modules/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@
from re import search, findall
from requests.exceptions import HTTPError
import requests
from requests import get
import re
from .link_io import LinkIO


def execute_all(link, *, display_status=False):
Expand Down Expand Up @@ -40,9 +38,8 @@ def execute_all(link, *, display_status=False):
bad_scripts = set() # unclean javascript file urls
datasets = [files, intel, robots, custom, failed, scripts, external, fuzzable, endpoints, keys]
dataset_names = ['files', 'intel', 'robots', 'custom', 'failed', 'scripts', 'external', 'fuzzable', 'endpoints', 'keys']
page, response = LinkIO.read(link, response=True, show_msg=display_status)
response = get(link, verify=False).text
soup = BeautifulSoup(page, 'html.parser')
response = requests.get(link)
soup = BeautifulSoup(response.text, 'html.parser')
validation_functions = [get_robots_txt, get_dot_git, get_dot_svn, get_dot_git, get_intel, get_bitcoin_address]
for validate_func in validation_functions:
try:
Expand Down
222 changes: 73 additions & 149 deletions src/modules/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,12 @@
This module is used to create a LinkNode that can be consumed by a LinkTree
and contains useful Link methods.
"""
import requests
import requests.exceptions
import validators
import re
import requests
from bs4 import BeautifulSoup
from .utils import multi_thread

from .color import color
import sys
from .validators import validate_email, validate_link

def get_emails(node):
"""Finds all emails associated with node
Expand All @@ -21,29 +19,21 @@ def get_emails(node):
emails (list): List of emails.
"""
emails = []
response = node.response.text
mails = re.findall(r'[\w\.-]+@[\w\.-]+', response)
mails = re.findall(r'[\w\.-]+@[\w\.-]+', node._node.get_text())
for email in mails:
if LinkNode.valid_email(email):
if validate_email(email):
emails.append(email)
return emails


def get_links(node):
    """Collect the valid link targets found among the node's children.

    Args:
        node (LinkNode): Node whose ``children`` are scanned.

    Returns:
        links (list): Every non-empty ``href`` accepted by
            ``LinkNode.valid_link``, in the order encountered.
    """
    hrefs = (anchor.get('href') for anchor in node.children)
    return [href for href in hrefs if href and LinkNode.valid_link(href)]
def get_children(node):
    """Build a child LinkNode for every valid anchor link on the node's page.

    Args:
        node (LinkNode): Node whose parsed page (``node._node``) is scanned.

    Returns:
        children (list): One LinkNode per anchor ``href`` accepted by
            ``validate_link``; empty when the node has no parsed page.
    """
    page = node._node
    # A node whose page could not be fetched carries no parsed document
    # (``_node`` is None), so there is nothing to scan.
    if page is None:
        return []

    children = []
    for anchor_tag in page.find_all('a'):
        link = anchor_tag.get('href')
        # Anchors without an href yield None/empty; skip them before validating.
        if link and validate_link(link):
            children.append(LinkNode(link))
    return children


def get_json_data(node):
Expand All @@ -56,12 +46,12 @@ def get_json_data(node):
titles (list): List of Titles.
"""
json = []
for child in node.children:
link = child.get('href')
for anchor_tag in node._node.find_all('a'):
link = anchor_tag.get('href')
title = "Not Available"
if link and LinkNode.valid_link(link):
if validate_link(link):
node = LinkNode(link)
title = node.name
title = node.get_name()
json.append({"link":link,"title":title})
return json

Expand All @@ -73,26 +63,14 @@ def get_images(node):
node (LinkNode): Node used to get links from.

Returns:
links (list): List of links.
imageEls (list): A collection of img HTML elements
"""
links = []
for child in node.children:
link = child.get('src')
if link and LinkNode.valid_link(link):
links.append(link)
return links


def get_metadata(node):
    """Return the headers of the node's HTTP response.

    Args:
        node (LinkNode): Node whose ``response`` attribute is read.

    Returns:
        metadata (dict): The ``headers`` attribute of ``node.response``.
    """
    response = node.response
    return response.headers
imageEls = []
for anchor_tag in node._node.find_all('a'):
image = anchor_tag.get('src')
if validate_link(image):
imageEls.append(image)
return imageEls


class LinkNode:
Expand All @@ -105,113 +83,59 @@ def __init__(self, link):
link (str): URL used to initialise node.
"""
# If link has invalid form, throw an error
if not self.valid_link(link):
if not validate_link(link):
raise ValueError("Invalid link format.")

self._children = []
self._emails = []
self._links = []
self._images = []
self._json_data = []
self._metadata = {}

# Attempts to connect to link, throws an error if link is unreachable
try:
self.response = requests.get(link)
except (requests.exceptions.ChunkedEncodingError,
requests.exceptions.HTTPError,
requests.exceptions.ConnectionError,
ConnectionError) as err:
print("Error connecting to Tor:", err)
sys.exit(1)

self._node = BeautifulSoup(self.response.text, 'html.parser')
self.uri = link
if not self._node.title:
self.name = "TITLE NOT FOUND"
self.status = color(link, 'yellow')
else:
self.name = self._node.title.string
self.status = color(link, 'green')

@property
def emails(self):
    """
    Emails for this node, looked up via get_emails while the cache is empty.
    """
    cached = self._emails
    if cached:
        return cached
    self._emails = get_emails(self)
    return self._emails

@property
def json_data(self):
    """
    Link/title records for this node, built via get_json_data while the
    cache is empty.
    """
    self._json_data = self._json_data or get_json_data(self)
    return self._json_data

@property
def links(self):
    """
    Links for this node, looked up via get_links while the cache is empty.
    """
    current = self._links
    if current:
        return current
    self._links = get_links(self)
    return self._links
self._loaded = False
self._name = link
self._link = link

@property
def images(self):
    """
    Images for this node, looked up via get_images while the cache is empty.
    """
    self._images = self._images or get_images(self)
    return self._images
def load_data(self):
if self._loaded:
return

@property
def children(self):
"""
Getter for node children
"""
if not self._children:
self._children = self._node.find_all('a')
response = requests.get(self._link)
status = str(response.status_code)
try:
response.raise_for_status()
self._metadata = response.headers
self._node = BeautifulSoup(response.text, 'html.parser')
self.status = color(status, 'green')
self._name = self._node.title.string
except Exception:
self._node = None
self.status = color(status, 'yellow')
self._name = 'TITLE NOT FOUND'

self._emails = get_emails(self)
self._children = get_children(self)
self._emails = get_emails(self)
self._images = get_images(self)
self._json_data = get_json_data(self)
self._loaded = True

def get_link(self):
    """Return the URL stored on this node (``self._link``)."""
    link = self._link
    return link

def get_name(self):
    """Return the display name stored on this node (``self._name``)."""
    name = self._name
    return name

def get_children(self):
    """Return the child nodes collected for this node.

    Returns:
        list: The value of ``self._children``.

    Raises:
        RuntimeError: If the node's data has not been loaded yet
            (``self._loaded`` is falsy).
    """
    # RuntimeError is a subclass of Exception, so existing
    # ``except Exception`` handlers keep working.
    if not self._loaded:
        raise RuntimeError("node is not loaded")
    return self._children

@property
def metadata(self):
"""
Getter for node metadata
"""
if not self._metadata:
self._metadata = get_metadata(self)
def get_emails(self):
    """Return the emails collected for this node.

    Returns:
        list: The value of ``self._emails``.

    Raises:
        RuntimeError: If the node's data has not been loaded yet
            (``self._loaded`` is falsy).
    """
    # RuntimeError is a subclass of Exception, so existing
    # ``except Exception`` handlers keep working.
    if not self._loaded:
        raise RuntimeError("node is not loaded")
    return self._emails

def get_json(self):
    """Return the link/title records collected for this node.

    Returns:
        list: The value of ``self._json_data``.

    Raises:
        RuntimeError: If the node's data has not been loaded yet
            (``self._loaded`` is falsy).
    """
    # RuntimeError is a subclass of Exception, so existing
    # ``except Exception`` handlers keep working.
    if not self._loaded:
        raise RuntimeError("node is not loaded")
    return self._json_data

def get_metadata(self):
    """Return the metadata stored on this node.

    Returns:
        dict: The value of ``self._metadata``.

    Raises:
        RuntimeError: If the node's data has not been loaded yet
            (``self._loaded`` is falsy).
    """
    # RuntimeError is a subclass of Exception, so existing
    # ``except Exception`` handlers keep working.
    if not self._loaded:
        raise RuntimeError("node is not loaded")
    return self._metadata


# Backward-compatible alias: the accessor was first published under this
# misspelled name, so existing callers must keep working.
get_meatadta = get_metadata

@staticmethod
def valid_email(email):
    """Check an email string with ``validators.email``.

    Args:
        email (str): Email string to be validated.

    Returns:
        (bool): True when ``validators.email`` gives a truthy result,
            otherwise False.
    """
    return True if validators.email(email) else False

@staticmethod
def valid_link(link):
    """Check a URL string with ``validators.url``.

    Args:
        link (str): URL string to be validated.

    Returns:
        (bool): True when ``validators.url`` gives a truthy result,
            otherwise False.
    """
    return True if validators.url(link) else False
Loading