Merge pull request #8 from DedSecInside/dev

KingAkeem authored Sep 20, 2018
2 parents e2e7690 + 0fe0d58 commit 1ac142d

Showing 21 changed files with 553 additions and 330 deletions.
4 changes: 0 additions & 4 deletions .env

This file was deleted.

2 changes: 2 additions & 0 deletions .gitignore
@@ -17,6 +17,8 @@ tests/.ropeproject/
*.pyc
.pytestc*
.pytest_cache
__pycache*
__pycache__/

# Misc
torBot
1 change: 1 addition & 0 deletions FAQ.md
@@ -0,0 +1 @@
## FAQ
40 changes: 23 additions & 17 deletions README.md
@@ -62,18 +62,20 @@ If it's a new module, it should be put inside the modules directory and imported
The branch name should be your new feature name in the format <Feature_featurename_version(optional)>. For example, <i>Feature_FasterCrawl_1.0</i>.
Your name will be added to the contributor list below. :D

## Dependencies
1. Tor
2. Python 3.x (Make sure pip3 is installed)
3. requests
4. Beautiful Soup 4
5. Socket
6. Sock
7. Argparse
8. Git
9. termcolor
10. tldextract
11. Golang
### OS Dependencies
- Tor
- Python 3.x
- Golang 1.x (Not Currently Used)

### Python Dependencies
- beautifulsoup4
- pyinstaller
- PySocks
- termcolor
- requests
- requests_mock
- yattag
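
For reference, the `pip3 install -r requirements.txt` step in the setup section below consumes this list; a plausible unpinned requirements.txt mirroring the Python dependencies above would be:

```
beautifulsoup4
pyinstaller
PySocks
termcolor
requests
requests_mock
yattag
```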


## Basic setup
Before you run TorBot, make sure the following things are done properly:
@@ -83,6 +85,9 @@

* Make sure that your torrc is configured to SOCKS_PORT localhost:9050

* Install TorBot Python requirements
`pip3 install -r requirements.txt`

On Linux platforms, you can make an executable for TorBot by using the install.sh script.
You will need to give the script the correct permissions using `chmod +x install.sh`
Now you can run `./install.sh` to create the torBot binary.
@@ -116,9 +121,10 @@ Read more about torrc here: [Torrc](https://github.com/DedSecInside/TorBoT/blob

## TO-DO
- [ ] Visualization Module
- [ ] Implement A\* Search for webcrawler
- [X] Multithreading
- [ ] Optimization
- [x] Implement BFS Search for webcrawler
- [X] Multithreading for Get Links
- [ ] Improve stability (Handle errors gracefully, expand test coverage, etc.)
- [ ] Create a user-friendly GUI
- [ ] Randomize Tor Connection (Random Header and Identity)
- [ ] Keyword/Phrase search
- [ ] Social Media Integration
@@ -153,13 +159,13 @@ GNU Public License
## CREDITS

- [X] [P5N4PPZ](https://github.com/PSNAppz) - Owner
- [X] [KingAkeem](https://github.com/KingAkeem) - Experienced Contributor, Reviewer, Core Member
- [X] [agrepravin](https://github.com/agrepravin) - Contributor, Reviewer
- [X] [KingAkeem](https://github.com/KingAkeem) - Experienced Contributor, Reviewer
- [X] [shivankar-madaan](https://github.com/shivankar-madaan) - Experienced Contributor
- [X] [y-mehta](https://github.com/y-mehta) - Contributor
- [X] [Manfredi Martorana](https://github.com/Agostinelli) - Contributor
- [X] [Evan Sia Wai Suan](https://github.com/waisuan) - New Contributor
- [X] [Lean](https://github.com/leaen) - New Contributor
- [X] [shivankar-madaan](https://github.com/shivankar-madaan) - New Contributor
- [X] [Gus](https://github.com/HotPushUpGuy420) - New Contributor
- [X] [SubaruSama](https://github.com/SubaruSama) - New Contributor
- [X] [robly78746](https://github.com/robly78746) - New Contributor
Binary file removed __pycache__/settings.cpython-36.pyc
2 changes: 2 additions & 0 deletions install.sh
@@ -8,6 +8,8 @@ go get golang.org/x/net/html
mkdir -p tmp_build
mkdir -p tmp_dist

pip install pyinstaller

# Creates the executable file and sends dependencies to the recently created directories
pyinstaller --onefile --workpath ./tmp_build --distpath ./tmp_dist torBot.py

14 changes: 0 additions & 14 deletions modules/bcolors.py

This file was deleted.

52 changes: 52 additions & 0 deletions modules/colors.py
@@ -0,0 +1,52 @@

"""
Module containing class with colors
"""

class Colors:
"""
Class that contains colors used for TorBot in the terminal and a method
that adds color to a string
Attributes:
_colors (dict): A map containing all of the color codes needed
"""
def __init__(self):
self._colors = {
'white': "\033[1;37m",
'yellow': "\033[1;33m",
'green': "\033[1;32m",
'blue': "\033[1;34m",
'cyan': "\033[1;36m",
'red': "\033[1;31m",
'magenta': "\033[1;35m",
'black': "\033[1;30m",
'darkwhite': "\033[0;37m",
'darkyellow': "\033[0;33m",
'darkgreen': "\033[0;32m",
'darkblue': "\033[0;34m",
'darkcyan': "\033[0;36m",
'darkred': "\033[0;31m",
'darkmagenta': "\033[0;35m",
'darkblack': "\033[0;30m",
'end': "\033[0;0m"
}

def add(self, string, color):
"""
Method that adds color to a given string
Args:
string (str): string to add color to
color (str): name of the color to add
"""
return self.get(color) + string + self.get('end')

def get(self, color):
"""
Method that returns the color code of the given color string
Args:
color (str): name of the color whose code is returned
"""
return self._colors[color]
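
A minimal usage sketch of the new Colors helper, assuming the module path introduced in this diff:

```python
from modules.colors import Colors

COLOR = Colors()

# Wraps the message in the ANSI escape code for green, then resets styling.
print(COLOR.add('Websites Found - 5', 'green'))
```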
22 changes: 13 additions & 9 deletions modules/getemails.py
@@ -1,9 +1,15 @@
from modules.bcolors import Bcolors
from modules.net_utils import get_urls_from_page

"""
Module that returns emails found on a webpage
"""
from bs4 import BeautifulSoup

import modules.getweblinks
from modules.colors import Colors

COLOR = Colors()

def getMails(soup):
def get_mails(soup):
"""
Searches for <a href> tags for links then checks if link contains the
substring 'mailto' indicating that it's an email. If it is determined
@@ -16,18 +22,16 @@ def getMails(soup):
Returns:
emails: list of email IDs
"""
b_colors = Bcolors()

if isinstance(soup, BeautifulSoup):

emails = get_urls_from_page(soup, email=True)
emails = modules.getweblinks.get_urls_from_page(soup, email=True)

# Pretty print output as below
print('')
print(b_colors.OKGREEN+'Mails Found - '+b_colors.ENDC+str(len(emails)))
success_string = 'Mails Found - ' + str(len(emails))
print(COLOR.add(success_string, 'green'))
print('-------------------------------')

return emails

else:
raise ValueError('Method parameter is not of instance BeautifulSoup')
raise ValueError('Method parameter is not of instance BeautifulSoup')
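
A usage sketch of the renamed get_mails (the plain requests fetch and example URL are illustrative only; a real TorBot run routes traffic through the Tor SOCKS proxy):

```python
import requests
from bs4 import BeautifulSoup

from modules.getemails import get_mails

# Illustrative fetch; TorBot normally goes through the Tor proxy.
html = requests.get('http://example.com').text
soup = BeautifulSoup(html, 'html.parser')
emails = get_mails(soup)  # prints 'Mails Found - N' in green, returns the list
```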
143 changes: 109 additions & 34 deletions modules/getweblinks.py
@@ -1,54 +1,129 @@
from modules.net_utils import get_urls_from_page, get_url_status
from modules import pagereader

"""
Module used to interact with a page's urls
"""
import re

from bs4 import BeautifulSoup
from modules.bcolors import Bcolors

import modules.utils
import modules.pagereader

def add_green(link):
colors = Bcolors()
return '\t' + colors.OKGREEN + link + colors.ENDC
from modules.colors import Colors

COLOR = Colors()

def add_red(link):
colors = Bcolors()
return '\t' + colors.On_Red + link + colors.ENDC
def is_url(url):
"""
Returns an integer representing validity of url syntax
Args:
url (str): url to be verified
Returns:
(int): integer representing if url is a valid format
"""
pattern = r"^https?:\/\/(www\.)?([a-zA-Z0-9]*)\.([a-zA-Z]+)(.*)"
regex = re.compile(pattern)
if regex.match(url):
return 1
return 0

def get_links(soup, ext=False, live=False):

def is_onion_url(url):
"""
Searches through all <a ref> (hyperlinks) tags and stores them in a
list then validates if the url is formatted correctly.
Returns an integer representing validity of an onion url syntax
Args:
soup: BeautifulSoup instance currently being used.
Args:
url (str): url to be verified
Returns:
(int): integer representing if url is a valid format
"""
pattern = r"^https?:\/\/(www\.)?([a-zA-Z0-9]*)\.onion/(.*)"
regex = re.compile(pattern)
if regex.match(url):
return 1
return 0
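
A quick sanity check of the two validators above (example URLs are illustrative):

```python
assert is_url("https://www.example.com/page") == 1
assert is_url("not a url") == 0
# Only .onion hosts pass the onion check.
assert is_onion_url("http://exampleonion.onion/home") == 1
assert is_onion_url("https://www.example.com/page") == 0
```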

def get_urls_from_page(page_soup, email=False, extension=False):
"""
Searches for urls on page using the anchor tag and href attribute,
also searches for emails using 'mailto' if specified.
Args:
page_soup (bs4.BeautifulSoup): html soup to search
email (bool): flag whether to collect emails as well
extension (bool): flag whether to use additional extensions
Returns:
urls (list): urls found on page
"""
if not isinstance(page_soup, BeautifulSoup):
raise Exception("First arg must be bs4.BeautifulSoup object")

urls = []
anchors_on_page = page_soup.find_all('a')
for anchor_tag in anchors_on_page:
url = anchor_tag.get('href')
if extension:
if url and is_url(url) == 1:
urls.append(url)
elif email:
if url and 'mailto' in url:
email_addr = url.split(':')
if len(email_addr) > 1:
urls.append(email_addr[1])
else:
if url and is_onion_url(url) == 1:
urls.append(url)

return urls


def search_page(html, ext, stop_depth=None):
"""
Takes in a page's HTML and searches the links on the page using
BFS.
Args:
html (str): HTML with links to search
ext (str): additional extension
stop_depth (int): The depth at which to stop
Returns:
websites: List of websites that were found
links_found (list): links found on page and associated pages
"""

soup = BeautifulSoup(html, 'html.parser')
links = get_urls_from_page(soup, extension=ext)
if stop_depth:
links_found = modules.utils.bfs_urls(links, ext, stop_depth=stop_depth)
else:
links_found = modules.utils.bfs_urls(links, ext)

return links_found
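
modules.utils.bfs_urls itself is not part of this diff; the sketch below is only a hypothetical reading of such a breadth-first expansion, reusing get_urls_from_page above and leaving the page fetch to a caller-supplied function:

```python
from collections import deque

from bs4 import BeautifulSoup

def bfs_urls(urls, ext, stop_depth=2, fetch=None):
    # Hypothetical sketch -- not the real modules.utils.bfs_urls.
    # Expands the seed urls breadth-first, up to stop_depth levels deep.
    visited = set(urls)
    queue = deque((url, 0) for url in urls)
    found = []
    while queue:
        url, depth = queue.popleft()
        found.append(url)
        if depth >= stop_depth or fetch is None:
            continue
        soup = BeautifulSoup(fetch(url), 'html.parser')
        for link in get_urls_from_page(soup, extension=ext):
            if link not in visited:
                visited.add(link)
                queue.append((link, depth + 1))
    return found
```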


def get_links(soup, ext=False, live=False):
"""
Returns a list of links found on the webpage of the soup passed. If live
is set to true, it will also print the status of each of the links,
and setting ext to an actual extension such as '.com' will allow those
extensions to be recognized as valid urls and not just '.onion'.
Args:
soup (bs4.BeautifulSoup): webpage to be searched for links.
Returns:
websites (list(str)): List of websites that were found
"""
b_colors = Bcolors()
if isinstance(soup, BeautifulSoup):
websites = get_urls_from_page(soup, extension=ext)
# Pretty print output as below
print(''.join((b_colors.OKGREEN,
'Websites Found - ', b_colors.ENDC, str(len(websites)))))
success_string = 'Websites Found - ' + str(len(websites))
print(COLOR.add(success_string, 'green'))
print('------------------------------------')

if live:
for link in websites:
if get_url_status(link) != 0:
coloredlink = add_green(link)
page = pagereader.read_first_page(link)[0]
if page is not None and page.title is not None:
print_row(coloredlink, page.title.string)
else:
coloredlink = add_red(link)
print_row(coloredlink, "Not found")

modules.utils.queue_tasks(websites, modules.pagereader.display_url)
return websites

else:
raise(Exception('Method parameter is not of instance BeautifulSoup'))


def print_row(url, description):
print("%-80s %-30s" % (url, description))
raise Exception('Method parameter is not of instance BeautifulSoup')
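
Finally, a usage sketch of the reworked get_links (again, the direct requests call is illustrative; with live=True each url is displayed concurrently via modules.utils.queue_tasks and modules.pagereader.display_url, both outside this diff):

```python
import requests
from bs4 import BeautifulSoup

from modules.getweblinks import get_links

html = requests.get('http://example.com').text  # illustrative fetch
soup = BeautifulSoup(html, 'html.parser')

# Prints 'Websites Found - N' in green; live=True also displays each link.
links = get_links(soup, live=True)
```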