#!/usr/bin/python3.5
"""Download the reports listed in the APTnotes JSON index.

The index is fetched from GitHub, then each report is resolved through its
preview page, downloaded into a directory named after the report's year,
verified against the SHA-1 value given in the index, and renamed with a
``.pdf`` extension when libmagic identifies it as a PDF.
"""
import asyncio
import hashlib
import json
import os

import aiohttp
import requests
import magic
from bs4 import BeautifulSoup

# Number of bytes requested from the response stream per read.
CHUNK_SIZE = 1024


async def _resolve_download_url(session, report_link):
    """Fetch the preview page at *report_link* and build the download URL.

    The parsing expects the first ``<script>`` inside ``<body>`` to hold
    ``;``-separated statements, the second of which assigns a JSON object
    containing the key ``/app-api/enduserapp/shared-item``. Any deviation
    raises (AttributeError / IndexError / KeyError / ValueError), which the
    caller reports as a download failure.
    """
    # Download report preview page for parsing
    async with session.get(report_link) as splash_response:
        splash_page = await splash_response.content.read()

    # Parse preview page for desired elements to build download URL
    soup = BeautifulSoup(splash_page, 'lxml')
    sections = soup.find('body').find('script').contents[0].split(';')
    app_api = json.loads(sections[1].split('=')[1])['/app-api/enduserapp/shared-item']

    # Build download URL
    box_url = "https://app.box.com/index.php"
    box_args = "?rm=box_download_shared_file&shared_name={}&file_id={}"
    return box_url + box_args.format(app_api['sharedName'],
                                     'f_{}'.format(app_api['itemID']))


async def _download_verified(session, file_url, download_path, expected_sha1):
    """Stream *file_url* to *download_path* and check its SHA-1 digest.

    Raises ValueError when the digest differs from *expected_sha1*. Whenever
    this function does not complete successfully (network error, write error,
    digest mismatch, cancellation) the file at *download_path* is removed, so
    a later run does not mistake a partial file for a finished download.
    """
    hash_check = hashlib.sha1()
    verified = False
    try:
        # Use semaphore to limit download rate. `sem` is the module-level
        # semaphore created in the script entry point below.
        async with sem:
            # Download file in chunks and save to folder location. The
            # response is released by the `async with` on exit.
            async with session.get(file_url) as download_response:
                with open(download_path, 'wb') as f_handle:
                    while True:
                        chunk = await download_response.content.read(CHUNK_SIZE)
                        if not chunk:
                            break
                        hash_check.update(chunk)
                        f_handle.write(chunk)

        # Verify file contents based on expected hash value
        if hash_check.hexdigest() != expected_sha1:
            raise ValueError("File integrity check failed")
        verified = True
    finally:
        if not verified and os.path.exists(download_path):
            os.remove(download_path)


async def download_report(session, report):
    """Download one report described by the mapping *report*.

    Reads the keys ``Year``, ``Link``, ``Filename`` and ``SHA-1`` from
    *report*. The file is stored as ``<Year>/<Filename>``; if that path, or
    the same path with ``.pdf`` appended, already exists the download is
    skipped. Failures during download, verification or type detection are
    printed rather than raised, so one bad report does not stop the others.

    NOTE(review): ``Year`` and ``Filename`` come from the remote index and
    are used as path components without sanitisation - confirm the index is
    trusted.
    """
    report_year = report['Year']
    report_link = report['Link']
    report_filename = report['Filename']
    report_sha1 = report['SHA-1']

    # Ensure directory exists
    os.makedirs(report_year, exist_ok=True)

    # Set download path
    download_path = os.path.join(report_year, report_filename)

    # File with PDF extension path
    pdf_extension_path = download_path + ".pdf"

    if os.path.exists(download_path) or os.path.exists(pdf_extension_path):
        print("[!] File {} already exists".format(report_filename))
        return

    try:
        file_url = await _resolve_download_url(session, report_link)
        await _download_verified(session, file_url, download_path, report_sha1)

        # Identify filetype and add extension if PDF
        file_type = magic.from_file(download_path, mime=True)
        if file_type == "application/pdf":
            os.rename(download_path, pdf_extension_path)

        print("[+] Successfully downloaded {}".format(report_filename))
    except Exception as unexpected_error:
        # Deliberately broad: this is the per-report boundary, and a failure
        # here must not abort the remaining downloads.
        message = "[!] Download failure for {}".format(report_filename)
        print(message, unexpected_error)


async def download_all_reports(loop, APT_reports):
    """Download every report in *APT_reports* concurrently on *loop*.

    One task per report is scheduled against a shared aiohttp session and
    the coroutine returns once all of them have finished. An empty
    *APT_reports* returns immediately, because ``asyncio.wait`` rejects an
    empty set of tasks.
    """
    if not APT_reports:
        return

    # ClientSession is an asynchronous context manager, so it has to be
    # entered with `async with`.
    async with aiohttp.ClientSession(loop=loop) as session:
        download_queue = [loop.create_task(download_report(session, report))
                          for report in APT_reports]
        await asyncio.wait(download_queue)


if __name__ == '__main__':
    # Retrieve APT Note Data
    github_url = "https://raw.githubusercontent.com/aptnotes/data/master/APTnotes.json"
    # Timeout keeps the script from hanging forever on a stalled connection.
    APTnotes = requests.get(github_url, timeout=60)

    if APTnotes.status_code == 200:
        # Load APT report metadata into JSON container
        APT_reports = json.loads(APTnotes.text)

        # Reverse order of reports in order to download newest to oldest
        APT_reports.reverse()

        # Set semaphore for rate limiting
        sem = asyncio.Semaphore(10)

        # Create async loop
        loop = asyncio.get_event_loop()
        loop.run_until_complete(download_all_reports(loop, APT_reports))
    else:
        print("[!] Unable to retrieve APTnotes index (HTTP {})".format(
            APTnotes.status_code))