From 1997cc4b32b21b35a6cb92d345bad6678867681b Mon Sep 17 00:00:00 2001 From: Nils Date: Thu, 13 Jun 2024 00:12:07 +0200 Subject: [PATCH] add download sciebo folders --- setup.cfg | 1 + syncmymoodle/__main__.py | 109 ++++++++++++++++++++++++++++++++++----- 2 files changed, 98 insertions(+), 12 deletions(-) diff --git a/setup.cfg b/setup.cfg index 3d085d7..d4b6f99 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,6 +24,7 @@ install_requires = yt-dlp>=2021.12.27 pdfkit>=0.6.0 tqdm>=4.0.0 + lxml>=5.0.0 [options.extras_require] keyring = diff --git a/syncmymoodle/__main__.py b/syncmymoodle/__main__.py index 0a9ff2a..8e5b695 100755 --- a/syncmymoodle/__main__.py +++ b/syncmymoodle/__main__.py @@ -55,9 +55,7 @@ def __init__( self.type = type self.parent = parent self.children: List[Node] = [] - self.additional_info = ( - additional_info # Currently only used for course_id in opencast - ) + self.additional_info = additional_info # Currently only used for course_id in opencast and auth header in sciebo self.is_downloaded = ( is_downloaded # Can also be used to exclude files from being downloaded ) @@ -829,6 +827,8 @@ def download_file(self, node): else: resume_size = 0 header = dict() + if node.type.lower() == "sciebo file": + header = {**header, **node.additional_info} with closing( self.session.get(node.url, headers=header, stream=True) @@ -1050,19 +1050,104 @@ def scanForLinks( # https://rwth-aachen.sciebo.de/s/XXX if self.config.get("used_modules", {}).get("url", {}).get("sciebo", {}): - sciebo_links = re.findall( - "https://rwth-aachen.sciebo.de/s/[a-zA-Z0-9-]+", text + sciebo_links = list( + set(re.findall("https://rwth-aachen.sciebo.de/s/[a-zA-Z0-9-]+", text)) ) - for vid in sciebo_links: - response = self.session.get(vid) + sciebo_url = "https://rwth-aachen.sciebo.de" + webdav_location = "/public.php/webdav/" + for link in sciebo_links: + logging.info(f"Found Sciebo Link: {link}") + + # get the download page + response = self.session.get(link) + + # parse html code soup = bs(response.text, features="html.parser") - url = soup.find("input", {"name": "downloadURL"}) - filename = soup.find("input", {"name": "filename"}) - if url and filename: - parent_node.add_child( - filename["value"], url["value"], "Sciebo file", url=url["value"] + + # get the requesttoken + requestToken = soup.head["data-requesttoken"] + logger.info(f"RequestToken: {requestToken}") + + # print the property value of the input tag with the name sharingToken + sharingToken = soup.find("input", {"name": "sharingToken"})["value"] + logger.info(f"SharingToken: {sharingToken}") + + # get baseauthentication secret + baseAuthSecret = base64.b64encode( + (sharingToken + ":null").encode() + ).decode() + logger.info(f"BaseAuthSecret: {baseAuthSecret}") + + # get auth header + auth_header = { + "Authorization": "Basic " + baseAuthSecret, + "requesttoken": requestToken, + } + + parent_node = parent_node.add_child( + f"sciebo-{sharingToken}", None, "Sciebo Folder" + ) + + # recursive function to get all files in the sciebo folder + def get_sciebo_files( + href: str, parent_node: Node, sharingToken: str, auth_header: dict + ): + + # request the URL with the PROPFIND method and the header + response = self.session.request( + "PROPFIND", sciebo_url + href, headers=auth_header ) + # parse the response + soup = bs(response.text, features="xml") + + for response in soup.find_all("d:response"): + # get the href of the response + new_href = response.find("d:href").text + + if new_href == href: + logger.info( + f"Skipping {new_href} because it is the current folder" + ) + continue + + logger.info(f"response: {response.find('d:href').text}") + # get the displayname of the response + displayname = ( + new_href.split("/")[-2] + if new_href.endswith("/") + else new_href.split("/")[-1] + ) + displayname = ( + f"sciebo-{sharingToken}" + if displayname == "webdav" + else displayname + ) + + # check if the response is a folder + if new_href.endswith("/"): + # create a new node for the folder + folder_node = parent_node.add_child( + displayname, None, "Sciebo Folder" + ) + # recursive call to get all files in the folder + get_sciebo_files( + new_href, folder_node, sharingToken, auth_header + ) + else: + # create a new node for the file + parent_node.add_child( + displayname, + None, + "Sciebo File", + url=sciebo_url + new_href, + additional_info=auth_header, + ) + + get_sciebo_files( + webdav_location, parent_node, sharingToken, auth_header + ) + def main(): parser = ArgumentParser(