Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Sciebo downloads #117

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ install_requires =
yt-dlp>=2021.12.27
pdfkit>=0.6.0
tqdm>=4.0.0
lxml>=5.0.0
septatrix marked this conversation as resolved.
Show resolved Hide resolved

[options.extras_require]
keyring =
Expand Down
109 changes: 97 additions & 12 deletions syncmymoodle/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,7 @@ def __init__(
self.type = type
self.parent = parent
self.children: List[Node] = []
self.additional_info = (
additional_info # Currently only used for course_id in opencast
)
self.additional_info = additional_info # Currently only used for course_id in opencast and auth header in sciebo
self.is_downloaded = (
is_downloaded # Can also be used to exclude files from being downloaded
)
Expand Down Expand Up @@ -829,6 +827,8 @@ def download_file(self, node):
else:
resume_size = 0
header = dict()
if node.type.lower() == "sciebo file":
header = {**header, **node.additional_info}

with closing(
self.session.get(node.url, headers=header, stream=True)
Expand Down Expand Up @@ -1050,19 +1050,104 @@ def scanForLinks(

# https://rwth-aachen.sciebo.de/s/XXX
if self.config.get("used_modules", {}).get("url", {}).get("sciebo", {}):
sciebo_links = re.findall(
"https://rwth-aachen.sciebo.de/s/[a-zA-Z0-9-]+", text
sciebo_links = list(
set(re.findall("https://rwth-aachen.sciebo.de/s/[a-zA-Z0-9-]+", text))
nilshee marked this conversation as resolved.
Show resolved Hide resolved
)
for vid in sciebo_links:
response = self.session.get(vid)
sciebo_url = "https://rwth-aachen.sciebo.de"
webdav_location = "/public.php/webdav/"
for link in sciebo_links:
logging.info(f"Found Sciebo Link: {link}")

# get the download page
response = self.session.get(link)

# parse html code
soup = bs(response.text, features="html.parser")
url = soup.find("input", {"name": "downloadURL"})
filename = soup.find("input", {"name": "filename"})
if url and filename:
parent_node.add_child(
filename["value"], url["value"], "Sciebo file", url=url["value"]

# get the requesttoken
requestToken = soup.head["data-requesttoken"]
logger.info(f"RequestToken: {requestToken}")

# read the value of the hidden input tag named sharingToken
sharingToken = soup.find("input", {"name": "sharingToken"})["value"]
logger.info(f"SharingToken: {sharingToken}")

# get baseauthentication secret
baseAuthSecret = base64.b64encode(
f"{sharingToken}:null".encode()
).decode()
logger.info(f"BaseAuthSecret: {baseAuthSecret}")

# get auth header
auth_header = {
"Authorization": f"Basic {baseAuthSecret}",
"requesttoken": requestToken,
}

parent_node = parent_node.add_child(
f"sciebo-{sharingToken}", None, "Sciebo Folder"
)

# recursive function to get all files in the sciebo folder
def get_sciebo_files(
    href: str, parent_node: Node, sharingToken: str, auth_header: dict
):
    """Mirror one folder of a public Sciebo share into the node tree.

    Lists ``href`` via a WebDAV ``PROPFIND`` request and adds one child to
    ``parent_node`` per returned entry: a "Sciebo Folder" node (which is then
    descended into recursively) for hrefs ending in ``/``, and a
    "Sciebo File" node otherwise.

    Args:
        href: Server-relative WebDAV path of the folder to list; it is
            appended to ``sciebo_url`` from the enclosing scope.
        parent_node: Node that receives the children found in ``href``.
        sharingToken: Token of the public share; only used to build a
            readable name for an entry whose last path segment is "webdav".
        auth_header: HTTP headers sent with the listing request and stored
            unchanged on every file node as ``additional_info``.

    NOTE(review): relies on ``self.session`` behaving like a
    ``requests.Session`` (``.request``, ``.ok``, ``.status_code``, ``.text``)
    -- confirm against the enclosing class.
    """
    # Imported locally so the block does not depend on file-level imports.
    from urllib.parse import unquote

    def canonical(path: str) -> str:
        # Ignore percent-encoding and trailing-slash differences when
        # deciding whether two hrefs name the same resource.
        return unquote(path).rstrip("/")

    # A PROPFIND without a Depth header means "infinity" (RFC 4918). The
    # recursion below walks sub-folders itself, so only direct children are
    # requested; otherwise nested entries could be added more than once.
    listing = self.session.request(
        "PROPFIND",
        sciebo_url + href,
        headers={**auth_header, "Depth": "1"},
    )
    if not listing.ok:
        logger.warning(
            f"Could not list Sciebo folder {href}: HTTP {listing.status_code}"
        )
        return

    # parse the multistatus response
    soup = bs(listing.text, features="xml")
    current_folder = canonical(href)

    for entry in soup.find_all("d:response"):
        href_tag = entry.find("d:href")
        if href_tag is None:
            # Malformed entry without a location; nothing to add.
            continue
        new_href = href_tag.text

        # The listing contains the requested folder itself; descending into
        # it again would recurse forever.
        if canonical(new_href) == current_folder:
            logger.info(
                f"Skipping {new_href} because it is the current folder"
            )
            continue

        logger.info(f"response: {new_href}")

        is_folder = new_href.endswith("/")

        # Last path segment; hrefs are percent-encoded, so decode it to get
        # the real file/folder name.
        raw_name = new_href.rstrip("/").rsplit("/", 1)[-1]
        if raw_name == "webdav":
            displayname = f"sciebo-{sharingToken}"
        else:
            displayname = unquote(raw_name)

        if is_folder:
            # create a node for the folder and walk into it
            folder_node = parent_node.add_child(
                displayname, None, "Sciebo Folder"
            )
            get_sciebo_files(
                new_href, folder_node, sharingToken, auth_header
            )
        else:
            # create a node for the file; the URL keeps the encoded href
            parent_node.add_child(
                displayname,
                None,
                "Sciebo File",
                url=sciebo_url + new_href,
                additional_info=auth_header,
            )

get_sciebo_files(
webdav_location, parent_node, sharingToken, auth_header
)


def main():
parser = ArgumentParser(
Expand Down
Loading