From 0849c0363a755e0a639d15a05e7d3dba3efe2dc2 Mon Sep 17 00:00:00 2001
From: Johannes Erwerle
Date: Wed, 13 Apr 2022 13:56:23 +0200
Subject: [PATCH] squashed all commits

added parsing for mathematical foundations recordings
added filepath sanitization
worked around the changing login URL
added streaming of requests for large downloads
added gitignore
added example
---
Review notes (ignored by `git am`, not part of the commit message):
 - il-card parsing: log the offending caption instead of calling
   .prettify() on the find_all() result list or on a None link, make the
   second message a real f-string, resolve the href with urljoin() like
   the other item parsers, and drop four stray trailing blank lines.
 - login: pass an explicit parser to BeautifulSoup, raise SystemExit
   instead of relying on the site-provided exit(), and build the POST
   target with urljoin() so absolute or root-relative form actions work.
 - utils.py: do not add a second blank line at the end of the file.
 - requirements.txt: drop the editable git+ssh self-reference to
   ilias_sync2.
 - Hunk headers and the diffstat were recomputed; the blob ids on the
   `index` lines still describe the pre-review content.
 - This patch was recovered from a whitespace-collapsed copy, so the
   indentation and the position of blank context lines are best-effort;
   check them against the tree (or use `git apply --ignore-whitespace`).

 .gitignore           | 178 +++++++++++++++++++++++++++++++++++++++++++
 example.py           |  33 ++++++++
 ilias_sync2/main.py  |  86 ++++++++++++++++-----
 ilias_sync2/utils.py |   4 +-
 requirements.txt     |   9 +++
 5 files changed, 296 insertions(+), 14 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 example.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d61c4a3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,178 @@
+# Created by https://www.toptal.com/developers/gitignore/api/python,venv
+# Edit at https://www.toptal.com/developers/gitignore?templates=python,venv
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### venv ###
+# Virtualenv
+# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
+[Bb]in
+[Ii]nclude
+[Ll]ib
+[Ll]ib64
+[Ll]ocal
+[Ss]cripts
+pyvenv.cfg
+pip-selfcheck.json
+
+# End of https://www.toptal.com/developers/gitignore/api/python,venv
diff --git a/example.py b/example.py
new file mode 100644
index 0000000..a21c8d3
--- /dev/null
+++ b/example.py
@@ -0,0 +1,33 @@
+from pathlib import Path
+from ilias_sync2 import Synchronizer
+
+base_path = Path("/home/jo/Uni2")
+cloud_computing = base_path / "Cloud-Computing"
+discrete_optimization = base_path / "Discrete-Optimization"
+distributed_system_i = base_path / "Distributed-Systems-1"
+modern_cryptography = base_path / "Introduction-to-Modern-Cryptography"
+uebertragungstechnik_1 = base_path / "Übertragungstechnik-1"
+loose_coupling = base_path / "Loose-Coupling"
+info_vis = base_path / "Information-Visualization"
+info_vis_exercise = base_path / "Information-Visualization-Exercise"
+
+directories = [
+    (cloud_computing,
+     "https://ilias3.uni-stuttgart.de/goto_Uni_Stuttgart_crs_2623454.html"),
+    (discrete_optimization,
+     "https://ilias3.uni-stuttgart.de/goto_Uni_Stuttgart_crs_2642312.html"),
+    (modern_cryptography,
+     "https://ilias3.uni-stuttgart.de/goto_Uni_Stuttgart_crs_2573315.html"),
+    (uebertragungstechnik_1,
+     "https://ilias3.uni-stuttgart.de/goto_Uni_Stuttgart_crs_2564663.html"),
+    (distributed_system_i,
+     "https://ilias3.uni-stuttgart.de/goto_Uni_Stuttgart_crs_2572756.html"),
+    (loose_coupling,
+     "https://ilias3.uni-stuttgart.de/goto_Uni_Stuttgart_crs_2572802.html"),
+    (info_vis,
+     "https://ilias3.uni-stuttgart.de/goto_Uni_Stuttgart_crs_2573138.html"),
+    (info_vis_exercise,
+     "https://ilias3.uni-stuttgart.de/goto_Uni_Stuttgart_crs_2573225.html"),
+]
+
+Synchronizer("st123456", r"yourpasswordhere", directories).synchronize()
diff --git a/ilias_sync2/main.py b/ilias_sync2/main.py
index 2b1c12b..3d40669 100644
--- a/ilias_sync2/main.py
+++ b/ilias_sync2/main.py
@@ -7,6 +7,7 @@ from urllib.parse import urljoin
 
 import requests
 from bs4 import BeautifulSoup
+from pathvalidate import sanitize_filepath
 
 from .utils import ContentDisposition, heading_sanitization
 
@@ -68,11 +69,11 @@ class CrawlObject():
     def request_head(self) -> requests.Response:
         return self.request(method="HEAD")
 
-    def request_get(self) -> requests.Response:
-        return self.request(method="GET")
+    def request_get(self, **kwargs) -> requests.Response:
+        return self.request(method="GET", **kwargs)
 
-    def request(self, method="GET") -> requests.Response:
-        response = self.session.request(method=method, url=self.url)
+    def request(self, method="GET", **kwargs) -> requests.Response:
+        response = self.session.request(method=method, url=self.url, **kwargs)
 
         if response.status_code != requests.codes.ok:
             raise RequestError(f"Error getting '{ method }' for { self.url }", response)
@@ -114,6 +115,8 @@ class CrawlObject():
             logging.info(f"Improving filename from '{ self.path.name }' to '{ disposition_obj.filename }'")
             self.path = self.path.parent / disposition_obj.filename
 
+        # self.path = Path(sanitize_filepath(self.path))
+
     def process_items(self) -> None:
 
         subitems = list()
@@ -131,17 +134,17 @@ class CrawlObject():
 
             items = block.find_all("div", class_="il_ContainerListItem")
             for item in items:
-                #logging.debug(item.prettify())
+                # logging.debug(item.prettify())
                 link = item.find("a")
                 if link is None:
                     continue
 
-                name = link.text
+                name = sanitize_filepath(link.text)
                 url = link.attrs["href"]
                 url = urljoin(self.url, url)
                 if url in self.discovered:
-                    continue
                     logging.info(f"{ url } already discovered, skipping it")
+                    continue
                 self.discovered.add(url)
 
                 subitems.append(CrawlObject(self.path / heading_sanitization(heading) / name, url,
@@ -158,7 +161,7 @@ class CrawlObject():
                 name_element = name.find("a")
                 if link_element is not None and name_element is not None:
                     link = link_element.attrs["href"]
-                    name = name_element.text + ".mp4"  # rough estimate that all files are mp4
+                    name = sanitize_filepath(name_element.text + ".mp4")  # rough estimate that all files are mp4
                     url = urljoin(self.url, link)
                     path = self.path / name
 
@@ -193,7 +196,7 @@ class CrawlObject():
                     logging.info("number of divs in formgroup != 2, dont know what to do.")
                     continue
 
-                name = name_div.text
+                name = sanitize_filepath(name_div.text)
                 link = link_div.find("a")
                 if link is None:
                     continue
@@ -220,6 +223,42 @@ class CrawlObject():
         # href = a_element.attrs["href"]
         # logging.info(href)
 
+        # find the il-card thumbnail videos
+        for item in content.find_all("div", class_=["il-card", "thumbnail"]):
+            captions = item.find_all("div", class_="caption")
+            if not captions:
+                logging.debug("no captions found")
+                continue
+
+            if len(captions) < 3:
+                logging.debug(f"too few captions found: { len(captions) }")
+                continue
+
+            heading = captions[0].find("a")
+            if heading is None:
+                logging.debug(f"No link found in { captions[0].prettify() }")
+                continue
+
+            name = heading.text
+
+            logging.debug(f"Found Heading: { name }")
+
+            name = sanitize_filepath(name + ".mp4")
+
+            link = captions[2].find("a")
+            if link is None:
+                logging.debug(f"No link found in { captions[2].prettify() }")
+                continue
+
+            # resolve relative hrefs against this page, like the other item parsers
+            url = urljoin(self.url, link.attrs["href"])
+
+            if url in self.discovered:
+                logging.info(f"{ url } already discovered, skipping it")
+                continue
+            self.discovered.add(url)
+            subitems.append(CrawlObject(self.path / name, url, self.session, discovered=self.discovered))
+
         for item in subitems:
             item.process()
 
@@ -231,14 +270,15 @@ class CrawlObject():
 
         If needed the directories are created.
         """
-        content = self.request_get().content
+        response = self.request_get(stream=True)
 
         if not self.path.parent.is_dir():
             self.path.parent.mkdir(parents=True, exist_ok=True)
 
         logging.info(f"Writing to file '{ self.path }'.")
         with open(self.path, "wb+") as f:
-            f.write(content)
+            for chunk in response.iter_content(chunk_size=10*(2**10)):
+                f.write(chunk)
 
     def update_requiered(self) -> bool:
         """
@@ -266,7 +306,26 @@ class Synchronizer:
         self._session = requests.Session()
 
         # fill the session with the correct cookies
-        self._session.get(f"{ BASE_URL }/login.php?target=&client_id=Uni_Stuttgart&cmd=force_login&lang=en")
+        login_page = self._session.get(f"{ BASE_URL }/login.php?target=&client_id=Uni_Stuttgart&cmd=force_login&lang=en")
+
+        login_page = BeautifulSoup(login_page.content, "html.parser")
+
+        # the login URL changes, so read it from the form instead of hard-coding it
+        login_form = login_page.find("form", attrs={"name": "formlogin"})
+
+        if login_form is None:
+            logging.error("Login failed, login form not found!")
+            raise SystemExit(1)
+
+        logging.debug(login_form)
+
+        login_url = login_form.attrs.get("action")
+
+        if login_url is None:
+            logging.error("Could not find the action URL in the login form!")
+            raise SystemExit(1)
+
+        logging.debug(f"Login URL: {login_url}")
 
         login_data = {
             "username": self.username,
@@ -275,7 +334,8 @@ class Synchronizer:
         }
 
         # do the actual login
-        self._session.post(f"{ BASE_URL }/ilias.php?lang=en&client_id=Uni_Stuttgart&cmd=post&cmdClass=ilstartupgui&cmdNode=123&baseClass=ilStartUpGUI&rtoken=", data=login_data)
+        self._session.post(urljoin(f"{ BASE_URL }/", login_url),
+                           data=login_data)
 
     def synchronize(self):
 
diff --git a/ilias_sync2/utils.py b/ilias_sync2/utils.py
index 279f4ef..106ebc2 100644
--- a/ilias_sync2/utils.py
+++ b/ilias_sync2/utils.py
@@ -2,6 +2,8 @@ import re
 from typing import Optional
 from dataclasses import dataclass
 
+from pathvalidate import sanitize_filepath
+
 
 @dataclass
 class ContentDisposition:
@@ -33,5 +35,5 @@ def heading_sanitization(heading: str) -> str:
     if heading == "Inhalt":
         return ""
 
-    return heading
+    return sanitize_filepath(heading)
 
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b707cea
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+beautifulsoup4==4.10.0
+bs4==0.0.1
+certifi==2021.10.8
+charset-normalizer==2.0.9
+idna==3.3
+pathvalidate==2.5.0
+requests==2.26.0
+soupsieve==2.3.1
+urllib3==1.26.7