squashed all commits

- added parsing for mathematical foundations recordings
- added filepath sanitization
- worked around the changing login URL
- added streaming of requests for large downloads
- added gitignore
- added example
parent 544e609d44
commit 0849c0363a

5 changed files with 300 additions and 14 deletions

178  .gitignore  vendored  Normal file

@@ -0,0 +1,178 @@
# Created by https://www.toptal.com/developers/gitignore/api/python,venv
# Edit at https://www.toptal.com/developers/gitignore?templates=python,venv

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

### venv ###
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
pip-selfcheck.json

# End of https://www.toptal.com/developers/gitignore/api/python,venv

33  example.py  Normal file

@@ -0,0 +1,33 @@
from pathlib import Path
from ilias_sync2 import Synchronizer

base_path = Path("/home/jo/Uni2")
cloud_computing = base_path / "Cloud-Computing"
discrete_optimization = base_path / "Discrete-Optimization"
distributed_system_i = base_path / "Distributed-Systems-1"
modern_cryptography = base_path / "Introduction-to-Modern-Cryptography"
uebertragungstechnik_1 = base_path / "Übertragungstechnik-1"
loose_coupling = base_path / "Loose-Coupling"
info_vis = base_path / "Information-Visualization"
info_vis_exercise = base_path / "Information-Visualization-Exercise"

directories = [
    (cloud_computing,
     "https://ilias3.uni-stuttgart.de/goto_Uni_Stuttgart_crs_2623454.html"),
    (discrete_optimization,
     "https://ilias3.uni-stuttgart.de/goto_Uni_Stuttgart_crs_2642312.html"),
    (modern_cryptography,
     "https://ilias3.uni-stuttgart.de/goto_Uni_Stuttgart_crs_2573315.html"),
    (uebertragungstechnik_1,
     "https://ilias3.uni-stuttgart.de/goto_Uni_Stuttgart_crs_2564663.html"),
    (distributed_system_i,
     "https://ilias3.uni-stuttgart.de/goto_Uni_Stuttgart_crs_2572756.html"),
    (loose_coupling,
     "https://ilias3.uni-stuttgart.de/goto_Uni_Stuttgart_crs_2572802.html"),
    (info_vis,
     "https://ilias3.uni-stuttgart.de/goto_Uni_Stuttgart_crs_2573138.html"),
    (info_vis_exercise,
     "https://ilias3.uni-stuttgart.de/goto_Uni_Stuttgart_crs_2573225.html"),
]

Synchronizer("st123456", r"yourpasswordhere", directories).synchronize()

@@ -7,6 +7,7 @@ from urllib.parse import urljoin

 import requests
 from bs4 import BeautifulSoup
+from pathvalidate import sanitize_filepath

 from .utils import ContentDisposition, heading_sanitization

@@ -68,11 +69,11 @@ class CrawlObject():
     def request_head(self) -> requests.Response:
         return self.request(method="HEAD")

-    def request_get(self) -> requests.Response:
-        return self.request(method="GET")
+    def request_get(self, **kwargs) -> requests.Response:
+        return self.request(method="GET", **kwargs)

-    def request(self, method="GET") -> requests.Response:
-        response = self.session.request(method=method, url=self.url)
+    def request(self, method="GET", **kwargs) -> requests.Response:
+        response = self.session.request(method=method, url=self.url, **kwargs)
         if response.status_code != requests.codes.ok:
             raise RequestError(f"Error getting '{ method }' for { self.url }",
                                response)
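
The hunk above threads **kwargs through the request helpers so callers can hand options like stream=True straight to requests. A minimal standalone sketch of that passthrough pattern; the Fetcher class is a made-up stand-in, not the project's CrawlObject:

import requests


class Fetcher:
    """Hypothetical stand-in for CrawlObject's request helpers."""

    def __init__(self, url: str):
        self.url = url
        self.session = requests.Session()

    def request_get(self, **kwargs) -> requests.Response:
        return self.request(method="GET", **kwargs)

    def request(self, method="GET", **kwargs) -> requests.Response:
        # Every keyword argument (stream=, timeout=, headers=, ...) is
        # forwarded verbatim to requests.Session.request().
        response = self.session.request(method=method, url=self.url, **kwargs)
        response.raise_for_status()
        return response


# stream=True now reaches requests without the wrapper knowing about it:
# response = Fetcher("https://example.org/big.bin").request_get(stream=True)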

@@ -114,6 +115,8 @@ class CrawlObject():
             logging.info(f"Improving filename from '{ self.path.name }' to '{ disposition_obj.filename }'")
             self.path = self.path.parent / disposition_obj.filename

+        # self.path = Path(sanitize_filepath(self.path))
+
     def process_items(self) -> None:

         subitems = list()
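
For context, disposition_obj.filename comes from parsing an HTTP Content-Disposition header. A rough sketch of that kind of parsing; the regex and the dataclass fields are illustrative guesses, not the project's exact ContentDisposition from .utils:

import re
from dataclasses import dataclass
from typing import Optional


@dataclass
class ContentDisposition:
    # Fields assumed for illustration.
    disposition_type: str
    filename: Optional[str] = None


def parse_content_disposition(header: str) -> ContentDisposition:
    # The type is everything before the first ';'; the filename comes
    # from a filename="..." parameter when present.
    disposition_type, _, params = header.partition(";")
    match = re.search(r'filename="?([^";]+)"?', params)
    return ContentDisposition(disposition_type.strip(),
                              match.group(1) if match else None)


# parse_content_disposition('attachment; filename="lecture01.pdf"').filename
# -> 'lecture01.pdf'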

@@ -131,17 +134,17 @@ class CrawlObject():

             items = block.find_all("div", class_="il_ContainerListItem")
             for item in items:
-                #logging.debug(item.prettify())
+                # logging.debug(item.prettify())
                 link = item.find("a")
                 if link is None:
                     continue
-                name = link.text
+                name = sanitize_filepath(link.text)
                 url = link.attrs["href"]
                 url = urljoin(self.url, url)

                 if url in self.discovered:
-                    continue
+                    logging.info(f"{ url } already discovered, skipping it")
+                    continue
                 self.discovered.add(url)
                 subitems.append(CrawlObject(self.path / heading_sanitization(heading) / name,
                                             url,

@@ -158,7 +161,7 @@ class CrawlObject():
             name_element = name.find("a")
             if link_element is not None and name_element is not None:
                 link = link_element.attrs["href"]
-                name = name_element.text + ".mp4"  # rough estimate that all files are mp4
+                name = sanitize_filepath(name_element.text + ".mp4")  # rough estimate that all files are mp4
                 url = urljoin(self.url, link)

                 path = self.path / name

@@ -193,7 +196,7 @@ class CrawlObject():
                 logging.info("number of divs in formgroup != 2, dont know what to do.")
                 continue

-            name = name_div.text
+            name = sanitize_filepath(name_div.text)
             link = link_div.find("a")
             if link is None:
                 continue

@@ -220,6 +223,45 @@ class CrawlObject():
         # href = a_element.attrs["href"]
         # logging.info(href)

+        # find the il-card thumbnail videos
+        for item in content.find_all("div", class_=["il-card", "thumbnail"]):
+            captions = item.find_all("div", class_="caption")
+            if not captions:
+                logging.debug("no captions found")
+                continue
+
+            if len(captions) < 3:
+                logging.debug(f"too few captions found: { len(captions) }")
+                continue
+
+            heading = captions[0].find("a")
+            if heading is None:
+                logging.debug(f"No <a> found in { captions[0].prettify() }")
+                continue
+
+            name = heading.text
+
+            logging.debug(f"Found Heading: { name }")
+
+            name = sanitize_filepath(name + ".mp4")
+
+            link = captions[2].find("a")
+            if link is None:
+                logging.debug(f"No <a> found in { captions[2].prettify() }")
+                continue
+
+            url = link.attrs["href"]
+
+            if url in self.discovered:
+                logging.info(f"{ url } already discovered, skipping it")
+                continue
+            self.discovered.add(url)
+            subitems.append(CrawlObject(self.path / name, url, self.session, discovered=self.discovered))
+
         for item in subitems:
             item.process()
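
A self-contained sketch of the caption lookup the new block performs, run against a tiny inline snippet; the markup is a guess at the ILIAS video-card structure, based only on the class names in the diff:

from bs4 import BeautifulSoup

# Illustrative markup only; real ILIAS pages are far larger.
html = '''
<div class="il-card thumbnail">
  <div class="caption"><a href="/video1.html">Mathematical Foundations 01</a></div>
  <div class="caption">12.10.2021</div>
  <div class="caption"><a href="/stream/video1.mp4">Download</a></div>
</div>
'''

soup = BeautifulSoup(html, "html.parser")
for card in soup.find_all("div", class_=["il-card", "thumbnail"]):
    captions = card.find_all("div", class_="caption")
    if len(captions) < 3:
        continue
    title = captions[0].find("a")   # first caption holds the recording title
    link = captions[2].find("a")    # third caption holds the download link
    if title is None or link is None:
        continue
    print(title.text, "->", link.attrs["href"])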

@@ -231,14 +273,15 @@ class CrawlObject():
         If needed the directories are created.
         """

-        content = self.request_get().content
+        response = self.request_get(stream=True)

         if not self.path.parent.is_dir():
             self.path.parent.mkdir(parents=True, exist_ok=True)

         logging.info(f"Writing to file '{ self.path }'.")
         with open(self.path, "wb+") as f:
-            f.write(content)
+            for chunk in response.iter_content(chunk_size=10*(2**10)):
+                f.write(chunk)

     def update_requiered(self) -> bool:
         """
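
Switching from response.content to stream=True with iter_content() is what keeps large recordings out of memory: the body is fetched lazily, one chunk at a time, as the loop consumes it. The same pattern in isolation (URL and output name are placeholders):

import requests

url = "https://example.org/large-recording.mp4"  # placeholder

# stream=True defers the body download until iter_content() is consumed,
# so only one ~10 KiB chunk is held in memory at a time.
response = requests.get(url, stream=True)
response.raise_for_status()

with open("recording.mp4", "wb") as f:
    for chunk in response.iter_content(chunk_size=10 * (2**10)):
        f.write(chunk)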

@@ -266,7 +309,25 @@ class Synchronizer:
         self._session = requests.Session()

         # fill the session with the correct cookies
-        self._session.get(f"{ BASE_URL }/login.php?target=&client_id=Uni_Stuttgart&cmd=force_login&lang=en")
+        login_page = self._session.get(f"{ BASE_URL }/login.php?target=&client_id=Uni_Stuttgart&cmd=force_login&lang=en")
+
+        login_page = BeautifulSoup(login_page.content)
+
+        login_form = login_page.find("form", attrs={"name": "formlogin"})
+
+        if login_form is None:
+            logging.error("Login failed, login form not found!")
+            exit(1)
+
+        logging.debug(login_form)
+
+        login_url = login_form.attrs.get("action")
+
+        if login_url is None:
+            logging.error("Could not find the action URL in the login form!")
+            exit(1)
+
+        logging.debug(f"Login URL: {login_url}")

         login_data = {
             "username": self.username,
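
Rather than hard-coding the login endpoint, which ILIAS kept changing, the commit scrapes the action attribute off the login form on every run. The same technique in isolation (the HTML snippet is illustrative):

from bs4 import BeautifulSoup

# Illustrative login page; the real one is fetched with the session.
html = '''
<form name="formlogin" action="ilias.php?cmd=post&amp;rtoken=abc123" method="post">
  <input name="username"><input name="password">
</form>
'''

soup = BeautifulSoup(html, "html.parser")
form = soup.find("form", attrs={"name": "formlogin"})
assert form is not None, "login form not found"

# The action URL differs between deployments, so it is read fresh
# instead of being baked into the code.
login_url = form.attrs.get("action")
print(login_url)  # ilias.php?cmd=post&rtoken=abc123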

@@ -275,7 +336,8 @@ class Synchronizer:
         }

         # do the actual login
-        self._session.post(f"{ BASE_URL }/ilias.php?lang=en&client_id=Uni_Stuttgart&cmd=post&cmdClass=ilstartupgui&cmdNode=123&baseClass=ilStartUpGUI&rtoken=", data=login_data)
+        self._session.post(f"{ BASE_URL }/{ login_url }",
+                           data=login_data)

     def synchronize(self):

@@ -2,6 +2,8 @@ import re
 from typing import Optional
 from dataclasses import dataclass

+from pathvalidate import sanitize_filepath
+
 @dataclass
 class ContentDisposition:

@@ -33,5 +35,6 @@ def heading_sanitization(heading: str) -> str:
     if heading == "Inhalt":
         return ""

-    return heading
+    return sanitize_filepath(heading)
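
heading_sanitization now funnels every heading through pathvalidate's sanitize_filepath, which drops characters that are invalid in file paths. A quick illustration; the heading is made up, and the exact character set considered invalid depends on the platform pathvalidate targets:

from pathvalidate import sanitize_filepath

heading = 'Lecture 3: "What is a Limit?"'

# Invalid path characters are removed by default; pass replacement_text=...
# to substitute something instead.
print(sanitize_filepath(heading))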

10  requirements.txt  Normal file

@@ -0,0 +1,10 @@
beautifulsoup4==4.10.0
bs4==0.0.1
certifi==2021.10.8
charset-normalizer==2.0.9
idna==3.3
-e git+ssh://git@git.srvspace.net/jo/ilias-sync2.git@b74c19af044e6fbd89432d073030094f53cda078#egg=ilias_sync2
pathvalidate==2.5.0
requests==2.26.0
soupsieve==2.3.1
urllib3==1.26.7