squashed all commits
added parsing for mathematical foundations recordings
added filepath sanitization
worked around the changing login URL
added streaming of requests for large downloads
added gitignore
added example
parent 544e609d44
commit 0849c0363a
5 changed files with 300 additions and 14 deletions
@@ -7,6 +7,7 @@ from urllib.parse import urljoin
 import requests
 from bs4 import BeautifulSoup
+from pathvalidate import sanitize_filepath

 from .utils import ContentDisposition, heading_sanitization

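For reference, a standalone sketch (not part of the commit) of what the new pathvalidate import provides; the sample string is made up:

    # pathvalidate.sanitize_filepath strips characters that are invalid in
    # file paths while leaving the directory separators intact
    from pathvalidate import sanitize_filepath

    raw = "Mathematical Foundations/Lecture 3: Limits <HD>.mp4"
    # platform="universal" applies the strictest cross-platform rules
    print(sanitize_filepath(raw, platform="universal"))
    # the ':' and '<'/'>' are dropped, the '/' separator survives
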
@@ -68,11 +69,11 @@ class CrawlObject():
     def request_head(self) -> requests.Response:
         return self.request(method="HEAD")

-    def request_get(self) -> requests.Response:
-        return self.request(method="GET")
+    def request_get(self, **kwargs) -> requests.Response:
+        return self.request(method="GET", **kwargs)

-    def request(self, method="GET") -> requests.Response:
-        response = self.session.request(method=method, url=self.url)
+    def request(self, method="GET", **kwargs) -> requests.Response:
+        response = self.session.request(method=method, url=self.url, **kwargs)
         if response.status_code != requests.codes.ok:
             raise RequestError(f"Error getting '{ method }' for { self.url }",
                                response)
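The hunk threads **kwargs through the request helpers so callers can reach the keyword arguments of requests.Session.request — this is what later enables stream=True for downloads. A minimal sketch of the pattern, with illustrative names rather than the project's real class:

    import requests

    class Fetcher:
        def __init__(self, url: str):
            self.url = url
            self.session = requests.Session()

        def request(self, method="GET", **kwargs) -> requests.Response:
            # every keyword argument (stream=, timeout=, headers=, ...) is
            # forwarded untouched to requests.Session.request()
            return self.session.request(method=method, url=self.url, **kwargs)

        def request_get(self, **kwargs) -> requests.Response:
            return self.request(method="GET", **kwargs)

    # a caller can now opt into streaming without the wrapper knowing about it:
    # Fetcher("https://example.com/big.iso").request_get(stream=True)
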
@@ -114,6 +115,8 @@ class CrawlObject():
             logging.info(f"Improving filename from '{ self.path.name }' to '{ disposition_obj.filename }'")
             self.path = self.path.parent / disposition_obj.filename

+        # self.path = Path(sanitize_filepath(self.path))
+
     def process_items(self) -> None:

         subitems = list()
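The new line is parked as a comment: it would sanitize the entire accumulated path in one go instead of per component as the hunks below do. A small standalone demo of what that call does (the sample path is made up, and per-component sanitization being preferred is an assumption):

    from pathvalidate import sanitize_filepath

    p = "courses/Analysis I: Basics/week?1.mp4"
    # separators are preserved; only invalid characters inside each
    # component are removed
    print(sanitize_filepath(p, platform="universal"))
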
@@ -131,17 +134,17 @@ class CrawlObject():

             items = block.find_all("div", class_="il_ContainerListItem")
             for item in items:
-                #logging.debug(item.prettify())
+                # logging.debug(item.prettify())
                 link = item.find("a")
                 if link is None:
                     continue
-                name = link.text
+                name = sanitize_filepath(link.text)
                 url = link.attrs["href"]
                 url = urljoin(self.url, url)

                 if url in self.discovered:
-                    continue
+                    logging.info(f"{ url } already discovered, skipping it")
+                    continue
                 self.discovered.add(url)
                 subitems.append(CrawlObject(self.path / heading_sanitization(heading) / name,
                                             url,
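The change in the last group swaps a silent skip for a logged one: each URL enters self.discovered exactly once and revisits are reported rather than dropped quietly. The visited-set idiom in isolation (illustrative names, not the project's API):

    discovered: set[str] = set()

    def visit(url: str) -> bool:
        # returns True if the URL is new and should be crawled
        if url in discovered:
            print(f"{ url } already discovered, skipping it")
            return False
        discovered.add(url)
        return True

    assert visit("https://example.com/a")      # first visit: crawled
    assert not visit("https://example.com/a")  # second visit: skipped
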
@@ -158,7 +161,7 @@ class CrawlObject():
             name_element = name.find("a")
             if link_element is not None and name_element is not None:
                 link = link_element.attrs["href"]
-                name = name_element.text + ".mp4"  # rough estimate that all files are mp4
+                name = sanitize_filepath(name_element.text + ".mp4")  # rough estimate that all files are mp4
                 url = urljoin(self.url, link)

                 path = self.path / name
@@ -193,7 +196,7 @@ class CrawlObject():
                 logging.info("number of divs in formgroup != 2, dont know what to do.")
                 continue

-            name = name_div.text
+            name = sanitize_filepath(name_div.text)
             link = link_div.find("a")
             if link is None:
                 continue
@@ -220,6 +223,45 @@ class CrawlObject():
         # href = a_element.attrs["href"]
         # logging.info(href)

+        # find the il-card thumbnail videos
+        for item in content.find_all("div", class_=["il-card", "thumbnail"]):
+            captions = item.find_all("div", class_="caption")
+            if captions is None:
+                logging.debug("no captions found")
+                continue
+
+            if len(captions) < 3:
+                logging.debug(f"too few captions found: { len(captions) }")
+                continue
+
+            heading = captions[0].find("a")
+            if heading is None:
+                logging.debug(f"No <a> found in { captions[0].prettify() }")
+                continue
+
+            name = heading.text
+
+            logging.debug(f"Found Heading: { name }")
+
+            name = sanitize_filepath(name + ".mp4")
+
+            link = captions[2].find("a")
+            if link is None:
+                logging.debug(f"No <a> found in { captions[2].prettify() }")
+                continue
+
+            url = link.attrs["href"]
+
+            if url in self.discovered:
+                logging.info(f"{ url } already discovered, skipping it")
+                continue
+            self.discovered.add(url)
+            subitems.append(CrawlObject(self.path / name, url, self.session, discovered=self.discovered))
+
+
+
+
+
         for item in subitems:
             item.process()

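One subtlety in the added block: passing a list to class_ makes BeautifulSoup match divs that carry either class, not both. If the intent is "both classes on the same div", a CSS selector expresses that; a sketch against toy markup, not the real ILIAS page:

    from bs4 import BeautifulSoup

    html = '<div class="il-card thumbnail"></div><div class="thumbnail"></div>'
    soup = BeautifulSoup(html, "html.parser")

    print(len(soup.find_all("div", class_=["il-card", "thumbnail"])))  # 2 -- either class
    print(len(soup.select("div.il-card.thumbnail")))                   # 1 -- both classes
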
@@ -231,14 +273,15 @@ class CrawlObject():
         If needed the directories are created.
         """

-        content = self.request_get().content
+        response = self.request_get(stream=True)

         if not self.path.parent.is_dir():
             self.path.parent.mkdir(parents=True, exist_ok=True)

         logging.info(f"Writing to file '{ self.path }'.")
         with open(self.path, "wb+") as f:
-            f.write(content)
+            for chunk in response.iter_content(chunk_size=10*(2**10)):
+                f.write(chunk)

     def update_requiered(self) -> bool:
         """
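The download no longer buffers the whole body in memory: stream=True defers fetching the content, and iter_content hands it over in chunks. The idiom reduced to its core (placeholder URL and filename):

    import requests

    with requests.get("https://example.com/large-file.bin", stream=True) as response:
        response.raise_for_status()
        with open("large-file.bin", "wb") as f:
            # 10 KiB per chunk, matching the commit's 10*(2**10);
            # only one chunk is held in memory at a time
            for chunk in response.iter_content(chunk_size=10 * (2 ** 10)):
                f.write(chunk)
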
@@ -266,7 +309,25 @@ class Synchronizer:
         self._session = requests.Session()

         # fill the session with the correct cookies
-        self._session.get(f"{ BASE_URL }/login.php?target=&client_id=Uni_Stuttgart&cmd=force_login&lang=en")
+        login_page = self._session.get(f"{ BASE_URL }/login.php?target=&client_id=Uni_Stuttgart&cmd=force_login&lang=en")
+
+        login_page = BeautifulSoup(login_page.content, "html.parser")
+
+        login_form = login_page.find("form", attrs={"name": "formlogin"})
+
+        if login_form is None:
+            logging.error("Login failed, login form not found!")
+            exit(1)
+
+        logging.debug(login_form)
+
+        login_url = login_form.attrs.get("action")
+
+        if login_url is None:
+            logging.error("Could not find the action URL in the login form!")
+            exit(1)
+
+        logging.debug(f"Login URL: {login_url}")

         login_data = {
             "username": self.username,
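Instead of posting to a hardcoded URL, the login now scrapes the action attribute out of the formlogin form, so a changing rtoken or cmdNode no longer breaks it. The scraping step reduced to a sketch (toy HTML, not ILIAS's actual login page):

    from bs4 import BeautifulSoup

    html = '''
    <form name="formlogin" action="ilias.php?cmd=post&rtoken=abc123">
        <input name="username"><input name="password">
    </form>
    '''
    form = BeautifulSoup(html, "html.parser").find("form", attrs={"name": "formlogin"})
    assert form is not None
    print(form.attrs.get("action"))  # ilias.php?cmd=post&rtoken=abc123
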
@@ -275,7 +336,8 @@ class Synchronizer:
         }

         # do the actual login
-        self._session.post(f"{ BASE_URL }/ilias.php?lang=en&client_id=Uni_Stuttgart&cmd=post&cmdClass=ilstartupgui&cmdNode=123&baseClass=ilStartUpGUI&rtoken=", data=login_data)
+        self._session.post(f"{ BASE_URL }/{ login_url }",
+                           data=login_data)

     def synchronize(self):

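f"{ BASE_URL }/{ login_url }" assumes the scraped action is always a bare relative path. urljoin would also cover root-relative and absolute actions; a possible hardening, not what the commit does (the host below is a placeholder):

    from urllib.parse import urljoin

    page_url = "https://ilias.example.edu/login.php"
    print(urljoin(page_url, "ilias.php?cmd=post"))   # https://ilias.example.edu/ilias.php?cmd=post
    print(urljoin(page_url, "/ilias.php?cmd=post"))  # root-relative handled too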