initial commit

commit 4742e417f7
7 changed files with 345 additions and 0 deletions

ilias_sync2/__init__.py (Normal file, +1)
@@ -0,0 +1 @@
from .main import CrawlObject, Synchronizer

ilias_sync2/main.py (Normal file, +282)
@@ -0,0 +1,282 @@
import logging
import sys
from pathlib import Path
from dataclasses import dataclass, field
from typing import List, Optional, Set, Tuple
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from .utils import ContentDisposition, heading_sanitization

BASE_URL = "https://ilias3.uni-stuttgart.de"

root = logging.getLogger()
root.setLevel(logging.DEBUG)

handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
root.addHandler(handler)
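
# (this logging setup runs at import time and attaches a DEBUG-level stdout
# handler to the root logger)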


class NoIliasFilename(Exception):
    pass


class FileDownloadError(Exception):
    pass


class LoginFailed(Exception):
    pass


class RequestError(Exception):
    pass


class NoContentType(Exception):
    pass

# Propagate errors upwards.
# Page:
#     fetch the page
#     parse the page - for each link:
#         - find the name
#         - find the link
#         - create a new CrawlObject
#     handle the links recursively
# If the page is not HTML:
#     - compare the HEAD data against the file on the local system
#     - if it is missing or different, download and save it


@dataclass
class CrawlObject:

    path: Path
    url: str
    session: requests.Session
    head: Optional[requests.Response] = None
    items: List["CrawlObject"] = field(default_factory=list)
    discovered: Set[str] = field(default_factory=set)
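
    # `discovered` is shared with every child CrawlObject (see the
    # discovered=self.discovered arguments in process_items below), so each
    # URL in the crawl tree is processed at most once.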

    def request_head(self) -> requests.Response:
        return self.request(method="HEAD")

    def request_get(self) -> requests.Response:
        return self.request(method="GET")

    def request(self, method="GET") -> requests.Response:
        response = self.session.request(method=method, url=self.url)
        if response.status_code != requests.codes.ok:
            raise RequestError(f"Error getting '{ method }' for { self.url }",
                               response)

        return response

    def parse(self) -> None:
        pass
        # self.get_content

    def process(self):
        if self.head is None:
            self.head = self.request_head()
        content_type = self.head.headers.get("Content-Type", None)

        if content_type is None:
            raise NoContentType(f"No Content-Type for { self.url }")

        typestr = content_type.split(";")[0]
        logging.debug(f"Content-Type is '{ typestr }'.")
        if typestr == "text/html":
            # HTML pages are container views: parse them and recurse
            self.parse()
            self.process_items()
        else:
            # everything else is treated as a downloadable file
            self.improve_name()
            if self.update_required():
                self.save()

    def improve_name(self):
        """
        This method reads the headers and tries to improve the filename.
        """

        disposition = self.head.headers.get("Content-Disposition")
        if disposition is not None:
            disposition_obj = ContentDisposition.from_str(disposition)

            if disposition_obj.filename is not None:
                logging.info(f"Improving filename from '{ self.path.name }' to '{ disposition_obj.filename }'")
                self.path = self.path.parent / disposition_obj.filename

    def process_items(self) -> None:

        subitems = list()
        self.page_request = self.request_get()
        self.html = BeautifulSoup(self.page_request.text, "html.parser")

        content = self.html.find(id="ilContentContainer")

        if content is None:
            logging.info("no ilias content block found, nothing to do.")
        else:
            for block in content.find_all("div", class_="ilContainerBlock"):
                header = block.find("div", class_="ilContainerBlockHeader")
                heading = header.find("h2").text

                items = block.find_all("div", class_="il_ContainerListItem")
                for item in items:
                    # logging.debug(item.prettify())
                    link = item.find("a")
                    if link is None:
                        continue
                    name = link.text
                    url = link.attrs["href"]
                    url = urljoin(self.url, url)

                    if url in self.discovered:
                        logging.info(f"{ url } already discovered, skipping it")
                        continue
                    self.discovered.add(url)
                    subitems.append(CrawlObject(self.path / heading_sanitization(heading) / name,
                                                url,
                                                self.session, discovered=self.discovered))

            # four-column video download layout, as used by Introduction to Modern Cryptography
            for block in content.find_all("tr"):
                items = block.find_all("td")
                if len(items) == 4:
                    _, name, _, download = items

                    link = download.find("a").attrs["href"]
                    name = name.find("a").text + ".mp4"  # rough estimate that all files are mp4
                    url = urljoin(self.url, link)

                    path = self.path / name

                    if url in self.discovered:
                        logging.info(f"{ url } already discovered, skipping it")
                        continue
                    self.discovered.add(url)
                    subitems.append(CrawlObject(path, url, self.session, discovered=self.discovered))

            # download things from exercise sections
            exercise_div = content.find("div", class_="ilExcOverview")
            if exercise_div is not None:
                logging.debug(exercise_div.prettify())
                containers = exercise_div.find_all("div", class_="il_VAccordionInnerContainer")
                logging.info(f"found { len(containers) } containers.")
                for block in containers:
                    header = block.find("span", class_="ilAssignmentHeader")
                    if header is None:
                        logging.warning(f"Found an assignment without a header in '{ self.path }', '{ self.url }'")
                        continue
                    logging.debug(header.prettify())
                    header_text = header.text

                    forms = block.find_all("div", class_="form-group")
                    for form in forms:
                        logging.debug(form.prettify())
                        divs = form.find_all("div")
                        if len(divs) == 2:
                            name_div, link_div = divs
                        else:
                            logging.info("number of divs in form-group != 2, don't know what to do.")
                            continue

                        name = name_div.text
                        link = link_div.find("a")
                        if link is None:
                            continue
                        link = link.attrs["href"]
                        url = urljoin(self.url, link)

                        if url in self.discovered:
                            logging.info(f"{ url } already discovered, skipping it")
                            continue
                        self.discovered.add(url)
                        subitems.append(CrawlObject(self.path / header_text / name, url, self.session, discovered=self.discovered))

        # # contents for videos with chapters
        # menu_div = self.html.find("div", class_="il-mainbar-slates")
        # if menu_div is not None:
        #
        #     blocks = menu_div.find_all("li")
        #     logging.info(f"found { len(blocks) } blocks in menu")
        #     for block in blocks:
        #         logging.debug(block.prettify())
        #         a_element = block.find("a")
        #         if a_element is None:
        #             continue
        #         href = a_element.attrs["href"]
        #         logging.info(href)

        for item in subitems:
            item.process()

    def save(self) -> None:
        """
        Makes a GET request to the URL of this object and saves the
        response to the path of this object.

        If needed, the directories are created.
        """

        content = self.request_get().content

        if not self.path.parent.is_dir():
            self.path.parent.mkdir(parents=True, exist_ok=True)

        logging.info(f"Writing to file '{ self.path }'.")
        with open(self.path, "wb+") as f:
            f.write(content)

    def update_required(self) -> bool:
        """
        Tries to compare the file from the URL to the file on the filesystem.

        Returns True if the file needs an update.
        """

        # so far only existence is checked; the HEAD metadata mentioned in
        # the plan above is not yet compared
        if not self.path.is_file():
            logging.info(f"Update required for '{ self.path }' because the file does not exist.")
            return True

        logging.info(f"No update required for '{ self.path }'.")
        return False


@dataclass
class Synchronizer:
    username: str
    password: str
    directories: List[Tuple[str, str]] = field(default_factory=list)

    def login(self):

        self._session = requests.Session()

        # fill the session with the correct cookies
        self._session.get(f"{ BASE_URL }/login.php?target=&client_id=Uni_Stuttgart&cmd=force_login&lang=en")

        login_data = {
            "username": self.username,
            "password": self.password,
            "cmd[doStandardAuthentication]": "Anmelden"
        }

        # do the actual login
        self._session.post(f"{ BASE_URL }/ilias.php?lang=en&client_id=Uni_Stuttgart&cmd=post&cmdClass=ilstartupgui&cmdNode=123&baseClass=ilStartUpGUI&rtoken=", data=login_data)

    def synchronize(self):

        self.login()

        for path, url in self.directories:
            obj = CrawlObject(Path(path), url, self._session)
            obj.process()

ilias_sync2/test/__init__.py (Normal file, +0)

ilias_sync2/test/test_page_parser.py (Normal file, +5)
@@ -0,0 +1,5 @@
from unittest import TestCase


class TestGetPageContent(TestCase):

    def test_placeholder(self):
        # the original line is truncated after "def test_"; this placeholder
        # name and skip are assumptions to keep the file importable
        self.skipTest("not implemented")

ilias_sync2/utils.py (Normal file, +37)
@@ -0,0 +1,37 @@
import re
from typing import Optional
from dataclasses import dataclass


@dataclass
class ContentDisposition:

    method: str
    filename: Optional[str] = None

    @classmethod
    def from_str(cls, disposition: str):

        m = re.match(r'(?P<type>(inline)|(attachment))(;\s*(filename="(?P<filename>.+)"\s*))?', disposition)

        if m is None:
            raise ValueError(f"Error while parsing disposition string '{ disposition }'.")

        d = m.groupdict()

        return cls(d["type"], d.get("filename", None))
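
# For example (a sketch; the header values are illustrative):
#     ContentDisposition.from_str('attachment; filename="sheet01.pdf"')
#     -> ContentDisposition(method='attachment', filename='sheet01.pdf')
#     ContentDisposition.from_str("inline")
#     -> ContentDisposition(method='inline', filename=None)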


def heading_sanitization(heading: str) -> str:
    """
    Removes some parts of headings and path names.

    Currently these optimizations are done:
    - if the heading is "Inhalt" (German for "content") it is replaced by the empty string

    Otherwise the heading is returned as is.
    """

    if heading == "Inhalt":
        return ""

    return heading
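
A minimal usage sketch (not part of the commit; the username, password,
target directory, and course URL below are placeholders):

    from ilias_sync2 import Synchronizer

    sync = Synchronizer(
        username="st123456",   # placeholder account name
        password="secret",     # placeholder password
        directories=[
            # (local target directory, ILIAS URL to crawl)
            ("courses/crypto", "https://ilias3.uni-stuttgart.de/..."),
        ],
    )
    sync.synchronize()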