From: Andrew Lorimer <andrew@lorimer.id.au>
Date: Sat, 28 Dec 2019 00:46:00 +0000 (+1100)
Subject: add vcaa-grabber.py and exam-organise.sh
X-Git-Url: https://git.lorimer.id.au/scripts.git/diff_plain/5076ca44d79d7479b36556aba83076a75269f727

add vcaa-grabber.py and exam-organise.sh
---

diff --git a/exam-organise.sh b/exam-organise.sh
new file mode 100755
index 0000000..4b94384
--- /dev/null
+++ b/exam-organise.sh
@@ -0,0 +1,22 @@
+#! /bin/bash
+
+# random snippets for organising PDFs of exams (https://git.lorimer.id.au/textbooks/exams)
+# Renames "[Methods] <4 chars> <word> Exam <n>[ Solutions].<ext>" in the current
+# directory to "<word>/<4 chars>-<word>-exam-<n>[-solutions].<ext>" (all
+# lowercased), creating the <word>/ directory as needed.
+
+prefix="\[Methods\]"    # backslashes keep the brackets literal in glob and regex
+
+for f in $prefix*; do
+  [[ -e $f ]] || continue    # an unmatched glob leaves the pattern itself behind
+  if [[ $f =~ "Solutions" ]]; then
+    name=$(echo "$f" | sed -E -e "s/$prefix (....) (\w+)\s+Exam (.) Solutions\.(.*)/\L\2\/\1-\L\2-exam-\3-solutions.\L\4/")
+  else
+    name=$(echo "$f" | sed -E -e "s/$prefix (....) (\w+) Exam (.)\.(.*)/\L\2\/\1-\L\2-exam-\3.\L\4/")
+  fi
+  [[ $name != "$f" ]] || continue    # sed matched nothing; leave the file alone
+
+  mkdir -p -- "$(dirname -- "$name")"    # only the target directory, as one word
+  mv -n -- "$f" "$name"    # quoted: names the sed patterns match contain spaces
+  echo "$f -> $name"
+done
diff --git a/vcaa-grabber.py b/vcaa-grabber.py
new file mode 100755
index 0000000..dfe7791
--- /dev/null
+++ b/vcaa-grabber.py
@@ -0,0 +1,101 @@
+#! /usr/bin/python
+
+import argparse
+from bs4 import BeautifulSoup
+import requests
+from urllib.parse import urljoin
+
+ROOT_URL = "https://vcaa.vic.edu.au"    # site root; not referenced elsewhere in this script
+INDEX_URL = ("https://vcaa.vic.edu.au/assessment/vce-assessment/"
+             "past-examinations/Pages/Index.aspx")    # index page fetched by main()
+
+def print_output(items, level):
+    """Print nested lists of strings as a tree; sub-lists indent one level."""
+    shown = [x for x in items if x]    # drop empty strings and empty lists
+    for i, item in enumerate(shown):
+        if isinstance(item, list):
+            print_output(item, level+1)
+        else:
+            branch = "└─" if i == len(shown) - 1 else "├─"    # corner on the last shown item
+            print("│ "*level + branch + " ".join(item.split()))    # collapse whitespace runs
+
+
+def main():
+    """Scrape the VCAA past-examinations index and print the requested view.
+
+    Command line: ``[action] [subjects ...]``. ``list`` prints faculties with
+    their subjects; ``details`` fetches each named subject's page and prints
+    its headings, linked paragraphs and exam/report links. ``download`` is
+    accepted by the parser but has no handler here, so it prints nothing.
+    """
+    parser = argparse.ArgumentParser(description="Download VCAA exams and "
+            "reports from their website")
+    parser.add_argument("action", nargs="?", type=str,
+            choices=["list", "details", "download"])
+    parser.add_argument("subjects", nargs=argparse.REMAINDER,
+            type=str)
+    args = parser.parse_args()
+
+    # Parse index page; fail on an HTTP error instead of scraping the error page
+    index_page = requests.get(INDEX_URL)
+    index_page.raise_for_status()
+    index_tree = BeautifulSoup(index_page.text, "html.parser")
+    faculties = index_tree.find_all(class_="card")
+
+    # Map faculty -> {subject: absolute URL} (some <a> elements contain a <span>)
+    faculties_dict = {}
+    for f in faculties:
+        faculties_dict[f.div.strong.contents[0]] = \
+                {s.contents[0] if len(s.contents[0]) > 2
+                        else s.span.contents[0]: urljoin(INDEX_URL, s.get("href") )
+                        for s in f.find(class_="links list-unstyled").find_all("a")}
+
+    # Remove NHT link
+    faculties_dict.pop("Northern Hemisphere Timetable", None)
+
+    output = []
+    for faculty, subjects in faculties_dict.items():
+        # Subjects named on the command line that belong to this faculty
+        matching_subjects = set(args.subjects).intersection(subjects.keys())
+        if args.action == "list":
+            if (matching_subjects) and not (faculty in args.subjects):
+                output.append(faculty)    # was append[faculty], which raised TypeError
+                output.append(list(matching_subjects))
+            elif (not args.subjects) or (faculty in args.subjects):
+                output.append(faculty)
+                output.append(list(subjects.keys()))
+        elif args.action == "details":
+            if (matching_subjects) and not (faculty in args.subjects):
+                for subject in list(matching_subjects):
+                    output.append(subject)
+                    output.append([])    # this subject's sections go in here
+                    subject_page = requests.get(subjects[subject])
+                    subject_page.raise_for_status()
+                    subject_tree = BeautifulSoup(subject_page.text, "html.parser")
+                    for element in subject_tree.find(class_="main-section").find(class_="container-fluid").find_all():
+                        if element.name == "h2":
+                            output[-1].append(element.get_text())
+                            output[-1].append([])    # contents of this h2 section
+                        elif element.name == "h3":
+                            output[-1][-1].append(element.get_text())
+                            output[-1][-1].append([])    # entries under this h3
+                        elif element.name == "p" and element.find("a", recursive=False):
+                            if not(type(output[-1][-1]) is list and len(output[-1][-1]) > 0):
+                                output[-1][-1].append([])    # no h3 yet: open an unnamed group
+                            output[-1][-1][-1].append("".join(element.get_text().split("\n")))
+                        elif element.name == "table" and element.get("class") == ["past-examtable"]:
+                            for row in element.find_all("tr"):
+                                if row.find_next().name == "th":
+                                    continue    # Skip header row
+                                columns = list(row.children)
+                                year = columns[0].get_text()
+                                for exam in columns[1].find_all("a"):
+                                    output[-1][-1][-1].append(year + " " + "".join(exam.get_text().split("\n")))
+                                for report in columns[2].find_all("a"):
+                                    output[-1][-1][-1].append(year + " " + " Report (".join(report.get_text().split("(")))
+
+    print_output(output, 0)
+
+if __name__ == "__main__":    # run main() only when executed as a script, not on import
+    main()
+