#! /bin/bash

# random snippets for organising PDFs of exams (https://git.lorimer.id.au/textbooks/exams)
#
# Provenance: scripts.git commit 5076ca44d79d7479b36556aba83076a75269f727
#   From: Andrew Lorimer
#   Date: Sat, 28 Dec 2019 00:46:00 +0000 (+1100)
#   Subject: add vcaa-grabber.py and exam-organise.sh
#
# Renames every file in the current directory whose name starts with
# "[Methods]" into a lower-cased sub-directory named after the second word
# following the prefix, for example:
#   "[Methods] 2019 VCAA Exam 1.pdf"           -> vcaa/2019-vcaa-exam-1.pdf
#   "[Methods] 2019 VCAA Exam 1 Solutions.pdf" -> vcaa/2019-vcaa-exam-1-solutions.pdf
# Requires GNU sed: \L, \w and \s are GNU extensions.

# Used both as a glob prefix (unquoted in the for loop) and as a fragment of
# the sed regular expressions, hence the escaped brackets.
prefix="\[Methods\]"

# Scratch patterns left over from experimenting; only echoed, never applied.
#pattern='s/\[Methods\] (....) (\w+) Exam (.)( Solutions)?/\L\2\/\1-\L\2-exam-\3/p'
pattern='s/\[Methods\]/deez/p'
echo "$pattern"
for f in $prefix*; do
  # When nothing matches, the glob is left as a literal string; skip it.
  [[ -e $f ]] || continue

  if [[ $f =~ "Solutions" ]]; then
    name=$(echo "$f" | sed -E -e "s/$prefix (....) (\w+)\s+Exam (.) Solutions\.(.*)/\L\2\/\1-\L\2-exam-\3-solutions.\L\4/")
  else
    name=$(echo "$f" | sed -E -e "s/$prefix (....) (\w+) Exam (.)\.(.*)/\L\2\/\1-\L\2-exam-\3.\L\4/")
  fi

  # sed echoes its input unchanged when the pattern does not match; moving a
  # file onto itself would only produce an error, so report and move on.
  if [[ $name == "$f" ]]; then
    echo "$f: unrecognised file name, skipping" >&2
    continue
  fi

  # Create exactly the directory the new name lives in. Everything is quoted
  # because the source names contain spaces and glob characters ("[...]").
  mkdir -p -- "$(dirname -- "$name")"
  mv -n -- "$f" "$name"

  echo "$f -> $name"

done
#!/usr/bin/env python3
"""List the past exams and reports published on the VCAA website.

Usage: vcaa-grabber.py [list|details|download] [SUBJECT_OR_FACULTY ...]
"""

import argparse
from urllib.parse import urljoin

ROOT_URL = "https://vcaa.vic.edu.au"
INDEX_URL = "https://vcaa.vic.edu.au/assessment/vce-assessment/" \
        "past-examinations/Pages/Index.aspx"

# Seconds to wait on the server before a request is abandoned.
REQUEST_TIMEOUT = 30


def print_output(items, level):
    """Print a nested list of strings as a tree.

    Strings are printed at depth *level*; a nested list is printed one level
    deeper. Falsy entries (empty strings, empty lists, None) are skipped and
    runs of whitespace inside a string are collapsed to single spaces.
    """
    # Filter first so that "last item" is judged against what is actually
    # printed; otherwise a skipped entry hides the closing connector.
    visible = [item for item in items if item]
    last = len(visible) - 1
    for i, item in enumerate(visible):
        if isinstance(item, list):
            print_output(item, level + 1)
            continue
        connector = "└─" if i == last else "├─"
        print("│ " * level + connector + " ".join(item.split()))


def _fetch_tree(url):
    """Download *url* and return it parsed with BeautifulSoup.

    Raises requests.HTTPError for a 4xx/5xx response instead of parsing an
    error page as if it were the real content.
    """
    # Deferred so that the pure helpers in this file can be imported without
    # the third-party packages being installed.
    import requests
    from bs4 import BeautifulSoup

    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    return BeautifulSoup(page.text, "html.parser")


def _collect_documents(container, documents):
    """Append the documents found under *container* to *documents*.

    The result is nested as [h2 text, [h3 text, [document, ...], ...], ...].
    NOTE(review): this assumes every page has an <h2> before the first <h3>,
    link paragraph or exam table; otherwise the indexing below raises
    IndexError - confirm against the live pages.
    """
    for element in container.find_all():
        if element.name == "h2":
            documents.append(element.get_text())
            documents.append([])
        elif element.name == "h3":
            documents[-1].append(element.get_text())
            documents[-1].append([])
        elif element.name == "p" and element.find("a", recursive=False):
            # A link paragraph directly under an <h2> (no <h3> yet) still
            # needs a list to hold it.
            section = documents[-1]
            if not (isinstance(section, list) and section):
                section.append([])
            section[-1].append("".join(element.get_text().split("\n")))
        elif (element.name == "table"
                and element.get("class") == ["past-examtable"]):
            for row in element.find_all("tr"):

                if row.find_next().name == "th":
                    continue # Skip header row

                # Presumably the cells are: year, exams, reports - verify
                # against the page markup.
                columns = list(row.children)
                year = columns[0].get_text()
                for exam in columns[1].find_all("a"):
                    documents[-1][-1].append(
                            year + " " + "".join(exam.get_text().split("\n")))
                for report in columns[2].find_all("a"):
                    documents[-1][-1].append(
                            year + " "
                            + " Report (".join(report.get_text().split("(")))


def main():
    """Parse the command line, scrape the exam index and print the result."""

    # Get arguments
    parser = argparse.ArgumentParser(description="Download VCAA exams and "
            "reports from their website")
    parser.add_argument("action", nargs="?", type=str,
            choices=["list", "details", "download"])
    parser.add_argument("subjects", nargs=argparse.REMAINDER,
            type=str)
    args = parser.parse_args()

    # Parse index page
    index_tree = _fetch_tree(INDEX_URL)
    faculties = index_tree.find_all(class_="card")

    # Generate dictionary of {faculty: {subject: url}}. Some links wrap their
    # text in a <span>, in which case the name is read from the span instead.
    faculties_dict = {}
    for f in faculties:
        faculties_dict[f.div.strong.contents[0]] = \
                {s.contents[0] if len(s.contents[0]) > 2
                        else s.span.contents[0]:
                        urljoin(INDEX_URL, s.get("href"))
                for s in f.find(class_="links list-unstyled").find_all("a")}

    # Remove NHT link
    faculties_dict.pop("Northern Hemisphere Timetable", None)

    # NOTE: only "list" and "details" are handled below; "download" (and a
    # missing action) currently prints nothing.
    output = []
    # Iterate through all faculties
    for faculty, subjects in faculties_dict.items():
        # Check for subject match
        matching_subjects = set(args.subjects).intersection(subjects.keys())
        if args.action == "list":
            if matching_subjects and faculty not in args.subjects:
                output.append(faculty)
                # Sorted so the output does not depend on set ordering.
                output.append(sorted(matching_subjects))

            elif not args.subjects or faculty in args.subjects:
                output.append(faculty)
                output.append(list(subjects))

        elif args.action == "details":
            # Print the available documents for requested subjects
            if matching_subjects and faculty not in args.subjects:
                for subject in sorted(matching_subjects):
                    output.append(subject)
                    output.append([])
                    subject_tree = _fetch_tree(subjects[subject])
                    container = subject_tree.find(class_="main-section") \
                            .find(class_="container-fluid")
                    _collect_documents(container, output[-1])

    print_output(output, 0)


if __name__ == "__main__":
    main()