+#!/usr/bin/env python3
+
+import argparse
+from urllib.parse import urljoin
+
+import requests
+from bs4 import BeautifulSoup
+
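+# Site root and the index page that links every faculty card to its
+# per-subject past-examination pages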
+ROOT_URL = "https://vcaa.vic.edu.au"
+INDEX_URL = "https://vcaa.vic.edu.au/assessment/vce-assessment/" \
+            "past-examinations/Pages/Index.aspx"
+
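+# Renders a nested list as a tree; empty entries are skipped. For example,
+# print_output(["A", ["B", "C"]], 0) prints:
+#   ├─A
+#   │ ├─B
+#   │ └─C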
+def print_output(items, level):
+    # Materialise the filtered list first so the last-item check below
+    # compares against the right length (filter() would desynchronise it)
+    items = [item for item in items if item]
+    for i, item in enumerate(items):
+        if isinstance(item, list):
+            print_output(item, level + 1)
+        elif i == len(items) - 1:
+            print("│ " * level + "└─" + " ".join(item.split()))
+        else:
+            print("│ " * level + "├─" + " ".join(item.split()))
+
+
+def main():
+
+    # Get arguments
+    parser = argparse.ArgumentParser(description="Download VCAA exams and "
+                                                 "reports from their website")
+    parser.add_argument("action", nargs="?", type=str,
+                        choices=["list", "details", "download"])
+    parser.add_argument("subjects", nargs=argparse.REMAINDER,
+                        type=str)
+    args = parser.parse_args()
+
+    # Parse index page
+    index_page = requests.get(INDEX_URL)
+    index_tree = BeautifulSoup(index_page.text, "html.parser")
+    faculties = index_tree.find_all(class_="card")
+
+    # Map each faculty name to a {subject: url} dict. get_text() flattens
+    # the stray <span> that wraps some <a> labels.
+    faculties_dict = {}
+    for f in faculties:
+        subject_links = f.find(class_="links list-unstyled").find_all("a")
+        faculties_dict[f.div.strong.get_text(strip=True)] = {
+            s.get_text(strip=True): urljoin(INDEX_URL, s.get("href"))
+            for s in subject_links}
+
+    # Remove the Northern Hemisphere Timetable (NHT) link
+    faculties_dict.pop("Northern Hemisphere Timetable", None)
+
+    output = []
+    # Iterate through all faculties
+    for faculty, subjects in faculties_dict.items():
+        # Check for subject match
+        matching_subjects = set(args.subjects).intersection(subjects.keys())
+        if args.action == "list":
+            if matching_subjects and faculty not in args.subjects:
+                output.append(faculty)
+                output.append(sorted(matching_subjects))
+            elif (not args.subjects) or (faculty in args.subjects):
+                output.append(faculty)
+                output.append(list(subjects.keys()))
+
+        elif args.action == "details":
+            # Print the available documents for requested subjects
+            if matching_subjects and faculty not in args.subjects:
+                for subject in sorted(matching_subjects):
+                    output.append(subject)
+                    output.append([])
+                    subject_page = requests.get(subjects[subject])
+                    subject_tree = BeautifulSoup(subject_page.text,
+                                                 "html.parser")
+                    content = (subject_tree.find(class_="main-section")
+                               .find(class_="container-fluid"))
+                    # The page is a flat run of h2/h3 headings, link
+                    # paragraphs and exam tables (each page opens with an
+                    # <h2>); rebuild that hierarchy as nested lists
+                    for element in content.find_all():
+                        if element.name == "h2":
+                            output[-1].append(element.get_text())
+                            output[-1].append([])
+                        elif element.name == "h3":
+                            output[-1][-1].append(element.get_text())
+                            output[-1][-1].append([])
+                        elif element.name == "p" and element.find("a", recursive=False):
+                            # Open a document list under the current heading
+                            # if one has not been started yet
+                            if not output[-1][-1]:
+                                output[-1][-1].append([])
+                            output[-1][-1][-1].append(
+                                " ".join(element.get_text().split()))
+                        elif (element.name == "table"
+                                and "past-examtable" in element.get("class", [])):
+                            if not output[-1][-1]:
+                                output[-1][-1].append([])
+                            for row in element.find_all("tr"):
+                                if row.find("th"):
+                                    continue  # Skip the header row
+                                # find_all("td") skips the whitespace text
+                                # nodes that list(row.children) would include
+                                columns = row.find_all("td")
+                                year = columns[0].get_text()
+                                for exam in columns[1].find_all("a"):
+                                    output[-1][-1][-1].append(
+                                        year + " " + " ".join(exam.get_text().split()))
+                                for report in columns[2].find_all("a"):
+                                    # Insert "Report" before the size suffix,
+                                    # e.g. "Exam (pdf" -> "Exam Report (pdf"
+                                    output[-1][-1][-1].append(
+                                        year + " " + " Report (".join(report.get_text().split("(")))
+
+    print_output(output, 0)
+
+if __name__ == "__main__":
+    main()
+
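+# Example invocations (the file name vcaa.py is illustrative; subject and
+# faculty arguments must match the labels used on the VCAA index page):
+#   python3 vcaa.py list
+#   python3 vcaa.py list "Mathematics"
+#   python3 vcaa.py details "Biology"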