#!/usr/bin/env python3
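"""Browse the VCAA past-examinations index: list faculties and subjects,
or show the exam papers and assessment reports published for each subject."""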

import argparse
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

ROOT_URL = "https://vcaa.vic.edu.au"
INDEX_URL = ("https://vcaa.vic.edu.au/assessment/vce-assessment/"
             "past-examinations/Pages/Index.aspx")


def print_output(items, level):
    """Recursively pretty-print nested lists as a box-drawing tree."""
    # Materialise the filter so len(items) agrees with the enumeration
    items = [item for item in items if item]
    for i, item in enumerate(items):
        if isinstance(item, list):
            print_output(item, level + 1)
        elif i == len(items) - 1:
            print("│ " * level + "└─" + " ".join(item.split()))
        else:
            print("│ " * level + "├─" + " ".join(item.split()))

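# Illustrative only (made-up names):
#   print_output(["English", ["Exams", ["2023 Exam 1"]]], 0)
# renders:
#   ├─English
#   │ ├─Exams
#   │ │ └─2023 Exam 1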


def main():
    # Get arguments
    parser = argparse.ArgumentParser(description="Download VCAA exams and "
                                                 "reports from their website")
    parser.add_argument("action", nargs="?", type=str,
                        # only "list" and "details" are handled below
                        choices=["list", "details", "download"])
    parser.add_argument("subjects", nargs=argparse.REMAINDER, type=str)
    args = parser.parse_args()

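    # Typical invocations (script name and subject names are illustrative):
    #   ./script.py list               -> every faculty and its subjects
    #   ./script.py list Mathematics   -> one faculty, or matching subjects
    #   ./script.py details English    -> documents available for a subject
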
    # Parse index page
    index_page = requests.get(INDEX_URL)
    index_page.raise_for_status()  # fail early on HTTP errors
    index_tree = BeautifulSoup(index_page.text, "html.parser")
    faculties = index_tree.find_all(class_="card")

    # Build {faculty: {subject: absolute URL}}.  get_text() copes with the
    # <a> elements that wrap their label in a <span> for some reason.
    faculties_dict = {}
    for f in faculties:
        links = f.find(class_="links list-unstyled").find_all("a")
        faculties_dict[f.div.strong.get_text(strip=True)] = {
            s.get_text(strip=True): urljoin(INDEX_URL, s.get("href"))
            for s in links}

    # Remove NHT link
    faculties_dict.pop("Northern Hemisphere Timetable", None)

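    # Resulting shape (faculty/subject names illustrative):
    #   {"The Arts": {"Art": "https://vcaa.vic.edu.au/...", ...}, ...}
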
    output = []
    # Iterate through all faculties
    for faculty, subjects in faculties_dict.items():
        # Check for subject match
        matching_subjects = set(args.subjects).intersection(subjects)
        if args.action == "list":
            if matching_subjects and faculty not in args.subjects:
                output.append(faculty)
                output.append(sorted(matching_subjects))  # sets are unordered
            elif not args.subjects or faculty in args.subjects:
                output.append(faculty)
                output.append(list(subjects))

        elif args.action == "details":
            # Print the available documents for requested subjects
            if matching_subjects and faculty not in args.subjects:
                for subject in sorted(matching_subjects):
                    output.append(subject)
                    output.append([])
                    subject_page = requests.get(subjects[subject])
                    subject_page.raise_for_status()
                    subject_tree = BeautifulSoup(subject_page.text,
                                                 "html.parser")
                    # Walk the page body: each <h2> starts a section, each
                    # <h3> a subsection, and link-bearing <p> elements and
                    # exam tables supply the leaf entries.
                    body = (subject_tree.find(class_="main-section")
                            .find(class_="container-fluid"))
                    for element in body.find_all():
                        if element.name == "h2":
                            output[-1].append(element.get_text())
                            output[-1].append([])
                        elif element.name == "h3":
                            output[-1][-1].append(element.get_text())
                            output[-1][-1].append([])
                        elif element.name == "p" and element.find("a", recursive=False):
                            # Ensure there is a leaf list to append to
                            if not (isinstance(output[-1][-1], list)
                                    and output[-1][-1]):
                                output[-1][-1].append([])
                            output[-1][-1][-1].append(
                                element.get_text().replace("\n", ""))
                        elif (element.name == "table"
                              and element.get("class") == ["past-examtable"]):
                            for row in element.find_all("tr"):
                                if row.find("th"):
                                    continue  # Skip header row
                                # Cells are Year | Exam | Report
                                columns = row.find_all("td")
                                year = columns[0].get_text()
                                for exam in columns[1].find_all("a"):
                                    output[-1][-1][-1].append(
                                        year + " "
                                        + exam.get_text().replace("\n", ""))
                                for report in columns[2].find_all("a"):
                                    # Splice " Report" into the link text
                                    # before its first "(", typically a
                                    # file-size suffix
                                    output[-1][-1][-1].append(
                                        year + " " + " Report (".join(
                                            report.get_text().split("(")))

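    # output is now a nested list, one sub-list per heading level, e.g.
    # (illustrative): ["English", ["Exams", ["Written", ["2023 Exam 1"]]]];
    # print_output indents one tree level per nesting depth.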
    print_output(output, 0)


if __name__ == "__main__":
    main()