#!/usr/bin/env python3

import argparse

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

ROOT_URL = "https://vcaa.vic.edu.au"
INDEX_URL = "https://vcaa.vic.edu.au/assessment/vce-assessment/" \
            "past-examinations/Pages/Index.aspx"


def print_output(items, level):
    # Drop empty strings/lists first so they don't break last-item detection
    filtered = [item for item in items if item]
    for i, item in enumerate(filtered):
        if isinstance(item, list):
            print_output(item, level + 1)
        else:
            # Use └─ for the last entry at this level, ├─ otherwise
            branch = "└─" if i == len(filtered) - 1 else "├─"
            print("│ " * level + branch + " ".join(item.split()))


def main():
    # Get arguments ("download" is accepted but not yet handled below)
    parser = argparse.ArgumentParser(description="Download VCAA exams and "
                                     "reports from their website")
    parser.add_argument("action", nargs="?", type=str,
                        choices=["list", "details", "download"])
    parser.add_argument("subjects", nargs=argparse.REMAINDER, type=str)
    args = parser.parse_args()

    # Parse index page
    index_page = requests.get(INDEX_URL)
    index_tree = BeautifulSoup(index_page.text, "html.parser")
    faculties = index_tree.find_all(class_="card")

    # Build {faculty: {subject: url}}. Some subject links wrap their text in
    # a <span> for some reason, so fall back to the span's contents when the
    # anchor's first child isn't usable text.
    faculties_dict = {}
    for f in faculties:
        faculties_dict[f.div.strong.contents[0]] = {
            (s.contents[0] if len(s.contents[0]) > 2 else s.span.contents[0]):
                urljoin(INDEX_URL, s.get("href"))
            for s in f.find(class_="links list-unstyled").find_all("a")
        }

    # Remove NHT link
    faculties_dict.pop("Northern Hemisphere Timetable", None)

    output = []
    # Iterate through all faculties
    for faculty, subjects in faculties_dict.items():
        # Check for subject match
        matching_subjects = set(args.subjects).intersection(subjects.keys())

        if args.action == "list":
            if matching_subjects and faculty not in args.subjects:
                output.append(faculty)
                output.append(sorted(matching_subjects))
            elif (not args.subjects) or (faculty in args.subjects):
                output.append(faculty)
                output.append(list(subjects.keys()))

        elif args.action == "details":
            # Print the available documents for requested subjects.
            # output nests as [subject, [h2, [h3, [document, ...]]]].
            if matching_subjects and faculty not in args.subjects:
                for subject in matching_subjects:
                    output.append(subject)
                    output.append([])
                    subject_page = requests.get(subjects[subject])
                    subject_tree = BeautifulSoup(subject_page.text,
                                                 "html.parser")
                    content = subject_tree.find(class_="main-section") \
                                          .find(class_="container-fluid")
                    for element in content.find_all():
                        if element.name == "h2":
                            # New document group
                            output[-1].append(element.get_text())
                            output[-1].append([])
                        elif element.name == "h3":
                            # New sub-group under the current h2
                            output[-1][-1].append(element.get_text())
                            output[-1][-1].append([])
                        elif element.name == "p" and \
                                element.find("a", recursive=False):
                            # Document link; make sure a leaf list exists
                            if not (isinstance(output[-1][-1], list)
                                    and len(output[-1][-1]) > 0):
                                output[-1][-1].append([])
                            output[-1][-1][-1].append(
                                "".join(element.get_text().split("\n")))
                        elif element.name == "table" and \
                                element.get("class") == ["past-examtable"]:
                            for row in element.find_all("tr"):
                                if row.find_next().name == "th":
                                    continue  # Skip header row
                                columns = list(row.children)
                                year = columns[0].get_text()
                                for exam in columns[1].find_all("a"):
                                    output[-1][-1][-1].append(
                                        year + " " +
                                        "".join(exam.get_text().split("\n")))
                                for report in columns[2].find_all("a"):
                                    # Splice "Report" in before the bracketed
                                    # file info in the link text
                                    output[-1][-1][-1].append(
                                        year + " " + " Report (".join(
                                            report.get_text().split("(")))

    print_output(output, 0)


if __name__ == "__main__":
    main()
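
# Example invocations, assuming the script is saved as vcaa.py (the file name
# and the quoted faculty/subject names below are illustrative; subject names
# must match the headings on the VCAA index page exactly, and multi-word
# names need quoting since "subjects" is parsed as separate arguments):
#
#   python vcaa.py list                              # every faculty and its subjects
#   python vcaa.py list "Mathematics"                # subjects of one faculty
#   python vcaa.py details "Specialist Mathematics"  # documents for one subject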