vcaa-grabber.py on commit improve webp-convert.sh (b8ae487)
#!/usr/bin/env python3

import argparse
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

ROOT_URL = "https://vcaa.vic.edu.au"
INDEX_URL = "https://vcaa.vic.edu.au/assessment/vce-assessment/" \
        "past-examinations/Pages/Index.aspx"

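
# main() builds `output` as a flat list that alternates a label with a nested
# list of its children, e.g. (names illustrative; the real ones are scraped
# from the VCAA site):
#     ["Mathematics", ["Foundation Mathematics", "Specialist Mathematics"]]
# print_output() walks that structure and renders it as a tree using
# box-drawing connectors.
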
def print_output(items, level):
    # Drop empty entries up front so the last-item check below is accurate
    items = [item for item in items if item]
    for i, item in enumerate(items):
        if isinstance(item, list):
            print_output(item, level + 1)
        elif i == len(items) - 1:
            # Last entry at this level gets a corner connector
            print("│ " * level + "└─" + " ".join(item.split()))
        else:
            print("│ " * level + "├─" + " ".join(item.split()))


def main():

    # Get arguments
    parser = argparse.ArgumentParser(description="Download VCAA exams and "
            "reports from their website")
    parser.add_argument("action", nargs="?", type=str,
            choices=["list", "details", "download"])
    parser.add_argument("subjects", nargs=argparse.REMAINDER,
            type=str)
    args = parser.parse_args()

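    # Note: nargs=argparse.REMAINDER leaves args.subjects as the raw remaining
    # command-line words, and matching below is by exact name, so multi-word
    # subject names need to be quoted on the command line.
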
    # Parse index page
    index_page = requests.get(INDEX_URL)
    index_tree = BeautifulSoup(index_page.text, "html.parser")
    faculties = index_tree.find_all(class_="card")

    # Generate dictionary (some <a> elements contain a <span> for some reason)
    faculties_dict = {}
    for f in faculties:
        faculties_dict[f.div.strong.contents[0]] = {
                s.contents[0] if len(s.contents[0]) > 2 else s.span.contents[0]:
                        urljoin(INDEX_URL, s.get("href"))
                for s in f.find(class_="links list-unstyled").find_all("a")}
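    # faculties_dict now maps each faculty heading to a dictionary of
    # subject name -> absolute URL of that subject's page (resolved against INDEX_URL)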

    # Remove NHT link
    faculties_dict.pop("Northern Hemisphere Timetable", None)

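    # Everything to be printed is collected in `output` first and only rendered
    # at the end by print_output()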
    output = []
    # Iterate through all faculties
    for faculty, subjects in faculties_dict.items():
        # Check for subject match
        matching_subjects = set(args.subjects).intersection(subjects.keys())
        if args.action == "list":
            if matching_subjects and faculty not in args.subjects:
                # Only specific subjects were requested
                output.append(faculty)
                output.append(list(matching_subjects))

            elif (not args.subjects) or (faculty in args.subjects):
                # No filter given, or the whole faculty was requested
                output.append(faculty)
                output.append(list(subjects.keys()))

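        # For "details", each matching subject's page is fetched and flattened
        # into the same nested-list shape: subject -> h2 section -> h3
        # sub-section -> individual exam/report entries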
        elif args.action == "details":
            # Collect the available documents for requested subjects
            if matching_subjects and faculty not in args.subjects:
                for subject in matching_subjects:
                    output.append(subject)
                    output.append([])
                    subject_page = requests.get(subjects[subject])
                    subject_tree = BeautifulSoup(subject_page.text, "html.parser")
                    for element in subject_tree.find(class_="main-section").find(class_="container-fluid").find_all():
                        if element.name == "h2":
                            # New section heading
                            output[-1].append(element.get_text())
                            output[-1].append([])
                        elif element.name == "h3":
                            # New sub-heading within the current section
                            output[-1][-1].append(element.get_text())
                            output[-1][-1].append([])
                        elif element.name == "p" and element.find("a", recursive=False):
                            # Paragraph containing a document link
                            if not (isinstance(output[-1][-1], list) and len(output[-1][-1]) > 0):
                                output[-1][-1].append([])
                            output[-1][-1][-1].append("".join(element.get_text().split("\n")))
                        elif element.name == "table" and element.get("class") == ["past-examtable"]:
                            # Table of past exams and examiner reports, one row per year
                            for row in element.find_all("tr"):

                                if row.find_next().name == "th":
                                    continue    # Skip header row

                                columns = list(row.children)
                                year = columns[0].get_text()
                                for exam in columns[1].find_all("a"):
                                    output[-1][-1][-1].append(year + " " + "".join(exam.get_text().split("\n")))
                                for report in columns[2].find_all("a"):
                                    output[-1][-1][-1].append(year + " " + " Report (".join(report.get_text().split("(")))

    print_output(output, 0)


if __name__ == "__main__":
    main()
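
# Example invocations (subject names are illustrative; arguments must match the
# headings scraped from the VCAA index page exactly):
#   ./vcaa-grabber.py list
#   ./vcaa-grabber.py details "Specialist Mathematics"
# Note that "download" is accepted by the argument parser but is not handled yet.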