# -*- coding: utf-8 -*-

"""
Analyse Apache (httpd) server logs, including data transferred, requests,
clients, user agents, and errors.

Note that Apache's logs can fill up very quickly at the default verbosity,
which can make logparse take a very long time to analyse them. The default
verbosity is generally fine, but logs should be cleared as soon as they have
been analysed (make sure 'rotate' is enabled in the logparse config).
"""

import datetime
import logging
import re
import time

from logparse.formatting import *
from logparse.util import readlog, resolve
from logparse import config
from logparse.load_parsers import Parser

# Module-level logger used throughout this parser
logger = logging.getLogger(__name__)

IPv4_ADDR_REGEX = r"(?:\d{1,3}\.){3}\d{1,3}"
IPv6_ADDR_REGEX = r"([0-9A-Fa-f]{0,4}:){2,7}([0-9A-Fa-f]{0,4})"
IP_ADDR_REGEX = "(" + IPv4_ADDR_REGEX + "|" + IPv6_ADDR_REGEX + ")"

# Map Apache LogFormat directives to named regex capture groups
LOG_VARS = {
    "%a": "(?P<client>{})?".format(IPv4_ADDR_REGEX),    # client IP
    "%A": "(?P<peer>{})?".format(IP_ADDR_REGEX),        # local (peer) IP
    "%B": r"(?P<bytes>(\d+|-))",                # bytes
    "%b": r"(?P<bytes_clf>(\d+|\"-\"))",        # bytes (CLF format)
    "%{[^}]+?}C": "(?P<cookie>.*)",             # contents of cookie
    "%D": r"(?P<servetime_us>-?\d+)",           # time taken to serve request (μs)
    "%{[^}]+?}e": "(?P<envvar>.*)",             # environment variable contents
    "%f": "(?P<file>.*)",                       # file name requested
    "%h": r"(?P<hostname>\S+)",                 # remote hostname or IP
    "%H": "(?P<protocol>.*)",                   # request protocol
    "%{Referer}i": "(?P<referer>.*)",           # referrer
    "%{User-Agent}i": "(?P<useragent>.*)",      # user agent string
    "%{[^}]+?}i": "(?P<header>.*)",             # request header
    "%k": r"(?P<keepalive>\d*)",                # number of keepalive requests
    "%l": "(?P<logname>.*)",                    # remote logname
    "%m": "(?P<method>.*)",                     # request method
    "%{[^}]+?}n": "(?P<note>.*)",               # notes
    "%{[^}]+?}o": "(?P<replyheader>.*)",        # reply header
    "%p": r"(?P<port>\d*)",                     # canonical port on server
    "%{[^}]+?}p": r"(?P<port_custom>\d*)",      # optional port
    "%P": r"(?P<pid>\d*)",                      # process ID of child
    "%{[^}]+?}P": "(?P<thread>.*)",             # process or thread ID
    "%q": "(?P<query>.*)",                      # query string
    "%r": "(?P<requesthead>.*)",                # first line of request
    "%R": "(?P<handler>.*)",                    # handler generating response
    "%s": r"(?P<status>(\d+?|-))",              # status code
    "%t": r"\[(?P<date>.*?)\]",                 # request date and time with offset
    "%{[^}]+?}t": r"(?P<date_custom>\d+)",      # request date and time (custom format)
    "%T": r"(?P<servetime_s>\d+)",              # time taken to serve request (seconds)
    "%{[^}]+?}T": r"(?P<servetime_custom>\d+)", # time taken to serve request (custom format)
    "%u": "(?P<user>.*)",                       # remote user if authenticated
    "%U": "(?P<urlpath>.*)",                    # URL path excluding query string
    "%v": "(?P<servername>.*)",                 # server name
    "%V": "(?P<servername_canonical>.*)",       # server name (per UseCanonicalName)
    "%X": "(?P<connstatus>.?)",                 # status on response completion
    "%I": r"(?P<bytesrcv>\d+)",                 # bytes received
    "%O": r"(?P<bytessent>\d+)",                # bytes sent
    "%S": r"(?P<bytes_total>\d+)?"              # total bytes transferred
}

LOG_ESCAPES = {
    ">": "",    # final value
    "<": "",    # initial value
    "%%": "%"   # percent escape
}


def convert_logformat(format_template):
    """
    Convert an Apache LogFormat string to a compiled regex pattern.
    """
    # Strip modifier escapes first (e.g. %>s becomes %s), then substitute
    # each LogFormat directive with its named capture group
    escape_pattern = re.compile('|'.join(LOG_ESCAPES.keys()))
    format_template = escape_pattern.sub(
            lambda x: LOG_ESCAPES[x.group()], format_template)
    var_pattern = re.compile('|'.join(LOG_VARS.keys()))
    format_template = var_pattern.sub(
            lambda x: LOG_VARS[x.group()], format_template)
    return re.compile(format_template)

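# Illustrative sketch only (left as a comment so nothing executes at import
# time): converting a combined-style LogFormat and matching a made-up log
# line. The format uses %O for bytes sent, since parse_log() below sums the
# "bytessent" field; the sample address, path, and user agent below are
# hypothetical.
#
#   fmt = '%h %l %u %t "%r" %>s %O "%{Referer}i" "%{User-Agent}i"'
#   pattern = convert_logformat(fmt)
#   sample = ('203.0.113.5 - - [10/Oct/2020:13:55:36 +0000] '
#             '"GET /index.html HTTP/1.1" 200 2326 "-" "curl/7.68.0"')
#   fields = pattern.search(sample).groupdict()
#   # fields now maps group names ("hostname", "date", "requesthead",
#   # "status", "bytessent", "referer", "useragent", ...) to the matching
#   # substrings of the sample line.
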
.*)", # request header "%k": "(?P\d*)", # number of keepalive requests "%l": "(?P.*)", # remote logname "%m": "(?P.*)", # request method "%{[^}]+?}n": "(?P.*)", # notes "%{[^}]+?}o": "(?P.*)", # reply header "%p": "(?P\d*)", # canonical port on server "%{[^}]+?}p": "(?P\d*)", # optional port "%P": "(?P\d*)", # process ID of child "%{[^}]+?}P": "(?P.*)", # process or thread ID "%q": "(?P.*)", # query string "%r": "(?P.*)", # first line of request "%R": "(?P.*)", # handler generating response "%s": "(?P(\d+?|-))", # status code "%t": "\[(?P.*?)\]", # request date and time with offset "%{[^}]+?}t": "(?P\d+)", # request date and time ()custom format) "%T": "(?P\d+)", # time taken to serve request (seconds) "%{[^}]+?}T": "(?P\d+)", # time taken to serve request (custom format) "%u": "(?P.*)", # remote user if authenticated "%U": "(?P.*)", # URL path excluding query string "%v": "(?P.*)", # server name "%V": "(?P.*)", # server name (custom format) "%X": "(?P.?)", # status on response completion "%I": "(?P\d+)", # bytes received "%O": "(?P\d+)", # bytes sent "%S": "(?P\d+)?" # total bytes transferred } LOG_ESCAPES = { ">": "", # final value "<": "", # initial value "%%": "%" # percent escape } def convert_logformat(format_template): """ Convert an Apache LogFormat string to a regex pattern """ escape_pattern = re.compile('|'.join(LOG_ESCAPES.keys())) format_template = escape_pattern.sub(lambda x: LOG_ESCAPES[x.group()], format_template) var_pattern = re.compile('|'.join(LOG_VARS.keys())) format_template = var_pattern.sub(lambda x: LOG_VARS[x.group()], format_template) return re.compile(format_template) class AccessLine(object): """ Retrieves information from a line of the httpd access log """ def __init__(self, record, datefmt, pattern): """ Assign attributes and verify/cast those than require it. Note that the `pattern` argument must be a pre-compiled regex object (to save time). """ # Parse from a raw logfile string self.properties = pattern.search(record).groupdict() for field, value in self.properties.items(): if value and not (value == "-" or value == "\"-\""): setattr(self, field, value) else: setattr(self, field, None) # Verify data transfer metrics for field, value in [x for x in self.properties.items() if "bytes" in x[0]]: if isinstance(value, str) and value.isdigit(): setattr(self, field, int(value)) else: setattr(self, field, 0) # Verify date self.date = datetime.datetime.strptime(self.properties["date"], datefmt) # Verify client if (not hasattr(self, "client") or not self.client) \ and hasattr(self, "hostname") and self.hostname: self.client = self.hostname # Verify file if (not hasattr(self, "file") or not self.file) and hasattr(self, "requesthead"): try: self.file = re.search(r"^\w+\s(.*)\s\S+$", self.requesthead).group(1) except: self.file = "" def match_client(self, pattern): """ Check if the client of this object matches against a regex string and return a boolean result of this comparison. """ if hasattr(self, "client") and self.client: return re.fullmatch(pattern, self.client) elif hasattr(self, "hostname") and self.hostname: return re.fullmatch(pattern, self.hostname) else: return True def match_file(self, pattern): """ Check if the target of this object matches against a regex string and return a boolean result of this comparison. 
""" if hasattr(self, "file") and self.file: return re.fullmatch(pattern, self.file) else: return True def match_ref(self, pattern): """ Check if the referrer of this object matches against a regex string and return a boolean result of this comparison. """ if hasattr(self, "referer") and self.referer: return re.fullmatch(pattern, self.referer) else: return True class Httpd(Parser): def __init__(self): super().__init__() self.name = "httpd" self.info = "Analyse Apache (httpd) server logs, including data " \ "transferred, requests, clients, and errors." def parse_log(self): logger.debug("Starting httpd section") section = Section("httpd") datefmt = config.prefs.get("httpd", "datetime-format") if not datefmt: datefmt = config.prefs.get("logparse", "datetime-format") if not datefmt: logger.error("Invalid datetime-format configuration parameter") return None # Initialise patterns logger.debug("Converting pattern from {0}".format( config.prefs.get("httpd", "access-format"))) pattern = convert_logformat(config.prefs.get("httpd", "access-format")) logger.debug("Compiled log format {0}".format(pattern)) logger.debug("Retrieving log data") accesslog = readlog(config.prefs.get("logs", "httpd-access")) errorlog= readlog(config.prefs.get("logs", "httpd-error")) total_errors = len(errorlog.splitlines()) logger.debug("Parsing access logs") accesses = [] for line in accesslog.splitlines(): if not "GET" in line: continue try: ac_obj = AccessLine(line, datefmt, pattern) except Exception as e: logger.warning("Malformed access log: {0}. " "{1}: {2}".format(line, type(e).__name__, e)) else: if not section.period.compare(ac_obj.date): continue checks = [ ac_obj.match_client( config.prefs.get("httpd", "clients")), ac_obj.match_file( config.prefs.get("httpd", "files")), ac_obj.match_ref( config.prefs.get("httpd", "referrers")) ] if not all(checks): logger.debug("Ignoring access log due to config: " + line) continue accesses.append(ac_obj) logger.debug("Processed {0} access logs".format(len(accesses))) total_requests = len(accesses) section.append_data(Data("Total of " + plural("request", total_requests))) section.append_data(Data(plural("error", total_errors))) logger.debug("Parsing total size") size = Data() size.subtitle = "Transferred " \ + parsesize(sum([ac.bytessent for ac in accesses])) section.append_data(size) logger.debug("Parsing clients") # clients = Data() # clients.items = [resolve(ac.hostname, # config.prefs.get("httpd", "httpd-resolve-domains")) # for ac in accesses] # clients.orderbyfreq() # clients.subtitle = "Received requests from " \ # + plural("client", len(clients.items)) # clients.truncl(config.prefs.getint("logparse", "maxlist")) # section.append_data(clients) logger.debug("Parsing files") files = Data() files.items = [ac.file for ac in accesses if hasattr(ac, "file")] files.orderbyfreq() files.subtitle = plural("file", len(files.items)) + " requested" files.truncl(config.prefs.getint("logparse", "maxlist")) section.append_data(files) logger.debug("Parsing user agents") useragents = Data() useragents.items = [ac.useragent for ac in accesses] useragents.orderbyfreq() useragents.subtitle = plural("user agent", len(useragents.items)) useragents.truncl(config.prefs.getint("logparse", "maxlist")) section.append_data(useragents) logger.info("httpd has received " + str(total_requests) + " requests with " + str(total_errors) + " errors") logger.info("Finished httpd section") return section