-#
-# httpd.py
-#
-# Analyse Apache (httpd) server logs, including data transferred, requests,
-# clients, and errors. Note that Apache's logs can get filled up very quickly
-# with the default verbosity, leading to logparse taking a very long time to
-# analyse them. In general the default verbosity is good, but logs should be
-# cleared as soon as they are analysed (make sure 'rotate' is set to 'y').
-#
+# -*- coding: utf-8 -*-
+"""
+Analyse Apache (httpd) server logs, including data transferred, requests,
+clients, user agents, and errors. Note that Apache's logs can get filled up
+very quickly with the default verbosity, leading to logparse taking a very
+long time to analyse them. In general the default verbosity is good, but logs
+should be cleared as soon as they are analysed (make sure 'rotate' is enabled
+in the logparse config).
+"""
+
+import datetime
import re
+import time
from logparse.formatting import *
from logparse.util import readlog, resolve
from logparse import config
from logparse.load_parsers import Parser
-ACCESS_REGEX = "^\s*(\S+).*\"GET (\S+) HTTP(?:\/\d\.\d)?\" (\d{3}) (\d*) \".+\" \"(.*)\""
+IPv4_ADDR_REGEX = r"(?:\d{1,3}\.){3}\d{1,3}"
+IPv6_ADDR_REGEX = "([0-9A-Fa-f]{0,4}:){2,7}([0-9A-Fa-f]{0,4})"
+IP_ADDR_REGEX = "("+IPv4_ADDR_REGEX+"|"+IPv6_ADDR_REGEX+")"
+LOG_VARS = {
+ "%a": "(?P<client>{})?".format(IPv4_ADDR_REGEX), # client IP
+ "%A": "(?P<peer>{})?".format(IP_ADDR_REGEX), # local (peer) IP
+ "%B": "(?P<bytes>(\d+|-))", # bytes
+ "%b": "(?P<clfbytes>(\d+|\"-\"))", # bytes (CLF format)
+ "%{[^}]+?}C": "(?P<cookie>.*)", # contents of cookie
+ "%D": "(?P<serveus>-?\d+)", # time taken to serve request (μs)
+ "%{[^}]+?}e": "(?P<envvar>.*)", # environment variable contents
+ "%f": "(?P<file>.*)", # file name requested
+ "%h": "(?P<hostname>\S+)", # remote hostname or IP
+ "%H": "(?P<protocol>.*)", # request protocol
+ "%{Referer}i": "(?P<referer>.*)", # referrer
+ "%{User-Agent}i": "(?P<useragent>.*)", # user agent string
+ "%{[^}]+?}i": "(?P<header>.*)", # request header
+ "%k": "(?P<keepalive>\d*)", # number of keepalive requests
+ "%l": "(?P<logname>.*)", # remote logname
+ "%m": "(?P<method>.*)", # request method
+ "%{[^}]+?}n": "(?P<note>.*)", # notes
+ "%{[^}]+?}o": "(?P<replyheader>.*)", # reply header
+ "%p": "(?P<cport>\d*)", # canonical port on server
+ "%{[^}]+?}p": "(?P<port>\d*)", # optional port
+ "%P": "(?P<pid>\d*)", # process ID of child
+ "%{[^}]+?}P": "(?P<thread>.*)", # process or thread ID
+ "%q": "(?P<query>.*)", # query string
+ "%r": "(?P<requesthead>.*)", # first line of request
+ "%R": "(?P<handler>.*)", # handler generating response
+ "%s": "(?P<status>(\d+?|-))", # status code
+ "%t": "\[(?P<date>.*?)\]", # request date and time with offset
+ "%{[^}]+?}t": "(?P<fdate>\d+)", # request date and time ()custom format)
+ "%T": "(?P<serves>\d+)", # time taken to serve request (seconds)
+ "%{[^}]+?}T": "(?P<servec>\d+)", # time taken to serve request (custom format)
+ "%u": "(?P<user>.*)", # remote user if authenticated
+ "%U": "(?P<url>.*)", # URL path excluding query string
+ "%v": "(?P<servername>.*)", # server name
+ "%V": "(?P<servernamec>.*)", # server name (custom format)
+ "%X": "(?P<responsestatus>.?)", # status on response completion
+ "%I": "(?P<bytesreceived>\d+)", # bytes received
+ "%O": "(?P<bytessent>\d+)", # bytes sent
+ "%S": "(?P<bytestransferred>\d+)?" # total bytes transferred
+}
+LOG_ESCAPES = {
+ ">": "", # final value
+ "<": "", # initial value
+ "%%": "%" # percent escape
+}
+
+def convert_logformat(format_template):
+    """
+    Convert an Apache LogFormat string to a compiled regex pattern
+    """
+    escape_pattern = re.compile('|'.join(LOG_ESCAPES.keys()))
+    format_template = escape_pattern.sub(
+            lambda x: LOG_ESCAPES[x.group()], format_template)
+    var_pattern = re.compile('|'.join(LOG_VARS.keys()))
+    def lookup_var(match):
+        # Exact key (e.g. "%h") first, else the pattern key it matched
+        if match.group() in LOG_VARS:
+            return LOG_VARS[match.group()]
+        for key, regex in LOG_VARS.items():
+            if re.fullmatch(key, match.group()):
+                return regex
+        return match.group()
+    format_template = var_pattern.sub(lookup_var, format_template)
+    return re.compile(format_template)
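+
+# A minimal usage sketch (hypothetical values, not executed here), assuming
+# the common "combined" LogFormat is configured as httpd access-format:
+#
+#   pattern = convert_logformat(
+#           '%h %l %u %t "%r" %>s %O "%{Referer}i" "%{User-Agent}i"')
+#   fields = pattern.search(log_line).groupdict()
+#
+# For a matching log_line this yields keys such as "hostname", "date",
+# "requesthead", "status", "bytessent", "referer" and "useragent".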
+
class AccessLine(object):
+ """
+ Retrieves information from a line of the httpd access log
+ """
+
+ def __init__(self, record, datefmt, pattern):
+ """
+        Assign attributes and verify/cast those that require it. Note that the
+ `pattern` argument must be a pre-compiled regex object (to save time).
+ """
+
+ # Parse from a raw logfile string
+ self.properties = pattern.search(record).groupdict()
+ for field, value in self.properties.items():
+ if value and not (value == "-" or value == "\"-\""):
+ setattr(self, field, value)
+ else:
+ setattr(self, field, None)
+
+ # Verify data transfer metrics
+        for field, value in [x for x in self.properties.items()
+                if "bytes" in x[0]]:
+ if isinstance(value, str) and value.isdigit():
+ setattr(self, field, int(value))
+ else:
+ setattr(self, field, 0)
+
+ # Verify date
+ self.date = datetime.datetime.strptime(self.properties["date"], datefmt)
+
+ # Verify client
+ if (not hasattr(self, "client") or not self.client) \
+ and hasattr(self, "hostname") and self.hostname:
+ self.client = self.hostname
+
+ # Verify file
+ if (not hasattr(self, "file") or not self.file) and hasattr(self, "requesthead"):
+ try:
+ self.file = re.search(r"^\w+\s(.*)\s\S+$", self.requesthead).group(1)
+ except:
+ self.file = ""
+
+ def match_client(self, pattern):
+ """
+        Check whether the client (or hostname) of this record matches a regex
+        string. Return a truthy value on a match, or True if neither is set.
+ """
+ if hasattr(self, "client") and self.client:
+ return re.fullmatch(pattern, self.client)
+ elif hasattr(self, "hostname") and self.hostname:
+ return re.fullmatch(pattern, self.hostname)
+ else:
+ return True
+
+ def match_file(self, pattern):
+ """
+        Check whether the requested file of this record matches a regex
+        string. Return a truthy value on a match, or True if no file is set.
+ """
+ if hasattr(self, "file") and self.file:
+ return re.fullmatch(pattern, self.file)
+ else:
+ return True
- def __init__(self, line):
- self.line = line
- fields = re.search(ACCESS_REGEX, line)
+ def match_ref(self, pattern):
+ """
+        Check whether the referrer of this record matches a regex string.
+        Return a truthy value on a match, or True if no referrer is set.
+ """
+ if hasattr(self, "referer") and self.referer:
+ return re.fullmatch(pattern, self.referer)
+ else:
+ return True
- self.client = fields.group(1)
- self.file = fields.group(2)
- self.statuscode = int(fields.group(3))
- self.bytes = int(fields.group(4))
- self.useragent = fields.group(5)
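+
+# Illustrative sketch of how AccessLine is used in Httpd.parse_log() below
+# (the log line, format and date format are hypothetical example values):
+#
+#   pattern = convert_logformat('%h %l %u %t "%r" %>s %O')
+#   line = ('203.0.113.7 - - [10/Oct/2000:13:55:36 -0700] '
+#           '"GET /index.html HTTP/1.1" 200 2326')
+#   ac = AccessLine(line, "%d/%b/%Y:%H:%M:%S %z", pattern)
+#   # ac.client == "203.0.113.7", ac.file == "/index.html",
+#   # ac.status == "200", ac.bytessent == 2326
+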
class Httpd(Parser):
def __init__(self):
super().__init__()
self.name = "httpd"
- self.info = "Analyse Apache (httpd) server logs, including data transferred, requests, clients, and errors."
+ self.info = "Analyse Apache (httpd) server logs, including data " \
+ "transferred, requests, clients, and errors."
def parse_log(self):
logger.debug("Starting httpd section")
section = Section("httpd")
+ datefmt = config.prefs.get("httpd", "datetime-format")
+ if not datefmt:
+ datefmt = config.prefs.get("logparse", "datetime-format")
+ if not datefmt:
+ logger.error("Invalid datetime-format configuration parameter")
+ return None
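+        # This format is passed to datetime.strptime(); for Apache's default
+        # %t output (e.g. "10/Oct/2000:13:55:36 -0700") the expected value
+        # would be "%d/%b/%Y:%H:%M:%S %z" (example only).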
+
+ # Initialise patterns
+ logger.debug("Converting pattern from {0}".format(
+ config.prefs.get("httpd", "access-format")))
+ pattern = convert_logformat(config.prefs.get("httpd", "access-format"))
+ logger.debug("Compiled log format {0}".format(pattern))
+
+ logger.debug("Retrieving log data")
+
accesslog = readlog(config.prefs.get("logs", "httpd-access"))
errorlog= readlog(config.prefs.get("logs", "httpd-error"))
total_errors = len(errorlog.splitlines())
- logger.debug("Retrieved log data")
-
- logger.debug("Searching through access log")
+ logger.debug("Parsing access logs")
accesses = []
for line in accesslog.splitlines():
- if "GET" in line:
- accesses.append(AccessLine(line))
+ if not "GET" in line:
+ continue
+ try:
+ ac_obj = AccessLine(line, datefmt, pattern)
+ except Exception as e:
+ logger.warning("Malformed access log: {0}. "
+ "{1}: {2}".format(line, type(e).__name__, e))
+ else:
+ if not section.period.compare(ac_obj.date):
+ continue
+
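+                # The "clients", "files" and "referrers" prefs are treated
+                # as regular expressions and matched in full against each
+                # record; an unmatched record is dropped below.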
+ checks = [
+ ac_obj.match_client(
+ config.prefs.get("httpd", "clients")),
+ ac_obj.match_file(
+ config.prefs.get("httpd", "files")),
+ ac_obj.match_ref(
+ config.prefs.get("httpd", "referrers"))
+ ]
+ if not all(checks):
+ logger.debug("Ignoring access log due to config: " + line)
+ continue
+ accesses.append(ac_obj)
+
+ logger.debug("Processed {0} access logs".format(len(accesses)))
total_requests = len(accesses)
- section.append_data(Data("Total of " + plural("request", total_requests)))
+ section.append_data(Data("Total of "
+ + plural("request", total_requests)))
section.append_data(Data(plural("error", total_errors)))
+ logger.debug("Parsing total size")
+
size = Data()
- size.subtitle = "Transferred " + parsesize(sum([ac.bytes for ac in accesses]))
+ size.subtitle = "Transferred " \
+ + parsesize(sum([ac.bytessent for ac in accesses]))
section.append_data(size)
- clients = Data()
- clients.items = [resolve(ac.client, config.prefs.get("httpd", "httpd-resolve-domains")) for ac in accesses]
- clients.orderbyfreq()
- clients.subtitle = "Received requests from " + plural("client", len(clients.items))
- clients.truncl(config.prefs.getint("logparse", "maxlist"))
- section.append_data(clients)
+ logger.debug("Parsing clients")
+
+# clients = Data()
+# clients.items = [resolve(ac.hostname,
+# config.prefs.get("httpd", "httpd-resolve-domains"))
+# for ac in accesses]
+# clients.orderbyfreq()
+# clients.subtitle = "Received requests from " \
+# + plural("client", len(clients.items))
+# clients.truncl(config.prefs.getint("logparse", "maxlist"))
+# section.append_data(clients)
+
+ logger.debug("Parsing files")
files = Data()
- files.items = [ac.file for ac in accesses]
+        files.items = [ac.file for ac in accesses
+                if getattr(ac, "file", None)]
files.orderbyfreq()
files.subtitle = plural("file", len(files.items)) + " requested"
files.truncl(config.prefs.getint("logparse", "maxlist"))
section.append_data(files)
+ logger.debug("Parsing user agents")
+
useragents = Data()
useragents.items = [ac.useragent for ac in accesses]
useragents.orderbyfreq()
useragents.truncl(config.prefs.getint("logparse", "maxlist"))
section.append_data(useragents)
- logger.info("httpd has received " + str(total_requests) + " requests with " + str(total_errors) + " errors")
-
+ logger.info("httpd has received " + str(total_requests)
+ + " requests with " + str(total_errors) + " errors")
logger.info("Finished httpd section")
return section