rename parsers, better journald integration
[logparse.git] / logparse / parsers / httpd.py
index 7175ea8326fd582fd06b544478bb37950ac13a9e..9a20cc3fee605fb3c1fc2a7e93b0ee7fcd1a53e9 100644 (file)
-#
-#   httpd.py
-#   
-#   Analyse Apache (httpd) server logs, including data transferred, requests,
-#   clients, and errors. Note that Apache's logs can get filled up very quickly
-#   with the default verbosity, leading to logparse taking a very long time to
-#   analyse them. In general the default verbosity is good, but logs should be
-#   cleared as soon as they are analysed (make sure 'rotate' is set to 'y'). 
-#
+# -*- coding: utf-8 -*-
 
+"""
+Analyse Apache (httpd) server logs, including data transferred, requests,
+clients, user agents, and errors. Note that Apache's logs can get filled up 
+very quickly with the default verbosity, leading to logparse taking a very 
+long time to analyse them. In general the default verbosity is good, but logs 
+should be cleared as soon as they are analysed (make sure 'rotate' is enabled
+in the logparse config).
+"""
+
+import datetime
 import re
+import time
+
+from logparse.formatting import *
+from logparse.util import readlog, resolve
+from logparse import config
+from logparse.load_parsers import Parser
+
+# Building blocks for IP address matching. All regex fragments containing
+# backslashes are raw strings so that sequences such as "\d" reach the regex
+# engine untouched instead of being treated as (invalid) string escapes.
+IPv4_ADDR_REGEX = r'(?:\d{1,3}\.){3}\d{1,3}'
+IPv6_ADDR_REGEX = "([0-9A-Fa-f]{0,4}:){2,7}([0-9A-Fa-f]{0,4})"
+IP_ADDR_REGEX = "("+IPv4_ADDR_REGEX+"|"+IPv6_ADDR_REGEX+")"
+
+# Maps Apache LogFormat directives to the regex fragment (with a named group)
+# used to capture them. Keys are themselves regex patterns: they are joined
+# with "|" to locate directives in a LogFormat string, so the parameterised
+# entries (e.g. "%{[^}]+?}i") match any "%{...}i" directive. Order matters:
+# specific entries such as "%{Referer}i" must precede the generic "%{...}i".
+# NOTE(review): "%a" only accepts IPv4 whereas "%A" accepts IPv4 or IPv6 -
+# confirm whether IPv6 clients should be captured too.
+LOG_VARS = {
+        "%a": "(?P<client>{})?".format(IPv4_ADDR_REGEX),    # client IP
+        "%A": "(?P<peer>{})?".format(IP_ADDR_REGEX),        # local (peer) IP
+        "%B": r"(?P<bytes>(\d+|-))",                        # bytes
+        "%b": r'(?P<clfbytes>(\d+|"-"))',                   # bytes (CLF format)
+        "%{[^}]+?}C": "(?P<cookie>.*)",                     # contents of cookie
+        "%D": r"(?P<serveus>-?\d+)",                        # time taken to serve request (μs)
+        "%{[^}]+?}e": "(?P<envvar>.*)",                     # environment variable contents
+        "%f": "(?P<file>.*)",                               # file name requested
+        "%h": r"(?P<hostname>\S+)",                         # remote hostname or IP
+        "%H": "(?P<protocol>.*)",                           # request protocol
+        "%{Referer}i": "(?P<referer>.*)",                   # referrer
+        "%{User-Agent}i": "(?P<useragent>.*)",              # user agent string
+        "%{[^}]+?}i": "(?P<header>.*)",                     # request header
+        "%k": r"(?P<keepalive>\d*)",                        # number of keepalive requests
+        "%l": "(?P<logname>.*)",                            # remote logname
+        "%m": "(?P<method>.*)",                             # request method
+        "%{[^}]+?}n": "(?P<note>.*)",                       # notes
+        "%{[^}]+?}o": "(?P<replyheader>.*)",                # reply header
+        "%p": r"(?P<cport>\d*)",                            # canonical port on server
+        "%{[^}]+?}p": r"(?P<port>\d*)",                     # optional port
+        "%P": r"(?P<pid>\d*)",                              # process ID of child
+        "%{[^}]+?}P": "(?P<thread>.*)",                     # process or thread ID
+        "%q": "(?P<query>.*)",                              # query string
+        "%r": "(?P<requesthead>.*)",                        # first line of request
+        "%R": "(?P<handler>.*)",                            # handler generating response
+        "%s": r"(?P<status>(\d+?|-))",                      # status code
+        "%t": r"\[(?P<date>.*?)\]",                         # request date and time with offset
+        "%{[^}]+?}t": r"(?P<fdate>\d+)",                    # request date and time (custom format)
+        "%T": r"(?P<serves>\d+)",                           # time taken to serve request (seconds)
+        "%{[^}]+?}T": r"(?P<servec>\d+)",                   # time taken to serve request (custom format)
+        "%u": "(?P<user>.*)",                               # remote user if authenticated
+        "%U": "(?P<url>.*)",                                # URL path excluding query string
+        "%v": "(?P<servername>.*)",                         # server name
+        "%V": "(?P<servernamec>.*)",                        # server name (custom format)
+        "%X": "(?P<responsestatus>.?)",                     # status on response completion
+        "%I": r"(?P<bytesreceived>\d+)",                    # bytes received
+        "%O": r"(?P<bytessent>\d+)",                        # bytes sent
+        "%S": r"(?P<bytestransferred>\d+)?"                 # total bytes transferred
+}
+
+# Substitutions applied to a LogFormat string before directives are expanded:
+# the "<" and ">" modifiers are dropped and "%%" collapses to a literal "%".
+LOG_ESCAPES = {
+        ">": "",                                            # final value
+        "<": "",                                            # initial value
+        "%%": "%"                                           # percent escape
+}
+
+def convert_logformat(format_template):
+    """
+    Convert an Apache LogFormat string to a compiled regex pattern.
+
+    The entries of LOG_ESCAPES are substituted first, then every directive
+    matching a key of LOG_VARS is replaced by the corresponding regex
+    fragment. Text in `format_template` that is not a directive is left as is
+    and is therefore interpreted as regex syntax by the final compilation.
+
+    Returns the compiled pattern; raises `re.error` if the resulting string
+    is not a valid regex (for instance when the same named group is produced
+    twice).
+    """
+    escape_pattern = re.compile('|'.join(LOG_ESCAPES.keys()))
+    format_template = escape_pattern.sub(
+            lambda x: LOG_ESCAPES[x.group()], format_template)
+
+    def expand_directive(match):
+        token = match.group()
+        # Plain directives ("%h", "%{Referer}i", ...) are literal keys
+        if token in LOG_VARS:
+            return LOG_VARS[token]
+        # Parameterised directives ("%{Host}i", "%{sec}t", ...) are matched by
+        # a key that is a regex, so the matched text is not itself a key and a
+        # direct lookup would raise KeyError. Find the key pattern (in
+        # declaration order, as the alternation does) that accepts the token.
+        for directive, fragment in LOG_VARS.items():
+            if re.fullmatch(directive, token):
+                return fragment
+        return token
+
+    var_pattern = re.compile('|'.join(LOG_VARS.keys()))
+    format_template = var_pattern.sub(expand_directive, format_template)
+    return re.compile(format_template)
+
+
+class AccessLine(object):
+    """
+    Retrieves information from a line of the httpd access log.
+
+    Every named group of the supplied pattern becomes an attribute of the
+    object; the raw captured strings stay available in `self.properties`.
+    """
+
+    def __init__(self, record, datefmt, pattern):
+        """
+        Assign attributes and verify/cast those that require it. Note that the
+        `pattern` argument must be a pre-compiled regex object (to save time).
+
+        `record` is the raw log line and `datefmt` the strptime format used to
+        parse the "date" group. An exception is raised if the line does not
+        match the pattern (AttributeError), if the pattern has no "date" group
+        (KeyError) or if the date cannot be parsed (ValueError/TypeError).
+        """
+
+        # Parse from a raw logfile string; empty values and the "-" / "\"-\""
+        # placeholders are normalised to None
+        self.properties = pattern.search(record).groupdict()
+        for field, value in self.properties.items():
+            if value and value not in ("-", "\"-\""):
+                setattr(self, field, value)
+            else:
+                setattr(self, field, None)
+
+        # Verify data transfer metrics: any field whose name contains "bytes"
+        # is cast to int, falling back to 0 when it is not a plain number
+        for field, value in self.properties.items():
+            if "bytes" not in field:
+                continue
+            if isinstance(value, str) and value.isdigit():
+                setattr(self, field, int(value))
+            else:
+                setattr(self, field, 0)
+
+        # Verify date
+        self.date = datetime.datetime.strptime(self.properties["date"], datefmt)
+
+        # Verify client: fall back to the remote hostname if there is no
+        # client address
+        if not getattr(self, "client", None) and getattr(self, "hostname", None):
+            self.client = self.hostname
+
+        # Verify file: when no file name was logged, take the target from the
+        # first line of the request ("METHOD target PROTOCOL")
+        if not getattr(self, "file", None) and hasattr(self, "requesthead"):
+            try:
+                self.file = re.search(r"^\w+\s(.*)\s\S+$", self.requesthead).group(1)
+            except (AttributeError, TypeError):
+                # AttributeError: request line has an unexpected shape;
+                # TypeError: request line was logged as "-" (None)
+                self.file = ""
+
+    @staticmethod
+    def _matches(pattern, value):
+        """
+        Return True if `value` is empty/missing or fully matches the regex
+        string `pattern`, otherwise False.
+        """
+        if not value:
+            return True
+        return bool(re.fullmatch(pattern, value))
+
+    def match_client(self, pattern):
+        """
+        Check if the client of this object (or, failing that, its hostname)
+        matches against a regex string and return a boolean result of this
+        comparison. Returns True when neither value is available.
+        """
+        value = getattr(self, "client", None) or getattr(self, "hostname", None)
+        return self._matches(pattern, value)
+
+    def match_file(self, pattern):
+        """
+        Check if the target of this object matches against a regex string and
+        return a boolean result of this comparison. Returns True when no
+        target is available.
+        """
+        return self._matches(pattern, getattr(self, "file", None))
+
+    def match_ref(self, pattern):
+        """
+        Check if the referrer of this object matches against a regex string and
+        return a boolean result of this comparison. Returns True when no
+        referrer is available.
+        """
+        return self._matches(pattern, getattr(self, "referer", None))
+        
+
+class Httpd(Parser):
+    """
+    Parser that builds the "httpd" report section from the Apache access and
+    error logs.
+    """
+
+    def __init__(self):
+        """
+        Initialise the base parser and set this parser's name and description.
+        """
+        super().__init__()
+        self.name = "httpd"
+        self.info = "Analyse Apache (httpd) server logs, including data " \
+                "transferred, requests, clients, and errors."
+
+    def parse_log(self):
+        """
+        Read the access and error logs named in the config, keep the access
+        entries that pass the period and client/file/referrer checks, and
+        return a Section summarising request and error counts, bytes sent,
+        requested files and user agents. Returns None if no datetime format
+        is configured.
+        """
 
-from ..formatting import *
-from ..util import readlog, resolve
-from .. import config
-
-import logging
-logger = logging.getLogger(__name__)
-
-def parse_log():
-    logger.debug("Starting httpd section")
-    section = Section("httpd")
-    accesslog = readlog(config.prefs['logs']['httpd'] + '/access.log')
-    a = len(accesslog.split('\n'))
-    errorlog = readlog(config.prefs['logs']['httpd'] + '/error.log')
-    e = len(errorlog.split('\n'))
-    data_b = 0
-    ips = []
-    files = []
-    useragents = []
-    errors = []
-    notfound = []
-    unprivileged = []
-
-    logger.debug("Searching through access log")
-    for line in accesslog.split('\n'):
-        fields = re.search('^(\S*) .*GET (\/.*) HTTP/\d\.\d\" 200 (\d*) \"(.*)\".*\((.*)\;', line)
-        try:
-            ips.append(resolve(fields.group(1), fqdn=config.prefs['httpd']['resolve-domains']))
-            files.append(fields.group(2))
-            useragents.append(fields.group(5))
-            data_b += int(fields.group(3))
-        except Exception as error:
-            if type(error) is AttributeError: # this line is not an access log
-                pass
+        # NOTE(review): `logger` is not defined by any line added in this
+        # change (the `import logging` / `getLogger` lines are removed) -
+        # presumably it is provided by the star import from
+        # logparse.formatting; confirm.
+        logger.debug("Starting httpd section")
+        section = Section("httpd")
+
+        # Prefer the httpd-specific datetime format, then the global one
+        datefmt = config.prefs.get("httpd", "datetime-format")
+        if not datefmt:
+            datefmt = config.prefs.get("logparse", "datetime-format")
+        if not datefmt:
+            logger.error("Invalid datetime-format configuration parameter")
+            return None
+
+        # Initialise patterns
+        logger.debug("Converting pattern from {0}".format(
+            config.prefs.get("httpd", "access-format")))
+        pattern = convert_logformat(config.prefs.get("httpd", "access-format"))
+        logger.debug("Compiled log format {0}".format(pattern))
+
+        logger.debug("Retrieving log data")
+
+        accesslog = readlog(config.prefs.get("logs", "httpd-access"))
+
+        # The error log is only counted line by line, not parsed
+        errorlog= readlog(config.prefs.get("logs", "httpd-error"))
+        total_errors = len(errorlog.splitlines())
+
+        logger.debug("Parsing access logs")
+
+        accesses = []
+
+        for line in accesslog.splitlines():
+            # Lines that do not contain the substring "GET" are skipped
+            # without being parsed
+            if not "GET" in line:
+                continue
+            # A line that cannot be parsed is reported and skipped; the
+            # `else` branch below only runs for successfully parsed lines
+            try:
+                ac_obj = AccessLine(line, datefmt, pattern)
+            except Exception as e:
+                logger.warning("Malformed access log: {0}. "
+                    "{1}: {2}".format(line, type(e).__name__, e))
             else:
-                logger.warning("Error processing httpd access log: " + str(error))
-                traceback.print_exc()
-    data_h = parsesize(data_b)
-
-    logger.info("httpd has transferred " + str(data_b) + " bytes in response to " + str(a) + " requests with " + str(e) + " errors")
-    if (a > 0):
-        logger.debug("Parsing request statistics (this might take a while)")
-        request_data = Data()
-        request_data.items = backticks(files)
-        request_data.orderbyfreq()
-        request_data.truncl(config.prefs['maxlist'])
-        request_data.subtitle = plural(" request", a)
-        section.append_data(request_data)
-    if (ips != None):
-        logger.debug("Parsing client statistics")
-        client_data = Data()
-        client_data.items = orderbyfreq(ips)
-        client_data.subtitlte = plural(" client", str(len(ips)))
-        client_data.truncl(config.prefs['maxlist'])
-        section.append_data(client_data)
-    if (useragents != None):
-        logger.debug("Parsing user agent statistics")
-        ua_data = Data()
-        ua_data.items = orderbyfreq(useragents)
-        n_ua = str(len(ua_data.items))
-        ua_data.truncl(config.prefs['maxlist'])
-        ua_data.subtitle = plural(" user agent", n_ua)
-        section.append_data(client_data)
-
-    section.append_data(Data(data_h + " transferred"))
-    section.append_data(Data(plural(" error", e)))
-
-    logger.info("Finished httpd section")
-    return section
+                # Skip entries for which the section's period comparison is
+                # falsy - presumably entries outside the reporting period;
+                # confirm against Section.period
+                if not section.period.compare(ac_obj.date):
+                    continue
+
+                # Each configured regex must fully match the corresponding
+                # field; a field that is missing from the entry passes
+                checks = [
+                        ac_obj.match_client(
+                            config.prefs.get("httpd", "clients")),
+                        ac_obj.match_file(
+                            config.prefs.get("httpd", "files")),
+                        ac_obj.match_ref(
+                            config.prefs.get("httpd", "referrers"))
+                        ]
+                if not all(checks):
+                    logger.debug("Ignoring access log due to config: " + line)
+                    continue
+                accesses.append(ac_obj)
+
+        logger.debug("Processed {0} access logs".format(len(accesses)))
+
+        total_requests = len(accesses)
+        
+        section.append_data(Data("Total of " 
+            + plural("request", total_requests)))
+        section.append_data(Data(plural("error", total_errors)))
+
+        logger.debug("Parsing total size")
+
+        # NOTE(review): AccessLine only sets `bytessent` when the access
+        # format contains %O; with any other format this sum raises
+        # AttributeError - confirm the intended behaviour.
+        size = Data()
+        size.subtitle = "Transferred " \
+                + parsesize(sum([ac.bytessent for ac in accesses]))
+        section.append_data(size)
+
+        logger.debug("Parsing clients")
+
+        # The client summary below is currently disabled
+#        clients = Data()
+#        clients.items = [resolve(ac.hostname, 
+#            config.prefs.get("httpd", "httpd-resolve-domains")) 
+#            for ac in accesses]
+#        clients.orderbyfreq()
+#        clients.subtitle = "Received requests from " \
+#                + plural("client", len(clients.items))
+#        clients.truncl(config.prefs.getint("logparse", "maxlist"))
+#        section.append_data(clients)
+
+        logger.debug("Parsing files")
+
+        files = Data()
+        files.items = [ac.file for ac in accesses if hasattr(ac, "file")]
+        files.orderbyfreq()
+        files.subtitle = plural("file", len(files.items)) + " requested"
+        files.truncl(config.prefs.getint("logparse", "maxlist"))
+        section.append_data(files)
+
+        logger.debug("Parsing user agents")
+
+        # NOTE(review): unlike `files` above there is no hasattr guard here;
+        # `useragent` is only set when the access format contains
+        # %{User-Agent}i and is None for entries logged as "-" - confirm.
+        useragents = Data()
+        useragents.items = [ac.useragent for ac in accesses]
+        useragents.orderbyfreq()
+        useragents.subtitle = plural("user agent", len(useragents.items))
+        useragents.truncl(config.prefs.getint("logparse", "maxlist"))
+        section.append_data(useragents)
+
+        logger.info("httpd has received " + str(total_requests) 
+                + " requests with " + str(total_errors) + " errors")
+
+        logger.info("Finished httpd section")
+        return section