rename parsers, better journald integration
[logparse.git] / logparse / parsers / httpd.py
index b86f1c1bd5b4830788ee5b627e00a196af6d835d..9a20cc3fee605fb3c1fc2a7e93b0ee7fcd1a53e9 100644
-#
-#   httpd.py
-#   
-#   Analyse Apache (httpd) server logs, including data transferred, requests,
-#   clients, and errors. Note that Apache's logs can get filled up very quickly
-#   with the default verbosity, leading to logparse taking a very long time to
-#   analyse them. In general the default verbosity is good, but logs should be
-#   cleared as soon as they are analysed (make sure 'rotate' is set to 'y'). 
-#
+# -*- coding: utf-8 -*-
 
+"""
+Analyse Apache (httpd) server logs, including data transferred, requests,
+clients, user agents, and errors. Note that Apache's logs can fill up very
+quickly at the default verbosity, leading to logparse taking a very long time
+to analyse them. In general the default verbosity is good, but logs should be
+cleared as soon as they are analysed (make sure 'rotate' is enabled in the
+logparse config).
+"""
+
+import datetime
 import re
+import time
 
 from logparse.formatting import *
 from logparse.util import readlog, resolve
 from logparse import config
 from logparse.load_parsers import Parser
 
-ACCESS_REGEX = "^\s*(\S+).*\"GET (\S+) HTTP(?:\/\d\.\d)?\" (\d{3}) (\d*) \".+\" \"(.*)\""
+IPv4_ADDR_REGEX = '(?:\d{1,3}\.){3}\d{1,3}'
+IPv6_ADDR_REGEX = "([0-9A-Fa-f]{0,4}:){2,7}([0-9A-Fa-f]{0,4})"
+IP_ADDR_REGEX = "("+IPv4_ADDR_REGEX+"|"+IPv6_ADDR_REGEX+")"
+LOG_VARS = {
+        "%a": "(?P<client>{})?".format(IPv4_ADDR_REGEX),    # client IP
+        "%A": "(?P<peer>{})?".format(IP_ADDR_REGEX),        # local (peer) IP
+        "%B": "(?P<bytes>(\d+|-))",                         # bytes
+        "%b": "(?P<clfbytes>(\d+|\"-\"))",                  # bytes (CLF format)
+        "%{[^}]+?}C": "(?P<cookie>.*)",                     # contents of cookie
+        "%D": "(?P<serveus>-?\d+)",                         # time taken to serve request (μs)
+        "%{[^}]+?}e": "(?P<envvar>.*)",                     # environment variable contents
+        "%f": "(?P<file>.*)",                               # file name requested
+        "%h": "(?P<hostname>\S+)",                          # remote hostname or IP
+        "%H": "(?P<protocol>.*)",                           # request protocol
+        "%{Referer}i": "(?P<referer>.*)",                   # referrer
+        "%{User-Agent}i": "(?P<useragent>.*)",              # user agent string
+        "%{[^}]+?}i": "(?P<header>.*)",                     # request header
+        "%k": "(?P<keepalive>\d*)",                         # number of keepalive requests
+        "%l": "(?P<logname>.*)",                            # remote logname
+        "%m": "(?P<method>.*)",                             # request method
+        "%{[^}]+?}n": "(?P<note>.*)",                       # notes
+        "%{[^}]+?}o": "(?P<replyheader>.*)",                # reply header
+        "%p": "(?P<cport>\d*)",                             # canonical port on server
+        "%{[^}]+?}p": "(?P<port>\d*)",                      # optional port
+        "%P": "(?P<pid>\d*)",                               # process ID of child
+        "%{[^}]+?}P": "(?P<thread>.*)",                     # process or thread ID
+        "%q": "(?P<query>.*)",                              # query string
+        "%r": "(?P<requesthead>.*)",                        # first line of request
+        "%R": "(?P<handler>.*)",                            # handler generating response
+        "%s": "(?P<status>(\d+?|-))",                       # status code
+        "%t": "\[(?P<date>.*?)\]",                          # request date and time with offset
+        "%{[^}]+?}t": "(?P<fdate>\d+)",                     # request date and time ()custom format)
+        "%T": "(?P<serves>\d+)",                            # time taken to serve request (seconds)
+        "%{[^}]+?}T": "(?P<servec>\d+)",                    # time taken to serve request (custom format)
+        "%u": "(?P<user>.*)",                               # remote user if authenticated
+        "%U": "(?P<url>.*)",                                # URL path excluding query string
+        "%v": "(?P<servername>.*)",                         # server name
+        "%V": "(?P<servernamec>.*)",                        # server name (custom format)
+        "%X": "(?P<responsestatus>.?)",                     # status on response completion
+        "%I": "(?P<bytesreceived>\d+)",                     # bytes received
+        "%O": "(?P<bytessent>\d+)",                         # bytes sent
+        "%S": "(?P<bytestransferred>\d+)?"                  # total bytes transferred
+}
+LOG_ESCAPES = {
+        ">": "",                                            # final value
+        "<": "",                                            # initial value
+        "%%": "%"                                           # percent escape
+}
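+# Note: convert_logformat() below joins the keys of these dicts into a single
+# regex alternation, so both dict insertion order (Python 3.7+) and the
+# left-to-right matching of alternation matter: specific directives such as
+# "%{Referer}i" and "%{User-Agent}i" must be listed before the generic
+# "%{[^}]+?}i".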
+
+def convert_logformat(format_template):
+    """
+    Convert an Apache LogFormat string to a compiled regex pattern
+    """
+    escape_pattern = re.compile('|'.join(LOG_ESCAPES.keys()))
+    format_template = escape_pattern.sub(
+            lambda x: LOG_ESCAPES[x.group()], format_template)
+    def lookup_var(match):
+        # Bracketed directives such as "%{foo}C" are not literal keys of
+        # LOG_VARS, so fall back to matching them against the generic keys
+        directive = match.group()
+        if directive in LOG_VARS:
+            return LOG_VARS[directive]
+        return next((v for k, v in LOG_VARS.items()
+                if re.fullmatch(k, directive)), re.escape(directive))
+    var_pattern = re.compile('|'.join(LOG_VARS.keys()))
+    format_template = var_pattern.sub(lookup_var, format_template)
+    return re.compile(format_template)
+
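+# For illustration only (not a value taken from any logparse config), a
+# LogFormat such as
+#     %h %l %u %t "%r" %>s %O "%{Referer}i" "%{User-Agent}i"
+# (Apache's "combined" format with %O in place of %b) would be converted by
+# convert_logformat() into a pattern with the named groups hostname, logname,
+# user, date, requesthead, status, bytessent, referer and useragent.
+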
 
 class AccessLine(object):
+    """
+    Retrieves information from a line of the httpd access log
+    """
+
+    def __init__(self, record, datefmt, pattern):
+        """
+        Assign attributes from the parsed record and verify/cast those that
+        require it. Note that the `pattern` argument must be a pre-compiled
+        regex object (to save time).
+        """
+
+        # Parse from a raw logfile string
+        self.properties = pattern.search(record).groupdict()
+        for field, value in self.properties.items():
+            if value and not (value == "-" or value == "\"-\""):
+                setattr(self, field, value)
+            else:
+                setattr(self, field, None)
+
+        # Verify data transfer metrics
+        for field, value in [x for x in self.properties.items()
+                if "bytes" in x[0]]:
+            if isinstance(value, str) and value.isdigit():
+                setattr(self, field, int(value))
+            else:
+                setattr(self, field, 0)
+
+        # Verify date
+        self.date = datetime.datetime.strptime(
+                self.properties["date"], datefmt)
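+        # (a format mismatch raises ValueError, which parse_log reports as a
+        # malformed line)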
+
+        # Verify client
+        if (not hasattr(self, "client") or not self.client) \
+                and hasattr(self, "hostname") and self.hostname:
+            self.client = self.hostname
+
+        # Verify file
+        if (not hasattr(self, "file") or not self.file) \
+                and getattr(self, "requesthead", None):
+            try:
+                self.file = re.search(
+                        r"^\w+\s(.*)\s\S+$", self.requesthead).group(1)
+            except AttributeError:  # request line didn't match
+                self.file = ""
+
+    def match_client(self, pattern):
+        """
+        Check the client (or, failing that, the hostname) of this entry
+        against a regex string. Returns a truthy value on a match; entries
+        with no recorded client always pass.
+        """
+        if hasattr(self, "client") and self.client:
+            return re.fullmatch(pattern, self.client)
+        elif hasattr(self, "hostname") and self.hostname:
+            return re.fullmatch(pattern, self.hostname)
+        else:
+            return True
+
+    def match_file(self, pattern):
+        """
+        Check the requested file (target) of this entry against a regex
+        string. Returns a truthy value on a match; entries with no recorded
+        file always pass.
+        """
+        if hasattr(self, "file") and self.file:
+            return re.fullmatch(pattern, self.file)
+        else:
+            return True
 
-    def __init__(self, line):
-        self.line = line
-        fields = re.search(ACCESS_REGEX, line)
+    def match_ref(self, pattern):
+        """
+        Check the referrer of this entry against a regex string. Returns a
+        truthy value on a match; entries with no recorded referrer always
+        pass.
+        """
+        if hasattr(self, "referer") and self.referer:
+            return re.fullmatch(pattern, self.referer)
+        else:
+            return True
         
-        self.client = fields.group(1)
-        self.file = fields.group(2)
-        self.statuscode = int(fields.group(3))
-        self.bytes = int(fields.group(4))
-        self.useragent = fields.group(5)
 
 class Httpd(Parser):
 
     def __init__(self):
         super().__init__()
         self.name = "httpd"
-        self.info = "Analyse Apache (httpd) server logs, including data transferred, requests, clients, and errors."
+        self.info = "Analyse Apache (httpd) server logs, including data " \
+                "transferred, requests, clients, and errors."
 
     def parse_log(self):
 
         logger.debug("Starting httpd section")
         section = Section("httpd")
 
+        datefmt = config.prefs.get("httpd", "datetime-format")
+        if not datefmt:
+            datefmt = config.prefs.get("logparse", "datetime-format")
+        if not datefmt:
+            logger.error("Invalid datetime-format configuration parameter")
+            return None
+
+        # Initialise patterns
+        logger.debug("Converting pattern from {0}".format(
+            config.prefs.get("httpd", "access-format")))
+        pattern = convert_logformat(config.prefs.get("httpd", "access-format"))
+        logger.debug("Compiled log format {0}".format(pattern))
+
+        logger.debug("Retrieving log data")
+
         accesslog = readlog(config.prefs.get("logs", "httpd-access"))
 
         errorlog= readlog(config.prefs.get("logs", "httpd-error"))
         total_errors = len(errorlog.splitlines())
 
-        logger.debug("Retrieved log data")
-
-        logger.debug("Searching through access log")
+        logger.debug("Parsing access logs")
 
         accesses = []
 
         for line in accesslog.splitlines():
-            if "GET" in line:
-                accesses.append(AccessLine(line))
+            if not "GET" in line:
+                continue
+            try:
+                ac_obj = AccessLine(line, datefmt, pattern)
+            except Exception as e:
+                logger.warning("Malformed access log line: {0}. "
+                    "{1}: {2}".format(line, type(e).__name__, e))
+            else:
+                if not section.period.compare(ac_obj.date):
+                    continue
+
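+                # Filter against the client, file and referrer patterns
+                # configured for httpd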
+                checks = [
+                        ac_obj.match_client(
+                            config.prefs.get("httpd", "clients")),
+                        ac_obj.match_file(
+                            config.prefs.get("httpd", "files")),
+                        ac_obj.match_ref(
+                            config.prefs.get("httpd", "referrers"))
+                        ]
+                if not all(checks):
+                    logger.debug("Ignoring access log line due to config: "
+                            + line)
+                    continue
+                accesses.append(ac_obj)
+
+        logger.debug("Processed {0} access logs".format(len(accesses)))
 
         total_requests = len(accesses)
         
-        section.append_data(Data("Total of " + plural("request", total_requests)))
+        section.append_data(Data("Total of " 
+            + plural("request", total_requests)))
         section.append_data(Data(plural("error", total_errors)))
 
+        logger.debug("Parsing total size")
+
         size = Data()
-        size.subtitle = "Transferred " + parsesize(sum([ac.bytes for ac in accesses]))
+        size.subtitle = "Transferred " + parsesize(
+                sum(getattr(ac, "bytessent", 0) for ac in accesses))
         section.append_data(size)
 
-        clients = Data()
-        clients.items = [resolve(ac.client, config.prefs.get("httpd", "httpd-resolve-domains")) for ac in accesses]
-        clients.orderbyfreq()
-        clients.subtitle = "Received requests from " + plural("client", len(clients.items))
-        clients.truncl(config.prefs.getint("logparse", "maxlist"))
-        section.append_data(clients)
+        logger.debug("Parsing clients")
+
+#        clients = Data()
+#        clients.items = [resolve(ac.hostname, 
+#            config.prefs.get("httpd", "httpd-resolve-domains")) 
+#            for ac in accesses]
+#        clients.orderbyfreq()
+#        clients.subtitle = "Received requests from " \
+#                + plural("client", len(clients.items))
+#        clients.truncl(config.prefs.getint("logparse", "maxlist"))
+#        section.append_data(clients)
+
+        logger.debug("Parsing files")
 
         files = Data()
-        files.items = [ac.file for ac in accesses]
+        files.items = [ac.file for ac in accesses if hasattr(ac, "file")]
         files.orderbyfreq()
         files.subtitle = plural("file", len(files.items)) + " requested"
         files.truncl(config.prefs.getint("logparse", "maxlist"))
         section.append_data(files)
 
+        logger.debug("Parsing user agents")
+
         useragents = Data()
         useragents.items = [ac.useragent for ac in accesses]
         useragents.orderbyfreq()
@@ -86,8 +260,8 @@ class Httpd(Parser):
         useragents.truncl(config.prefs.getint("logparse", "maxlist"))
         section.append_data(useragents)
 
-        logger.info("httpd has received " + str(total_requests) + " requests with " + str(total_errors) + " errors")
-
+        logger.info("httpd has received " + str(total_requests) 
+                + " requests with " + str(total_errors) + " errors")
 
         logger.info("Finished httpd section")
         return section