logparse / parsers / httpd.pyon commit rename parsers, better journald integration (e1f7605)
   1# -*- coding: utf-8 -*-
   2
   3"""
Analyse Apache (httpd) server logs, including data transferred, requests,
clients, user agents, and errors. Note that Apache's logs can fill up very
quickly with the default verbosity, leading to logparse taking a very long
time to analyse them. In general the default verbosity is good, but logs
should be cleared as soon as they are analysed (make sure 'rotate' is enabled
in the logparse config).
  10"""
  11
  12import datetime
  13import re
  14import time
  15
  16from logparse.formatting import *
  17from logparse.util import readlog, resolve
  18from logparse import config
  19from logparse.load_parsers import Parser
  20
  21IPv4_ADDR_REGEX = '(?:\d{1,3}\.){3}\d{1,3}'
  22IPv6_ADDR_REGEX = "([0-9A-Fa-f]{0,4}:){2,7}([0-9A-Fa-f]{0,4})"
  23IP_ADDR_REGEX = "("+IPv4_ADDR_REGEX+"|"+IPv6_ADDR_REGEX+")"
  24LOG_VARS = {
  25        "%a": "(?P<client>{})?".format(IPv4_ADDR_REGEX),    # client IP
  26        "%A": "(?P<peer>{})?".format(IP_ADDR_REGEX),        # local (peer) IP
  27        "%B": "(?P<bytes>(\d+|-))",                         # bytes
  28        "%b": "(?P<clfbytes>(\d+|\"-\"))",                  # bytes (CLF format)
  29        "%{[^}]+?}C": "(?P<cookie>.*)",                     # contents of cookie
  30        "%D": "(?P<serveus>-?\d+)",                         # time taken to serve request (μs)
  31        "%{[^}]+?}e": "(?P<envvar>.*)",                     # environment variable contents
  32        "%f": "(?P<file>.*)",                               # file name requested
  33        "%h": "(?P<hostname>\S+)",                          # remote hostname or IP
  34        "%H": "(?P<protocol>.*)",                           # request protocol
  35        "%{Referer}i": "(?P<referer>.*)",                   # referrer
  36        "%{User-Agent}i": "(?P<useragent>.*)",              # user agent string
  37        "%{[^}]+?}i": "(?P<header>.*)",                     # request header
  38        "%k": "(?P<keepalive>\d*)",                         # number of keepalive requests
  39        "%l": "(?P<logname>.*)",                            # remote logname
  40        "%m": "(?P<method>.*)",                             # request method
  41        "%{[^}]+?}n": "(?P<note>.*)",                       # notes
  42        "%{[^}]+?}o": "(?P<replyheader>.*)",                # reply header
  43        "%p": "(?P<cport>\d*)",                             # canonical port on server
  44        "%{[^}]+?}p": "(?P<port>\d*)",                      # optional port
  45        "%P": "(?P<pid>\d*)",                               # process ID of child
  46        "%{[^}]+?}P": "(?P<thread>.*)",                     # process or thread ID
  47        "%q": "(?P<query>.*)",                              # query string
  48        "%r": "(?P<requesthead>.*)",                        # first line of request
  49        "%R": "(?P<handler>.*)",                            # handler generating response
  50        "%s": "(?P<status>(\d+?|-))",                       # status code
  51        "%t": "\[(?P<date>.*?)\]",                          # request date and time with offset
  52        "%{[^}]+?}t": "(?P<fdate>\d+)",                     # request date and time ()custom format)
  53        "%T": "(?P<serves>\d+)",                            # time taken to serve request (seconds)
  54        "%{[^}]+?}T": "(?P<servec>\d+)",                    # time taken to serve request (custom format)
  55        "%u": "(?P<user>.*)",                               # remote user if authenticated
  56        "%U": "(?P<url>.*)",                                # URL path excluding query string
  57        "%v": "(?P<servername>.*)",                         # server name
  58        "%V": "(?P<servernamec>.*)",                        # server name (custom format)
  59        "%X": "(?P<responsestatus>.?)",                     # status on response completion
  60        "%I": "(?P<bytesreceived>\d+)",                     # bytes received
  61        "%O": "(?P<bytessent>\d+)",                         # bytes sent
  62        "%S": "(?P<bytestransferred>\d+)?"                  # total bytes transferred
  63}
  64LOG_ESCAPES = {
  65        ">": "",                                            # final value
  66        "<": "",                                            # initial value
  67        "%%": "%"                                           # percent escape
  68}
  69
  70def convert_logformat(format_template):
  71    """
  72    Convert an Apache LogFormat string to a regex pattern
  73    """
  74    escape_pattern = re.compile('|'.join(LOG_ESCAPES.keys()))
  75    format_template = escape_pattern.sub(lambda x: LOG_ESCAPES[x.group()], format_template)
  76    var_pattern = re.compile('|'.join(LOG_VARS.keys()))
  77    format_template = var_pattern.sub(lambda x: LOG_VARS[x.group()], format_template)
  78    return re.compile(format_template)
  79
  80
  81class AccessLine(object):
  82    """
  83    Retrieves information from a line of the httpd access log
  84    """
  85
  86    def __init__(self, record, datefmt, pattern):
  87        """
  88        Assign attributes and verify/cast those than require it. Note that the 
  89        `pattern` argument must be a pre-compiled regex object (to save time).
  90        """
  91
  92        # Parse from a raw logfile string
  93        self.properties = pattern.search(record).groupdict()
  94        for field, value in self.properties.items():
  95            if value and not (value == "-" or value == "\"-\""):
  96                setattr(self, field, value)
  97            else:
  98                setattr(self, field, None)
  99
 100        # Verify data transfer metrics
 101        for field, value in [x for x in self.properties.items() if "bytes" in x[0]]:
 102            if isinstance(value, str) and value.isdigit():
 103                setattr(self, field, int(value))
 104            else:
 105                setattr(self, field, 0)
 106
 107        # Verify date
 108        self.date = datetime.datetime.strptime(self.properties["date"], datefmt)
 109
 110        # Verify client
 111        if (not hasattr(self, "client") or not self.client) \
 112                and hasattr(self, "hostname") and self.hostname:
 113            self.client = self.hostname
 114
 115
 116        # Verify file
 117        if (not hasattr(self, "file") or not self.file) and hasattr(self, "requesthead"):
 118            try:
 119                self.file = re.search(r"^\w+\s(.*)\s\S+$", self.requesthead).group(1)
 120            except:
 121                self.file = ""
 122
 123    def match_client(self, pattern):
 124        """
 125        Check if the client of this object matches against a regex string and 
 126        return a boolean result of this comparison.
 127        """
 128        if hasattr(self, "client") and self.client:
 129            return re.fullmatch(pattern, self.client)
 130        elif hasattr(self, "hostname") and self.hostname:
 131            return re.fullmatch(pattern, self.hostname)
 132        else:
 133            return True
 134
 135    def match_file(self, pattern):
 136        """
 137        Check if the target of this object matches against a regex string and 
 138        return a boolean result of this comparison.
 139        """
 140        if hasattr(self, "file") and self.file:
 141            return re.fullmatch(pattern, self.file)
 142        else:
 143            return True
 144
 145    def match_ref(self, pattern):
 146        """
 147        Check if the referrer of this object matches against a regex string and 
 148        return a boolean result of this comparison.
 149        """
 150        if hasattr(self, "referer") and self.referer:
 151            return re.fullmatch(pattern, self.referer)
 152        else:
 153            return True
 154        
 155
 156class Httpd(Parser):
 157
 158    def __init__(self):
 159        super().__init__()
 160        self.name = "httpd"
 161        self.info = "Analyse Apache (httpd) server logs, including data " \
 162                "transferred, requests, clients, and errors."
 163
 164    def parse_log(self):
 165
 166        logger.debug("Starting httpd section")
 167        section = Section("httpd")
 168
 169        datefmt = config.prefs.get("httpd", "datetime-format")
 170        if not datefmt:
 171            datefmt = config.prefs.get("logparse", "datetime-format")
 172        if not datefmt:
 173            logger.error("Invalid datetime-format configuration parameter")
 174            return None
 175
 176        # Initialise patterns
 177        logger.debug("Converting pattern from {0}".format(
 178            config.prefs.get("httpd", "access-format")))
 179        pattern = convert_logformat(config.prefs.get("httpd", "access-format"))
 180        logger.debug("Compiled log format {0}".format(pattern))
 181
 182        logger.debug("Retrieving log data")
 183
 184        accesslog = readlog(config.prefs.get("logs", "httpd-access"))
 185
 186        errorlog= readlog(config.prefs.get("logs", "httpd-error"))
 187        total_errors = len(errorlog.splitlines())
 188
 189        logger.debug("Parsing access logs")
 190
 191        accesses = []
 192
 193        for line in accesslog.splitlines():
 194            if not "GET" in line:
 195                continue
 196            try:
 197                ac_obj = AccessLine(line, datefmt, pattern)
 198            except Exception as e:
 199                logger.warning("Malformed access log: {0}. "
 200                    "{1}: {2}".format(line, type(e).__name__, e))
 201            else:
 202                if not section.period.compare(ac_obj.date):
 203                    continue
 204
 205                checks = [
 206                        ac_obj.match_client(
 207                            config.prefs.get("httpd", "clients")),
 208                        ac_obj.match_file(
 209                            config.prefs.get("httpd", "files")),
 210                        ac_obj.match_ref(
 211                            config.prefs.get("httpd", "referrers"))
 212                        ]
 213                if not all(checks):
 214                    logger.debug("Ignoring access log due to config: " + line)
 215                    continue
 216                accesses.append(ac_obj)
 217
 218        logger.debug("Processed {0} access logs".format(len(accesses)))
 219
 220        total_requests = len(accesses)
 221        
 222        section.append_data(Data("Total of " 
 223            + plural("request", total_requests)))
 224        section.append_data(Data(plural("error", total_errors)))
 225
 226        logger.debug("Parsing total size")
 227
 228        size = Data()
 229        size.subtitle = "Transferred " \
 230                + parsesize(sum([ac.bytessent for ac in accesses]))
 231        section.append_data(size)
 232
 233        logger.debug("Parsing clients")
 234
 235#        clients = Data()
 236#        clients.items = [resolve(ac.hostname, 
 237#            config.prefs.get("httpd", "httpd-resolve-domains")) 
 238#            for ac in accesses]
 239#        clients.orderbyfreq()
 240#        clients.subtitle = "Received requests from " \
 241#                + plural("client", len(clients.items))
 242#        clients.truncl(config.prefs.getint("logparse", "maxlist"))
 243#        section.append_data(clients)
 244
 245        logger.debug("Parsing files")
 246
 247        files = Data()
 248        files.items = [ac.file for ac in accesses if hasattr(ac, "file")]
 249        files.orderbyfreq()
 250        files.subtitle = plural("file", len(files.items)) + " requested"
 251        files.truncl(config.prefs.getint("logparse", "maxlist"))
 252        section.append_data(files)
 253
 254        logger.debug("Parsing user agents")
 255
 256        useragents = Data()
 257        useragents.items = [ac.useragent for ac in accesses]
 258        useragents.orderbyfreq()
 259        useragents.subtitle = plural("user agent", len(useragents.items))
 260        useragents.truncl(config.prefs.getint("logparse", "maxlist"))
 261        section.append_data(useragents)
 262
 263        logger.info("httpd has received " + str(total_requests) 
 264                + " requests with " + str(total_errors) + " errors")
 265
 266        logger.info("Finished httpd section")
 267        return section