logswan

Fast Web log analyzer using probabilistic data structures
Log | Files | Refs | README | LICENSE

logswan.c (7310B)


      1 /*
      2  * Logswan 2.1.13
      3  * Copyright (c) 2015-2022, Frederic Cambus
      4  * https://www.logswan.org
      5  *
      6  * Created:      2015-05-31
      7  * Last Updated: 2022-02-26
      8  *
      9  * Logswan is released under the BSD 2-Clause license.
     10  * See LICENSE file for details.
     11  *
     12  * SPDX-License-Identifier: BSD-2-Clause
     13  */
     14 
     15 #include <sys/socket.h>
     16 #include <sys/stat.h>
     17 #include <sys/time.h>
     18 #include <arpa/inet.h>
     19 #include <err.h>
     20 #include <getopt.h>
     21 #include <inttypes.h>
     22 #include <netinet/in.h>
     23 #include <stdbool.h>
     24 #include <stdlib.h>
     25 #include <stdint.h>
     26 #include <stdio.h>
     27 #include <string.h>
     28 #include <time.h>
     29 
     30 #ifdef HAVE_SECCOMP
     31 #include <sys/prctl.h>
     32 #include <linux/seccomp.h>
     33 #include "seccomp.h"
     34 #endif
     35 
     36 #include <maxminddb.h>
     37 
     38 #include "compat.h"
     39 #include "config.h"
     40 #include "continents.h"
     41 #include "countries.h"
     42 #include "hll.h"
     43 #include "output.h"
     44 #include "parse.h"
     45 
     /*
      * Print the command-line synopsis and the list of supported options
      * to standard output.
      */
     static void
     usage(void)
     {
     	printf("logswan [-ghv] [-d db] logfile\n\n"
     	    "The options are as follows:\n\n"
     	    "	-d db	Specify path to a GeoIP database.\n"
     	    "	-g	Enable GeoIP lookups.\n"
     	    "	-h	Display usage.\n"
     	    "	-v	Display version.\n");
     }
     56 
     57 int
     58 main(int argc, char *argv[])
     59 {
     60 	struct timespec begin, end, elapsed;
     61 	struct HLL unique_ipv4, unique_ipv6;
     62 	struct results results;
     63 	struct date parsed_date;
     64 	struct logline parsed_line;
     65 	struct request parsed_request;
     66 	struct stat logfile_stat;
     67 
     68 	struct sockaddr_in ipv4;
     69 	struct sockaddr_in6 ipv6;
     70 
     71 	uint64_t bandwidth;
     72 	uint32_t status_code;
     73 	uint32_t hour;
     74 	int gai_error, mmdb_error;
     75 	int opt;
     76 
     77 	const char *errstr;
     78 	char *linebuffer = NULL;
     79 	size_t linesize = 0;
     80 	char *input;
     81 	char *db = NULL;
     82 
     83 	bool geoip = false;
     84 	bool is_ipv4, is_ipv6;
     85 
     86 	MMDB_s geoip2;
     87 	MMDB_lookup_result_s lookup;
     88 
     89 	FILE *logfile;
     90 
     91 	if (pledge("stdio rpath", NULL) == -1) {
     92 		err(EXIT_FAILURE, "pledge");
     93 	}
     94 
     95 #ifdef HAVE_SECCOMP
     96 	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
     97 		perror("Can't initialize seccomp");
     98 		return EXIT_FAILURE;
     99 	}
    100 
    101 	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &logswan)) {
    102 		perror("Can't load seccomp filter");
    103 		return EXIT_FAILURE;
    104 	}
    105 #endif
    106 
    107 	while ((opt = getopt(argc, argv, "d:ghv")) != -1) {
    108 		switch (opt) {
    109 		case 'd':
    110 			db = optarg;
    111 			break;
    112 
    113 		case 'g':
    114 			geoip = true;
    115 			break;
    116 
    117 		case 'h':
    118 			usage();
    119 			return EXIT_SUCCESS;
    120 
    121 		case 'v':
    122 			printf("%s\n", VERSION);
    123 			return EXIT_SUCCESS;
    124 		}
    125 	}
    126 
    127 	if (optind < argc) {
    128 		input = argv[optind];
    129 	} else {
    130 		usage();
    131 		return EXIT_SUCCESS;
    132 	}
    133 
    134 	hll_init(&unique_ipv4, HLL_BITS);
    135 	hll_init(&unique_ipv6, HLL_BITS);
    136 
    137 	/* Starting timer */
    138 	clock_gettime(CLOCK_MONOTONIC, &begin);
    139 
    140 	/* Initializing GeoIP */
    141 	if (geoip) {
    142 		if (!db)
    143 			db = GEOIP2DIR GEOIP2DB;
    144 
    145 		if (MMDB_open(db, MMDB_MODE_MMAP, &geoip2) != MMDB_SUCCESS)
    146 			err(EXIT_FAILURE, "Can't open database (%s)", db);
    147 	}
    148 
    149 	/* Open log file */
    150 	if (!strcmp(input, "-")) {
    151 		/* Read from standard input */
    152 		logfile = stdin;
    153 	} else {
    154 		/* Attempt to read from file */
    155 		if (!(logfile = fopen(input, "r"))) {
    156 			perror("Can't open log file");
    157 			return EXIT_FAILURE;
    158 		}
    159 	}
    160 
    161 	/* Get log file size */
    162 	if (fstat(fileno(logfile), &logfile_stat)) {
    163 		perror("Can't stat log file");
    164 		return EXIT_FAILURE;
    165 	}
    166 
    167 	memset(&results, 0, sizeof(struct results));
    168 	results.file_name = input;
    169 	results.file_size = logfile_stat.st_size;
    170 
    171 	while (getline(&linebuffer, &linesize, logfile) != -1) {
    172 		/* Parse and tokenize line */
    173 		parse_line(&parsed_line, linebuffer);
    174 
    175 		/* Detect if remote host is IPv4 or IPv6 */
    176 		if (parsed_line.remote_host) { /* Do not feed NULL tokens to inet_pton */
    177 			if ((is_ipv4 = inet_pton(AF_INET, parsed_line.remote_host, &ipv4.sin_addr))) {
    178 				is_ipv6 = false;
    179 			} else {
    180 				is_ipv6 = inet_pton(AF_INET6, parsed_line.remote_host, &ipv6.sin6_addr);
    181 
    182 				if (!is_ipv6) {
    183 					results.invalid_lines++;
    184 					continue;
    185 				}
    186 			}
    187 		} else {
    188 			/* Invalid line */
    189 			results.invalid_lines++;
    190 			continue;
    191 		}
    192 
    193 		if (is_ipv4) {
    194 			/* Increment hits counter */
    195 			results.hits_ipv4++;
    196 
    197 			/* Unique visitors */
    198 			hll_add(&unique_ipv4, parsed_line.remote_host, strlen(parsed_line.remote_host));
    199 		}
    200 
    201 		if (is_ipv6) {
    202 			/* Increment hits counter */
    203 			results.hits_ipv6++;
    204 
    205 			/* Unique visitors */
    206 			hll_add(&unique_ipv6, parsed_line.remote_host, strlen(parsed_line.remote_host));
    207 		}
    208 
    209 		if (geoip) {
    210 			MMDB_entry_data_s entry_data;
    211 			memset(&entry_data, 0, sizeof(MMDB_entry_data_s));
    212 
    213 			lookup = MMDB_lookup_string(&geoip2, parsed_line.remote_host, &gai_error, &mmdb_error);
    214 
    215 			MMDB_get_value(&lookup.entry, &entry_data, "country", "iso_code", NULL);
    216 
    217 			if (entry_data.has_data) {
    218 				/* Increment countries array */
    219 				for (size_t loop = 0; loop < COUNTRIES; loop++) {
    220 					if (!strncmp(countries_id[loop], entry_data.utf8_string, 2)) {
    221 						results.countries[loop]++;
    222 						break;
    223 					}
    224 				}
    225 			}
    226 
    227 			MMDB_get_value(&lookup.entry, &entry_data, "continent", "code", NULL);
    228 
    229 			if (entry_data.has_data) {
    230 				/* Increment continents array */
    231 				for (size_t loop = 0; loop < CONTINENTS; loop++) {
    232 					if (!strncmp(continents_id[loop], entry_data.utf8_string, 2)) {
    233 						results.continents[loop]++;
    234 						break;
    235 					}
    236 				}
    237 			}
    238 		}
    239 
    240 		/* Hourly distribution */
    241 		if (parsed_line.date) {
    242 			parse_date(&parsed_date, parsed_line.date);
    243 
    244 			if (parsed_date.hour) {
    245 				hour = strtonum(parsed_date.hour, 0, 23, &errstr);
    246 
    247 				if (!errstr) {
    248 					results.hours[hour]++;
    249 				}
    250 			}
    251 		}
    252 
    253 		/* Parse request */
    254 		if (parsed_line.request) {
    255 			parse_request(&parsed_request, parsed_line.request);
    256 
    257 			if (parsed_request.method) {
    258 				for (size_t loop = 0; loop < METHODS; loop++) {
    259 					if (!strcmp(methods_names[loop], parsed_request.method)) {
    260 						results.methods[loop]++;
    261 						break;
    262 					}
    263 				}
    264 			}
    265 
    266 			if (parsed_request.protocol) {
    267 				for (size_t loop = 0; loop < PROTOCOLS; loop++) {
    268 					if (!strcmp(protocols_names[loop], parsed_request.protocol)) {
    269 						results.protocols[loop]++;
    270 						break;
    271 					}
    272 				}
    273 			}
    274 		}
    275 
    276 		/* Count HTTP status codes occurrences */
    277 		if (parsed_line.status_code) {
    278 			status_code = strtonum(parsed_line.status_code, 0, STATUS_CODE_MAX-1, &errstr);
    279 
    280 			if (!errstr) {
    281 				results.status[status_code]++;
    282 			}
    283 		}
    284 
    285 		/* Increment bandwidth usage */
    286 		if (parsed_line.object_size) {
    287 			bandwidth = strtonum(parsed_line.object_size, 0, INT64_MAX, &errstr);
    288 
    289 			if (!errstr) {
    290 				results.bandwidth += bandwidth;
    291 			}
    292 		}
    293 	}
    294 
    295 	/* Counting hits and processed lines */
    296 	results.hits = results.hits_ipv4 + results.hits_ipv6;
    297 	results.processed_lines = results.hits + results.invalid_lines;
    298 
    299 	/* Counting unique visitors */
    300 	results.visits_ipv4 = hll_count(&unique_ipv4);
    301 	results.visits_ipv6 = hll_count(&unique_ipv6);
    302 	results.visits = results.visits_ipv4 + results.visits_ipv6;
    303 
    304 	/* Stopping timer */
    305 	clock_gettime(CLOCK_MONOTONIC, &end);
    306 
    307 	timespecsub(&end, &begin, &elapsed);
    308 	results.runtime = elapsed.tv_sec + elapsed.tv_nsec / 1E9;
    309 
    310 	/* Generate timestamp */
    311 	time_t now = time(NULL);
    312 	strftime(results.timestamp, 20, "%Y-%m-%d %H:%M:%S", localtime(&now));
    313 
    314 	/* Printing results */
    315 	fprintf(stdout, "%s\n", output(&results));
    316 	fprintf(stderr, "Processed %" PRIu64 " lines in %f seconds.\n", results.processed_lines, results.runtime);
    317 
    318 	/* Clean up */
    319 	free(linebuffer);
    320 	fclose(logfile);
    321 
    322 	if (geoip)
    323 		MMDB_close(&geoip2);
    324 
    325 	hll_destroy(&unique_ipv4);
    326 	hll_destroy(&unique_ipv6);
    327 
    328 	return EXIT_SUCCESS;
    329 }