logswan

Fast Web log analyzer using probabilistic data structures
Log | Files | Refs | README | LICENSE

logswan.c (7241B)


      1 /*
      2  * Logswan 2.1.10
      3  * Copyright (c) 2015-2021, Frederic Cambus
      4  * https://www.logswan.org
      5  *
      6  * Created:      2015-05-31
      7  * Last Updated: 2021-02-15
      8  *
      9  * Logswan is released under the BSD 2-Clause license.
     10  * See LICENSE file for details.
     11  */
     12 
     13 #include <sys/socket.h>
     14 #include <sys/stat.h>
     15 #include <sys/time.h>
     16 #include <arpa/inet.h>
     17 #include <err.h>
     18 #include <getopt.h>
     19 #include <inttypes.h>
     20 #include <netinet/in.h>
     21 #include <stdbool.h>
     22 #include <stdlib.h>
     23 #include <stdint.h>
     24 #include <stdio.h>
     25 #include <string.h>
     26 #include <time.h>
     27 
     28 #ifdef HAVE_SECCOMP
     29 #include <sys/prctl.h>
     30 #include <linux/seccomp.h>
     31 #include "seccomp.h"
     32 #endif
     33 
     34 #include <maxminddb.h>
     35 
     36 #include "compat.h"
     37 #include "config.h"
     38 #include "continents.h"
     39 #include "countries.h"
     40 #include "hll.h"
     41 #include "output.h"
     42 #include "parse.h"
     43 
     44 static void
     45 usage()
     46 {
     47 	printf("logswan [-ghv] [-d db] logfile\n\n" \
     48 	    "The options are as follows:\n\n" \
     49 	    "	-d db	Specify path to a GeoIP database.\n" \
     50 	    "	-g	Enable GeoIP lookups.\n" \
     51 	    "	-h	Display usage.\n" \
     52 	    "	-v	Display version.\n");
     53 }
     54 
     55 int
     56 main(int argc, char *argv[])
     57 {
     58 	struct timespec begin, end, elapsed;
     59 	struct HLL unique_ipv4, unique_ipv6;
     60 	struct results results;
     61 	struct date parsed_date;
     62 	struct logline parsed_line;
     63 	struct request parsed_request;
     64 	struct stat logfile_stat;
     65 
     66 	struct sockaddr_in ipv4;
     67 	struct sockaddr_in6 ipv6;
     68 
     69 	uint64_t bandwidth;
     70 	uint32_t status_code;
     71 	uint32_t hour;
     72 	int gai_error, mmdb_error;
     73 	int opt;
     74 
     75 	const char *errstr;
     76 	char linebuffer[LINE_LENGTH_MAX];
     77 	char *input;
     78 	char *db = NULL;
     79 
     80 	bool geoip = false;
     81 	bool is_ipv4, is_ipv6;
     82 
     83 	MMDB_s geoip2;
     84 	MMDB_lookup_result_s lookup;
     85 
     86 	FILE *logfile;
     87 
     88 	if (pledge("stdio rpath", NULL) == -1) {
     89 		err(EXIT_FAILURE, "pledge");
     90 	}
     91 
     92 #ifdef HAVE_SECCOMP
     93 	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
     94 		perror("Can't initialize seccomp");
     95 		return EXIT_FAILURE;
     96 	}
     97 
     98 	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &logswan)) {
     99 		perror("Can't load seccomp filter");
    100 		return EXIT_FAILURE;
    101 	}
    102 #endif
    103 
    104 	hll_init(&unique_ipv4, HLL_BITS);
    105 	hll_init(&unique_ipv6, HLL_BITS);
    106 
    107 	while ((opt = getopt(argc, argv, "d:ghv")) != -1) {
    108 		switch (opt) {
    109 		case 'd':
    110 			db = optarg;
    111 			break;
    112 
    113 		case 'g':
    114 			geoip = true;
    115 			break;
    116 
    117 		case 'h':
    118 			usage();
    119 			return EXIT_SUCCESS;
    120 
    121 		case 'v':
    122 			printf("%s\n", VERSION);
    123 			return EXIT_SUCCESS;
    124 		}
    125 	}
    126 
    127 	if (optind < argc) {
    128 		input = argv[optind];
    129 	} else {
    130 		usage();
    131 		return EXIT_SUCCESS;
    132 	}
    133 
    134 	/* Starting timer */
    135 	clock_gettime(CLOCK_MONOTONIC, &begin);
    136 
    137 	/* Initializing GeoIP */
    138 	if (geoip) {
    139 		if (!db)
    140 			db = GEOIP2DIR GEOIP2DB;
    141 
    142 		if (MMDB_open(db, MMDB_MODE_MMAP, &geoip2) != MMDB_SUCCESS)
    143 			err(EXIT_FAILURE, "Can't open database (%s)", db);
    144 	}
    145 
    146 	/* Open log file */
    147 	if (!strcmp(input, "-")) {
    148 		/* Read from standard input */
    149 		logfile = stdin;
    150 	} else {
    151 		/* Attempt to read from file */
    152 		if (!(logfile = fopen(input, "r"))) {
    153 			perror("Can't open log file");
    154 			return EXIT_FAILURE;
    155 		}
    156 	}
    157 
    158 	/* Get log file size */
    159 	if (fstat(fileno(logfile), &logfile_stat)) {
    160 		perror("Can't stat log file");
    161 		return EXIT_FAILURE;
    162 	}
    163 
    164 	memset(&results, 0, sizeof(struct results));
    165 	results.file_name = input;
    166 	results.file_size = logfile_stat.st_size;
    167 
    168 	while (fgets(linebuffer, LINE_LENGTH_MAX, logfile)) {
    169 		/* Parse and tokenize line */
    170 		parse_line(&parsed_line, linebuffer);
    171 
    172 		/* Detect if remote host is IPv4 or IPv6 */
    173 		if (parsed_line.remote_host) { /* Do not feed NULL tokens to inet_pton */
    174 			if ((is_ipv4 = inet_pton(AF_INET, parsed_line.remote_host, &ipv4.sin_addr))) {
    175 				is_ipv6 = false;
    176 			} else {
    177 				is_ipv6 = inet_pton(AF_INET6, parsed_line.remote_host, &ipv6.sin6_addr);
    178 
    179 				if (!is_ipv6) {
    180 					results.invalid_lines++;
    181 					continue;
    182 				}
    183 			}
    184 		} else {
    185 			/* Invalid line */
    186 			results.invalid_lines++;
    187 			continue;
    188 		}
    189 
    190 		if (is_ipv4) {
    191 			/* Increment hits counter */
    192 			results.hits_ipv4++;
    193 
    194 			/* Unique visitors */
    195 			hll_add(&unique_ipv4, parsed_line.remote_host, strlen(parsed_line.remote_host));
    196 		}
    197 
    198 		if (is_ipv6) {
    199 			/* Increment hits counter */
    200 			results.hits_ipv6++;
    201 
    202 			/* Unique visitors */
    203 			hll_add(&unique_ipv6, parsed_line.remote_host, strlen(parsed_line.remote_host));
    204 		}
    205 
    206 		if (geoip) {
    207 			MMDB_entry_data_s entry_data;
    208 			memset(&entry_data, 0, sizeof(MMDB_entry_data_s));
    209 
    210 			lookup = MMDB_lookup_string(&geoip2, parsed_line.remote_host, &gai_error, &mmdb_error);
    211 
    212 			MMDB_get_value(&lookup.entry, &entry_data, "country", "iso_code", NULL);
    213 
    214 			if (entry_data.has_data) {
    215 				/* Increment countries array */
    216 				for (size_t loop = 0; loop < COUNTRIES; loop++) {
    217 					if (!strncmp(countries_id[loop], entry_data.utf8_string, 2)) {
    218 						results.countries[loop]++;
    219 						break;
    220 					}
    221 				}
    222 			}
    223 
    224 			MMDB_get_value(&lookup.entry, &entry_data, "continent", "code", NULL);
    225 
    226 			if (entry_data.has_data) {
    227 				/* Increment continents array */
    228 				for (size_t loop = 0; loop < CONTINENTS; loop++) {
    229 					if (!strncmp(continents_id[loop], entry_data.utf8_string, 2)) {
    230 						results.continents[loop]++;
    231 						break;
    232 					}
    233 				}
    234 			}
    235 		}
    236 
    237 		/* Hourly distribution */
    238 		if (parsed_line.date) {
    239 			parse_date(&parsed_date, parsed_line.date);
    240 
    241 			if (parsed_date.hour) {
    242 				hour = strtonum(parsed_date.hour, 0, 23, &errstr);
    243 
    244 				if (!errstr) {
    245 					results.hours[hour]++;
    246 				}
    247 			}
    248 		}
    249 
    250 		/* Parse request */
    251 		if (parsed_line.request) {
    252 			parse_request(&parsed_request, parsed_line.request);
    253 
    254 			if (parsed_request.method) {
    255 				for (size_t loop = 0; loop < METHODS; loop++) {
    256 					if (!strcmp(methods_names[loop], parsed_request.method)) {
    257 						results.methods[loop]++;
    258 						break;
    259 					}
    260 				}
    261 			}
    262 
    263 			if (parsed_request.protocol) {
    264 				for (size_t loop = 0; loop < PROTOCOLS; loop++) {
    265 					if (!strcmp(protocols_names[loop], parsed_request.protocol)) {
    266 						results.protocols[loop]++;
    267 						break;
    268 					}
    269 				}
    270 			}
    271 		}
    272 
    273 		/* Count HTTP status codes occurrences */
    274 		if (parsed_line.status_code) {
    275 			status_code = strtonum(parsed_line.status_code, 0, STATUS_CODE_MAX-1, &errstr);
    276 
    277 			if (!errstr) {
    278 				results.status[status_code]++;
    279 			}
    280 		}
    281 
    282 		/* Increment bandwidth usage */
    283 		if (parsed_line.object_size) {
    284 			bandwidth = strtonum(parsed_line.object_size, 0, INT64_MAX, &errstr);
    285 
    286 			if (!errstr) {
    287 				results.bandwidth += bandwidth;
    288 			}
    289 		}
    290 	}
    291 
    292 	/* Counting hits and processed lines */
    293 	results.hits = results.hits_ipv4 + results.hits_ipv6;
    294 	results.processed_lines = results.hits + results.invalid_lines;
    295 
    296 	/* Counting unique visitors */
    297 	results.visits_ipv4 = hll_count(&unique_ipv4);
    298 	results.visits_ipv6 = hll_count(&unique_ipv6);
    299 	results.visits = results.visits_ipv4 + results.visits_ipv6;
    300 
    301 	/* Stopping timer */
    302 	clock_gettime(CLOCK_MONOTONIC, &end);
    303 
    304 	timespecsub(&end, &begin, &elapsed);
    305 	results.runtime = elapsed.tv_sec + elapsed.tv_nsec / 1E9;
    306 
    307 	/* Generate timestamp */
    308 	time_t now = time(NULL);
    309 	strftime(results.timestamp, 20, "%Y-%m-%d %H:%M:%S", localtime(&now));
    310 
    311 	/* Printing results */
    312 	fprintf(stdout, "%s\n", output(&results));
    313 	fprintf(stderr, "Processed %" PRIu64 " lines in %f seconds.\n", results.processed_lines, results.runtime);
    314 
    315 	/* Clean up */
    316 	fclose(logfile);
    317 
    318 	if (geoip)
    319 		MMDB_close(&geoip2);
    320 
    321 	hll_destroy(&unique_ipv4);
    322 	hll_destroy(&unique_ipv6);
    323 
    324 	return EXIT_SUCCESS;
    325 }