logswan

Fast Web log analyzer using probabilistic data structures
Log | Files | Refs | README | LICENSE

hll.c (2631B)


      1 #include <stdlib.h>
      2 #include <errno.h>
      3 #include <math.h>
      4 #include <string.h>
      5 
      6 #include <stdio.h>
      7 
      8 #include "../MurmurHash3/MurmurHash3.h"
      9 #include "hll.h"
     10 
     11 static __inline uint8_t _hll_rank(uint32_t hash, uint8_t bits) {
     12 	uint8_t i;
     13 
     14 	for(i = 1; i <= 32 - bits; i++) {
     15 		if(hash & 1)
     16 			break;
     17 
     18 		hash >>= 1;
     19 	}
     20 
     21 	return i;
     22 }
     23 
     24 int hll_init(struct HLL *hll, uint8_t bits) {
     25 	if(bits < 4 || bits > 20) {
     26 		errno = ERANGE;
     27 		return -1;
     28 	}
     29 
     30 	hll->bits = bits;
     31 	hll->size = (size_t)1 << bits;
     32 	hll->registers = calloc(hll->size, 1);
     33 
     34 	return 0;
     35 }
     36 
     37 void hll_destroy(struct HLL *hll) {
     38 	free(hll->registers);
     39 
     40 	hll->registers = NULL;
     41 }
     42 
     43 static __inline void _hll_add_hash(struct HLL *hll, uint32_t hash) {
     44 	uint32_t index = hash >> (32 - hll->bits);
     45 	uint8_t rank = _hll_rank(hash, hll->bits);
     46 
     47 	if(rank > hll->registers[index]) {
     48 		hll->registers[index] = rank;
     49 	}
     50 }
     51 
     52 void hll_add(struct HLL *hll, const void *buf, size_t size) {
     53 	uint32_t hash = MurmurHash3_x86_32((const char *)buf, (uint32_t)size, 0x5f61767a);
     54 
     55 	_hll_add_hash(hll, hash);
     56 }
     57 
     58 double hll_count(const struct HLL *hll) {
     59 	double alpha_mm;
     60 	uint32_t i;
     61 
     62 	switch (hll->bits) {
     63 		case 4:
     64 			alpha_mm = 0.673;
     65 		break;
     66 		case 5:
     67 			alpha_mm = 0.697;
     68 		break;
     69 		case 6:
     70 			alpha_mm = 0.709;
     71 		break;
     72 		default:
     73 			alpha_mm = 0.7213 / (1.0 + 1.079 / (double)hll->size);
     74 		break;
     75 	}
     76 
     77 	alpha_mm *= ((double)hll->size * (double)hll->size);
     78 
     79 	double sum = 0;
     80 	for(i = 0; i < hll->size; i++) {
     81 		sum += 1.0 / (1 << hll->registers[i]);
     82 	}
     83 
     84 	double estimate = alpha_mm / sum;
     85 
     86 	if (estimate <= 5.0 / 2.0 * (double)hll->size) {
     87 		int zeros = 0;
     88 
     89 		for(i = 0; i < hll->size; i++)
     90 			zeros += (hll->registers[i] == 0);
     91 
     92 		if(zeros)
     93 			estimate = (double)hll->size * log((double)hll->size / zeros);
     94 
     95 	} else if (estimate > (1.0 / 30.0) * 4294967296.0) {
     96 		estimate = -4294967296.0 * log(1.0 - (estimate / 4294967296.0));
     97 	}
     98 
     99 	return estimate;
    100 }
    101 
    102 int hll_merge(struct HLL *dst, const struct HLL *src) {
    103 	uint32_t i;
    104 
    105 	if(dst->bits != src->bits) {
    106 		errno = EINVAL;
    107 		return -1;
    108 	}
    109 
    110 	for(i = 0; i < dst->size; i++) {
    111 		if(src->registers[i] > dst->registers[i])
    112 			dst->registers[i] = src->registers[i];
    113 	}
    114 
    115 	return 0;
    116 }
    117 
    118 int hll_load(struct HLL *hll, const void *registers, size_t size) {
    119 	uint8_t bits = 0;
    120 	size_t s = size;
    121 
    122 	while(s) {
    123 		if(s & 1)
    124 			break;
    125 
    126 		bits++;
    127 
    128 		s >>= 1;
    129 	}
    130 
    131 	if(!bits || ((size_t)1 << bits) != size) {
    132 		errno = EINVAL;
    133 		return -1;
    134 	}
    135 
    136 	if(hll_init(hll, bits) == -1)
    137 		return -1;
    138 
    139 	memcpy(hll->registers, registers, size);
    140 
    141 	return 0;
    142 }
    143 
    144 extern uint32_t _hll_hash(const struct HLL *hll) {
    145 	return MurmurHash3_x86_32(hll->registers, (uint32_t)hll->size, 0);
    146 }