See http://en.wikipedia.org/wiki/HyperLogLog
/*
* File: examples/rnd.c
*/
#include <stdio.h>
#include <stdlib.h>
#include "../src/hll.h"
int main(int argc, char *argv[]) {
long i;
struct HLL hll;
if(hll_init(&hll, 16) == -1) {
perror("hll_init");
exit(1);
}
for(i = 0; i < 100000000; i++) {
long r = random() % 1000000;
hll_add(&hll, &r, sizeof(r));
}
printf("Estimate: %f\n", hll_count(&hll));
hll_destroy(&hll);
return 0;
}% cc -O2 -L lib -l hyperloglog examples/rnd.c -o examples/rnd
% time examples/rnd
Estimate: 994154.464585
examples/rnd 2,86s user 0,01s system 99% cpu 2,875 total
| bits | size (bytes) | standard error |
|---|---|---|
| 4 | 16 | 26.00% |
| 5 | 32 | 18.38% |
| 6 | 64 | 13.00% |
| 7 | 128 | 9.19% |
| 8 | 256 | 6.50% |
| 9 | 512 | 4.60% |
| 10 | 1024 | 3.25% |
| 11 | 2048 | 2.30% |
| 12 | 4096 | 1.62% |
| 13 | 8192 | 1.15% |
| 14 | 16384 | 0.81% |
| 15 | 32768 | 0.57% |
| 16 | 65536 | 0.41% |
| 17 | 131072 | 0.29% |
| 18 | 262144 | 0.20% |
| 19 | 524288 | 0.14% |
| 20 | 1048576 | 0.10% |
struct HLL {
uint8_t bits;
/* registers buffer */
uint8_t *registers;
/* registers buffer size */
size_t size;
};Initialize a storage
int hll_init(struct HLL *hll, uint8_t bits);Add key to set
void hll_add(struct HLL *hll, const void *buf, size_t size);Get estimated set size
double hll_count(const struct HLL *hll);Merge storages
int hll_merge(struct HLL *dst, const struct HLL *src);Cleanup
void hll_destroy(struct HLL *hll);оптимизировать размер хранилища: при bits >= 4 && bits <= 16 достаточно
5 бит на регистр, вместо нынешних 8, получим экономию в 1.5+ раза. Для
bits >= 17 && bits <= 20 достаточно всего 4 бит на регистр, то есть
экономия в 2 раза.
| bits | regs count | bpr | bytes | saving | error |
|---|---|---|---|---|---|
| 4 | 16 | 5 | 10 | 37.50% | 26.00% |
| 5 | 32 | 5 | 20 | 37.50% | 18.38% |
| 6 | 64 | 5 | 40 | 37.50% | 13.00% |
| 7 | 128 | 5 | 80 | 37.50% | 9.19% |
| 8 | 256 | 5 | 160 | 37.50% | 6.50% |
| 9 | 512 | 5 | 320 | 37.50% | 4.60% |
| 10 | 1024 | 5 | 640 | 37.50% | 3.25% |
| 11 | 2048 | 5 | 1280 | 37.50% | 2.30% |
| 12 | 4096 | 5 | 2560 | 37.50% | 1.62% |
| 13 | 8192 | 5 | 5120 | 37.50% | 1.15% |
| 14 | 16384 | 5 | 10240 | 37.50% | 0.81% |
| 15 | 32768 | 5 | 20480 | 37.50% | 0.57% |
| 16 | 65536 | 5 | 40960 | 37.50% | 0.41% |
| 17 | 131072 | 4 | 65536 | 50.00% | 0.29% |
| 18 | 262144 | 4 | 131072 | 50.00% | 0.20% |
| 19 | 524288 | 4 | 262144 | 50.00% | 0.14% |
| 20 | 1048576 | 4 | 524288 | 50.00% | 0.10% |