bigData/zad34.py at main · woycik/bigData · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import random
import matplotlib.pyplot as plt
import numpy as np
import hashlib

class EstimateUniqueWithKStatistics:
    def __init__(self, k: int):
        self.k = k
        self.smallest_hashes = []

        self.p = 2**61 - 1
        self.a = random.randint(1, self.p - 1)
        self.b = random.randint(0, self.p - 1)

    def _hash(self, x: int) -> float:
        h_int = (self.a * x + self.b) % self.p
        return h_int / self.p

    def add(self, item):
        h_val = self._hash(item)

        if len(self.smallest_hashes) == self.k and h_val >= self.smallest_hashes[-1]:
            return

        if h_val not in self.smallest_hashes:
            self.smallest_hashes.append(h_val)
            self.smallest_hashes.sort()

            if len(self.smallest_hashes) > self.k:
                self.smallest_hashes.pop()

    def smallest_hashes_len(self):
        return len(self.smallest_hashes)

    def estimate(self) -> int:
        if len(self.smallest_hashes) < self.k:
            return len(self.smallest_hashes)

        v_k = self.smallest_hashes[-1]
        if v_k == 0: return 0

        return int((self.k - 1) / v_k)

class KSplit:
    def  __init__(self, k):
        self.k = k
        self.L0_kStatistics = EstimateUniqueWithKStatistics(k/2)
        self.L1_kStatistics = EstimateUniqueWithKStatistics(k/2)

    def add(self, item):
        b = random.randint(0,1)

        if b == 0 :
            self.L0_kStatistics.add(item)
        else:
            self.L1_kStatistics.add(item)

    def estimate(self):
        l0_smallest = self.L0_kStatistics.smallest_hashes_len() if self.L0_kStatistics.smallest_hashes_len() == 0 else self.L0_kStatistics.smallest_hashes[-1]
        l1_smallest = self.L1_kStatistics.smallest_hashes_len() if self.L1_kStatistics.smallest_hashes_len() == 0 else self.L1_kStatistics.smallest_hashes[-1]

        return 2 * self.k / (l0_smallest + l1_smallest)


k = 400
true_n_limit = 20000

euk = KSplit(k)

true_counts = []
estimated_counts = []
relative_errors = []

stream = range(true_n_limit)

for i, item in enumerate(stream):
    euk.add(item)

    current_n = i + 1
    est = euk.estimate()

    true_counts.append(current_n)
    estimated_counts.append(est)

    err = (est - current_n) / current_n
    relative_errors.append(err)


fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 10))

ax1.plot(true_counts, true_counts, 'g-', label='Rzeczywista liczność', linewidth=2)
ax1.plot(true_counts, estimated_counts, 'b--', label='Estymacja', alpha=0.8)
ax1.set_title(f'Dokładność estymacji')
ax1.set_ylabel('Liczba unikalnych elementów')
ax1.legend()
ax1.grid(True)

ax2.plot(true_counts, relative_errors, 'r.-')
ax2.axhline(0, color='black', lw=1)

ax2.set_title('Błąd względny estymacji')
ax2.set_xlabel('Liczba przetworzonych elementów')
ax2.set_ylabel('Błąd')
ax2.legend()
ax2.grid(True)

plt.tight_layout()
plt.show()