-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathzad34.py
More file actions
108 lines (77 loc) · 2.91 KB
/
zad34.py
File metadata and controls
108 lines (77 loc) · 2.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import random
import matplotlib.pyplot as plt
import numpy as np
import hashlib
class EstimateUniqueWithKStatistics:
def __init__(self, k: int):
self.k = k
self.smallest_hashes = []
self.p = 2**61 - 1
self.a = random.randint(1, self.p - 1)
self.b = random.randint(0, self.p - 1)
def _hash(self, x: int) -> float:
h_int = (self.a * x + self.b) % self.p
return h_int / self.p
def add(self, item):
h_val = self._hash(item)
if len(self.smallest_hashes) == self.k and h_val >= self.smallest_hashes[-1]:
return
if h_val not in self.smallest_hashes:
self.smallest_hashes.append(h_val)
self.smallest_hashes.sort()
if len(self.smallest_hashes) > self.k:
self.smallest_hashes.pop()
def smallest_hashes_len(self):
return len(self.smallest_hashes)
def estimate(self) -> int:
if len(self.smallest_hashes) < self.k:
return len(self.smallest_hashes)
v_k = self.smallest_hashes[-1]
if v_k == 0: return 0
return int((self.k - 1) / v_k)
class KSplit:
def __init__(self, k):
self.k = k
self.L0_kStatistics = EstimateUniqueWithKStatistics(k/2)
self.L1_kStatistics = EstimateUniqueWithKStatistics(k/2)
def add(self, item):
b = random.randint(0,1)
if b == 0 :
self.L0_kStatistics.add(item)
else:
self.L1_kStatistics.add(item)
def estimate(self):
l0_smallest = self.L0_kStatistics.smallest_hashes_len() if self.L0_kStatistics.smallest_hashes_len() == 0 else self.L0_kStatistics.smallest_hashes[-1]
l1_smallest = self.L1_kStatistics.smallest_hashes_len() if self.L1_kStatistics.smallest_hashes_len() == 0 else self.L1_kStatistics.smallest_hashes[-1]
return 2 * self.k / (l0_smallest + l1_smallest)
k = 400
true_n_limit = 20000
euk = KSplit(k)
true_counts = []
estimated_counts = []
relative_errors = []
stream = range(true_n_limit)
for i, item in enumerate(stream):
euk.add(item)
current_n = i + 1
est = euk.estimate()
true_counts.append(current_n)
estimated_counts.append(est)
err = (est - current_n) / current_n
relative_errors.append(err)
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 10))
ax1.plot(true_counts, true_counts, 'g-', label='Rzeczywista liczność', linewidth=2)
ax1.plot(true_counts, estimated_counts, 'b--', label='Estymacja', alpha=0.8)
ax1.set_title(f'Dokładność estymacji')
ax1.set_ylabel('Liczba unikalnych elementów')
ax1.legend()
ax1.grid(True)
ax2.plot(true_counts, relative_errors, 'r.-')
ax2.axhline(0, color='black', lw=1)
ax2.set_title('Błąd względny estymacji')
ax2.set_xlabel('Liczba przetworzonych elementów')
ax2.set_ylabel('Błąd')
ax2.legend()
ax2.grid(True)
plt.tight_layout()
plt.show()