# bigData/zad26.py (repository: woycik/bigData)
import collections
import random

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import chi2 as chi2_dist
from scipy.stats import chisquare


class ReservoirSampler:
    """Single-slot reservoir sampler over a stream of items.

    At most one ``(item, global_index)`` pair is stored. The n-th item
    offered replaces the stored pair with probability ``1/n``, so after
    n offers each of them is the stored one with equal probability.
    """

    def __init__(self):
        # Number of items offered so far and the currently stored pair.
        self.count = 0
        self.sample = None

    def add(self, item, global_index):
        """Offer ``item`` (tagged with ``global_index``) to the reservoir.

        The first offer is always stored, because ``random.random()`` is
        strictly below 1.0.
        """
        self.count += 1
        keep_probability = 1.0 / self.count
        if random.random() >= keep_probability:
            # The new item lost the draw; keep the current pair.
            return
        self.sample = (item, global_index)

    def get_content(self):
        """Return the stored ``(item, global_index)`` pair, or ``None`` if empty."""
        return self.sample

class EquivalentWidthPartitions:
    """Sample one element from a sliding window using fixed-width buckets.

    The stream is cut into consecutive buckets of ``window_size`` items.
    A reservoir sampler is kept for the most recently completed bucket
    (``bucket_A``) and for the bucket currently being filled
    (``bucket_C``).
    """

    def __init__(self, window_size):
        self.W = window_size
        # Number of items processed; also the index given to the next item.
        self.t = 0

        self.bucket_A = None
        self.bucket_C = ReservoirSampler()

    def process(self, item):
        """Feed the next stream item into the bucket being filled."""
        self.bucket_C.add(item, self.t)
        self.t += 1

        if self.bucket_C.count < self.W:
            return
        # The bucket is full: it becomes the completed bucket and a fresh,
        # empty sampler starts collecting the next one.
        self.bucket_A, self.bucket_C = self.bucket_C, ReservoirSampler()

    def get_window_sample(self):
        """Return an ``(item, index)`` pair from the last ``W`` items.

        Before ``W`` items have been seen, the sample of the partial
        bucket is returned (``None`` when nothing was processed yet).
        Afterwards the completed bucket's sample is returned while its
        index is still inside the window; otherwise the sample of the
        bucket being filled is returned.
        """
        if self.t < self.W:
            return self.bucket_C.get_content()

        # The window covers indices [t - W, t - 1].
        oldest_live_index = self.t - self.W

        stored = self.bucket_A.get_content() if self.bucket_A else None
        if stored is not None and stored[1] >= oldest_live_index:
            return stored

        return self.bucket_C.get_content()


# Experiment 1: empirical distribution of the sampled element's position
# inside the sliding window, shown as a normalized histogram.
W_hist = 5
N_hist = 10000
sampler = EquivalentWidthPartitions(W_hist)
positions = []

for i in range(N_hist):
    sampler.process(i)

    # Samples are only recorded once more than W_hist items were processed.
    if i < W_hist:
        continue

    res = sampler.get_window_sample()
    if res is None:
        continue

    # The i-th processed item is stored with index i, so the difference is
    # the age of the sampled element (0 = the newest item).
    idx_sampled = res[1]
    relative_pos = i - idx_sampled
    positions.append(relative_pos)

plt.figure(figsize=(10, 6))
# One unit-width bin per possible position 0 .. W_hist - 1.
plt.hist(positions, bins=range(W_hist + 1), density=True, edgecolor='black', label='Symulacja')
plt.xlabel("Pozycja w oknie")
plt.ylabel("Gęstość")
plt.title(f"Histogram ppb pozycji wylosowanego elementu \nWindow={W_hist}, Iteracje={N_hist}")
plt.grid(axis='y', alpha=0.3)
plt.legend()
plt.show()

# Experiment 2: chi-square goodness-of-fit test of the sampled position
# against the uniform distribution over the W_test window positions.
W_test = 5
N_test = 10000
sampler_test = EquivalentWidthPartitions(W_test)
observed_counts = collections.defaultdict(int)
valid_samples = 0

for i in range(N_test):
    sampler_test.process(i)

    if i >= W_test:
        res = sampler_test.get_window_sample()
        if res is not None:
            idx_sampled = res[1]
            # Age of the sampled element inside the window (0 = newest).
            pos = i - idx_sampled

            if 0 <= pos < W_test:
                observed_counts[pos] += 1
                valid_samples += 1

# Observed and expected counts per position; H0 is the uniform distribution.
obs = [observed_counts[j] for j in range(W_test)]
exp = [valid_samples / W_test] * W_test

chi2, p_val = chisquare(obs, exp)

# The critical value must match the number of categories: W_test categories
# give W_test - 1 degrees of freedom. The previous constant 11.345 is the
# 0.99 quantile for 3 degrees of freedom, which is too small for 5 positions.
alpha = 0.01
critical_val = chi2_dist.ppf(1 - alpha, df=W_test - 1)

# NOTE(review): the sampler can return the same stored sample on several
# consecutive steps, so the observations are not independent draws; confirm
# that this is acceptable for the chi-square test.

print("\nWynik testu:")
print(f"Chi2 stat: {chi2:.4f}")
print(f"p-value:   {p_val:.4f}")
print(f"Wartość krytyczna: {critical_val:.3f}")

if chi2 < critical_val:
    print("Nie ma podstaw do odrzucenia H0 - rozkład jest jednostajny")
else:
    print("Rozkład nie jest jednostajny.")