-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathdemo.py
More file actions
100 lines (65 loc) · 2.62 KB
/
demo.py
File metadata and controls
100 lines (65 loc) · 2.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import vminhash as vmh
import numpy as np
# Run a vey minimal test
strings = [
'This is a test string',
'This is also a test string',
'This is something else',
'A fairly different string',
'completely unrelated text'
]
min_hash = vmh.VectorizedMinHash()
f = [min_hash.fingerprint(vmh.token_hashes(s.split(),n=1)) for s in strings]
vmh.jaccard(f[0],f[-1])
vmh.jaccard_matrix(f)
vmh.jaccard_cluster(f,threshold=0.85)
list(vmh.jaccard_similarities(f[:3],f[2:]))
list(vmh.jaccard_match(f[:3],f[2:]))
query_fingerprints = f[:3]
key_fingerprints = f[3:]
keys = np.array(range(len(key_fingerprints)))
for jaccard in jaccard_similarities(query_fingerprints,key_fingerprints,cuda=cuda):
yield keys[jaccard == jaccard.max()]
"""
Cuda speed test for clustering a large number of strings
"""
strings = strings*20000
f = [min_hash.fingerprint(vmh.token_hashes(s.split(),n=1)) for s in strings]
cluster_ids_cpu = vmh.jaccard_cluster(f,threshold=0.85,cuda=False)
cluster_ids_cuda = vmh.jaccard_cluster(f,threshold=0.85,cuda=True)
"""
Test the accuracy of the jaccard estimation on (moderately) large sets
"""
import vminhash as vmh
import numpy as np
import pandas as pd
import matplotlib as plt
n_tokens = 10000
results = []
for mirror in False,True:
for i in range(1001):
overlap = i/1000
n_intersect = int(n_tokens*overlap)
n_union = 2*n_tokens - n_intersect
# base_tokens = np.random.randint(0,t_max,n_union)
t0 = np.random.randint(0,2**30)
base_tokens = list(range(t0,t0+n_union))
doc_tokens = [
base_tokens[:n_tokens],
base_tokens[-n_tokens:]
]
strings = [
' '.join(str(t) for t in base_tokens[:n_tokens]),
' '.join(str(t) for t in base_tokens[-n_tokens:])
]
true_jaccard = n_intersect/n_union
min_hash = vmh.VectorizedMinHash(n_perm=64,mirror=mirror)
f = [min_hash.fingerprint(vmh.token_hashes(s.split(),n=1)) for s in strings]
# f = [min_hash.fingerprint(tokens) for tokens in doc_tokens]
est_jaccard = vmh.jaccard(f[0],f[1])
results.append({'true_jaccard':true_jaccard,'est_jaccard':est_jaccard,'mirror':mirror})
results_df = pd.DataFrame(results)
results_df['error'] = (results_df['est_jaccard'] - results_df['true_jaccard'])**2
results_df.groupby('mirror')['error'].mean()
results_df[~results_df['mirror']].plot.scatter(x='true_jaccard',y='est_jaccard',s=1,label='mirror=False')
results_df[results_df['mirror']].plot.scatter(x='true_jaccard',y='est_jaccard',s=1,label='mirror=True')