-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathzad47.py
More file actions
54 lines (39 loc) · 1.57 KB
/
zad47.py
File metadata and controls
54 lines (39 loc) · 1.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import pandas as pd
import mmh3
from sklearn.cluster import KMeans
import numpy as np
# MinHash-based similarity estimation and KMeans clustering of the scenes
# found in the input text (every chunk that follows a "Scene " marker).

INPUT_PATH = "Szekspir/RomeoJuliet.txt"
SHINGLE_SIZE = 3   # number of consecutive words per shingle
NUM_HASHES = 100   # signature length; one murmur3 seed per position
NUM_CLUSTERS = 3   # upper bound on the number of KMeans clusters


def build_shingles(scene, size=SHINGLE_SIZE):
    """Return the set of word ``size``-grams (shingles) of ``scene``.

    Words are split on whitespace. A scene with fewer than ``size`` words
    yields a single shingle holding the raw scene text, so the result is
    never empty.
    """
    tokens = scene.split()
    if len(tokens) < size:
        return {scene}
    # A set is enough: duplicate shingles cannot change a minimum.
    return {
        " ".join(tokens[start:start + size])
        for start in range(len(tokens) - size + 1)
    }


def minhash_signature(shingles, num_hashes=NUM_HASHES):
    """Return the MinHash signature of a non-empty shingle collection.

    Position ``seed`` of the signature is the smallest ``mmh3.hash`` value
    over all shingles when hashing with that seed.
    """
    return [
        min(mmh3.hash(shingle, seed) for shingle in shingles)
        for seed in range(num_hashes)
    ]


def similarity_matrix(signatures):
    """Return the symmetric matrix of estimated similarities.

    Entry ``[i][j]`` is the fraction of positions at which signatures ``i``
    and ``j`` agree. The diagonal is 1.0 and both triangles are filled, so
    that ``1 - matrix`` is a proper distance matrix (zero self-distance).
    """
    count = len(signatures)
    matrix = [[0.0] * count for _ in range(count)]
    for i, sig_i in enumerate(signatures):
        matrix[i][i] = 1.0
        for j in range(i + 1, count):
            matches = sum(a == b for a, b in zip(sig_i, signatures[j]))
            # Guard against empty signatures instead of dividing by zero.
            score = matches / len(sig_i) if sig_i else 0.0
            matrix[i][j] = matrix[j][i] = score
    return matrix


def similarity_extremes(matrix):
    """Return ``(max, min)`` over all distinct pairs ``i < j``.

    The diagonal is skipped because self-similarity would always win the
    maximum. Returns ``None`` when there are fewer than two rows.
    """
    size = len(matrix)
    pair_scores = [
        matrix[i][j] for i in range(size) for j in range(i + 1, size)
    ]
    if not pair_scores:
        return None
    return max(pair_scores), min(pair_scores)


def group_by_label(labels, num_clusters):
    """Map every cluster id in ``range(num_clusters)`` to its item indices."""
    clusters = {cluster_id: [] for cluster_id in range(num_clusters)}
    for index, label in enumerate(labels):
        # int() normalises NumPy integer labels to plain dictionary keys.
        clusters[int(label)].append(index)
    return clusters


def main():
    """Read the play, report pairwise scene similarities and cluster scenes."""
    # errors="replace" keeps a stray non-UTF-8 byte from aborting the run.
    with open(INPUT_PATH, encoding="utf-8", errors="replace") as source:
        file_content = source.read()
    # Echo the raw text ahead of the analysis output.
    print(file_content)

    # Everything before the first "Scene " marker is front matter.
    scenes = file_content.split("Scene ")[1:]
    if not scenes:
        raise SystemExit(f"No 'Scene ' markers found in {INPUT_PATH}")

    signatures = [minhash_signature(build_shingles(scene)) for scene in scenes]
    similarity_scores = similarity_matrix(signatures)
    for i in range(len(scenes)):
        for j in range(i + 1, len(scenes)):
            print(f"Similarity between scene {i} and scene {j}: {similarity_scores[i][j]:.4f}")

    extremes = similarity_extremes(similarity_scores)
    if extremes is not None:
        max_value, min_value = extremes
        print(f"Max similarity: {max_value}")
        print(f"Min similarity: {min_value}")

    # KMeans cannot form more clusters than there are samples.
    num_clusters = min(NUM_CLUSTERS, len(scenes))
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    # Each row of (1 - similarity) is a scene's distance profile.
    kmeans.fit(1 - np.array(similarity_scores))

    clusters = group_by_label(kmeans.labels_, num_clusters)
    for cluster_id, scene_indices in clusters.items():
        print(f"Cluster {cluster_id}")
        print(f" Indices: {scene_indices}")
        print("-" * 20)


if __name__ == "__main__":
    main()