-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmanual_labeller.py
More file actions
80 lines (67 loc) · 2.5 KB
/
manual_labeller.py
File metadata and controls
80 lines (67 loc) · 2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import os
import textwrap
from sqlalchemy import func
__author__ = 'jamesgin'
from features import *
from model import *
# Get a list of all unlabelled questions and display the first n closest matches for each.
# If any of them is correct, accept the answer at the prompt to mark the question as done.
# Also display whether the question is currently correct given a very basic cosine distance method!
def get_all_unlabelled():
    """Build the query for every Question that has no related clause yet.

    Returns the result of ``session.query(Question).filter(...)`` without
    executing or materialising it here; the caller iterates over it.
    """
    # `== None` (rather than `is None`) is deliberate: the comparison is
    # handed to the ORM filter instead of being evaluated as a Python bool.
    has_no_clause = Question.related_clause == None  # noqa: E711
    unlabelled_query = session.query(Question).filter(has_no_clause)
    return unlabelled_query
def get_nearest_neighbours(nn, question_vec, ids):
    """Look up the RawClause rows closest to a question vector.

    nn           -- fitted neighbour index exposing ``kneighbors``.
    question_vec -- vector(s) to query; only the first result row is used.
    ids          -- array mapping row positions in the fitted data to clause
                    ids (it is fancy-indexed with the neighbour positions).

    Returns the ``session.query(RawClause).filter(...)`` object restricted to
    the neighbouring ids. No ordering is applied here, so the rows are not
    guaranteed to come back in nearest-first order.
    """
    neighbour_positions = nn.kneighbors(question_vec, return_distance=False)
    # Translate row positions into clause ids, then keep the first query row.
    first_row_ids = ids[neighbour_positions][0].tolist()
    id_is_neighbour = RawClause.id.in_(first_row_ids)
    return session.query(RawClause).filter(id_is_neighbour)
def get_mentions(txt):
    """Return the ids of every RawClause whose ``cleaned`` text contains txt.

    The match is a case-insensitive substring test (``ilike`` with ``%``
    on both sides). ``txt`` is inserted into the pattern unescaped, so any
    ``%`` or ``_`` it contains acts as a wildcard.

    Returns a plain list of clause ids (empty when nothing matches).
    """
    pattern = '%{}%'.format(txt)
    # Only the id column is selected: the caller never uses the rest of the
    # row, so there is no reason to load every matching clause's full text.
    rows = session.query(RawClause.id).filter(RawClause.cleaned.ilike(pattern)).all()
    return [clause_id for (clause_id,) in rows]
def print_clauses(n_clauses, highlight):
    """Print each clause as a ruled header line followed by its wrapped text.

    n_clauses -- iterable of objects with ``id``, ``cleaned`` and ``header``.
    highlight -- substring looked for in the lower-cased ``cleaned`` text; the
                 header line shows True/False for whether it was found, so it
                 should itself be lower case to have any chance of matching.
    """
    rule = '-' * 160
    for clause in n_clauses:
        is_mentioned = highlight in clause.cleaned.lower()
        print(rule)
        print(clause.id, is_mentioned, clause.header)
        print(rule)
        # Wrap the body to the same width as the rule.
        wrapped_body = textwrap.fill(clause.cleaned, 160)
        print(wrapped_body)
if __name__ == '__main__':
    # Interactive labelling loop. For every unlabelled question:
    #   1. show the 5 clauses nearest to the question text (cosine distance),
    #   2. ask which one is correct,
    #   3. store the chosen clause id on the question and commit.
    #
    # Accepted answers at the prompt:
    #   0                -> pass; the question is stored with PASS_CLAUSE_ID
    #   1..4             -> 0-based position in the list shown above (position
    #                       0 cannot be picked this way because 0 means "pass";
    #                       type that clause's id instead)
    #   5 or more        -> taken literally as a clause id
    #   text[,highlight] -> search clauses mentioning `text`, show the 10
    #                       nearest of those, then ask again

    # NOTE(review): 54488 looks like a placeholder clause id meaning
    # "passed / no matching clause" -- confirm against the data.
    PASS_CLAUSE_ID = 54488
    NUM_NEIGHBOURS = 5

    X, y, tfidf = generate_clause_set(get_clause_id)

    # X is never modified below, so the full-corpus index is fitted once
    # instead of once per question.
    neigh = NearestNeighbors(n_neighbors=NUM_NEIGHBOURS, algorithm='brute', metric='cosine')
    neigh.fit(X)

    unlabelled = get_all_unlabelled()
    for q in unlabelled:
        os.system('clear')
        correct = q.get_correct()
        q_vec = tfidf.transform([q.text()])
        # Materialise the result so the order shown on screen is exactly the
        # order used when the answer is interpreted as a position.
        n_clauses = list(get_nearest_neighbours(neigh, q_vec, y))
        print_clauses(n_clauses, correct.lower())
        print('-'*160)
        print(q.body)
        print(correct)
        print('-'*160)
        answer = raw_input('Are any correct?')

        if not answer.isdigit():
            parts = answer.split(',')
            search_term = parts[0]
            # Without an explicit highlight, fall back to the search term.
            highlight = parts[1] if len(parts) > 1 else search_term
            mentioned = get_mentions(search_term)
            mask = np.in1d(y, mentioned)
            subset_neigh = NearestNeighbors(n_neighbors=10, algorithm='brute', metric='cosine')
            try:
                subset_neigh.fit(X[mask, :])
                nn_clauses = get_nearest_neighbours(subset_neigh, q_vec, y[mask])
                # print_clauses compares against lower-cased clause text.
                print_clauses(nn_clauses, highlight.lower())
            except Exception:
                # Best effort: e.g. too few matching clauses to fit or query
                # the index. Ctrl-C and SystemExit are no longer swallowed.
                print('Nothing')
            answer = raw_input('Are any correct?')

        if not answer.isdigit():
            # Previously this crashed in int(); skip the question instead so
            # it stays unlabelled and is offered again on the next run.
            print('Not a number, leaving question unlabelled')
            continue

        choice = int(answer)
        if choice == 0:
            print('Passed')
            q.related_clause = PASS_CLAUSE_ID
            session.commit()
            continue

        if choice < NUM_NEIGHBOURS:
            if choice >= len(n_clauses):
                print('No clause at position {}, leaving question unlabelled'.format(choice))
                continue
            clause_id = n_clauses[choice].id
        else:
            clause_id = choice

        q.related_clause = clause_id
        session.commit()
        print(clause_id)