-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocessing.py
More file actions
133 lines (103 loc) · 5.02 KB
/
processing.py
File metadata and controls
133 lines (103 loc) · 5.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from sklearn.model_selection import train_test_split
import tensorflow as tf
import re
import unicodedata
import os
import numpy as np
from nltk.tokenize import word_tokenize
def preprocess_sentence(w):
    """Lowercase the sentence *w* and split it into NLTK word tokens."""
    lowered = w.lower()
    return word_tokenize(lowered)
def create_dataset(path, num_examples):
    """Read *path* and return up to *num_examples* cleaned token-list groups.

    Each line of the file holds sentences separated by the literal marker
    '$$--$$'; every sentence is lowercased and word-tokenized via
    preprocess_sentence.
    """
    # with-block guarantees the file handle is closed (the original leaked it)
    with open(path, encoding='UTF-8') as f:
        lines = f.read().strip().split('\n')
    return [[preprocess_sentence(w) for w in l.split('$$--$$')]
            for l in lines[:num_examples]]
class LanguageIndex():
    """Bidirectional word <-> index mapping built from a vocabulary list."""

    def __init__(self, vocab):
        self.word2idx = {}
        self.idx2word = {}
        self.set_index(vocab)

    def set_index(self, vocab):
        """Assign each vocab word its list position, then invert the mapping."""
        self.word2idx.update({token: position
                              for position, token in enumerate(vocab)})
        self.idx2word.update({position: token
                              for token, position in self.word2idx.items()})
def max_length(tensor):
    """Return the length of the longest sequence in *tensor*."""
    return max(map(len, tensor))
# accepts a string of any format, and returns its equivalent as a padded numpy array
# output: numpy array of length max_len
def sentence_to_idx(sentence, lang, max_len):
    """Tokenize *sentence*, map tokens to ids (falling back to <unk>), pad to *max_len*."""
    indices = []
    for tok in word_tokenize(sentence.lower()):
        # look the token up lazily so <unk> is only consulted on a miss
        indices.append(lang.word2idx[tok] if tok in lang.word2idx
                       else lang.word2idx['<unk>'])
    return tf.keras.preprocessing.sequence.pad_sequences(
        [indices], maxlen=max_len, padding='post')
# accepts a (padded) array of index tokens and returns the string equivalent, filtering out any padding
# output: string token format
def idx_to_sentence(sentence, lang):
    """Convert a (padded) sequence of index tokens back into a space-joined string.

    "<pad>" entries are dropped. An index missing from lang.idx2word raises
    KeyError, matching the original behavior.
    """
    words = [lang.idx2word[idx] for idx in sentence]
    # join is O(n); the original's repeated string concatenation was quadratic
    return ' '.join(w for w in words if w != "<pad>")
def load_dataset(path, num_examples, path_to_vocab):
    """Build padded encoder/decoder index tensors from a '$$--$$'-separated corpus.

    Returns (encoder_input, decoder_input, lang, max_seq_length) where
    encoder_input is a list of rows, each the concatenation of the two padded
    input sequences (length 2 * max_seq_length).
    """
    # creating cleaned input, output pairs; each entry is expected to be an
    # (inp1, inp2, targ) triple of token lists
    triples = create_dataset(path, num_examples)

    # index language from the vocabulary file; with-block closes the handle
    # (the original leaked it)
    with open(path_to_vocab, 'r') as f:
        vocab = [w.strip("\n") for w in f.readlines()]
    lang = LanguageIndex(vocab)

    def to_ids(tokens):
        # map each token to its index, falling back to <unk> on a miss
        return [lang.word2idx[s] if s in lang.word2idx
                else lang.word2idx['<unk>'] for s in tokens]

    # Vectorize the two input sentences and the target sentence
    input_tensor = [to_ids(inp1) for inp1, inp2, targ in triples]
    input_tensor2 = [to_ids(inp2) for inp1, inp2, targ in triples]
    decoder_input = [to_ids(targ) for inp1, inp2, targ in triples]

    # Append the end-of-sentence marker to every sequence
    end = lang.word2idx["<end>"]
    for tensor in (input_tensor, input_tensor2, decoder_input):
        for s in tensor:
            s.append(end)

    # Pad to the longest sentence seen in the first input / target sets.
    # NOTE(review): input_tensor2 is not consulted here, so a longer second
    # input gets truncated by pad_sequences — confirm this is intended.
    max_seq_length = max(max_length(input_tensor), max_length(decoder_input))

    pad = tf.keras.preprocessing.sequence.pad_sequences
    encoder_input = pad(input_tensor, maxlen=max_seq_length, padding='post')
    encoder_input2 = pad(input_tensor2, maxlen=max_seq_length, padding='post')
    decoder_input = pad(decoder_input, maxlen=max_seq_length, padding='post')

    # Each encoder row is input1 followed by input2
    encoder_input = [np.concatenate((x, y))
                     for x, y in zip(encoder_input, encoder_input2)]
    return encoder_input, decoder_input, lang, max_seq_length
def create_vocab(path, num_examples, path_to_vocab):
    """Count word frequencies over the corpus and write a vocabulary file.

    Keeps the 10000 most frequent words, preceded by the special tokens
    <pad>, <start>, <end>, <unk> and start_of_conversation_token, written
    one token per line to *path_to_vocab*.
    """
    from collections import Counter

    # creating cleaned input, output pairs
    pairs = create_dataset(path, num_examples)
    # Flatten: every first sentence, plus the last pair's second sentence.
    # NOTE(review): load_dataset unpacks three sentences per line from the
    # same corpus format; confirm lines really split into exactly two here.
    lang = [inp for inp, targ in pairs]
    lang.append(pairs[-1][1])

    # Frequency count over every token; most_common sorts by count descending
    # with ties kept in insertion order, matching the original stable sort.
    counts = Counter(word for sentence in lang for word in sentence)
    top_words = [word for word, _ in counts.most_common(10000)]

    # Special tokens first. The original had a bare lookup
    # word2idx['start_of_conversation_token'] which raised KeyError on every
    # run; it is now assigned an index like the other specials.
    word2idx = {
        '<pad>': 0,
        '<start>': 1,
        '<end>': 2,
        '<unk>': 3,
        'start_of_conversation_token': 4,
    }
    for index, word in enumerate(top_words):
        word2idx[word] = index + 5

    # with-block guarantees the vocab file is closed (the original leaked it)
    with open(path_to_vocab, 'w') as f:
        for token in word2idx:
            f.write(token + "\n")
if __name__ == "__main__":
    # Build the vocabulary file from the processed self-dialogue corpus.
    # NOTE(review): hard-coded absolute user path; 324401 is presumably the
    # total number of examples in the corpus — confirm against the dataset.
    create_vocab("/Users/mansurpasha/map79/partII/Individual Project/DialogueSystem/data/self_dialogue_corpus/processed/nples.txt",
                 324401, "vocab_file")