-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbackprocessing.py
More file actions
124 lines (96 loc) · 4.47 KB
/
backprocessing.py
File metadata and controls
124 lines (96 loc) · 4.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from sklearn.model_selection import train_test_split
import tensorflow as tf
import re
import unicodedata
import os
import numpy as np
from nltk.tokenize import word_tokenize
def preprocess_sentence(w):
    """Lowercase the sentence *w* and split it into NLTK word tokens."""
    lowered = w.lower()
    return word_tokenize(lowered)
def create_dataset(path, num_examples):
    """Read up to *num_examples* lines from *path* and tokenize every side.

    Each line contains sentences separated by the literal marker '$$--$$';
    each sentence is run through preprocess_sentence (lowercase + tokenize).

    Returns a list of lists of token lists, one inner list per corpus line.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the original left the handle open until garbage collection).
    with open(path, encoding='UTF-8') as f:
        lines = f.read().strip().split('\n')
    return [[preprocess_sentence(w) for w in l.split('$$--$$')]
            for l in lines[:num_examples]]
class LanguageIndex():
    """Bidirectional vocabulary map between words and integer indices.

    word2idx maps each vocabulary word to its position in *vocab*;
    idx2word is the inverse mapping. For duplicated words, only the last
    occurrence's index survives in both directions.
    """

    def __init__(self, vocab):
        self.word2idx = {}
        self.idx2word = {}
        self.set_index(vocab)

    def set_index(self, vocab):
        """Fold *vocab* into the two mappings (mutates, never rebinds)."""
        # update() mirrors the original's in-place accumulation, so repeated
        # calls extend the existing mappings rather than replacing them.
        self.word2idx.update((word, idx) for idx, word in enumerate(vocab))
        self.idx2word.update((idx, word) for word, idx in self.word2idx.items())
def max_length(tensor):
    """Return the length of the longest sequence in *tensor*."""
    lengths = (len(seq) for seq in tensor)
    return max(lengths)
# accepts a string of any format, and returns its equivalent as a padded numpy array
# output: numpy array of length max_len
def sentence_to_idx(sentence, lang, max_len):
    """Tokenize *sentence*, map each token to its vocab index (falling back
    to '<unk>' for out-of-vocabulary tokens), and right-pad to *max_len*."""
    indices = []
    for token in word_tokenize(sentence.lower()):
        if token in lang.word2idx:
            indices.append(lang.word2idx[token])
        else:
            indices.append(lang.word2idx['<unk>'])
    return tf.keras.preprocessing.sequence.pad_sequences([indices],
                                                         maxlen=max_len,
                                                         padding='post')
# accepts a (padded) array of index tokens and returns the string equivalent, filtering out any padding
# output: string token format
def idx_to_sentence(sentence, lang):
    """Convert a (padded) sequence of index tokens back to a space-joined
    string, dropping every '<pad>' token.

    Uses str.join instead of the original quadratic `output += ...` loop,
    and looks each index up once instead of twice.
    """
    words = (lang.idx2word[idx] for idx in sentence)
    return ' '.join(w for w in words if w != '<pad>')
def load_dataset(path, num_examples, path_to_vocab):
    """Build padded index tensors for the encoder and decoder.

    Reads up to *num_examples* tokenized pairs from *path*, indexes them with
    the vocabulary stored one-token-per-line at *path_to_vocab*, appends the
    '<end>' marker to both sides, and right-pads everything to the longest
    sequence seen.

    Returns (encoder_input, decoder_input, lang, max_seq_length).

    NOTE(review): despite the comment in the original, only '<end>' is
    appended — no '<start>' marker is ever prepended; confirm this is
    intended before relying on decoder-input conventions.
    NOTE(review): decoder_input is built from the FIRST element of each pair
    and encoder input from the SECOND — verify this ordering against the
    corpus format.
    """
    # creating cleaned input, output pairs
    triples = create_dataset(path, num_examples)
    # index language using the class defined above; close the vocab file
    # deterministically (the original leaked the handle)
    with open(path_to_vocab, 'r') as vf:
        vocab = [w.strip("\n") for w in vf.readlines()]
    lang = LanguageIndex(vocab)
    # Vectorize the input and target languages, mapping OOV tokens to '<unk>'
    decoder_input = [[lang.word2idx[s] if (s in lang.word2idx) else lang.word2idx['<unk>'] for s in inp1] for inp1, targ in triples]
    input_tensor = [[lang.word2idx[s] if (s in lang.word2idx) else lang.word2idx['<unk>'] for s in targ] for inp1, targ in triples]
    # Append the end-of-sentence marker to every sequence on both sides
    for s in input_tensor:
        s.append(lang.word2idx["<end>"])
    for s in decoder_input:
        s.append(lang.word2idx["<end>"])
    # Pad both sides to the longest sequence in the dataset so the two
    # tensors share one common length
    max_seq_length = max(max_length(input_tensor), max_length(decoder_input))
    encoder_input = tf.keras.preprocessing.sequence.pad_sequences(input_tensor,
                                                                  maxlen=max_seq_length,
                                                                  padding='post')
    decoder_input = tf.keras.preprocessing.sequence.pad_sequences(decoder_input,
                                                                  maxlen=max_seq_length,
                                                                  padding='post')
    return encoder_input, decoder_input, lang, max_seq_length
def create_vocab(path, num_examples, path_to_vocab):
    """Build a vocabulary file from the corpus at *path*.

    Counts word frequencies over all input sides (plus the target side of
    the final pair), keeps the 10,000 most frequent words after the special
    tokens, and writes one token per line to *path_to_vocab*. Only the key
    insertion order matters for the output file; the index values are never
    written.
    """
    # creating cleaned input, output pairs
    pairs = create_dataset(path, num_examples)
    # all input sides, plus the target side of the last pair
    sentences = [inp for inp, targ in pairs]
    sentences.append(pairs[-1][1])
    # frequency count (dict.get avoids the original double lookup via .keys())
    counts = {}
    for sentence in sentences:
        for word in sentence:
            counts[word] = counts.get(word, 0) + 1
    word_counts = sorted(counts.items(), key=lambda kv: -kv[1])
    top_words = [x[0] for x in word_counts[0:10000]]
    # Special tokens first so they land at the top of the vocab file.
    word2idx = {}
    word2idx['<pad>'] = 0
    word2idx['<start>'] = 1
    word2idx['<end>'] = 2
    word2idx['<unk>'] = 3
    # BUG FIX: the original line was a bare lookup
    # `word2idx['start_of_conversation_token']`, which raised KeyError on
    # every run. It must be an assignment.
    word2idx['start_of_conversation_token'] = 4
    for index, word in enumerate(top_words):
        word2idx[word] = index + 5
    # Write one token per line; context manager guarantees the file is
    # flushed and closed even on error.
    with open(path_to_vocab, 'w') as f:
        for k in word2idx:
            f.write(k + "\n")
if __name__ == "__main__":
    # Build the vocab file from the processed self-dialogue corpus.
    # NOTE(review): the corpus path is an absolute path on one developer's
    # machine and the example count (324401) is hard-coded — consider taking
    # both as command-line arguments.
    create_vocab("/Users/mansurpasha/map79/partII/Individual Project/DialogueSystem/data/self_dialogue_corpus/processed/nples.txt",
                 324401, "vocab_file")