-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocessing.py
More file actions
133 lines (103 loc) · 5.02 KB
/
processing.py
File metadata and controls
133 lines (103 loc) · 5.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from sklearn.model_selection import train_test_split
import tensorflow as tf
import re
import unicodedata
import os
import numpy as np
from nltk.tokenize import word_tokenize
def preprocess_sentence(w):
    """Lowercase the sentence *w* and split it into NLTK word tokens."""
    lowered = w.lower()
    return word_tokenize(lowered)
def create_dataset(path, num_examples):
    """Read *path* and return up to *num_examples* cleaned token-list groups.

    Each line of the file holds sentences separated by the literal marker
    '$$--$$'; every sentence is lowercased and word-tokenized via
    preprocess_sentence.
    """
    # with-block guarantees the file handle is closed (the original leaked it)
    with open(path, encoding='UTF-8') as f:
        lines = f.read().strip().split('\n')
    return [[preprocess_sentence(w) for w in l.split('$$--$$')]
            for l in lines[:num_examples]]
class LanguageIndex():
    """Bidirectional word <-> index mapping built from a vocabulary list."""

    def __init__(self, vocab):
        self.word2idx = {}
        self.idx2word = {}
        self.set_index(vocab)

    def set_index(self, vocab):
        """Assign each vocab word its list position, then invert the mapping."""
        self.word2idx.update({token: position
                              for position, token in enumerate(vocab)})
        self.idx2word.update({position: token
                              for token, position in self.word2idx.items()})
def max_length(tensor):
    """Return the length of the longest sequence in *tensor*."""
    return max(map(len, tensor))
# accepts a string of any format, and returns its equivalent as a padded numpy array
# output: numpy array of length max_len
def sentence_to_idx(sentence, lang, max_len):
    """Tokenize *sentence*, map tokens to ids (falling back to <unk>), pad to *max_len*."""
    indices = []
    for tok in word_tokenize(sentence.lower()):
        # look the token up lazily so <unk> is only consulted on a miss
        indices.append(lang.word2idx[tok] if tok in lang.word2idx
                       else lang.word2idx['<unk>'])
    return tf.keras.preprocessing.sequence.pad_sequences(
        [indices], maxlen=max_len, padding='post')
# accepts a (padded) array of index tokens and returns the string equivalent, filtering out any padding
# output: string token format
def idx_to_sentence(sentence, lang):
    """Convert a (padded) sequence of index tokens back into a space-joined string.

    "<pad>" entries are dropped. An index missing from lang.idx2word raises
    KeyError, matching the original behavior.
    """
    words = [lang.idx2word[idx] for idx in sentence]
    # join is O(n); the original's repeated string concatenation was quadratic
    return ' '.join(w for w in words if w != "<pad>")
def load_dataset(path, num_examples, path_to_vocab):
    """Build padded encoder/decoder index tensors from a '$$--$$'-separated corpus.

    Returns (encoder_input, decoder_input, lang, max_seq_length) where
    encoder_input is a list of rows, each the concatenation of the two padded
    input sequences (length 2 * max_seq_length).
    """
    # creating cleaned input, output pairs; each entry is expected to be an
    # (inp1, inp2, targ) triple of token lists
    triples = create_dataset(path, num_examples)

    # index language from the vocabulary file; with-block closes the handle
    # (the original leaked it)
    with open(path_to_vocab, 'r') as f:
        vocab = [w.strip("\n") for w in f.readlines()]
    lang = LanguageIndex(vocab)

    def to_ids(tokens):
        # map each token to its index, falling back to <unk> on a miss
        return [lang.word2idx[s] if s in lang.word2idx
                else lang.word2idx['<unk>'] for s in tokens]

    # Vectorize the two input sentences and the target sentence
    input_tensor = [to_ids(inp1) for inp1, inp2, targ in triples]
    input_tensor2 = [to_ids(inp2) for inp1, inp2, targ in triples]
    decoder_input = [to_ids(targ) for inp1, inp2, targ in triples]

    # Append the end-of-sentence marker to every sequence
    end = lang.word2idx["<end>"]
    for tensor in (input_tensor, input_tensor2, decoder_input):
        for s in tensor:
            s.append(end)

    # Pad to the longest sentence seen in the first input / target sets.
    # NOTE(review): input_tensor2 is not consulted here, so a longer second
    # input gets truncated by pad_sequences — confirm this is intended.
    max_seq_length = max(max_length(input_tensor), max_length(decoder_input))

    pad = tf.keras.preprocessing.sequence.pad_sequences
    encoder_input = pad(input_tensor, maxlen=max_seq_length, padding='post')
    encoder_input2 = pad(input_tensor2, maxlen=max_seq_length, padding='post')
    decoder_input = pad(decoder_input, maxlen=max_seq_length, padding='post')

    # Each encoder row is input1 followed by input2
    encoder_input = [np.concatenate((x, y))
                     for x, y in zip(encoder_input, encoder_input2)]
    return encoder_input, decoder_input, lang, max_seq_length
def create_vocab(path, num_examples, path_to_vocab):
    """Count word frequencies over the corpus and write a vocabulary file.

    Keeps the 10000 most frequent words, preceded by the special tokens
    <pad>, <start>, <end>, <unk> and start_of_conversation_token, written
    one token per line to *path_to_vocab*.
    """
    from collections import Counter

    # creating cleaned input, output pairs
    pairs = create_dataset(path, num_examples)
    # Flatten: every first sentence, plus the last pair's second sentence.
    # NOTE(review): load_dataset unpacks three sentences per line from the
    # same corpus format; confirm lines really split into exactly two here.
    lang = [inp for inp, targ in pairs]
    lang.append(pairs[-1][1])

    # Frequency count over every token; most_common sorts by count descending
    # with ties kept in insertion order, matching the original stable sort.
    counts = Counter(word for sentence in lang for word in sentence)
    top_words = [word for word, _ in counts.most_common(10000)]

    # Special tokens first. The original had a bare lookup
    # word2idx['start_of_conversation_token'] which raised KeyError on every
    # run; it is now assigned an index like the other specials.
    word2idx = {
        '<pad>': 0,
        '<start>': 1,
        '<end>': 2,
        '<unk>': 3,
        'start_of_conversation_token': 4,
    }
    for index, word in enumerate(top_words):
        word2idx[word] = index + 5

    # with-block guarantees the vocab file is closed (the original leaked it)
    with open(path_to_vocab, 'w') as f:
        for token in word2idx:
            f.write(token + "\n")
if __name__ == "__main__":
    # Build the vocabulary file from the processed self-dialogue corpus.
    # NOTE(review): hard-coded absolute user path; 324401 is presumably the
    # total number of examples in the corpus — confirm against the dataset.
    create_vocab("/Users/mansurpasha/map79/partII/Individual Project/DialogueSystem/data/self_dialogue_corpus/processed/nples.txt",
                 324401, "vocab_file")