-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbackprocessing.py
More file actions
124 lines (96 loc) · 4.47 KB
/
backprocessing.py
File metadata and controls
124 lines (96 loc) · 4.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from sklearn.model_selection import train_test_split
import tensorflow as tf
import re
import unicodedata
import os
import numpy as np
from nltk.tokenize import word_tokenize
def preprocess_sentence(w):
    """Lowercase the sentence *w* and split it into NLTK word tokens."""
    lowered = w.lower()
    return word_tokenize(lowered)
def create_dataset(path, num_examples):
    """Read up to *num_examples* lines from *path* and tokenize every side.

    Each line contains sentences separated by the literal marker '$$--$$';
    each sentence is run through preprocess_sentence (lowercase + tokenize).

    Returns a list of lists of token lists, one inner list per corpus line.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the original left the handle open until garbage collection).
    with open(path, encoding='UTF-8') as f:
        lines = f.read().strip().split('\n')
    return [[preprocess_sentence(w) for w in l.split('$$--$$')]
            for l in lines[:num_examples]]
class LanguageIndex():
    """Bidirectional vocabulary map between words and integer indices.

    word2idx maps each vocabulary word to its position in *vocab*;
    idx2word is the inverse mapping. For duplicated words, only the last
    occurrence's index survives in both directions.
    """

    def __init__(self, vocab):
        self.word2idx = {}
        self.idx2word = {}
        self.set_index(vocab)

    def set_index(self, vocab):
        """Fold *vocab* into the two mappings (mutates, never rebinds)."""
        # update() mirrors the original's in-place accumulation, so repeated
        # calls extend the existing mappings rather than replacing them.
        self.word2idx.update((word, idx) for idx, word in enumerate(vocab))
        self.idx2word.update((idx, word) for word, idx in self.word2idx.items())
def max_length(tensor):
    """Return the length of the longest sequence in *tensor*."""
    lengths = (len(seq) for seq in tensor)
    return max(lengths)
# accepts a string of any format, and returns its equivalent as a padded numpy array
# output: numpy array of length max_len
def sentence_to_idx(sentence, lang, max_len):
    """Tokenize *sentence*, map each token to its vocab index (falling back
    to '<unk>' for out-of-vocabulary tokens), and right-pad to *max_len*."""
    indices = []
    for token in word_tokenize(sentence.lower()):
        if token in lang.word2idx:
            indices.append(lang.word2idx[token])
        else:
            indices.append(lang.word2idx['<unk>'])
    return tf.keras.preprocessing.sequence.pad_sequences([indices],
                                                         maxlen=max_len,
                                                         padding='post')
# accepts a (padded) array of index tokens and returns the string equivalent, filtering out any padding
# output: string token format
def idx_to_sentence(sentence, lang):
    """Convert a (padded) sequence of index tokens back to a space-joined
    string, dropping every '<pad>' token.

    Uses str.join instead of the original quadratic `output += ...` loop,
    and looks each index up once instead of twice.
    """
    words = (lang.idx2word[idx] for idx in sentence)
    return ' '.join(w for w in words if w != '<pad>')
def load_dataset(path, num_examples, path_to_vocab):
    """Build padded index tensors for the encoder and decoder.

    Reads up to *num_examples* tokenized pairs from *path*, indexes them with
    the vocabulary stored one-token-per-line at *path_to_vocab*, appends the
    '<end>' marker to both sides, and right-pads everything to the longest
    sequence seen.

    Returns (encoder_input, decoder_input, lang, max_seq_length).

    NOTE(review): despite the comment in the original, only '<end>' is
    appended — no '<start>' marker is ever prepended; confirm this is
    intended before relying on decoder-input conventions.
    NOTE(review): decoder_input is built from the FIRST element of each pair
    and encoder input from the SECOND — verify this ordering against the
    corpus format.
    """
    # creating cleaned input, output pairs
    triples = create_dataset(path, num_examples)
    # index language using the class defined above; close the vocab file
    # deterministically (the original leaked the handle)
    with open(path_to_vocab, 'r') as vf:
        vocab = [w.strip("\n") for w in vf.readlines()]
    lang = LanguageIndex(vocab)
    # Vectorize the input and target languages, mapping OOV tokens to '<unk>'
    decoder_input = [[lang.word2idx[s] if (s in lang.word2idx) else lang.word2idx['<unk>'] for s in inp1] for inp1, targ in triples]
    input_tensor = [[lang.word2idx[s] if (s in lang.word2idx) else lang.word2idx['<unk>'] for s in targ] for inp1, targ in triples]
    # Append the end-of-sentence marker to every sequence on both sides
    for s in input_tensor:
        s.append(lang.word2idx["<end>"])
    for s in decoder_input:
        s.append(lang.word2idx["<end>"])
    # Pad both sides to the longest sequence in the dataset so the two
    # tensors share one common length
    max_seq_length = max(max_length(input_tensor), max_length(decoder_input))
    encoder_input = tf.keras.preprocessing.sequence.pad_sequences(input_tensor,
                                                                  maxlen=max_seq_length,
                                                                  padding='post')
    decoder_input = tf.keras.preprocessing.sequence.pad_sequences(decoder_input,
                                                                  maxlen=max_seq_length,
                                                                  padding='post')
    return encoder_input, decoder_input, lang, max_seq_length
def create_vocab(path, num_examples, path_to_vocab):
    """Build a vocabulary file from the corpus at *path*.

    Counts word frequencies over all input sides (plus the target side of
    the final pair), keeps the 10,000 most frequent words after the special
    tokens, and writes one token per line to *path_to_vocab*. Only the key
    insertion order matters for the output file; the index values are never
    written.
    """
    # creating cleaned input, output pairs
    pairs = create_dataset(path, num_examples)
    # all input sides, plus the target side of the last pair
    sentences = [inp for inp, targ in pairs]
    sentences.append(pairs[-1][1])
    # frequency count (dict.get avoids the original double lookup via .keys())
    counts = {}
    for sentence in sentences:
        for word in sentence:
            counts[word] = counts.get(word, 0) + 1
    word_counts = sorted(counts.items(), key=lambda kv: -kv[1])
    top_words = [x[0] for x in word_counts[0:10000]]
    # Special tokens first so they land at the top of the vocab file.
    word2idx = {}
    word2idx['<pad>'] = 0
    word2idx['<start>'] = 1
    word2idx['<end>'] = 2
    word2idx['<unk>'] = 3
    # BUG FIX: the original line was a bare lookup
    # `word2idx['start_of_conversation_token']`, which raised KeyError on
    # every run. It must be an assignment.
    word2idx['start_of_conversation_token'] = 4
    for index, word in enumerate(top_words):
        word2idx[word] = index + 5
    # Write one token per line; context manager guarantees the file is
    # flushed and closed even on error.
    with open(path_to_vocab, 'w') as f:
        for k in word2idx:
            f.write(k + "\n")
if __name__ == "__main__":
    # Build the vocab file from the processed self-dialogue corpus.
    # NOTE(review): the corpus path is an absolute path on one developer's
    # machine and the example count (324401) is hard-coded — consider taking
    # both as command-line arguments.
    create_vocab("/Users/mansurpasha/map79/partII/Individual Project/DialogueSystem/data/self_dialogue_corpus/processed/nples.txt",
                 324401, "vocab_file")