nlp/ngram.py at main · igormorgado/nlp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
##
import numpy as np
import pandas as pd
import collections
import re
from collections import Counter
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

##
def text_to_list(corpus):
    """Return the lowercase tokens of *corpus* as a list of strings.

    A token is a maximal run of regex word characters and apostrophes;
    any other character acts as a separator and is discarded.
    """
    lowered = corpus.lower()
    # The pattern has no capture groups, so group(0) is the whole token.
    return [match.group(0) for match in re.finditer(r"[\w']+", lowered)]

##
def stopwords_remove(corpus_list, stopwords):
    """Return the items of *corpus_list* that are not in *stopwords*.

    The original order is kept and duplicates are preserved; the input
    sequence is not modified.
    """
    return [token for token in corpus_list if token not in stopwords]

##
def ngram_frequency(corpus, rank):
    """Count the n-grams of size *rank* in a space-separated text.

    Parameters
    ----------
    corpus : str
        Text whose tokens are separated by single spaces.
    rank : int
        Number of consecutive tokens per n-gram (1 = unigram, 2 = bigram, ...).

    Returns
    -------
    collections.Counter
        Maps each n-gram, written as its tokens joined by one space, to the
        number of times it occurs. For ``rank == 1`` the keys are the tokens
        themselves. The counter is empty when the text has fewer than
        *rank* tokens.

    Raises
    ------
    ValueError
        If *rank* is smaller than 1.
    """
    if rank < 1:
        raise ValueError(f"rank must be >= 1, got {rank}")
    words = corpus.split(' ')
    # Slide a window of `rank` tokens over the text; a negative range length
    # (text shorter than rank) simply yields no windows.
    return Counter(
        ' '.join(words[start:start + rank])
        for start in range(len(words) - rank + 1)
    )

##
def compute_probability(word_frequency):
    """Normalise the values of *word_frequency* in place.

    Each value is replaced by that value divided by the sum of all values.
    The mapping is mutated; it is not returned.

    Returns the sum of the values as it was before normalisation.
    """
    denominator = sum(word_frequency.values())
    # Only existing keys are reassigned, so the mapping's size never changes.
    for key in list(word_frequency):
        word_frequency[key] = word_frequency[key] / denominator
    return denominator

##
def shannon_entropy(values):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Parameters
    ----------
    values : iterable of float
        The probabilities of the distribution. Any one-dimensional iterable
        is accepted (NumPy array, list, tuple, ``dict.values()`` view, ...).

    Returns
    -------
    numpy.float64
        ``sum(p * log2(1 / p))`` over all probabilities ``p``.

    Raises
    ------
    AssertionError
        If the probabilities do not sum to 1 (within 1e-7) or if any of them
        is not strictly positive. An empty input fails the first check.
    """
    # Materialise first: plain lists do not support `1 / values`, and
    # one-shot iterables must not be consumed by the validation below.
    probs = np.asarray(list(values), dtype=float)
    assert abs(probs.sum() - 1.0) < 1e-7, "probabilities must sum to 1"
    assert probs.min() > 0.0, "probabilities must be strictly positive"
    return np.sum(probs * np.log2(1 / probs))

##
def ngram(sentences, rank):
    """Collect every run of *rank* consecutive words from each sentence.

    Each sentence is split on single spaces and a window of *rank* words is
    slid over it one position at a time. Windows never span two sentences.

    Returns a flat list of word lists, in sentence order and then in
    position order.

    Raises AssertionError when a sentence has fewer than *rank* words.
    """
    windows = []
    for sentence in sentences:
        tokens = sentence.split(" ")
        count = len(tokens)
        assert count >= rank, f"{rank}-gram can't be created in {count} words sentence"
        windows.extend(
            tokens[start:start + rank] for start in range(count - rank + 1)
        )
    return windows

##
# Toy corpus; every sentence is wrapped in <s> ... </s> markers.
sentences = [
    "<s> I am Sam </s>",
    "<s> Sam I am </s>",
    "<s> I do not like green eggs and ham </s>",
]

## Compute all ngrams to rank 2
max_rank = 2
grams = []
grams_count = []
for rank in range(1, max_rank + 1):
    gram_list = ngram(sentences, rank)
    # Each n-gram becomes one space-joined string so it can key a Counter.
    grams.append(list(map(' '.join, gram_list)))
    grams_count.append(Counter(grams[-1]))

## Some Probability checks in Language Model (LM)
# Each entry is [word, preceding word].
ptests = [
    ['I', '<s>'],
    ['Sam', '<s>'],
    ['am', 'I'],
    ['</s>', 'Sam'],
    ['Sam', 'am'],
    ['do', 'I'],
]

## Compute the probabilities
# P(word | previous) = count("previous word") / count("previous"), read from
# the bigram counts (grams_count[1]) and the unigram counts (grams_count[0]).
for p in ptests:
    pstr = f'P ({p[0]} | {p[1]})'
    bigram_hits = grams_count[1][f'{p[1]} {p[0]}']
    context_hits = grams_count[0][p[1]]
    res = bigram_hits / context_hits
    print(f'{pstr:14s} = {res:.2f}')


## More realistic sentences
restaurant = [
    "<s> can you tell me about any good cantonese restaurants close by </s>",
    "<s> mid priced thai food is what i'm looking for </s>",
    "<s> tell me about chez panisse </s>",
    "<s> can you give me a listing of the kinds of food that are available </s>",
    "<s> i'm looking for a good place to eat breakfast </s>",
    "<s> whan is caffe venezia open during day </s>",
]


# Rebuild the n-gram lists and their counts (ranks 1..max_rank) for this corpus.
max_rank = 2
grams = []
grams_count = []
for rank in range(1, max_rank + 1):
    gram_list = ngram(restaurant, rank)
    # Each n-gram becomes one space-joined string so it can key a Counter.
    grams.append(list(map(' '.join, gram_list)))
    grams_count.append(Counter(grams[-1]))


# We compute sentence probabilities by approximating them with 2-gram (bigram)
# probabilities, hence
# P(<s> I want english food </s>) ==
# P(I | <s>) * P(want | I) * P(english | want) * P(food | english) * P(</s> | food)
#
# Therefore we can store the probabilities in a regular matrix, where each element is the
# probability of the word in column j being followed by the word in row i, and retrieve
# this data quickly.
#
# Also, to avoid underflow and to speed up computation, we will store log p_i in the
# matrix instead of p_i, since
#
# log(p1 * p2 * p3 * p4) = log p1 + log p2 + log p3 + log p4