diff --git a/.idea/deployment.xml b/.idea/deployment.xml new file mode 100644 index 0000000..c982474 --- /dev/null +++ b/.idea/deployment.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 0000000..de18bf0 --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,518 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + sys + word_tok + pos_tag + print + + + + + + + + + + + true + DEFINITION_ORDER + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + project + + + + + + + + + + + + + + + + + + + + + + + + + 1526441288919 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in index 843ef4a..7db390c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,2 @@ include g2p_en/homographs.en -include g2p_en/logdir/* +include g2p_en/checkpoint20.npz \ No newline at end of file diff --git a/README.md b/README.md index ca325ee..443586b 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,9 @@ -# g2p_en: A Simple Python Module for English Grapheme To Phoneme Conversion +[![image](https://img.shields.io/pypi/v/g2p-en.svg)](https://pypi.org/project/g2p-en/) +[![image](https://img.shields.io/pypi/l/g2p-en.svg)](https://pypi.org/project/g2p-en/) + +# g2pE: A Simple Python Module for English Grapheme To Phoneme Conversion + +* [v.2.0] We removed TensorFlow from the dependencies. After all, it changes its APIs quite often, and we don't expect you to have a GPU. Instead, NumPy is used for inference. This module is designed to convert English graphemes (spelling) to phonemes (pronunciation). It is considered essential in several tasks such as speech synthesis. @@ -20,23 +25,21 @@ In this project, we employ a deep learning seq2seq framework based on TensorFlow ## Algorithm 1. Spells out arabic numbers and some currency symbols. (e.g. $200 -> two hundred dollars) (This is borrowed from [Keith Ito's code](https://github.com/keithito/tacotron/blob/master/text/numbers.py)) -2. Attempts to retrieve the correct pronunciation for homographs based on their POS) +2. Attempts to retrieve the correct pronunciation for heteronyms based on their POS) 3. Looks up [The CMU Pronouncing Dictionary](http://www.speech.cs.cmu.edu/cgi-bin/cmudict) for non-homographs. 4. For OOVs, we predict their pronunciations using our neural net model. ## Environment -* python 2.x or 3.x +* python 3.x ## Dependencies * numpy >= 1.13.1 -* tensorflow >= 1.3.0 * nltk >= 3.2.4 * python -m nltk.downloader "averaged_perceptron_tagger" "cmudict" * inflect >= 0.3.1 * Distance >= 0.1.3 -* future >= 0.16.0 ## Installation @@ -47,28 +50,41 @@ OR nltk package will be automatically downloaded at your first run. -## Training (Note that pretrained model is already included) - - python train.py ## Usage - from g2p_en import g2p - - text = "I refuse to collect the refuse around here." - print(g2p(text)) - >>>[u'AY1', ' ', u'R', u'IH0', u'F', u'Y', u'UW1', u'Z', ' ', u'T', u'UW1', ' ', u'K', u'AH0', u'L', u'EH1', u'K', u'T', ' ', u'DH', u'AH0', ' ', u'R', u'EH1', u'F', u'Y', u'UW2', u'Z', ' ', u'ER0', u'AW1', u'N', u'D', ' ', u'HH', u'EH1', u'R'] - - text = "I am an activationist." - print(g2p(text)) - >>>[u'AY1', u'M', ' ', u'AE1', u'N', ' ', u'AE2', u'K', u'T', u'AH0', u'V', u'EY1', u'SH', u'AH0', u'N', u'IH0', u'S', u'T'] - -If you need to convert lots of texts, you can use the global tf session. - - import g2p_en as g2p - - with g2p.Session(): - phs = [g2p.g2p(text) for text in texts] + from g2p_en import G2p + + texts = ["I have $250 in my pocket.", # number -> spell-out + "popular pets, e.g. cats and dogs", # e.g. -> for example + "I refuse to collect the refuse around here.", # homograph + "I'm an activationist."] # newly coined word + g2p = G2p() + for text in texts: + out = g2p(text) + print(out) + >>> ['AY1', ' ', 'HH', 'AE1', 'V', ' ', 'T', 'UW1', ' ', 'HH', 'AH1', 'N', 'D', 'R', 'AH0', 'D', ' ', 'F', 'IH1', 'F', 'T', 'IY0', ' ', 'D', 'AA1', 'L', 'ER0', 'Z', ' ', 'IH0', 'N', ' ', 'M', 'AY1', ' ', 'P', 'AA1', 'K', 'AH0', 'T', ' ', '.'] + >>> ['P', 'AA1', 'P', 'Y', 'AH0', 'L', 'ER0', ' ', 'P', 'EH1', 'T', 'S', ' ', ',', ' ', 'F', 'AO1', 'R', ' ', 'IH0', 'G', 'Z', 'AE1', 'M', 'P', 'AH0', 'L', ' ', 'K', 'AE1', 'T', 'S', ' ', 'AH0', 'N', 'D', ' ', 'D', 'AA1', 'G', 'Z'] + >>> ['AY1', ' ', 'R', 'IH0', 'F', 'Y', 'UW1', 'Z', ' ', 'T', 'UW1', ' ', 'K', 'AH0', 'L', 'EH1', 'K', 'T', ' ', 'DH', 'AH0', ' ', 'R', 'EH1', 'F', 'Y', 'UW2', 'Z', ' ', 'ER0', 'AW1', 'N', 'D', ' ', 'HH', 'IY1', 'R', ' ', '.'] + >>> ['AY1', ' ', 'AH0', 'M', ' ', 'AE1', 'N', ' ', 'AE2', 'K', 'T', 'IH0', 'V', 'EY1', 'SH', 'AH0', 'N', 'IH0', 'S', 'T', ' ', '.'] + +## References + +If you use this code for research, please cite: + +``` +@misc{g2pE2019, + author = {Park, Kyubyong & Kim, Jongseok}, + title = {g2pE}, + year = {2019}, + publisher = {GitHub}, + journal = {GitHub repository}, + howpublished = {\url{https://github.com/Kyubyong/g2p}} +} +``` + +## Cited in +* [Learning pronunciation from a foreign language in speech synthesis networks](https://arxiv.org/abs/1811.09364) May, 2018. diff --git a/README.rst b/README.rst index d228fd2..c2d414d 100644 --- a/README.rst +++ b/README.rst @@ -1,6 +1,8 @@ g2p\_en: A Simple Python Module for English Grapheme To Phoneme Conversion ========================================================================== +[Update] * We removed TensorFlow from the dependencies. After all, it changes its APIs quite often, and we don't expect you to have a GPU. Instead, NumPy is used for inference. + This module is designed to convert English graphemes (spelling) to phonemes (pronunciation). It is considered essential in several tasks such as speech synthesis. Unlike many languages like Spanish or German @@ -42,18 +44,16 @@ Algorithm Environment ----------- -- python 2.x or 3.x +- python 3.x Dependencies ------------ - numpy >= 1.13.1 -- tensorflow >= 1.3.0 - nltk >= 3.2.4 - python -m nltk.downloader "averaged\_perceptron\_tagger" "cmudict" - inflect >= 0.3.1 - Distance >= 0.1.3 -- future >= 0.16.0 Installation ------------ @@ -70,36 +70,27 @@ OR nltk package will be automatically downloaded at your first run. -Training (Note that pretrained model is already included) ---------------------------------------------------------- - -:: - - python train.py Usage ----- :: - from g2p_en import g2p - - text = "I refuse to collect the refuse around here." - print(g2p(text)) - >>>[u'AY1', ' ', u'R', u'IH0', u'F', u'Y', u'UW1', u'Z', ' ', u'T', u'UW1', ' ', u'K', u'AH0', u'L', u'EH1', u'K', u'T', ' ', u'DH', u'AH0', ' ', u'R', u'EH1', u'F', u'Y', u'UW2', u'Z', ' ', u'ER0', u'AW1', u'N', u'D', ' ', u'HH', u'EH1', u'R'] - - text = "I am an activationist." - print(g2p(text)) - >>>[u'AY1', u'M', ' ', u'AE1', u'N', ' ', u'AE2', u'K', u'T', u'AH0', u'V', u'EY1', u'SH', u'AH0', u'N', u'IH0', u'S', u'T'] - -If you need to convert lots of texts, you can use the global tf session. - -:: - - import g2p_en as g2p + from g2p_en import G2p + + texts = ["I have $250 in my pocket.", # number -> spell-out + "popular pets, e.g. cats and dogs", # e.g. -> for example + "I refuse to collect the refuse around here.", # homograph + "I'm an activationist."] # newly coined word + g2p = G2p() + for text in texts: + out = g2p(text) + print(out) + >>> ['AY1', ' ', 'HH', 'AE1', 'V', ' ', 'T', 'UW1', ' ', 'HH', 'AH1', 'N', 'D', 'R', 'AH0', 'D', ' ', 'F', 'IH1', 'F', 'T', 'IY0', ' ', 'D', 'AA1', 'L', 'ER0', 'Z', ' ', 'IH0', 'N', ' ', 'M', 'AY1', ' ', 'P', 'AA1', 'K', 'AH0', 'T', ' ', '.'] + >>> ['P', 'AA1', 'P', 'Y', 'AH0', 'L', 'ER0', ' ', 'P', 'EH1', 'T', 'S', ' ', ',', ' ', 'F', 'AO1', 'R', ' ', 'IH0', 'G', 'Z', 'AE1', 'M', 'P', 'AH0', 'L', ' ', 'K', 'AE1', 'T', 'S', ' ', 'AH0', 'N', 'D', ' ', 'D', 'AA1', 'G', 'Z'] + >>> ['AY1', ' ', 'R', 'IH0', 'F', 'Y', 'UW1', 'Z', ' ', 'T', 'UW1', ' ', 'K', 'AH0', 'L', 'EH1', 'K', 'T', ' ', 'DH', 'AH0', ' ', 'R', 'EH1', 'F', 'Y', 'UW2', 'Z', ' ', 'ER0', 'AW1', 'N', 'D', ' ', 'HH', 'IY1', 'R', ' ', '.'] + >>> ['AY1', ' ', 'AH0', 'M', ' ', 'AE1', 'N', ' ', 'AE2', 'K', 'T', 'IH0', 'V', 'EY1', 'SH', 'AH0', 'N', 'IH0', 'S', 'T', ' ', '.'] - with g2p.Session(): - phs = [g2p.g2p(text) for text in texts] May, 2018. diff --git a/g2p.pdf b/g2p.pdf new file mode 100644 index 0000000..c1ef41f Binary files /dev/null and b/g2p.pdf differ diff --git a/g2p_en.egg-info/PKG-INFO b/g2p_en.egg-info/PKG-INFO deleted file mode 100644 index e331e5d..0000000 --- a/g2p_en.egg-info/PKG-INFO +++ /dev/null @@ -1,119 +0,0 @@ -Metadata-Version: 1.1 -Name: g2p-en -Version: 1.0.0 -Summary: A Simple Python Module for English Grapheme To Phoneme Conversion -Home-page: https://github.com/Kyubyong/g2p -Author: Kyubyong Park & Jongseok Kim -Author-email: kbpark.linguist@gmail.com -License: Apache Software License -Download-URL: https://github.com/Kyubyong/g2p/archive/1.0.0.tar.gz -Description-Content-Type: UNKNOWN -Description: g2p\_en: A Simple Python Module for English Grapheme To Phoneme Conversion - ========================================================================== - - This module is designed to convert English graphemes (spelling) to - phonemes (pronunciation). It is considered essential in several tasks - such as speech synthesis. Unlike many languages like Spanish or German - where pronunciation of a word can be inferred from its spelling, English - words are often far from people's expectations. Therefore, it will be - the best idea to consult a dictionary if we want to know the - pronunciation of some word. However, there are at least two tentative - issues in this approach. First, you can't disambiguate the pronunciation - of homographs, words which have multiple pronunciations. (See ``a`` - below.) Second, you can't check if the word is not in the dictionary. - (See ``b`` below.) - - - - -   \a. I refuse to collect the refuse around here. (rɪ\|fju:z as verb vs. \|refju:s as noun) - - - - \b. I am an activationist. (activationist: newly coined word which means ``n. A person who designs and implements programs of treatment or therapy that use recreation and activities to help people whose functional abilities are affected by illness or disability.`` from `WORD SPY `__ - - For the first homograph issue, fortunately many homographs can be - disambiguated using their part-of-speech, if not all. When it comes to - the words not in the dictionary, however, we should make our best guess - using our knowledge. In this project, we employ a deep learning seq2seq - framework based on TensorFlow. - - Algorithm - --------- - - 1. Spells out arabic numbers and some currency symbols. (e.g. $200 -> - two hundred dollars) (This is borrowed from `Keith Ito's - code `__) - 2. Attempts to retrieve the correct pronunciation for homographs based - on their POS) - 3. Looks up `The CMU Pronouncing - Dictionary `__ for - non-homographs. - 4. For OOVs, we predict their pronunciations using our neural net model. - - Environment - ----------- - - - python 2.x or 3.x - - Dependencies - ------------ - - - numpy >= 1.13.1 - - tensorflow >= 1.3.0 - - nltk >= 3.2.4 - - python -m nltk.downloader "averaged\_perceptron\_tagger" "cmudict" - - inflect >= 0.3.1 - - Distance >= 0.1.3 - - future >= 0.16.0 - - Installation - ------------ - - :: - - pip install g2p_en - - OR - - :: - - python setup.py install - - nltk package will be automatically downloaded at your first run. - - Training (Note that pretrained model is already included) - --------------------------------------------------------- - - :: - - python train.py - - Usage - ----- - - :: - - from g2p_en import g2p - - text = "I refuse to collect the refuse around here." - print(g2p(text)) - >>>[u'AY1', ' ', u'R', u'IH0', u'F', u'Y', u'UW1', u'Z', ' ', u'T', u'UW1', ' ', u'K', u'AH0', u'L', u'EH1', u'K', u'T', ' ', u'DH', u'AH0', ' ', u'R', u'EH1', u'F', u'Y', u'UW2', u'Z', ' ', u'ER0', u'AW1', u'N', u'D', ' ', u'HH', u'EH1', u'R'] - - text = "I am an activationist." - print(g2p(text)) - >>>[u'AY1', u'M', ' ', u'AE1', u'N', ' ', u'AE2', u'K', u'T', u'AH0', u'V', u'EY1', u'SH', u'AH0', u'N', u'IH0', u'S', u'T'] - - If you need to convert lots of texts, you can use the global tf session. - - :: - - import g2p_en as g2p - - with g2p.Session(): - phs = [g2p.g2p(text) for text in texts] - - May, 2018. - - Kyubyong Park & `Jongseok Kim `__ - -Keywords: g2p,g2p_en -Platform: UNKNOWN diff --git a/g2p_en.egg-info/SOURCES.txt b/g2p_en.egg-info/SOURCES.txt deleted file mode 100644 index a10c2f8..0000000 --- a/g2p_en.egg-info/SOURCES.txt +++ /dev/null @@ -1,19 +0,0 @@ -MANIFEST.in -README.md -README.rst -setup.cfg -setup.py -g2p_en/__init__.py -g2p_en/expand.py -g2p_en/g2p.py -g2p_en/homographs.en -g2p_en/train.py -g2p_en.egg-info/PKG-INFO -g2p_en.egg-info/SOURCES.txt -g2p_en.egg-info/dependency_links.txt -g2p_en.egg-info/requires.txt -g2p_en.egg-info/top_level.txt -g2p_en/logdir/checkpoint -g2p_en/logdir/model_epoch_14_gs_27956.data-00000-of-00001 -g2p_en/logdir/model_epoch_14_gs_27956.index -g2p_en/logdir/model_epoch_14_gs_27956.meta \ No newline at end of file diff --git a/g2p_en.egg-info/dependency_links.txt b/g2p_en.egg-info/dependency_links.txt deleted file mode 100644 index 8b13789..0000000 --- a/g2p_en.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/g2p_en.egg-info/requires.txt b/g2p_en.egg-info/requires.txt deleted file mode 100644 index 33d6eca..0000000 --- a/g2p_en.egg-info/requires.txt +++ /dev/null @@ -1,6 +0,0 @@ -numpy>=1.13.1 -tensorflow>=1.3.0 -nltk>=3.2.4 -inflect>=0.3.1 -distance>=0.1.3 -future>=0.16.0 diff --git a/g2p_en.egg-info/top_level.txt b/g2p_en.egg-info/top_level.txt deleted file mode 100644 index 19ccc74..0000000 --- a/g2p_en.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -g2p_en diff --git a/g2p_en/__init__.py b/g2p_en/__init__.py index 9ef46b8..4d9ce97 100644 --- a/g2p_en/__init__.py +++ b/g2p_en/__init__.py @@ -1 +1 @@ -from g2p import g2p, Session +from .g2p import G2p diff --git a/g2p_en/checkpoint20.npz b/g2p_en/checkpoint20.npz new file mode 100644 index 0000000..b0722a8 Binary files /dev/null and b/g2p_en/checkpoint20.npz differ diff --git a/g2p_en/expand.py b/g2p_en/expand.py index 3aae3d5..d6a9592 100644 --- a/g2p_en/expand.py +++ b/g2p_en/expand.py @@ -12,8 +12,6 @@ - - _inflect = inflect.engine() _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') diff --git a/g2p_en/g2p.py b/g2p_en/g2p.py index 100d96b..8b37659 100644 --- a/g2p_en/g2p.py +++ b/g2p_en/g2p.py @@ -1,16 +1,22 @@ # -*- coding: utf-8 -*- -# /usr/bin/python2 +# /usr/bin/python ''' -By kyubyong park. kbpark.linguist@gmail.com. +By kyubyong park(kbpark.linguist@gmail.com) and Jongseok Kim(https://github.com/ozmig77) https://www.github.com/kyubyong/g2p ''' -from __future__ import print_function - -import tensorflow as tf - from nltk import pos_tag from nltk.corpus import cmudict import nltk +from nltk.tokenize import TweetTokenizer +word_tokenize = TweetTokenizer().tokenize +import numpy as np +import codecs +import re +import os +import unicodedata +from builtins import str as unicode +from .expand import normalize_numbers + try: nltk.data.find('taggers/averaged_perceptron_tagger.zip') except LookupError: @@ -20,173 +26,168 @@ except LookupError: nltk.download('cmudict') -from train import Graph, hp, load_vocab -import numpy as np -import codecs -import re -import os -import unicodedata -from expand import normalize_numbers -from builtins import str as unicode - dirname = os.path.dirname(__file__) -cmu = cmudict.dict() - -# Load vocab -g2idx, idx2g, p2idx, idx2p = load_vocab() - -# Load Graph -g = tf.Graph() -with g.as_default(): - with tf.device('/cpu:0'): - graph = Graph(); print("Graph loaded for g2p") - saver = tf.train.Saver() -config = tf.ConfigProto( - device_count={'GPU' : 0}, - gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.0001) - ) - -g_sess = None # global session -class Session: # make/remove global session - def __enter__(self): - global g_sess - if g_sess != None: - raise Exception('Session already exist in g2p') - g_sess = tf.Session(graph=g, config=config) - saver.restore(g_sess, tf.train.latest_checkpoint(os.path.join(dirname,hp.logdir))) - - def __exit__(self, exc_type, exc_val, exc_tb): - global g_sess - g_sess.close() - g_sess = None - - -def predict(words, sess): - ''' - Returns predicted pronunciation of `words` which do NOT exist in the dictionary. - :param words: A list of words. - :return: pron: A list of phonemes - ''' - if len(words) > hp.batch_size: - after = predict(words[hp.batch_size:], sess) - words = words[:hp.batch_size] - else: - after = [] - x = np.zeros((len(words), hp.maxlen), np.int32) # 0: - for i, w in enumerate(words): - for j, g in enumerate((w + "E")[:hp.maxlen]): - x[i][j] = g2idx.get(g, 2) # 2: - - ## Autoregressive inference - preds = np.zeros((len(x), hp.maxlen), np.int32) - for j in range(hp.maxlen): - _preds = sess.run(graph.preds, {graph.x: x, graph.y: preds}) - preds[:, j] = _preds[:, j] - - # convert to string - pron = [] - for i in range(len(preds)): - p = [u"%s" % unicode(idx2p[idx]) for idx in preds[i]] # Make p into unicode. - if "" in p: - eos = p.index("") - p = p[:eos] - pron.append(p) - - return pron + after - -# Construct homograph dictionary -f = os.path.join(dirname,'homographs.en') -homograph2features = dict() -for line in codecs.open(f, 'r', 'utf8').read().splitlines(): - if line.startswith("#"): continue # comment - headword, pron1, pron2, pos1 = line.strip().split("|") - homograph2features[headword.lower()] = (pron1.split(), pron2.split(), pos1) - -def token2pron(token): - ''' - Returns pronunciation of word based on its pos. - :param token: A tuple of (word, pos) - :return: A list of phonemes. If word is not in the dictionary, [] is returned. - ''' - word, pos = token - - if re.search("[a-z]", word) is None: - pron = [word] - - elif word in homograph2features: # Check homograph - pron1, pron2, pos1 = homograph2features[word] - if pos.startswith(pos1): - pron = pron1 - else: - pron = pron2 - elif word in cmu: # CMU dict - pron = cmu[word][0] - else: - return [] - - return pron - -def tokenize(text): - ''' - Splits text into `tokens`. - :param text: A string. - :return: A list of tokens (string). - ''' - text = re.sub('([.,?!]( |$))', r' \1', text) - return text.split() - -def g2p(text): - ''' - Returns the pronunciation of text. - :param text: A string. A sequence of words. - :return: A list of phonemes. - ''' - # normalization - text = unicode(text) - text = normalize_numbers(text) - text = ''.join(char for char in unicodedata.normalize('NFD', text) - if unicodedata.category(char) != 'Mn') # Strip accents - text = text.lower() - text = re.sub("[^ a-z'.,?!\-]", "", text) - text = text.replace("i.e.", "that is") - text = text.replace("e.g.", "for example") - - # tokenization - words = tokenize(text) - tokens = pos_tag(words) # tuples of (word, tag) - - # g2p - oovs, u_loc = [], [] - ret = [] - for token in tokens: - pron = token2pron(token) # list of phonemes - if pron == []: # oov - oovs.append(token[0]) - u_loc.append(len(ret)) - ret.extend(pron) - ret.extend([" "]) - - if len(oovs)>0: - global g_sess - if g_sess is not None: # check global session - prons = predict(oovs, g_sess) - for i in range(len(oovs)-1,-1,-1): - ret = ret[:u_loc[i]]+prons[i]+ret[u_loc[i]:] - else: # If global session is not defined, make new one as local. - with tf.Session(graph=g, config=config) as sess: - saver.restore(sess, tf.train.latest_checkpoint(os.path.join(dirname, hp.logdir))) - prons = predict(oovs, sess) - for i in range(len(oovs)-1,-1,-1): - ret = ret[:u_loc[i]]+prons[i]+ret[u_loc[i]:] - return ret[:-1] - +def construct_homograph_dictionary(): + f = os.path.join(dirname,'homographs.en') + homograph2features = dict() + for line in codecs.open(f, 'r', 'utf8').read().splitlines(): + if line.startswith("#"): continue # comment + headword, pron1, pron2, pos1 = line.strip().split("|") + homograph2features[headword.lower()] = (pron1.split(), pron2.split(), pos1) + return homograph2features + +# def segment(text): +# ''' +# Splits text into `tokens`. +# :param text: A string. +# :return: A list of tokens (string). +# ''' +# print(text) +# text = re.sub('([.,?!]( |$))', r' \1', text) +# print(text) +# return text.split() + +class G2p(object): + def __init__(self): + super().__init__() + self.graphemes = ["", "", ""] + list("abcdefghijklmnopqrstuvwxyz") + self.phonemes = ["", "", "", ""] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0', + 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', + 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', + 'EY2', 'F', 'G', 'HH', + 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', + 'M', 'N', 'NG', 'OW0', 'OW1', + 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', + 'UH0', 'UH1', 'UH2', 'UW', + 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'] + self.g2idx = {g: idx for idx, g in enumerate(self.graphemes)} + self.idx2g = {idx: g for idx, g in enumerate(self.graphemes)} + + self.p2idx = {p: idx for idx, p in enumerate(self.phonemes)} + self.idx2p = {idx: p for idx, p in enumerate(self.phonemes)} + + self.cmu = cmudict.dict() + self.load_variables() + self.homograph2features = construct_homograph_dictionary() + + def load_variables(self): + self.variables = np.load(os.path.join(dirname,'checkpoint20.npz')) + self.enc_emb = self.variables["enc_emb"] # (29, 64). (len(graphemes), emb) + self.enc_w_ih = self.variables["enc_w_ih"] # (3*128, 64) + self.enc_w_hh = self.variables["enc_w_hh"] # (3*128, 128) + self.enc_b_ih = self.variables["enc_b_ih"] # (3*128,) + self.enc_b_hh = self.variables["enc_b_hh"] # (3*128,) + + self.dec_emb = self.variables["dec_emb"] # (74, 64). (len(phonemes), emb) + self.dec_w_ih = self.variables["dec_w_ih"] # (3*128, 64) + self.dec_w_hh = self.variables["dec_w_hh"] # (3*128, 128) + self.dec_b_ih = self.variables["dec_b_ih"] # (3*128,) + self.dec_b_hh = self.variables["dec_b_hh"] # (3*128,) + self.fc_w = self.variables["fc_w"] # (74, 128) + self.fc_b = self.variables["fc_b"] # (74,) + + def sigmoid(self, x): + return 1 / (1 + np.exp(-x)) + + def grucell(self, x, h, w_ih, w_hh, b_ih, b_hh): + rzn_ih = np.matmul(x, w_ih.T) + b_ih + rzn_hh = np.matmul(h, w_hh.T) + b_hh + + rz_ih, n_ih = rzn_ih[:, :rzn_ih.shape[-1] * 2 // 3], rzn_ih[:, rzn_ih.shape[-1] * 2 // 3:] + rz_hh, n_hh = rzn_hh[:, :rzn_hh.shape[-1] * 2 // 3], rzn_hh[:, rzn_hh.shape[-1] * 2 // 3:] + + rz = self.sigmoid(rz_ih + rz_hh) + r, z = np.split(rz, 2, -1) + + n = np.tanh(n_ih + r * n_hh) + h = (1 - z) * n + z * h + + return h + + def gru(self, x, steps, w_ih, w_hh, b_ih, b_hh, h0=None): + if h0 is None: + h0 = np.zeros((x.shape[0], w_hh.shape[1]), np.float32) + h = h0 # initial hidden state + outputs = np.zeros((x.shape[0], steps, w_hh.shape[1]), np.float32) + for t in range(steps): + h = self.grucell(x[:, t, :], h, w_ih, w_hh, b_ih, b_hh) # (b, h) + outputs[:, t, ::] = h + return outputs + + def encode(self, word): + chars = list(word) + [""] + x = [self.g2idx.get(char, self.g2idx[""]) for char in chars] + x = np.take(self.enc_emb, np.expand_dims(x, 0), axis=0) + + return x + + def predict(self, word): + # encoder + enc = self.encode(word) + enc = self.gru(enc, len(word) + 1, self.enc_w_ih, self.enc_w_hh, + self.enc_b_ih, self.enc_b_hh, h0=np.zeros((1, self.enc_w_hh.shape[-1]), np.float32)) + last_hidden = enc[:, -1, :] + + # decoder + dec = np.take(self.dec_emb, [2], axis=0) # 2: + h = last_hidden + + preds = [] + for i in range(20): + h = self.grucell(dec, h, self.dec_w_ih, self.dec_w_hh, self.dec_b_ih, self.dec_b_hh) # (b, h) + logits = np.matmul(h, self.fc_w.T) + self.fc_b + pred = logits.argmax() + if pred == 3: break # 3: + preds.append(pred) + dec = np.take(self.dec_emb, [pred], axis=0) + + preds = [self.idx2p.get(idx, "") for idx in preds] + return preds + + def __call__(self, text): + # preprocessing + text = unicode(text) + text = normalize_numbers(text) + text = ''.join(char for char in unicodedata.normalize('NFD', text) + if unicodedata.category(char) != 'Mn') # Strip accents + text = text.lower() + text = re.sub("[^ a-z'.,?!\-]", "", text) + text = text.replace("i.e.", "that is") + text = text.replace("e.g.", "for example") + + # tokenization + words = word_tokenize(text) + tokens = pos_tag(words) # tuples of (word, tag) + + # steps + prons = [] + for word, pos in tokens: + if re.search("[a-z]", word) is None: + pron = [word] + + elif word in self.homograph2features: # Check homograph + pron1, pron2, pos1 = self.homograph2features[word] + if pos.startswith(pos1): + pron = pron1 + else: + pron = pron2 + elif word in self.cmu: # lookup CMU dict + pron = self.cmu[word][0] + else: # predict for oov + pron = self.predict(word) + + prons.extend(pron) + prons.extend([" "]) + + return prons[:-1] if __name__ == '__main__': texts = ["I have $250 in my pocket.", # number -> spell-out "popular pets, e.g. cats and dogs", # e.g. -> for example "I refuse to collect the refuse around here.", # homograph "I'm an activationist."] # newly coined word + g2p = G2p() for text in texts: out = g2p(text) print(out) diff --git a/g2p_en/logdir/checkpoint b/g2p_en/logdir/checkpoint deleted file mode 100644 index 184f6dc..0000000 --- a/g2p_en/logdir/checkpoint +++ /dev/null @@ -1,6 +0,0 @@ -model_checkpoint_path: "model_epoch_14_gs_27956" -all_model_checkpoint_paths: "model_epoch_10_gs_24100" -all_model_checkpoint_paths: "model_epoch_11_gs_25064" -all_model_checkpoint_paths: "model_epoch_12_gs_26028" -all_model_checkpoint_paths: "model_epoch_13_gs_26992" -all_model_checkpoint_paths: "model_epoch_14_gs_27956" diff --git a/g2p_en/logdir/model_epoch_14_gs_27956.data-00000-of-00001 b/g2p_en/logdir/model_epoch_14_gs_27956.data-00000-of-00001 deleted file mode 100644 index d4cde5a..0000000 Binary files a/g2p_en/logdir/model_epoch_14_gs_27956.data-00000-of-00001 and /dev/null differ diff --git a/g2p_en/logdir/model_epoch_14_gs_27956.index b/g2p_en/logdir/model_epoch_14_gs_27956.index deleted file mode 100644 index 8a28f65..0000000 Binary files a/g2p_en/logdir/model_epoch_14_gs_27956.index and /dev/null differ diff --git a/g2p_en/logdir/model_epoch_14_gs_27956.meta b/g2p_en/logdir/model_epoch_14_gs_27956.meta deleted file mode 100644 index a8bbb94..0000000 Binary files a/g2p_en/logdir/model_epoch_14_gs_27956.meta and /dev/null differ diff --git a/g2p_en/train.py b/g2p_en/train.py deleted file mode 100644 index cf19177..0000000 --- a/g2p_en/train.py +++ /dev/null @@ -1,165 +0,0 @@ -# -*- coding: utf-8 -*- -#/usr/bin/python2 -''' -By kyubyong park. kbpark.linguist@gmail.com. -https://www.github.com/kyubyong/g2p -''' -from __future__ import print_function - -#import tqdm -import distance - -import tensorflow as tf -import numpy as np - -from nltk.corpus import cmudict -cmu = cmudict.dict() - -# Hyper parameters -class hp: - batch_size = 128 - lr = 0.0001 - logdir = "logdir" - maxlen = 20 - num_epochs = 15 - hidden_units = 128 - graphemes = ["P", "E", "U"] + list("abcdefghijklmnopqrstuvwxyz") # Padding, EOS, UNK - phonemes = ["", "", 'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0', - 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', - 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', - 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', - 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW', - 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'] - -def load_vocab(): - g2idx = {g: idx for idx, g in enumerate(hp.graphemes)} - idx2g = {idx: g for idx, g in enumerate(hp.graphemes)} - - p2idx = {p: idx for idx, p in enumerate(hp.phonemes)} - idx2p = {idx: p for idx, p in enumerate(hp.phonemes)} - - return g2idx, idx2g, p2idx, idx2p - -g2idx, idx2g, p2idx, idx2p = load_vocab() - -def load_data(mode="train"): - # Vectorize - xs, ys = [], [] # vectorized sentences - for word, prons in cmu.items(): - graphemes = word + "E" # EOS - if len(graphemes) > hp.maxlen: continue - graphemes += "P" * hp.maxlen # Padding - - x = [g2idx.get(g, 2) for g in graphemes[:hp.maxlen]] # 2: - - pron = prons[0] - phonemes = list(pron) + [""] - if len(phonemes) > hp.maxlen: continue - phonemes += [""] * hp.maxlen - y = [p2idx[p] for p in phonemes[:hp.maxlen]] - - xs.append(x) - ys.append(y) - - # Convert to 2d-arrays - X = np.array(xs, np.int32) - Y = np.array(ys, np.int32) - - if mode=="train": - X, Y = X[:-hp.batch_size], Y[:-hp.batch_size] - else: # eval - X, Y = X[-hp.batch_size:], Y[-hp.batch_size:] - - return X, Y - -class Graph(): - '''Builds a model graph''' - def __init__(self): - self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen), name="grapheme") - self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen), name="phoneme") - - # Sequence lengths - self.seqlens = tf.reduce_sum(tf.sign(self.x), -1) - - # Embedding - self.inputs = tf.one_hot(self.x, len(hp.graphemes)) - - # Encoder: BiGRU - cell_fw = tf.nn.rnn_cell.GRUCell(hp.hidden_units) - cell_bw = tf.nn.rnn_cell.GRUCell(hp.hidden_units) - outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, self.inputs, self.seqlens, dtype=tf.float32) - memory = tf.concat(outputs, -1) - - # Decoder : Attentional GRU - decoder_inputs = tf.concat((tf.zeros_like((self.y[:, :1])), self.y[:, :-1]), -1) - decoder_inputs = tf.one_hot(decoder_inputs, len(hp.phonemes)) - attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(hp.hidden_units, memory, self.seqlens) - cell = tf.nn.rnn_cell.GRUCell(hp.hidden_units) - cell_with_attention = tf.contrib.seq2seq.AttentionWrapper(cell, - attention_mechanism, - hp.hidden_units, - alignment_history=True) - outputs, _ = tf.nn.dynamic_rnn(cell_with_attention, decoder_inputs, dtype=tf.float32) # ( N, T', 16) - logits = tf.layers.dense(outputs, len(hp.phonemes)) - self.preds = tf.to_int32(tf.argmax(logits, -1)) - - ## Loss and training - loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=self.y) - self.mean_loss = tf.reduce_mean(loss) - self.global_step = tf.Variable(0, name='global_step', trainable=False) - optimizer = tf.train.AdamOptimizer(hp.lr) - self.train_op = optimizer.minimize(self.mean_loss, global_step=self.global_step) - -if __name__ == '__main__': - # Data loading - X_train, Y_train = load_data(mode="train") - x_val, y_val = load_data(mode="val") - - # Graph loading - g = Graph(); print("Training Graph loaded") - - # Session - sv = tf.train.Supervisor(logdir=hp.logdir, save_model_secs=0) - with sv.managed_session() as sess: - for epoch in range(hp.num_epochs): - # shuffle - ids = np.arange(len(X_train)) - np.random.shuffle(ids) - X_train, Y_train = X_train[ids], Y_train[ids] - - # batch train - #for i in tqdm.tqdm(range(0, len(X_train), hp.batch_size), total=len(X_train) // hp.batch_size): - for i in range(0, len(X_train)): - x_train = X_train[i: i + hp.batch_size] - y_train = Y_train[i: i + hp.batch_size] - _, loss = sess.run([g.train_op, g.mean_loss], {g.x: x_train, g.y: y_train}) - - # Write checkpoint files at every epoch - gs = sess.run(g.global_step) - sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs)) - - # Evaluation - ## Autoregressive inference - preds = np.zeros((hp.batch_size, hp.maxlen), np.int32) - for j in range(hp.maxlen): - _preds = sess.run(g.preds, {g.x: x_val, g.y: preds}) - preds[:, j] = _preds[:, j] - - ## Parsing & Calculation - total, errors = 0, 0 - bugs = 0 - for xx, yy, pred in zip(x_val, y_val, preds): # sample-wise - inputs = "".join(idx2g[g] for g in xx).split("E")[0] - expected = " ".join(idx2p[p] for p in yy).split("")[0].strip() - got = " ".join(idx2p[p] for p in pred).split("")[0].strip() - - print("* Input : {}".format(inputs)) - print("* Expected: {}".format(expected)) - print("* Got : {}".format(got)) - - error = distance.levenshtein(expected.split(), got.split()) - errors += error - total += len(expected.split()) - - cer = errors / float(total) - print("epoch: %02d, training loss: %02f, CER: %02f\n" % (epoch+1, loss, cer)) diff --git a/setup.py b/setup.py index 6bf7c2f..c765f59 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name = 'g2p_en', packages = ['g2p_en'], # this must be the same as the name above - version = '1.0.0', + version = '2.0.0', description = 'A Simple Python Module for English Grapheme To Phoneme Conversion', long_description=long_description, author = 'Kyubyong Park & Jongseok Kim', @@ -23,11 +23,9 @@ classifiers = [], install_requires = [ 'numpy>=1.13.1', - 'tensorflow >= 1.3.0', 'nltk>=3.2.4', 'inflect>=0.3.1', 'distance>=0.1.3', - 'future>=0.16.0' ], license='Apache Software License', include_package_data=True