diff --git a/.idea/deployment.xml b/.idea/deployment.xml
new file mode 100644
index 0000000..c982474
--- /dev/null
+++ b/.idea/deployment.xml
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 0000000..de18bf0
--- /dev/null
+++ b/.idea/workspace.xml
@@ -0,0 +1,518 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ sys
+ word_tok
+ pos_tag
+ print
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+ DEFINITION_ORDER
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ project
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1526441288919
+
+
+ 1526441288919
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/MANIFEST.in b/MANIFEST.in
index 843ef4a..7db390c 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,2 +1,2 @@
include g2p_en/homographs.en
-include g2p_en/logdir/*
+include g2p_en/checkpoint20.npz
\ No newline at end of file
diff --git a/README.md b/README.md
index ca325ee..443586b 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,9 @@
-# g2p_en: A Simple Python Module for English Grapheme To Phoneme Conversion
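+[![PyPI version](https://img.shields.io/pypi/v/g2p-en.svg)](https://pypi.org/project/g2p-en/)
+[![License](https://img.shields.io/pypi/l/g2p-en.svg)](https://pypi.org/project/g2p-en/)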
+
+# g2pE: A Simple Python Module for English Grapheme To Phoneme Conversion
+
+* [v.2.0] We removed TensorFlow from the dependencies. After all, it changes its APIs quite often, and we don't expect you to have a GPU. Instead, NumPy is used for inference.
This module is designed to convert English graphemes (spelling) to phonemes (pronunciation).
It is considered essential in several tasks such as speech synthesis.
@@ -20,23 +25,21 @@ In this project, we employ a deep learning seq2seq framework based on TensorFlow
## Algorithm
1. Spells out arabic numbers and some currency symbols. (e.g. $200 -> two hundred dollars) (This is borrowed from [Keith Ito's code](https://github.com/keithito/tacotron/blob/master/text/numbers.py))
-2. Attempts to retrieve the correct pronunciation for homographs based on their POS)
+2. Attempts to retrieve the correct pronunciation for heteronyms based on their POS.
3. Looks up [The CMU Pronouncing Dictionary](http://www.speech.cs.cmu.edu/cgi-bin/cmudict) for non-homographs.
4. For OOVs, we predict their pronunciations using our neural net model.
## Environment
-* python 2.x or 3.x
+* python 3.x
## Dependencies
* numpy >= 1.13.1
-* tensorflow >= 1.3.0
* nltk >= 3.2.4
* python -m nltk.downloader "averaged_perceptron_tagger" "cmudict"
* inflect >= 0.3.1
* Distance >= 0.1.3
-* future >= 0.16.0
## Installation
@@ -47,28 +50,41 @@ OR
nltk package will be automatically downloaded at your first run.
-## Training (Note that pretrained model is already included)
-
- python train.py
## Usage
- from g2p_en import g2p
-
- text = "I refuse to collect the refuse around here."
- print(g2p(text))
- >>>[u'AY1', ' ', u'R', u'IH0', u'F', u'Y', u'UW1', u'Z', ' ', u'T', u'UW1', ' ', u'K', u'AH0', u'L', u'EH1', u'K', u'T', ' ', u'DH', u'AH0', ' ', u'R', u'EH1', u'F', u'Y', u'UW2', u'Z', ' ', u'ER0', u'AW1', u'N', u'D', ' ', u'HH', u'EH1', u'R']
-
- text = "I am an activationist."
- print(g2p(text))
- >>>[u'AY1', u'M', ' ', u'AE1', u'N', ' ', u'AE2', u'K', u'T', u'AH0', u'V', u'EY1', u'SH', u'AH0', u'N', u'IH0', u'S', u'T']
-
-If you need to convert lots of texts, you can use the global tf session.
-
- import g2p_en as g2p
-
- with g2p.Session():
- phs = [g2p.g2p(text) for text in texts]
+ from g2p_en import G2p
+
+ texts = ["I have $250 in my pocket.", # number -> spell-out
+ "popular pets, e.g. cats and dogs", # e.g. -> for example
+ "I refuse to collect the refuse around here.", # homograph
+ "I'm an activationist."] # newly coined word
+ g2p = G2p()
+ for text in texts:
+ out = g2p(text)
+ print(out)
+ >>> ['AY1', ' ', 'HH', 'AE1', 'V', ' ', 'T', 'UW1', ' ', 'HH', 'AH1', 'N', 'D', 'R', 'AH0', 'D', ' ', 'F', 'IH1', 'F', 'T', 'IY0', ' ', 'D', 'AA1', 'L', 'ER0', 'Z', ' ', 'IH0', 'N', ' ', 'M', 'AY1', ' ', 'P', 'AA1', 'K', 'AH0', 'T', ' ', '.']
+ >>> ['P', 'AA1', 'P', 'Y', 'AH0', 'L', 'ER0', ' ', 'P', 'EH1', 'T', 'S', ' ', ',', ' ', 'F', 'AO1', 'R', ' ', 'IH0', 'G', 'Z', 'AE1', 'M', 'P', 'AH0', 'L', ' ', 'K', 'AE1', 'T', 'S', ' ', 'AH0', 'N', 'D', ' ', 'D', 'AA1', 'G', 'Z']
+ >>> ['AY1', ' ', 'R', 'IH0', 'F', 'Y', 'UW1', 'Z', ' ', 'T', 'UW1', ' ', 'K', 'AH0', 'L', 'EH1', 'K', 'T', ' ', 'DH', 'AH0', ' ', 'R', 'EH1', 'F', 'Y', 'UW2', 'Z', ' ', 'ER0', 'AW1', 'N', 'D', ' ', 'HH', 'IY1', 'R', ' ', '.']
+ >>> ['AY1', ' ', 'AH0', 'M', ' ', 'AE1', 'N', ' ', 'AE2', 'K', 'T', 'IH0', 'V', 'EY1', 'SH', 'AH0', 'N', 'IH0', 'S', 'T', ' ', '.']
+
+## References
+
+If you use this code for research, please cite:
+
+```
+@misc{g2pE2019,
+ author = {Park, Kyubyong and Kim, Jongseok},
+ title = {g2pE},
+ year = {2019},
+ publisher = {GitHub},
+ journal = {GitHub repository},
+ howpublished = {\url{https://github.com/Kyubyong/g2p}}
+}
+```
+
+## Cited in
+* [Learning pronunciation from a foreign language in speech synthesis networks](https://arxiv.org/abs/1811.09364)
May, 2018.
diff --git a/README.rst b/README.rst
index d228fd2..c2d414d 100644
--- a/README.rst
+++ b/README.rst
@@ -1,6 +1,8 @@
g2p\_en: A Simple Python Module for English Grapheme To Phoneme Conversion
==========================================================================
+[Update] We removed TensorFlow from the dependencies. After all, it changes its APIs quite often, and we don't expect you to have a GPU. Instead, NumPy is used for inference.
+
This module is designed to convert English graphemes (spelling) to
phonemes (pronunciation). It is considered essential in several tasks
such as speech synthesis. Unlike many languages like Spanish or German
@@ -42,18 +44,16 @@ Algorithm
Environment
-----------
-- python 2.x or 3.x
+- python 3.x
Dependencies
------------
- numpy >= 1.13.1
-- tensorflow >= 1.3.0
- nltk >= 3.2.4
- python -m nltk.downloader "averaged\_perceptron\_tagger" "cmudict"
- inflect >= 0.3.1
- Distance >= 0.1.3
-- future >= 0.16.0
Installation
------------
@@ -70,36 +70,27 @@ OR
nltk package will be automatically downloaded at your first run.
-Training (Note that pretrained model is already included)
----------------------------------------------------------
-
-::
-
- python train.py
Usage
-----
::
- from g2p_en import g2p
-
- text = "I refuse to collect the refuse around here."
- print(g2p(text))
- >>>[u'AY1', ' ', u'R', u'IH0', u'F', u'Y', u'UW1', u'Z', ' ', u'T', u'UW1', ' ', u'K', u'AH0', u'L', u'EH1', u'K', u'T', ' ', u'DH', u'AH0', ' ', u'R', u'EH1', u'F', u'Y', u'UW2', u'Z', ' ', u'ER0', u'AW1', u'N', u'D', ' ', u'HH', u'EH1', u'R']
-
- text = "I am an activationist."
- print(g2p(text))
- >>>[u'AY1', u'M', ' ', u'AE1', u'N', ' ', u'AE2', u'K', u'T', u'AH0', u'V', u'EY1', u'SH', u'AH0', u'N', u'IH0', u'S', u'T']
-
-If you need to convert lots of texts, you can use the global tf session.
-
-::
-
- import g2p_en as g2p
+ from g2p_en import G2p
+
+ texts = ["I have $250 in my pocket.", # number -> spell-out
+ "popular pets, e.g. cats and dogs", # e.g. -> for example
+ "I refuse to collect the refuse around here.", # homograph
+ "I'm an activationist."] # newly coined word
+ g2p = G2p()
+ for text in texts:
+ out = g2p(text)
+ print(out)
+ >>> ['AY1', ' ', 'HH', 'AE1', 'V', ' ', 'T', 'UW1', ' ', 'HH', 'AH1', 'N', 'D', 'R', 'AH0', 'D', ' ', 'F', 'IH1', 'F', 'T', 'IY0', ' ', 'D', 'AA1', 'L', 'ER0', 'Z', ' ', 'IH0', 'N', ' ', 'M', 'AY1', ' ', 'P', 'AA1', 'K', 'AH0', 'T', ' ', '.']
+ >>> ['P', 'AA1', 'P', 'Y', 'AH0', 'L', 'ER0', ' ', 'P', 'EH1', 'T', 'S', ' ', ',', ' ', 'F', 'AO1', 'R', ' ', 'IH0', 'G', 'Z', 'AE1', 'M', 'P', 'AH0', 'L', ' ', 'K', 'AE1', 'T', 'S', ' ', 'AH0', 'N', 'D', ' ', 'D', 'AA1', 'G', 'Z']
+ >>> ['AY1', ' ', 'R', 'IH0', 'F', 'Y', 'UW1', 'Z', ' ', 'T', 'UW1', ' ', 'K', 'AH0', 'L', 'EH1', 'K', 'T', ' ', 'DH', 'AH0', ' ', 'R', 'EH1', 'F', 'Y', 'UW2', 'Z', ' ', 'ER0', 'AW1', 'N', 'D', ' ', 'HH', 'IY1', 'R', ' ', '.']
+ >>> ['AY1', ' ', 'AH0', 'M', ' ', 'AE1', 'N', ' ', 'AE2', 'K', 'T', 'IH0', 'V', 'EY1', 'SH', 'AH0', 'N', 'IH0', 'S', 'T', ' ', '.']
- with g2p.Session():
- phs = [g2p.g2p(text) for text in texts]
May, 2018.
diff --git a/g2p.pdf b/g2p.pdf
new file mode 100644
index 0000000..c1ef41f
Binary files /dev/null and b/g2p.pdf differ
diff --git a/g2p_en.egg-info/PKG-INFO b/g2p_en.egg-info/PKG-INFO
deleted file mode 100644
index e331e5d..0000000
--- a/g2p_en.egg-info/PKG-INFO
+++ /dev/null
@@ -1,119 +0,0 @@
-Metadata-Version: 1.1
-Name: g2p-en
-Version: 1.0.0
-Summary: A Simple Python Module for English Grapheme To Phoneme Conversion
-Home-page: https://github.com/Kyubyong/g2p
-Author: Kyubyong Park & Jongseok Kim
-Author-email: kbpark.linguist@gmail.com
-License: Apache Software License
-Download-URL: https://github.com/Kyubyong/g2p/archive/1.0.0.tar.gz
-Description-Content-Type: UNKNOWN
-Description: g2p\_en: A Simple Python Module for English Grapheme To Phoneme Conversion
- ==========================================================================
-
- This module is designed to convert English graphemes (spelling) to
- phonemes (pronunciation). It is considered essential in several tasks
- such as speech synthesis. Unlike many languages like Spanish or German
- where pronunciation of a word can be inferred from its spelling, English
- words are often far from people's expectations. Therefore, it will be
- the best idea to consult a dictionary if we want to know the
- pronunciation of some word. However, there are at least two tentative
- issues in this approach. First, you can't disambiguate the pronunciation
- of homographs, words which have multiple pronunciations. (See ``a``
- below.) Second, you can't check if the word is not in the dictionary.
- (See ``b`` below.)
-
- -
-
- \a. I refuse to collect the refuse around here. (rɪ\|fju:z as verb vs. \|refju:s as noun)
-
- -
- \b. I am an activationist. (activationist: newly coined word which means ``n. A person who designs and implements programs of treatment or therapy that use recreation and activities to help people whose functional abilities are affected by illness or disability.`` from `WORD SPY `__
-
- For the first homograph issue, fortunately many homographs can be
- disambiguated using their part-of-speech, if not all. When it comes to
- the words not in the dictionary, however, we should make our best guess
- using our knowledge. In this project, we employ a deep learning seq2seq
- framework based on TensorFlow.
-
- Algorithm
- ---------
-
- 1. Spells out arabic numbers and some currency symbols. (e.g. $200 ->
- two hundred dollars) (This is borrowed from `Keith Ito's
- code `__)
- 2. Attempts to retrieve the correct pronunciation for homographs based
- on their POS)
- 3. Looks up `The CMU Pronouncing
- Dictionary `__ for
- non-homographs.
- 4. For OOVs, we predict their pronunciations using our neural net model.
-
- Environment
- -----------
-
- - python 2.x or 3.x
-
- Dependencies
- ------------
-
- - numpy >= 1.13.1
- - tensorflow >= 1.3.0
- - nltk >= 3.2.4
- - python -m nltk.downloader "averaged\_perceptron\_tagger" "cmudict"
- - inflect >= 0.3.1
- - Distance >= 0.1.3
- - future >= 0.16.0
-
- Installation
- ------------
-
- ::
-
- pip install g2p_en
-
- OR
-
- ::
-
- python setup.py install
-
- nltk package will be automatically downloaded at your first run.
-
- Training (Note that pretrained model is already included)
- ---------------------------------------------------------
-
- ::
-
- python train.py
-
- Usage
- -----
-
- ::
-
- from g2p_en import g2p
-
- text = "I refuse to collect the refuse around here."
- print(g2p(text))
- >>>[u'AY1', ' ', u'R', u'IH0', u'F', u'Y', u'UW1', u'Z', ' ', u'T', u'UW1', ' ', u'K', u'AH0', u'L', u'EH1', u'K', u'T', ' ', u'DH', u'AH0', ' ', u'R', u'EH1', u'F', u'Y', u'UW2', u'Z', ' ', u'ER0', u'AW1', u'N', u'D', ' ', u'HH', u'EH1', u'R']
-
- text = "I am an activationist."
- print(g2p(text))
- >>>[u'AY1', u'M', ' ', u'AE1', u'N', ' ', u'AE2', u'K', u'T', u'AH0', u'V', u'EY1', u'SH', u'AH0', u'N', u'IH0', u'S', u'T']
-
- If you need to convert lots of texts, you can use the global tf session.
-
- ::
-
- import g2p_en as g2p
-
- with g2p.Session():
- phs = [g2p.g2p(text) for text in texts]
-
- May, 2018.
-
- Kyubyong Park & `Jongseok Kim `__
-
-Keywords: g2p,g2p_en
-Platform: UNKNOWN
diff --git a/g2p_en.egg-info/SOURCES.txt b/g2p_en.egg-info/SOURCES.txt
deleted file mode 100644
index a10c2f8..0000000
--- a/g2p_en.egg-info/SOURCES.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-MANIFEST.in
-README.md
-README.rst
-setup.cfg
-setup.py
-g2p_en/__init__.py
-g2p_en/expand.py
-g2p_en/g2p.py
-g2p_en/homographs.en
-g2p_en/train.py
-g2p_en.egg-info/PKG-INFO
-g2p_en.egg-info/SOURCES.txt
-g2p_en.egg-info/dependency_links.txt
-g2p_en.egg-info/requires.txt
-g2p_en.egg-info/top_level.txt
-g2p_en/logdir/checkpoint
-g2p_en/logdir/model_epoch_14_gs_27956.data-00000-of-00001
-g2p_en/logdir/model_epoch_14_gs_27956.index
-g2p_en/logdir/model_epoch_14_gs_27956.meta
\ No newline at end of file
diff --git a/g2p_en.egg-info/dependency_links.txt b/g2p_en.egg-info/dependency_links.txt
deleted file mode 100644
index 8b13789..0000000
--- a/g2p_en.egg-info/dependency_links.txt
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/g2p_en.egg-info/requires.txt b/g2p_en.egg-info/requires.txt
deleted file mode 100644
index 33d6eca..0000000
--- a/g2p_en.egg-info/requires.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-numpy>=1.13.1
-tensorflow>=1.3.0
-nltk>=3.2.4
-inflect>=0.3.1
-distance>=0.1.3
-future>=0.16.0
diff --git a/g2p_en.egg-info/top_level.txt b/g2p_en.egg-info/top_level.txt
deleted file mode 100644
index 19ccc74..0000000
--- a/g2p_en.egg-info/top_level.txt
+++ /dev/null
@@ -1 +0,0 @@
-g2p_en
diff --git a/g2p_en/__init__.py b/g2p_en/__init__.py
index 9ef46b8..4d9ce97 100644
--- a/g2p_en/__init__.py
+++ b/g2p_en/__init__.py
@@ -1 +1 @@
-from g2p import g2p, Session
+from .g2p import G2p
diff --git a/g2p_en/checkpoint20.npz b/g2p_en/checkpoint20.npz
new file mode 100644
index 0000000..b0722a8
Binary files /dev/null and b/g2p_en/checkpoint20.npz differ
diff --git a/g2p_en/expand.py b/g2p_en/expand.py
index 3aae3d5..d6a9592 100644
--- a/g2p_en/expand.py
+++ b/g2p_en/expand.py
@@ -12,8 +12,6 @@
-
-
_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
diff --git a/g2p_en/g2p.py b/g2p_en/g2p.py
index 100d96b..8b37659 100644
--- a/g2p_en/g2p.py
+++ b/g2p_en/g2p.py
@@ -1,16 +1,22 @@
# -*- coding: utf-8 -*-
-# /usr/bin/python2
+# /usr/bin/python
'''
-By kyubyong park. kbpark.linguist@gmail.com.
+By Kyubyong Park (kbpark.linguist@gmail.com) and Jongseok Kim (https://github.com/ozmig77)
https://www.github.com/kyubyong/g2p
'''
-from __future__ import print_function
-
-import tensorflow as tf
-
from nltk import pos_tag
from nltk.corpus import cmudict
import nltk
+from nltk.tokenize import TweetTokenizer
+word_tokenize = TweetTokenizer().tokenize
+import numpy as np
+import codecs
+import re
+import os
+import unicodedata
+from builtins import str as unicode
+from .expand import normalize_numbers
+
try:
nltk.data.find('taggers/averaged_perceptron_tagger.zip')
except LookupError:
@@ -20,173 +26,168 @@
except LookupError:
nltk.download('cmudict')
-from train import Graph, hp, load_vocab
-import numpy as np
-import codecs
-import re
-import os
-import unicodedata
-from expand import normalize_numbers
-from builtins import str as unicode
-
dirname = os.path.dirname(__file__)
-cmu = cmudict.dict()
-
-# Load vocab
-g2idx, idx2g, p2idx, idx2p = load_vocab()
-
-# Load Graph
-g = tf.Graph()
-with g.as_default():
- with tf.device('/cpu:0'):
- graph = Graph(); print("Graph loaded for g2p")
- saver = tf.train.Saver()
-config = tf.ConfigProto(
- device_count={'GPU' : 0},
- gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.0001)
- )
-
-g_sess = None # global session
-class Session: # make/remove global session
- def __enter__(self):
- global g_sess
- if g_sess != None:
- raise Exception('Session already exist in g2p')
- g_sess = tf.Session(graph=g, config=config)
- saver.restore(g_sess, tf.train.latest_checkpoint(os.path.join(dirname,hp.logdir)))
-
- def __exit__(self, exc_type, exc_val, exc_tb):
- global g_sess
- g_sess.close()
- g_sess = None
-
-
-def predict(words, sess):
- '''
- Returns predicted pronunciation of `words` which do NOT exist in the dictionary.
- :param words: A list of words.
- :return: pron: A list of phonemes
- '''
- if len(words) > hp.batch_size:
- after = predict(words[hp.batch_size:], sess)
- words = words[:hp.batch_size]
- else:
- after = []
- x = np.zeros((len(words), hp.maxlen), np.int32) # 0:
- for i, w in enumerate(words):
- for j, g in enumerate((w + "E")[:hp.maxlen]):
- x[i][j] = g2idx.get(g, 2) # 2:
-
- ## Autoregressive inference
- preds = np.zeros((len(x), hp.maxlen), np.int32)
- for j in range(hp.maxlen):
- _preds = sess.run(graph.preds, {graph.x: x, graph.y: preds})
- preds[:, j] = _preds[:, j]
-
- # convert to string
- pron = []
- for i in range(len(preds)):
- p = [u"%s" % unicode(idx2p[idx]) for idx in preds[i]] # Make p into unicode.
- if "" in p:
- eos = p.index("")
- p = p[:eos]
- pron.append(p)
-
- return pron + after
-
-# Construct homograph dictionary
-f = os.path.join(dirname,'homographs.en')
-homograph2features = dict()
-for line in codecs.open(f, 'r', 'utf8').read().splitlines():
- if line.startswith("#"): continue # comment
- headword, pron1, pron2, pos1 = line.strip().split("|")
- homograph2features[headword.lower()] = (pron1.split(), pron2.split(), pos1)
-
-def token2pron(token):
- '''
- Returns pronunciation of word based on its pos.
- :param token: A tuple of (word, pos)
- :return: A list of phonemes. If word is not in the dictionary, [] is returned.
- '''
- word, pos = token
-
- if re.search("[a-z]", word) is None:
- pron = [word]
-
- elif word in homograph2features: # Check homograph
- pron1, pron2, pos1 = homograph2features[word]
- if pos.startswith(pos1):
- pron = pron1
- else:
- pron = pron2
- elif word in cmu: # CMU dict
- pron = cmu[word][0]
- else:
- return []
-
- return pron
-
-def tokenize(text):
- '''
- Splits text into `tokens`.
- :param text: A string.
- :return: A list of tokens (string).
- '''
- text = re.sub('([.,?!]( |$))', r' \1', text)
- return text.split()
-
-def g2p(text):
- '''
- Returns the pronunciation of text.
- :param text: A string. A sequence of words.
- :return: A list of phonemes.
- '''
- # normalization
- text = unicode(text)
- text = normalize_numbers(text)
- text = ''.join(char for char in unicodedata.normalize('NFD', text)
- if unicodedata.category(char) != 'Mn') # Strip accents
- text = text.lower()
- text = re.sub("[^ a-z'.,?!\-]", "", text)
- text = text.replace("i.e.", "that is")
- text = text.replace("e.g.", "for example")
-
- # tokenization
- words = tokenize(text)
- tokens = pos_tag(words) # tuples of (word, tag)
-
- # g2p
- oovs, u_loc = [], []
- ret = []
- for token in tokens:
- pron = token2pron(token) # list of phonemes
- if pron == []: # oov
- oovs.append(token[0])
- u_loc.append(len(ret))
- ret.extend(pron)
- ret.extend([" "])
-
- if len(oovs)>0:
- global g_sess
- if g_sess is not None: # check global session
- prons = predict(oovs, g_sess)
- for i in range(len(oovs)-1,-1,-1):
- ret = ret[:u_loc[i]]+prons[i]+ret[u_loc[i]:]
- else: # If global session is not defined, make new one as local.
- with tf.Session(graph=g, config=config) as sess:
- saver.restore(sess, tf.train.latest_checkpoint(os.path.join(dirname, hp.logdir)))
- prons = predict(oovs, sess)
- for i in range(len(oovs)-1,-1,-1):
- ret = ret[:u_loc[i]]+prons[i]+ret[u_loc[i]:]
- return ret[:-1]
-
+def construct_homograph_dictionary():
+ f = os.path.join(dirname,'homographs.en')
+ homograph2features = dict()
+ for line in codecs.open(f, 'r', 'utf8').read().splitlines():
+ if line.startswith("#"): continue # comment
+ headword, pron1, pron2, pos1 = line.strip().split("|")
+ homograph2features[headword.lower()] = (pron1.split(), pron2.split(), pos1)
+ return homograph2features
+
+# def segment(text):
+# '''
+# Splits text into `tokens`.
+# :param text: A string.
+# :return: A list of tokens (string).
+# '''
+# print(text)
+# text = re.sub('([.,?!]( |$))', r' \1', text)
+# print(text)
+# return text.split()
+
+class G2p(object):
+ def __init__(self):
+ super().__init__()
+ self.graphemes = ["", "", ""] + list("abcdefghijklmnopqrstuvwxyz")
+ self.phonemes = ["", "", "", ""] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
+ 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH',
+ 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
+ 'EY2', 'F', 'G', 'HH',
+ 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L',
+ 'M', 'N', 'NG', 'OW0', 'OW1',
+ 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH',
+ 'UH0', 'UH1', 'UH2', 'UW',
+ 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH']
+ self.g2idx = {g: idx for idx, g in enumerate(self.graphemes)}
+ self.idx2g = {idx: g for idx, g in enumerate(self.graphemes)}
+
+ self.p2idx = {p: idx for idx, p in enumerate(self.phonemes)}
+ self.idx2p = {idx: p for idx, p in enumerate(self.phonemes)}
+
+ self.cmu = cmudict.dict()
+ self.load_variables()
+ self.homograph2features = construct_homograph_dictionary()
+
+ def load_variables(self):
+ self.variables = np.load(os.path.join(dirname,'checkpoint20.npz'))
+ self.enc_emb = self.variables["enc_emb"] # (29, 64). (len(graphemes), emb)
+ self.enc_w_ih = self.variables["enc_w_ih"] # (3*128, 64)
+ self.enc_w_hh = self.variables["enc_w_hh"] # (3*128, 128)
+ self.enc_b_ih = self.variables["enc_b_ih"] # (3*128,)
+ self.enc_b_hh = self.variables["enc_b_hh"] # (3*128,)
+
+ self.dec_emb = self.variables["dec_emb"] # (74, 64). (len(phonemes), emb)
+ self.dec_w_ih = self.variables["dec_w_ih"] # (3*128, 64)
+ self.dec_w_hh = self.variables["dec_w_hh"] # (3*128, 128)
+ self.dec_b_ih = self.variables["dec_b_ih"] # (3*128,)
+ self.dec_b_hh = self.variables["dec_b_hh"] # (3*128,)
+ self.fc_w = self.variables["fc_w"] # (74, 128)
+ self.fc_b = self.variables["fc_b"] # (74,)
+
+ def sigmoid(self, x):
+ return 1 / (1 + np.exp(-x))
+
+ def grucell(self, x, h, w_ih, w_hh, b_ih, b_hh):
+ rzn_ih = np.matmul(x, w_ih.T) + b_ih
+ rzn_hh = np.matmul(h, w_hh.T) + b_hh
+
+ rz_ih, n_ih = rzn_ih[:, :rzn_ih.shape[-1] * 2 // 3], rzn_ih[:, rzn_ih.shape[-1] * 2 // 3:]
+ rz_hh, n_hh = rzn_hh[:, :rzn_hh.shape[-1] * 2 // 3], rzn_hh[:, rzn_hh.shape[-1] * 2 // 3:]
+
+ rz = self.sigmoid(rz_ih + rz_hh)
+ r, z = np.split(rz, 2, -1)
+
+ n = np.tanh(n_ih + r * n_hh)
+ h = (1 - z) * n + z * h
+
+ return h
+
+ def gru(self, x, steps, w_ih, w_hh, b_ih, b_hh, h0=None):
+ if h0 is None:
+ h0 = np.zeros((x.shape[0], w_hh.shape[1]), np.float32)
+ h = h0 # initial hidden state
+ outputs = np.zeros((x.shape[0], steps, w_hh.shape[1]), np.float32)
+ for t in range(steps):
+ h = self.grucell(x[:, t, :], h, w_ih, w_hh, b_ih, b_hh) # (b, h)
+ outputs[:, t, ::] = h
+ return outputs
+
+ def encode(self, word):
+ chars = list(word) + [""]
+ x = [self.g2idx.get(char, self.g2idx[""]) for char in chars]
+ x = np.take(self.enc_emb, np.expand_dims(x, 0), axis=0)
+
+ return x
+
+ def predict(self, word):
+ # encoder
+ enc = self.encode(word)
+ enc = self.gru(enc, len(word) + 1, self.enc_w_ih, self.enc_w_hh,
+ self.enc_b_ih, self.enc_b_hh, h0=np.zeros((1, self.enc_w_hh.shape[-1]), np.float32))
+ last_hidden = enc[:, -1, :]
+
+ # decoder
+ dec = np.take(self.dec_emb, [2], axis=0) # 2: index of the start-of-sequence token fed as the first decoder input
+ h = last_hidden
+
+ preds = []
+ for i in range(20):
+ h = self.grucell(dec, h, self.dec_w_ih, self.dec_w_hh, self.dec_b_ih, self.dec_b_hh) # (b, h)
+ logits = np.matmul(h, self.fc_w.T) + self.fc_b
+ pred = logits.argmax()
+ if pred == 3: break # 3: index of the end-of-sequence token; stop decoding
+ preds.append(pred)
+ dec = np.take(self.dec_emb, [pred], axis=0)
+
+ preds = [self.idx2p.get(idx, "") for idx in preds]
+ return preds
+
+ def __call__(self, text):
+ # preprocessing
+ text = unicode(text)
+ text = normalize_numbers(text)
+ text = ''.join(char for char in unicodedata.normalize('NFD', text)
+ if unicodedata.category(char) != 'Mn') # Strip accents
+ text = text.lower()
+ text = re.sub("[^ a-z'.,?!\-]", "", text)
+ text = text.replace("i.e.", "that is")
+ text = text.replace("e.g.", "for example")
+
+ # tokenization
+ words = word_tokenize(text)
+ tokens = pos_tag(words) # tuples of (word, tag)
+
+ # steps
+ prons = []
+ for word, pos in tokens:
+ if re.search("[a-z]", word) is None:
+ pron = [word]
+
+ elif word in self.homograph2features: # Check homograph
+ pron1, pron2, pos1 = self.homograph2features[word]
+ if pos.startswith(pos1):
+ pron = pron1
+ else:
+ pron = pron2
+ elif word in self.cmu: # lookup CMU dict
+ pron = self.cmu[word][0]
+ else: # predict for oov
+ pron = self.predict(word)
+
+ prons.extend(pron)
+ prons.extend([" "])
+
+ return prons[:-1]
if __name__ == '__main__':
texts = ["I have $250 in my pocket.", # number -> spell-out
"popular pets, e.g. cats and dogs", # e.g. -> for example
"I refuse to collect the refuse around here.", # homograph
"I'm an activationist."] # newly coined word
+ g2p = G2p()
for text in texts:
out = g2p(text)
print(out)
diff --git a/g2p_en/logdir/checkpoint b/g2p_en/logdir/checkpoint
deleted file mode 100644
index 184f6dc..0000000
--- a/g2p_en/logdir/checkpoint
+++ /dev/null
@@ -1,6 +0,0 @@
-model_checkpoint_path: "model_epoch_14_gs_27956"
-all_model_checkpoint_paths: "model_epoch_10_gs_24100"
-all_model_checkpoint_paths: "model_epoch_11_gs_25064"
-all_model_checkpoint_paths: "model_epoch_12_gs_26028"
-all_model_checkpoint_paths: "model_epoch_13_gs_26992"
-all_model_checkpoint_paths: "model_epoch_14_gs_27956"
diff --git a/g2p_en/logdir/model_epoch_14_gs_27956.data-00000-of-00001 b/g2p_en/logdir/model_epoch_14_gs_27956.data-00000-of-00001
deleted file mode 100644
index d4cde5a..0000000
Binary files a/g2p_en/logdir/model_epoch_14_gs_27956.data-00000-of-00001 and /dev/null differ
diff --git a/g2p_en/logdir/model_epoch_14_gs_27956.index b/g2p_en/logdir/model_epoch_14_gs_27956.index
deleted file mode 100644
index 8a28f65..0000000
Binary files a/g2p_en/logdir/model_epoch_14_gs_27956.index and /dev/null differ
diff --git a/g2p_en/logdir/model_epoch_14_gs_27956.meta b/g2p_en/logdir/model_epoch_14_gs_27956.meta
deleted file mode 100644
index a8bbb94..0000000
Binary files a/g2p_en/logdir/model_epoch_14_gs_27956.meta and /dev/null differ
diff --git a/g2p_en/train.py b/g2p_en/train.py
deleted file mode 100644
index cf19177..0000000
--- a/g2p_en/train.py
+++ /dev/null
@@ -1,165 +0,0 @@
-# -*- coding: utf-8 -*-
-#/usr/bin/python2
-'''
-By kyubyong park. kbpark.linguist@gmail.com.
-https://www.github.com/kyubyong/g2p
-'''
-from __future__ import print_function
-
-#import tqdm
-import distance
-
-import tensorflow as tf
-import numpy as np
-
-from nltk.corpus import cmudict
-cmu = cmudict.dict()
-
-# Hyper parameters
-class hp:
- batch_size = 128
- lr = 0.0001
- logdir = "logdir"
- maxlen = 20
- num_epochs = 15
- hidden_units = 128
- graphemes = ["P", "E", "U"] + list("abcdefghijklmnopqrstuvwxyz") # Padding, EOS, UNK
- phonemes = ["", "", 'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
- 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH',
- 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH',
- 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1',
- 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW',
- 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH']
-
-def load_vocab():
- g2idx = {g: idx for idx, g in enumerate(hp.graphemes)}
- idx2g = {idx: g for idx, g in enumerate(hp.graphemes)}
-
- p2idx = {p: idx for idx, p in enumerate(hp.phonemes)}
- idx2p = {idx: p for idx, p in enumerate(hp.phonemes)}
-
- return g2idx, idx2g, p2idx, idx2p
-
-g2idx, idx2g, p2idx, idx2p = load_vocab()
-
-def load_data(mode="train"):
- # Vectorize
- xs, ys = [], [] # vectorized sentences
- for word, prons in cmu.items():
- graphemes = word + "E" # EOS
- if len(graphemes) > hp.maxlen: continue
- graphemes += "P" * hp.maxlen # Padding
-
- x = [g2idx.get(g, 2) for g in graphemes[:hp.maxlen]] # 2:
-
- pron = prons[0]
- phonemes = list(pron) + [""]
- if len(phonemes) > hp.maxlen: continue
- phonemes += [""] * hp.maxlen
- y = [p2idx[p] for p in phonemes[:hp.maxlen]]
-
- xs.append(x)
- ys.append(y)
-
- # Convert to 2d-arrays
- X = np.array(xs, np.int32)
- Y = np.array(ys, np.int32)
-
- if mode=="train":
- X, Y = X[:-hp.batch_size], Y[:-hp.batch_size]
- else: # eval
- X, Y = X[-hp.batch_size:], Y[-hp.batch_size:]
-
- return X, Y
-
-class Graph():
- '''Builds a model graph'''
- def __init__(self):
- self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen), name="grapheme")
- self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen), name="phoneme")
-
- # Sequence lengths
- self.seqlens = tf.reduce_sum(tf.sign(self.x), -1)
-
- # Embedding
- self.inputs = tf.one_hot(self.x, len(hp.graphemes))
-
- # Encoder: BiGRU
- cell_fw = tf.nn.rnn_cell.GRUCell(hp.hidden_units)
- cell_bw = tf.nn.rnn_cell.GRUCell(hp.hidden_units)
- outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, self.inputs, self.seqlens, dtype=tf.float32)
- memory = tf.concat(outputs, -1)
-
- # Decoder : Attentional GRU
- decoder_inputs = tf.concat((tf.zeros_like((self.y[:, :1])), self.y[:, :-1]), -1)
- decoder_inputs = tf.one_hot(decoder_inputs, len(hp.phonemes))
- attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(hp.hidden_units, memory, self.seqlens)
- cell = tf.nn.rnn_cell.GRUCell(hp.hidden_units)
- cell_with_attention = tf.contrib.seq2seq.AttentionWrapper(cell,
- attention_mechanism,
- hp.hidden_units,
- alignment_history=True)
- outputs, _ = tf.nn.dynamic_rnn(cell_with_attention, decoder_inputs, dtype=tf.float32) # ( N, T', 16)
- logits = tf.layers.dense(outputs, len(hp.phonemes))
- self.preds = tf.to_int32(tf.argmax(logits, -1))
-
- ## Loss and training
- loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=self.y)
- self.mean_loss = tf.reduce_mean(loss)
- self.global_step = tf.Variable(0, name='global_step', trainable=False)
- optimizer = tf.train.AdamOptimizer(hp.lr)
- self.train_op = optimizer.minimize(self.mean_loss, global_step=self.global_step)
-
-if __name__ == '__main__':
- # Data loading
- X_train, Y_train = load_data(mode="train")
- x_val, y_val = load_data(mode="val")
-
- # Graph loading
- g = Graph(); print("Training Graph loaded")
-
- # Session
- sv = tf.train.Supervisor(logdir=hp.logdir, save_model_secs=0)
- with sv.managed_session() as sess:
- for epoch in range(hp.num_epochs):
- # shuffle
- ids = np.arange(len(X_train))
- np.random.shuffle(ids)
- X_train, Y_train = X_train[ids], Y_train[ids]
-
- # batch train
- #for i in tqdm.tqdm(range(0, len(X_train), hp.batch_size), total=len(X_train) // hp.batch_size):
- for i in range(0, len(X_train)):
- x_train = X_train[i: i + hp.batch_size]
- y_train = Y_train[i: i + hp.batch_size]
- _, loss = sess.run([g.train_op, g.mean_loss], {g.x: x_train, g.y: y_train})
-
- # Write checkpoint files at every epoch
- gs = sess.run(g.global_step)
- sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs))
-
- # Evaluation
- ## Autoregressive inference
- preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)
- for j in range(hp.maxlen):
- _preds = sess.run(g.preds, {g.x: x_val, g.y: preds})
- preds[:, j] = _preds[:, j]
-
- ## Parsing & Calculation
- total, errors = 0, 0
- bugs = 0
- for xx, yy, pred in zip(x_val, y_val, preds): # sample-wise
- inputs = "".join(idx2g[g] for g in xx).split("E")[0]
- expected = " ".join(idx2p[p] for p in yy).split("")[0].strip()
- got = " ".join(idx2p[p] for p in pred).split("")[0].strip()
-
- print("* Input : {}".format(inputs))
- print("* Expected: {}".format(expected))
- print("* Got : {}".format(got))
-
- error = distance.levenshtein(expected.split(), got.split())
- errors += error
- total += len(expected.split())
-
- cer = errors / float(total)
- print("epoch: %02d, training loss: %02f, CER: %02f\n" % (epoch+1, loss, cer))
diff --git a/setup.py b/setup.py
index 6bf7c2f..c765f59 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@
setup(
name = 'g2p_en',
packages = ['g2p_en'], # this must be the same as the name above
- version = '1.0.0',
+ version = '2.0.0',
description = 'A Simple Python Module for English Grapheme To Phoneme Conversion',
long_description=long_description,
author = 'Kyubyong Park & Jongseok Kim',
@@ -23,11 +23,9 @@
classifiers = [],
install_requires = [
'numpy>=1.13.1',
- 'tensorflow >= 1.3.0',
'nltk>=3.2.4',
'inflect>=0.3.1',
'distance>=0.1.3',
- 'future>=0.16.0'
],
license='Apache Software License',
include_package_data=True