From 09a16433126c9e20d2f46f2cb50afcc1e3ed4e96 Mon Sep 17 00:00:00 2001
From: weiliang <weiliang.chong@day-care.cn>
Date: Wed, 4 Jan 2023 17:08:11 +0800
Subject: [PATCH] add load_custom_phonemes()

---
 g2p_en/g2p.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/g2p_en/g2p.py b/g2p_en/g2p.py
index 8b37659..c4092af 100644
--- a/g2p_en/g2p.py
+++ b/g2p_en/g2p.py
@@ -71,6 +71,25 @@ def __init__(self):
         self.load_variables()
         self.homograph2features = construct_homograph_dictionary()
 
+    def load_custom_phonemes(self, file_path):
+        '''Load custom pronunciations from a UTF-8 text file into self.cmu.
+
+        The file uses the format of nltk's cmudict, one entry per line
+        ("WORD INDEX PHONEME ..."), for example:
+            A 1 AH0
+            A42128 1 EY1 F AO1 R T UW1 W AH1 N T UW1 EY1 T
+        Blank lines, '#' comment lines and lines holding only a word are
+        skipped. Words are lowercased; the last entry for a word wins.
+        '''
+        with open(file_path, encoding='utf-8') as fin:
+            for line in fin:
+                # split on any whitespace so tabs and repeated spaces work
+                fields = line.split()
+                if len(fields) < 2 or fields[0].startswith('#'):
+                    continue
+                # just replace if word is in cmudict; fields[1] is the index
+                self.cmu[fields[0].lower()] = [fields[2:]]
+
     def load_variables(self):
         self.variables = np.load(os.path.join(dirname,'checkpoint20.npz'))
         self.enc_emb = self.variables["enc_emb"]  # (29, 64). (len(graphemes), emb)