From 2cd8bbbb88a61577e48a835be79493ce2420c723 Mon Sep 17 00:00:00 2001
From: Michael Kamerath <mkamerath@gmail.com>
Date: Thu, 27 Feb 2025 16:41:55 -0700
Subject: [PATCH] Add optional file_encoding parameter to Alphabet, the plp.py
 constructor, and utils.load

---
 src/alphabet.py | 5 +++--
 src/plp.py      | 5 +++--
 src/utils.py    | 4 ++--
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/alphabet.py b/src/alphabet.py
index e00e682..37bc90f 100644
--- a/src/alphabet.py
+++ b/src/alphabet.py
@@ -8,13 +8,14 @@ def __init__(self,
                  ipa_file='../data/ipa.txt',
                  segs=None,
                  add_segs=False,
-                 nas_vowels=False):
+                 nas_vowels=False,
+                 file_encoding=None):
         self.segments = set()
         self.nas_vowels = nas_vowels
         dir_path = os.path.dirname(os.path.realpath(__file__))
 
         self.seg_to_feats = dict()
-        with open(f'{dir_path}/{ipa_file}', 'r') as f:
+        with open(f'{dir_path}/{ipa_file}', 'r', encoding=file_encoding) as f:
             for i, line in enumerate(f):
                 line = line.strip().split('\t')
                 seg, feats = line[0], line[1:]
diff --git a/src/plp.py b/src/plp.py
index 1d48aaa..c991c59 100644
--- a/src/plp.py
+++ b/src/plp.py
@@ -19,11 +19,12 @@ def __init__(self,
                  add_segs=False,
                  n_grams_lens=[1, 2, 3], 
                  skip_gen_A=False,
-                 verbose=True):
+                 verbose=True,
+                 file_encoding=None):
         self.threshold = threshold
         self.vocab = set()
         self.verbose = verbose
-        self.alphabet = Alphabet(ipa_file=ipa_file, nas_vowels=nas_vowels, add_segs=add_segs)
+        self.alphabet = Alphabet(ipa_file=ipa_file, nas_vowels=nas_vowels, add_segs=add_segs, file_encoding=file_encoding)
         self.grammar = PLPgrammar()
         self.n_gram_lens = n_grams_lens
         self.n_grams = dict((k, defaultdict(int)) for k in self.n_gram_lens)
diff --git a/src/utils.py b/src/utils.py
index b88329c..13ad885 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -12,10 +12,10 @@
 import numpy as np
 from scipy.spatial.distance import hamming
 
-def load(fname, sep='\t', skip_header=False, alphabet=False):
+def load(fname, sep='\t', skip_header=False, alphabet=False, file_encoding=None):
     pairs = list()
     freqs = list()
-    with open(fname, 'r') as f:
+    with open(fname, 'r', encoding=file_encoding) as f:
         if skip_header:
             next(f)
         for line in f: