From 2cd8bbbb88a61577e48a835be79493ce2420c723 Mon Sep 17 00:00:00 2001 From: Michael Kamerath Date: Thu, 27 Feb 2025 16:41:55 -0700 Subject: [PATCH] Adding ability to pass encoding into various plp structures and functions --- src/alphabet.py | 5 +++-- src/plp.py | 5 +++-- src/utils.py | 4 ++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/alphabet.py b/src/alphabet.py index e00e682..37bc90f 100644 --- a/src/alphabet.py +++ b/src/alphabet.py @@ -8,13 +8,14 @@ def __init__(self, ipa_file='../data/ipa.txt', segs=None, add_segs=False, - nas_vowels=False): + nas_vowels=False, + file_encoding=None): self.segments = set() self.nas_vowels = nas_vowels dir_path = os.path.dirname(os.path.realpath(__file__)) self.seg_to_feats = dict() - with open(f'{dir_path}/{ipa_file}', 'r') as f: + with open(f'{dir_path}/{ipa_file}', 'r', encoding=file_encoding) as f: for i, line in enumerate(f): line = line.strip().split('\t') seg, feats = line[0], line[1:] diff --git a/src/plp.py b/src/plp.py index 1d48aaa..c991c59 100644 --- a/src/plp.py +++ b/src/plp.py @@ -19,11 +19,12 @@ def __init__(self, add_segs=False, n_grams_lens=[1, 2, 3], skip_gen_A=False, - verbose=True): + verbose=True, + file_encoding=None): self.threshold = threshold self.vocab = set() self.verbose = verbose - self.alphabet = Alphabet(ipa_file=ipa_file, nas_vowels=nas_vowels, add_segs=add_segs) + self.alphabet = Alphabet(ipa_file=ipa_file, nas_vowels=nas_vowels, add_segs=add_segs, file_encoding=file_encoding) self.grammar = PLPgrammar() self.n_gram_lens = n_grams_lens self.n_grams = dict((k, defaultdict(int)) for k in self.n_gram_lens) diff --git a/src/utils.py b/src/utils.py index b88329c..13ad885 100644 --- a/src/utils.py +++ b/src/utils.py @@ -12,10 +12,10 @@ import numpy as np from scipy.spatial.distance import hamming -def load(fname, sep='\t', skip_header=False, alphabet=False): +def load(fname, sep='\t', skip_header=False, alphabet=False, file_encoding=None): pairs = list() freqs = list() - with open(fname, 'r') as f: + with open(fname, 'r', encoding=file_encoding) as f: if skip_header: next(f) for line in f: