"""Pair English and German sentence files into one JSON object per line."""

import json
from typing import List


def path_to_file_list(path: str) -> List[str]:
    """Read a UTF-8 text file and return its lines.

    Args:
        path: Location of the file to read.

    Returns:
        The lines of the file in order, with the trailing newline removed
        from each one.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]


def escape_for_json(text: str) -> str:
    """Escape *text* so it can be placed between the quotes of a JSON string.

    Args:
        text: Raw text to embed in a JSON string literal.

    Returns:
        The escaped text, without surrounding quotes.
    """
    # json.dumps escapes backslashes, double quotes and control characters
    # (tab, carriage return, ...); hand-rolled replace() calls miss the
    # control characters and would emit invalid JSON for such input.
    # ensure_ascii=False keeps non-ASCII letters (e.g. umlauts) readable.
    escaped = json.dumps(text, ensure_ascii=False)[1:-1]
    # "\/" is an optional JSON escape; it is kept so that forward slashes
    # are written the same way as before.
    return escaped.replace("/", "\\/")


def train_file_list_to_json(english_file_list: List[str],
                            german_file_list: List[str]) -> List[str]:
    """Convert two parallel lists of sentences into a list of JSON strings.

    Sentences are paired by position. Pairing stops at the end of the
    shorter list, so surplus entries of the longer list are ignored.

    Args:
        english_file_list: English sentences.
        german_file_list: German sentences, aligned with the English ones.

    Returns:
        One string per pair, of the form
        ``{"English":"...","German":"..."}``.
    """
    # Doubled braces are literal braces for str.format.
    template = '{{"English":"{}","German":"{}"}}'
    return [
        template.format(escape_for_json(english_line),
                        escape_for_json(german_line))
        for english_line, german_line in zip(english_file_list,
                                             german_file_list)
    ]


def write_file_list(file_list: List[str], path: str) -> None:
    """Write each string of *file_list* to *path* on its own line.

    The file is created or truncated and written as UTF-8.

    Args:
        file_list: Strings to write, one per output line.
        path: Destination file.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in file_list)


def main() -> None:
    """Read the two sentence files and write the combined JSON lines."""
    german_path = "./german.txt"
    english_path = "./english.txt"
    output_path = "./concated.json"

    english_file_list = path_to_file_list(english_path)
    german_file_list = path_to_file_list(german_path)

    processed_file_list = train_file_list_to_json(english_file_list,
                                                  german_file_list)

    write_file_list(processed_file_list, output_path)


if __name__ == "__main__":
    main()