Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions concated.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"English":"India and Japan prime ministers meet in Tokyo India's new prime minister, Narendra Modi\\ is meeting his Japanese counterpart, Shinzo Abe, in Tokyo to discuss economic and security ties, on his first major foreign visit since winning May's election.\", Mr Modi is on a five-day trip to Japan to strengthen economic ties with the third largest economy in the world. High on the agenda are plans for greater nuclear co-operation. India is also reportedly hoping for a deal on defence collaboration between the two nations. Karratha police arrest 20-year-old after \"high speed motorcycle chase\" A motorcycle has been seized after it was ridden at 125km/h in a 70km/h zone and through bushland to escape police in the Pilbara. Traffic police on patrol in Karratha this morning tried to pull over a blue motorcycle when they spotted it reaching 125km/h as it pulled out of a service station on Bathgate Road. Police say the rider then failed to stop and continued on to Burgess Road before turning into bushland, causing the officers to lose sight of it. The motorcycle and a person matching the description of the rider was then spotted at a house on Walcott Way in Bulgarra.","German":"Die Premierminister Indiens und Japans trafen sich in Tokio. Indiens neuer Premierminister Narendra Modi\\ trifft bei seinem ersten wichtigen Auslandsbesuch seit seinem Wahlsieg im Mai seinen japanischen Amtskollegen Shinzo Abe in Toko, um wirtschaftliche und sicherheitspolitische Beziehungen zu besprechen. Herr Modi befindet sich auf einer fünftägigen Reise nach Japan, um die wirtschaftlichen Beziehungen mit der drittgrößten Wirtschaftsnation der Welt zu festigen. Pläne für eine stärkere kerntechnische Zusammenarbeit stehen ganz oben auf der Tagesordnung. Berichten zufolge hofft Indien darüber hinaus auf einen Vertrag zur Verteidigungszusammenarbeit zwischen den beiden Nationen. 
Polizei von Karratha verhaftet 20-Jährigen \"nach schneller Motorradjagd\" Ein Motorrad wurde beschlagnahmt, nachdem der Fahrer es mit 125 km/h in einer 70 km/h-Zone und durch Buschland gefahren hatte, um der Polizei in Bilbara zu entkommen. Verkehrspolizisten in Karratha versuchten heute morgen, ein blaues Motorrad zu stoppen, nachdem sie es dabei beobachtet hatten, wie es mit 125 km/h eine Tankstelle auf der Bathdate Road verließ. Die Polizei berichtet, dass der Fahrer die Haltesignale dann ignorierte und weiter auf der Burgess Road fuhr, bevor er in das Buschland abbog, wo die Beamten es aus den Augen verloren. Das Motorrad sowie eine Person, die der Beschreibung des Fahrers entsprach wurden später bei einem Haus im Walcott Way in Bulgarra gesehen."}
85 changes: 38 additions & 47 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,50 +1,41 @@
from typing import List

def path_to_file_list(path: str) -> List[str]:
    """Read a text file and return its lines.

    Args:
        path: Location of the text file to read.

    Returns:
        The lines of the file in order, with line terminators removed.

    Raises:
        OSError: If the file cannot be opened for reading.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Open read-only: the previous 'w' mode truncated the very file this
    # function is supposed to read. UTF-8 is requested explicitly so that
    # non-ASCII text (e.g. German umlauts) does not depend on the platform
    # default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read().splitlines()
import json
import sys
from pathlib import Path

def paragraphs_from_file(path):
    """Split a UTF-8 text file into paragraphs.

    A paragraph is a run of consecutive lines that contain non-whitespace
    characters; lines that are empty or whitespace-only separate paragraphs.
    The lines of one paragraph are right-stripped, joined with single
    spaces, and the result is stripped at both ends.

    Args:
        path: Object exposing ``read_text(encoding=...)`` (the code calls
            that method on it), e.g. a ``pathlib.Path``.

    Returns:
        List of non-empty paragraph strings in file order.
    """
    collected = []
    pending = []
    # The extra trailing blank entry acts as a sentinel so the last
    # paragraph is flushed by the same branch as every other one.
    for raw in path.read_text(encoding="utf-8").splitlines() + [""]:
        if raw.strip():
            pending.append(raw.rstrip())
            continue
        if pending:
            merged = " ".join(pending).strip()
            if merged:
                collected.append(merged)
            pending = []
    return collected

def main():
    """Print paired English/German paragraphs as JSON, one object per line.

    Reads ``english.txt`` and ``german.txt`` from the current directory,
    splits each into paragraphs with ``paragraphs_from_file``, pairs them
    by position and prints each pair as a compact JSON object with the
    keys ``English`` and ``German``. Surplus paragraphs in the longer file
    are ignored. If either file is missing, a message is written to
    stderr and the process exits with status 1.
    """
    cwd = Path(".")
    sources = (cwd / "english.txt", cwd / "german.txt")

    if not all(src.exists() for src in sources):
        print("Required files english.txt and german.txt not found in current directory.", file=sys.stderr)
        sys.exit(1)

    english_pars, german_pars = (paragraphs_from_file(src) for src in sources)

    # zip stops at the shorter list, so only complete pairs are emitted.
    for eng, ger in zip(english_pars, german_pars):
        record = {"English": eng, "German": ger}
        print(json.dumps(record, ensure_ascii=False, separators=(",", ":")))

def train_file_list_to_json(english_file_list: List[str], german_file_list: List[str]) -> List[str]:
    """Pair English and German text entries and encode each pair as JSON.

    Despite the ``file`` in the parameter names, each list item is treated
    as a string of text, not as a path.

    Args:
        english_file_list: English text entries.
        german_file_list: German text entries, aligned by position with
            ``english_file_list``.

    Returns:
        One compact JSON object string per pair, of the form
        ``{"English":"...","German":"..."}``. If the lists differ in
        length, the surplus entries of the longer list are ignored.
    """
    # json.dumps handles all escaping (quotes, backslashes, control
    # characters). The previous hand-rolled version had an always-true
    # condition, a no-op backslash replacement, overwrote the English text
    # with the German text, and assembled the template pieces in the wrong
    # order, so it never produced valid JSON.
    return [
        json.dumps(
            {"English": english_text, "German": german_text},
            ensure_ascii=False,  # keep umlauts etc. readable in the output
            separators=(",", ":"),
        )
        for english_text, german_text in zip(english_file_list, german_file_list)
    ]


def write_file_list(file_list: List[str], path: str) -> None:
    """Write a list of strings to a file, one string per line.

    Args:
        file_list: Strings to write; each is followed by a newline.
        path: Destination file. It is created, or overwritten if it
            already exists.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # Open for writing: the previous 'r' mode made every write fail, and
    # the loop emitted only '\n' without the string itself.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f"{entry}\n" for entry in file_list)

if __name__ == "__main__":
    # Script entry point: build concated.json from the two line-aligned
    # text files, then run main().
    path = './'
    german_path = './german.txt'
    english_path = './english.txt'

    # Both inputs are read with path_to_file_list. Previously the German
    # path was passed to train_file_list_to_json, and the two line lists
    # were passed to path_to_file_list, i.e. the two calls were swapped.
    english_file_list = path_to_file_list(english_path)
    german_file_list = path_to_file_list(german_path)

    processed_file_list = train_file_list_to_json(english_file_list, german_file_list)

    write_file_list(processed_file_list, path+'concated.json')
    main()