6 changes: 0 additions & 6 deletions .gitignore

This file was deleted.

24 changes: 24 additions & 0 deletions README.md
@@ -1 +1,25 @@
# Bailey

Bailey is a fully customizable, PEG-based data parsing tool, primarily designed to parse dictionary data from plain-text files.
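For illustration, an input file is expected to contain roughly one entry per line, in this shape (a hypothetical sample based on the PEG grammar in `bailey/bailey.py`; the headwords, parts of speech, and Malayalam glosses are placeholders):

```
HOUSE, n. വീട്; ഭവനം.
DOG, n. നായ.
```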

## How to use Bailey

At the moment there are no executables available for Bailey; we might release a CLI later. You can still use the utility by cloning this repo and running it on your local system.

### Step 1: Install dependencies
> $ python setup.py install

### Step 2: Run Bailey
> $ python bailey/bailey.py path/to/dictionary_plain_text_file

Running the above generates two outputs: 1) a dictionary representation of every valid entry (an entry that matches the expression in the grammar) in the dictionary_plain_text_file, and 2) an `error.log` file containing every invalid entry in the file along with its line number.
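If you would rather get the parsed entries programmatically than on stdout, here is a minimal sketch (assuming you run Python from the repo root so that `bailey/bailey.py` is importable; the sample line is the hypothetical entry shown above):

```python
from bailey.bailey import parseData

line = "HOUSE, n. വീട്; ഭവനം.\n"
# If the line matches the grammar, this returns a (possibly nested) structure
# containing the entry's field dict (lx, ps, sn, ..., dt); otherwise
# parsimonious raises a ParseError.
print(parseData(line))
```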

## Output
Bailey currently outputs data in Multi-Dictionary Formatter (MDF) version 4.0 format. For more info, please see the [description](docs/lexical_entry.md) of entries in the MDF.
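As a rough illustration, the record produced for the hypothetical entry above might look like the following (the header line and the field markers come from `bailey/bailey.py`; the values and the date are placeholders):

```
\_sh v3.0 231 MDF 4.0

\lx	HOUSE
\ps	n.
\sn	വീട്
\se	ഭവനം
\dt	2024-01-01
```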

## Testing
To test the script, simply run

> $ python bailey/bailey.py

This should output a sample dictionary format.
68 changes: 57 additions & 11 deletions bailey/bailey.py
@@ -94,17 +94,20 @@
grammar = Grammar(
    r"""
    expr = (entry / emptyline )*
    entry = hash headword comma pos ws senses subentry emptyline
    entry = hash headword comma pos ws senses subentry period emptyline
    # entry = hash headphrase comma pos ws senses subentry period emptyline
    hash = (~"#")*
    # headphrase = headword (ws headword)*
Author:

I've added this to capture headwords with multiple words; most of the exceptions are due to this.

However, I couldn't get it to work.
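One way to narrow this down is to exercise the candidate rule in isolation before wiring it into `entry`; a minimal parsimonious sketch (the rule bodies and the sample string are illustrative, not the ones in this PR):

```python
from parsimonious.grammar import Grammar

probe = Grammar(
    r"""
    headphrase = headword (ws headword)*
    headword   = ~"[A-Z0-9-]+"i
    ws         = ~"\s+"
    """
)

# Prints the parse tree if the rule matches the whole string;
# raises parsimonious.exceptions.ParseError otherwise.
print(probe.parse("BIG HEAD-WORD"))
```

Unlike the `headword` rule above, the character class here contains no literal space, so it is the `(ws headword)*` repetition that captures the additional words.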

    headword = ~"[A-Z 0-9 -]*"i
    pos = (ws ~"[a-z]+\.")+
    pos = (ws ~"[a-z]+[\., ]")+
    subentry = (semicolon ws senses)*
    senses = (sense comma)* sense
    sense = (ml ws ml)* ml
    ml = ~"[\u0d00-\u0d7f]*"
    semicolon = ~";"
    semicolon = ~"[;:]"
Author:

In many places the keyboardists made typos, putting a `:` in place of a `;`. Since we are not preserving the data, I thought of bypassing them.

    comma = ~","
    ws = ~"\s*"
    period = ~"."
    emptyline = ws+
    """
)
@@ -126,11 +129,11 @@ def visit_entry(self, node, visited_children):
        if visited_children[0].lstrip().startswith("#"):
            return
        output["lx"] = visited_children[1]
        output["tx"] = datetime.date.today().isoformat()
        output["ps"] = visited_children[3]
        output["senses"] = visited_children[5]
        output["sn"] = visited_children[5]
        if visited_children[6]:
            output["se"] = visited_children[6]
        output["dt"] = datetime.date.today().isoformat()
        return output

    def visit_hash(self, node, visited_children):
@@ -172,17 +175,60 @@ def generic_visit(self, node, visited_children):
""" The generic visit method. """
return visited_children or node

def parseData(data):
    tree = grammar.parse(data)
    dv = DictVisitor()
    output = dv.visit(tree)
    return (output)

def printOutput(item):
    if type(item) == list:
        for elem in item:
            printOutput(elem)
    elif type(item) == dict:
        for k, v in item.items():
            if k.strip() == "lx":
                o.write("\n")
            o.write("\n\\{}\t".format(k))
            printOutput(v)
    elif type(item) == str:
        o.write(item.strip())
    else:
        # pass
        print("\\warn\tError! {}".format(item))

def main():
    global data
    if len(sys.argv) > 1:
        filelocation = sys.argv[1]
        f = open(filelocation, mode="r", encoding="utf-8")
        data = f.read()

    tree = grammar.parse(data)
    dv = DictVisitor()
    output = dv.visit(tree)
    print(output)
        dataset = f.readlines()
        i = 1
        log = ""
        output = []
        for data in dataset:
            try:
                output.append(parseData(data))
                i += 1
            except Exception as e:
                log += ("{}\t{}".format(i, data))
                # print("Error on line {line_number}\t{missing_data}\t{err}".format(missing_data=data, err=e, line_number=i))
                i += 1
                pass
    parseData(data)

    global o
    o = open("./data/output/dict.txt", mode="w", encoding="utf-8")
    o.write("\\_sh v3.0 231 MDF 4.0\n")
    # # o.write(str(output))
    # pickle.dump(output, o)
    printOutput(output)
    o.close()

    if(len(log)):
        errorLog = open("./data/output/error.log", mode="w", encoding="utf-8")
        errorLog.write(log)
        errorLog.close()

if __name__ == "__main__":
    main()