6 changes: 0 additions & 6 deletions .gitignore

This file was deleted.

24 changes: 24 additions & 0 deletions README.md
@@ -1 +1,25 @@
# Bailey

Bailey is a fully customizable, PEG-based data parsing tool, primarily designed to parse dictionary data from plain-text files.
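For illustration, an input file is expected to contain roughly one entry per line, in this shape (a hypothetical sample based on the PEG grammar in `bailey/bailey.py`; the headwords, parts of speech, and Malayalam glosses are placeholders):

```
HOUSE, n. വീട്; ഭവനം.
DOG, n. നായ.
```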

## How to use Bailey

At the moment there are no executables available for Bailey; we might release a CLI later. You can still use the utility by cloning this repo and running it on your local system.

### Step 1: Install dependencies
> $ python setup.py install

### Step 2: Run Bailey
> $ python bailey/bailey.py path/to/dictionary_plain_text_file

Running the above generates two outputs: 1) a dictionary representation of every valid entry (an entry that matches the expression in the grammar) in the dictionary_plain_text_file, and 2) an `error.log` file containing every invalid entry in the file along with its line number.
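If you would rather get the parsed entries programmatically than on stdout, here is a minimal sketch (assuming you run Python from the repo root so that `bailey/bailey.py` is importable; the sample line is the hypothetical entry shown above):

```python
from bailey.bailey import parseData

line = "HOUSE, n. വീട്; ഭവനം.\n"
# If the line matches the grammar, this returns a (possibly nested) structure
# containing the entry's field dict (lx, ps, sn, ..., dt); otherwise
# parsimonious raises a ParseError.
print(parseData(line))
```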

## Output
Bailey currently outputs data in Multi-Dictionary Formatter (MDF) version 4.0 format. For more info, please see the [description](docs/lexical_entry.md) of entries in the MDF.
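As a rough illustration, the record produced for the hypothetical entry above might look like the following (the header line and the field markers come from `bailey/bailey.py`; the values and the date are placeholders):

```
\_sh v3.0 231 MDF 4.0

\lx	HOUSE
\ps	n.
\sn	വീട്
\se	ഭവനം
\dt	2024-01-01
```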

## Testing
To test the script, simply run

> $ python bailey/bailey.py

This should output a sample dictionary format.
68 changes: 57 additions & 11 deletions bailey/bailey.py
@@ -94,17 +94,20 @@
grammar = Grammar(
    r"""
    expr = (entry / emptyline )*
    entry = hash headword comma pos ws senses subentry emptyline
    entry = hash headword comma pos ws senses subentry period emptyline
    # entry = hash headphrase comma pos ws senses subentry period emptyline
    hash = (~"#")*
    # headphrase = headword (ws headword)*
Author:

I've added this to capture headwords with multiple words; most of the exceptions are due to this.

However, I couldn't get it to work.
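One way to narrow this down is to exercise the candidate rule in isolation before wiring it into `entry`; a minimal parsimonious sketch (the rule bodies and the sample string are illustrative, not the ones in this PR):

```python
from parsimonious.grammar import Grammar

probe = Grammar(
    r"""
    headphrase = headword (ws headword)*
    headword   = ~"[A-Z0-9-]+"i
    ws         = ~"\s+"
    """
)

# Prints the parse tree if the rule matches the whole string;
# raises parsimonious.exceptions.ParseError otherwise.
print(probe.parse("BIG HEAD-WORD"))
```

Unlike the `headword` rule above, the character class here contains no literal space, so it is the `(ws headword)*` repetition that captures the additional words.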

    headword = ~"[A-Z 0-9 -]*"i
    pos = (ws ~"[a-z]+\.")+
    pos = (ws ~"[a-z]+[\., ]")+
    subentry = (semicolon ws senses)*
    senses = (sense comma)* sense
    sense = (ml ws ml)* ml
    ml = ~"[\u0d00-\u0d7f]*"
    semicolon = ~";"
    semicolon = ~"[;:]"
Author:

In many places the keyboardists made typos, putting a `:` in place of a `;`. Since we are not preserving the data, I thought of bypassing them.

    comma = ~","
    ws = ~"\s*"
    period = ~"."
    emptyline = ws+
    """
)
@@ -126,11 +129,11 @@ def visit_entry(self, node, visited_children):
        if visited_children[0].lstrip().startswith("#"):
            return
        output["lx"] = visited_children[1]
        output["tx"] = datetime.date.today().isoformat()
        output["ps"] = visited_children[3]
        output["senses"] = visited_children[5]
        output["sn"] = visited_children[5]
        if visited_children[6]:
            output["se"] = visited_children[6]
        output["dt"] = datetime.date.today().isoformat()
        return output

    def visit_hash(self, node, visited_children):
@@ -172,17 +175,60 @@ def generic_visit(self, node, visited_children):
""" The generic visit method. """
return visited_children or node

def parseData(data):
    tree = grammar.parse(data)
    dv = DictVisitor()
    output = dv.visit(tree)
    return (output)

def printOutput(item):
    if type(item) == list:
        for elem in item:
            printOutput(elem)
    elif type(item) == dict:
        for k, v in item.items():
            if k.strip() == "lx":
                o.write("\n")
            o.write("\n\\{}\t".format(k))
            printOutput(v)
    elif type(item) == str:
        o.write(item.strip())
    else:
        # pass
        print("\\warn\tError! {}".format(item))

def main():
    global data
    if len(sys.argv) > 1:
        filelocation = sys.argv[1]
        f = open(filelocation, mode="r", encoding="utf-8")
        data = f.read()

    tree = grammar.parse(data)
    dv = DictVisitor()
    output = dv.visit(tree)
    print(output)
        dataset = f.readlines()
        i = 1
        log = ""
        output = []
        for data in dataset:
            try:
                output.append(parseData(data))
                i += 1
            except Exception as e:
                log += ("{}\t{}".format(i, data))
                # print("Error on line {line_number}\t{missing_data}\t{err}".format(missing_data=data, err=e, line_number=i))
                i += 1
                pass
    parseData(data)

    global o
    o = open("./data/output/dict.txt", mode="w", encoding="utf-8")
    o.write("\\_sh v3.0 231 MDF 4.0\n")
    # # o.write(str(output))
    # pickle.dump(output, o)
    printOutput(output)
    o.close()

    if(len(log)):
        errorLog = open("./data/output/error.log", mode="w", encoding="utf-8")
        errorLog.write(log)
        errorLog.close()

if __name__ == "__main__":
    main()