From 4101716b9b83280bc7d3824d9634fd2b556cdeb9 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 16:13:31 +0200 Subject: [PATCH 1/6] Change module name, refactor --- CQL.py | 2 +- src/{CQLEngine => corpus_query_language}/__init__.py | 0 src/{CQLEngine => corpus_query_language}/engine.py | 2 +- src/{CQLEngine => corpus_query_language}/functions.py | 6 +++--- src/{CQLEngine => corpus_query_language}/lexer.py | 0 src/{CQLEngine => corpus_query_language}/parser.py | 2 +- tests/tests.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) rename src/{CQLEngine => corpus_query_language}/__init__.py (100%) rename src/{CQLEngine => corpus_query_language}/engine.py (98%) rename src/{CQLEngine => corpus_query_language}/functions.py (95%) rename src/{CQLEngine => corpus_query_language}/lexer.py (100%) rename src/{CQLEngine => corpus_query_language}/parser.py (98%) diff --git a/CQL.py b/CQL.py index adb1d24..66d45d4 100644 --- a/CQL.py +++ b/CQL.py @@ -6,7 +6,7 @@ # - any regex in the annotation value [lemma='reye?s?'] # - alternatives: [lemma='rey']|[lemma='príncipe'][]{,5}[lemma='santo'] import sys -import CQLEngine.functions as functions +import corpus_query_language.functions as functions # Takes a list of dicts with the annotations as input. 
Returns: # - a list of spans (search_all function) diff --git a/src/CQLEngine/__init__.py b/src/corpus_query_language/__init__.py similarity index 100% rename from src/CQLEngine/__init__.py rename to src/corpus_query_language/__init__.py diff --git a/src/CQLEngine/engine.py b/src/corpus_query_language/engine.py similarity index 98% rename from src/CQLEngine/engine.py rename to src/corpus_query_language/engine.py index 89574c1..309dc81 100644 --- a/src/CQLEngine/engine.py +++ b/src/corpus_query_language/engine.py @@ -1,4 +1,4 @@ -import CQLEngine.functions as functions +import corpus_query_language.functions as functions def parse_corpus(ast, corpus, mode, debug): diff --git a/src/CQLEngine/functions.py b/src/corpus_query_language/functions.py similarity index 95% rename from src/CQLEngine/functions.py rename to src/corpus_query_language/functions.py index 4c01ad6..1bcfdb5 100644 --- a/src/CQLEngine/functions.py +++ b/src/corpus_query_language/functions.py @@ -1,8 +1,8 @@ import re import json -import CQLEngine.parser as parser -import CQLEngine.lexer as lexer -import CQLEngine.engine as engine +import corpus_query_language.parser as parser +import corpus_query_language.lexer as lexer +import corpus_query_language.engine as engine class CQLEngine(): diff --git a/src/CQLEngine/lexer.py b/src/corpus_query_language/lexer.py similarity index 100% rename from src/CQLEngine/lexer.py rename to src/corpus_query_language/lexer.py diff --git a/src/CQLEngine/parser.py b/src/corpus_query_language/parser.py similarity index 98% rename from src/CQLEngine/parser.py rename to src/corpus_query_language/parser.py index 4c16277..c698699 100644 --- a/src/CQLEngine/parser.py +++ b/src/corpus_query_language/parser.py @@ -1,5 +1,5 @@ import ply.yacc as yacc -import CQLEngine.lexer as lexer +import corpus_query_language.lexer as lexer diff --git a/tests/tests.py b/tests/tests.py index 6899377..f63ac45 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -1,7 +1,7 @@ import ast import sys 
sys.path.append('src/') -import CQLEngine.functions as functions +import corpus_query_language.functions as functions import unittest def import_test_queries(path): From 4b0b05080b1dac48a93db2caddbd8d02c4229529 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 16:13:55 +0200 Subject: [PATCH 2/6] Increase version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 21449a0..fcb222b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "corpus-query-language" -version = "0.0.1" +version = "0.0.2" authors = [ { name="Matthias Gille Levenson", email="matthias.gille-levenson@ens-lyon.fr" }, ] From 25f428eb18b3fa683424bf99a1b5d8c223dc40b7 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 16:30:12 +0200 Subject: [PATCH 3/6] Reorganize repo for better imports --- CQL.py | 8 ++--- src/corpus_query_language/core/__init__.py | 0 src/corpus_query_language/core/core.py | 33 +++++++++++++++++ src/corpus_query_language/engine/__init__.py | 0 .../{ => engine}/engine.py | 2 +- .../functions/__init__.py | 0 .../{ => functions}/functions.py | 35 ++----------------- .../language/__init__.py | 0 .../{ => language}/lexer.py | 0 .../{ => language}/parser.py | 2 +- tests/tests.py | 7 ++-- 11 files changed, 45 insertions(+), 42 deletions(-) create mode 100644 src/corpus_query_language/core/__init__.py create mode 100644 src/corpus_query_language/core/core.py create mode 100644 src/corpus_query_language/engine/__init__.py rename src/corpus_query_language/{ => engine}/engine.py (98%) create mode 100644 src/corpus_query_language/functions/__init__.py rename src/corpus_query_language/{ => functions}/functions.py (54%) create mode 100644 src/corpus_query_language/language/__init__.py rename src/corpus_query_language/{ => language}/lexer.py (100%) rename src/corpus_query_language/{ => language}/parser.py (98%) diff --git a/CQL.py b/CQL.py index 
66d45d4..daf56a7 100644 --- a/CQL.py +++ b/CQL.py @@ -1,4 +1,4 @@ -# Python package project: CQL (Corpus Query Language) parser: +# Python package project: CQL (Corpus Query Language) language: # - parsing of any kind of annotation: word, lemma, pos, morph # - combination of annotations: [lemma='rey' & pos='NCMP000'] # - one or zero annotations [lemma='rey']?. @@ -6,18 +6,18 @@ # - any regex in the annotation value [lemma='reye?s?'] # - alternatives: [lemma='rey']|[lemma='príncipe'][]{,5}[lemma='santo'] import sys -import corpus_query_language.functions as functions +import corpus_query_language.core.core as core +import corpus_query_language.functions.functions as functions # Takes a list of dicts with the annotations as input. Returns: # - a list of spans (search_all function) # - a boolean (match function) - def main(): query = sys.argv[1] corpus = functions.import_corpus("tests/test_data/test_corpus.json") - MyEngine = functions.CQLEngine() + MyEngine = core.CQLEngine() MyEngine.findall(corpus, query) MyEngine.match(corpus, query) diff --git a/src/corpus_query_language/core/__init__.py b/src/corpus_query_language/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/corpus_query_language/core/core.py b/src/corpus_query_language/core/core.py new file mode 100644 index 0000000..9a2e980 --- /dev/null +++ b/src/corpus_query_language/core/core.py @@ -0,0 +1,33 @@ +import corpus_query_language.functions.functions as functions +import corpus_query_language.engine.engine as engine + +class CQLEngine(): + def findall(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=False) -> list[tuple[int, int]]: + """ + This function checks if a query matches some text, and returns the start and end span. + :param query: a CQL query + :param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word) + :return: a list of tuples with the start and end position. 
+ """ + query_ast = functions.build_grammar(debug=debug, query=query) + result = engine.parse_corpus(query_ast, corpus, mode="find", debug=debug) + if verbose: + print(f"\n---\nResults for query {query}:") + print(f"Ast: {query_ast}") + print(f"Spans: {result}") + return result + + + def match(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=False) -> bool: + """ + This function checks whether a query matches some text, and returns True or False + :param query: a CQL query + :param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word) + :return: a boolean + """ + query_ast = functions.build_grammar(debug=debug, query=query) + result = engine.parse_corpus(query_ast, corpus, mode="match", debug=debug) + if verbose: + print(f"\n---\nResults for query {query}:") + print(result) + return result diff --git a/src/corpus_query_language/engine/__init__.py b/src/corpus_query_language/engine/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/corpus_query_language/engine.py b/src/corpus_query_language/engine/engine.py similarity index 98% rename from src/corpus_query_language/engine.py rename to src/corpus_query_language/engine/engine.py index 309dc81..4b8eac7 100644 --- a/src/corpus_query_language/engine.py +++ b/src/corpus_query_language/engine/engine.py @@ -1,4 +1,4 @@ -import corpus_query_language.functions as functions +import corpus_query_language.functions.functions as functions def parse_corpus(ast, corpus, mode, debug): diff --git a/src/corpus_query_language/functions/__init__.py b/src/corpus_query_language/functions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/corpus_query_language/functions.py b/src/corpus_query_language/functions/functions.py similarity index 54% rename from src/corpus_query_language/functions.py rename to src/corpus_query_language/functions/functions.py index 1bcfdb5..217e098 100644 --- a/src/corpus_query_language/functions.py 
+++ b/src/corpus_query_language/functions/functions.py @@ -1,40 +1,9 @@ import re import json -import corpus_query_language.parser as parser -import corpus_query_language.lexer as lexer -import corpus_query_language.engine as engine +import corpus_query_language.language.parser as parser +import corpus_query_language.language.lexer as lexer -class CQLEngine(): - def findall(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=False) -> list[tuple[int, int]]: - """ - This function checks if a query matches some text, and returns the start and end span. - :param query: a CQL query - :param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word) - :return: a list of tuples with the start and end position. - """ - query_ast = build_grammar(debug=debug, query=query) - result = engine.parse_corpus(query_ast, corpus, mode="find", debug=debug) - if verbose: - print(f"\n---\nResults for query {query}:") - print(f"Ast: {query_ast}") - print(f"Spans: {result}") - return result - - - def match(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=False) -> bool: - """ - This function checks whether a query matches some text, and returns True or False - :param query: a CQL query - :param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word) - :return: a boolean - """ - query_ast = build_grammar(debug=debug, query=query) - result = engine.parse_corpus(query_ast, corpus, mode="match", debug=debug) - if verbose: - print(f"\n---\nResults for query {query}:") - print(result) - return result def build_grammar(debug, query): diff --git a/src/corpus_query_language/language/__init__.py b/src/corpus_query_language/language/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/corpus_query_language/lexer.py b/src/corpus_query_language/language/lexer.py similarity index 100% rename from src/corpus_query_language/lexer.py rename to 
src/corpus_query_language/language/lexer.py diff --git a/src/corpus_query_language/parser.py b/src/corpus_query_language/language/parser.py similarity index 98% rename from src/corpus_query_language/parser.py rename to src/corpus_query_language/language/parser.py index c698699..1050179 100644 --- a/src/corpus_query_language/parser.py +++ b/src/corpus_query_language/language/parser.py @@ -1,5 +1,5 @@ import ply.yacc as yacc -import corpus_query_language.lexer as lexer +import corpus_query_language.language.lexer as lexer diff --git a/tests/tests.py b/tests/tests.py index f63ac45..4c08913 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -1,7 +1,8 @@ import ast import sys sys.path.append('src/') -import corpus_query_language.functions as functions +import corpus_query_language.core.core as core +import corpus_query_language.functions.functions as functions import unittest def import_test_queries(path): @@ -30,7 +31,7 @@ class TestQueries(unittest.TestCase): def test_findall_queries(self): self.corpus = functions.import_corpus("tests/test_data/test_corpus.json") self.queries = import_test_queries("tests/queries_findall.txt") - self.MyEngine = functions.CQLEngine() + self.MyEngine = core.CQLEngine() for query, GT in self.queries: GT = ast.literal_eval(GT) with self.subTest(query=query, GT=GT): @@ -39,7 +40,7 @@ def test_findall_queries(self): def test_match_queries(self): self.queries = import_match_queries("tests/queries_match.txt") - self.MyEngine = functions.CQLEngine() + self.MyEngine = core.CQLEngine() for idx, (nodes, query, GT) in enumerate(self.queries): with self.subTest(query=query, GT=GT): GT = True if GT == "True" else False From 9a75b27d6282f9da39fecf770e247022013826da Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 17:22:20 +0200 Subject: [PATCH 4/6] Reorganize repo to improve imports --- CQL.py | 4 ++-- pyproject.toml | 5 ++++- src/corpus_query_language/__init__.py | 5 +++++ src/corpus_query_language/core/core.py | 6 
+++--- src/corpus_query_language/engine/engine.py | 12 ++++++------ .../{functions => utils}/__init__.py | 0 .../{functions/functions.py => utils/utils.py} | 0 tests/tests.py | 11 +++++------ 8 files changed, 25 insertions(+), 18 deletions(-) rename src/corpus_query_language/{functions => utils}/__init__.py (100%) rename src/corpus_query_language/{functions/functions.py => utils/utils.py} (100%) diff --git a/CQL.py b/CQL.py index daf56a7..4332e69 100644 --- a/CQL.py +++ b/CQL.py @@ -7,7 +7,7 @@ # - alternatives: [lemma='rey']|[lemma='príncipe'][]{,5}[lemma='santo'] import sys import corpus_query_language.core.core as core -import corpus_query_language.functions.functions as functions +import corpus_query_language.utils.utils as utils # Takes a list of dicts with the annotations as input. Returns: # - a list of spans (search_all function) @@ -16,7 +16,7 @@ def main(): query = sys.argv[1] - corpus = functions.import_corpus("tests/test_data/test_corpus.json") + corpus = utils.import_corpus("tests/test_data/test_corpus.json") MyEngine = core.CQLEngine() MyEngine.findall(corpus, query) MyEngine.match(corpus, query) diff --git a/pyproject.toml b/pyproject.toml index fcb222b..49baa2b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "corpus-query-language" -version = "0.0.2" +version = "0.0.4" authors = [ { name="Matthias Gille Levenson", email="matthias.gille-levenson@ens-lyon.fr" }, ] @@ -21,6 +21,9 @@ dependencies = [ requires = ["setuptools", "setuptools-scm"] build-backend = "setuptools.build_meta" +[tool.setuptools.packages.find] +where = ["src"] + [project.urls] Homepage = "https://github.com/matgille/CQL" Issues = "https://github.com/matgille/CQL/issues" \ No newline at end of file diff --git a/src/corpus_query_language/__init__.py b/src/corpus_query_language/__init__.py index e69de29..c76bd74 100644 --- a/src/corpus_query_language/__init__.py +++ b/src/corpus_query_language/__init__.py @@ -0,0 +1,5 @@ +import 
corpus_query_language.core.core as core +import corpus_query_language.engine.engine as engine +import corpus_query_language.utils.utils as utils +import corpus_query_language.language as language +__all__ = ["core", "engine", "language", "utils"] \ No newline at end of file diff --git a/src/corpus_query_language/core/core.py b/src/corpus_query_language/core/core.py index 9a2e980..eacf7d7 100644 --- a/src/corpus_query_language/core/core.py +++ b/src/corpus_query_language/core/core.py @@ -1,4 +1,4 @@ -import corpus_query_language.functions.functions as functions +import corpus_query_language.utils.utils as utils import corpus_query_language.engine.engine as engine class CQLEngine(): @@ -9,7 +9,7 @@ def findall(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=F :param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word) :return: a list of tuples with the start and end position. """ - query_ast = functions.build_grammar(debug=debug, query=query) + query_ast = utils.build_grammar(debug=debug, query=query) result = engine.parse_corpus(query_ast, corpus, mode="find", debug=debug) if verbose: print(f"\n---\nResults for query {query}:") @@ -25,7 +25,7 @@ def match(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=Fals :param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word) :return: a boolean """ - query_ast = functions.build_grammar(debug=debug, query=query) + query_ast = utils.build_grammar(debug=debug, query=query) result = engine.parse_corpus(query_ast, corpus, mode="match", debug=debug) if verbose: print(f"\n---\nResults for query {query}:") diff --git a/src/corpus_query_language/engine/engine.py b/src/corpus_query_language/engine/engine.py index 4b8eac7..a722c91 100644 --- a/src/corpus_query_language/engine/engine.py +++ b/src/corpus_query_language/engine/engine.py @@ -1,4 +1,4 @@ -import 
corpus_query_language.functions.functions as functions +import corpus_query_language.utils.utils as utils def parse_corpus(ast, corpus, mode, debug): @@ -65,7 +65,7 @@ def parse_corpus(ast, corpus, mode, debug): print(f"{operator} in list of analysis") print(len(corpus)) print(text_index) - if functions.simple_match(current_query, corpus[text_index]): + if utils.simple_match(current_query, corpus[text_index]): if debug: print("Found you a. Going forward on tree and text.") print(f"First match is {text_index}") @@ -84,7 +84,7 @@ def parse_corpus(ast, corpus, mode, debug): if debug: print(f"{operator} operator") if operator == "or": - if functions.alternative_match(current_query[1:], corpus[text_index]): + if utils.alternative_match(current_query[1:], corpus[text_index]): if debug: print("Found your alternative. Going forward on tree and text.") print(f"First match is {text_index}") @@ -107,7 +107,7 @@ def parse_corpus(ast, corpus, mode, debug): print(f"\t{text_index}: Looking for {ast[tree_index + 1]} in position {text_index}") if len(corpus) == text_index: break - if functions.simple_match(ast[tree_index + 1], corpus[text_index]): + if utils.simple_match(ast[tree_index + 1], corpus[text_index]): submatch = True tree_index += 2 if debug: @@ -126,7 +126,7 @@ def parse_corpus(ast, corpus, mode, debug): elif operator == "and": all_matches = [] for item in current_query[1:]: - if functions.simple_match(item, corpus[text_index]): + if utils.simple_match(item, corpus[text_index]): all_matches.append(True) else: all_matches.append(False) @@ -143,7 +143,7 @@ def parse_corpus(ast, corpus, mode, debug): # Pour l'opérateur "0 ou 1", on vérifie que le token matche. # S'il ne matche pas, on passe à la requête suivante sans # incrémenter le texte - if functions.alternative_match(current_query[1:], corpus[text_index]): + if utils.alternative_match(current_query[1:], corpus[text_index]): if debug: print("Found your alternative. 
Going forward on tree and text.") print(f"First match is {text_index}") diff --git a/src/corpus_query_language/functions/__init__.py b/src/corpus_query_language/utils/__init__.py similarity index 100% rename from src/corpus_query_language/functions/__init__.py rename to src/corpus_query_language/utils/__init__.py diff --git a/src/corpus_query_language/functions/functions.py b/src/corpus_query_language/utils/utils.py similarity index 100% rename from src/corpus_query_language/functions/functions.py rename to src/corpus_query_language/utils/utils.py diff --git a/tests/tests.py b/tests/tests.py index 4c08913..61537c5 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -1,8 +1,7 @@ import ast import sys sys.path.append('src/') -import corpus_query_language.core.core as core -import corpus_query_language.functions.functions as functions +import corpus_query_language as CQL import unittest def import_test_queries(path): @@ -24,14 +23,14 @@ def test_simple_match(self): "pos": "NCMS000", "morph": None, "word": "asnos"} - self.assertEqual(functions.simple_match(query, test_token), True, "Something is wrong" + self.assertEqual(CQL.utils.simple_match(query, test_token), True, "Something is wrong" "with function `test_simple_match`") class TestQueries(unittest.TestCase): def test_findall_queries(self): - self.corpus = functions.import_corpus("tests/test_data/test_corpus.json") + self.corpus = CQL.utils.import_corpus("tests/test_data/test_corpus.json") self.queries = import_test_queries("tests/queries_findall.txt") - self.MyEngine = core.CQLEngine() + self.MyEngine = CQL.core.CQLEngine() for query, GT in self.queries: GT = ast.literal_eval(GT) with self.subTest(query=query, GT=GT): @@ -40,7 +39,7 @@ def test_findall_queries(self): def test_match_queries(self): self.queries = import_match_queries("tests/queries_match.txt") - self.MyEngine = core.CQLEngine() + self.MyEngine = CQL.core.CQLEngine() for idx, (nodes, query, GT) in enumerate(self.queries): with 
self.subTest(query=query, GT=GT): GT = True if GT == "True" else False From a1ef50e90c7ef5531386b2090933e69c3fd4b38d Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 17:41:43 +0200 Subject: [PATCH 5/6] Reorg prototype script --- CQL.py => src/CQL.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) rename CQL.py => src/CQL.py (78%) diff --git a/CQL.py b/src/CQL.py similarity index 78% rename from CQL.py rename to src/CQL.py index 4332e69..a933b91 100644 --- a/CQL.py +++ b/src/CQL.py @@ -6,8 +6,7 @@ # - any regex in the annotation value [lemma='reye?s?'] # - alternatives: [lemma='rey']|[lemma='príncipe'][]{,5}[lemma='santo'] import sys -import corpus_query_language.core.core as core -import corpus_query_language.utils.utils as utils +import corpus_query_language as CQL # Takes a list of dicts with the annotations as input. Returns: # - a list of spans (search_all function) @@ -16,8 +15,8 @@ def main(): query = sys.argv[1] - corpus = utils.import_corpus("tests/test_data/test_corpus.json") - MyEngine = core.CQLEngine() + corpus = CQL.utils.import_corpus("../tests/test_data/test_corpus.json") + MyEngine = CQL.core.CQLEngine() MyEngine.findall(corpus, query) MyEngine.match(corpus, query) From 40f5b82adbaa0fe06da4e6bc943d02cbf870c1f1 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 18:10:16 +0200 Subject: [PATCH 6/6] Increase version, add documentation and README --- README.md | 38 ++++++++++++++++++-- pyproject.toml | 2 +- src/corpus_query_language/core/core.py | 5 +++ src/corpus_query_language/engine/engine.py | 11 ++++-- src/corpus_query_language/language/lexer.py | 5 ++- src/corpus_query_language/language/parser.py | 3 ++ src/corpus_query_language/utils/utils.py | 15 ++++++-- 7 files changed, 71 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index d4e3326..2a5ef74 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,38 @@ -# CQLEngine - a simple Corpus Query Language Processor +# 
Corpus Query Language Engine +## Presentation +This repo hosts the code for a simple +CQL processor. CQL is a language used for +linguistic queries over large corpora. -Work in progress \ No newline at end of file +## Pip install + +```shell +pip3 install corpus-query-language +``` + +## Uses + +Two main functions are implemented: +- match, for checking if some pattern exists in a corpus (stops at first match). Returns a boolean +- findall, for finding the position of all matching tokens. Returns a list of tuples, with start and end position. + +```python +import sys +import corpus_query_language as CQL + +query = "Some CQL query" +corpus = CQL.utils.import_corpus("path/to/json/corpus.json") +MyEngine = CQL.core.CQLEngine() +MyEngine.findall(corpus, query) +MyEngine.match(corpus, query) +``` + +## Implemented CQL functions + +- parsing of any kind of annotation classes: `word`, `lemma`, `pos`, `morph` +- combination of annotations: `[lemma='rey' & pos='NCMP000']` +- one or zero annotations `[lemma='rey']?` (partially implemented, may produce errors). 
+- distance `[lemma='rey'][]{,5}[lemma='santo']` +- any regex in the annotation value `[lemma='reye?s?']` +- alternatives: `([lemma='rey']|[lemma='príncipe'])[]{,5}[lemma='santo']` (may produce errors) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 49baa2b..bced933 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "corpus-query-language" -version = "0.0.4" +version = "0.0.5" authors = [ { name="Matthias Gille Levenson", email="matthias.gille-levenson@ens-lyon.fr" }, ] diff --git a/src/corpus_query_language/core/core.py b/src/corpus_query_language/core/core.py index eacf7d7..4039eae 100644 --- a/src/corpus_query_language/core/core.py +++ b/src/corpus_query_language/core/core.py @@ -2,6 +2,11 @@ import corpus_query_language.engine.engine as engine class CQLEngine(): + """ + The main class: tokenize a query, parse it, and parse a corpus with 2 main functions: + - findall + - match + """ def findall(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=False) -> list[tuple[int, int]]: """ This function checks if a query matches some text, and returns the start and end span. diff --git a/src/corpus_query_language/engine/engine.py b/src/corpus_query_language/engine/engine.py index a722c91..1d51eaf 100644 --- a/src/corpus_query_language/engine/engine.py +++ b/src/corpus_query_language/engine/engine.py @@ -1,7 +1,15 @@ import corpus_query_language.utils.utils as utils -def parse_corpus(ast, corpus, mode, debug): +def parse_corpus(ast, corpus: list[dict], mode:str, debug) -> bool | list[tuple[int, int]]: + """ + Main function for parsing a corpus given an AST. + :param ast: The Abstract Syntax Tree to be matched against the corpus. + :param corpus: The corpus as a list of dictionaries. 
+ :param mode: The mode: match (stop at first match, return Bool) or find (search for all matches, returns list of tuples) + :param debug: Debug mode: print all information of matching process + :return: + """ match = False text_end = False tree_index = 0 @@ -23,7 +31,6 @@ def parse_corpus(ast, corpus, mode, debug): # Text-directed engine. while text_end == False: - # On teste si on est en bout de texte. if len(corpus) == text_index and tree_index != ast_length: if debug: diff --git a/src/corpus_query_language/language/lexer.py b/src/corpus_query_language/language/lexer.py index 3490917..ad417d3 100644 --- a/src/corpus_query_language/language/lexer.py +++ b/src/corpus_query_language/language/lexer.py @@ -2,6 +2,9 @@ import copy class Lexer(object): + """ + Lexer that is used to tokenize a query. + """ tokens = ( 'RANGE', 'DISTANCE', @@ -81,7 +84,7 @@ def t_error(self, t): print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) - def tokenize(self, query, debug): + def tokenize(self, query:str, debug:bool=False): self.lexer = lex.lex(module=self) self.lexer.input(query) diff --git a/src/corpus_query_language/language/parser.py b/src/corpus_query_language/language/parser.py index 1050179..9d7266e 100644 --- a/src/corpus_query_language/language/parser.py +++ b/src/corpus_query_language/language/parser.py @@ -6,6 +6,9 @@ # API functionnalities. class Parser(lexer.Lexer): + """ + The parser. Builds the Ast with the tokens produced by the lexer. 
+ """ tokens = lexer.Lexer.tokens def p_or_queries(self, p): diff --git a/src/corpus_query_language/utils/utils.py b/src/corpus_query_language/utils/utils.py index 217e098..a73e4d9 100644 --- a/src/corpus_query_language/utils/utils.py +++ b/src/corpus_query_language/utils/utils.py @@ -6,7 +6,13 @@ -def build_grammar(debug, query): +def build_grammar(debug:bool, query:str) -> list: + """ + This function builds an Abstract Syntax Tree from a query + :param debug: outputs parsing information + :param query: the query to build the AST from + :return: the ast + """ MyLexer = lexer.Lexer() MyLexer.tokenize(query, debug=debug) MyParser = parser.Parser(MyLexer, debug=debug) @@ -72,7 +78,12 @@ def alternative_match(queries:list[tuple], text_token:dict) -> bool: -def import_corpus(path): +def import_corpus(path) -> list: + """ + Simple JSON file import to dict + :param path: Path to the JSON file + :return: the list of dicts + """ with open(path, "r") as f: corpus = json.load(f) return corpus \ No newline at end of file