From 39639282707b88524e0bb852dd224e95de822222 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Tue, 16 Sep 2025 21:55:41 +0200 Subject: [PATCH 01/23] Update project --- .github/workflows/testpypi-publish.yml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/testpypi-publish.yml b/.github/workflows/testpypi-publish.yml index d61737c..d769f4f 100644 --- a/.github/workflows/testpypi-publish.yml +++ b/.github/workflows/testpypi-publish.yml @@ -57,4 +57,4 @@ jobs: - name: Publish package distributions to TestPyPI uses: pypa/gh-action-pypi-publish@release/v1 with: - repository-url: https://test.pypi.org/p/CQLEngine + repository-url: https://test.pypi.org/p/CQLEngine/ diff --git a/pyproject.toml b/pyproject.toml index 994e150..1ea9448 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "CQLEngine" -version = "0.0.1" +version = "0.0.213" authors = [ { name="Matthias Gille Levenson", email="matthias.gille-levenson@ens-lyon.fr" }, ] From c4dcdf3512db0729419112c3dda3d257feb9f8a8 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Tue, 16 Sep 2025 21:57:40 +0200 Subject: [PATCH 02/23] Update workflow --- .github/workflows/testpypi-publish.yml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/testpypi-publish.yml b/.github/workflows/testpypi-publish.yml index d769f4f..3b34af4 100644 --- a/.github/workflows/testpypi-publish.yml +++ b/.github/workflows/testpypi-publish.yml @@ -57,4 +57,4 @@ jobs: - name: Publish package distributions to TestPyPI uses: pypa/gh-action-pypi-publish@release/v1 with: - repository-url: https://test.pypi.org/p/CQLEngine/ + repository-url: https://test.pypi.org/legacy/ diff --git a/pyproject.toml b/pyproject.toml index 1ea9448..07d9c64 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "CQLEngine" -version = "0.0.213" +version = "0.0.214" authors = [ { name="Matthias Gille Levenson", email="matthias.gille-levenson@ens-lyon.fr" }, ] From aa2d4fc3bc920bdab699249fa54e541d0675713e Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Tue, 16 Sep 2025 22:24:52 +0200 Subject: [PATCH 03/23] Update architecture, fix errors --- CQL.py | 26 ++++++++++++++ pyproject.toml | 2 +- src/CQLEngine/CQL.py | 69 -------------------------------------- src/CQLEngine/engine.py | 3 +- src/CQLEngine/functions.py | 44 ++++++++++++++++++++++++ src/CQLEngine/lexer.py | 1 - src/CQLEngine/parser.py | 2 +- tests/tests.py | 7 ++-- 8 files changed, 77 insertions(+), 77 deletions(-) create mode 100644 CQL.py delete mode 100644 src/CQLEngine/CQL.py diff --git a/CQL.py b/CQL.py new file mode 100644 index 0000000..b9344c4 --- /dev/null +++ b/CQL.py @@ -0,0 +1,26 @@ +# Python package project: CQL (Corpus Query Language) parser: +# - parsing of any kind of annotation: word, lemma, pos, morph +# - combination of annotations: [lemma='rey' & pos='NCMP000'] +# - one or zero annotations [lemma='rey']?. For one ore more, see the distance operator +# - distance [lemma='rey'][]{,5}[lemma='santo'] +# - any regex in the annotation value [lemma='reye?s?'] +# - alternatives: [lemma='rey']|[lemma='príncipe'][]{,5}[lemma='santo'] +import sys +import CQLEngine.functions as functions + +# Takes a list of dicts with the annotations as input. Returns: +# - a list of spans (search_all function) +# - a boolean (match function) + + + +def main(): + query = sys.argv[1] + corpus = functions.import_corpus("tests/test_data/test_corpus.json") + MyEngine = functions.CQLEngine() + MyEngine.findall(corpus, query) + MyEngine.match(corpus, query) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 07d9c64..ac2c11a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "CQLEngine" -version = "0.0.214" +version = "0.0.3" authors = [ { name="Matthias Gille Levenson", email="matthias.gille-levenson@ens-lyon.fr" }, ] diff --git a/src/CQLEngine/CQL.py b/src/CQLEngine/CQL.py deleted file mode 100644 index 7997e42..0000000 --- a/src/CQLEngine/CQL.py +++ /dev/null @@ -1,69 +0,0 @@ -# Python package project: CQL (Corpus Query Language) parser: -# - parsing of any kind of annotation: word, lemma, pos, morph -# - combination of annotations: [lemma='rey' & pos='NCMP000'] -# - one or zero annotations [lemma='rey']?. For one ore more, see the distance operator -# - distance [lemma='rey'][]{,5}[lemma='santo'] -# - any regex in the annotation value [lemma='reye?s?'] -# - alternatives: [lemma='rey']|[lemma='príncipe'][]{,5}[lemma='santo'] -import json - -# Takes a list of dicts with the annotations as input. Returns: -# - a list of spans (search_all function) -# - a boolean (match function) - -import src.CQLEngine.parser as parser -import src.CQLEngine.lexer as lexer -import src.CQLEngine.functions as functions -import src.CQLEngine.engine as engine -import sys - - -class CQLEngine(): - def findall(self, corpus:list[dict], query:str, debug) -> list[tuple[int, int]]: - """ - This function checks if a query matches some text, and returns the start and end span. - :param query: a CQL query - :param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word) - :return: a list of tuples with the start and end position. - """ - query_ast = build_grammar(debug=debug, query=query) - result = engine.parse_corpus(query_ast, corpus, debug=debug, match=False) - print(f"\n---\nResults for query {query}:") - print(f"Ast: {query_ast}") - print(f"Spans: {result}") - return result - - - def match(self, corpus:list[dict], query:str, debug:bool) -> bool: - """ - This function checks whether a query matches some text, and returns True or False - :param query: a CQL query - :param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word) - :return: a boolean - """ - query_ast = build_grammar(debug=debug, query=query) - result = engine.parse_corpus(query_ast, corpus, debug=debug, match=False) - print(f"\n---\nResults for query {query}:") - result = len(result) != 0 - print(result) - return result - - -def build_grammar(debug, query): - MyLexer = lexer.Lexer() - MyLexer.build(query, debug=debug) - MyParser = parser.Parser(MyLexer, debug=debug) - if debug: - print(MyParser.ast) - return MyParser.ast - - - - - -if __name__ == '__main__': - query = sys.argv[1] - corpus = functions.import_corpus("tests/test_data/test_corpus.json") - MyEngine = CQLEngine() - MyEngine.findall(corpus, query) - MyEngine.match(corpus, query) \ No newline at end of file diff --git a/src/CQLEngine/engine.py b/src/CQLEngine/engine.py index 3a5b952..857cbe5 100644 --- a/src/CQLEngine/engine.py +++ b/src/CQLEngine/engine.py @@ -1,4 +1,5 @@ -import src.CQLEngine.functions as functions +import CQLEngine.functions as functions + def parse_corpus(ast, corpus, debug, match=True): match = False diff --git a/src/CQLEngine/functions.py b/src/CQLEngine/functions.py index d6b7a68..998fbd0 100644 --- a/src/CQLEngine/functions.py +++ b/src/CQLEngine/functions.py @@ -1,5 +1,49 @@ import re import json +import CQLEngine.parser as parser +import CQLEngine.lexer as lexer +import CQLEngine.engine as engine + + +class CQLEngine(): + def findall(self, corpus:list[dict], query:str, debug:bool=False) -> list[tuple[int, int]]: + """ + This function checks if a query matches some text, and returns the start and end span. + :param query: a CQL query + :param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word) + :return: a list of tuples with the start and end position. + """ + query_ast = build_grammar(debug=debug, query=query) + result = engine.parse_corpus(query_ast, corpus, debug=debug, match=False) + print(f"\n---\nResults for query {query}:") + print(f"Ast: {query_ast}") + print(f"Spans: {result}") + return result + + + def match(self, corpus:list[dict], query:str, debug:bool=False) -> bool: + """ + This function checks whether a query matches some text, and returns True or False + :param query: a CQL query + :param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word) + :return: a boolean + """ + query_ast = build_grammar(debug=debug, query=query) + result = engine.parse_corpus(query_ast, corpus, debug=debug, match=False) + print(f"\n---\nResults for query {query}:") + result = len(result) != 0 + print(result) + return result + + +def build_grammar(debug, query): + MyLexer = lexer.Lexer() + MyLexer.build(query, debug=debug) + MyParser = parser.Parser(MyLexer, debug=debug) + if debug: + print(MyParser.ast) + return MyParser.ast + def simple_match(query:tuple, text_token:dict) -> bool: """ diff --git a/src/CQLEngine/lexer.py b/src/CQLEngine/lexer.py index f72d7bd..ce58389 100644 --- a/src/CQLEngine/lexer.py +++ b/src/CQLEngine/lexer.py @@ -1,6 +1,5 @@ import ply.lex as lex import copy -print("Imported") class Lexer(object): tokens = ( diff --git a/src/CQLEngine/parser.py b/src/CQLEngine/parser.py index 56dbc81..1f80c12 100644 --- a/src/CQLEngine/parser.py +++ b/src/CQLEngine/parser.py @@ -1,5 +1,5 @@ import ply.yacc as yacc -import src.CQLEngine.lexer as lexer +import CQLEngine.lexer as lexer diff --git a/tests/tests.py b/tests/tests.py index 953dac0..9fa9dd6 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -1,7 +1,6 @@ import ast -import src.CQLEngine.CQL as CQLEngine -import src.CQLEngine.functions as functions +import CQLEngine.functions as functions import unittest @@ -24,7 +23,7 @@ class TestQueries(unittest.TestCase): def test_findall_queries(self): self.corpus = functions.import_corpus("tests/test_data/test_corpus.json") self.queries = import_test_queries("tests/queries_findall.txt") - self.MyEngine = CQLEngine.CQLEngine() + self.MyEngine = functions.CQLEngine() for query, GT in self.queries: GT = ast.literal_eval(GT) with self.subTest(query=query, GT=GT): @@ -34,7 +33,7 @@ def test_findall_queries(self): def test_match_queries(self): self.corpus = functions.import_corpus("tests/test_data/test_corpus.json") self.queries = import_test_queries("tests/queries_match.txt") - self.MyEngine = CQLEngine.CQLEngine() + self.MyEngine = functions.CQLEngine() for query, GT in self.queries: with self.subTest(query=query, GT=GT): GT = True if GT == "True" else False From d24799b6bbb3c25fb7cbe8d866a28a3d6075e57a Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Tue, 16 Sep 2025 22:35:36 +0200 Subject: [PATCH 04/23] New release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ac2c11a..dee4b7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "CQLEngine" -version = "0.0.3" +version = "0.0.2" authors = [ { name="Matthias Gille Levenson", email="matthias.gille-levenson@ens-lyon.fr" }, ] From 31196e38395f2f57edc8432ead8d114c3b0ab0ed Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Tue, 16 Sep 2025 22:51:56 +0200 Subject: [PATCH 05/23] Fix test import --- tests/tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests.py b/tests/tests.py index 9fa9dd6..e6ac430 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -1,7 +1,7 @@ import ast - +import sys +sys.path.append('src/') import CQLEngine.functions as functions - import unittest def import_test_queries(path): From ff9eb22a853f6e4676dabaa93261ebb7cfaf0fb4 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Tue, 16 Sep 2025 23:05:29 +0200 Subject: [PATCH 06/23] Update project and actions --- .github/workflows/python-app.yml | 3 +-- .github/workflows/testpypi-publish.yml | 23 +++++++++++++++++++++++ pyproject.toml | 2 +- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 9882c8c..e431c4a 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -5,7 +5,7 @@ name: Python application on: push: - branches: [ "main", "dev" ] + branches: [ "dev" ] pull_request: branches: [ "main" ] @@ -14,7 +14,6 @@ permissions: jobs: build: - runs-on: ubuntu-latest steps: diff --git a/.github/workflows/testpypi-publish.yml b/.github/workflows/testpypi-publish.yml index 3b34af4..63b46bf 100644 --- a/.github/workflows/testpypi-publish.yml +++ b/.github/workflows/testpypi-publish.yml @@ -17,6 +17,29 @@ permissions: contents: read jobs: + build-and-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.11 + uses: actions/setup-python@v3 + with: + python-version: "3.11" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with unittest + run: | + python3 -m unittest tests/tests.py + release-build: runs-on: ubuntu-latest steps: diff --git a/pyproject.toml b/pyproject.toml index dee4b7d..ac2c11a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "CQLEngine" -version = "0.0.2" +version = "0.0.3" authors = [ { name="Matthias Gille Levenson", email="matthias.gille-levenson@ens-lyon.fr" }, ] From 15261c79dc693d71e934ff95326b0a7aed755c53 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Tue, 16 Sep 2025 23:08:05 +0200 Subject: [PATCH 07/23] Update project --- .github/workflows/testpypi-publish.yml | 5 +++-- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/testpypi-publish.yml b/.github/workflows/testpypi-publish.yml index 63b46bf..c263370 100644 --- a/.github/workflows/testpypi-publish.yml +++ b/.github/workflows/testpypi-publish.yml @@ -7,11 +7,11 @@ # To get a newer version, you will need to update the SHA. # You can also reference a tag or branch, but the action may change without warning. -name: Upload Python Package +name: Upload on test.pypi on: push: - tags: '*' + tags: '**' permissions: contents: read @@ -41,6 +41,7 @@ jobs: python3 -m unittest tests/tests.py release-build: + needs: build-and-test runs-on: ubuntu-latest steps: - uses: actions/checkout@v5 diff --git a/pyproject.toml b/pyproject.toml index ac2c11a..e1d2539 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "CQLEngine" -version = "0.0.3" +version = "0.0.3/alpha" authors = [ { name="Matthias Gille Levenson", email="matthias.gille-levenson@ens-lyon.fr" }, ] From 89399f8d636203900a6e63725f94466982c059a2 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Tue, 16 Sep 2025 23:10:55 +0200 Subject: [PATCH 08/23] Update architecture --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e1d2539..ac2c11a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "CQLEngine" -version = "0.0.3/alpha" +version = "0.0.3" authors = [ { name="Matthias Gille Levenson", email="matthias.gille-levenson@ens-lyon.fr" }, ] From 3f47c4b23dfd0ffab1103de27b53ddeac7937a16 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Tue, 16 Sep 2025 23:13:13 +0200 Subject: [PATCH 09/23] Update pyproject --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ac2c11a..fde6f35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "CQLEngine" -version = "0.0.3" +version = "0.0.4" authors = [ { name="Matthias Gille Levenson", email="matthias.gille-levenson@ens-lyon.fr" }, ] From b43afb94cbfa3e8855480979b06c4f7fa742366a Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Tue, 16 Sep 2025 23:30:49 +0200 Subject: [PATCH 10/23] Simplify code --- .github/workflows/testpypi-publish.yml | 3 +- src/CQLEngine/engine.py | 2 +- src/CQLEngine/functions.py | 4 +- src/CQLEngine/parser.py | 1 - src/CQLEngine/test.py | 56 -------------------------- tests/tests.py | 7 ++-- 6 files changed, 9 insertions(+), 64 deletions(-) delete mode 100644 src/CQLEngine/test.py diff --git a/.github/workflows/testpypi-publish.yml b/.github/workflows/testpypi-publish.yml index c263370..ea004cf 100644 --- a/.github/workflows/testpypi-publish.yml +++ b/.github/workflows/testpypi-publish.yml @@ -11,7 +11,8 @@ name: Upload on test.pypi on: push: - tags: '**' + branches: [ "dev" ] + tags: '*' permissions: contents: read diff --git a/src/CQLEngine/engine.py b/src/CQLEngine/engine.py index 857cbe5..8d1fef2 100644 --- a/src/CQLEngine/engine.py +++ b/src/CQLEngine/engine.py @@ -1,7 +1,7 @@ import CQLEngine.functions as functions -def parse_corpus(ast, corpus, debug, match=True): +def parse_corpus(ast, corpus, debug): match = False tree_index = 0 text_index = 0 diff --git a/src/CQLEngine/functions.py b/src/CQLEngine/functions.py index 998fbd0..9af0060 100644 --- a/src/CQLEngine/functions.py +++ b/src/CQLEngine/functions.py @@ -14,7 +14,7 @@ def findall(self, corpus:list[dict], query:str, debug:bool=False) -> list[tuple[ :return: a list of tuples with the start and end position. """ query_ast = build_grammar(debug=debug, query=query) - result = engine.parse_corpus(query_ast, corpus, debug=debug, match=False) + result = engine.parse_corpus(query_ast, corpus, debug=debug) print(f"\n---\nResults for query {query}:") print(f"Ast: {query_ast}") print(f"Spans: {result}") @@ -29,7 +29,7 @@ def match(self, corpus:list[dict], query:str, debug:bool=False) -> bool: :return: a boolean """ query_ast = build_grammar(debug=debug, query=query) - result = engine.parse_corpus(query_ast, corpus, debug=debug, match=False) + result = engine.parse_corpus(query_ast, corpus, debug=debug) print(f"\n---\nResults for query {query}:") result = len(result) != 0 print(result) diff --git a/src/CQLEngine/parser.py b/src/CQLEngine/parser.py index 1f80c12..4c16277 100644 --- a/src/CQLEngine/parser.py +++ b/src/CQLEngine/parser.py @@ -74,7 +74,6 @@ def p_subquery_and_subquery(self, p): | query_atom AND query_atom | query_atom AND query_atom AND query_atom | query_atom AND query_atom AND query_atom AND query_atom''' - print(len(p)) if len(p) == 2: p[0] = p[1] elif len(p) == 4: diff --git a/src/CQLEngine/test.py b/src/CQLEngine/test.py deleted file mode 100644 index b195fb0..0000000 --- a/src/CQLEngine/test.py +++ /dev/null @@ -1,56 +0,0 @@ -import sys - -import ply.lex as lex -import ply.yacc as yacc - -# Lexer -tokens = ('LEMMA', 'EQUAL', 'VALUE', 'LSQBRACK', 'RSQBRACK') - -t_EQUAL = r'=' -t_LSQBRACK = r'\[' -t_RSQBRACK = r'\]' - -def t_LEMMA(t): - r'lemma' - return t - -def t_VALUE(t): - r'[a-zA-Z_][a-zA-Z0-9_]*|\'[^\']*\'|"[^"]*"' - if t.value.startswith("'") and t.value.endswith("'"): - t.value = t.value[1:-1] - elif t.value.startswith('"') and t.value.endswith('"'): - t.value = t.value[1:-1] - return t - -t_ignore = ' \t' - -def t_error(t): - print(f"Caractère inattendu: '{t.value[0]}' (ligne {t.lineno})") - t.lexer.skip(1) - -lexer = lex.lex() - -# Parser -def p_bracketed_query(p): - 'query : LSQBRACK query RSQBRACK' - p[0] = p[2] - -def p_query_lemma(p): - 'query : LEMMA EQUAL VALUE' - p[0] = ('lemma', p[3]) - -def p_error(p): - if p: - print(f"Erreur de syntaxe à '{p.value}' (ligne {p.lineno})") - else: - print("Erreur de syntaxe : fin de fichier inattendue") - -parser = yacc.yacc(start='query') - -# Test -data = "[lemma='monarchia']" -lexer.input(sys.argv[1]) - -ast = parser.parse(lexer=lexer) -print("AST généré :", ast) - diff --git a/tests/tests.py b/tests/tests.py index e6ac430..4ce1a79 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -12,12 +12,13 @@ def import_test_queries(path): class TestFunctions(unittest.TestCase): def test_simple_match(self): - query = ("lemma", "!=", "asno") + query = ("lemma", "=", "asno") test_token = {"lemma": "asno", "pos": "NCMS000", "morph": None, "word": "asnos"} - self.assertEqual(functions.simple_match(query, test_token), False) + self.assertEqual(functions.simple_match(query, test_token), True, "Something is wrong" + "with function `test_simple_match`") class TestQueries(unittest.TestCase): def test_findall_queries(self): @@ -27,7 +28,7 @@ def test_findall_queries(self): for query, GT in self.queries: GT = ast.literal_eval(GT) with self.subTest(query=query, GT=GT): - self.assertEqual(self.MyEngine.findall(self.corpus, query, debug=False), GT) + self.assertEqual(self.MyEngine.findall(self.corpus, query, debug=False), GT, "Error with findall function") def test_match_queries(self): From 939fd6c0a1b2655311b8016acb5f8588a979efa0 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 12:55:47 +0200 Subject: [PATCH 11/23] Rename lexer function --- src/CQLEngine/lexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CQLEngine/lexer.py b/src/CQLEngine/lexer.py index ce58389..3490917 100644 --- a/src/CQLEngine/lexer.py +++ b/src/CQLEngine/lexer.py @@ -81,7 +81,7 @@ def t_error(self, t): print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) - def build(self, query, debug): + def tokenize(self, query, debug): self.lexer = lex.lex(module=self) self.lexer.input(query) From 9a8fe65b768a4810d7078d6d15c9d7aec7dfee9e Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 12:56:00 +0200 Subject: [PATCH 12/23] Add verbosity option --- src/CQLEngine/functions.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/CQLEngine/functions.py b/src/CQLEngine/functions.py index 9af0060..8a0552f 100644 --- a/src/CQLEngine/functions.py +++ b/src/CQLEngine/functions.py @@ -6,7 +6,7 @@ class CQLEngine(): - def findall(self, corpus:list[dict], query:str, debug:bool=False) -> list[tuple[int, int]]: + def findall(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=False) -> list[tuple[int, int]]: """ This function checks if a query matches some text, and returns the start and end span. :param query: a CQL query @@ -15,13 +15,14 @@ def findall(self, corpus:list[dict], query:str, debug:bool=False) -> list[tuple[ """ query_ast = build_grammar(debug=debug, query=query) result = engine.parse_corpus(query_ast, corpus, debug=debug) - print(f"\n---\nResults for query {query}:") - print(f"Ast: {query_ast}") - print(f"Spans: {result}") + if verbose: + print(f"\n---\nResults for query {query}:") + print(f"Ast: {query_ast}") + print(f"Spans: {result}") return result - def match(self, corpus:list[dict], query:str, debug:bool=False) -> bool: + def match(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=False) -> bool: """ This function checks whether a query matches some text, and returns True or False :param query: a CQL query @@ -30,15 +31,16 @@ def match(self, corpus:list[dict], query:str, debug:bool=False) -> bool: """ query_ast = build_grammar(debug=debug, query=query) result = engine.parse_corpus(query_ast, corpus, debug=debug) - print(f"\n---\nResults for query {query}:") result = len(result) != 0 - print(result) + if verbose: + print(f"\n---\nResults for query {query}:") + print(result) return result def build_grammar(debug, query): MyLexer = lexer.Lexer() - MyLexer.build(query, debug=debug) + MyLexer.tokenize(query, debug=debug) MyParser = parser.Parser(MyLexer, debug=debug) if debug: print(MyParser.ast) From 3304332dbad31cfd75a51087f1def03a37eb54a7 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 12:56:24 +0200 Subject: [PATCH 13/23] Update description, to be added in the README --- CQL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CQL.py b/CQL.py index b9344c4..adb1d24 100644 --- a/CQL.py +++ b/CQL.py @@ -1,7 +1,7 @@ # Python package project: CQL (Corpus Query Language) parser: # - parsing of any kind of annotation: word, lemma, pos, morph # - combination of annotations: [lemma='rey' & pos='NCMP000'] -# - one or zero annotations [lemma='rey']?. For one ore more, see the distance operator +# - one or zero annotations [lemma='rey']?. # - distance [lemma='rey'][]{,5}[lemma='santo'] # - any regex in the annotation value [lemma='reye?s?'] # - alternatives: [lemma='rey']|[lemma='príncipe'][]{,5}[lemma='santo'] From 5752239b472c1d5e2187cc1c02f9a6234b756d7b Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 14:49:47 +0200 Subject: [PATCH 14/23] Add requirements --- pyproject.toml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index fde6f35..7445c4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,13 @@ classifiers = [ ] license = " CC-BY-NC-SA-4.0" license-files = ["LICEN[CS]E*"] +dependencies = [ + "ply" +] + +[build-system] +requires = ["setuptools", "setuptools-scm"] +build-backend = "setuptools.build_meta" [project.urls] Homepage = "https://github.com/matgille/CQL" From 20fa4503cfff4bbdfbc11fe8d74e37b487fe1013 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 14:51:07 +0200 Subject: [PATCH 15/23] Improve parse corpus function, debug --- src/CQLEngine/engine.py | 41 +++++++++++++++++++++----------------- src/CQLEngine/functions.py | 5 ++--- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/CQLEngine/engine.py b/src/CQLEngine/engine.py index 8d1fef2..89574c1 100644 --- a/src/CQLEngine/engine.py +++ b/src/CQLEngine/engine.py @@ -1,8 +1,9 @@ import CQLEngine.functions as functions -def parse_corpus(ast, corpus, debug): +def parse_corpus(ast, corpus, mode, debug): match = False + text_end = False tree_index = 0 text_index = 0 @@ -21,38 +22,40 @@ def parse_corpus(ast, corpus, debug): analysis_list = ['lemma', 'pos', 'morph', 'word'] # Text-directed engine. - while match == False: - if debug: - print("-") - print(corpus[text_index]) - print(f"Text index: {text_index}") - print(f"Tree index: {tree_index}") - print(f"Ast length: {ast_length}") + while text_end == False: # On teste si on est en bout de texte. - if len(corpus) == text_index: + if len(corpus) == text_index and tree_index != ast_length: if debug: - print("End of text. Exiting.") - break - if text_index + 1 == len(corpus): - tree_index += 1 - if debug: - print("End of text. Exiting.") + print("End of text a. Exiting.") + text_end = True + if mode == "match": + return False break + + # Si on matche la longueur de notre arbre if tree_index == ast_length: + match = True all_spans.append((first_matching_index, text_index)) - first_matching_index = None - if match is True: - return True if debug: print(f"Appending {(first_matching_index, text_index)} to spans.") print(tree_index) print(ast_length) + first_matching_index = None + if match is True and mode == "match": + return True text_index += 1 tree_index = 0 matches = True # La boucle s'arrête là + + if debug: + print("-") + print(corpus[text_index]) + print(f"Text index: {text_index}") + print(f"Tree index: {tree_index}") + print(f"Ast length: {ast_length}") current_query = ast[tree_index] operator = current_query[0] if debug: @@ -60,6 +63,8 @@ def parse_corpus(ast, corpus, debug): if operator in analysis_list: if debug: print(f"{operator} in list of analysis") + print(len(corpus)) + print(text_index) if functions.simple_match(current_query, corpus[text_index]): if debug: print("Found you a. Going forward on tree and text.") diff --git a/src/CQLEngine/functions.py b/src/CQLEngine/functions.py index 8a0552f..4c01ad6 100644 --- a/src/CQLEngine/functions.py +++ b/src/CQLEngine/functions.py @@ -14,7 +14,7 @@ def findall(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=F :return: a list of tuples with the start and end position. """ query_ast = build_grammar(debug=debug, query=query) - result = engine.parse_corpus(query_ast, corpus, debug=debug) + result = engine.parse_corpus(query_ast, corpus, mode="find", debug=debug) if verbose: print(f"\n---\nResults for query {query}:") print(f"Ast: {query_ast}") @@ -30,8 +30,7 @@ def match(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=Fals :return: a boolean """ query_ast = build_grammar(debug=debug, query=query) - result = engine.parse_corpus(query_ast, corpus, debug=debug) - result = len(result) != 0 + result = engine.parse_corpus(query_ast, corpus, mode="match", debug=debug) if verbose: print(f"\n---\nResults for query {query}:") print(result) From 586710e3c914840fd71c56a3cfffd14f83abe853 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 14:51:24 +0200 Subject: [PATCH 16/23] Improve tests --- tests/queries_match.txt | 5 ++++- tests/tests.py | 18 ++++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/tests/queries_match.txt b/tests/queries_match.txt index 8876ee1..6955092 100644 --- a/tests/queries_match.txt +++ b/tests/queries_match.txt @@ -1 +1,4 @@ -[lemma='a'][lemma='vez'] True +[{'word': 'et', 'lemma': 'et', 'pos': 'CONcoo', 'morph': ''}, {'word': 'tenere', 'lemma': 'teneo', 'pos': 'VER', 'morph': 'MODE=Inf|TEMPS=Pres'}, {'word': 'monarchiam', 'lemma': 'monarchia', 'pos': 'NOMcom', 'morph': 'CAS=Acc|NOMB.=Sing'}, {'word': ':', 'lemma': 'e', 'pos': 'INJ', 'morph': ''}] [lemma='monarchia'] True +[{'word': 'et', 'lemma': 'et', 'pos': 'CONcoo', 'morph': ''}, {'word': 'tenere', 'lemma': 'teneo', 'pos': 'VER', 'morph': 'MODE=Inf|TEMPS=Pres'}, {'word': 'monarchiam', 'lemma': 'monarchia', 'pos': 'NOMcom', 'morph': 'CAS=Acc|NOMB.=Sing'}] [lemma='monarchia'] True +[{'word': 'et', 'lemma': 'et', 'pos': 'CONcoo', 'morph': ''}, {'word': 'tenere', 'lemma': 'teneo', 'pos': 'VER', 'morph': 'MODE=Inf|TEMPS=Pres'}, {'word': ':', 'lemma': 'e', 'pos': 'INJ', 'morph': ''}] [lemma='monarchia'] False +[{'word': 'et', 'lemma': 'monarchia', 'pos': 'CONcoo', 'morph': ''}, {'word': 'tenere', 'lemma': 'teneo', 'pos': 'VER', 'morph': 'MODE=Inf|TEMPS=Pres'}, {'word': 'monarchiam', 'lemma': 'monarchia', 'pos': 'NOMcom', 'morph': 'CAS=Acc|NOMB.=Sing'}, {'word': ':', 'lemma': 'e', 'pos': 'INJ', 'morph': ''}] [lemma='monarchia'] True diff --git a/tests/tests.py b/tests/tests.py index 4ce1a79..6899377 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -9,6 +9,12 @@ def import_test_queries(path): list_of_queries = f.read().splitlines() return [line.split("\t") for line in list_of_queries] +def import_match_queries(path): + with open(path, "r") as f: + list_of_queries = f.read().splitlines() + as_splits = [line.split("\t") for line in list_of_queries] + return [(ast.literal_eval(nodes), query, GT) for nodes, query, GT in as_splits] + class TestFunctions(unittest.TestCase): def test_simple_match(self): @@ -32,13 +38,17 @@ def test_findall_queries(self): def test_match_queries(self): - self.corpus = functions.import_corpus("tests/test_data/test_corpus.json") - self.queries = import_test_queries("tests/queries_match.txt") + self.queries = import_match_queries("tests/queries_match.txt") self.MyEngine = functions.CQLEngine() - for query, GT in self.queries: + for idx, (nodes, query, GT) in enumerate(self.queries): with self.subTest(query=query, GT=GT): GT = True if GT == "True" else False - self.assertEqual(self.MyEngine.match(self.corpus, query, debug=False), GT) + match = self.MyEngine.match(nodes, query, debug=True) + self.assertEqual(match, GT, + msg=f"\nTest {idx + 1} failed.\n" + f"Query: {query}\n" + f"Nodes: {nodes}\n" + f"Match should be {GT}, is {match}") if __name__ == '__main__': From 225a37c085ea34113ceb9e1f08311ced2d17f0c0 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 14:51:35 +0200 Subject: [PATCH 17/23] Increase version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7445c4b..67052be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "CQLEngine" -version = "0.0.4" +version = "0.0.5" authors = [ { name="Matthias Gille Levenson", email="matthias.gille-levenson@ens-lyon.fr" }, ] From 891a73b896469a6ee5cec7296fb18b45f25a84f3 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 15:07:35 +0200 Subject: [PATCH 18/23] Rename project, revert to version v0.0.1 --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 67052be..21449a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] -name = "CQLEngine" -version = "0.0.5" +name = "corpus-query-language" +version = "0.0.1" authors = [ { name="Matthias Gille Levenson", email="matthias.gille-levenson@ens-lyon.fr" }, ] From 8ea3f24d31abf5b0cbf29b75a5f3f5fcc4d48f31 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 15:09:14 +0200 Subject: [PATCH 19/23] Update workflow --- .github/workflows/testpypi-publish.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/testpypi-publish.yml b/.github/workflows/testpypi-publish.yml index ea004cf..ab48b4a 100644 --- a/.github/workflows/testpypi-publish.yml +++ b/.github/workflows/testpypi-publish.yml @@ -11,7 +11,6 @@ name: Upload on test.pypi on: push: - branches: [ "dev" ] tags: '*' permissions: From a8c00546be76a3839f8737b4a927534b66e80f5a Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 15:22:03 +0200 Subject: [PATCH 20/23] Update workflow --- .github/workflows/pypi-publish.yml | 84 ++++++++++++++++++++++++++ .github/workflows/testpypi-publish.yml | 2 +- 2 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/pypi-publish.yml diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml new file mode 100644 index 0000000..6017d70 --- /dev/null +++ b/.github/workflows/pypi-publish.yml @@ -0,0 +1,84 @@ +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +# GitHub recommends pinning actions to a commit SHA. +# To get a newer version, you will need to update the SHA. +# You can also reference a tag or branch, but the action may change without warning. + +name: Upload on Pypi + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + build-and-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.11 + uses: actions/setup-python@v3 + with: + python-version: "3.11" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with unittest + run: | + python3 -m unittest tests/tests.py + + release-build: + needs: build-and-test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Build release distributions + run: | + # NOTE: put your own distribution build steps here. + python -m pip install build + python -m build + + - name: Upload distributions + uses: actions/upload-artifact@v4 + with: + name: release-dists + path: dist/ + + pypi-publish: + needs: release-build + name: Publish package distributions to TestPyPI + runs-on: ubuntu-latest + environment: + name: testpypi + url: https://pypi.org/project/corpus-query-language + permissions: + id-token: write # IMPORTANT: this permission is mandatory for trusted publishing + steps: + - name: Télécharger les artefacts du build + uses: actions/download-artifact@v4 # <-- Ajoutez cette étape + with: + name: release-dists # <-- Nom de l'artefact uploadé dans `release-build` + path: dist/ + # retrieve your distributions here + - name: Publish package distributions to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://upload.pypi.org/legacy/ diff --git a/.github/workflows/testpypi-publish.yml b/.github/workflows/testpypi-publish.yml index ab48b4a..2121062 100644 --- a/.github/workflows/testpypi-publish.yml +++ b/.github/workflows/testpypi-publish.yml @@ -68,7 +68,7 @@ jobs: runs-on: ubuntu-latest environment: name: testpypi - url: https://test.pypi.org/p/CQLEngine + url: https://test.pypi.org/p/corpus-query-language permissions: id-token: write # IMPORTANT: this permission is mandatory for trusted publishing steps: From a48b59687c0c4f973e6dda9c5addd7d4dec54ebd Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 15:23:22 +0200 Subject: [PATCH 21/23] Rename workflow --- .github/workflows/{python-app.yml => test.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{python-app.yml => test.yml} (100%) diff --git a/.github/workflows/python-app.yml b/.github/workflows/test.yml similarity index 100% rename from .github/workflows/python-app.yml rename to .github/workflows/test.yml From f48d06717008f35f6eab54714e24ab1dfa0d2938 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 15:25:06 +0200 Subject: [PATCH 22/23] Rename job --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e431c4a..3c40e39 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,7 +13,7 @@ permissions: contents: read jobs: - build: + Test-code: runs-on: ubuntu-latest steps: From a0ac8c82cf4d1fc5882df84a6216d33bcd2d2326 Mon Sep 17 00:00:00 2001 From: Matthias Gille Levenson Date: Wed, 17 Sep 2025 15:26:13 +0200 Subject: [PATCH 23/23] Trigger tests on PR only --- .github/workflows/test.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3c40e39..b16f084 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,10 +4,8 @@ name: Python application on: - push: - branches: [ "dev" ] pull_request: - branches: [ "main" ] + branches: [ "main", "dev" ] permissions: contents: read