diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml new file mode 100644 index 0000000..6017d70 --- /dev/null +++ b/.github/workflows/pypi-publish.yml @@ -0,0 +1,84 @@ +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +# GitHub recommends pinning actions to a commit SHA. +# To get a newer version, you will need to update the SHA. +# You can also reference a tag or branch, but the action may change without warning. + +name: Upload on Pypi + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + build-and-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.11 + uses: actions/setup-python@v3 + with: + python-version: "3.11" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with unittest + run: | + python3 -m unittest tests/tests.py + + release-build: + needs: build-and-test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Build release distributions + run: | + # NOTE: put your own distribution build steps here. + python -m pip install build + python -m build + + - name: Upload distributions + uses: actions/upload-artifact@v4 + with: + name: release-dists + path: dist/ + + pypi-publish: + needs: release-build + name: Publish package distributions to TestPyPI + runs-on: ubuntu-latest + environment: + name: testpypi + url: https://pypi.org/project/corpus-query-language + permissions: + id-token: write # IMPORTANT: this permission is mandatory for trusted publishing + steps: + - name: Télécharger les artefacts du build + uses: actions/download-artifact@v4 # <-- Ajoutez cette étape + with: + name: release-dists # <-- Nom de l'artefact uploadé dans `release-build` + path: dist/ + # retrieve your distributions here + - name: Publish package distributions to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://upload.pypi.org/legacy/ diff --git a/.github/workflows/python-app.yml b/.github/workflows/test.yml similarity index 96% rename from .github/workflows/python-app.yml rename to .github/workflows/test.yml index 9882c8c..b16f084 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/test.yml @@ -4,17 +4,14 @@ name: Python application on: - push: - branches: [ "main", "dev" ] pull_request: - branches: [ "main" ] + branches: [ "main", "dev" ] permissions: contents: read jobs: - build: - + Test-code: runs-on: ubuntu-latest steps: diff --git a/.github/workflows/testpypi-publish.yml b/.github/workflows/testpypi-publish.yml index d61737c..2121062 100644 --- a/.github/workflows/testpypi-publish.yml +++ b/.github/workflows/testpypi-publish.yml @@ -7,7 +7,7 @@ # To get a newer version, you will need to update the SHA. # You can also reference a tag or branch, but the action may change without warning. -name: Upload Python Package +name: Upload on test.pypi on: push: @@ -17,7 +17,31 @@ permissions: contents: read jobs: + build-and-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.11 + uses: actions/setup-python@v3 + with: + python-version: "3.11" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with unittest + run: | + python3 -m unittest tests/tests.py + release-build: + needs: build-and-test runs-on: ubuntu-latest steps: - uses: actions/checkout@v5 @@ -44,7 +68,7 @@ jobs: runs-on: ubuntu-latest environment: name: testpypi - url: https://test.pypi.org/p/CQLEngine + url: https://test.pypi.org/p/corpus-query-language permissions: id-token: write # IMPORTANT: this permission is mandatory for trusted publishing steps: @@ -57,4 +81,4 @@ jobs: - name: Publish package distributions to TestPyPI uses: pypa/gh-action-pypi-publish@release/v1 with: - repository-url: https://test.pypi.org/p/CQLEngine + repository-url: https://test.pypi.org/legacy/ diff --git a/CQL.py b/CQL.py new file mode 100644 index 0000000..adb1d24 --- /dev/null +++ b/CQL.py @@ -0,0 +1,26 @@ +# Python package project: CQL (Corpus Query Language) parser: +# - parsing of any kind of annotation: word, lemma, pos, morph +# - combination of annotations: [lemma='rey' & pos='NCMP000'] +# - one or zero annotations [lemma='rey']?. +# - distance [lemma='rey'][]{,5}[lemma='santo'] +# - any regex in the annotation value [lemma='reye?s?'] +# - alternatives: [lemma='rey']|[lemma='príncipe'][]{,5}[lemma='santo'] +import sys +import CQLEngine.functions as functions + +# Takes a list of dicts with the annotations as input. Returns: +# - a list of spans (search_all function) +# - a boolean (match function) + + + +def main(): + query = sys.argv[1] + corpus = functions.import_corpus("tests/test_data/test_corpus.json") + MyEngine = functions.CQLEngine() + MyEngine.findall(corpus, query) + MyEngine.match(corpus, query) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 994e150..21449a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [project] -name = "CQLEngine" +name = "corpus-query-language" version = "0.0.1" authors = [ { name="Matthias Gille Levenson", email="matthias.gille-levenson@ens-lyon.fr" }, @@ -13,6 +13,13 @@ classifiers = [ ] license = " CC-BY-NC-SA-4.0" license-files = ["LICEN[CS]E*"] +dependencies = [ + "ply" +] + +[build-system] +requires = ["setuptools", "setuptools-scm"] +build-backend = "setuptools.build_meta" [project.urls] Homepage = "https://github.com/matgille/CQL" diff --git a/src/CQLEngine/CQL.py b/src/CQLEngine/CQL.py deleted file mode 100644 index 7997e42..0000000 --- a/src/CQLEngine/CQL.py +++ /dev/null @@ -1,69 +0,0 @@ -# Python package project: CQL (Corpus Query Language) parser: -# - parsing of any kind of annotation: word, lemma, pos, morph -# - combination of annotations: [lemma='rey' & pos='NCMP000'] -# - one or zero annotations [lemma='rey']?. For one ore more, see the distance operator -# - distance [lemma='rey'][]{,5}[lemma='santo'] -# - any regex in the annotation value [lemma='reye?s?'] -# - alternatives: [lemma='rey']|[lemma='príncipe'][]{,5}[lemma='santo'] -import json - -# Takes a list of dicts with the annotations as input. Returns: -# - a list of spans (search_all function) -# - a boolean (match function) - -import src.CQLEngine.parser as parser -import src.CQLEngine.lexer as lexer -import src.CQLEngine.functions as functions -import src.CQLEngine.engine as engine -import sys - - -class CQLEngine(): - def findall(self, corpus:list[dict], query:str, debug) -> list[tuple[int, int]]: - """ - This function checks if a query matches some text, and returns the start and end span. - :param query: a CQL query - :param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word) - :return: a list of tuples with the start and end position. - """ - query_ast = build_grammar(debug=debug, query=query) - result = engine.parse_corpus(query_ast, corpus, debug=debug, match=False) - print(f"\n---\nResults for query {query}:") - print(f"Ast: {query_ast}") - print(f"Spans: {result}") - return result - - - def match(self, corpus:list[dict], query:str, debug:bool) -> bool: - """ - This function checks whether a query matches some text, and returns True or False - :param query: a CQL query - :param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word) - :return: a boolean - """ - query_ast = build_grammar(debug=debug, query=query) - result = engine.parse_corpus(query_ast, corpus, debug=debug, match=False) - print(f"\n---\nResults for query {query}:") - result = len(result) != 0 - print(result) - return result - - -def build_grammar(debug, query): - MyLexer = lexer.Lexer() - MyLexer.build(query, debug=debug) - MyParser = parser.Parser(MyLexer, debug=debug) - if debug: - print(MyParser.ast) - return MyParser.ast - - - - - -if __name__ == '__main__': - query = sys.argv[1] - corpus = functions.import_corpus("tests/test_data/test_corpus.json") - MyEngine = CQLEngine() - MyEngine.findall(corpus, query) - MyEngine.match(corpus, query) \ No newline at end of file diff --git a/src/CQLEngine/engine.py b/src/CQLEngine/engine.py index 3a5b952..89574c1 100644 --- a/src/CQLEngine/engine.py +++ b/src/CQLEngine/engine.py @@ -1,7 +1,9 @@ -import src.CQLEngine.functions as functions +import CQLEngine.functions as functions -def parse_corpus(ast, corpus, debug, match=True): + +def parse_corpus(ast, corpus, mode, debug): match = False + text_end = False tree_index = 0 text_index = 0 @@ -20,38 +22,40 @@ def parse_corpus(ast, corpus, debug, match=True): analysis_list = ['lemma', 'pos', 'morph', 'word'] # Text-directed engine. - while match == False: - if debug: - print("-") - print(corpus[text_index]) - print(f"Text index: {text_index}") - print(f"Tree index: {tree_index}") - print(f"Ast length: {ast_length}") + while text_end == False: # On teste si on est en bout de texte. - if len(corpus) == text_index: - if debug: - print("End of text. Exiting.") - break - if text_index + 1 == len(corpus): - tree_index += 1 + if len(corpus) == text_index and tree_index != ast_length: if debug: - print("End of text. Exiting.") + print("End of text a. Exiting.") + text_end = True + if mode == "match": + return False break + + # Si on matche la longueur de notre arbre if tree_index == ast_length: + match = True all_spans.append((first_matching_index, text_index)) - first_matching_index = None - if match is True: - return True if debug: print(f"Appending {(first_matching_index, text_index)} to spans.") print(tree_index) print(ast_length) + first_matching_index = None + if match is True and mode == "match": + return True text_index += 1 tree_index = 0 matches = True # La boucle s'arrête là + + if debug: + print("-") + print(corpus[text_index]) + print(f"Text index: {text_index}") + print(f"Tree index: {tree_index}") + print(f"Ast length: {ast_length}") current_query = ast[tree_index] operator = current_query[0] if debug: @@ -59,6 +63,8 @@ def parse_corpus(ast, corpus, debug, match=True): if operator in analysis_list: if debug: print(f"{operator} in list of analysis") + print(len(corpus)) + print(text_index) if functions.simple_match(current_query, corpus[text_index]): if debug: print("Found you a. Going forward on tree and text.") diff --git a/src/CQLEngine/functions.py b/src/CQLEngine/functions.py index d6b7a68..4c01ad6 100644 --- a/src/CQLEngine/functions.py +++ b/src/CQLEngine/functions.py @@ -1,5 +1,50 @@ import re import json +import CQLEngine.parser as parser +import CQLEngine.lexer as lexer +import CQLEngine.engine as engine + + +class CQLEngine(): + def findall(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=False) -> list[tuple[int, int]]: + """ + This function checks if a query matches some text, and returns the start and end span. + :param query: a CQL query + :param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word) + :return: a list of tuples with the start and end position. + """ + query_ast = build_grammar(debug=debug, query=query) + result = engine.parse_corpus(query_ast, corpus, mode="find", debug=debug) + if verbose: + print(f"\n---\nResults for query {query}:") + print(f"Ast: {query_ast}") + print(f"Spans: {result}") + return result + + + def match(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=False) -> bool: + """ + This function checks whether a query matches some text, and returns True or False + :param query: a CQL query + :param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word) + :return: a boolean + """ + query_ast = build_grammar(debug=debug, query=query) + result = engine.parse_corpus(query_ast, corpus, mode="match", debug=debug) + if verbose: + print(f"\n---\nResults for query {query}:") + print(result) + return result + + +def build_grammar(debug, query): + MyLexer = lexer.Lexer() + MyLexer.tokenize(query, debug=debug) + MyParser = parser.Parser(MyLexer, debug=debug) + if debug: + print(MyParser.ast) + return MyParser.ast + def simple_match(query:tuple, text_token:dict) -> bool: """ diff --git a/src/CQLEngine/lexer.py b/src/CQLEngine/lexer.py index f72d7bd..3490917 100644 --- a/src/CQLEngine/lexer.py +++ b/src/CQLEngine/lexer.py @@ -1,6 +1,5 @@ import ply.lex as lex import copy -print("Imported") class Lexer(object): tokens = ( @@ -82,7 +81,7 @@ def t_error(self, t): print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) - def build(self, query, debug): + def tokenize(self, query, debug): self.lexer = lex.lex(module=self) self.lexer.input(query) diff --git a/src/CQLEngine/parser.py b/src/CQLEngine/parser.py index 56dbc81..4c16277 100644 --- a/src/CQLEngine/parser.py +++ b/src/CQLEngine/parser.py @@ -1,5 +1,5 @@ import ply.yacc as yacc -import src.CQLEngine.lexer as lexer +import CQLEngine.lexer as lexer @@ -74,7 +74,6 @@ def p_subquery_and_subquery(self, p): | query_atom AND query_atom | query_atom AND query_atom AND query_atom | query_atom AND query_atom AND query_atom AND query_atom''' - print(len(p)) if len(p) == 2: p[0] = p[1] elif len(p) == 4: diff --git a/src/CQLEngine/test.py b/src/CQLEngine/test.py deleted file mode 100644 index b195fb0..0000000 --- a/src/CQLEngine/test.py +++ /dev/null @@ -1,56 +0,0 @@ -import sys - -import ply.lex as lex -import ply.yacc as yacc - -# Lexer -tokens = ('LEMMA', 'EQUAL', 'VALUE', 'LSQBRACK', 'RSQBRACK') - -t_EQUAL = r'=' -t_LSQBRACK = r'\[' -t_RSQBRACK = r'\]' - -def t_LEMMA(t): - r'lemma' - return t - -def t_VALUE(t): - r'[a-zA-Z_][a-zA-Z0-9_]*|\'[^\']*\'|"[^"]*"' - if t.value.startswith("'") and t.value.endswith("'"): - t.value = t.value[1:-1] - elif t.value.startswith('"') and t.value.endswith('"'): - t.value = t.value[1:-1] - return t - -t_ignore = ' \t' - -def t_error(t): - print(f"Caractère inattendu: '{t.value[0]}' (ligne {t.lineno})") - t.lexer.skip(1) - -lexer = lex.lex() - -# Parser -def p_bracketed_query(p): - 'query : LSQBRACK query RSQBRACK' - p[0] = p[2] - -def p_query_lemma(p): - 'query : LEMMA EQUAL VALUE' - p[0] = ('lemma', p[3]) - -def p_error(p): - if p: - print(f"Erreur de syntaxe à '{p.value}' (ligne {p.lineno})") - else: - print("Erreur de syntaxe : fin de fichier inattendue") - -parser = yacc.yacc(start='query') - -# Test -data = "[lemma='monarchia']" -lexer.input(sys.argv[1]) - -ast = parser.parse(lexer=lexer) -print("AST généré :", ast) - diff --git a/tests/queries_match.txt b/tests/queries_match.txt index 8876ee1..6955092 100644 --- a/tests/queries_match.txt +++ b/tests/queries_match.txt @@ -1 +1,4 @@ -[lemma='a'][lemma='vez'] True +[{'word': 'et', 'lemma': 'et', 'pos': 'CONcoo', 'morph': ''}, {'word': 'tenere', 'lemma': 'teneo', 'pos': 'VER', 'morph': 'MODE=Inf|TEMPS=Pres'}, {'word': 'monarchiam', 'lemma': 'monarchia', 'pos': 'NOMcom', 'morph': 'CAS=Acc|NOMB.=Sing'}, {'word': ':', 'lemma': 'e', 'pos': 'INJ', 'morph': ''}] [lemma='monarchia'] True +[{'word': 'et', 'lemma': 'et', 'pos': 'CONcoo', 'morph': ''}, {'word': 'tenere', 'lemma': 'teneo', 'pos': 'VER', 'morph': 'MODE=Inf|TEMPS=Pres'}, {'word': 'monarchiam', 'lemma': 'monarchia', 'pos': 'NOMcom', 'morph': 'CAS=Acc|NOMB.=Sing'}] [lemma='monarchia'] True +[{'word': 'et', 'lemma': 'et', 'pos': 'CONcoo', 'morph': ''}, {'word': 'tenere', 'lemma': 'teneo', 'pos': 'VER', 'morph': 'MODE=Inf|TEMPS=Pres'}, {'word': ':', 'lemma': 'e', 'pos': 'INJ', 'morph': ''}] [lemma='monarchia'] False +[{'word': 'et', 'lemma': 'monarchia', 'pos': 'CONcoo', 'morph': ''}, {'word': 'tenere', 'lemma': 'teneo', 'pos': 'VER', 'morph': 'MODE=Inf|TEMPS=Pres'}, {'word': 'monarchiam', 'lemma': 'monarchia', 'pos': 'NOMcom', 'morph': 'CAS=Acc|NOMB.=Sing'}, {'word': ':', 'lemma': 'e', 'pos': 'INJ', 'morph': ''}] [lemma='monarchia'] True diff --git a/tests/tests.py b/tests/tests.py index 953dac0..6899377 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -1,8 +1,7 @@ import ast - -import src.CQLEngine.CQL as CQLEngine -import src.CQLEngine.functions as functions - +import sys +sys.path.append('src/') +import CQLEngine.functions as functions import unittest def import_test_queries(path): @@ -10,35 +9,46 @@ def import_test_queries(path): list_of_queries = f.read().splitlines() return [line.split("\t") for line in list_of_queries] +def import_match_queries(path): + with open(path, "r") as f: + list_of_queries = f.read().splitlines() + as_splits = [line.split("\t") for line in list_of_queries] + return [(ast.literal_eval(nodes), query, GT) for nodes, query, GT in as_splits] + class TestFunctions(unittest.TestCase): def test_simple_match(self): - query = ("lemma", "!=", "asno") + query = ("lemma", "=", "asno") test_token = {"lemma": "asno", "pos": "NCMS000", "morph": None, "word": "asnos"} - self.assertEqual(functions.simple_match(query, test_token), False) + self.assertEqual(functions.simple_match(query, test_token), True, "Something is wrong" + "with function `test_simple_match`") class TestQueries(unittest.TestCase): def test_findall_queries(self): self.corpus = functions.import_corpus("tests/test_data/test_corpus.json") self.queries = import_test_queries("tests/queries_findall.txt") - self.MyEngine = CQLEngine.CQLEngine() + self.MyEngine = functions.CQLEngine() for query, GT in self.queries: GT = ast.literal_eval(GT) with self.subTest(query=query, GT=GT): - self.assertEqual(self.MyEngine.findall(self.corpus, query, debug=False), GT) + self.assertEqual(self.MyEngine.findall(self.corpus, query, debug=False), GT, "Error with findall function") def test_match_queries(self): - self.corpus = functions.import_corpus("tests/test_data/test_corpus.json") - self.queries = import_test_queries("tests/queries_match.txt") - self.MyEngine = CQLEngine.CQLEngine() - for query, GT in self.queries: + self.queries = import_match_queries("tests/queries_match.txt") + self.MyEngine = functions.CQLEngine() + for idx, (nodes, query, GT) in enumerate(self.queries): with self.subTest(query=query, GT=GT): GT = True if GT == "True" else False - self.assertEqual(self.MyEngine.match(self.corpus, query, debug=False), GT) + match = self.MyEngine.match(nodes, query, debug=True) + self.assertEqual(match, GT, + msg=f"\nTest {idx + 1} failed.\n" + f"Query: {query}\n" + f"Nodes: {nodes}\n" + f"Match should be {GT}, is {match}") if __name__ == '__main__':