Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 36 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,38 @@
# CQLEngine - a simple Corpus Query Language Processor
# Corpus Query Language Engine

## Presentation
This repo hosts the code for a simple
CQL processor. CQL is a language used for
linguistic queries over large corpora.

Work in progress
## Pip install

```shell
pip3 install corpus-query-language
```

## Uses

Two main functions are implemented:
- match, for checking if some pattern exists in a corpus (stops at first match). Returns a boolean
- findall, for finding the position of all matching tokens. Returns a list of tuples, with start and end position.

```python
import sys
import corpus_query_language as CQL

query = "Some CQL query"
corpus = CQL.utils.import_corpus("path/to/json/corpus.json")
MyEngine = CQL.core.CQLEngine()
MyEngine.findall(corpus, query)
MyEngine.match(corpus, query)
```

## Implemented CQL functions

- parsing of any kind of annotation classes: `word`, `lemma`, `pos`, `morph`
- combination of annotations: `[lemma='rey' & pos='NCMP000']`
- one or zero annotations `[lemma='rey']?` (partially implemented, may produce errors).
- distance `[lemma='rey'][]{,5}[lemma='santo']`
- any regex in the annotation value `[lemma='reye?s?']`
- alternatives: `([lemma='rey']|[lemma='príncipe'])[]{,5}[lemma='santo']` (may produce errors)
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "corpus-query-language"
version = "0.0.1"
version = "0.0.5"
authors = [
{ name="Matthias Gille Levenson", email="matthias.gille-levenson@ens-lyon.fr" },
]
Expand All @@ -21,6 +21,9 @@ dependencies = [
requires = ["setuptools", "setuptools-scm"]
build-backend = "setuptools.build_meta"

[tool.setuptools.packages.find]
where = ["src"]

[project.urls]
Homepage = "https://github.com/matgille/CQL"
Issues = "https://github.com/matgille/CQL/issues"
9 changes: 4 additions & 5 deletions CQL.py → src/CQL.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,22 @@
# Python package project: CQL (Corpus Query Language) parser:
# Python package project: CQL (Corpus Query Language) processor:
# - parsing of any kind of annotation: word, lemma, pos, morph
# - combination of annotations: [lemma='rey' & pos='NCMP000']
# - one or zero annotations [lemma='rey']?.
# - distance [lemma='rey'][]{,5}[lemma='santo']
# - any regex in the annotation value [lemma='reye?s?']
# - alternatives: [lemma='rey']|[lemma='príncipe'][]{,5}[lemma='santo']
import sys
import CQLEngine.functions as functions
import corpus_query_language as CQL

# Takes a list of dicts with the annotations as input. Returns:
# - a list of spans (findall function)
# - a boolean (match function)



def main():
query = sys.argv[1]
corpus = functions.import_corpus("tests/test_data/test_corpus.json")
MyEngine = functions.CQLEngine()
corpus = CQL.utils.import_corpus("../tests/test_data/test_corpus.json")
MyEngine = CQL.core.CQLEngine()
MyEngine.findall(corpus, query)
MyEngine.match(corpus, query)

Expand Down
5 changes: 5 additions & 0 deletions src/corpus_query_language/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import corpus_query_language.core.core as core
import corpus_query_language.engine.engine as engine
import corpus_query_language.utils.utils as utils
import corpus_query_language.language as language
__all__ = ["core", "engine", "language", "utils"]
File renamed without changes.
38 changes: 38 additions & 0 deletions src/corpus_query_language/core/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import corpus_query_language.utils.utils as utils
import corpus_query_language.engine.engine as engine

class CQLEngine:
    """
    Entry point of the package: builds the AST of a CQL query and runs it against an
    annotated corpus. Two search methods are exposed:
    - findall
    - match
    """

    def findall(self, corpus: list[dict], query: str, verbose: bool = True, debug: bool = False) -> list[tuple[int, int]]:
        """
        Search the corpus for every occurrence of the query.
        :param corpus: the annotated text as a list of dictionaries containing the annotations (lemma, pos, morph, word)
        :param query: a CQL query
        :param verbose: when True, print the query, its AST and the spans that were found
        :param debug: forwarded to the grammar builder and to the engine
        :return: a list of tuples with the start and end position.
        """
        tree = utils.build_grammar(debug=debug, query=query)
        spans = engine.parse_corpus(tree, corpus, mode="find", debug=debug)
        # Quiet mode: hand the spans back without any console output.
        if not verbose:
            return spans
        print(f"\n---\nResults for query {query}:")
        print(f"Ast: {tree}")
        print(f"Spans: {spans}")
        return spans

    def match(self, corpus: list[dict], query: str, verbose: bool = True, debug: bool = False) -> bool:
        """
        Tell whether the query matches somewhere in the corpus.
        :param corpus: the annotated text as a list of dictionaries containing the annotations (lemma, pos, morph, word)
        :param query: a CQL query
        :param verbose: when True, print the query and the outcome
        :param debug: forwarded to the grammar builder and to the engine
        :return: a boolean
        """
        tree = utils.build_grammar(debug=debug, query=query)
        found = engine.parse_corpus(tree, corpus, mode="match", debug=debug)
        # Quiet mode: hand the outcome back without any console output.
        if not verbose:
            return found
        print(f"\n---\nResults for query {query}:")
        print(found)
        return found
Empty file.
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
import CQLEngine.functions as functions
import corpus_query_language.utils.utils as utils


def parse_corpus(ast, corpus, mode, debug):
def parse_corpus(ast, corpus: list[dict], mode:str, debug) -> bool | list[tuple[int, int]]:
"""
Main function for parsing a corpus given an AST.
:param ast: The Abstract Syntax Tree to be matched against the corpus.
:param corpus: The corpus as a list of dictionaries.
:param mode: The mode: match (stop at first match, return Bool) or find (search for all matches, returns list of tuples)
:param debug: Debug mode: print all information of matching process
:return:
"""
match = False
text_end = False
tree_index = 0
Expand All @@ -23,7 +31,6 @@ def parse_corpus(ast, corpus, mode, debug):

# Text-directed engine.
while text_end == False:

# On teste si on est en bout de texte.
if len(corpus) == text_index and tree_index != ast_length:
if debug:
Expand Down Expand Up @@ -65,7 +72,7 @@ def parse_corpus(ast, corpus, mode, debug):
print(f"{operator} in list of analysis")
print(len(corpus))
print(text_index)
if functions.simple_match(current_query, corpus[text_index]):
if utils.simple_match(current_query, corpus[text_index]):
if debug:
print("Found you a. Going forward on tree and text.")
print(f"First match is {text_index}")
Expand All @@ -84,7 +91,7 @@ def parse_corpus(ast, corpus, mode, debug):
if debug:
print(f"{operator} operator")
if operator == "or":
if functions.alternative_match(current_query[1:], corpus[text_index]):
if utils.alternative_match(current_query[1:], corpus[text_index]):
if debug:
print("Found your alternative. Going forward on tree and text.")
print(f"First match is {text_index}")
Expand All @@ -107,7 +114,7 @@ def parse_corpus(ast, corpus, mode, debug):
print(f"\t{text_index}: Looking for {ast[tree_index + 1]} in position {text_index}")
if len(corpus) == text_index:
break
if functions.simple_match(ast[tree_index + 1], corpus[text_index]):
if utils.simple_match(ast[tree_index + 1], corpus[text_index]):
submatch = True
tree_index += 2
if debug:
Expand All @@ -126,7 +133,7 @@ def parse_corpus(ast, corpus, mode, debug):
elif operator == "and":
all_matches = []
for item in current_query[1:]:
if functions.simple_match(item, corpus[text_index]):
if utils.simple_match(item, corpus[text_index]):
all_matches.append(True)
else:
all_matches.append(False)
Expand All @@ -143,7 +150,7 @@ def parse_corpus(ast, corpus, mode, debug):
# Pour l'opérateur "0 ou 1", on vérifie que le token matche.
# S'il ne matche pas, on passe à la requête suivante sans
# incrémenter le texte
if functions.alternative_match(current_query[1:], corpus[text_index]):
if utils.alternative_match(current_query[1:], corpus[text_index]):
if debug:
print("Found your alternative. Going forward on tree and text.")
print(f"First match is {text_index}")
Expand Down
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
import copy

class Lexer(object):
"""
Lexer that is used to tokenize a query.
"""
tokens = (
'RANGE',
'DISTANCE',
Expand Down Expand Up @@ -81,7 +84,7 @@ def t_error(self, t):
print("Illegal character '%s'" % t.value[0])
t.lexer.skip(1)

def tokenize(self, query, debug):
def tokenize(self, query:str, debug:bool=False):
self.lexer = lex.lex(module=self)
self.lexer.input(query)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import ply.yacc as yacc
import CQLEngine.lexer as lexer
import corpus_query_language.language.lexer as lexer



# API functionalities.

class Parser(lexer.Lexer):
"""
The parser. Builds the Ast with the tokens produced by the lexer.
"""
tokens = lexer.Lexer.tokens

def p_or_queries(self, p):
Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -1,43 +1,18 @@
import re
import json
import CQLEngine.parser as parser
import CQLEngine.lexer as lexer
import CQLEngine.engine as engine
import corpus_query_language.language.parser as parser
import corpus_query_language.language.lexer as lexer


class CQLEngine():
def findall(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=False) -> list[tuple[int, int]]:
"""
This function checks if a query matches some text, and returns the start and end span.
:param query: a CQL query
:param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word)
:return: a list of tuples with the start and end position.
"""
query_ast = build_grammar(debug=debug, query=query)
result = engine.parse_corpus(query_ast, corpus, mode="find", debug=debug)
if verbose:
print(f"\n---\nResults for query {query}:")
print(f"Ast: {query_ast}")
print(f"Spans: {result}")
return result


def match(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=False) -> bool:
"""
This function checks whether a query matches some text, and returns True or False
:param query: a CQL query
:param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word)
:return: a boolean
"""
query_ast = build_grammar(debug=debug, query=query)
result = engine.parse_corpus(query_ast, corpus, mode="match", debug=debug)
if verbose:
print(f"\n---\nResults for query {query}:")
print(result)
return result


def build_grammar(debug, query):
def build_grammar(debug:bool, query:str) -> list:
"""
This function builds an Abstract Syntax Tree from a query
:param debug: outputs parsing information
:param query: the query to build the AST from
:return: the ast
"""
MyLexer = lexer.Lexer()
MyLexer.tokenize(query, debug=debug)
MyParser = parser.Parser(MyLexer, debug=debug)
Expand Down Expand Up @@ -103,7 +78,12 @@ def alternative_match(queries:list[tuple], text_token:dict) -> bool:



def import_corpus(path):
def import_corpus(path) -> list:
    """
    Load an annotated corpus from a JSON file.

    The file is decoded explicitly as UTF-8 instead of relying on the platform
    default encoding, so that corpora containing non-ASCII annotations are read
    the same way on every system.
    :param path: Path to the JSON file
    :return: the parsed JSON content (expected to be the list of dicts describing the tokens)
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
10 changes: 5 additions & 5 deletions tests/tests.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import ast
import sys
sys.path.append('src/')
import CQLEngine.functions as functions
import corpus_query_language as CQL
import unittest

def import_test_queries(path):
Expand All @@ -23,14 +23,14 @@ def test_simple_match(self):
"pos": "NCMS000",
"morph": None,
"word": "asnos"}
self.assertEqual(functions.simple_match(query, test_token), True, "Something is wrong"
self.assertEqual(CQL.utils.simple_match(query, test_token), True, "Something is wrong"
"with function `test_simple_match`")

class TestQueries(unittest.TestCase):
def test_findall_queries(self):
self.corpus = functions.import_corpus("tests/test_data/test_corpus.json")
self.corpus = CQL.utils.import_corpus("tests/test_data/test_corpus.json")
self.queries = import_test_queries("tests/queries_findall.txt")
self.MyEngine = functions.CQLEngine()
self.MyEngine = CQL.core.CQLEngine()
for query, GT in self.queries:
GT = ast.literal_eval(GT)
with self.subTest(query=query, GT=GT):
Expand All @@ -39,7 +39,7 @@ def test_findall_queries(self):

def test_match_queries(self):
self.queries = import_match_queries("tests/queries_match.txt")
self.MyEngine = functions.CQLEngine()
self.MyEngine = CQL.core.CQLEngine()
for idx, (nodes, query, GT) in enumerate(self.queries):
with self.subTest(query=query, GT=GT):
GT = True if GT == "True" else False
Expand Down
Loading