matgille · matgille · Sep 17, 2025 · Sep 17, 2025 · Sep 17, 2025 · Sep 17, 2025
diff --git a/README.md b/README.md
@@ -1,4 +1,38 @@
-# CQLEngine - a simple Corpus Query Language Processor
+# Corpus Query Language Engine
 
+## Presentation
+This repo hosts the code for a simple 
+CQL processor. CQL is a language used for 
+linguistic queries over large corpora.
 
-Work in progress
+## Pip install
+
+```shell
+pip3 install corpus-query-language
+```
+
+## Uses
+
+Two main functions are implemented:
+- `match`, for checking whether a pattern exists in a corpus (stops at the first match). Returns a boolean.
+- `findall`, for finding the positions of all matching tokens. Returns a list of tuples with the start and end positions.
+
+```python
+import sys
+import corpus_query_language as CQL
+
+query = "Some CQL query"
+corpus = CQL.utils.import_corpus("path/to/json/corpus.json")
+MyEngine = CQL.core.CQLEngine()
+MyEngine.findall(corpus, query)
+MyEngine.match(corpus, query)
+```
+
+## Implemented CQL functions
+
+- parsing of any kind of annotation classes: `word`, `lemma`, `pos`, `morph`
+- combination of annotations: `[lemma='rey' & pos='NCMP000']`
+- one or zero annotations: `[lemma='rey']?` (partially implemented, may produce errors)
+- distance: `[lemma='rey'][]{,5}[lemma='santo']`
+- any regex in the annotation value: `[lemma='reye?s?']`
+- alternatives: `([lemma='rey']|[lemma='príncipe'])[]{,5}[lemma='santo']` (may produce errors)
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "corpus-query-language"
-version = "0.0.1"
+version = "0.0.5"
 authors = [
   { name="Matthias Gille Levenson", email="matthias.gille-levenson@ens-lyon.fr" },
 ]
@@ -21,6 +21,9 @@ dependencies = [
 requires = ["setuptools", "setuptools-scm"]
 build-backend = "setuptools.build_meta"
 
+[tool.setuptools.packages.find]
+where = ["src"]
+
 [project.urls]
 Homepage = "https://github.com/matgille/CQL"
 Issues = "https://github.com/matgille/CQL/issues"
diff --git a/CQL.py → src/CQL.py b/CQL.py → src/CQL.py
@@ -1,23 +1,22 @@
-# Python package project: CQL (Corpus Query Language) parser:
+# Python package project: CQL (Corpus Query Language) processor:
 # - parsing of any kind of annotation: word, lemma, pos, morph
 # - combination of annotations: [lemma='rey' & pos='NCMP000']
 # - one or zero annotations [lemma='rey']?.
 # - distance [lemma='rey'][]{,5}[lemma='santo']
 # - any regex in the annotation value [lemma='reye?s?']
 # - alternatives: [lemma='rey']|[lemma='príncipe'][]{,5}[lemma='santo']
 import sys
-import CQLEngine.functions as functions
+import corpus_query_language as CQL
 
 # Takes a list of dicts with the annotations as input. Returns:
 # - a list of spans (search_all function)
 # - a boolean (match function)
 
 
-
 def main():
 	query = sys.argv[1]
-	corpus = functions.import_corpus("tests/test_data/test_corpus.json")
-	MyEngine = functions.CQLEngine()
+	corpus = CQL.utils.import_corpus("../tests/test_data/test_corpus.json")
+	MyEngine = CQL.core.CQLEngine()
 	MyEngine.findall(corpus, query)
 	MyEngine.match(corpus, query)
 

diff --git a/src/corpus_query_language/__init__.py b/src/corpus_query_language/__init__.py
@@ -0,0 +1,5 @@
+import corpus_query_language.core.core as core
+import corpus_query_language.engine.engine as engine
+import corpus_query_language.utils.utils as utils
+import corpus_query_language.language as language
+__all__ = ["core", "engine", "language", "utils"]
diff --git a/src/CQLEngine/__init__.py → src/corpus_query_language/core/__init__.py b/src/CQLEngine/__init__.py → src/corpus_query_language/core/__init__.py
diff --git a/src/corpus_query_language/core/core.py b/src/corpus_query_language/core/core.py
@@ -0,0 +1,38 @@
+import corpus_query_language.utils.utils as utils
+import corpus_query_language.engine.engine as engine
+
+class CQLEngine():
+	"""
+	The main class: tokenizes and parses a query, then searches a corpus through 2 main methods:
+		- findall
+		- match
+	"""
+	def findall(self, corpus:list[dict], query:str, verbose:bool=True,  debug:bool=False) -> list[tuple[int, int]]:
+		"""
+			This function finds every match of a query in some text, and returns the start and end position of each match.
+			:param query: a CQL query
+			:param corpus: the annotated text as a list of dictionaries containing the annotations (lemma, pos, morph, word)
+			:return: a list of tuples with the start and end position.
+			"""
+		query_ast = utils.build_grammar(debug=debug, query=query)
+		result = engine.parse_corpus(query_ast, corpus, mode="find", debug=debug)
+		if verbose:
+			print(f"\n---\nResults for query {query}:")
+			print(f"Ast: {query_ast}")
+			print(f"Spans: {result}")
+		return result
+
+
+	def match(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=False) -> bool:
+		"""
+		This function checks whether a query matches some text, and returns True or False
+		:param query: a CQL query
+		:param corpus: the annotated text as a list of dictionaries containing the annotations (lemma, pos, morph, word)
+		:return: a boolean
+		"""
+		query_ast = utils.build_grammar(debug=debug, query=query)
+		result = engine.parse_corpus(query_ast, corpus, mode="match", debug=debug)
+		if verbose:
+			print(f"\n---\nResults for query {query}:")
+			print(result)
+		return result
diff --git a/src/corpus_query_language/engine/__init__.py b/src/corpus_query_language/engine/__init__.py
diff --git a/src/CQLEngine/engine.py → src/corpus_query_language/engine/engine.py b/src/CQLEngine/engine.py → src/corpus_query_language/engine/engine.py
@@ -1,7 +1,15 @@
-import CQLEngine.functions as functions
+import corpus_query_language.utils.utils as utils
 
 
-def parse_corpus(ast, corpus, mode, debug):
+def parse_corpus(ast, corpus: list[dict], mode:str, debug) -> bool | list[tuple[int, int]]:
+	"""
+	Main function for parsing a corpus given an AST.
+	:param ast: The Abstract Syntax Tree to be matched against the corpus.
+	:param corpus: The corpus as a list of dictionaries.
+	:param mode: The mode: match (stop at first match, return Bool) or find (search for all matches, returns list of tuples)
+	:param debug: Debug mode: print all information of matching process
+	:return: a boolean in match mode, or a list of (start, end) tuples in find mode
+	"""
 	match = False
 	text_end = False
 	tree_index = 0
@@ -23,7 +31,6 @@ def parse_corpus(ast, corpus, mode, debug):
 
 	# Text-directed engine.
 	while text_end == False:
-
 		# On teste si on est en bout de texte.
 		if len(corpus) == text_index and tree_index != ast_length:
 			if debug:
@@ -65,7 +72,7 @@ def parse_corpus(ast, corpus, mode, debug):
 				print(f"{operator} in list of analysis")
 				print(len(corpus))
 				print(text_index)
-			if functions.simple_match(current_query, corpus[text_index]):
+			if utils.simple_match(current_query, corpus[text_index]):
 				if debug:
 					print("Found you a. Going forward on tree and text.")
 					print(f"First match is {text_index}")
@@ -84,7 +91,7 @@ def parse_corpus(ast, corpus, mode, debug):
 			if debug:
 				print(f"{operator} operator")
 			if operator == "or":
-				if functions.alternative_match(current_query[1:], corpus[text_index]):
+				if utils.alternative_match(current_query[1:], corpus[text_index]):
 					if debug:
 						print("Found your alternative. Going forward on tree and text.")
 						print(f"First match is {text_index}")
@@ -107,7 +114,7 @@ def parse_corpus(ast, corpus, mode, debug):
 						print(f"\t{text_index}: Looking for {ast[tree_index + 1]} in position {text_index}")
 					if len(corpus) == text_index:
 						break
-					if functions.simple_match(ast[tree_index + 1], corpus[text_index]):
+					if utils.simple_match(ast[tree_index + 1], corpus[text_index]):
 						submatch = True
 						tree_index += 2
 						if debug:
@@ -126,7 +133,7 @@ def parse_corpus(ast, corpus, mode, debug):
 			elif operator == "and":
 				all_matches = []
 				for item in current_query[1:]:
-					if functions.simple_match(item, corpus[text_index]):
+					if utils.simple_match(item, corpus[text_index]):
 						all_matches.append(True)
 					else:
 						all_matches.append(False)
@@ -143,7 +150,7 @@ def parse_corpus(ast, corpus, mode, debug):
 				# Pour l'opérateur "0 ou 1", on vérifie que le token matche.
 				# S'il ne matche pas, on passe à la requête suivante sans
 				# incrémenter le texte
-				if functions.alternative_match(current_query[1:], corpus[text_index]):
+				if utils.alternative_match(current_query[1:], corpus[text_index]):
 					if debug:
 						print("Found your alternative. Going forward on tree and text.")
 						print(f"First match is {text_index}")

diff --git a/src/corpus_query_language/language/__init__.py b/src/corpus_query_language/language/__init__.py
diff --git a/src/CQLEngine/lexer.py → src/corpus_query_language/language/lexer.py b/src/CQLEngine/lexer.py → src/corpus_query_language/language/lexer.py
@@ -2,6 +2,9 @@
 import copy
 
 class Lexer(object):
+    """
+    Lexer that is used to tokenize a query.
+    """
     tokens = (
         'RANGE',
         'DISTANCE',
@@ -81,7 +84,7 @@ def t_error(self, t):
         print("Illegal character '%s'" % t.value[0])
         t.lexer.skip(1)
 
-    def tokenize(self, query, debug):
+    def tokenize(self, query:str, debug:bool=False):
         self.lexer = lex.lex(module=self)
         self.lexer.input(query)
 

diff --git a/src/CQLEngine/parser.py → src/corpus_query_language/language/parser.py b/src/CQLEngine/parser.py → src/corpus_query_language/language/parser.py
@@ -1,11 +1,14 @@
 import ply.yacc as yacc
-import CQLEngine.lexer as lexer
+import corpus_query_language.language.lexer as lexer
 
 
 
 # API functionnalities.
 
 class Parser(lexer.Lexer):
+    """
+    The parser. Builds the AST from the tokens produced by the lexer.
+    """
     tokens = lexer.Lexer.tokens
 
     def p_or_queries(self, p):

diff --git a/src/corpus_query_language/utils/__init__.py b/src/corpus_query_language/utils/__init__.py
diff --git a/src/CQLEngine/functions.py → src/corpus_query_language/utils/utils.py b/src/CQLEngine/functions.py → src/corpus_query_language/utils/utils.py
@@ -1,43 +1,18 @@
 import re
 import json
-import CQLEngine.parser as parser
-import CQLEngine.lexer as lexer
-import CQLEngine.engine as engine
+import corpus_query_language.language.parser as parser
+import corpus_query_language.language.lexer as lexer
 
 
-class CQLEngine():
-	def findall(self, corpus:list[dict], query:str, verbose:bool=True,  debug:bool=False) -> list[tuple[int, int]]:
-		"""
-			This function checks if a query matches some text, and returns the start and end span.
-			:param query: a CQL query
-			:param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word)
-			:return: a list of tuples with the start and end position.
-			"""
-		query_ast = build_grammar(debug=debug, query=query)
-		result = engine.parse_corpus(query_ast, corpus, mode="find", debug=debug)
-		if verbose:
-			print(f"\n---\nResults for query {query}:")
-			print(f"Ast: {query_ast}")
-			print(f"Spans: {result}")
-		return result
 
 
-	def match(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=False) -> bool:
-		"""
-		This function checks whether a query matches some text, and returns True or False
-		:param query: a CQL query
-		:param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word)
-		:return: a boolean
-		"""
-		query_ast = build_grammar(debug=debug, query=query)
-		result = engine.parse_corpus(query_ast, corpus, mode="match", debug=debug)
-		if verbose:
-			print(f"\n---\nResults for query {query}:")
-			print(result)
-		return result
-
-
-def build_grammar(debug, query):
+def build_grammar(debug:bool, query:str) -> list:
+	"""
+	This function builds an Abstract Syntax Tree from a query
+	:param debug: outputs parsing information
+	:param query: the query to build the AST from
+	:return: the ast
+	"""
 	MyLexer = lexer.Lexer()
 	MyLexer.tokenize(query, debug=debug)
 	MyParser = parser.Parser(MyLexer, debug=debug)
@@ -103,7 +78,12 @@ def alternative_match(queries:list[tuple], text_token:dict) -> bool:
 
 
 
-def import_corpus(path):
+def import_corpus(path) -> list:
+	"""
+	Simple JSON file import to a list of dicts
+	:param path: Path to the JSON file
+	:return: the list of dicts
+	"""
 	with open(path, "r") as f:
 		corpus = json.load(f)
 	return corpus
diff --git a/tests/tests.py b/tests/tests.py
@@ -1,7 +1,7 @@
 import ast
 import sys
 sys.path.append('src/')
-import CQLEngine.functions as functions
+import corpus_query_language as CQL
 import unittest
 
 def import_test_queries(path):
@@ -23,14 +23,14 @@ def test_simple_match(self):
 					  "pos": "NCMS000",
 					  "morph": None,
 					  "word": "asnos"}
-		self.assertEqual(functions.simple_match(query, test_token), True, "Something is wrong"
+		self.assertEqual(CQL.utils.simple_match(query, test_token), True, "Something is wrong"
 																		  "with function `test_simple_match`")
 
 class TestQueries(unittest.TestCase):
 	def test_findall_queries(self):
-		self.corpus = functions.import_corpus("tests/test_data/test_corpus.json")
+		self.corpus = CQL.utils.import_corpus("tests/test_data/test_corpus.json")
 		self.queries = import_test_queries("tests/queries_findall.txt")
-		self.MyEngine = functions.CQLEngine()
+		self.MyEngine = CQL.core.CQLEngine()
 		for query, GT in self.queries:
 			GT = ast.literal_eval(GT)
 			with self.subTest(query=query, GT=GT):
@@ -39,7 +39,7 @@ def test_findall_queries(self):
 
 	def test_match_queries(self):
 		self.queries = import_match_queries("tests/queries_match.txt")
-		self.MyEngine = functions.CQLEngine()
+		self.MyEngine = CQL.core.CQLEngine()
 		for idx, (nodes, query, GT) in enumerate(self.queries):
 			with self.subTest(query=query, GT=GT):
 				GT = True if GT == "True" else False