GoelandProver · jrosain · Jul 20, 2025 · Feb 7, 2026 · Feb 7, 2026 · Jul 20, 2025
diff --git a/.github/workflows/full-ci.yml b/.github/workflows/full-ci.yml
@@ -86,9 +86,19 @@ jobs:
       - uses: actions/checkout@v5
         with:
           ref: ${{ github.event.pull_request.head.sha }}
+      - uses: cachix/install-nix-action@v31
+        with:
+          nix_path: nixpkgs=channel:nixos-unstable
       - uses: actions/setup-go@v3
         with:
           go-version: ${{ env.goVersion }}
       - run: go install golang.org/x/tools/cmd/goyacc@latest
       - run: cd src && make
-      - run: cd devtools && make test-suite
+      - uses: rrbutani/use-nix-shell-action@v1
+        with:
+          file: shell.nix
+          script: |
+            cd devtools
+            make test-suite
+
+
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
+*.lpo
+
 .direnv
 [Pp]roblems
 [Oo]utput

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -8,11 +8,12 @@ wish to contribute to Goéland, you should start by
 [forking](https://github.com/GoelandProver/Goeland/fork) the repository. Then,
 you can work on on your feature/bug fix/enhancement in your local repository.
 
-Once you deem your work satisfactory, you should [open a pull
-request](https://github.com/GoelandProver/Goeland/compare) **targeting
-master**. Then, one of the maintainer will review your code as soon as
-possible. If you have no feedback for a few days, do not hesitate to ping one of
-them. The current maintainers are: @jcailler, @jrosain.
+Once you deem your work satisfactory and have properly updated the test suite
+(c.f. [Managing the test suite](#managing-the-test-suite)), you should [open a pull
+request](https://github.com/GoelandProver/Goeland/compare) **targeting master**. Then, one
+of the maintainer will review your code as soon as possible. If you have no feedback for a
+few days, do not hesitate to ping one of them. The current maintainers are: @jcailler,
+@jrosain.
 
 Your code is expected to (i) build, (ii) satisfy the unit tests and (iii) not
 prove countertheorems. This check *does not* run automatically. One of the
@@ -39,6 +40,25 @@ have a very descriptive error as it will make things easier to debug.
 Note that, by default, neither of these options `panic`. You have to activate the `-debug`
 flag in order for them to panic and you to have a backtrace.
 
+### Managing the test suite
+
+In order to have a systematic testing of Goéland, we have a [test
+suite](devtools/test-suite) that contains:
+- basic test files to check functionalities,
+- bug files that correspond to a reported bug that has been resolved, and
+- output files that test the output of Goéland.
+
+When you add a new functionality to Goéland, you must add some files to the
+[basic](devtools/test-suite/basic) folder that tests your newly implemented
+functionalities. If your pull request fixes a bug, you must add the bug file to the
+[bugs](devtools/test-suite/bugs) folder. Beware that we run the Rocq and Lambdapi output
+on the test suite, so you should think about whether a problem is checkable or not. If
+it's not, add them in the corresponding `no_chk` instead. The current not-checkable
+problems are problems involving a typed context. Moreover, if your problem includes
+equalities, the Lambdapi check may fail. If so, add your problem to the
+`lp_tolerate_fails` variable in the `run-test-suite` file.
+
+
 ## For Maintainers
 
 By default, a pull request that modifies the go source code has the `needs:ci` label. You

diff --git a/devtools/run-test-suite.py b/devtools/run-test-suite.py
@@ -5,13 +5,15 @@
 import sys
 import re
 import difflib
+import shutil
 from subprocess import PIPE, run
 
 class Parser:
     ARGS = "% args: "
     RES = "% result: "
     ENV = "% env: "
     EXIT_CODE = "% exit: "
+    no_check = False
 
     def __init__(self, filename):
         self.filename = filename
@@ -20,6 +22,10 @@ def __init__(self, filename):
         self.parseEnv()
         self.parseExitCode()
 
+        args_avoid_chk = ["-proof", "-otptp", "-osctptp"]
+        if "no_chk" in filename or self.expectedExitCode != "" or any(arg in self.arguments for arg in args_avoid_chk) or self.expectedResult == "NOT VALID":
+            self.no_check = True
+
     def parseGen(self, pat):
         with open(self.filename) as f:
             for line in f.readlines():
@@ -42,24 +48,118 @@ def parseEnv(self):
     def parseExitCode(self):
         self.expectedExitCode = self.parseGen(self.EXIT_CODE).strip()
 
-    def getCommandLine(self):
-        return self.env + " ../src/_build/goeland " + self.arguments + " " + self.filename
+    def getCommandLine(self, checker_args):
+        arguments = self.arguments + checker_args
+        return self.env + " ../src/_build/goeland " + arguments + " " + self.filename
+
+    def getArgsForPrinting(self):
+        chk_str = ""
+        if self.no_check:
+            chk_str = " (no check)"
+        return self.arguments + chk_str
 
 def sanitize(s):
     return s.encode('utf-8', errors='ignore').decode(errors='ignore')
 
-def runProver(command):
-    print(f"Launching: {command}")
+def runProver(f, command):
     result = run(command, stdout=PIPE, stderr=PIPE, universal_newlines=True, shell=True, encoding='utf-8')
     return (sanitize(result.stdout), sanitize(result.stderr), result.returncode)
 
-def runWithExpected(f, parser):
-    output, err, exit_code = runProver(parser.getCommandLine())
+def getRelevantOutput(output):
+    relevantLines = []
+    ok = -1
+    for line in output.split("\n"):
+        if re.search("% SZS output start", line):
+            ok = 1
+        elif re.search("% SZS output end", line):
+            ok = -1
+        elif ok == 0:
+            if len(line.strip()) > 0:
+                relevantLines.append(line.strip())
+        else:
+            ok -= 1
+
+    return relevantLines
+
+def isExecutable(prog) :
+    return shutil.which(prog) is not None
+
+def makeGenericCheck(command, extension, cleanup_always, cleanup_compile_success, f, output):
+    check_lines = getRelevantOutput(output)
+    check_success = False
+    filename = os.getcwd() + "/" + os.path.basename(f)[:-2].replace("-", "_")
+
+    with open(f"{filename}.{extension}", "w") as chk_file:
+        chk_file.write("\n".join(check_lines))
+
+    result = run(f"{command} {filename}.{extension}", stdout=PIPE, stderr=PIPE, universal_newlines=True, shell=True, encoding='utf-8')
+    check_success = result.returncode == 0
+
+    for ext in cleanup_always:
+        os.remove(f"{filename}.{ext}")
+
+    if not check_success:
+        return False, result.stderr
+    else:
+        for ext in cleanup_compile_success:
+            os.remove(f"{filename}.{ext}")
+
+    return True, None
+
+def getRocqCompiler() :
+    if isExecutable("rocq"):
+        return "rocq compile"
+    elif isExecutable("coqc"):
+        return "coqc"
+    else:
+        raise Exception("No Rocq executable found on the system")
+
+def makeRocqCheck(f, output):
+    check_status, err = makeGenericCheck(getRocqCompiler(), "v", ["glob"], ["v", "vo", "vok", "vos"], f, output)
+
+    if not check_status:
+        print(f"ROCQ check has failed.")
+        print(f"Reason: {err}")
+        exit(1)
+
+def makeLambdapiCheck(f, output):
+    lp_command = "lambdapi check --lib-root .. --map-dir Logic.Goeland:../proof-certification/LambdaPi"
+    check_status, err = makeGenericCheck(lp_command, "lp", [], ["lp"], f, output)
+
+    # As the lambdapi output does not manage equality, we tolerate fails for the problems
+    # of the test suite that have equality.
+    lp_tolerated_fails = ["TEST_EQ.p", "TEST_EQ2.p", "sankalp.p"]
+
+    if not check_status:
+        if os.path.basename(f) in lp_tolerated_fails:
+            print(f"LAMBDAPI check has failed, but it was expected.")
+        else:
+            print(f"LAMBDAPI check has failed")
+            exit(1)
+
+def runWithExpected(f, parser, checker_args, check_fun):
+    """
+    Runs Goéland on [f] using the parsed command line, then checks if the output corresponds to the expected one.
+    This function manages:
+    - error codes (e.g., it can detect whether Goéland exits with error code i
+    - results (e.g., VALID, NOT VALID).
+
+    If Goéland runs into an unexpected error, we report it. Moreover, if the kind of expected return is a VALID
+    result, we run a checker (specified by checker_args and check_fun) to check that the proof is indeed valid
+    (except for files in the no-chk folder).
+    """
+    output, err, exit_code = runProver(f, parser.getCommandLine(""))
 
     if err != "":
         print(f"Runtime error: {err}")
         exit(1)
 
+    if parser.expectedExitCode != '':
+        if int(parser.expectedExitCode) != exit_code:
+            print(f"Error: expected exit code '{parser.expectedExitCode}', got: '{exit_code}'")
+            exit(1)
+        else: return
+
     search = re.compile(".*% RES : (.*)$")
     for line in output.split("\n"):
         res = search.match(line)
@@ -69,37 +169,28 @@ def runWithExpected(f, parser):
                 print(f"Error: expected '{parser.expectedResult}', got: '{actual}'")
                 exit(1)
             else:
+                if parser.no_check: return
+                output, _, _ = runProver(f, parser.getCommandLine(checker_args))
+                check_fun(f, output)
                 return
 
-    if parser.expectedExitCode != '':
-        if int(parser.expectedExitCode) != exit_code:
-            print(f"Error: expected exit code '{parser.expectedExitCode}', got: '{exit_code}'")
-            exit(1)
-        else: return
-
     print(f"Unknown error: got\n{output}")
     exit(1)
 
+def runWithRocqChk(f, parser):
+    runWithExpected(f, parser, " -context -orocq", makeRocqCheck)
+
+def runWithLpChk(f, parser):
+    runWithExpected(f, parser, " -olp", makeLambdapiCheck)
+
 def compareOutputs(f, parser):
-    output, err, _ = runProver(parser.getCommandLine())
+    output, err, exit_code = runProver(f, parser.getCommandLine(""))
 
-    if err != "":
+    if err != "" or exit_code != 0:
         print(f"Runtime error: {err}")
         exit(1)
 
-    relevantLines = []
-    ok = -1
-    for line in output.split("\n"):
-        if re.search("% SZS output start", line):
-            ok = 1
-        elif re.search("% SZS output end", line):
-            ok = -1
-        elif ok == 0:
-            if len(line.strip()) > 0:
-                relevantLines.append(line.strip())
-        else:
-            ok -= 1
-
+    relevantLines = getRelevantOutput(output)
     relevantExpectedLines = []
 
     with open(os.path.splitext(f)[0] + ".out") as f:
@@ -119,10 +210,12 @@ def compareOutputs(f, parser):
 
 if __name__ == "__main__":
     outputTest = ["-proof", "-otptp", "-osctptp"]
-    for f in glob.glob("test-suite/**/*.p"):
+    for f in sorted(glob.glob("test-suite/**/*.p")):
         parser = Parser(f)
 
         if any(out in parser.arguments for out in outputTest) :
             compareOutputs(f, parser)
         else :
-            runWithExpected(f, parser)
+            print(f"{f}\t{parser.getArgsForPrinting()}")
+            runWithRocqChk(f, parser)
+            runWithLpChk(f, parser)
diff --git a/devtools/test-suite/basic/test-typing-elab-1.p b/devtools/test-suite/basic/test-typing-elab-1.p
@@ -0,0 +1,7 @@
+% should fail because $lesseq is a defined function for $int, $rat and $real but is not polymorphic
+% for any type
+% exit: 1
+
+tff(ty_type, type, ty : $tType).
+
+tff(conj, conjecture, ! [A : ty] : $lesseq(A, A)).
diff --git a/devtools/test-suite/basic/test-typing-elab-2.p b/devtools/test-suite/basic/test-typing-elab-2.p
@@ -0,0 +1,7 @@
+% should fail because $quotient is a defined function for $int, $rat and $real but is not polymorphic
+% for any type
+% exit: 1
+
+tff(ty_type, type, ty : $tType).
+
+tff(conj, conjecture, ! [A : ty] : $lesseq($quotient(A, A), 1)).
diff --git a/devtools/test-suite/basic/test-typing-elab-3.p b/devtools/test-suite/basic/test-typing-elab-3.p
@@ -0,0 +1,4 @@
+% quotient is defined specifically for integers
+% result: NOT VALID
+
+tff(conj, conjecture, $lesseq($quotient(2, 2), 1)).
diff --git a/devtools/test-suite/basic/tf1_syntax_chk.p b/devtools/test-suite/basic/tf1_syntax_chk.p
@@ -1,5 +1,5 @@
-% TODO: args -one_step and result NOT_VALID once typing is working again (this is satisfiable)
-% exit: 3
+% args: -one_step
+% result: NOT VALID
 
 tff(beverage_type,type,
     beverage: $tType ).

diff --git a/devtools/test-suite/basic/tfa_syntax_chk.p b/devtools/test-suite/basic/tfa_syntax_chk.p
@@ -1,5 +1,5 @@
 % As arithmetic is not handled by Goeland, we simply check that parsing is OK
-% args: -ari -one_step -l 1
+% args: -one_step -l 1
 % result: NOT VALID
 
 tff(p_int_type,type,

diff --git a/devtools/test-suite/bugs/bug_52-1.p b/devtools/test-suite/bugs/bug_52-1.p
@@ -0,0 +1,3 @@
+% result: VALID
+
+fof(rocq_output_test,conjecture,! [X]: (p(X) | ~p(X))).
diff --git a/devtools/test-suite/bugs/bug_52-3.p b/devtools/test-suite/bugs/bug_52-3.p
@@ -0,0 +1,4 @@
+% tests for the presence of metavariables
+% result: VALID
+
+fof(rocq_output_test,conjecture,? [X]: (p(X) | ~p(X))).
diff --git a/devtools/test-suite/bugs/bug_53-1.p b/devtools/test-suite/bugs/bug_53-1.p
@@ -0,0 +1,33 @@
+% args: -one_step
+% exit: 1
+
+tff(list_type,type,
+	list: $tType > $tType ).
+
+tff(maybe_type,type,
+	maybe: $tType > $tType ).
+
+%----Polymorphic symbols
+tff(nil_type,type,
+	nil:
+	  !>[A: $tType] : list(A) ).
+
+tff(cons_type,type,
+	cons:
+	  !>[A: $tType] : ( ( A * list(A) ) > list(A) ) ).
+
+tff(none_type,type,
+	none:
+	  !>[A: $tType] : maybe(A) ).
+
+tff(some_type,type,
+	some:
+	  !>[A: $tType] : ( A > maybe(A) ) ).
+
+tff(head_type,type,
+	head:
+	  !>[A: $tType] : ( list(A) > maybe(A) ) ).
+
+%----This cannot be well typed, the last cons is not well formed
+tff(solve_this,conjecture,
+	  head($int,cons($int,1,cons($int,2,cons($int,3)))) = some($int,1) ).
diff --git a/devtools/test-suite/bugs_no_chk/bug_52-2.p b/devtools/test-suite/bugs_no_chk/bug_52-2.p
@@ -0,0 +1,13 @@
+% TODO: move this file to bugs once we enable typed GS3 proofs
+% result: VALID
+
+tff(nat_type,type,nat : $tType).
+tff(zero_nat,type,zero : nat).
+tff(succ_nat,type,succ : nat > nat).
+tff(list_type,type,list : $tType > $tType).
+tff(nil_list,type,nil : !> [A]: list(A)).
+tff(cons_list,type,cons : !> [A]: ((A * list(A)) > list(A))).
+tff(p_type,type,p : !> [A : $tType]: (list(A) > $o)).
+
+tff(rocq_output_test,conjecture,! [A: $tType, X : A]:
+  ((succ(zero) = succ(zero)) | (p(A,cons(A,X,nil(A))) | ~p(A,cons(A,X,nil(A)))))).
diff --git a/devtools/test-suite/basic/Axioms/SYN000+0.ax → ...ools/test-suite/no_chk/Axioms/SYN000+0.ax b/devtools/test-suite/basic/Axioms/SYN000+0.ax → ...ools/test-suite/no_chk/Axioms/SYN000+0.ax
diff --git a/devtools/test-suite/basic/Axioms/SYN000_0.ax → ...ools/test-suite/no_chk/Axioms/SYN000_0.ax b/devtools/test-suite/basic/Axioms/SYN000_0.ax → ...ools/test-suite/no_chk/Axioms/SYN000_0.ax
diff --git a/devtools/test-suite/basic/fol_syntax_chk.p → devtools/test-suite/no_chk/fol_syntax_chk.p b/devtools/test-suite/basic/fol_syntax_chk.p → devtools/test-suite/no_chk/fol_syntax_chk.p
diff --git a/devtools/test-suite/basic/tf0_syntax_chk.p → devtools/test-suite/no_chk/tf0_syntax_chk.p b/devtools/test-suite/basic/tf0_syntax_chk.p → devtools/test-suite/no_chk/tf0_syntax_chk.p
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		% result: VALID

		fof(rocq_output_test,conjecture,! [X]: (p(X) \| ~p(X))).