lambda-feedback
diff --git a/‎eval_tests.json
Lines changed: 4 additions & 4 deletions b/‎eval_tests.json
Lines changed: 4 additions & 4 deletions
diff --git a/‎eval_tests.yaml
Lines changed: 55 additions & 0 deletions b/‎eval_tests.yaml
Lines changed: 55 additions & 0 deletions
diff --git a/‎evaluation_function/auto_tests.py
Lines changed: 140 additions & 0 deletions b/‎evaluation_function/auto_tests.py
Lines changed: 140 additions & 0 deletions
diff --git a/‎evaluation_function/evaluation_test.py
Lines changed: 2 additions & 2 deletions b/‎evaluation_function/evaluation_test.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎evaluation_function/json_tests.py
Lines changed: 0 additions & 71 deletions b/‎evaluation_function/json_tests.py
Lines changed: 0 additions & 71 deletions
@@ -14,7 +14,7 @@
                             {
                                 "description": "Most basic possible case",
                                 "response": "A & B",
-                                "expectedResult": {
+                                "expected_result": {
                                     "is_correct": true,
                                     "response_latex": "A \\cdot B"
                                 }
@@ -29,7 +29,7 @@
                             {
                                 "description": "Works with variable names of any length",
                                 "response": "A & Test",
-                                "expectedResult": {
+                                "expected_result": {
                                     "is_correct": true,
                                     "response_latex": "A \\cdot \\mathrm{Test}"
                                 }
@@ -49,7 +49,7 @@
                             {
                                 "description": "Tests transposed variables are correct",
                                 "response": "B & A",
-                                "expectedResult": {
+                                "expected_result": {
                                     "is_correct": true
                                 }
                             }
@@ -63,7 +63,7 @@
                             {
                                 "description": "Incorrect results marked as false",
                                 "response": "A | B",
-                                "expectedResult": {
+                                "expected_result": {
                                     "is_correct": false
                                 }
                             }
 
@@ -0,0 +1,55 @@
+# YAML supports comments: one reason to prefer it over JSON, which doesn't.
+---
+title: Trivial comparisons
+tests:
+  - description: The response and answer are exactly the same, so the response should be considered correct.
+    # Parameters can be given, but this field can be removed if none are needed
+    params: {}
+    answer: "A & B"
+    response: "A & B"
+    expected_result:
+      is_correct: yes
+      # Ensure that the LaTeX returned (for the preview) is correct
+      response_latex: "A \\cdot B"
+
+  - description: Multi-character variable names are also supported
+    answer: "A & Test"
+    response: "A & Test"
+    expected_result:
+      is_correct: yes
+      response_latex: "A \\cdot \\mathrm{Test}"
+# Tests can be divided into sections using '---'
+---
+title: Trivial comparisons, but not identical
+tests:
+  - description: Variables can appear in any order.
+    answer: "A & B"
+    response: "B & A"
+    expected_result:
+      is_correct: yes
+      # Any other fields returned by the evaluation function can be tested too, but this is optional.
+
+  - description: > # Multi-line strings are possible for readability
+      The wrong operator is used, so this is incorrect as
+      the two expressions have different truth tables.
+    answer: "A & B"
+    response: "A | B"
+    expected_result:
+      is_correct: no
+---
+# This illustrates how sub-tests can be used to share the same answer and parameters
+# for multiple tests.
+title: More complex comparisons
+tests:
+  - description: XOR can be implemented using NAND or NOR
+    answer: "A ^ B"
+    sub_tests:
+      - description: "Using NAND:"
+        response: "~(~(A & ~(A & B)) & ~(B & ~(A & B)))"
+        expected_result:
+          is_correct: yes
+      - description: "Using NOR:"
+        response: "~(~(~A | ~B) | ~(A | B))"
+        expected_result:
+          is_correct: yes
+ 
@@ -0,0 +1,140 @@
+import json
+import yaml
+from typing import Union
+
+class TestFile:
+    """An abstraction over a test file, which may be in one of several different formats.
+    Currently, JSON (".json") and YAML (".yaml" or ".yml") are supported.
+
+    After construction, `groups` holds one dict per test group, of the form
+    `{"title": <group title>, "tests": [<SingleTest>, ...]}`.
+    """
+
+    def __init__(self, path: str) -> None:
+        """Read the file at `path` and parse it according to its extension.
+
+        Raises an `Exception` with a descriptive message if the file cannot
+        be read, has an unsupported extension, cannot be parsed, or is
+        missing a required key.
+        """
+        self.groups = []
+
+        # Attempt to open the given file. Raise an error if this is not
+        # possible. The encoding is given explicitly so that the result does
+        # not depend on the platform's default encoding.
+        try:
+            with open(path, "r", encoding="utf-8") as test_file:
+                file_content = test_file.read()
+        except (IOError, UnicodeDecodeError) as e:
+            raise Exception(f'Failed to open test file: "{e}"') from e
+
+        # Get the file extension to determine which format should be used.
+        # The comparison is case-insensitive; the error message keeps the
+        # extension exactly as it was written.
+        extension = path.split(".")[-1]
+        file_format = extension.lower()
+        if file_format == "json":
+            self._load_json(file_content)
+        elif file_format in ("yaml", "yml"):
+            self._load_yaml(file_content)
+        else:
+            raise Exception(f'"{extension}" files are not supported as a test format.')
+
+    def _load_json(self, file_content: str) -> None:
+        """Parse JSON test data and append one group per question to `self.groups`.
+
+        The data is a list of questions. Each question has a "title" and a
+        list of "parts"; each part has a list of "responseAreas"; each
+        response area has "params", "answer" and a list of "tests". The
+        answer and params of a response area are copied into each of its tests.
+        """
+        try:
+            questions = json.loads(file_content)
+
+            for question in questions:
+                tests = []
+                title = question["title"]
+                for part in question["parts"]:
+                    for response_area in part["responseAreas"]:
+                        params = response_area["params"]
+                        answer = response_area["answer"]
+                        for test in response_area["tests"]:
+                            test["answer"] = answer
+                            test["params"] = params
+                            tests.append(SingleTest(test))
+                self.groups.append({"title": title, "tests": tests})
+        except KeyError as e:
+            raise Exception(
+                f'The key "{e.args[0]}" doesn\'t exist, or is in the wrong place.'
+            ) from e
+        except json.JSONDecodeError as e:
+            raise Exception(f'Error parsing JSON: "{e}"') from e
+
+    def _load_yaml(self, file_content: str) -> None:
+        """Parse YAML test data and append one group per document to `self.groups`.
+
+        Tests are organised in groups of separate YAML documents (separated
+        by "---"). Each document may have a "title" and a list of "tests".
+        A test that has "sub_tests" shares its "answer" and "params" with
+        each of them, and only the sub-tests are added to the group.
+        """
+        try:
+            # safe_load_all is lazy, so parse errors surface while iterating;
+            # the loop therefore has to stay inside the try block.
+            for test_group in yaml.safe_load_all(file_content):
+                # An empty document (e.g. after a trailing "---") loads as None.
+                if test_group is None:
+                    continue
+
+                tests = []
+                title = test_group.get("title", "")
+                # "tests:" with no value loads as None, so fall back to [].
+                for test in test_group.get("tests") or []:
+                    # Add an empty params field if none was provided.
+                    if test.get("params") is None:
+                        test["params"] = {}
+
+                    # Does this test have sub-tests?
+                    sub_tests = test.get("sub_tests")
+                    if sub_tests is not None:
+                        params = test["params"]
+                        answer = test["answer"]
+
+                        for sub_test in sub_tests:
+                            sub_test["params"] = params
+                            sub_test["answer"] = answer
+                            tests.append(SingleTest(sub_test))
+                    else:
+                        tests.append(SingleTest(test))
+
+                self.groups.append({"title": title, "tests": tests})
+        except KeyError as e:
+            # Same message as the JSON loader, for a missing required key
+            # (e.g. "answer" on a test that has sub-tests).
+            raise Exception(
+                f'The key "{e.args[0]}" doesn\'t exist, or is in the wrong place.'
+            ) from e
+        except yaml.YAMLError as e:
+            raise Exception(f'Error parsing YAML: {e}') from e
+
+class SingleTest:
+    """A single test case: the inputs to pass to the function under test,
+    and the result that is expected back.
+    """
+
+    def __init__(self, test_dict: dict):
+        """Build a test from a dict read from a test file.
+
+        "response", "answer", "params" and "description" are optional and
+        default to empty values. "expected_result" is required and must be
+        non-empty; otherwise an `Exception` is raised.
+        """
+        self.response = test_dict.get("response", "")
+        self.answer = test_dict.get("answer", "")
+        self.params = test_dict.get("params", {})
+        expected_result = test_dict.get("expected_result")
+        if not expected_result:
+            raise Exception("No expected result given for test")
+        self.is_correct = expected_result.get("is_correct")
+        # Every expected field (including "is_correct") is kept so that
+        # compare() can check each of them.
+        self.results = expected_result
+        self.desc = test_dict.get("description", "")
+
+    def evaluate(self, func) -> dict:
+        """Call `func(response, answer, params)` and return whatever it returns."""
+        return func(self.response, self.answer, self.params)
+
+    def compare(self, eval_result: dict) -> tuple[bool, str]:
+        """Check `eval_result` against the expected result.
+
+        Returns `(True, "")` if every expected field matches, otherwise
+        `(False, message)` describing the first mismatch found.
+        """
+        eval_correct = eval_result["is_correct"]
+
+        if eval_correct != self.is_correct:
+            # "feedback" may be absent from the result; a missing key must
+            # not hide the mismatch behind a KeyError.
+            feedback = eval_result.get("feedback", "")
+            return (
+                False,
+                f"response \"{self.response}\" with answer \"{self.answer}\" was {'' if eval_correct else 'in'}correct: {feedback}\nTest description: {self.desc}"
+            )
+
+        # Check every other expected field in turn. self.results is never
+        # None or empty here, as the constructor rejects that case.
+        for key, value in self.results.items():
+            actual_result_val = eval_result.get(key)
+            if actual_result_val is None:
+                return (False, f"No value returned for \"{key}\"")
+
+            if actual_result_val != value:
+                return (
+                    False,
+                    f"expected {key} = \"{value}\", got {key} = \"{actual_result_val}\"\nTest description: {self.desc}"
+                )
+
+        return (True, "")
+
+
+def auto_test(path, func):
+    """A decorator that adds the necessary infrastructure to run tests defined
+    in an external data file.
+
+    `path`: the path to the data file. It is passed to `open()`, so a
+    relative path is resolved against the current working directory.
+    `func`: the function to test. Should usually be `evaluation_function`.
+
+    The decorated class gains a `test_auto` method and is otherwise returned
+    unchanged. The method uses `fail`, `subTest` and `assertTrue`, so the
+    class is expected to be a `unittest.TestCase`.
+    """
+    def _auto_test(orig_class):
+        def test_auto(self):
+            # Creating a TestFile can fail for several reasons.
+            # If so, an exception is raised with a suitable error message
+            try:
+                tests = TestFile(path)
+            except Exception as e:
+                self.fail(str(e))
+
+            # Successfully loaded
+            for group in tests.groups:
+                for test in group["tests"]:
+                    # Each case runs as its own sub-test, so one failure does
+                    # not stop the remaining cases from being checked.
+                    with self.subTest(group=group["title"], test=test.desc):
+                        results = test.evaluate(func)
+                        # Accept either a result object that exposes
+                        # to_dict() or a plain dict.
+                        if hasattr(results, "to_dict"):
+                            results = results.to_dict()
+                        self.assertTrue(*test.compare(results))
+
+        orig_class.test_auto = test_auto # Add the test_auto function to the class
+        return orig_class
+    return _auto_test
@@ -1,9 +1,9 @@
 import unittest
 
 from .evaluation import Params, evaluation_function
-from .json_tests import auto_test
+from .auto_tests import auto_test
 
-@auto_test("eval_tests.json", evaluation_function)
+@auto_test("eval_tests.yaml", evaluation_function)
 class TestEvaluationFunction(unittest.TestCase):
     """
     TestCase Class used to test the algorithm.
Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,7 @@`
`14`	`14`	`{`
`15`	`15`	`"description": "Most basic possible case",`
`16`	`16`	`"response": "A & B",`
`17`		`- "expectedResult": {`
	`17`	`+ "expected_result": {`
`18`	`18`	`"is_correct": true,`
`19`	`19`	`"response_latex": "A \\cdot B"`
`20`	`20`	`}`
`@@ -29,7 +29,7 @@`
`29`	`29`	`{`
`30`	`30`	`"description": "Works with variable names of any length",`
`31`	`31`	`"response": "A & Test",`
`32`		`- "expectedResult": {`
	`32`	`+ "expected_result": {`
`33`	`33`	`"is_correct": true,`
`34`	`34`	`"response_latex": "A \\cdot \\mathrm{Test}"`
`35`	`35`	`}`
`@@ -49,7 +49,7 @@`
`49`	`49`	`{`
`50`	`50`	`"description": "Tests transposed variables are correct",`
`51`	`51`	`"response": "B & A",`
`52`		`- "expectedResult": {`
	`52`	`+ "expected_result": {`
`53`	`53`	`"is_correct": true`
`54`	`54`	`}`
`55`	`55`	`}`
`@@ -63,7 +63,7 @@`
`63`	`63`	`{`
`64`	`64`	`"description": "Incorrect results marked as false",`
`65`	`65`	`"response": "A \| B",`
`66`		`- "expectedResult": {`
	`66`	`+ "expected_result": {`
`67`	`67`	`"is_correct": false`
`68`	`68`	`}`
`69`	`69`	`}`