Closed · wants to merge 14 commits
5 changes: 4 additions & 1 deletion .dockerignore
@@ -144,4 +144,7 @@ README.md
data/

# Test reports
reports/
reports/

# Models to be downloaded from the internet
evaluation_function/models/
15 changes: 14 additions & 1 deletion .github/workflows/deploy.yml
@@ -19,7 +19,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.12"]
python-version: ["3.11"]
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -57,6 +57,16 @@ jobs:
run: |
poetry install --no-interaction --no-root

python -m nltk.downloader wordnet
python -m nltk.downloader word2vec_sample
python -m nltk.downloader brown
python -m nltk.downloader stopwords
python -m nltk.downloader punkt
python -m nltk.downloader punkt_tab

mkdir -p ./evaluation_function/models
wget -O ./evaluation_function/models/Phi-3.5-mini-instruct-Q6_K.gguf https://huggingface.co/bartowski/Phi-3.5-mini-instruct-GGUF/resolve/main/Phi-3.5-mini-instruct-Q6_K.gguf

# TODO: add linting / black / flake8
# - name: Lint with flake8
# run: |
@@ -84,6 +94,8 @@ jobs:
name: Build Docker Image
uses: lambda-feedback/evaluation-function-workflows/.github/workflows/build.yml@main
needs: test
with:
build-args: "memory=10240m"
permissions:
contents: read
id-token: write
@@ -95,6 +107,7 @@ jobs:
needs: test
with:
template-repository-name: "lambda-feedback/evaluation-function-boilerplate-python"
build-args: "memory=10240m"
permissions:
contents: read
id-token: write
14 changes: 13 additions & 1 deletion .gitignore
@@ -132,4 +132,16 @@ dmypy.json
.vscode

# Test reports
reports/
reports/

# MacOS files
.DS_Store

# Testing
evaluation_function/testing/

# Models - to be added manually
evaluation_function/models/*

# Poetry
poetry.lock
64 changes: 61 additions & 3 deletions Dockerfile
@@ -1,4 +1,5 @@
FROM ghcr.io/lambda-feedback/evaluation-function-base/python:3.12 AS builder
# Layer 1: Build the virtual environment
FROM ghcr.io/lambda-feedback/evaluation-function-base/python:3.11 AS builder

RUN pip install poetry==1.8.3

@@ -7,24 +8,77 @@ ENV POETRY_NO_INTERACTION=1 \
POETRY_VIRTUALENVS_CREATE=1 \
POETRY_CACHE_DIR=/tmp/poetry_cache

COPY pyproject.toml poetry.lock ./
COPY pyproject.toml ./
RUN poetry lock

RUN --mount=type=cache,target=$POETRY_CACHE_DIR \
poetry install --without dev --no-root

FROM ghcr.io/lambda-feedback/evaluation-function-base/python:3.12
# Layer 2: Download NLTK models
FROM ghcr.io/lambda-feedback/evaluation-function-base/python:3.11 AS models

RUN apt-get update && apt-get install -y --no-install-recommends \
wget \
unzip

# ----- For macOS development
RUN mkdir -p /usr/share/nltk_data/corpora /usr/share/nltk_data/models /usr/share/nltk_data/tokenizers
RUN mkdir -p /app/evaluation_function/models

# NLTK data downloads
RUN wget -O /usr/share/nltk_data/corpora/wordnet.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/wordnet.zip
RUN wget -O /usr/share/nltk_data/models/word2vec_sample.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/models/word2vec_sample.zip
RUN wget -O /usr/share/nltk_data/corpora/brown.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/brown.zip
RUN wget -O /usr/share/nltk_data/corpora/stopwords.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip
RUN wget -O /usr/share/nltk_data/tokenizers/punkt.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip
RUN wget -O /usr/share/nltk_data/tokenizers/punkt_tab.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt_tab.zip

# Unzip the downloaded NLTK data
RUN unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/
RUN unzip /usr/share/nltk_data/models/word2vec_sample.zip -d /usr/share/nltk_data/models/
RUN unzip /usr/share/nltk_data/corpora/brown.zip -d /usr/share/nltk_data/corpora/
RUN unzip /usr/share/nltk_data/corpora/stopwords.zip -d /usr/share/nltk_data/corpora/
RUN unzip /usr/share/nltk_data/tokenizers/punkt.zip -d /usr/share/nltk_data/tokenizers/
RUN unzip /usr/share/nltk_data/tokenizers/punkt_tab.zip -d /usr/share/nltk_data/tokenizers/

# Clean up zip files to reduce image size
RUN rm /usr/share/nltk_data/corpora/*.zip
RUN rm /usr/share/nltk_data/models/*.zip
RUN rm /usr/share/nltk_data/tokenizers/*.zip

RUN wget -O /app/evaluation_function/models/Phi-3.5-mini-instruct-Q6_K.gguf https://huggingface.co/bartowski/Phi-3.5-mini-instruct-GGUF/resolve/main/Phi-3.5-mini-instruct-Q6_K.gguf

# Layer 3: Final image
FROM ghcr.io/lambda-feedback/evaluation-function-base/python:3.11

ENV VIRTUAL_ENV=/app/.venv \
PATH="/app/.venv/bin:$PATH"

ENV NLTK_DATA=/usr/share/nltk_data

ENV MODEL_PATH=/app/evaluation_function/models

COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
COPY --from=models ${NLTK_DATA} ${NLTK_DATA}
COPY --from=models ${MODEL_PATH} ${MODEL_PATH}

# Precompile python files for faster startup
RUN python -m compileall -q .

# Copy the evaluation function to the app directory
COPY evaluation_function ./evaluation_function

# ----- For Linux development instead of Layer 2 NLTK downloads
# # Warning: these commands sometimes download corrupted zips, so it is better to wget each package from the main site
# RUN python -m nltk.downloader wordnet
# RUN python -m nltk.downloader word2vec_sample
# RUN python -m nltk.downloader brown
# RUN python -m nltk.downloader stopwords
# RUN python -m nltk.downloader punkt
# RUN python -m nltk.downloader punkt_tab

ENV EVAL_RPC_TRANSPORT="ipc"

# Command to start the evaluation function with
ENV FUNCTION_COMMAND="python"

@@ -35,3 +89,7 @@ ENV FUNCTION_ARGS="-m,evaluation_function.main"
ENV FUNCTION_RPC_TRANSPORT="ipc"

ENV LOG_LEVEL="debug"

# ------- FOR DEBIAN
# Keep the container running
# CMD ["tail", "-f", "/dev/null"]
6 changes: 3 additions & 3 deletions README.md
@@ -118,7 +118,7 @@ In its most basic form, the development workflow consists of writing the evaluat
Testing the evaluation function can be done by running the `dev.py` script using the Python interpreter like so:

```bash
python -m evaluation_function.dev <response> <answer>
python -m evaluation_function.dev <response> <answer> <(optional)params>
```
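
For instance, a purely illustrative invocation (the response and answer strings below are placeholders, not fixtures from this repository) could look like:

```bash
python -m evaluation_function.dev "Density, velocity, viscosity, length" "Density, speed, Viscosity, Length"
```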

> [!NOTE]
@@ -129,15 +129,15 @@ python -m evaluation_function.dev <response> <answer>
To build the Docker image, run the following command:

```bash
docker build -t my-python-evaluation-function .
docker build -t my-python-evaluation-function --platform=linux/x86_64 .
```

### Running the Docker Image

To run the Docker image, use the following command:

```bash
docker run -it --rm -p 8080:8080 my-python-evaluation-function
docker run -it --rm -p 8080:8080 -e EXPERIMENTAL_DOCKER_DESKTOP_FORCE_QEMU=1 --platform=linux/x86_64 my-python-evaluation-function
```

This will start the evaluation function and expose it on port `8080`.
2 changes: 1 addition & 1 deletion config.json
@@ -1,3 +1,3 @@
{
"EvaluationFunctionName": ""
"EvaluationFunctionName": "shortTextAnswer"
}
95 changes: 84 additions & 11 deletions docs/dev.md
@@ -1,29 +1,102 @@
# YourFunctionName
*Brief description of what this evaluation function does, from the developer perspective*
# ShortTextAnswer
This function evaluates the similarity between two short texts and can additionally check for specified keystrings in the student's answer.

## Inputs
*Specific input parameters which can be supplied when the `eval` command is supplied to this function.*
`keystrings` - Optional parameter. Represents a list of keystring objects which the function will search for in the answer.

### `keystring` object
The `keystring` object contains several fields which affect how it will be interpreted (a hypothetical example is sketched after the list):

* `string` - Required. The actual keystring being searched for.
* `exact_match` - Optional. A boolean value indicating whether to search for the exact string or for a semantically similar one. Defaults to `false`.
* `should_contain` - Optional. A boolean value indicating whether it is expected for the keystring to be found in the answer or not. Defaults to `true`. Setting this flag to false indicates that a correct response will not contain the specified keystring.
* `custom_feedback` - Optional. A feedback string to be returned if the `string` was not found (or if it was, in case `should_contain` was set to `false`). Defaults to `None`, in which case a generic response will be generated containing the string searched for.
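
A hypothetical `params` payload exercising every field might look like the following (the strings and feedback text are made up for illustration; see the Examples section for real inputs and outputs):

```python
{
    "keystrings": [
        {"string": "viscosity"},                             # semantic match, expected to appear
        {"string": "Reynolds number", "exact_match": True},  # only an exact occurrence counts
        {
            "string": "temperature",
            "should_contain": False,                         # a correct response should NOT mention this
            "custom_feedback": "Temperature is not one of the relevant quantities here."
        }
    ]
}
```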

## Outputs
*Output schema/values for this function*
The function will return an object with three fields of interest. The `is_correct` and `feedback` fields are required by LambdaFeedback to present feedback to the user. The `result` field is only used for development.
```python
{
"is_correct": "<bool>",
"result": {
"response": "<string>",
"processing_time": "<double>",
},
"feedback": "string"
}
```

* `response` - The student answer. Used for debugging purposes.
* `processing_time` - The time it took the function to evaluate the response.

If the function identifies a problematic keystring, the result object will have an additional field:
* `keystring-scores` - list of (string, double) pairs. The provided keystrings together with the best similarity score found for each in the answer.

Otherwise, it will have the additional fields:
* `method` - string. Either "w2v" or "BOW vector similarity".
* `similarity_value` - double. The similarity value between the response and the answer.

If the method is w2v, the two texts were found to be similar. Otherwise, a BOW (bag-of-words) vector similarity check is performed to identify the word most likely to have caused the texts to be judged dissimilar.
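
As a rough illustration of that fallback idea only (not the function's actual implementation), a bag-of-words cosine similarity over two short texts can be computed like this:

```python
# Illustrative sketch of a bag-of-words cosine similarity; the real
# evaluation function's BOW check may differ in tokenisation and weighting.
import math
import re
from collections import Counter

def bow_cosine_similarity(text_a: str, text_b: str) -> float:
    """Cosine similarity between two texts using raw word counts."""
    counts_a = Counter(re.findall(r"[a-z]+", text_a.lower()))
    counts_b = Counter(re.findall(r"[a-z]+", text_b.lower()))
    dot = sum(counts_a[w] * counts_b[w] for w in counts_a.keys() & counts_b.keys())
    norm_a = math.sqrt(sum(c * c for c in counts_a.values()))
    norm_b = math.sqrt(sum(c * c for c in counts_b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

print(bow_cosine_similarity("Density, velocity, viscosity, length",
                            "Density, speed, Viscosity, Length"))  # 0.75
```

Words that appear in only one of the two texts (here "velocity" vs "speed") pull the score down, which is the kind of mismatch the fallback check tries to surface.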

## Initial Setup
Follow the Docker image instructions and run
`docker build -t <image_name> .` in `app/`

Otherwise, to set it up locally:
1. Create a virtual environment
2. In the venv, run `pip install -r app/requirements.txt` (see the sketch after this list)
3. If errors are encountered with the NLTK packages, follow the instructions in `testing_nltk.py`
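
A minimal sketch of steps 1 and 2, assuming Python 3.11 and a POSIX shell:

```bash
python3.11 -m venv .venv
source .venv/bin/activate
pip install -r app/requirements.txt
```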

## Examples
*List of example inputs and outputs for this function, each under a different sub-heading*

### Simple Evaluation
### Example simple input, no keystring

Input
```python
{
"example": {
"Something": "something"
}
"response": "Density, velocity, viscosity, length",
"answer": "Density, speed, Viscosity, Length",
}
```

Output
```python
{
"example": {
"Something": "something"
}
'is_correct': True,
'result': {
'response': 'Density, speed, Viscosity, Length',
'processing_time': 0.022912219000000178,
'method': 'w2v',
'similarity_value': 0.9326027035713196},
'feedback': 'Confidence: 0.933%'
}
```

### Example keystring input

Input
```python
{
"response": "Molecules are made out of atoms",
"answer": "Many atoms form a molecule",
'keystrings': [
{'string': 'molecule'},
{'string': 'proton', 'exact_match': True}
]
}
```

Output
```python
{
'is_correct': False,
'result': {
'response': 'Molecules are made out of atoms',
'processing_time': 0.30640586500000033,
'keystring-scores': [
('molecule', 0.990715997949492),
('proton', 0.9186190596675989) # Searched for with exact match, therefore not a match.
]
},
'feedback': "Cannot determine if the answer is correct. Please provide more information about 'proton'"}
```