diff --git a/src/scripts/testing/bleu_evaluation.ipynb b/src/scripts/testing/bleu_evaluation.ipynb
new file mode 100644
index 0000000..f8d9fd1
--- /dev/null
+++ b/src/scripts/testing/bleu_evaluation.ipynb
@@ -0,0 +1,140 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"\n",
+    "This notebook sets up a pipeline for evaluating a language model on the \"Kubermatic/cncf-question-and-answer-dataset-for-llm-training\" dataset.\n",
+    "It loads the model with 4-bit quantization and scores a generated answer against the reference answer using the BLEU score.\n",
+    "\n",
+    "Dependencies:\n",
+    "- accelerate\n",
+    "- datasets\n",
+    "- trl\n",
+    "- peft\n",
+    "- bitsandbytes\n",
+    "- evaluate\n",
+    "- transformers\n",
+    "- torch\n",
+    "- tqdm\n",
+    "- pandas\n",
+    "- huggingface_hub\n",
+    "\n",
+    "Environment Setup:\n",
+    "- Ensure Hugging Face Hub authentication with 'notebook_login'.\n",
+    "\n",
+    "Model Configuration:\n",
+    "- Uses BitsAndBytesConfig for 4-bit quantization.\n",
+    "- Configures AutoModelForCausalLM for causal language modeling tasks.\n",
+    "\n",
+    "Evaluation Data:\n",
+    "- Loads the first 1000 samples of the dataset's train split.\n",
+    "\n",
+    "Example Question/Answer:\n",
+    "- Tokenizes a sample question from the dataset and generates an answer for it.\n",
+    "- Evaluates the generated answer against the reference answer using BLEU score.\n",
+    "\n",
+    "Output:\n",
+    "- Prints the example question, reference answer, predicted answer, and BLEU score.\n",
+    "\"\"\"\n",
+    "\n",
+    "!pip install -q -U accelerate\n",
+    "!pip install -q -U datasets\n",
+    "!pip install -q -U trl\n",
+    "!pip install -q -U peft\n",
+    "!pip install -q -U -i https://pypi.org/simple/ bitsandbytes\n",
+    "!pip install evaluate -q -U\n",
+    "\n",
+    "import evaluate\n",
+    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
+    "import transformers\n",
+    "from datasets import load_dataset\n",
+    "import torch\n",
+    "import re\n",
+    "from tqdm import tqdm\n",
+    "import pandas as pd\n",
+    "from datasets import Dataset\n",
+    "from peft import LoraConfig, PeftConfig\n",
+    "import bitsandbytes as bnb\n",
+    "import accelerate\n",
+    "from trl import SFTTrainer\n",
+    "from transformers import (AutoModelForCausalLM,\n",
+    "                          AutoModelForQuestionAnswering,\n",
+    "                          AutoTokenizer,\n",
+    "                          BitsAndBytesConfig,\n",
+    "                          TrainingArguments,\n",
+    "                          )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import hf_hub_download\n",
+    "from huggingface_hub import notebook_login\n",
+    "notebook_login()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_id = \"google/gemma-1.1-2b-it\"\n",
+    "\n",
+    "# Load weights in 4-bit NF4 and use bfloat16 as the compute dtype.\n",
+    "bnb_config = BitsAndBytesConfig(bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type=\"nf4\", load_in_4bit=True)\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
+    "# TODO: Check if this can be changed to AutoModelForQuestionAnswering with GEMMA\n",
+    "model = AutoModelForCausalLM.from_pretrained(\n",
+    "    model_id,\n",
+    "    device_map=\"auto\",\n",
+    "    quantization_config=bnb_config,\n",
+    ")\n",
+    "# Only the first 1000 rows of the train split are loaded.\n",
+    "dataset = load_dataset(\"Kubermatic/cncf-question-and-answer-dataset-for-llm-training\", split=\"train[:1000]\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Example question/answer: generate one answer and score it with BLEU.\n",
+    "\n",
+    "question = dataset[1][\"Question\"]\n",
+    "device = model.device  # follow the device_map placement instead of hard-coding cuda:0\n",
+    "inputs = tokenizer(question, return_tensors=\"pt\").to(device)\n",
+    "\n",
+    "outputs = model.generate(**inputs, max_new_tokens=500)\n",
+    "# generate() returns the prompt followed by the new tokens; slice the prompt off\n",
+    "# so the question itself is not scored as part of the predicted answer.\n",
+    "answer_ids = outputs[0][inputs[\"input_ids\"].shape[-1]:]\n",
+    "prediction = [tokenizer.decode(answer_ids, skip_special_tokens=True)]\n",
+    "reference = [dataset[1][\"Answer\"]]\n",
+    "\n",
+    "bleu = evaluate.load(\"bleu\")\n",
+    "results = bleu.compute(predictions=prediction, references=reference)\n",
+    "print(\"Question from dataset: \" + question)\n",
+    "print(\"Answer from dataset: \" + reference[0])\n",
+    "print(\"Predicted Answer: \" + prediction[0])\n",
+    "print(results)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}