diff --git a/src/scripts/testing/bleu_evaluation.ipynb b/src/scripts/testing/bleu_evaluation.ipynb
new file mode 100644
index 0000000..f8d9fd1
--- /dev/null
+++ b/src/scripts/testing/bleu_evaluation.ipynb
@@ -0,0 +1,153 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"\n",
+    "This notebook sets up a pipeline for evaluating a language model using the \"Kubermatic/cncf-question-and-answer-dataset-for-llm-training\" dataset.\n",
+    "It loads the model with 4-bit quantization and scores a generated answer with the BLEU metric.\n",
+    "\n",
+    "Dependencies:\n",
+    "- accelerate\n",
+    "- datasets\n",
+    "- trl\n",
+    "- peft\n",
+    "- bitsandbytes\n",
+    "- evaluate\n",
+    "- transformers\n",
+    "- torch\n",
+    "- tqdm\n",
+    "- pandas\n",
+    "- huggingface_hub\n",
+    "\n",
+    "Environment Setup:\n",
+    "- Ensure Hugging Face Hub authentication with 'notebook_login'.\n",
+    "\n",
+    "Model Configuration:\n",
+    "- Uses BitsAndBytesConfig for 4-bit quantization.\n",
+    "- Configures AutoModelForCausalLM for causal language modeling tasks.\n",
+    "\n",
+    "Evaluation Data:\n",
+    "- Loads the first 1000 samples of the dataset's train split.\n",
+    "\n",
+    "Example Question/Answer:\n",
+    "- Tokenizes a sample question from the dataset and generates an answer for it.\n",
+    "- Scores only the newly generated tokens (the echoed prompt is stripped) against the reference answer using BLEU.\n",
+    "\n",
+    "Output:\n",
+    "- Prints the example question, reference answer, predicted answer, and BLEU score.\n",
+    "\"\"\"\n",
+    "\n",
+    "!pip install -q -U accelerate\n",
+    "!pip install -q -U datasets\n",
+    "!pip install -q -U trl\n",
+    "!pip install -q -U peft\n",
+    "!pip install -q -U -i https://pypi.org/simple/ bitsandbytes\n",
+    "!pip install evaluate -q -U\n",
+    "\n",
+    "import evaluate\n",
+    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
+    "import transformers\n",
+    "from datasets import load_dataset\n",
+    "import torch\n",
+    "import re\n",
+    "from tqdm import tqdm\n",
+    "import pandas as pd\n",
+    "from datasets import Dataset\n",
+    "from peft import LoraConfig, PeftConfig\n",
+    "import bitsandbytes as bnb\n",
+    "import accelerate\n",
+    "from trl import SFTTrainer\n",
+    "from transformers import (AutoModelForCausalLM,\n",
+    "                          AutoModelForQuestionAnswering,\n",
+    "                          AutoTokenizer,\n",
+    "                          BitsAndBytesConfig,\n",
+    "                          TrainingArguments,\n",
+    "                          )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import hf_hub_download\n",
+    "from huggingface_hub import notebook_login\n",
+    "notebook_login()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_id = \"google/gemma-1.1-2b-it\"\n",
+    "\n",
+    "# Quantize the weights to 4-bit NF4; bfloat16 is used as the compute dtype.\n",
+    "bnb_config = BitsAndBytesConfig(\n",
+    "    load_in_4bit=True,\n",
+    "    bnb_4bit_quant_type=\"nf4\",\n",
+    "    bnb_4bit_compute_dtype=torch.bfloat16\n",
+    ")\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
+    "# TODO: Check if this can be changed to AutoModelForQuestionAnswering with GEMMA\n",
+    "model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map=\"auto\")\n",
+    "\n",
+    "# Evaluation data: the first 1000 rows of the train split.\n",
+    "dataset = load_dataset(\"Kubermatic/cncf-question-and-answer-dataset-for-llm-training\", split=\"train[:1000]\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Example question/answer: generate one answer and score it with BLEU.\n",
+    "\n",
+    "sample = dataset[1]\n",
+    "question = sample[\"Question\"]\n",
+    "\n",
+    "# device_map=\"auto\" chooses the model's placement, so send the inputs to the\n",
+    "# model's own device rather than a hard-coded \"cuda:0\".\n",
+    "device = model.device\n",
+    "# NOTE(review): gemma-1.1-2b-it is instruction-tuned; consider tokenizer.apply_chat_template for the prompt.\n",
+    "inputs = tokenizer(question, return_tensors=\"pt\").to(device)\n",
+    "\n",
+    "outputs = model.generate(**inputs, max_new_tokens=500)\n",
+    "\n",
+    "# generate() returns the prompt tokens followed by the new tokens for a causal LM;\n",
+    "# keep only the new tokens so the echoed question does not distort the BLEU score.\n",
+    "prompt_length = inputs[\"input_ids\"].shape[-1]\n",
+    "generated_tokens = outputs[0][prompt_length:]\n",
+    "prediction = [tokenizer.decode(generated_tokens, skip_special_tokens=True)]\n",
+    "reference = [sample[\"Answer\"]]\n",
+    "\n",
+    "bleu = evaluate.load(\"bleu\")\n",
+    "if prediction[0].strip():\n",
+    "    results = bleu.compute(predictions=prediction, references=reference)\n",
+    "else:\n",
+    "    # BLEU's brevity penalty exp(1 - ref_len / pred_len) is undefined for an empty\n",
+    "    # prediction, so report a zero score instead of calling compute().\n",
+    "    results = {\"bleu\": 0.0}\n",
+    "print(\"Question from dataset: \" + question)\n",
+    "print(\"Answer from dataset: \" + reference[0])\n",
+    "print(\"Predicted Answer: \" + prediction[0])\n",
+    "print(results)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}