pamelafox
diff --git a/‎data/California_carpenter_bee.pdf
316 KB b/‎data/California_carpenter_bee.pdf
316 KB
diff --git a/‎data/Centris_pallida.pdf
970 KB b/‎data/Centris_pallida.pdf
970 KB
diff --git a/‎data/Western_honey_bee.pdf
1010 KB b/‎data/Western_honey_bee.pdf
1010 KB
diff --git a/‎rag.py
Lines changed: 73 additions & 0 deletions b/‎rag.py
Lines changed: 73 additions & 0 deletions
diff --git a/‎rag_hybrid.py
Lines changed: 70 additions & 0 deletions b/‎rag_hybrid.py
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,73 @@
+import csv
+import os
+
+import azure.identity
+import openai
+from dotenv import load_dotenv
+from lunr import lunr
+
+# Configure the OpenAI client for whichever host API_HOST names:
+# Azure OpenAI, Ollama, GitHub Models, or OpenAI.com (the fallback)
+load_dotenv(override=True)
+API_HOST = os.getenv("API_HOST")
+
+if API_HOST == "azure":
+    # Authenticate with bearer tokens from the default Azure credential instead of an API key
+    token_provider = azure.identity.get_bearer_token_provider(
+        azure.identity.DefaultAzureCredential(),
+        "https://cognitiveservices.azure.com/.default",
+    )
+    azure_api_version = os.environ["AZURE_OPENAI_VERSION"]
+    azure_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
+    client = openai.AzureOpenAI(
+        api_version=azure_api_version,
+        azure_endpoint=azure_endpoint,
+        azure_ad_token_provider=token_provider,
+    )
+    MODEL_NAME = os.environ["AZURE_OPENAI_DEPLOYMENT"]
+
+elif API_HOST == "ollama":
+    # A placeholder string is passed as the API key for the Ollama endpoint
+    ollama_endpoint = os.environ["OLLAMA_ENDPOINT"]
+    client = openai.OpenAI(base_url=ollama_endpoint, api_key="nokeyneeded")
+    MODEL_NAME = os.environ["OLLAMA_MODEL"]
+
+elif API_HOST == "github":
+    # GitHub Models: fixed inference endpoint, authenticated with GITHUB_TOKEN
+    github_token = os.environ["GITHUB_TOKEN"]
+    client = openai.OpenAI(base_url="https://models.inference.ai.azure.com", api_key=github_token)
+    MODEL_NAME = os.environ["GITHUB_MODEL"]
+
+else:
+    # Any other API_HOST value, including unset, uses the default OpenAI client
+    openai_key = os.environ["OPENAI_KEY"]
+    client = openai.OpenAI(api_key=openai_key)
+    MODEL_NAME = os.environ["OPENAI_MODEL"]
+
+# Index the data from the CSV: the first row is the header, every later row becomes a document.
+# newline="" is what the csv module expects, so quoted fields with embedded newlines parse correctly.
+with open("hybrid.csv", newline="") as file:
+    rows = list(csv.reader(file))
+# Ids start at 1 so that each document id is also that row's position in `rows`
+documents = [{"id": row_number, "body": " ".join(row)} for row_number, row in enumerate(rows[1:], start=1)]
+index = lunr(ref="id", fields=["body"], documents=documents)
+
+# Get the user question
+user_question = "how fast is the prius v?"
+
+# Search the index for the user question
+results = index.search(user_question)
+# Each result's ref is converted back to an int to pick the matching row out of `rows`
+matching_rows = [rows[int(result["ref"])] for result in results]
+
+# Format as a markdown table, since language models understand markdown
+header_line = " | ".join(rows[0])
+separator_line = " | ".join(" --- " for _ in rows[0])
+matches_table = f"{header_line}\n{separator_line}\n"
+matches_table += "\n".join(" | ".join(row) for row in matching_rows)
+
+print("Found matches:")
+print(matches_table)
+
+# Ask the model to answer the question using only the matching rows as sources
+SYSTEM_MESSAGE = """
+You are a helpful assistant that answers questions about cars based off a hybrid car data set.
+You must use the data set to answer the questions, you should not provide any info that is not in the provided sources.
+"""
+
+# The user turn carries both the question and the markdown table of matches
+chat_messages = [
+    {"role": "system", "content": SYSTEM_MESSAGE},
+    {"role": "user", "content": f"{user_question}\nSources: {matches_table}"},
+]
+response = client.chat.completions.create(model=MODEL_NAME, temperature=0.3, messages=chat_messages)
+
+print(f"\nResponse from {API_HOST}: \n")
+first_choice = response.choices[0]
+print(first_choice.message.content)
@@ -0,0 +1,70 @@
+import json
+import os
+
+import azure.identity
+import openai
+from dotenv import load_dotenv
+from lunr import lunr
+
+# Configure the OpenAI client for whichever host API_HOST names:
+# Azure OpenAI, Ollama, GitHub Models, or OpenAI.com (the fallback)
+load_dotenv(override=True)
+API_HOST = os.getenv("API_HOST")
+
+if API_HOST == "azure":
+    # Authenticate with bearer tokens from the default Azure credential instead of an API key
+    token_provider = azure.identity.get_bearer_token_provider(
+        azure.identity.DefaultAzureCredential(),
+        "https://cognitiveservices.azure.com/.default",
+    )
+    azure_api_version = os.environ["AZURE_OPENAI_VERSION"]
+    azure_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
+    client = openai.AzureOpenAI(
+        api_version=azure_api_version,
+        azure_endpoint=azure_endpoint,
+        azure_ad_token_provider=token_provider,
+    )
+    MODEL_NAME = os.environ["AZURE_OPENAI_DEPLOYMENT"]
+
+elif API_HOST == "ollama":
+    # A placeholder string is passed as the API key for the Ollama endpoint
+    ollama_endpoint = os.environ["OLLAMA_ENDPOINT"]
+    client = openai.OpenAI(base_url=ollama_endpoint, api_key="nokeyneeded")
+    MODEL_NAME = os.environ["OLLAMA_MODEL"]
+
+elif API_HOST == "github":
+    # GitHub Models: fixed inference endpoint, authenticated with GITHUB_TOKEN
+    github_token = os.environ["GITHUB_TOKEN"]
+    client = openai.OpenAI(base_url="https://models.inference.ai.azure.com", api_key=github_token)
+    MODEL_NAME = os.environ["GITHUB_MODEL"]
+
+else:
+    # Any other API_HOST value, including unset, uses the default OpenAI client
+    openai_key = os.environ["OPENAI_KEY"]
+    client = openai.OpenAI(api_key=openai_key)
+    MODEL_NAME = os.environ["OPENAI_MODEL"]
+
+# Index the data from the JSON - each object has id, text, and embedding
+# JSON text is UTF-8, so read it as such rather than with the platform's default encoding
+with open("rag_ingested_chunks.json", encoding="utf-8") as file:
+    documents = json.load(file)
+# Key the lookup table by str(id): search refs are looked up as strings below,
+# so this works whether the ids in the JSON are strings or numbers
+documents_by_id = {str(doc["id"]): doc for doc in documents}
+index = lunr(ref="id", fields=["text"], documents=documents)
+
+# Get the user question
+user_question = "where do digger bees live?"
+
+# Search the index for the user question
+results = index.search(user_question)
+retrieved_documents = [documents_by_id[str(result["ref"])] for result in results]
+print(f"Retrieved {len(retrieved_documents)} matching documents, only sending the first 5.")
+# Only the first 5 results go into the prompt, one "<id>: <text>" line per document
+context = "\n".join([f"{doc['id']}: {doc['text']}" for doc in retrieved_documents[0:5]])
+
+# Now we can use the matches to generate a response
+# The prompt's topic is bees, matching the user question asked in this script
+# (the previous wording named an unrelated topic, contradicting the sources sent to the model)
+SYSTEM_MESSAGE = """
+You are a helpful assistant that answers questions about bees.
+You must use the data set to answer the questions,
+you should not provide any info that is not in the provided sources.
+Cite the sources you used to answer the question inside square brackets.
+The sources are in the format: <id>: <text>.
+"""
+
+# The user turn carries both the question and the retrieved "<id>: <text>" sources
+response = client.chat.completions.create(
+    model=MODEL_NAME,
+    temperature=0.3,
+    messages=[
+        {"role": "system", "content": SYSTEM_MESSAGE},
+        {"role": "user", "content": f"{user_question}\nSources: {context}"},
+    ],
+)
+
+print(f"\nResponse from {MODEL_NAME} on {API_HOST}: \n")
+print(response.choices[0].message.content)