Commit 2f77f65

committed: fixed spellings
1 parent c7b1e67 commit 2f77f65

File tree

3 files changed: +42 -36 lines changed


projects/ml/llm/notebooks/llms.clj

Lines changed: 15 additions & 14 deletions
@@ -3,20 +3,19 @@
    [org.httpkit.client :as hk-client]
    [cheshire.core :as json]))
 
-;; # using Large Language Models from Clojure
-;; LLMs often come as APIs, as they require computing power (GPUs), which most users do not have
-;; localy.
-;; OpenAI offers their models behind an (paid) API for example. In the following we will see three
-;;diferent ways to use the GPT-4 model from OpenAI
+;; # Using Large Language Models from Clojure
+;; LLMs often come as APIs, as they require computing power (GPUs), which most users do not have
+;; locally.
+;; OpenAI, for example, offers its models behind a (paid) API. In the following we will see three
+;; different ways to use the GPT-4 model from OpenAI.
 
-;; get the openai API key either from environemnt or a specific file
+;; Get the OpenAI API key, either from the environment or from a specific file
 (def open-ai-key
   (or (System/getenv "OPEN_AI_KEY")
       (slurp "open_ai_secret.txt")))
 
-(or "hello" (slurp "aa"))
 
 ;; ## Use OpenAI API directly
 ;; OpenAI offers a rather simple API, text-in text-out for "chatting" with GPT
@@ -42,9 +41,9 @@
     (json/decode keyword))
 
 ;; ## Use Bosquet
-;; [bosquet](https://github.com/zmedelis/bosquet) abstracts some of the concepts of LLMs
+;; [Bosquet](https://github.com/zmedelis/bosquet) abstracts some of the concepts of LLMs
 ;; behind a higher-level API. It also has notions of "memory" and "tools",
-;; and has feature we find for exampl in python "LangChain"
+;; and has other features we find, for example, in Python's "LangChain".
 
 ;; Bosquet wants the API key in a config file
 (spit "secrets.edn"
@@ -54,19 +53,21 @@
 
 (require '[bosquet.llm.generator :refer [generate llm]])
 
+;; Call GPT from Bosquet
+
 (generate
  [[:user "What is Clojure"]
   [:assistant (llm :openai
                    :llm/model-params {:model :gpt-4})]])
 
 
-;# use langchain4j
-;; We can use LLMs as well via a Java Interop and teh library
+;; ## Use langchain4j
+;; We can use LLMs as well via Java interop and the library
 ;; [langchain4j](https://github.com/langchain4j/langchain4j), which aims
-;; to be a copy of the pythin langcahin, and offers support or
-;; build blcoks for several consept arround LLMs (model, vecstorstores, document loaders)
-;; We see it used in te following chapters
+;; to be a copy of the Python library LangChain. It offers support for, and
+;; building blocks for, several concepts around LLMs (models, vector stores, document loaders, etc.).
+;; We see it used in the following chapters.
 
 (import '[dev.langchain4j.model.openai OpenAiChatModel OpenAiChatModelName])

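The "use OpenAI API directly" route in llms.clj boils down to a single POST against the chat-completions endpoint, decoded with the same `(json/decode keyword)` step visible in the diff. A minimal sketch using the notebook's http-kit and cheshire aliases; the `chat-request` and `ask-gpt` helper names are ours, and the model name is assumed to be `"gpt-4"` as in the notebook:

```clojure
(require '[org.httpkit.client :as hk-client]
         '[cheshire.core :as json])

(defn chat-request
  "Build the body map for a single-turn chat with `model`."
  [model user-text]
  {:model model
   :messages [{:role "user" :content user-text}]})

(defn ask-gpt
  "POST one user message to OpenAI's chat-completions endpoint
   and return the assistant's reply text."
  [api-key user-text]
  (-> @(hk-client/post "https://api.openai.com/v1/chat/completions"
                       {:headers {"Authorization" (str "Bearer " api-key)
                                  "Content-Type"  "application/json"}
                        :body    (json/encode (chat-request "gpt-4" user-text))})
      :body
      (json/decode keyword)
      (get-in [:choices 0 :message :content])))

(comment
  ;; needs a valid key, e.g. the notebook's open-ai-key
  (ask-gpt (System/getenv "OPEN_AI_KEY") "What is Clojure?"))
```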
projects/ml/llm/notebooks/rag.clj

Lines changed: 17 additions & 12 deletions
@@ -12,8 +12,8 @@
 
 
 ;; # Simple RAG (Retrieval-Augmented Generation) System
-;; This is a Clojure / langchain4j adaption of
-;; https://github.com/NirDiamant/RAG_Techniques/blob/main/all_rag_techniques/simple_rag.ipynb
+;; This is a Clojure / langchain4j adaptation of
+;; [simple_rag](https://github.com/NirDiamant/RAG_Techniques/blob/main/all_rag_techniques/simple_rag.ipynb)
 
 ;; ## Overview
 ;; This code implements a basic Retrieval-Augmented Generation (RAG) system for processing and
@@ -61,12 +61,16 @@
 ;; Flexibility: Easy to adjust parameters like chunk size and number of retrieved results.
 
 ;; ## Conclusion
-;;This simple RAG system provides a solid foundation for building more complex information retrieval and question-answering systems. By encoding document content into a searchable vector store, it enables efficient retrieval of relevant information in response to queries. This approach is particularly useful for applications requiring quick access to specific information within
+;;This simple RAG system provides a solid foundation for building more complex information retrieval and question-answering systems.
+;;
+;;By encoding document content into a searchable vector store, it enables efficient retrieval of relevant information in response to queries.
+;;
+;;This approach is particularly useful for applications requiring quick access to specific information within
 ;;large documents or document collections.
 
 ;; # Implementation
 
-;; helper to replace abs by space
+;; A helper to replace tabs with spaces:
 (defn replace-t-with-space [list-of-documents]
   (map
    (fn [text-segment]
@@ -76,25 +80,26 @@
    list-of-documents))
 
 
-;; convert PDF to text document
+;; Convert the PDF to a text document:
 (def document (.parse (ApachePdfBoxDocumentParser.) (io/input-stream "Understanding_Climate_Change.pdf")))
 
-;; split document into chunks of max 1000 chars and overlaping of 200
+;; Split the document into chunks of max 1000 chars, with an overlap of 200:
 (def texts
   (.split
    (DocumentSplitters/recursive 1000 200)
   document))
-;; clean textx
+;; Clean the texts:
 (def cleaned-texts (replace-t-with-space texts))
 
+;; Create embeddings for the cleaned texts:
 (def embedding-model (AllMiniLmL6V2EmbeddingModel.))
 (def embedding-store (InMemoryEmbeddingStore.))
 
-;; create embedding for clean texts
 
 (def embeddings
   (.embedAll embedding-model cleaned-texts))
 
-;; add embeddings to vector store
+;; Add all embeddings to the vector store:
 (run!
  (fn [[text-segment embedding]]
    (.add embedding-store embedding text-segment))
@@ -103,15 +108,15 @@
   cleaned-texts
   (.content embeddings)))
 
-;; encode retriever
+;; Embed the query text for the retriever:
 (def retriever
   (.content (.embed embedding-model
                     "What is the main cause of climate change?")))
 
-;; find top 5 relevant texts
+;; Find the top 5 relevant texts:
 (def relevant (.findRelevant embedding-store retriever 5))
 
-;; put 5 results in table
+;; Put the 5 results in a table:
 (tc/dataset
  (map
   (fn [a-relevant]

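The `(DocumentSplitters/recursive 1000 200)` call in rag.clj takes a maximum chunk size and an overlap, both in characters. A naive stand-in in plain Clojure (our own `chunk-with-overlap`; the real langchain4j splitter additionally respects word and paragraph boundaries) illustrates what the two parameters mean:

```clojure
(defn chunk-with-overlap
  "Split `text` into windows of at most `size` characters,
   consecutive windows sharing `overlap` characters."
  [text size overlap]
  (let [step (- size overlap)]
    (map #(subs text % (min (count text) (+ % size)))
         (range 0 (count text) step))))

(chunk-with-overlap "abcdefghijklmnopqrstuvwxyz" 10 4)
;; => ("abcdefghij" "ghijklmnop" "mnopqrstuv" "stuvwxyz" "yz")
```

Note how each chunk repeats the last 4 characters of the previous one, so that a sentence cut at a chunk boundary still appears intact in one of the two chunks.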
projects/ml/llm/notebooks/vectorstore.clj

Lines changed: 10 additions & 10 deletions
@@ -7,10 +7,10 @@
    [dev.langchain4j.store.embedding.inmemory InMemoryEmbeddingStore]))
 
 ;; # Use a vectorstore from langchain4j
-;; In thios example we will create embeddings for some
-;; phansaty food items, and find teh closest one to a query.
+;; In this example we will create embeddings for some
+;; fantasy food items, and find the closest one to a query sentence.
 
-;; Create the data, so a list of 1000 food descriptions
+;; First we create the data: a list of 1000 food descriptions.
 
 
 (def food-items
@@ -47,7 +47,7 @@
  "that brings comfort and joy" "which excites the palate"
  "that enchants every taste bud" "which is pure indulgence"])
 
-;; Generate 1000 unique descriptions as a dataset
+;; Generate 1000 unique descriptions as a dataset:
 (def food-descriptions
   (tc/dataset {:food-description
                (->>
@@ -64,24 +64,24 @@
      (take 1000))}))
 
 ;; Now we create the embedding store, which is able to calculate vector distances
-;; (fast)
+;; (fast).
 (def embedding-store (InMemoryEmbeddingStore.))
-;; Create an instance of the embedding model, which can calculate an emebdiing for a piece of text
+;; Create an instance of the embedding model, which can calculate an embedding for a piece of text.
 (def embedding-model (AllMiniLmL6V2EmbeddingModel.))
 
-;; And we embbed all food description
+;; And we embed all food descriptions:
 (run!
  #(let [segment (TextSegment/from %)
         embedding (.content (.embed embedding-model %))]
     (.add embedding-store embedding segment))
  (:food-description food-descriptions))
 
 
-;; Embed the query text
+;; Now we embed the query text:
 (def query-embedding (.content (.embed embedding-model "Which spicy food can you offer ?")))
 
-;; Find the 5 most relevant embedding which are sematically the closest to the query.
-;; Its using a certain vector distance (cosine) between the embedding vectors of query and texts)
+;; And finally we find the 5 most relevant embeddings, which are semantically the closest to the query.
+;; It uses a vector distance (cosine) between the embedding vectors of the query and the texts.
 (def relevant (.findRelevant embedding-store query-embedding 5))
 
 (tc/dataset
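The relevance ranking described in vectorstore.clj is based on cosine similarity between embedding vectors. A toy illustration in plain Clojure (our own helpers, not langchain4j code) of the distance the store computes:

```clojure
(defn dot [a b] (reduce + (map * a b)))

(defn norm [a] (Math/sqrt (dot a a)))

(defn cosine-sim
  "Cosine similarity of two equal-length vectors:
   1.0 for the same direction, 0.0 for orthogonal vectors."
  [a b]
  (/ (dot a b) (* (norm a) (norm b))))

(cosine-sim [1.0 0.0] [2.0 0.0]) ;; => 1.0
(cosine-sim [1.0 0.0] [0.0 1.0]) ;; => 0.0
```

`findRelevant` effectively computes this score between the query embedding and every stored embedding, then returns the top-k matches.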
