Commit 8a42598

committed
fixed spelling
1 parent f1c8515 commit 8a42598

File tree

6 files changed: +49 −15 lines changed

projects/ml/llm/.devcontainer/devcontainer.json

Lines changed: 2 additions & 1 deletion

@@ -17,7 +17,8 @@
     "vscode": {
       "extensions": [
         "betterthantomorrow.calva",
-        "vscjava.vscode-java-pack"
+        "vscjava.vscode-java-pack",
+        "streetsidesoftware.code-spell-checker"
       ]
     }
 }
projects/ml/llm/notebooks/index.clj

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+^:kindly/hide-code
+(ns index)
+
+;; # LLMs and Clojure
+
+;; LLMs (Large Language Models) are a class of predictive models which can create "content"
+;; in various forms, primarily original "text" content.
+;;
+;; They are ultimately based on completing "text" a user gives them. The quality of these completions
+;; has lately become so good that we interpret them as artificial intelligence, as they imitate with very high quality what a human might generate.
+;;
+;; These models come "pre-trained", so they have learned a probability distribution of word sequences, which enables them to predict the next word
+;; based on any sequence of words.
+;;
+;; The inner workings of LLMs also include the concept of embeddings, which means representing text as high-dimensional vectors,
+;; where mathematical vector distance is correlated with semantic similarity.
+;;
+;; In their most popular form they are presented to users as "chat bots" with which a user can have
+;; a coherent conversation with questions and answers.
+;;
+;; This being a "conversation" is an illusion on the technical level. The model itself is stateless; it uses previous parts of the conversation
+;; as input for its prediction, which creates the illusion of coherence.
+;;
+;; The following chapters show three examples of using LLMs from Clojure:
+;;
+;; - a simple chat completion
+;; - using a vector store and embeddings to perform a semantic search
+;; - a showcase of a simple RAG (Retrieval-Augmented Generation) use case
+
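The claim above, that vector distance between embeddings correlates with semantic similarity, can be sketched in plain Clojure with a toy cosine similarity. The three-dimensional vectors and the food names here are invented for illustration; real models such as AllMiniLmL6V2 produce 384-dimensional vectors.

```clojure
;; Toy cosine similarity between (hypothetical) embedding vectors.
(defn dot [a b] (reduce + (map * a b)))
(defn norm [a] (Math/sqrt (dot a a)))
(defn cosine-similarity [a b]
  (/ (dot a b) (* (norm a) (norm b))))

;; Made-up 3-dimensional "embeddings" for illustration only:
(def spicy-curry  [0.9 0.1 0.2])
(def hot-chili    [0.8 0.2 0.1])
(def vanilla-cake [0.1 0.9 0.7])

(cosine-similarity spicy-curry hot-chili)    ;; => ~0.99, semantically close
(cosine-similarity spicy-curry vanilla-cake) ;; => ~0.30, semantically distant
```

A semantic search, as shown in the later chapters, is essentially this comparison done against many stored vectors at once.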
projects/ml/llm/notebooks/llms.clj

Lines changed: 6 additions & 6 deletions

@@ -3,13 +3,13 @@
    [org.httpkit.client :as hk-client]
    [cheshire.core :as json]))

-;;# using Large Language Models from Clojure
+;;# Using Large Language Models from Clojure
 ;;LLMs often come as APIs, as they require computing power (GPUs), which most users do not have
-;;localy.
+;;locally.
 ;;OpenAI offers their models behind a (paid) API, for example. In the following we will see three
 ;;different ways to use the GPT-4 model from OpenAI

-;; Get the openai API key either from environemnt or a specific file
+;; Get the openai API key either from environment or a specific file
 (def open-ai-key
   (or (System/getenv "OPEN_AI_KEY")
       (slurp "open_ai_secret.txt")
@@ -20,7 +20,7 @@
 ;## Use OpenAI API directly
 ;; OpenAI offers a rather simple API, text-in text-out, for "chatting" with GPT
 ;;
-;; The following shows how to ask a simple question, and getting the answer using an http libray,
+;; The following shows how to ask a simple question and get the answer using an http library,
 ;; [http-kit](https://github.com/http-kit/http-kit). The API is based on JSON, so it is easy to use
 ;; from Clojure

@@ -43,7 +43,7 @@
 ; ## Use Bosquet
 ; [Bosquet](https://github.com/zmedelis/bosquet) abstracts some of the concepts of LLMs
 ; into a higher-level API. It has further notions of "memory" and "tools"
-; and has other features we find for exampl in python "LangChain"
+; and has other features we find, for example, in python's "LangChain"

 ;; Bosquet wants the API key in a config file
 (spit "secrets.edn"
@@ -66,7 +66,7 @@
 ;; We can use LLMs as well via Java interop and the library
 ;; [langchain4j](https://github.com/langchain4j/langchain4j) which aims
 ;; to be a copy of the python library langchain, and offers support for
-;; building blocks for several concepts arround LLMs (model, vectorstores, document loaders, etc.)
+;; building blocks for several concepts around LLMs (model, vector stores, document loaders, etc.)
 ;; We see it used in the following chapters

 (import '[dev.langchain4j.model.openai OpenAiChatModel OpenAiChatModelName])
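The direct-API approach from the first hunk above can be sketched as follows. The request shape follows OpenAI's chat completions JSON API; the helper name `ask` and the exact response navigation are illustrative assumptions, and running it needs a valid key in `OPEN_AI_KEY` plus network access.

```clojure
(ns llm-sketch
  (:require [org.httpkit.client :as hk-client]
            [cheshire.core :as json]))

;; Hypothetical helper: POST a single user message to OpenAI's chat
;; completions endpoint and extract the answer text from the JSON response.
(defn ask [question]
  (-> @(hk-client/post
        "https://api.openai.com/v1/chat/completions"
        {:headers {"Authorization" (str "Bearer " (System/getenv "OPEN_AI_KEY"))
                   "Content-Type"  "application/json"}
         :body    (json/generate-string
                   {:model    "gpt-4"
                    :messages [{:role "user" :content question}]})})
      :body
      (json/parse-string true)
      :choices first :message :content))

;; (ask "What is Clojure?")
```

Dereferencing the http-kit promise with `@` blocks until the response arrives; the `true` argument to `parse-string` keywordizes the JSON keys so the result can be navigated with plain keywords.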

projects/ml/llm/notebooks/rag.clj

Lines changed: 1 addition & 1 deletion

@@ -83,7 +83,7 @@
 ;; Convert PDF to text document:
 (def document (.parse (ApachePdfBoxDocumentParser.) (io/input-stream "Understanding_Climate_Change.pdf")))

-;; Split document into chunks of max 1000 chars and overlaping of 200:
+;; Split document into chunks of max 1000 chars and an overlap of 200:
 (def texts
   (.split
    (DocumentSplitters/recursive 1000 200)
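What "chunks of max 1000 chars with an overlap of 200" means can be illustrated with a toy splitter in plain Clojure, scaled down to tiny sizes. This is only a sketch of the overlap idea; the real `DocumentSplitters/recursive` is smarter and prefers paragraph and sentence boundaries over raw character offsets.

```clojure
;; Naive character-based chunking with overlap: each chunk starts
;; (size - overlap) characters after the previous one, so consecutive
;; chunks share `overlap` characters of context.
(defn chunk-with-overlap [s size overlap]
  (let [step (- size overlap)]
    (->> (range 0 (count s) step)
         (mapv #(subs s % (min (count s) (+ % size)))))))

(chunk-with-overlap "abcdefghij" 4 2)
;; => ["abcd" "cdef" "efgh" "ghij" "ij"]
```

The overlap is what lets a retrieval step later find a passage even when the relevant sentence straddles a chunk boundary.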

projects/ml/llm/notebooks/render.clj

Lines changed: 2 additions & 1 deletion

@@ -4,7 +4,8 @@
 (clay/make! {:format [:quarto :html]
              :show false
              :base-source-path "notebooks"
-             :source-path ["llms.clj"
+             :source-path ["index.clj"
+                           "llms.clj"
                            "vectorstore.clj"
                            "rag.clj"
                            ]

projects/ml/llm/notebooks/vectorstore.clj

Lines changed: 9 additions & 6 deletions

@@ -6,11 +6,12 @@
    [dev.langchain4j.model.embedding.onnx.allminilml6v2 AllMiniLmL6V2EmbeddingModel]
    [dev.langchain4j.store.embedding.inmemory InMemoryEmbeddingStore]))

-;; # Use a vectorstore from langchain4j
+;; # Use a vector store from langchain4j
 ;; In this example we will create embeddings for some
 ;; fantasy food items, and find the closest one to a query sentence.

-;; Firt wereate the data, so a list of 1000 food descriptions
+;; ## Create dummy data
+;; First we create the data, a list of 1000 food descriptions


 (def food-items
@@ -63,24 +64,26 @@
   shuffle
   (take 1000))}))

+;; ## Add food descriptions to the vector store
 ;; Now we create the embedding store, which is able to calculate vector distances
 ;; (fast).
 (def embedding-store (InMemoryEmbeddingStore.))
-;; Create an instance of the embedding model, which can calculate an emebedding for a piece of text.
+;; Create an instance of the embedding model, which can calculate an embedding for a piece of text.
 (def embedding-model (AllMiniLmL6V2EmbeddingModel.))

-;; And we embbed all food descriptions:
+;; And we embed all food descriptions:
 (run!
  #(let [segment (TextSegment/from %)
         embedding (.content (.embed embedding-model %))]
    (.add embedding-store embedding segment))
  (:food-description food-descriptions))


-;; Now we embedd the query text:
+;; Now we embed the query text:
 (def query-embedding (.content (.embed embedding-model "Which spicy food can you offer ?")))

-;; And finally we find the 5 most relevant embedding which are sematically the closest to the query.
+;; ## Query the vector store
+;; And finally we find the 5 most relevant embeddings, which are semantically the closest to the query.
 ;; It's using a certain vector distance (cosine) between the embedding vectors of query and texts.
 (def relevant (.findRelevant embedding-store query-embedding 5))
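The matches returned by `.findRelevant` above are langchain4j `EmbeddingMatch` objects, each carrying a relevance score and the stored `TextSegment`. A small sketch of inspecting them, assuming the `relevant` binding from the notebook (method names per langchain4j's Java API):

```clojure
;; Turn each EmbeddingMatch into a plain Clojure map for inspection:
;; .score    - relevance (higher is closer to the query)
;; .embedded - the TextSegment stored alongside the embedding
;; .text     - the original food-description string
(map (fn [match]
       {:score (.score match)
        :text  (.text (.embedded match))})
     relevant)
```

Sorting by `:score` descending gives the same ranking the store returns, since `findRelevant` already orders matches by relevance.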
