Commit 5367094

Clean up code + add benchmarking
1 parent 5127301 commit 5367094

7 files changed: +1005 -52 lines changed

.gitignore

Lines changed: 3 additions & 1 deletion

```diff
@@ -4,4 +4,6 @@ node_modules
 embeddings/
 transcripts/
 *.json
-*.zip
+*.zip
+!package.json
+!package-lock.json
```

README.md

Lines changed: 36 additions & 0 deletions

# Semantic Retrieval

This repository contains the testing code (and probably the final code) we used to extract embeddings from video transcripts for [Bizarro-Devin](https://github.com/CodingTrain/Bizarro-Devin/). There are a few files in this repository, each with its own purpose:

- [embeddings-transformers.js](/embeddings-transformers.js) generates embeddings from the transcripts in the `transcripts` directory.
- [semantic-retrieval.js](/semantic-retrieval.js) can be used to retrieve the best-matching chunks from the embeddings for a given query.
- [semantic-retrieval-benchmark.js](/semantic-retrieval-benchmark.js) benchmarks the retrieval; in my own tests it came out at roughly 180 ms per retrieval (see the sketch after this list).
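
`semantic-retrieval-benchmark.js` itself isn't reproduced in this commit view, so here is a minimal sketch of what such a benchmark could look like; the `retrieve` stub, example query, and iteration count are hypothetical placeholders, not the repository's actual code:

```js
import { performance } from 'node:perf_hooks';

// Hypothetical stand-in for the query-embedding + ranking step in
// semantic-retrieval.js; swap in the real retrieval call.
async function retrieve(query) {
  /* embed the query, score every stored chunk, return the top 5 */
}

const iterations = 50; // assumed; enough runs to smooth out noise
const start = performance.now();
for (let i = 0; i < iterations; i++) {
  await retrieve('example query');
}
const elapsed = performance.now() - start;
console.log(`${(elapsed / iterations).toFixed(1)} ms / retrieval`);
```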

## How to use

### Generating embeddings

1. Make sure you've installed all dependencies by running `npm install`.
2. Create a directory called `transcripts` and place all of the JSON transcript files in it, each file being the transcript of one video. The transcript JSON should be in the following format:

```json
{
  "text": "full transcript text",
  "chunks": [
    {
      "timestamp": [0.48, 7.04],
      "text": "..."
    }
  ]
}
```
However, the `chunks` array is currently not used, so it can be left out.

3. Create an `embeddings` directory for the embeddings of each transcript to be written to.
4. Run `node embeddings-transformers.js` to generate the embeddings.

All embeddings should now be in the `embeddings` folder, and an `embeddings.json` file should be present in the current working directory. This `embeddings.json` file is the combination of all embeddings generated from the transcripts.
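
Judging from the `embeddings-transformers.js` diff further down this page, both the per-transcript files and the combined `embeddings.json` are flat arrays of chunk/embedding pairs. A sketch of the shape (the vector is truncated here; `bge-small-en-v1.5` embeddings are 384-dimensional):

```json
[
  {
    "text": "one chunk of transcript text, ending on a sentence boundary.",
    "embedding": [0.0123, -0.0456, 0.0789]
  }
]
```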

### Semantic retrieval from embeddings

1. Make sure you've installed all dependencies by running `npm install`.
2. Make sure the embeddings you want to retrieve from are in an `embeddings.json` file. This file already exists if you generated the embeddings by following the [generating embeddings](#generating-embeddings) section above.
3. Open the `semantic-retrieval.js` file and edit your query on line `25`.
4. Save the file and run `node semantic-retrieval.js` to retrieve the top 5 results from the embeddings.
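
`semantic-retrieval.js` itself isn't reproduced in this commit view. A minimal sketch of the approach the steps above imply (embed the query with the same model, then rank the stored chunks by cosine similarity; because the vectors are normalized, a plain dot product suffices; the query string is just an example):

```js
import fs from 'fs';
import { pipeline } from '@xenova/transformers';

// Same model that generated the stored embeddings
const extractor = await pipeline(
  'feature-extraction',
  'Xenova/bge-small-en-v1.5'
);

const embeddings = JSON.parse(fs.readFileSync('embeddings.json', 'utf-8'));

// Embed the query the same way the chunks were embedded
const query = 'how do neural networks learn?'; // example query
const output = await extractor(query, { pooling: 'mean', normalize: true });
const queryEmbedding = output.tolist()[0];

// With normalized vectors, the dot product equals cosine similarity
const dot = (a, b) => a.reduce((sum, v, i) => sum + v * b[i], 0);

const top5 = embeddings
  .map(({ text, embedding }) => ({ text, score: dot(queryEmbedding, embedding) }))
  .sort((a, b) => b.score - a.score)
  .slice(0, 5);

console.log(top5);
```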

embeddings-transformers.js

Lines changed: 74 additions & 19 deletions

```diff
@@ -3,28 +3,83 @@ import { pipeline } from '@xenova/transformers';
 
 // Load the embeddings model
 const extractor = await pipeline(
-  'feature-extraction',
-  'Xenova/bge-small-en-v1.5'
+  'feature-extraction',
+  'Xenova/bge-small-en-v1.5'
 );
 
-const raw = fs.readFileSync('transcripts/_-AfhLQfb6w.json', 'utf-8');
-const json = JSON.parse(raw);
-const txt = json.text;
-// console.log(txt);
+const fullOutput = [];
 
-const chunks = txt.split(/[.?!]/);
+(async () => {
+  // Scan transcripts directory for all json files
+  const files = fs.readdirSync('transcripts');
 
-let outputJSON = { embeddings: [] };
+  // Iterate through each file and calculate the embeddings
+  for (const file of files) {
+    const rawContents = fs.readFileSync(`transcripts/${file}`, 'utf-8');
+    const json = JSON.parse(rawContents);
 
-for (let chunk of chunks) {
-  let output = await extractor(chunk, {
-    pooling: 'mean',
-    normalize: true,
-  });
-  const embedding = output.tolist()[0];
-  outputJSON.embeddings.push({ text: chunk, embedding });
-}
+    const text = json.text;
+
+    // Calculate chunks based on this text
+    const chunks = calculateChunks(text);
+
+    // Extract embeddings for each chunk
+    const output = [];
+
+    for (const chunk of chunks) {
+      const embeddingOutput = await extractor(chunk, {
+        pooling: 'mean',
+        normalize: true,
+      });
+
+      const embedding = embeddingOutput.tolist()[0];
+      output.push({ text: chunk, embedding });
+      fullOutput.push({ text: chunk, embedding });
+    }
+
+    // Save the embeddings to a file
+    const fileOut = `embeddings/${file}`;
+    fs.writeFileSync(fileOut, JSON.stringify(output));
 
-const fileOut = `embeddings.json`;
-fs.writeFileSync(fileOut, JSON.stringify(outputJSON));
-console.log(`Embeddings saved to ${fileOut}`);
+    console.log(
+      `Embeddings saved for ${file} to ${fileOut} (${
+        output.length
+      } chunks) (${files.indexOf(file) + 1}/${files.length})`
+    );
+  }
+
+  // Save the full output to a single file
+  const fileOut = `embeddings.json`;
+  fs.writeFileSync(fileOut, JSON.stringify(fullOutput));
+  console.log(`Complete embeddings saved to ${fileOut}`);
+})();
+
+function calculateChunks(text) {
+  // We want to split the text into chunks of at least 100 characters; after
+  // this we keep adding to the chunk until we find a sentence boundary
+  const chunks = [];
+  let chunk = '';
+  for (let i = 0; i < text.length; i++) {
+    chunk += text[i];
+
+    // If the current character is a punctuation mark, split the chunk here
+    if (
+      chunk.length >= 100 &&
+      (text[i] === '.' || text[i] === '?' || text[i] === '!')
+    ) {
+      chunks.push(chunk.trim());
+      chunk = '';
+    }
+
+    // If we exceed 150 characters without finding a punctuation mark, split
+    // the chunk at the last space
+    if (chunk.length >= 150) {
+      let lastSpace = chunk.lastIndexOf(' ');
+      if (lastSpace === -1) {
+        lastSpace = chunk.length;
+      }
+      chunks.push(chunk.slice(0, lastSpace).trim());
+      chunk = chunk.slice(lastSpace).trim();
+    }
+  }
+
+  return chunks;
+}
```
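
A quick way to sanity-check the new chunking is to paste `calculateChunks` from the diff above into a Node REPL and feed it a sample string (the sample here is made up):

```js
// Assumes calculateChunks from the diff above has been pasted into the REPL.
const sample =
  'Short sentences accumulate until the chunk passes 100 characters. ' +
  'Then the next period, question mark, or exclamation mark closes it.';

console.log(calculateChunks(sample));
// The first '.' falls under the 100-character minimum, so the whole sample
// comes back as a single chunk ending at the second '.':
// [ 'Short sentences accumulate until the chunk passes 100 characters. Then the next period, question mark, or exclamation mark closes it.' ]
```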
