-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathintegrate_explicit_embeddings.py
90 lines (64 loc) · 2.26 KB
/
integrate_explicit_embeddings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""
Required dependencies:
pip install \
"langchain>=0.3,<0.4" \
"langchain-astradb>=0.6,<0.7" \
"langchain-openai>=0.3,<0.4"
Requires a `.env` file with environment variables, see `template.env`.
"""
# Import dependencies
import os
import requests
from getpass import getpass
from astrapy.info import VectorServiceOptions
from langchain_astradb import AstraDBVectorStore
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
# Load environment variables
load_dotenv()  # reads `.env` from the working directory into os.environ
# Required credentials: indexing with os.environ raises KeyError early if absent.
ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
ASTRA_DB_API_ENDPOINT = os.environ["ASTRA_DB_API_ENDPOINT"]
# Optional settings: `or None` normalizes empty strings to None so a blank
# entry in `.env` behaves the same as an unset variable.
ASTRA_DB_KEYSPACE = os.environ.get("ASTRA_DB_KEYSPACE") or None
ASTRA_DB_API_KEY_NAME = os.environ.get("ASTRA_DB_API_KEY_NAME") or None
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or None
# Create a vector store
# Embeddings are computed client-side ("explicit" embeddings) with OpenAI;
# NOTE(review): OpenAIEmbeddings presumably picks up OPENAI_API_KEY from the
# environment — confirm against langchain-openai docs.
embedding = OpenAIEmbeddings()
vector_store = AstraDBVectorStore(
    collection_name="langchain_integration_demo",
    embedding=embedding,
    token=ASTRA_DB_APPLICATION_TOKEN,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    namespace=ASTRA_DB_KEYSPACE,  # None defers to the database's default keyspace
)
# Load data
# Fetch the demo dataset (a JSON list of quote records) over HTTPS.
# A timeout keeps an unresponsive server from hanging the script forever,
# and raise_for_status() surfaces HTTP errors instead of trying to
# JSON-decode an error page.
_response = requests.get(
    "https://raw.githubusercontent.com/"
    "datastaxdevs/mini-demo-astradb-langchain/"
    "refs/heads/main/data/philosopher-quotes.json",
    timeout=30,
)
_response.raise_for_status()
philo_dataset = _response.json()
print("An example entry:")
print(philo_dataset[16])
# Process dataset
# Convert each raw entry into a LangChain Document: the quote becomes the
# page content, while the author and the entry's tag metadata are merged
# into the Document metadata. A comprehension replaces the original
# append loop, which also carried an unused `entry_idx` from enumerate.
documents_to_insert = [
    Document(
        id=entry["_id"],
        page_content=entry["quote"],
        metadata={"author": entry["author"], **entry["metadata"]},
    )
    for entry in philo_dataset
]
print(f"Ready to insert {len(documents_to_insert)} documents.")
print(f"Example document: {documents_to_insert[16]}")
# Insert documents
# NOTE(review): with an explicit embedding configured, add_documents
# presumably embeds each page_content client-side before writing vectors
# and metadata to the collection — confirm against langchain-astradb docs.
# Returns the ids of the inserted documents.
inserted_ids = vector_store.add_documents(documents_to_insert)
print(f"\nInserted {len(inserted_ids)} documents: {', '.join(inserted_ids[:3])} ...")
# Verify the integration: run a similarity search for the three closest
# quotes to a sample sentence and print each hit with its metadata.
query_text = "Our life is what we make of it"
results = vector_store.similarity_search(query_text, k=3)
for hit in results:
    print(f"* {hit.page_content} [{hit.metadata}]")