pinecone_ingest_python_implementation/ingest.py at main · ucl98/pinecone_ingest_python_implementation · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings

import hashlib
import json
import pinecone

import os

from config import OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_INDEX_NAME, PINECONE_NAMESPACE, PINECONE_ENVIRONMENT

# Shared clients and settings used by the functions below.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)
pinecone.init(environment=PINECONE_ENVIRONMENT, api_key=PINECONE_API_KEY)
index = pinecone.Index(PINECONE_INDEX_NAME)
namespace = PINECONE_NAMESPACE

# Running total of vectors upserted; incremented by uploadToPinecone.
num = 0

def hash_string(input_string, algorithm='sha256'):
    """Return the hexadecimal digest of *input_string*.

    The string is encoded as UTF-8 before hashing. *algorithm* is any
    name accepted by ``hashlib.new`` and defaults to SHA-256.
    """
    payload = input_string.encode('utf-8')
    return hashlib.new(algorithm, payload).hexdigest()

def _make_chunk(text, page):
    """Wrap *text* in the chunk record format, tagged with *page*'s metadata."""
    return {
        "metadata": {
            "text": text,
            # The pdf-pages start at 1, hence the + 1 on the stored value.
            "pdf_page_number": page.metadata["page"] + 1,
            "source": page.metadata["source"],
        }
    }


def _append_full_windows(chunks, page, start, chunk_size, stride):
    """Append every full-size window of *page* from *start*; return the next start."""
    text = page.page_content
    while start + chunk_size <= len(text):
        chunks.append(_make_chunk(text[start:start + chunk_size], page))
        start += stride
    return start


def chunk_text(linked_pages, chunk_size, chunk_overlap):
    """Cut one page, or a pair of consecutive pages, into overlapping chunks.

    Args:
        linked_pages: Sequence of page objects exposing ``page_content``
            (str) and ``metadata`` (with ``"page"`` and ``"source"`` keys).
            Exactly two entries are treated as a linked pair; any other
            non-empty length uses only the first entry.
        chunk_size: Number of characters per full chunk; must be positive.
        chunk_overlap: Number of characters shared by consecutive chunks;
            must be smaller than ``chunk_size``.

    Returns:
        A list of ``{"metadata": {"text", "pdf_page_number", "source"}}``
        dicts. An empty ``linked_pages`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
            is not smaller than ``chunk_size`` (the window would never
            advance and the loop would not terminate).

    For a single page, the full windows are followed by one shorter chunk
    holding whatever text no full window covered, so a page shorter than
    ``chunk_size`` still produces a chunk.

    For a pair, the chunks are: full windows of the first page, one chunk
    bridging the end of the first page and the start of the second
    (attributed to the first page), then full windows of the second page.
    NOTE: the tail of the second page is not emitted here. When the caller
    passes overlapping pairs (as ``process_pdf`` does) that page is chunked
    again as the first page of the next pair, but the tail of the very last
    page of a multi-page document is still not emitted.
    """
    stride = chunk_size - chunk_overlap
    if chunk_size <= 0 or stride <= 0:
        raise ValueError(
            "chunk_size must be positive and larger than chunk_overlap "
            f"(got chunk_size={chunk_size}, chunk_overlap={chunk_overlap})"
        )
    if not linked_pages:
        return []

    first = linked_pages[0]
    second = linked_pages[1] if len(linked_pages) == 2 else None

    chunks = []
    start = _append_full_windows(chunks, first, 0, chunk_size, stride)

    if second is None:
        tail = first.page_content[start:]
        # After at least one full window, the first `chunk_overlap`
        # characters of the tail are already inside the previous chunk;
        # only emit the tail when it carries text beyond that.
        already_covered = max(chunk_overlap, 0) if chunks else 0
        if len(tail) > already_covered:
            chunks.append(_make_chunk(tail, first))
        return chunks

    # Bridge chunk: the rest of the first page topped up from the second
    # page so that it is chunk_size characters long when enough text exists.
    remaining = chunk_size - (len(first.page_content) - start)
    bridge = first.page_content[start:] + second.page_content[:remaining]
    chunks.append(_make_chunk(bridge, first))

    # Continue on the second page right after the text used by the bridge.
    _append_full_windows(chunks, second, remaining, chunk_size, stride)
    return chunks

def write_chunks_to_file(chunks, output_file):
    """Write *chunks* to *output_file*, one JSON document per line.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(output_file, 'w') as handle:
        handle.writelines(json.dumps(record) + '\n' for record in chunks)

def process_pdf(file_path, chunk_size=1000, chunk_overlap=200):
    """Load the PDF at *file_path* and return its chunks.

    The documents returned by ``PyPDFLoader.load_and_split`` are grouped
    into overlapping consecutive pairs (1-2, 2-3, ...) and each pair is
    passed to ``chunk_text``. A result with exactly one document is passed
    on its own; an empty result yields an empty list.

    Args:
        file_path: Path of the PDF file to load.
        chunk_size: Forwarded to ``chunk_text``.
        chunk_overlap: Forwarded to ``chunk_text``.

    Returns:
        The concatenated lists returned by ``chunk_text``.
    """
    documents = PyPDFLoader(file_path).load_and_split()

    if len(documents) == 1:
        windows = [[documents[0]]]
    else:
        # Each document is paired with its successor; the last one has no
        # successor and therefore only appears as the second of a pair.
        windows = [[left, right] for left, right in zip(documents, documents[1:])]

    result = []
    for window in windows:
        result.extend(chunk_text(window, chunk_size, chunk_overlap))
    return result

def uploadToPinecone(chunks, batch_size=100):
    """Embed *chunks* and upsert them into the module-level Pinecone index.

    Each chunk's text is embedded with the module-level ``embeddings``
    client and stored under an id derived from ``hash_string(text)``, with
    the chunk's metadata attached. Vectors are sent in batches of
    ``batch_size`` to the module-level ``namespace``, and the module-level
    counter ``num`` is increased by the number of vectors sent.

    Args:
        chunks: Iterable of ``{"metadata": {"text": ..., ...}}`` dicts.
        batch_size: Maximum number of vectors per upsert call.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    global num

    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive (got {batch_size})")

    batch = []
    for chunk in chunks:
        text = chunk['metadata']['text']
        batch.append({
            'id': hash_string(text),
            'values': embeddings.embed_query(text),
            'metadata': chunk['metadata'],
        })

        # Flush on buffer size rather than on the loop index, so the first
        # request is a full batch instead of a single vector.
        if len(batch) >= batch_size:
            index.upsert(batch, namespace=namespace)
            num += len(batch)
            batch = []

    # Send whatever is left over after the last full batch.
    if batch:
        index.upsert(batch, namespace=namespace)
        num += len(batch)

def list_files(folder_path):
    """Return the document paths found at *folder_path*.

    If *folder_path* is a regular file, it is returned as the only entry,
    whatever its extension. If it is a directory, the result holds the
    joined paths of its direct entries whose names end in ``.doc``,
    ``.docx`` or ``.pdf``. Any other path yields an empty list.
    """
    if os.path.isfile(folder_path):
        return [folder_path]

    if not os.path.isdir(folder_path):
        return []

    wanted_suffixes = ('.doc', '.docx', '.pdf')
    return [
        os.path.join(folder_path, entry)
        for entry in os.listdir(folder_path)
        if entry.endswith(wanted_suffixes)
    ]

folder_path = "docs"
file_names = list_files(folder_path)

# Index all documents
for file_path in file_names:
    # list_files also returns .doc/.docx paths, but process_pdf always reads
    # its input with PyPDFLoader. Skip anything that is not a PDF so that one
    # Word file in the folder does not abort the whole run.
    if not file_path.lower().endswith('.pdf'):
        print("Skip non-PDF file ", file_path)
        continue

    print("Add chunk ", file_path)
    chunked_pages = process_pdf(file_path)
    uploadToPinecone(chunked_pages)

print("Vectors added", num)