
Commit 7fa4f8c

Added support for the semantic_text field and semantic query type (#1881)
* Added support for the `semantic_text` field and `semantic` query type
* Fix nltk code... again
* feedback
1 parent ec60616 commit 7fa4f8c

8 files changed: +307 −4 lines changed

elasticsearch_dsl/field.py

+4
@@ -560,3 +560,7 @@ class TokenCount(Field):
 
 class Murmur3(Field):
     name = "murmur3"
+
+
+class SemanticText(Field):
+    name = "semantic_text"

elasticsearch_dsl/query.py

+4
@@ -527,6 +527,10 @@ class Shape(Query):
     name = "shape"
 
 
+class Semantic(Query):
+    name = "semantic"
+
+
 class SimpleQueryString(Query):
     name = "simple_query_string"
 
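
And a sketch of the matching query side, based on the example added below (the helper name search_semantic is hypothetical; the commit's example simply calls it search). The search text is run through the field's inference endpoint at query time and matched against the stored embeddings:

def search_semantic(query: str) -> dsl.Search[WorkplaceDoc]:
    # keep only the top 5 hits and query the semantic_text field defined above
    s = WorkplaceDoc.search()[:5]
    return s.query(dsl.query.Semantic(field=WorkplaceDoc.content, query=query))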

examples/async/semantic_text.py

+148
@@ -0,0 +1,148 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


"""
# Semantic Text example

Requirements:

$ pip install "elasticsearch-dsl[async]" tqdm

Before running this example, an ELSER inference endpoint must be created in the
Elasticsearch cluster. This can be done manually from Kibana, or with the
following curl command from a terminal:

curl -X PUT \
  "$ELASTICSEARCH_URL/_inference/sparse_embedding/my-elser-endpoint" \
  -H "Content-Type: application/json" \
  -d '{"service":"elser","service_settings":{"num_allocations":1,"num_threads":1}}'

To run the example:

$ python semantic_text.py "text to search"

The index will be created automatically if it does not exist. Add
`--recreate-index` to the command to regenerate it.

The example dataset includes a selection of workplace documents. The
following are good example queries to try out with this dataset:

$ python semantic_text.py "work from home"
$ python semantic_text.py "vacation time"
$ python semantic_text.py "can I bring a bird to work?"

When the index is created, the inference service will split the documents into
short passages, and for each passage a sparse embedding will be generated using
Elastic's ELSER v2 model.
"""

import argparse
import asyncio
import json
import os
from datetime import datetime
from typing import Any, Optional
from urllib.request import urlopen

from tqdm import tqdm

import elasticsearch_dsl as dsl

DATASET_URL = "https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/datasets/workplace-documents.json"


class WorkplaceDoc(dsl.AsyncDocument):
    class Index:
        name = "workplace_documents_semantic"

    name: str
    summary: str
    content: Any = dsl.mapped_field(
        dsl.field.SemanticText(inference_id="my-elser-endpoint")
    )
    created: datetime
    updated: Optional[datetime]
    url: str = dsl.mapped_field(dsl.Keyword())
    category: str = dsl.mapped_field(dsl.Keyword())


async def create() -> None:

    # create the index
    await WorkplaceDoc._index.delete(ignore_unavailable=True)
    await WorkplaceDoc.init()

    # download the data
    dataset = json.loads(urlopen(DATASET_URL).read())

    # import the dataset
    for data in tqdm(dataset, desc="Indexing documents..."):
        doc = WorkplaceDoc(
            name=data["name"],
            summary=data["summary"],
            content=data["content"],
            created=data.get("created_on"),
            updated=data.get("updated_at"),
            url=data["url"],
            category=data["category"],
        )
        await doc.save()

    # refresh the index
    await WorkplaceDoc._index.refresh()


async def search(query: str) -> dsl.AsyncSearch[WorkplaceDoc]:
    search = WorkplaceDoc.search()
    search = search[:5]
    return search.query(dsl.query.Semantic(field=WorkplaceDoc.content, query=query))


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Vector database with Elasticsearch")
    parser.add_argument(
        "--recreate-index", action="store_true", help="Recreate and populate the index"
    )
    parser.add_argument("query", action="store", help="The search query")
    return parser.parse_args()


async def main() -> None:
    args = parse_args()

    # initiate the default connection to elasticsearch
    dsl.async_connections.create_connection(hosts=[os.environ["ELASTICSEARCH_URL"]])

    if args.recreate_index or not await WorkplaceDoc._index.exists():
        await create()

    results = await search(args.query)

    async for hit in results:
        print(
            f"Document: {hit.name} [Category: {hit.category}] [Score: {hit.meta.score}]"
        )
        print(f"Content: {hit.content.text}")
        print("--------------------\n")

    # close the connection
    await dsl.async_connections.get_connection().close()


if __name__ == "__main__":
    asyncio.run(main())

examples/async/sparse_vectors.py

+1-1
@@ -84,7 +84,7 @@
 DATASET_URL = "https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/datasets/workplace-documents.json"
 
 # initialize sentence tokenizer
-nltk.download("punkt", quiet=True)
+nltk.download("punkt_tab", quiet=True)
 
 
 class Passage(InnerDoc):
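
The change from punkt to punkt_tab reflects recent NLTK releases, which distribute the pre-trained sentence tokenizer data as the punkt_tab resource; the same one-line fix is applied to all four vector examples in this commit. A quick local sanity check (a sketch, not part of the commit) would be:

import nltk

# newer NLTK versions resolve sent_tokenize against the "punkt_tab" resource
nltk.download("punkt_tab", quiet=True)

# prints the two sentences as separate passages
print(nltk.sent_tokenize("ELSER splits documents into passages. Each passage gets an embedding."))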

examples/async/vectors.py

+1-1
@@ -70,7 +70,7 @@
 MODEL_NAME = "all-MiniLM-L6-v2"
 
 # initialize sentence tokenizer
-nltk.download("punkt", quiet=True)
+nltk.download("punkt_tab", quiet=True)
 
 # this will be the embedding model
 embedding_model: Any = None

examples/semantic_text.py

+147
@@ -0,0 +1,147 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


"""
# Semantic Text example

Requirements:

$ pip install "elasticsearch-dsl" tqdm

Before running this example, an ELSER inference endpoint must be created in the
Elasticsearch cluster. This can be done manually from Kibana, or with the
following curl command from a terminal:

curl -X PUT \
  "$ELASTICSEARCH_URL/_inference/sparse_embedding/my-elser-endpoint" \
  -H "Content-Type: application/json" \
  -d '{"service":"elser","service_settings":{"num_allocations":1,"num_threads":1}}'

To run the example:

$ python semantic_text.py "text to search"

The index will be created automatically if it does not exist. Add
`--recreate-index` to the command to regenerate it.

The example dataset includes a selection of workplace documents. The
following are good example queries to try out with this dataset:

$ python semantic_text.py "work from home"
$ python semantic_text.py "vacation time"
$ python semantic_text.py "can I bring a bird to work?"

When the index is created, the inference service will split the documents into
short passages, and for each passage a sparse embedding will be generated using
Elastic's ELSER v2 model.
"""

import argparse
import json
import os
from datetime import datetime
from typing import Any, Optional
from urllib.request import urlopen

from tqdm import tqdm

import elasticsearch_dsl as dsl

DATASET_URL = "https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/datasets/workplace-documents.json"


class WorkplaceDoc(dsl.Document):
    class Index:
        name = "workplace_documents_semantic"

    name: str
    summary: str
    content: Any = dsl.mapped_field(
        dsl.field.SemanticText(inference_id="my-elser-endpoint")
    )
    created: datetime
    updated: Optional[datetime]
    url: str = dsl.mapped_field(dsl.Keyword())
    category: str = dsl.mapped_field(dsl.Keyword())


def create() -> None:

    # create the index
    WorkplaceDoc._index.delete(ignore_unavailable=True)
    WorkplaceDoc.init()

    # download the data
    dataset = json.loads(urlopen(DATASET_URL).read())

    # import the dataset
    for data in tqdm(dataset, desc="Indexing documents..."):
        doc = WorkplaceDoc(
            name=data["name"],
            summary=data["summary"],
            content=data["content"],
            created=data.get("created_on"),
            updated=data.get("updated_at"),
            url=data["url"],
            category=data["category"],
        )
        doc.save()

    # refresh the index
    WorkplaceDoc._index.refresh()


def search(query: str) -> dsl.Search[WorkplaceDoc]:
    search = WorkplaceDoc.search()
    search = search[:5]
    return search.query(dsl.query.Semantic(field=WorkplaceDoc.content, query=query))


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Vector database with Elasticsearch")
    parser.add_argument(
        "--recreate-index", action="store_true", help="Recreate and populate the index"
    )
    parser.add_argument("query", action="store", help="The search query")
    return parser.parse_args()


def main() -> None:
    args = parse_args()

    # initiate the default connection to elasticsearch
    dsl.connections.create_connection(hosts=[os.environ["ELASTICSEARCH_URL"]])

    if args.recreate_index or not WorkplaceDoc._index.exists():
        create()

    results = search(args.query)

    for hit in results:
        print(
            f"Document: {hit.name} [Category: {hit.category}] [Score: {hit.meta.score}]"
        )
        print(f"Content: {hit.content.text}")
        print("--------------------\n")

    # close the connection
    dsl.connections.get_connection().close()


if __name__ == "__main__":
    main()

examples/sparse_vectors.py

+1-1
@@ -83,7 +83,7 @@
 DATASET_URL = "https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/datasets/workplace-documents.json"
 
 # initialize sentence tokenizer
-nltk.download("punkt", quiet=True)
+nltk.download("punkt_tab", quiet=True)
 
 
 class Passage(InnerDoc):

examples/vectors.py

+1-1
@@ -69,7 +69,7 @@
 MODEL_NAME = "all-MiniLM-L6-v2"
 
 # initialize sentence tokenizer
-nltk.download("punkt", quiet=True)
+nltk.download("punkt_tab", quiet=True)
 
 # this will be the embedding model
 embedding_model: Any = None
