-
Notifications
You must be signed in to change notification settings - Fork 136
Add argparse support for knnPerfTest.py #413
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
d79897e
06ff824
76fccac
726c43b
ed4fb74
18b513c
a32fc08
4e7c6c3
5339bf1
8a3da61
90c4789
78d15cb
30311ec
0337f5d
c18706e
a362acf
4029ab8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ | |
import re | ||
import subprocess | ||
import sys | ||
import argparse | ||
|
||
import benchUtil | ||
import constants | ||
|
@@ -27,70 +28,64 @@ | |
### Create document and task vectors | ||
# ./gradlew vectors-100 | ||
# | ||
# change the parameters below and then run (you can still manually run this file, but using gradle command | ||
# To run this script directly with arguments: | ||
# python src/python/knnPerfTest.py --ndoc 1000000 --topK 10 50 | ||
# | ||
# To run this script with arguments via Gradle, use the -Pargs property: | ||
# ./gradlew runKnnPerfTest -Pargs="--ndoc 1000000 --topK 10 50" | ||
# | ||
# The -Pargs property will forward the arguments to the Python script. | ||
# | ||
# add parameters as needed below and then run (you can still run this file manually, but using the gradle command | ||
# below will automatically recompile if you made any changes to java files in luceneutils) | ||
# ./gradlew runKnnPerfTest | ||
# | ||
# you may want to modify the following settings: | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's also update the doc string above with instructions on how to run this script with args? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done! |
||
DO_PROFILING = False | ||
|
||
# e.g. to compile KnnIndexer: | ||
# | ||
# javac -d build -cp /l/trunk/lucene/core/build/libs/lucene-core-10.0.0-SNAPSHOT.jar:/l/trunk/lucene/join/build/libs/lucene-join-10.0.0-SNAPSHOT.jar src/main/knn/*.java src/main/WikiVectors.java src/main/perf/VectorDictionary.java | ||
# | ||
|
||
NOISY = True | ||
|
||
# TODO | ||
# - can we expose greediness (global vs local queue exploration in KNN search) here? | ||
|
||
# test parameters. This script will run KnnGraphTester on every combination of these parameters | ||
PARAMS = { | ||
# "ndoc": (10_000_000,), | ||
#'ndoc': (10000, 100000, 200000, 500000), | ||
#'ndoc': (10000, 100000, 200000, 500000), | ||
#'ndoc': (2_000_000,), | ||
#'ndoc': (1_000_000,), | ||
"ndoc": (500_000,), | ||
#'ndoc': (50_000,), | ||
"maxConn": (32, 64, 96), | ||
# "maxConn": (64,), | ||
#'maxConn': (32,), | ||
"beamWidthIndex": (250, 500), | ||
# "beamWidthIndex": (250,), | ||
#'beamWidthIndex': (50,), | ||
"fanout": (20, 50, 100, 250), | ||
# "fanout": (50,), | ||
#'quantize': None, | ||
#'quantizeBits': (32, 7, 4), | ||
"numMergeWorker": (12,), | ||
"numMergeThread": (4,), | ||
"numSearchThread": (0,), | ||
#'numMergeWorker': (1,), | ||
#'numMergeThread': (1,), | ||
"encoding": ("float32",), | ||
# 'metric': ('angular',), # default is angular (dot_product) | ||
# 'metric': ('mip',), | ||
#'quantize': (True,), | ||
"quantizeBits": ( | ||
4, | ||
7, | ||
32, | ||
), | ||
# "quantizeBits": (1,), | ||
# "overSample": (5,), # extra ratio of vectors to retrieve, for testing approximate scoring, e.g. quantized indices | ||
#'fanout': (0,), | ||
"topK": (100,), | ||
# "bp": ("false", "true"), | ||
#'quantizeCompress': (True, False), | ||
"quantizeCompress": (True,), | ||
# "indexType": ("flat", "hnsw"), # index type, only works with single bit | ||
"queryStartIndex": (0,), # seek to this start vector before searching, to sample different vectors | ||
# "forceMerge": (True, False), | ||
#'niter': (10,), | ||
} | ||
|
||
def str2bool(v):
    """Convert a command-line token to a bool; meant for argparse ``type=``.

    Args:
        v: The raw token. ``"true"`` / ``"false"`` are accepted in any letter
            case and with surrounding whitespace. A value that is already a
            ``bool`` is returned unchanged.

    Returns:
        bool: ``True`` or ``False`` according to the token.

    Raises:
        argparse.ArgumentTypeError: If the token is neither ``true`` nor
            ``false``.
    """
    # Pass real booleans through so the converter is safe to call on values
    # that never went through the command line.
    if isinstance(v, bool):
        return v

    # str() keeps non-string input (None, ints, ...) on the ArgumentTypeError
    # path instead of failing with AttributeError on .lower().
    token = str(v).strip().lower()
    if token == "true":
        return True
    if token == "false":
        return False
    raise argparse.ArgumentTypeError(f"Unexpected value: {v}. Expected boolean value(s).")
|
||
def parse_args():
    """Parse the command-line options of the KNN benchmark script.

    Sweep options (``--ndoc``, ``--topK``, ``--maxConn``, ...) take one or
    more space-separated values and are always stored as lists, including
    their defaults. ``--dim``, ``--docVectors``, ``--queryVectors`` and
    ``--parentJoin`` take a single value; ``--profile`` and ``--quiet`` are
    on/off flags.

    Returns:
        argparse.Namespace: The parsed options, read from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(
        description="Run KNN benchmark with configurable parameters.",
        epilog="Example: python src/python/knnPerfTest.py --docVectors ../data/cohere-wikipedia-docs-768d.vec --queryVectors ../data/cohere-wikipedia-queries-768d.vec --topK 10 50 100 --maxConn 32 64 --quantizeCompress True False"
    )

    # (flag, value type, default list, help text) for every multi-valued option.
    # NOTE(review): declaration order determines the attribute order of the
    # returned namespace; presumably callers iterating vars(args) rely on it,
    # so keep this order stable.
    sweep_options = (
        ("--ndoc", int, [500_000], "Number of documents"),
        ("--topK", int, [100], "Top K results to retrieve"),
        ("--maxConn", int, [64], "Max connections in the graph"),
        ("--beamWidthIndex", int, [250], "Beam width at index time"),
        ("--fanout", int, [50], "Fanout parameter"),
        ("--quantizeBits", int, [32], "Quantization bits"),
        ("--quantizeCompress", str2bool, [True], "Enable quantize compression"),
        ("--numMergeWorker", int, [12], "Number of merge workers"),
        ("--numMergeThread", int, [4], "Number of merge threads"),
        ("--encoding", str, ["float32"], "Encoding type"),
        ("--queryStartIndex", int, [0], "Query start index"),
        ("--numSearchThread", int, [0], "Number of search threads"),
    )
    for flag, value_type, default, help_text in sweep_options:
        # nargs="+" (not "*"): a bare flag with no values is rejected with a
        # usage error instead of silently producing an empty list of values.
        parser.add_argument(flag, type=value_type, nargs="+", default=default, help=help_text)

    # Single-valued settings.
    parser.add_argument("--dim", type=int, default=768, help="Vector dimensionality")
    parser.add_argument("--docVectors", type=str, default=f"{constants.BASE_DIR}/data/cohere-wikipedia-docs-768d.vec", help="Path to document vectors")
    parser.add_argument("--queryVectors", type=str, default=f"{constants.BASE_DIR}/data/cohere-wikipedia-queries-768d.vec", help="Path to query vectors")
    parser.add_argument("--parentJoin", type=str, default=None, help="Path to parent join metadata file")

    # Boolean switches (False unless the flag is given).
    parser.add_argument("--profile", action="store_true", help="Enable Java profiling")
    parser.add_argument("--quiet", action="store_true", help="Suppress benchmark output")

    return parser.parse_args()
|
||
OUTPUT_HEADERS = [ | ||
"recall", | ||
|
@@ -117,7 +112,6 @@ | |
"indexType", | ||
] | ||
|
||
|
||
def advance(ix, values): | ||
for i in reversed(range(len(ix))): | ||
# scary to rely on dict key enumeration order? but i guess if dict never changes while we do this, it's stable? | ||
|
@@ -132,33 +126,25 @@ def advance(ix, values): | |
|
||
|
||
def run_knn_benchmark(checkout, values): | ||
indexes = [0] * len(values.keys()) | ||
indexes[-1] = -1 | ||
args = [] | ||
# dim = 100 | ||
# doc_vectors = constants.GLOVE_VECTOR_DOCS_FILE | ||
# query_vectors = '%s/luceneutil/tasks/vector-task-100d.vec' % constants.BASE_DIR | ||
# dim = 768 | ||
# doc_vectors = '/lucenedata/enwiki/enwiki-20120502-lines-1k-mpnet.vec' | ||
# query_vectors = '/lucenedata/enwiki/enwiki-20120502.mpnet.vec' | ||
# dim = 384 | ||
# doc_vectors = '%s/data/enwiki-20120502-lines-1k-minilm.vec' % constants.BASE_DIR | ||
# query_vectors = '%s/luceneutil/tasks/vector-task-minilm.vec' % constants.BASE_DIR | ||
# dim = 300 | ||
# doc_vectors = '%s/data/enwiki-20120502-lines-1k-300d.vec' % constants.BASE_DIR | ||
# query_vectors = '%s/luceneutil/tasks/vector-task-300d.vec' % constants.BASE_DIR | ||
|
||
# dim = 256 | ||
# doc_vectors = '/d/electronics_asin_emb.bin' | ||
# query_vectors = '/d/electronics_query_vectors.bin' | ||
do_profiling = values.pop("profile") | ||
noisy = not values.pop("quiet") | ||
|
||
# Ensure parentJoin is always a list for consistency | ||
parent_join = values.get("parentJoin") | ||
if parent_join is None or parent_join == "": | ||
del values["parentJoin"] | ||
else: | ||
values["parentJoin"] = [parent_join] | ||
|
||
# Cohere dataset | ||
dim = 768 | ||
doc_vectors = f"{constants.BASE_DIR}/data/cohere-wikipedia-docs-{dim}d.vec" | ||
query_vectors = f"{constants.BASE_DIR}/data/cohere-wikipedia-queries-{dim}d.vec" | ||
# doc_vectors = f"/lucenedata/enwiki/{'cohere-wikipedia'}-docs-{dim}d.vec" | ||
# query_vectors = f"/lucenedata/enwiki/{'cohere-wikipedia'}-queries-{dim}d.vec" | ||
# parentJoin_meta_file = f"{constants.BASE_DIR}/data/{'cohere-wikipedia'}-metadata.csv" | ||
dim = values.pop("dim") | ||
doc_vectors = values.pop("docVectors") | ||
query_vectors = values.pop("queryVectors") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. YAY! I'm so tired of editing this source for our runs... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, 100% |
||
|
||
# iterator state through all possible index combinations of incoming arguments | ||
indexes = [0] * len(values.keys()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm this is sort of confusing -- maybe add a comment what this |
||
indexes[-1] = -1 # for advance(...) to roll to all zeros at very first call | ||
args = [] | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we somewhere print If not, can we add that, and could you also give some juicy examples showing off the odometer iterator aspect, like mixing multiple There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So far the script adopts default values for all parameters. For the arguments doc/query vector files, it defaults to the downloaded sources from running Let me know if you think that that would be a good design. I also just added a usage example that shows how multiple arguments are passed. Thanks for the feedback! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah actually I love that it runs all defaults -- it's a great out of the box experience. I guess if the user messes something else up (not sure what?) we'd print a Usage line...? Can add this later... |
||
jfr_output = f"{constants.LOGS_DIR}/knn-perf-test.jfr" | ||
|
||
|
@@ -174,14 +160,14 @@ def run_knn_benchmark(checkout, values): | |
"-XX:+DebugNonSafepoints", | ||
] | ||
|
||
if DO_PROFILING: | ||
if do_profiling: | ||
cmd += [f"-XX:StartFlightRecording=dumponexit=true,maxsize=250M,settings={constants.BENCH_BASE_DIR}/src/python/profiling.jfc" + f",filename={jfr_output}"] | ||
|
||
cmd += ["knn.KnnGraphTester"] | ||
|
||
all_results = [] | ||
while advance(indexes, values): | ||
if NOISY: | ||
if noisy: | ||
print("\nNEXT:") | ||
pv = {} | ||
args = [] | ||
|
@@ -238,7 +224,7 @@ def run_knn_benchmark(checkout, values): | |
#'-quiet' | ||
] | ||
) | ||
if NOISY: | ||
if noisy: | ||
print(f" cmd: {this_cmd}") | ||
else: | ||
cmd += ["-quiet"] | ||
|
@@ -251,7 +237,7 @@ def run_knn_benchmark(checkout, values): | |
if line == "": | ||
break | ||
lines += line | ||
if NOISY: | ||
if noisy: | ||
sys.stdout.write(line) | ||
m = re_summary.match(line) | ||
if m is not None: | ||
|
@@ -262,10 +248,10 @@ def run_knn_benchmark(checkout, values): | |
if job.returncode != 0: | ||
raise RuntimeError(f"command failed with exit {job.returncode}") | ||
all_results.append((summary, args)) | ||
if DO_PROFILING: | ||
if do_profiling: | ||
benchUtil.profilerOutput(constants.JAVA_EXE, jfr_output, benchUtil.checkoutToPath(checkout), 30, (1,)) | ||
|
||
if NOISY: | ||
if noisy: | ||
print("\nResults:") | ||
|
||
# TODO: be more careful when we skip/show headers e.g. if some of the runs involve filtering, | ||
|
@@ -424,6 +410,8 @@ def chart_args_label(args): | |
|
||
|
||
if __name__ == "__main__": | ||
args = parse_args() | ||
params = vars(args) | ||
# Where the version of Lucene is that will be tested. Now this will be sourced from gradle.properties | ||
LUCENE_CHECKOUT = getLuceneDirFromGradleProperties() | ||
run_knn_benchmark(LUCENE_CHECKOUT, PARAMS) | ||
run_knn_benchmark(LUCENE_CHECKOUT, params) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will this still run as is, or do we need to update the gradle task as well?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I updated the task so that it accepts arguments by passing -Pargs="(script args go here)" to gradlew.