2 changes: 1 addition & 1 deletion 3rdparty/ucxx
Submodule ucxx updated 49 files
+2 −0 .github/workflows/build.yaml
+15 −0 .github/workflows/pr.yaml
+1 −1 .github/workflows/trigger-breaking-change-alert.yaml
+1 −1 .pre-commit-config.yaml
+1 −1 README.md
+3 −3 conda/environments/all_cuda-118_arch-x86_64.yaml
+5 −5 conda/environments/all_cuda-128_arch-x86_64.yaml
+7 −7 conda/recipes/ucxx/conda_build_config.yaml
+9 −21 conda/recipes/ucxx/meta.yaml
+2 −3 cpp/CMakeLists.txt
+4 −3 cpp/include/ucxx/buffer.h
+1 −1 cpp/include/ucxx/delayed_submission.h
+3 −3 cpp/include/ucxx/endpoint.h
+7 −7 cpp/include/ucxx/request.h
+3 −0 cpp/include/ucxx/request_am.h
+1 −2 cpp/include/ucxx/request_tag_multi.h
+17 −2 cpp/include/ucxx/typedefs.h
+14 −5 cpp/include/ucxx/worker.h
+1 −0 cpp/python/src/exception.cpp
+1 −0 cpp/python/src/worker.cpp
+1 −0 cpp/src/config.cpp
+2 −0 cpp/src/context.cpp
+4 −4 cpp/src/delayed_submission.cpp
+2 −1 cpp/src/endpoint.cpp
+4 −2 cpp/src/internal/request_am.cpp
+2 −1 cpp/src/listener.cpp
+3 −3 cpp/src/memory_handle.cpp
+16 −4 cpp/src/remote_key.cpp
+15 −13 cpp/src/request_am.cpp
+1 −0 cpp/src/request_data.cpp
+0 −2 cpp/src/request_endpoint_close.cpp
+0 −2 cpp/src/request_flush.cpp
+4 −6 cpp/src/request_mem.cpp
+12 −8 cpp/src/request_stream.cpp
+5 −5 cpp/src/request_tag.cpp
+13 −12 cpp/src/request_tag_multi.cpp
+3 −3 cpp/src/utils/file_descriptor.cpp
+6 −3 cpp/src/utils/sockaddr.cpp
+12 −9 cpp/src/worker.cpp
+5 −0 cpp/tests/buffer.cpp
+1 −0 cpp/tests/context.cpp
+17 −12 cpp/tests/request.cpp
+9 −4 cpp/tests/worker.cpp
+11 −7 dependencies.yaml
+1 −1 python/distributed-ucxx/pyproject.toml
+1 −1 python/libucxx/pyproject.toml
+2 −2 python/ucxx/pyproject.toml
+16 −3 python/ucxx/ucxx/_lib/libucxx.pyx
+9 −3 python/ucxx/ucxx/_lib/ucxx_api.pxd
70 changes: 44 additions & 26 deletions benchmarks/cpp/prepare_dataset.py
@@ -27,16 +27,17 @@
class RootArgs(BaseModel):
tokenizer: str
output: str
output_format: str
random_seed: int
task_id: int
std_out: bool
rand_task_id: Optional[Tuple[int, int]]

@field_validator('tokenizer')
@field_validator("tokenizer")
def get_tokenizer(cls,
v: str) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
try:
tokenizer = AutoTokenizer.from_pretrained(v, padding_side='left')
tokenizer = AutoTokenizer.from_pretrained(v, padding_side="left")
except EnvironmentError as e:
raise ValueError(
f"Cannot find a tokenizer from the given string because of {e}\nPlease set tokenizer to the directory that contains the tokenizer, or set to a model name in HuggingFace."
@@ -51,48 +52,65 @@ def get_tokenizer(cls,
required=True,
type=str,
help=
"Tokenizer dir for the model run by gptManagerBenchmark, or the model name from HuggingFace."
"Tokenizer dir for the model run by gptManagerBenchmark, or the model name from HuggingFace.",
)
@click.option(
"--output-format",
type=click.Choice(["trtllm-bench", "gptManagerBenchmark"]),
help=
"The format of the output, to accommodate different benchmarking entrypoints.",
default="gptManagerBenchmark",
)
@click.option(
"--output",
type=str,
help="Output json filename.",
default="preprocessed_dataset.json",
)
@click.option("--output",
type=str,
help="Output json filename.",
default="preprocessed_dataset.json")
@click.option(
"--stdout",
is_flag=True,
help="Print output to stdout with a JSON dataset entry on each line.",
default=False)
@click.option("--random-seed",
required=False,
type=int,
help="random seed for token_ids",
default=420)
default=False,
)
@click.option(
"--random-seed",
required=False,
type=int,
help="random seed for token_ids",
default=420,
)
@click.option("--task-id", type=int, default=-1, help="LoRA task id")
@click.option("--rand-task-id",
type=int,
default=None,
nargs=2,
help="Random LoRA Tasks")
@click.option("--log-level",
default="info",
type=click.Choice(['info', 'debug']),
help="Logging level.")
@click.option(
"--log-level",
default="info",
type=click.Choice(["info", "debug"]),
help="Logging level.",
)
@click.pass_context
def cli(ctx, **kwargs):
def cli(ctx: click.Context, **kwargs):
"""This script generates dataset input for gptManagerBenchmark."""
if kwargs['log_level'] == 'info':
if kwargs["log_level"] == "info":
logging.basicConfig(level=logging.INFO)
elif kwargs['log_level'] == 'debug':
elif kwargs["log_level"] == "debug":
logging.basicConfig(level=logging.DEBUG)
else:
raise ValueError(f"Unsupported logging level {kwargs['log_level']}")

ctx.obj = RootArgs(tokenizer=kwargs['tokenizer'],
output=kwargs['output'],
std_out=kwargs['stdout'],
random_seed=kwargs['random_seed'],
task_id=kwargs['task_id'],
rand_task_id=kwargs['rand_task_id'])
ctx.obj = RootArgs(
tokenizer=kwargs["tokenizer"],
output=kwargs["output"],
std_out=kwargs["stdout"],
random_seed=kwargs["random_seed"],
task_id=kwargs["task_id"],
rand_task_id=kwargs["rand_task_id"],
output_format=kwargs["output_format"],
)


cli.add_command(dataset)
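The new --output-format option defaults to "gptManagerBenchmark", so existing invocations keep their current output. A minimal sketch of exercising the flag through click's test runner (the import path, the "gpt2" tokenizer choice, and the registration of the token-norm-dist subcommand are assumptions, not shown in this diff):

```python
# Hedged usage sketch: drive the CLI with the new --output-format flag via
# click's test runner. Import path, tokenizer name, and subcommand name are
# assumptions for illustration only.
from click.testing import CliRunner

from prepare_dataset import cli  # assumed import path for this script

runner = CliRunner()
result = runner.invoke(
    cli,
    [
        "--tokenizer", "gpt2",              # any HF model name or local dir
        "--output", "dataset.json",
        "--output-format", "trtllm-bench",  # option added in this change
        "--stdout",
        "token-norm-dist",                  # assumed subcommand registration
        "--num-requests", "8",
        "--input-mean", "128",
        "--input-stdev", "16",
        "--output-mean", "64",
        "--output-stdev", "8",
    ],
)
print(result.exit_code)
```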
128 changes: 75 additions & 53 deletions benchmarks/cpp/utils/prepare_synthetic_data.py
@@ -10,23 +10,25 @@
@click.option("--num-requests",
required=True,
type=int,
help='Number of requests to be generated')
@click.option('--input-mean',
help="Number of requests to be generated")
@click.option("--input-mean",
required=True,
type=int,
help='normal dist mean for input tokens')
@click.option('--input-stdev',
help="normal dist mean for input tokens")
@click.option("--input-stdev",
required=True,
type=int,
help='normal dist stdev for input tokens')
@click.option('--output-mean',
help="normal dist stdev for input tokens")
@click.option("--output-mean",
required=True,
type=int,
help='normal dist mean for output tokens')
@click.option('--output-stdev',
required=True,
type=int,
help='normal dist stdev for output tokens')
help="normal dist mean for output tokens")
@click.option(
"--output-stdev",
required=True,
type=int,
help="normal dist stdev for output tokens",
)
@click.pass_obj
def token_norm_dist(root_args, **kwargs):
"""Prepare synthetic dataset by generating random tokens with normally distributed lengths."""
@@ -35,14 +37,16 @@ def token_norm_dist(root_args, **kwargs):
output_lens = []
task_ids = []

input_lens = get_norm_dist_lengths(kwargs['input_mean'],
kwargs['input_stdev'],
kwargs['num_requests'],
root_args.random_seed)
input_lens = get_norm_dist_lengths(
kwargs["input_mean"],
kwargs["input_stdev"],
kwargs["num_requests"],
root_args.random_seed,
)

num_reqs = len(input_lens)
output_lens = get_norm_dist_lengths(kwargs['output_mean'],
kwargs['output_stdev'], num_reqs,
output_lens = get_norm_dist_lengths(kwargs["output_mean"],
kwargs["output_stdev"], num_reqs,
root_args.random_seed)

max_input_len = max(input_lens)
@@ -61,15 +65,18 @@
text_dataset_dump(
input_lens, input_ids, output_lens, task_ids, {
"workload_type": "token-norm-dist",
"input_mean": kwargs['input_mean'],
"input_stdev": kwargs['input_stdev'],
"output_mean": kwargs['output_mean'],
"output_stdev": kwargs['output_stdev'],
"num_requests": kwargs['num_requests'],
"input_mean": kwargs["input_mean"],
"input_stdev": kwargs["input_stdev"],
"output_mean": kwargs["output_mean"],
"output_stdev": kwargs["output_stdev"],
"num_requests": kwargs["num_requests"],
"tokenize_vocabsize": root_args.tokenizer.vocab_size,
"max_input_len": max_input_len,
"max_output_len": max_output_len
}, root_args.output)
"max_output_len": max_output_len,
},
root_args.output,
root_args.output_format,
)
else:
print_text_dataset(
input_ids,
@@ -81,23 +88,31 @@
@click.option("--num-requests",
required=True,
type=int,
help='Number of requests to be generated')
@click.option('--input-min',
required=True,
type=int,
help='uniform dist (inclusive) min for input tokens')
@click.option('--input-max',
required=True,
type=int,
help='uniform dist (inclusive) max for input tokens')
@click.option('--output-min',
required=True,
type=int,
help='uniform dist (inclusive) min for output tokens')
@click.option('--output-max',
required=True,
type=int,
help='uniform dist (inclusive) max for output tokens')
help="Number of requests to be generated")
@click.option(
"--input-min",
required=True,
type=int,
help="uniform dist (inclusive) min for input tokens",
)
@click.option(
"--input-max",
required=True,
type=int,
help="uniform dist (inclusive) max for input tokens",
)
@click.option(
"--output-min",
required=True,
type=int,
help="uniform dist (inclusive) min for output tokens",
)
@click.option(
"--output-max",
required=True,
type=int,
help="uniform dist (inclusive) max for output tokens",
)
@click.pass_obj
def token_unif_dist(root_args, **kwargs):
"""Prepare synthetic dataset by generating random tokens with uniformly distributed lengths."""
@@ -106,13 +121,17 @@ def token_unif_dist(root_args, **kwargs):
output_lens = []
task_ids = []

input_lens = get_unif_dist_lengths(kwargs['input_min'], kwargs['input_max'],
kwargs['num_requests'],
root_args.random_seed)
input_lens = get_unif_dist_lengths(
kwargs["input_min"],
kwargs["input_max"],
kwargs["num_requests"],
root_args.random_seed,
)

num_reqs = len(input_lens)
output_lens = get_unif_dist_lengths(kwargs['output_min'],
kwargs['output_max'], num_reqs,
output_lens = get_unif_dist_lengths(kwargs["output_min"],
kwargs["output_max"], num_reqs,
root_args.random_seed)

max_input_len = max(input_lens)
@@ -131,15 +150,18 @@
text_dataset_dump(
input_lens, input_ids, output_lens, task_ids, {
"workload_type": "token-unif-dist",
"input_min": kwargs['input_min'],
"input_max": kwargs['input_max'],
"output_min": kwargs['output_min'],
"output_max": kwargs['output_max'],
"num_requests": kwargs['num_requests'],
"input_min": kwargs["input_min"],
"input_max": kwargs["input_max"],
"output_min": kwargs["output_min"],
"output_max": kwargs["output_max"],
"num_requests": kwargs["num_requests"],
"tokenize_vocabsize": root_args.tokenizer.vocab_size,
"max_input_len": max_input_len,
"max_output_len": max_output_len
}, root_args.output)
"max_output_len": max_output_len,
},
root_args.output,
root_args.output_format,
)
else:
print_text_dataset(
input_ids,
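Both synthetic-data commands now forward root_args.output_format into text_dataset_dump alongside root_args.output. The helper itself is not part of this diff; the sketch below only illustrates how such a dump function could branch on the format argument, and its field names and file layouts are assumptions rather than the repository's actual implementation:

```python
import json
from typing import Any, Dict, List


# Hypothetical stand-in for text_dataset_dump; layouts and field names are
# assumptions, not the real benchmark schemas.
def dump_dataset(input_lens: List[int],
                 input_ids: List[List[int]],
                 output_lens: List[int],
                 task_ids: List[int],
                 metadata: Dict[str, Any],
                 output_path: str,
                 output_format: str = "gptManagerBenchmark") -> None:
    if output_format == "trtllm-bench":
        # Assumed layout: one JSON object per request, line-delimited.
        with open(output_path, "w") as f:
            for ids, out_len, task_id in zip(input_ids, output_lens, task_ids):
                f.write(json.dumps({"input_ids": ids,
                                    "output_tokens": out_len,
                                    "task_id": task_id}) + "\n")
    else:
        # Assumed layout: one JSON document with metadata plus all samples.
        samples = [{"input_ids": ids,
                    "input_len": in_len,
                    "output_len": out_len,
                    "task_id": task_id}
                   for ids, in_len, out_len, task_id
                   in zip(input_ids, input_lens, output_lens, task_ids)]
        with open(output_path, "w") as f:
            json.dump({"metadata": metadata, "samples": samples}, f)
```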