
Commit a83d122

Streaming working
Accuracy on BFCL simple and multiple as in unary
1 parent: 34925ad

24 files changed: +509 -105 lines changed

src/llm/apis/openai_completions.cpp

Lines changed: 2 additions & 2 deletions
@@ -760,9 +760,9 @@ ParsedOutput OpenAIChatCompletionsHandler::parseOutputIfNeeded(const std::vector
     OVMS_PROFILE_FUNCTION();
     ParsedOutput parsedOutput;
     if (endpoint != Endpoint::CHAT_COMPLETIONS || outputParser == nullptr) {
-        parsedOutput.content = tokenizer.decode(generatedIds);
+        parsedOutput.content = this->tokenizer.decode(generatedIds);
     } else {
-        parsedOutput = outputParser->parse(generatedIds, areToolsAvailable(), this->request.toolNameSchemaMap);
+        parsedOutput = outputParser->parse(generatedIds, this->areToolsAvailable());
     }
     return parsedOutput;
 }

src/llm/apis/openai_completions.hpp

Lines changed: 2 additions & 1 deletion
@@ -86,8 +86,9 @@ class OpenAIChatCompletionsHandler {
         endpoint(endpoint),
         created(creationTime),
         tokenizer(tokenizer) {
+        // FIXME: we should delay creating the output parser until we have the request with toolNameSchemaMap parsed
         if (!toolParserName.empty() || !reasoningParserName.empty()) {
-            outputParser = std::make_unique<OutputParser>(tokenizer, toolParserName, reasoningParserName);
+            outputParser = std::make_unique<OutputParser>(tokenizer, toolParserName, reasoningParserName, this->request.toolNameSchemaMap);
         }
     }
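The FIXME above stems from the schema map being forwarded at construction time, before the request body (and therefore toolNameSchemaMap) has been parsed. Below is a minimal, standalone sketch of why this can still work if the parser keeps a const reference to the map rather than copying it; SchemaHolder and the std::map used as a stand-in for ToolsSchemas_t are hypothetical names for illustration only, not code from this repository:

#include <cstddef>
#include <map>
#include <string>

// Stand-in for ToolsSchemas_t; the real alias lives in the OVMS headers.
using ToolsSchemasSketch_t = std::map<std::string, std::string>;

// Stand-in for a parser that receives the schema map at construction time.
struct SchemaHolder {
    const ToolsSchemasSketch_t& schemas;  // reference, not a copy
    explicit SchemaHolder(const ToolsSchemasSketch_t& s) : schemas(s) {}
    std::size_t visibleSize() const { return schemas.size(); }
};

int main() {
    ToolsSchemasSketch_t toolNameSchemaMap;                       // empty when the handler is constructed
    SchemaHolder parser(toolNameSchemaMap);                       // schema map wired in up front
    toolNameSchemaMap["get_weather"] = R"({"type":"object"})";    // filled later, when the request is parsed
    return parser.visibleSize() == 1 ? 0 : 1;                     // the later insertion is visible through the reference
}

If the concrete parser copies the map instead of holding a reference, the FIXME is real: construction would capture an empty map, which is why delaying parser creation would matter.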

src/llm/io_processing/base_output_parser.hpp

Lines changed: 1 addition & 1 deletion
@@ -93,7 +93,7 @@ class BaseOutputParser {
     // Parse model output and extract relevant information to parsedOutput fields. Raw generated tokens are provided as an argument.
     // Additionally, parsedOutput.content is already filled with decoded content when this method is called, enabling a chain of parsing.
     // The parser is also responsible for removing the extracted part from parsedOutput.content if necessary.
-    virtual void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens, const ToolsSchemas_t& toolNameSchemaMap) = 0;
+    virtual void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) = 0;

     // Parse a model output chunk in streaming mode. If, as a result of processing the chunk, we cannot produce a meaningful response, we return std::nullopt.
     // Otherwise we return a JSON object containing the delta that conforms to the OpenAI API.
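For orientation, here is a minimal sketch of what a subclass looks like under the narrowed virtual interface; ParsedOutputSketch and NoopParserSketch are simplified stand-ins for the real ovms::ParsedOutput and BaseOutputParser types, not code from this repository:

#include <cstdint>
#include <string>
#include <vector>

// Simplified stand-in for ovms::ParsedOutput.
struct ParsedOutputSketch {
    std::string content;                 // decoded text, pre-filled before parse() is called
    std::vector<std::string> toolCalls;  // extracted tool calls, removed from content by the parser
};

// Simplified stand-in for a BaseOutputParser subclass using the new signature.
class NoopParserSketch {
public:
    virtual ~NoopParserSketch() = default;
    // The schema map is no longer threaded through parse(); parsers that need it
    // are expected to receive it at construction time instead.
    virtual void parse(ParsedOutputSketch& parsedOutput, const std::vector<int64_t>& generatedTokens) {
        (void)generatedTokens;  // a real parser would scan the tokens and/or parsedOutput.content here
    }
};

int main() {
    ParsedOutputSketch out;
    out.content = "plain answer with no tool call";
    std::vector<int64_t> tokens{1, 2, 3};
    NoopParserSketch parser;
    parser.parse(out, tokens);  // two-argument call matching the new pure-virtual signature
    return out.content.empty() ? 1 : 0;
}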

src/llm/io_processing/hermes3/tool_parser.cpp

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@

 namespace ovms {

-void Hermes3ToolParser::parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens, const ToolsSchemas_t&) {
+void Hermes3ToolParser::parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) {
     const std::string startTag = "<tool_call>";
     const std::string endTag = "</tool_call>";
     std::vector<std::string> tools;

src/llm/io_processing/hermes3/tool_parser.hpp

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ class Hermes3ToolParser : public BaseOutputParser {
     explicit Hermes3ToolParser(ov::genai::Tokenizer& tokenizer) :
         BaseOutputParser(tokenizer) {}

-    void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens, const ToolsSchemas_t&) override;
+    void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) override;
     std::optional<rapidjson::Document> parseChunk(const std::string& chunk, ov::genai::GenerationFinishReason finishReason) override;
     const std::string& getParsingStartTag() const override {
         return parsingStartTag;

src/llm/io_processing/llama3/tool_parser.cpp

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@
 #include "../utils.hpp"

 namespace ovms {
-void Llama3ToolParser::parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens, const ToolsSchemas_t&) {
+void Llama3ToolParser::parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) {
     // TODO: check if we can rely on decoded <|python_tag|> token to be present in the content, so we can drop multiple detokenizations and copies
     // and just extract substrings from the content and modify content in-place

src/llm/io_processing/llama3/tool_parser.hpp

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@ class Llama3ToolParser : public BaseOutputParser {
     const std::string parsingEndTag = "";

     // Id of the <|python_tag|>, which is a special token used to indicate the start of tool calls
-    static const int64_t botTokenId = 128010;
+    int64_t botTokenId = 128010;
     // ";" is used as a separator between tool calls in the response
     std::string separator = ";";

@@ -59,7 +59,7 @@ class Llama3ToolParser : public BaseOutputParser {
     explicit Llama3ToolParser(ov::genai::Tokenizer& tokenizer) :
         BaseOutputParser(tokenizer) {}

-    void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens, const ToolsSchemas_t&) override;
+    void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) override;
     std::optional<rapidjson::Document> parseChunk(const std::string& chunk, ov::genai::GenerationFinishReason finishReason) override;
     const std::string& getParsingStartTag() const override {
         return parsingStartTag;

src/llm/io_processing/mistral/tool_parser.cpp

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@

 namespace ovms {

-void MistralToolParser::parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens, const ToolsSchemas_t&) {
+void MistralToolParser::parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) {
     std::vector<std::string> tools;

     if (parsedOutput.content.empty() || generatedTokens.size() <= 0) {

src/llm/io_processing/mistral/tool_parser.hpp

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ class MistralToolParser : public BaseOutputParser {
     explicit MistralToolParser(ov::genai::Tokenizer& tokenizer) :
         BaseOutputParser(tokenizer) {}

-    void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens, const ToolsSchemas_t&) override;
+    void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) override;
     std::optional<rapidjson::Document> parseChunk(const std::string& chunk, ov::genai::GenerationFinishReason finishReason) override;
     const std::string& getParsingStartTag() const override {
         static const std::string toolCallStartTag = "[TOOL_CALLS]";

src/llm/io_processing/output_parser.cpp

Lines changed: 7 additions & 6 deletions
@@ -143,8 +143,9 @@ std::optional<rapidjson::Document> OutputParser::parseReasoningChunk(ov::genai::
     return result;
 }

-OutputParser::OutputParser(ov::genai::Tokenizer& tokenizer, const std::string toolParserName, const std::string reasoningParserName) :
+OutputParser::OutputParser(ov::genai::Tokenizer& tokenizer, const std::string toolParserName, const std::string reasoningParserName, const ToolsSchemas_t& toolNameSchemaMap) :
     tokenizer(tokenizer) {
+    SPDLOG_ERROR("OutputParser created with toolNameSchemaMap of size: {}", toolNameSchemaMap.size());
     if (toolParserName == "llama3") {
         toolParser = std::make_unique<Llama3ToolParser>(tokenizer);
     } else if (toolParserName == "hermes3") {
@@ -156,7 +157,7 @@ OutputParser::OutputParser(ov::genai::Tokenizer& tokenizer, const std::string to
     } else if (toolParserName == "gptoss") {
         toolParser = std::make_unique<GptOssToolParser>(tokenizer);
     } else if (toolParserName == "qwen3coder") {
-        toolParser = std::make_unique<Qwen3CoderToolParser>(tokenizer);
+        toolParser = std::make_unique<Qwen3CoderToolParser>(tokenizer, toolNameSchemaMap);
     } else if (!toolParserName.empty()) {
         throw std::runtime_error("Unsupported tool parser: " + toolParserName);
     }
@@ -201,7 +202,7 @@ std::string OutputParser::getToolParserStartTag() const {
     }
 }

-ParsedOutput OutputParser::parse(const std::vector<int64_t>& generatedTokens, const bool toolsAvailable, const ToolsSchemas_t& toolNameSchemaMap) {
+ParsedOutput OutputParser::parse(const std::vector<int64_t>& generatedTokens, const bool toolsAvailable) {
     // Model output is processed by the chain of parsers. Each parser extracts the relevant part of the output and fills the ParsedOutput structure.
     // At the beginning, the content field of ParsedOutput is already filled with decoded content from generatedTokens.
     // When a parser extracts relevant information, it should remove it from the content field, so we don't duplicate it in the final output.
@@ -212,11 +213,11 @@ ParsedOutput OutputParser::parse(const std::vector<int64_t>& generatedTokens, co
     ParsedOutput parsedOutput;
     parsedOutput.content = tokenizer.decode(generatedTokens);
     if (reasoningParser) {
-        reasoningParser->parse(parsedOutput, generatedTokens, toolNameSchemaMap);
+        reasoningParser->parse(parsedOutput, generatedTokens);
     }
     // We run the tool parser only if the parser is available and tools have been provided in the request.
     if (toolParser && toolsAvailable) {
-        toolParser->parse(parsedOutput, generatedTokens, toolNameSchemaMap);
+        toolParser->parse(parsedOutput, generatedTokens);
     }
     return parsedOutput;
 }
@@ -231,7 +232,7 @@ std::optional<rapidjson::Document> OutputParser::parseChunk(const std::string& c
     */

     bool reasoningParserExistsAndSupportsStreaming = reasoningParser && !reasoningParser->getParsingStartTag().empty() && !reasoningParser->getParsingEndTag().empty();
-    bool toolParserExistsAndSupportsStreaming = toolParser && !toolParser->getParsingStartTag().empty();
+    bool toolParserExistsAndSupportsStreaming = toolParser && !toolParser->getParsingStartTag().empty();  // FIXME: why not also check that parsingEndTag is not empty?
     bool applyToolParser = toolParserExistsAndSupportsStreaming && toolsAvailable;

     if (applyToolParser && toolParser->isImmediateParsingEnabled() && processingPhase == UNKNOWN) {
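To make the overall shape of this hunk explicit: the schema map now travels once, at construction, and only to the parser that needs it (qwen3coder in the real code), while parse() stays schema-free. Below is a self-contained sketch of that wiring using hypothetical names (makeParser, SchemaAwareParserSketch, PlainParserSketch); it illustrates the pattern only and is not the OVMS implementation:

#include <iostream>
#include <map>
#include <memory>
#include <stdexcept>
#include <string>

using ToolsSchemasSketch_t = std::map<std::string, std::string>;

struct ToolParserSketch {
    virtual std::string parse(const std::string& raw) = 0;  // schema-free, as in the new interface
    virtual ~ToolParserSketch() = default;
};

struct SchemaAwareParserSketch : ToolParserSketch {
    const ToolsSchemasSketch_t& schemas;  // captured at construction time
    explicit SchemaAwareParserSketch(const ToolsSchemasSketch_t& s) : schemas(s) {}
    std::string parse(const std::string& raw) override {
        return raw + " (checked against " + std::to_string(schemas.size()) + " schemas)";
    }
};

struct PlainParserSketch : ToolParserSketch {
    std::string parse(const std::string& raw) override { return raw; }
};

// Factory mirroring the by-name dispatch in the constructor: only the schema-aware
// branch receives the map, every other branch is constructed without it.
std::unique_ptr<ToolParserSketch> makeParser(const std::string& name, const ToolsSchemasSketch_t& schemas) {
    if (name == "schema_aware")
        return std::make_unique<SchemaAwareParserSketch>(schemas);
    if (name == "plain")
        return std::make_unique<PlainParserSketch>();
    throw std::runtime_error("Unsupported tool parser: " + name);
}

int main() {
    ToolsSchemasSketch_t schemas{{"get_weather", R"({"type":"object"})"}};
    auto parser = makeParser("schema_aware", schemas);
    std::cout << parser->parse("<tool_call>...</tool_call>") << "\n";
}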
