Reify structured output chunks. Move JSON parsing to the depths of Completion

romanrizzi · romanrizzi · commit 2ad1d65e85ca · 2025-05-05T10:56:01.000-03:00
diff --git a/lib/completions/endpoints/base.rb b/lib/completions/endpoints/base.rb
@@ -106,6 +106,18 @@ def perform_completion!(
 
           prompt = dialect.translate
 
+          structured_output = nil
+
+          if model_params[:response_format].present?
+            response_structure =
+              model_params[:response_format].dig(:json_schema, :schema, :required)
+
+            if response_structure.present?
+              structured_output =
+                DiscourseAi::Completions::StructuredOutput.new(response_structure.map(&:to_sym))
+            end
+          end
+
           FinalDestination::HTTP.start(
             model_uri.host,
             model_uri.port,
@@ -140,10 +152,17 @@ def perform_completion!(
               xml_stripper =
                 DiscourseAi::Completions::XmlTagStripper.new(to_strip) if to_strip.present?
 
-              if @streaming_mode && xml_stripper
+              if @streaming_mode
                 blk =
                   lambda do |partial, cancel|
-                    partial = xml_stripper << partial if partial.is_a?(String)
+                    if partial.is_a?(String)
+                      partial = xml_stripper << partial if xml_stripper
+
+                      if structured_output.present?
+                        structured_output << partial
+                        partial = structured_output
+                      end
+                    end
                     orig_blk.call(partial, cancel) if partial
                   end
               end
@@ -167,6 +186,7 @@ def perform_completion!(
                     xml_stripper: xml_stripper,
                     partials_raw: partials_raw,
                     response_raw: response_raw,
+                    structured_output: structured_output,
                   )
                 return response_data
               end
@@ -373,7 +393,8 @@ def non_streaming_response(
           xml_tool_processor:,
           xml_stripper:,
           partials_raw:,
-          response_raw:
+          response_raw:,
+          structured_output:
         )
           response_raw << response.read_body
           response_data = decode(response_raw)
@@ -403,6 +424,26 @@ def non_streaming_response(
 
           response_data.reject!(&:blank?)
 
+          if structured_output.present?
+            has_string_response = false
+
+            response_data =
+              response_data.reduce([]) do |memo, data|
+                if data.is_a?(String)
+                  structured_output << data
+                  has_string_response = true
+                  next(memo)
+                else
+                  memo << data
+                end
+
+                memo
+              end
+
+            # We only include the structured output if there was actually a structured response
+            response_data << structured_output if has_string_response
+          end
+
           # this is to keep stuff backwards compatible
           response_data = response_data.first if response_data.length == 1
 
diff --git a/lib/completions/endpoints/canned_response.rb b/lib/completions/endpoints/canned_response.rb
@@ -40,7 +40,7 @@ def perform_completion!(
                   "The number of completions you requested exceed the number of canned responses"
           end
 
-          response = transform_from_schema(response) if model_params[:response_format].present?
+          response = as_structured_output(response) if model_params[:response_format].present?
 
           raise response if response.is_a?(StandardError)
 
@@ -56,6 +56,8 @@ def perform_completion!(
                 yield(response, cancel_fn)
               elsif is_thinking?(response)
                 yield(response, cancel_fn)
+              elsif is_structured_output?(response)
+                yield(response, cancel_fn)
               else
                 response.each_char do |char|
                   break if cancelled
@@ -83,11 +85,18 @@ def is_tool?(response)
           response.is_a?(DiscourseAi::Completions::ToolCall)
         end
 
-        def transform_from_schema(response)
-          key = model_params[:response_format].dig(:json_schema, :schema, :properties)&.keys&.first
-          return response if key.nil?
+        def is_structured_output?(response)
+          response.is_a?(DiscourseAi::Completions::StructuredOutput)
+        end
+
+        def as_structured_output(response)
+          keys = model_params[:response_format].dig(:json_schema, :schema, :properties)&.keys
+          return response if keys.blank?
+
+          output = DiscourseAi::Completions::StructuredOutput.new(keys)
+          output << { keys.first => response }.to_json
 
-          { key => response }.to_json
+          output
         end
       end
     end
diff --git a/lib/completions/structured_output.rb b/lib/completions/structured_output.rb
@@ -0,0 +1,181 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module Completions
+    # Incremental extractor for the top-level string properties of a JSON
+    # object that arrives as a stream of raw text chunks.
+    #
+    # Feed chunks with #<<. After each call, #last_chunk_output holds only the
+    # text decoded from that chunk and #full_output holds everything decoded so
+    # far, both keyed by the symbolized property names given to #initialize.
+    #
+    # Only string values are extracted: a value is recognised by its opening
+    # double quote, so numbers, booleans, null and nested structures are not
+    # supported. Values of properties that were not declared are discarded.
+    class StructuredOutput
+      # JSON single-character escapes and the characters they stand for.
+      SIMPLE_ESCAPES = {
+        '"' => '"',
+        "\\" => "\\",
+        "/" => "/",
+        "b" => "\b",
+        "f" => "\f",
+        "n" => "\n",
+        "r" => "\r",
+        "t" => "\t",
+      }.freeze
+
+      # Emitted in place of an escape that cannot be decoded.
+      REPLACEMENT_CHAR = "\uFFFD"
+      HEX_DIGIT = /\h/
+      HIGH_SURROGATES = (0xD800..0xDBFF).freeze
+      LOW_SURROGATES = (0xDC00..0xDFFF).freeze
+
+      attr_reader :full_output, :last_chunk_output
+
+      # @param property_names [Array<String, Symbol>] properties to extract.
+      def initialize(property_names)
+        @raw_response = +""
+        @state = :awaiting_key
+        @current_key = +""
+        @current_property = nil
+        @escape = false
+        @unicode_digits = +""
+        @high_surrogate = nil
+
+        # Looking declared names up by string avoids interning arbitrary keys
+        # sent by the model and tells us which keys to ignore.
+        @properties = property_names.to_h { |name| [name.to_s, name.to_sym] }
+
+        @full_output = blank_output
+        @last_chunk_output = blank_output
+      end
+
+      # Consumes the next raw chunk of the response, updating both outputs.
+      #
+      # @param raw [String] text exactly as received from the model.
+      def <<(raw)
+        @raw_response << raw
+
+        # Start a fresh per-chunk view; full_output keeps accumulating.
+        @last_chunk_output = blank_output
+
+        raw.each_char { |char| consume(char) }
+      end
+
+      private
+
+      def blank_output
+        @properties.values.to_h { |name| [name, +""] }
+      end
+
+      # Advances the state machine by a single character.
+      def consume(char)
+        case @state
+        when :awaiting_key
+          if char == '"'
+            @current_key = +""
+            @state = :parsing_key
+          end
+        when :parsing_key
+          if char == '"'
+            @current_property = @properties[@current_key]
+            @state = :awaiting_colon
+          else
+            @current_key << char
+          end
+        when :awaiting_colon
+          @state = :awaiting_value if char == ":"
+        when :awaiting_value
+          if char == '"'
+            @escape = false
+            @state = :parsing_value
+          end
+        when :parsing_value
+          consume_value_char(char)
+        when :parsing_unicode
+          consume_unicode_digit(char)
+        when :awaiting_key_or_end
+          # Closing brace and whitespace need no handling.
+          @state = :awaiting_key if char == ","
+        end
+      end
+
+      def consume_value_char(char)
+        if @escape
+          @escape = false
+
+          if char == "u"
+            @unicode_digits = +""
+            @state = :parsing_unicode
+          else
+            emit(SIMPLE_ESCAPES.fetch(char, char))
+          end
+        elsif char == "\\"
+          # Hold the backslash back until the next character says what it is.
+          @escape = true
+        elsif char == '"'
+          flush_high_surrogate
+          @state = :awaiting_key_or_end
+        else
+          emit(char)
+        end
+      end
+
+      # Collects the four hex digits of a \uXXXX escape, which may be split
+      # across chunks, and emits the decoded character once complete.
+      def consume_unicode_digit(char)
+        unless char.match?(HEX_DIGIT)
+          # Malformed escape: replace it and treat char as regular value text.
+          @state = :parsing_value
+          emit(REPLACEMENT_CHAR)
+          return consume_value_char(char)
+        end
+
+        @unicode_digits << char
+        return if @unicode_digits.length < 4
+
+        @state = :parsing_value
+        emit_codepoint(@unicode_digits.hex)
+      end
+
+      # JSON writes characters outside the BMP as a surrogate pair of two
+      # \u escapes, so a high surrogate is held until its partner arrives.
+      def emit_codepoint(codepoint)
+        if HIGH_SURROGATES.cover?(codepoint)
+          flush_high_surrogate
+          @high_surrogate = codepoint
+        elsif LOW_SURROGATES.cover?(codepoint) && @high_surrogate
+          pair = 0x10000 + ((@high_surrogate - 0xD800) << 10) + (codepoint - 0xDC00)
+          @high_surrogate = nil
+          append([pair].pack("U"))
+        elsif LOW_SURROGATES.cover?(codepoint)
+          append(REPLACEMENT_CHAR)
+        else
+          emit([codepoint].pack("U"))
+        end
+      end
+
+      # Appends decoded text, first resolving any unpaired high surrogate.
+      def emit(text)
+        flush_high_surrogate
+        append(text)
+      end
+
+      def flush_high_surrogate
+        return unless @high_surrogate
+
+        @high_surrogate = nil
+        append(REPLACEMENT_CHAR)
+      end
+
+      # Values of undeclared properties are dropped instead of raising.
+      def append(text)
+        return unless @current_property
+
+        @full_output[@current_property] << text
+        @last_chunk_output[@current_property] << text
+      end
+    end
+  end
+end
diff --git a/lib/personas/bot.rb b/lib/personas/bot.rb
@@ -151,6 +151,8 @@ def reply(context, llm_args: {}, &update_blk)
                       raw_context << partial
                       current_thinking << partial
                     end
+                  elsif partial.is_a?(DiscourseAi::Completions::StructuredOutput)
+                    update_blk.call(partial.last_chunk_output, cancel, nil, :structured_output)
                   else
                     update_blk.call(partial, cancel)
                   end
diff --git a/lib/summarization/fold_content.rb b/lib/summarization/fold_content.rb
@@ -27,9 +27,7 @@ def initialize(bot, strategy, persist_summaries: true)
       def summarize(user, &on_partial_blk)
         truncated_content = content_to_summarize.map { |cts| truncate(cts) }
 
-        # Done here to cover non-streaming mode.
-        json_reply_end = "\"}"
-        summary = fold(truncated_content, user, &on_partial_blk).chomp(json_reply_end)
+        summary = fold(truncated_content, user, &on_partial_blk)
 
         if persist_summaries
           AiSummary.store!(strategy, llm_model, summary, truncated_content, human: user&.human?)
@@ -113,67 +111,24 @@ def fold(items, user, &on_partial_blk)
 
         summary = +""
 
-        # Auxiliary variables to get the summary content from the JSON response.
-        json_start_buffer = +""
-        json_start_found = false
-        # { is optional because Claude uses prefill, so it's not incldued.
-        # TODO(roman): Maybe extraction should happen in the bot?
-        json_summary_schema_keys = bot.persona.response_format&.first.to_h
-        json_reply_start_regex = /\{?\s*"#{json_summary_schema_keys[:key]}"\s*:\s*"/
-        # We need to buffer escaped newlines as the API likes to send \\ and n in different chunks.
-        partial_unescape_buffer = +""
-        unescape_regex = %r{\\(["/bfnrt])}
-        json_reply_end = "\"}"
-
         buffer_blk =
           Proc.new do |partial, cancel, _, type|
-            if type.blank?
-              if bot.returns_json?
-                # Extract summary from JSON.
-                if json_start_found
-                  if partial.end_with?("\\")
-                    partial_unescape_buffer << partial
-                  else
-                    unescaped_partial = partial_unescape_buffer
-
-                    buffered_newline = !partial_unescape_buffer.empty? && partial.first == "n"
-                    if buffered_newline
-                      unescaped_partial << partial.first
-
-                      unescaped_partial = unescaped_partial.gsub("\\n", "\n")
-                      unescaped_partial << partial[1..].to_s
-                    else
-                      unescaped_partial << partial.gsub("\\n", "\n")
-                    end
-                    partial_unescape_buffer = +""
-
-                    summary << unescaped_partial
-
-                    on_partial_blk.call(unescaped_partial, cancel) if on_partial_blk
-                  end
-                else
-                  json_start_buffer << partial
-
-                  if json_start_buffer.match?(json_reply_start_regex)
-                    buffered_start = json_start_buffer.gsub(json_reply_start_regex, "")
-                    summary << buffered_start
-
-                    on_partial_blk.call(buffered_start, cancel) if on_partial_blk
-
-                    json_start_found = true
-                  end
-                end
-              else
-                # Assume response is a regular completion.
-                summary << partial
-                on_partial_blk.call(partial, cancel) if on_partial_blk
-              end
+            if type == :structured_output
+              json_summary_schema_key = bot.persona.response_format&.first.to_h
+              partial_summary = partial[json_summary_schema_key[:key].to_sym]
+
+              summary << partial_summary
+              on_partial_blk.call(partial_summary, cancel) if on_partial_blk
+            elsif type.blank?
+              # Assume response is a regular completion.
+              summary << partial
+              on_partial_blk.call(partial, cancel) if on_partial_blk
             end
           end
 
         bot.reply(context, &buffer_blk)
 
-        summary.chomp(json_reply_end)
+        summary
       end
 
       def available_tokens
diff --git a/spec/lib/completions/endpoints/anthropic_spec.rb b/spec/lib/completions/endpoints/anthropic_spec.rb
@@ -837,15 +837,15 @@
         },
       ).to_return(status: 200, body: body)
 
-      result = +""
+      structured_output = nil
       llm.generate(
         prompt,
         user: Discourse.system_user,
         feature_name: "testing",
         response_format: schema,
-      ) { |partial, cancel| result << partial }
+      ) { |partial, cancel| structured_output = partial }
 
-      expect(result).to eq("\"key\":\"Hello!\"}")
+      expect(structured_output.full_output).to eq({ key: "Hello!" })
 
       expected_body = {
         model: "claude-3-opus-20240229",
diff --git a/spec/lib/completions/endpoints/aws_bedrock_spec.rb b/spec/lib/completions/endpoints/aws_bedrock_spec.rb
@@ -591,9 +591,9 @@ def encode_message(message)
           end
           .to_return(status: 200, body: messages)
 
-        response = +""
+        structured_output = nil
         proxy.generate("hello world", response_format: schema, user: user) do |partial|
-          response << partial
+          structured_output = partial
         end
 
         expected = {
@@ -607,7 +607,7 @@ def encode_message(message)
         }
         expect(JSON.parse(request.body)).to eq(expected)
 
-        expect(response).to eq("\"key\":\"Hello!\"}")
+        expect(structured_output.full_output).to eq({ key: "Hello!" })
       end
     end
   end
diff --git a/spec/lib/completions/endpoints/cohere_spec.rb b/spec/lib/completions/endpoints/cohere_spec.rb
diff --git a/spec/lib/completions/endpoints/gemini_spec.rb b/spec/lib/completions/endpoints/gemini_spec.rb
diff --git a/spec/lib/completions/structured_output_spec.rb b/spec/lib/completions/structured_output_spec.rb