# Add Anthropic prompt caching support with CachePoint #3363
New example: `pydantic_ai_examples/anthropic_prompt_caching.py` (new file, `@@ -0,0 +1,152 @@`):

```python
#!/usr/bin/env python3
"""Example demonstrating Anthropic prompt caching.

This example shows how to use CachePoint to reduce costs by caching:
- Long system prompts
- Large context (like documentation)
- Tool definitions

Run with: uv run -m pydantic_ai_examples.anthropic_prompt_caching
"""

from pydantic_ai import Agent, CachePoint

# Sample long context to demonstrate caching
# Need at least 1024 tokens - repeating 10x to be safe
LONG_CONTEXT = (
    """
# Product Documentation

## Overview
Our API provides comprehensive data access with the following features:

### Authentication
All requests require a Bearer token in the Authorization header.
Rate limits: 1000 requests/hour for standard tier.

### Endpoints

#### GET /api/users
Returns a list of users with pagination support.
Parameters:
- page: Page number (default: 1)
- limit: Items per page (default: 20, max: 100)
- filter: Optional filter expression

#### GET /api/products
Returns product catalog with detailed specifications.
Parameters:
- category: Filter by category
- in_stock: Boolean, filter available items
- sort: Sort order (price_asc, price_desc, name)

#### POST /api/orders
Create a new order. Requires authentication.
Request body:
- user_id: Integer, required
- items: Array of {product_id, quantity}
- shipping_address: Object with address details

#### Error Handling
Standard HTTP status codes are used:
- 200: Success
- 400: Bad request
- 401: Unauthorized
- 404: Not found
- 500: Server error

## Best Practices
1. Always handle rate limiting with exponential backoff
2. Cache responses where appropriate
3. Use pagination for large datasets
4. Validate input before submission
5. Monitor API usage through dashboard

## Code Examples
See detailed examples in our GitHub repository.
"""
    * 10
)  # Repeat 10x to ensure we exceed Anthropic's minimum cache size (1024 tokens)


async def main() -> None:
    """Demonstrate prompt caching with Anthropic."""
    print('=== Anthropic Prompt Caching Demo ===\n')

    agent = Agent(
        'anthropic:claude-sonnet-4-5',
        system_prompt='You are a helpful API documentation assistant.',
    )

    # First request with cache point - this will write to cache
    print('First request (will cache context)...')
    result1 = await agent.run(
        [
            LONG_CONTEXT,
            CachePoint(),  # Everything before this will be cached
            'What authentication method does the API use?',
        ]
    )

    print(f'Response: {result1.output}\n')
    usage1 = result1.usage()
    print(f'Usage: {usage1}')
    if usage1.cache_write_tokens:
        print(
            f'  Cache write tokens: {usage1.cache_write_tokens} (tokens written to cache)'
        )
    print()

    # Second request with same cached context - should use cache
    print('Second request (should read from cache)...')
    result2 = await agent.run(
        [
            LONG_CONTEXT,
            CachePoint(),  # Same content, should hit cache
            'What are the available API endpoints?',
        ]
    )

    print(f'Response: {result2.output}\n')
    usage2 = result2.usage()
    print(f'Usage: {usage2}')
    if usage2.cache_read_tokens:
        print(
            f'  Cache read tokens: {usage2.cache_read_tokens} (tokens read from cache)'
        )
        print(
            f'  Cache savings: ~{usage2.cache_read_tokens * 0.9:.0f} token-equivalents (90% discount)'
        )
    print()

    # Third request with different question, same cache
    print('Third request (should also read from cache)...')
    result3 = await agent.run(
        [
            LONG_CONTEXT,
            CachePoint(),
            'How should I handle rate limiting?',
        ]
    )

    print(f'Response: {result3.output}\n')
    usage3 = result3.usage()
    print(f'Usage: {usage3}')
    if usage3.cache_read_tokens:
        print(f'  Cache read tokens: {usage3.cache_read_tokens}')
    print()

    print('=== Summary ===')
    total_usage = usage1 + usage2 + usage3
    print(f'Total input tokens: {total_usage.input_tokens}')
    print(f'Total cache write: {total_usage.cache_write_tokens}')
    print(f'Total cache read: {total_usage.cache_read_tokens}')
    if total_usage.cache_read_tokens:
        savings = total_usage.cache_read_tokens * 0.9
        print(f'Estimated savings: ~{savings:.0f} token-equivalents')


if __name__ == '__main__':
    import asyncio

    asyncio.run(main())
```
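A quick sanity check on the `* 0.9` savings figure printed above, assuming Anthropic's published ephemeral-cache pricing (cache reads bill at 0.1x the base input rate, cache writes at 1.25x) and an illustrative context size:

```python
# Illustrative numbers only - the real size of LONG_CONTEXT depends on the tokenizer.
cached_tokens = 20_000                 # assumed token count of LONG_CONTEXT
read_cost = cached_tokens * 0.1        # token-equivalents paid on a cache hit
savings = cached_tokens - read_cost    # 18_000 token-equivalents, i.e. 90%
print(f'~{savings:.0f} token-equivalents saved ({savings / cached_tokens:.0%})')
```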
In the messages module, `CachePoint` is added as a new user-content part:

```diff
@@ -612,8 +612,20 @@ def __init__(
             raise ValueError('`BinaryImage` must be have a media type that starts with "image/"')  # pragma: no cover
 
 
+@dataclass
+class CachePoint:
+    """A cache point marker for prompt caching.
+
+    Can be inserted into UserPromptPart.content to mark cache boundaries.
+    Models that don't support caching will filter these out.
+    """
+
+    kind: Literal['cache-point'] = 'cache-point'
+    """Type identifier, this is available on all parts as a discriminator."""
+
+
 MultiModalContent = ImageUrl | AudioUrl | DocumentUrl | VideoUrl | BinaryContent
-UserContent: TypeAlias = str | MultiModalContent
+UserContent: TypeAlias = str | MultiModalContent | CachePoint
 
 
 @dataclass(repr=False)
```

And `otel_message_parts` skips the marker, since it carries no content:

```diff
@@ -730,6 +742,9 @@ def otel_message_parts(self, settings: InstrumentationSettings) -> list[_otel_me
                 if settings.include_content and settings.include_binary_content:
                     converted_part['content'] = base64.b64encode(part.data).decode()
                 parts.append(converted_part)
+            elif isinstance(part, CachePoint):
+                # CachePoint is a marker, not actual content - skip it for otel
+                pass
             else:
                 parts.append({'type': part.kind})  # pragma: no cover
         return parts
```
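The docstring's contract ("models that don't support caching will filter these out") amounts to something like the following sketch. `drop_cache_points` is a hypothetical helper for illustration, not part of the diff:

```python
from pydantic_ai.messages import CachePoint, UserContent


def drop_cache_points(content: list[UserContent]) -> list[UserContent]:
    """Providers without prompt caching treat the marker as a no-op."""
    return [item for item in content if not isinstance(item, CachePoint)]


assert drop_cache_points(['ctx', CachePoint(), 'question']) == ['ctx', 'question']
```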
In the Anthropic model module, the new marker and the beta cache-control param are imported:

```diff
@@ -19,6 +19,7 @@
     BinaryContent,
     BuiltinToolCallPart,
     BuiltinToolReturnPart,
+    CachePoint,
     DocumentUrl,
     FilePart,
     FinishReason,
```

```diff
@@ -58,6 +59,7 @@
 from anthropic.types.beta import (
     BetaBase64PDFBlockParam,
     BetaBase64PDFSourceParam,
+    BetaCacheControlEphemeralParam,
     BetaCitationsDelta,
     BetaCodeExecutionTool20250522Param,
     BetaCodeExecutionToolResultBlock,
```
When mapping a user prompt, a `CachePoint` is consumed by attaching cache control to the preceding content block instead of being appended:

```diff
@@ -477,7 +479,10 @@ async def _map_message(  # noqa: C901
                 system_prompt_parts.append(request_part.content)
             elif isinstance(request_part, UserPromptPart):
                 async for content in self._map_user_prompt(request_part):
-                    user_content_params.append(content)
+                    if isinstance(content, CachePoint):
+                        self._add_cache_control_to_last_param(user_content_params)
+                    else:
+                        user_content_params.append(content)
             elif isinstance(request_part, ToolReturnPart):
                 tool_result_block_param = BetaToolResultBlockParam(
                     tool_use_id=_guard_tool_call_id(t=request_part),
```
```diff
@@ -639,10 +644,27 @@ async def _map_message(  # noqa: C901
         system_prompt = '\n\n'.join(system_prompt_parts)
         return system_prompt, anthropic_messages
 
+    @staticmethod
+    def _add_cache_control_to_last_param(params: list[BetaContentBlockParam]) -> None:
+        """Add cache control to the last content block param."""
+        if not params:
+            raise UserError(
+                'CachePoint cannot be the first content in a user message - there must be previous content to attach the CachePoint to.'
+            )
+
+        # Only certain types support cache_control
+        cacheable_types = {'text', 'tool_use', 'server_tool_use', 'image', 'tool_result'}
+        last_param = cast(dict[str, Any], params[-1])  # Cast to dict for mutation
+        if last_param['type'] not in cacheable_types:
+            raise UserError(f'Cache control not supported for param type: {last_param["type"]}')
+
+        # Add cache_control to the last param
+        last_param['cache_control'] = BetaCacheControlEphemeralParam(type='ephemeral')
+
     @staticmethod
     async def _map_user_prompt(
         part: UserPromptPart,
-    ) -> AsyncGenerator[BetaContentBlockParam]:
+    ) -> AsyncGenerator[BetaContentBlockParam | CachePoint]:
         if isinstance(part.content, str):
             if part.content:  # Only yield non-empty text
                 yield BetaTextBlockParam(text=part.content, type='text')
```

Review comments on this hunk:

> **Collaborator** (on the empty-`params` error): Copying in context from https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#what-can-be-cached: I think we should support inserting a cache point after tool defs and system messages as well. In the original PR I suggested doing this by supporting
>
> What do you think about adding
>
> **Author**: Seems reasonable, I'll look into it!

> **Collaborator** (on `cacheable_types`): Can you please link to the doc this came from?

```diff
@@ -651,6 +673,8 @@ async def _map_user_prompt(
             if isinstance(item, str):
                 if item:  # Only yield non-empty text
                     yield BetaTextBlockParam(text=item, type='text')
+            elif isinstance(item, CachePoint):
+                yield item
             elif isinstance(item, BinaryContent):
                 if item.is_image:
                     yield BetaImageBlockParam(
```
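For context on the thread above: in Anthropic's raw Messages API, a `cache_control` breakpoint can also sit on the last tool definition or on a system block, which is roughly what a cache point for tool defs and instructions would map to. A minimal sketch against the `anthropic` SDK (the model id and tool are illustrative, not from this PR):

```python
import anthropic

client = anthropic.Anthropic()
response = client.messages.create(
    model='claude-sonnet-4-5',
    max_tokens=1024,
    tools=[
        {
            'name': 'get_user',
            'description': 'Look up a user by id.',
            'input_schema': {'type': 'object', 'properties': {'user_id': {'type': 'integer'}}},
            'cache_control': {'type': 'ephemeral'},  # caches all tool defs up to here
        }
    ],
    system=[
        {
            'type': 'text',
            'text': 'Long, stable instructions...',
            'cache_control': {'type': 'ephemeral'},  # caches tools + system prefix
        }
    ],
    messages=[{'role': 'user', 'content': 'Hi'}],
)
```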
Usage mapping needs no new extraction logic, only a clarifying note:

```diff
@@ -717,6 +741,8 @@ def _map_usage(
         key: value for key, value in response_usage.model_dump().items() if isinstance(value, int)
     }
 
+    # Note: genai-prices already extracts cache_creation_input_tokens and cache_read_input_tokens
+    # from the Anthropic response and maps them to cache_write_tokens and cache_read_tokens
     return usage.RequestUsage.extract(
         dict(model=model, usage=details),
         provider=provider,
```
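Assuming genai-prices does the renaming the note describes, the round trip from the raw Anthropic usage block to the fields printed in the example looks like this (values illustrative):

```python
# Raw Anthropic usage on a cache hit:
anthropic_usage = {
    'input_tokens': 42,                  # uncached prefix + new question
    'cache_creation_input_tokens': 0,    # nothing new written this turn
    'cache_read_input_tokens': 21_000,   # LONG_CONTEXT served from cache
    'output_tokens': 180,
}
# After RequestUsage.extract(...) these surface as:
#   usage.cache_write_tokens == 0
#   usage.cache_read_tokens == 21_000
```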
> **Review comment** (on the example file): Can we add a more basic example to the Anthropic docs, and drop this?