9 changes: 8 additions & 1 deletion src/locales/en-US/models.ts
@@ -212,5 +212,12 @@ export default {
'Available only with built-in backends (vLLM / SGLang) — switch backend in <span class="bold-text">Advanced</span> to enable.',
'models.form.kvCache.tips2':
'KV cache is only supported when using built-in inference backends (vLLM or SGLang).',
'models.form.scheduling': 'Scheduling'
'models.form.scheduling': 'Scheduling',
'models.form.ramRatio': 'RAM-to-VRAM Ratio',
'models.form.ramSize': 'Maximum RAM Size (GiB)',
'models.form.ramRatio.tips':
'Ratio of system RAM to GPU VRAM used for KV cache. For example, 2.0 means the cache in RAM can be twice as large as the GPU VRAM.',
'models.form.ramSize.tips': `Maximum size of the KV cache stored in system memory (GiB). If set, this value overrides "{content}".`,
'models.form.chunkSize.tips':
'Number of tokens per KV cache chunk. A larger chunk size may improve throughput but increase memory usage.'
};
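For reference, a minimal sketch of how the `{content}` placeholder in the new 'models.form.ramSize.tips' key is expected to be resolved at the call site — it mirrors the intl.formatMessage call added in src/pages/llmodels/forms/kv-cache.tsx further down in this PR. The RamSizeHint component name is illustrative only and not part of the change.

import { useIntl } from '@umijs/max';

// Illustrative only: fills the {content} placeholder with the ramRatio label,
// producing e.g. '... this value overrides "RAM-to-VRAM Ratio".'
const RamSizeHint = () => {
  const intl = useIntl();
  const hint = intl.formatMessage(
    { id: 'models.form.ramSize.tips' },
    { content: intl.formatMessage({ id: 'models.form.ramRatio' }) }
  );
  return <span>{hint}</span>;
};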
16 changes: 14 additions & 2 deletions src/locales/ja-JP/models.ts
@@ -212,7 +212,14 @@ export default {
'Available only with built-in backends (vLLM / SGLang) — switch backend in <span class="bold-text">Advanced</span> to enable.',
'models.form.kvCache.tips2':
'KV cache is only supported when using built-in inference backends (vLLM or SGLang).',
'models.form.scheduling': 'Scheduling'
'models.form.scheduling': 'Scheduling',
'models.form.ramRatio': 'RAM-to-VRAM Ratio',
'models.form.ramSize': 'Maximum RAM Size (GiB)',
'models.form.ramRatio.tips':
'Ratio of system RAM to GPU VRAM used for KV cache. For example, 2.0 means the cache in RAM can be twice as large as the GPU VRAM.',
'models.form.ramSize.tips': `Maximum size of the KV cache stored in system memory (GiB). If set, this value overrides "{content}".`,
'models.form.chunkSize.tips':
'Number of tokens per KV cache chunk. A larger chunk size may improve throughput but increase memory usage.'
};

// ========== To-Do: Translate Keys (Remove After Translation) ==========
@@ -259,5 +266,10 @@ export default {
// 43. 'models.form.remoteURL.tips': 'Refer to the <a href="https://docs.lmcache.ai/api_reference/configurations.html" target="_blank">configuration documentation</a> for details.',
// 44. 'models.form.kvCache.tips': 'Available only with built-in backends (vLLM / SGLang) — switch backend in <span class="bold-text">Advanced</span> to enable.'
// 45. 'models.form.kvCache.tips2': 'KV cache is only supported when using built-in inference backends (vLLM or SGLang).',
// 46. 'models.form.scheduling': 'Scheduling'
// 46. 'models.form.scheduling': 'Scheduling',
// 47. 'models.form.ramRatio': 'RAM-to-VRAM Ratio',
// 48. 'models.form.ramSize': 'Maximum RAM Size (GiB)',
// 49. 'models.form.ramRatio.tips': 'Ratio of system RAM to GPU VRAM used for KV cache. For example, 2.0 means the cache in RAM can be twice as large as the GPU VRAM.',
// 50. 'models.form.ramSize.tips': `Maximum size of the KV cache stored in system memory (GiB). If set, this value overrides "{content}".`,
// 51. 'models.form.chunkSize.tips': 'Number of tokens per KV cache chunk. A larger chunk size may improve throughput but increase memory usage.'
// ========== End of To-Do List ==========
16 changes: 14 additions & 2 deletions src/locales/ru-RU/models.ts
@@ -212,7 +212,14 @@ export default {
'Available only with built-in backends (vLLM / SGLang) — switch backend in <span class="bold-text">Advanced</span> to enable.',
'models.form.kvCache.tips2':
'KV cache is only supported when using built-in inference backends (vLLM or SGLang).',
'models.form.scheduling': 'Scheduling'
'models.form.scheduling': 'Scheduling',
'models.form.ramRatio': 'RAM-to-VRAM Ratio',
'models.form.ramSize': 'Maximum RAM Size (GiB)',
'models.form.ramRatio.tips':
'Ratio of system RAM to GPU VRAM used for KV cache. For example, 2.0 means the cache in RAM can be twice as large as the GPU VRAM.',
'models.form.ramSize.tips': `Maximum size of the KV cache stored in system memory (GiB). If set, this value overrides "{content}".`,
'models.form.chunkSize.tips':
'Number of tokens per KV cache chunk. A larger chunk size may improve throughput but increase memory usage.'
};

// ========== To-Do: Translate Keys (Remove After Translation) ==========
@@ -222,5 +229,10 @@ export default {
// 5. 'models.form.remoteURL.tips': 'Refer to the <a href="https://docs.lmcache.ai/api_reference/configurations.html" target="_blank">configuration documentation</a> for details.',
// 6. 'models.form.kvCache.tips': 'Available only with built-in backends (vLLM / SGLang) — switch backend in <span class="bold-text">Advanced</span> to enable.'
// 7. 'models.form.kvCache.tips2': 'KV cache is only supported when using built-in inference backends (vLLM or SGLang).';
// 8. 'models.form.scheduling': 'Scheduling'
// 8. 'models.form.scheduling': 'Scheduling',
// 9. 'models.form.ramRatio': 'RAM-to-VRAM Ratio',
// 10. 'models.form.ramSize': 'Maximum RAM Size (GiB)',
// 11. 'models.form.ramRatio.tips': 'Ratio of system RAM to GPU VRAM used for KV cache. For example, 2.0 means the cache in RAM can be twice as large as the GPU VRAM.',
// 12. 'models.form.ramSize.tips': `Maximum size of the KV cache stored in system memory (GiB). If set, this value overrides "{content}".`,
// 13. 'models.form.chunkSize.tips': 'Number of tokens per KV cache chunk. A larger chunk size may improve throughput but increase memory usage.'
// ========== End of To-Do List ==========
9 changes: 8 additions & 1 deletion src/locales/zh-CN/models.ts
@@ -201,5 +201,12 @@ export default {
'仅在内置后端(vLLM / SGLang)可用 —— 请在<span class="bold-text">高级</span>配置中切换后端以启用。',
'models.form.kvCache.tips2':
'仅在使用内置推理后端(vLLM 或 SGLang)时支持 KV 缓存。',
'models.form.scheduling': '调度'
'models.form.scheduling': '调度',
'models.form.ramRatio': '内存与显存比例',
'models.form.ramSize': '内存最大占用 (GiB)',
'models.form.ramRatio.tips':
'KV 缓存在系统内存与 GPU 显存之间的比例。例如设置为 2.0 表示系统内存中可缓存的数据量是显存的两倍。',
'models.form.ramSize.tips': `KV 缓存在系统内存中的最大值。当设置该值时,将覆盖 "{content}" 的配置。`,
'models.form.chunkSize.tips':
'每个 KV 缓存块包含的 token 数量。数值越大可提升吞吐量,但也会增加内存占用。'
};
4 changes: 2 additions & 2 deletions src/pages/llmodels/config/types.ts
@@ -79,8 +79,8 @@ export interface FormData {
extended_kv_cache: {
enabled: boolean;
chunk_size: number;
max_local_cpu_size: number;
remote_url: string;
ram_ratio: number;
ram_size: number;
};
}
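A quick sketch of what an extended_kv_cache value looks like under the new FormData shape; the numbers are illustrative (they echo the defaults and tooltips elsewhere in this PR) and are not mandated by the type.

// Illustrative values only — the type constrains just the field names and types.
const exampleExtendedKVCache: FormData['extended_kv_cache'] = {
  enabled: true,
  chunk_size: 256, // tokens per KV cache chunk
  ram_ratio: 1.2,  // RAM-to-VRAM ratio for the CPU-side cache
  ram_size: 10     // GiB; when set, overrides ram_ratio (see ramSize.tips)
};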

6 changes: 3 additions & 3 deletions src/pages/llmodels/forms/index.tsx
@@ -368,9 +368,9 @@ const DataForm: React.FC<DataFormProps> = forwardRef((props, ref) => {
distributed_inference_across_workers: true,
extended_kv_cache: {
enabled: false,
chunk_size: 256,
max_local_cpu_size: 10,
remote_url: ''
chunk_size: null,
ram_ratio: 1.2,
ram_size: null
},
...initialValues
}}
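For clarity, a minimal sketch of the override semantics the two new tooltips describe — ram_size, when set, wins over ram_ratio; otherwise the CPU-side KV cache budget is ram_ratio times the model's GPU VRAM. effectiveCpuCacheGiB and gpuVRAMGiB are hypothetical names used for illustration; this is not the backend implementation.

// Hypothetical helper, illustrating the tooltip semantics only.
function effectiveCpuCacheGiB(
  cache: { ram_ratio: number; ram_size: number | null },
  gpuVRAMGiB: number
): number {
  if (cache.ram_size != null) {
    return cache.ram_size; // explicit GiB cap overrides the ratio
  }
  return cache.ram_ratio * gpuVRAMGiB;
}

// With the form defaults (ram_ratio: 1.2, ram_size: null) and 24 GiB of VRAM:
// effectiveCpuCacheGiB({ ram_ratio: 1.2, ram_size: null }, 24) === 28.8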
50 changes: 25 additions & 25 deletions src/pages/llmodels/forms/kv-cache.tsx
@@ -1,6 +1,5 @@
import CheckboxField from '@/components/seal-form/checkbox-field';
import SealInputNumber from '@/components/seal-form/input-number';
import SealInput from '@/components/seal-form/seal-input';
import { useIntl } from '@umijs/max';
import { Form } from 'antd';
import { useMemo } from 'react';
@@ -21,9 +20,9 @@ const KVCacheForm = () => {
form.setFieldsValue({
extended_kv_cache: {
enabled: true,
chunk_size: extendedKVCache?.chunk_size || 256,
max_local_cpu_size: extendedKVCache?.max_local_cpu_size || 10,
remote_url: extendedKVCache?.remote_url || ''
chunk_size: extendedKVCache?.chunk_size,
ram_ratio: extendedKVCache?.ram_ratio || 1.2,
ram_size: extendedKVCache?.ram_size
}
});
}
@@ -76,38 +75,39 @@ const KVCacheForm = () => {
</div>
{kvCacheEnabled && (
<>
<Form.Item<FormData>
name={['extended_kv_cache', 'max_local_cpu_size']}
>
<Form.Item<FormData> name={['extended_kv_cache', 'ram_ratio']}>
<SealInputNumber
label={intl.formatMessage({ id: 'models.form.maxCPUSize' })}
label={intl.formatMessage({ id: 'models.form.ramRatio' })}
description={intl.formatMessage({
id: 'models.form.ramRatio.tips'
})}
min={0}
step={1}
precision={0}
step={0.1}
precision={1}
/>
</Form.Item>
<Form.Item<FormData> name={['extended_kv_cache', 'chunk_size']}>
<Form.Item<FormData> name={['extended_kv_cache', 'ram_size']}>
<SealInputNumber
label={intl.formatMessage({ id: 'models.form.chunkSize' })}
label={intl.formatMessage({ id: 'models.form.ramSize' })}
description={intl.formatMessage(
{
id: 'models.form.ramSize.tips'
},
{ content: intl.formatMessage({ id: 'models.form.ramRatio' }) }
)}
min={0}
step={1}
precision={0}
/>
</Form.Item>
<Form.Item<FormData> name={['extended_kv_cache', 'remote_url']}>
<SealInput.Input
description={
<span
dangerouslySetInnerHTML={{
__html: intl.formatMessage({
id: 'models.form.remoteURL.tips'
})
}}
></span>
}
label={intl.formatMessage({ id: 'models.form.remoteURL' })}
<Form.Item<FormData> name={['extended_kv_cache', 'chunk_size']}>
<SealInputNumber
label={intl.formatMessage({ id: 'models.form.chunkSize' })}
description={intl.formatMessage({
id: 'models.form.chunkSize.tips'
})}
min={0}
step={1}
placeholder="protocol://host:port"
/>
</Form.Item>
</>