Skip to content

Commit 81878fc

Browse files
committed
test: add integration tests for query_metrics
The integration tests check whether all of the metrics we currently support are actually queryable after inference requests. This also tests things like aggregation, label filtering, etc. Signed-off-by: Charlie Doern <[email protected]>
1 parent 627da95 commit 81878fc

File tree

9 files changed

+413
-100
lines changed

9 files changed

+413
-100
lines changed
4 KB
Binary file not shown.

tests/integration/recordings/responses/4a3a4447b16b.json

Lines changed: 3 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"models": [
1515
{
1616
"model": "nomic-embed-text:latest",
17-
"modified_at": "2025-08-05T14:04:07.946926-07:00",
17+
"modified_at": "2025-08-08T13:09:33.750238-04:00",
1818
"digest": "0a109f422b47e3a30ba2b10eca18548e944e8a23073ee3f3e947efcf3c45e59f",
1919
"size": 274302450,
2020
"details": {
@@ -28,89 +28,9 @@
2828
"quantization_level": "F16"
2929
}
3030
},
31-
{
32-
"model": "llama3.2-vision:11b",
33-
"modified_at": "2025-07-30T18:45:02.517873-07:00",
34-
"digest": "6f2f9757ae97e8a3f8ea33d6adb2b11d93d9a35bef277cd2c0b1b5af8e8d0b1e",
35-
"size": 7816589186,
36-
"details": {
37-
"parent_model": "",
38-
"format": "gguf",
39-
"family": "mllama",
40-
"families": [
41-
"mllama"
42-
],
43-
"parameter_size": "10.7B",
44-
"quantization_level": "Q4_K_M"
45-
}
46-
},
47-
{
48-
"model": "llama3.2-vision:latest",
49-
"modified_at": "2025-07-29T20:18:47.920468-07:00",
50-
"digest": "6f2f9757ae97e8a3f8ea33d6adb2b11d93d9a35bef277cd2c0b1b5af8e8d0b1e",
51-
"size": 7816589186,
52-
"details": {
53-
"parent_model": "",
54-
"format": "gguf",
55-
"family": "mllama",
56-
"families": [
57-
"mllama"
58-
],
59-
"parameter_size": "10.7B",
60-
"quantization_level": "Q4_K_M"
61-
}
62-
},
63-
{
64-
"model": "llama-guard3:1b",
65-
"modified_at": "2025-07-25T14:39:44.978630-07:00",
66-
"digest": "494147e06bf99e10dbe67b63a07ac81c162f18ef3341aa3390007ac828571b3b",
67-
"size": 1600181919,
68-
"details": {
69-
"parent_model": "",
70-
"format": "gguf",
71-
"family": "llama",
72-
"families": [
73-
"llama"
74-
],
75-
"parameter_size": "1.5B",
76-
"quantization_level": "Q8_0"
77-
}
78-
},
7931
{
8032
"model": "all-minilm:l6-v2",
81-
"modified_at": "2025-07-24T15:15:11.129290-07:00",
82-
"digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
83-
"size": 45960996,
84-
"details": {
85-
"parent_model": "",
86-
"format": "gguf",
87-
"family": "bert",
88-
"families": [
89-
"bert"
90-
],
91-
"parameter_size": "23M",
92-
"quantization_level": "F16"
93-
}
94-
},
95-
{
96-
"model": "llama3.2:1b",
97-
"modified_at": "2025-07-17T22:02:24.953208-07:00",
98-
"digest": "baf6a787fdffd633537aa2eb51cfd54cb93ff08e28040095462bb63daf552878",
99-
"size": 1321098329,
100-
"details": {
101-
"parent_model": "",
102-
"format": "gguf",
103-
"family": "llama",
104-
"families": [
105-
"llama"
106-
],
107-
"parameter_size": "1.2B",
108-
"quantization_level": "Q8_0"
109-
}
110-
},
111-
{
112-
"model": "all-minilm:latest",
113-
"modified_at": "2025-06-03T16:50:10.946583-07:00",
33+
"modified_at": "2025-07-28T10:48:35.010422-04:00",
11434
"digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
11535
"size": 45960996,
11636
"details": {
@@ -126,7 +46,7 @@
12646
},
12747
{
12848
"model": "llama3.2:3b",
129-
"modified_at": "2025-05-01T11:15:23.797447-07:00",
49+
"modified_at": "2025-07-11T10:37:11.453812-04:00",
13050
"digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
13151
"size": 2019393189,
13252
"details": {
@@ -139,22 +59,6 @@
13959
"parameter_size": "3.2B",
14060
"quantization_level": "Q4_K_M"
14161
}
142-
},
143-
{
144-
"model": "llama3.2:3b-instruct-fp16",
145-
"modified_at": "2025-04-30T15:33:48.939665-07:00",
146-
"digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
147-
"size": 6433703586,
148-
"details": {
149-
"parent_model": "",
150-
"format": "gguf",
151-
"family": "llama",
152-
"families": [
153-
"llama"
154-
],
155-
"parameter_size": "3.2B",
156-
"quantization_level": "F16"
157-
}
15862
}
15963
]
16064
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
{
2+
"request": {
3+
"method": "POST",
4+
"url": "http://localhost:11434/v1/v1/chat/completions",
5+
"headers": {},
6+
"body": {
7+
"model": "llama3.2:3b",
8+
"messages": [
9+
{
10+
"role": "user",
11+
"content": "OpenAI test 0"
12+
}
13+
],
14+
"stream": false
15+
},
16+
"endpoint": "/v1/chat/completions",
17+
"model": "llama3.2:3b"
18+
},
19+
"response": {
20+
"body": {
21+
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
22+
"__data__": {
23+
"id": "chatcmpl-619",
24+
"choices": [
25+
{
26+
"finish_reason": "stop",
27+
"index": 0,
28+
"logprobs": null,
29+
"message": {
30+
"content": "I'm happy to chat with you. What's on your mind? Want to talk about a particular topic, ask for recommendations, or just have a conversation? I'm here to help and learn!",
31+
"refusal": null,
32+
"role": "assistant",
33+
"annotations": null,
34+
"audio": null,
35+
"function_call": null,
36+
"tool_calls": null
37+
}
38+
}
39+
],
40+
"created": 1754673005,
41+
"model": "llama3.2:3b",
42+
"object": "chat.completion",
43+
"service_tier": null,
44+
"system_fingerprint": "fp_ollama",
45+
"usage": {
46+
"completion_tokens": 41,
47+
"prompt_tokens": 30,
48+
"total_tokens": 71,
49+
"completion_tokens_details": null,
50+
"prompt_tokens_details": null
51+
}
52+
}
53+
},
54+
"is_streaming": false
55+
}
56+
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
{
2+
"request": {
3+
"method": "POST",
4+
"url": "http://localhost:11434/v1/v1/chat/completions",
5+
"headers": {},
6+
"body": {
7+
"model": "llama3.2:3b",
8+
"messages": [
9+
{
10+
"role": "user",
11+
"content": "OpenAI test 1"
12+
}
13+
],
14+
"stream": false
15+
},
16+
"endpoint": "/v1/chat/completions",
17+
"model": "llama3.2:3b"
18+
},
19+
"response": {
20+
"body": {
21+
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
22+
"__data__": {
23+
"id": "chatcmpl-224",
24+
"choices": [
25+
{
26+
"finish_reason": "stop",
27+
"index": 0,
28+
"logprobs": null,
29+
"message": {
30+
"content": "I'm happy to help with your OpenAI prompt. What kind of task would you like me to assist with? Would you like to:\n\n1. Provide a text prompt and have me generate a response?\n2. Test my understanding of a specific topic or concept?\n3. Engage in a conversation or role-playing scenario?\n4. Something else?\n\nPlease go ahead and let me know, and I'll do my best to assist you!",
31+
"refusal": null,
32+
"role": "assistant",
33+
"annotations": null,
34+
"audio": null,
35+
"function_call": null,
36+
"tool_calls": null
37+
}
38+
}
39+
],
40+
"created": 1754673007,
41+
"model": "llama3.2:3b",
42+
"object": "chat.completion",
43+
"service_tier": null,
44+
"system_fingerprint": "fp_ollama",
45+
"usage": {
46+
"completion_tokens": 89,
47+
"prompt_tokens": 30,
48+
"total_tokens": 119,
49+
"completion_tokens_details": null,
50+
"prompt_tokens_details": null
51+
}
52+
}
53+
},
54+
"is_streaming": false
55+
}
56+
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
{
2+
"request": {
3+
"method": "POST",
4+
"url": "http://localhost:11434/api/generate",
5+
"headers": {},
6+
"body": {
7+
"model": "llama3.2:3b",
8+
"raw": true,
9+
"prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 0<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
10+
"options": {
11+
"temperature": 0.0
12+
},
13+
"stream": false
14+
},
15+
"endpoint": "/api/generate",
16+
"model": "llama3.2:3b"
17+
},
18+
"response": {
19+
"body": {
20+
"__type__": "ollama._types.GenerateResponse",
21+
"__data__": {
22+
"model": "llama3.2:3b",
23+
"created_at": "2025-08-08T17:09:43.022472Z",
24+
"done": true,
25+
"done_reason": "stop",
26+
"total_duration": 9149743458,
27+
"load_duration": 31375916,
28+
"prompt_eval_count": 21,
29+
"prompt_eval_duration": 113430583,
30+
"eval_count": 324,
31+
"eval_duration": 9004524125,
32+
"response": "Here are some test metrics that can be used to evaluate the performance of a system:\n\n1. **Accuracy**: The proportion of correct predictions made by the model.\n2. **Precision**: The ratio of true positives (correctly predicted instances) to total positive predictions.\n3. **Recall**: The ratio of true positives to the sum of true positives and false negatives (missed instances).\n4. **F1-score**: The harmonic mean of precision and recall, providing a balanced measure of both.\n5. **Mean Squared Error (MSE)**: The average squared difference between predicted and actual values.\n6. **Mean Absolute Error (MAE)**: The average absolute difference between predicted and actual values.\n7. **Root Mean Squared Percentage Error (RMSPE)**: A variation of MSE that expresses the error as a percentage.\n8. **Coefficient of Determination (R-squared, R2)**: Measures how well the model explains the variance in the data.\n9. **Mean Absolute Percentage Error (MAPE)**: The average absolute percentage difference between predicted and actual values.\n10. **Mean Squared Logarithmic Error (MSLE)**: A variation of MSE that is more suitable for skewed distributions.\n\nThese metrics can be used to evaluate different aspects of a system's performance, such as:\n\n* Classification models: accuracy, precision, recall, F1-score\n* Regression models: MSE, MAE, RMSPE, R2, MSLE\n* Time series forecasting: MAPE, RMSPE\n\nNote that the choice of metric depends on the specific problem and data.",
33+
"thinking": null,
34+
"context": null
35+
}
36+
},
37+
"is_streaming": false
38+
}
39+
}

tests/integration/recordings/responses/d0ac68cbde69.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
"model": "llama3.2:3b",
1717
"name": "llama3.2:3b",
1818
"digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
19-
"expires_at": "2025-08-06T15:57:21.573326-04:00",
19+
"expires_at": "2025-08-08T13:13:41.164547-04:00",
2020
"size": 4030033920,
2121
"size_vram": 4030033920,
2222
"details": {
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
{
2+
"request": {
3+
"method": "POST",
4+
"url": "http://localhost:11434/api/generate",
5+
"headers": {},
6+
"body": {
7+
"model": "llama3.2:3b",
8+
"raw": true,
9+
"prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 2<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
10+
"options": {
11+
"temperature": 0.0
12+
},
13+
"stream": false
14+
},
15+
"endpoint": "/api/generate",
16+
"model": "llama3.2:3b"
17+
},
18+
"response": {
19+
"body": {
20+
"__type__": "ollama._types.GenerateResponse",
21+
"__data__": {
22+
"model": "llama3.2:3b",
23+
"created_at": "2025-08-08T17:10:03.824771Z",
24+
"done": true,
25+
"done_reason": "stop",
26+
"total_duration": 8585833416,
27+
"load_duration": 21426666,
28+
"prompt_eval_count": 21,
29+
"prompt_eval_duration": 96159500,
30+
"eval_count": 321,
31+
"eval_duration": 8467974667,
32+
"response": "Here are some test metrics that can be used to evaluate the performance of a system:\n\n1. **Accuracy**: Measures how close the predicted values are to the actual values.\n2. **Precision**: Measures the proportion of true positives among all positive predictions made by the model.\n3. **Recall**: Measures the proportion of true positives among all actual positive instances.\n4. **F1-score**: The harmonic mean of precision and recall, providing a balanced measure of both.\n5. **Mean Squared Error (MSE)**: Measures the average squared difference between predicted and actual values.\n6. **Mean Absolute Error (MAE)**: Measures the average absolute difference between predicted and actual values.\n7. **Root Mean Squared Percentage Error (RMSPE)**: A variation of MSE that expresses errors as a percentage of the actual value.\n8. **Coefficient of Determination (R-squared, R2)**: Measures how well the model explains the variance in the data.\n9. **Mean Absolute Percentage Error (MAPE)**: Measures the average absolute percentage difference between predicted and actual values.\n10. **Mean Squared Logarithmic Error (MSLE)**: A variation of MSE that is more suitable for skewed distributions.\n\nThese metrics can be used to evaluate different aspects of a system's performance, such as:\n\n* Classification models: accuracy, precision, recall, F1-score\n* Regression models: MSE, MAE, RMSPE, R2\n* Time series forecasting: MAPE, MSLE\n\nNote that the choice of metric depends on the specific problem and data.",
33+
"thinking": null,
34+
"context": null
35+
}
36+
},
37+
"is_streaming": false
38+
}
39+
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
{
2+
"request": {
3+
"method": "POST",
4+
"url": "http://localhost:11434/api/generate",
5+
"headers": {},
6+
"body": {
7+
"model": "llama3.2:3b",
8+
"raw": true,
9+
"prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 1<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
10+
"options": {
11+
"temperature": 0.0
12+
},
13+
"stream": false
14+
},
15+
"endpoint": "/api/generate",
16+
"model": "llama3.2:3b"
17+
},
18+
"response": {
19+
"body": {
20+
"__type__": "ollama._types.GenerateResponse",
21+
"__data__": {
22+
"model": "llama3.2:3b",
23+
"created_at": "2025-08-08T17:09:55.233644Z",
24+
"done": true,
25+
"done_reason": "stop",
26+
"total_duration": 12203490000,
27+
"load_duration": 21410708,
28+
"prompt_eval_count": 21,
29+
"prompt_eval_duration": 96471916,
30+
"eval_count": 457,
31+
"eval_duration": 12085312167,
32+
"response": "Here are some test metrics that can be used to evaluate the performance of a system:\n\n**Primary Metrics**\n\n1. **Response Time**: The time it takes for the system to respond to a request.\n2. **Throughput**: The number of requests processed by the system per unit time (e.g., requests per second).\n3. **Error Rate**: The percentage of requests that result in an error.\n\n**Secondary Metrics**\n\n1. **Average Response Time**: The average response time for all requests.\n2. **Median Response Time**: The middle value of the response times, used to detect outliers.\n3. **99th Percentile Response Time**: The response time at which 99% of requests are completed within this time.\n4. **Request Latency**: The difference between the request arrival time and the response time.\n\n**User Experience Metrics**\n\n1. **User Satisfaction (USAT)**: Measured through surveys or feedback forms to gauge user satisfaction with the system's performance.\n2. **First Response Time**: The time it takes for a user to receive their first response from the system.\n3. **Time Spent in System**: The total amount of time a user spends interacting with the system.\n\n**System Resource Metrics**\n\n1. **CPU Utilization**: The percentage of CPU resources being used by the system.\n2. **Memory Usage**: The amount of memory being used by the system.\n3. **Disk I/O Wait Time**: The average time spent waiting for disk I/O operations to complete.\n\n**Security Metrics**\n\n1. **Authentication Success Rate**: The percentage of successful authentication attempts.\n2. **Authorization Success Rate**: The percentage of successful authorization attempts.\n3. **Error Rate (Security)**: The percentage of security-related errors.\n\n**Other Metrics**\n\n1. **Page Load Time**: The time it takes for a page to load.\n2. **Click-Through Rate (CTR)**: The percentage of users who click on a link or button after seeing an ad or notification.\n3. **Conversion Rate**: The percentage of users who complete a desired action (e.g., fill out a form, make a purchase).\n\nThese metrics can be used to evaluate the performance and effectiveness of various aspects of your system, from user experience to security and resource utilization.",
33+
"thinking": null,
34+
"context": null
35+
}
36+
},
37+
"is_streaming": false
38+
}
39+
}

0 commit comments

Comments
 (0)