Download OpenAPI specification:
REST API server for evaluation backend orchestration.
Create and execute an evaluation request using the simplified benchmark schema.
| name required | string The evaluation job name. |
| description | string The evaluation job description. |
| tags | Array of strings The evaluation job tags. |
| model required | object (ModelRef) The model to evaluate. |
| benchmarks required | Array of objects (EvaluationBenchmarkConfig) The evaluation benchmarks to run. |
| pass_criteria | object (PassCriteriaWithDefault) The overall pass criteria for the evaluation job. |
object (ExperimentConfig) The MLflow experiment configuration. When provided, the evaluation job will be tracked in MLflow. | |
object (EvaluationExports) Optional exports configuration for the evaluation job. When provided, the evaluation job results will be exported to the specified location. | |
object (QueueConfig) Optional scheduling queue for Kubernetes-backed evaluation jobs (e.g. Kueue). | |
object Custom request data. This can be used for user specific job data. |
{- "name": "granite-3.1-8b-safety-eval",
- "description": "Safety and reasoning evaluation for Granite 3.1 8B Instruct",
- "tags": [
- "nightly",
- "granite"
], - "model": {
- "name": "granite-3.1-8b-instruct"
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}, - "parameters": {
- "num_fewshot": 0,
- "limit": 100
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.4,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
], - "pass_criteria": {
- "threshold": 0.5
}
}
{- "resource": {
- "id": "a1b2c3d4-5678-9abc-def0-1234567890ab",
- "tenant": "default",
- "created_at": "2026-01-15T09:30:00Z",
- "updated_at": "2026-01-15T09:30:00Z",
- "owner": "user@example.com"
}, - "status": {
- "state": "pending",
- "message": {
- "message": "Evaluation job created.",
- "message_code": "evaluation_job_created"
}
}, - "name": "granite-3.1-8b-safety-eval",
- "description": "Safety and reasoning evaluation for Granite 3.1 8B Instruct",
- "tags": [
- "nightly",
- "granite"
], - "model": {
- "name": "granite-3.1-8b-instruct"
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}, - "parameters": {
- "num_fewshot": 0,
- "limit": 100
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.4,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
], - "pass_criteria": {
- "threshold": 0.5
}
}
List all evaluation requests.
| limit | integer (Limit) [ 1 .. 100 ] Default: 50 Maximum number of evaluations to return |
| offset | integer (Offset) >= 0 Default: 0 Offset for pagination |
| status | string (Status Filter) Filter by status |
| name | string (Name) Name to search for |
| tags | string (Tags) Tags to search for |
{- "first": {
- "href": "/api/v1/evaluations/jobs?limit=50&offset=0"
}, - "next": {
- "href": "/api/v1/evaluations/jobs?limit=50&offset=50"
}, - "limit": 50,
- "total_count": 73,
- "items": [
- {
- "resource": {
- "id": "a1b2c3d4-5678-9abc-def0-1234567890ab",
- "tenant": "default",
- "created_at": "2026-01-15T09:30:00Z",
- "updated_at": "2026-01-15T09:42:15Z",
- "owner": "user@example.com"
}, - "status": {
- "state": "completed",
- "message": {
- "message": "Evaluation job completed.",
- "message_code": "evaluation_job_updated"
}
}, - "results": {
- "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "benchmark_index": 0,
- "metrics": {
- "acc": 0.82,
- "acc_norm": 0.85
}, - "test": {
- "primary_score": 0.85,
- "threshold": 0.25,
- "pass": true
}
}
], - "test": {
- "score": 0.85,
- "threshold": 0.5,
- "pass": true
}
}, - "name": "granite-3.1-8b-safety-eval",
- "model": {
- "name": "granite-3.1-8b-instruct"
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6
}
], - "pass_criteria": {
- "threshold": 0.5
}
}
]
}
Returns the evaluation job resource with the current status and results.
| id required | string (Id) |
{- "resource": {
- "id": "a1b2c3d4-5678-9abc-def0-1234567890ab",
- "tenant": "default",
- "created_at": "2026-01-15T09:30:00Z",
- "updated_at": "2026-01-15T09:42:15Z",
- "owner": "user@example.com"
}, - "status": {
- "state": "completed",
- "message": {
- "message": "Evaluation job completed.",
- "message_code": "evaluation_job_updated"
}, - "benchmarks": [
- {
- "provider_id": "lm_evaluation_harness",
- "id": "arc_easy",
- "benchmark_index": 0,
- "status": "completed",
- "started_at": "2026-01-15T09:31:00Z",
- "completed_at": "2026-01-15T09:38:45Z"
}, - {
- "provider_id": "garak",
- "id": "owasp_llm_top10",
- "benchmark_index": 1,
- "status": "completed",
- "started_at": "2026-01-15T09:31:00Z",
- "completed_at": "2026-01-15T09:42:15Z"
}
]
}, - "results": {
- "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "benchmark_index": 0,
- "metrics": {
- "acc": 0.82,
- "acc_norm": 0.85
}, - "mlflow_run_id": "run-7f3a1b2c",
- "logs_path": "/data/logs/a1b2c3d4.log",
- "test": {
- "primary_score": 0.85,
- "threshold": 0.25,
- "pass": true
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "benchmark_index": 1,
- "metrics": {
- "attack_success_rate": 0.12
}, - "mlflow_run_id": "run-9e8d7c6b",
- "logs_path": "/data/logs/a1b2c3d4-garak.log",
- "test": {
- "primary_score": 0.12,
- "threshold": 0.3,
- "pass": true
}
}
], - "test": {
- "score": 0.85,
- "threshold": 0.5,
- "pass": true
}
}, - "name": "granite-3.1-8b-safety-eval",
- "description": "Safety and reasoning evaluation for Granite 3.1 8B Instruct",
- "tags": [
- "nightly",
- "granite"
], - "model": {
- "name": "granite-3.1-8b-instruct"
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}, - "parameters": {
- "num_fewshot": 0,
- "limit": 100
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.4,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
], - "pass_criteria": {
- "threshold": 0.5
}
}
Cancel a running evaluation.
| id required | string (Id) |
| hard_delete | boolean (Hard Delete) Default: false If |
{- "message": "The field 'state' is not valid.",
- "message_code": "invalid_value",
- "trace": "b12692e1-8582-4628-88ca-7a13fefb73e2"
}
List all benchmark collections.
| limit | integer (Limit) [ 1 .. 100 ] Default: 50 Maximum number of collections to return |
| offset | integer (Offset) >= 0 Default: 0 Offset for pagination |
| name | string (Name) Name to search for |
| category | string (Category) Category to search for |
| tags | string (Tags) Tags to search for |
| scope | string (Scope of collections) Enum: "system" "tenant" Set to |
{- "first": {
- "href": "/api/v1/evaluations/collections?limit=50&offset=0"
}, - "limit": 50,
- "total_count": 2,
- "items": [
- {
- "resource": {
- "id": "e5f6a7b8-9012-3456-cdef-0123456789ab",
- "tenant": "default",
- "created_at": "2025-12-01T10:00:00Z",
- "updated_at": "2025-12-01T10:00:00Z"
}, - "name": "llm-safety-suite",
- "category": "safety",
- "description": "Comprehensive safety evaluation combining reasoning accuracy and vulnerability scanning",
- "tags": [
- "safety",
- "nightly"
], - "pass_criteria": {
- "threshold": 0.5
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.4,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
]
}
]
}
Create a new collection.
| name required | string Collection name. |
| category required | string Collection category. |
| description | string Optional description. |
| tags | Array of strings Tags. |
object Custom key-value data. | |
| pass_criteria | object (PassCriteria) Pass criteria for the collection. |
| benchmarks required | Array of objects (CollectionBenchmarkConfig) Benchmarks in the collection. |
{- "name": "release-gate-safety",
- "category": "safety",
- "description": "Release-gate collection combining reasoning and red-teaming benchmarks",
- "tags": [
- "release-gate",
- "safety"
], - "pass_criteria": {
- "threshold": 0.5
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}, - "parameters": {
- "num_fewshot": 0,
- "limit": 100
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.4,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
]
}
{- "resource": {
- "id": "f6a7b8c9-0123-4567-def0-123456789abc",
- "tenant": "default",
- "created_at": "2026-02-01T09:00:00Z",
- "updated_at": "2026-02-01T09:00:00Z",
- "owner": "user@example.com"
}, - "name": "release-gate-safety",
- "category": "safety",
- "description": "Release-gate collection combining reasoning and red-teaming benchmarks",
- "tags": [
- "release-gate",
- "safety"
], - "pass_criteria": {
- "threshold": 0.5
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}, - "parameters": {
- "num_fewshot": 0,
- "limit": 100
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.4,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
]
}
Get details of a specific collection.
| id required | string (Collection Id) |
{- "resource": {
- "id": "e5f6a7b8-9012-3456-cdef-0123456789ab",
- "tenant": "default",
- "created_at": "2025-12-01T10:00:00Z",
- "updated_at": "2025-12-01T10:00:00Z"
}, - "name": "llm-safety-suite",
- "category": "safety",
- "description": "Comprehensive safety evaluation combining reasoning accuracy and vulnerability scanning",
- "tags": [
- "safety",
- "nightly"
], - "pass_criteria": {
- "threshold": 0.5
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.4,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
]
}
Update an existing collection.
| id required | string (Collection Id) |
| name required | string Collection name. |
| category required | string Collection category. |
| description | string Optional description. |
| tags | Array of strings Tags. |
object Custom key-value data. | |
| pass_criteria | object (PassCriteria) Pass criteria for the collection. |
| benchmarks required | Array of objects (CollectionBenchmarkConfig) Benchmarks in the collection. |
{- "name": "llm-safety-suite",
- "category": "safety",
- "description": "Safety evaluation with reasoning, OWASP risks, and content quality",
- "tags": [
- "safety",
- "nightly",
- "updated"
], - "pass_criteria": {
- "threshold": 0.5
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.5,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.3,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}, - {
- "id": "quality",
- "provider_id": "garak",
- "weight": 0.2,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
]
}
{- "resource": {
- "id": "e5f6a7b8-9012-3456-cdef-0123456789ab",
- "tenant": "default",
- "created_at": "2025-12-01T10:00:00Z",
- "updated_at": "2026-02-10T11:00:00Z"
}, - "name": "llm-safety-suite",
- "category": "safety",
- "description": "Safety evaluation with reasoning, OWASP risks, and content quality",
- "tags": [
- "safety",
- "nightly",
- "updated"
], - "pass_criteria": {
- "threshold": 0.5
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.5,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.3,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}, - {
- "id": "quality",
- "provider_id": "garak",
- "weight": 0.2,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
]
}
Partially update an existing collection.
| id required | string (Collection Id) |
| op required | string (PatchOp) Enum: "replace" "add" "remove" Patch operation type |
| path required | string JSON Pointer path |
| value | any Value for add/replace (omit for remove) |
[- {
- "op": "replace",
- "path": "/pass_criteria/threshold",
- "value": 0.6
}, - {
- "op": "replace",
- "path": "/description",
- "value": "Safety evaluation with stricter pass threshold"
}
]
{- "resource": {
- "id": "e5f6a7b8-9012-3456-cdef-0123456789ab",
- "tenant": "default",
- "created_at": "2025-12-01T10:00:00Z",
- "updated_at": "2026-02-10T12:30:00Z"
}, - "name": "llm-safety-suite",
- "category": "safety",
- "description": "Safety evaluation with stricter pass threshold",
- "tags": [
- "safety",
- "nightly"
], - "pass_criteria": {
- "threshold": 0.6
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.4,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
]
}
List all registered evaluation providers.
| limit | integer (Limit) [ 1 .. 100 ] Default: 50 Maximum number of providers to return |
| offset | integer (Offset) >= 0 Default: 0 Offset for pagination |
| benchmarks | boolean (Benchmarks) Default: true Include or exclude benchmarks supported by this provider in the response |
| name | string (Name) Name to search for |
| tags | string (Tags) Tags to search for |
| scope | string (Scope of providers) Enum: "system" "tenant" Set to |
{- "first": {
- "href": "/api/v1/evaluations/providers?limit=50&offset=0"
}, - "limit": 50,
- "total_count": 3,
- "items": [
- {
- "resource": {
- "id": "b3f1a2c4-1234-5678-abcd-ef0123456789",
- "tenant": "default",
- "created_at": "2025-10-01T00:00:00Z",
- "updated_at": "2025-10-01T00:00:00Z"
}, - "name": "lm_evaluation_harness",
- "title": "LM Evaluation Harness",
- "description": "Comprehensive evaluation framework for language models with 180 benchmarks",
- "tags": [
- "reasoning",
- "science",
- "lm_eval"
], - "runtime": {
- "k8s": {
- "image": "quay.io/opendatahub/ta-lmes-job:odh-3.4-ea2",
- "entrypoint": [
- "/opt/app-root/bin/python",
- "/opt/app-root/src/main.py"
], - "cpu_request": "100m",
- "memory_request": "128Mi",
- "cpu_limit": "500m",
- "memory_limit": "4Gi"
}
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "name": "Basic science Q&A",
- "description": "Grade-school science questions testing basic reasoning and scientific knowledge (AI2 Reasoning Challenge, easy split).",
- "category": "reasoning",
- "metrics": [
- "acc",
- "acc_norm"
], - "num_few_shot": 0,
- "dataset_size": 2376,
- "tags": [
- "reasoning",
- "science",
- "lm_eval"
], - "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}
}
]
}
]
}
Create a new provider scoped to the current tenant (Bring Your Own Provider).
| name required | string Provider name |
| title | string Provider display title |
| description | string Provider description |
| tags | Array of strings Provider tags |
object (AgentMetadata) Agent discoverability metadata for this provider | |
| runtime required | object (Runtime) Provider runtime configuration |
| benchmarks required | Array of objects (BenchmarkResource) Benchmarks offered by this provider |
{- "name": "my-custom-evaluator",
- "title": "Custom Internal Evaluator",
- "description": "Internal evaluation adapter for domain-specific benchmarks",
- "tags": [
- "custom",
- "internal"
], - "runtime": {
- "k8s": {
- "image": "registry.internal.example.com/eval/custom-adapter:v1.2",
- "entrypoint": [
- "/opt/app-root/bin/python",
- "/opt/app-root/src/main.py"
], - "cpu_request": "250m",
- "memory_request": "512Mi",
- "cpu_limit": "1",
- "memory_limit": "2Gi"
}
}, - "benchmarks": [
- {
- "id": "domain-qa",
- "name": "Domain Q&A Accuracy",
- "description": "Measures accuracy on domain-specific question answering",
- "category": "reasoning",
- "metrics": [
- "acc",
- "f1"
], - "primary_score": {
- "metric": "acc",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.5
}
}
]
}
{- "resource": {
- "id": "c4d5e6f7-8901-2345-bcde-f67890123456",
- "tenant": "default",
- "created_at": "2026-01-20T10:00:00Z",
- "updated_at": "2026-01-20T10:00:00Z",
- "owner": "user@example.com"
}, - "name": "my-custom-evaluator",
- "title": "Custom Internal Evaluator",
- "description": "Internal evaluation adapter for domain-specific benchmarks",
- "tags": [
- "custom",
- "internal"
], - "runtime": {
- "k8s": {
- "image": "registry.internal.example.com/eval/custom-adapter:v1.2",
- "entrypoint": [
- "/opt/app-root/bin/python",
- "/opt/app-root/src/main.py"
], - "cpu_request": "250m",
- "memory_request": "512Mi",
- "cpu_limit": "1",
- "memory_limit": "2Gi"
}
}, - "benchmarks": [
- {
- "id": "domain-qa",
- "name": "Domain Q&A Accuracy",
- "description": "Measures accuracy on domain-specific question answering",
- "category": "reasoning",
- "metrics": [
- "acc",
- "f1"
], - "primary_score": {
- "metric": "acc",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.5
}
}
]
}
Get a provider by ID.
| id required | string (Provider Id) Provider ID |
{- "resource": {
- "id": "d8e9f0a1-2345-6789-cdef-012345678901",
- "tenant": "default",
- "created_at": "2025-10-01T00:00:00Z",
- "updated_at": "2025-10-01T00:00:00Z"
}, - "name": "garak",
- "title": "Garak",
- "description": "LLM vulnerability scanner and red-teaming framework",
- "tags": [
- "security",
- "red_team"
], - "runtime": {
- "k8s": {
- "image": "quay.io/trustyai/trustyai-garak-lls-provider-dsp:latest",
- "entrypoint": [
- "python",
- "-m",
- "llama_stack_provider_trustyai_garak.evalhub"
], - "cpu_request": "500m",
- "memory_request": "512Mi",
- "cpu_limit": "2000m",
- "memory_limit": "4Gi"
}
}, - "benchmarks": [
- {
- "id": "owasp_llm_top10",
- "name": "OWASP LLM top 10 risk scan",
- "description": "Tests against the top 10 security risks specific to LLM applications.",
- "category": "security",
- "metrics": [
- "attack_success_rate"
], - "tags": [
- "security",
- "owasp",
- "red_team"
], - "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}, - {
- "id": "quality",
- "name": "Toxic & harmful content scan",
- "description": "Scans for violence, profanity, toxicity, hate speech, and integrity issues.",
- "category": "safety",
- "metrics": [
- "attack_success_rate"
], - "tags": [
- "safety",
- "quality",
- "toxicity",
- "red_team"
], - "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
]
}
Update an existing provider.
| id required | string (Provider Id) Provider ID |
| name required | string Provider name |
| title | string Provider display title |
| description | string Provider description |
| tags | Array of strings Provider tags |
object (AgentMetadata) Agent discoverability metadata for this provider | |
| runtime required | object (Runtime) Provider runtime configuration |
| benchmarks required | Array of objects (BenchmarkResource) Benchmarks offered by this provider |
{- "name": "my-custom-evaluator",
- "title": "Custom Internal Evaluator",
- "description": "Updated evaluation adapter with improved tokenization",
- "tags": [
- "custom",
- "internal"
], - "runtime": {
- "k8s": {
- "image": "registry.internal.example.com/eval/custom-adapter:v2.0",
- "entrypoint": [
- "/opt/app-root/bin/python",
- "/opt/app-root/src/main.py"
], - "cpu_request": "500m",
- "memory_request": "1Gi",
- "cpu_limit": "2",
- "memory_limit": "4Gi"
}
}, - "benchmarks": [
- {
- "id": "domain-qa",
- "name": "Domain Q&A Accuracy",
- "description": "Measures accuracy on domain-specific question answering",
- "category": "reasoning",
- "metrics": [
- "acc",
- "f1"
], - "primary_score": {
- "metric": "acc",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.5
}
}
]
}
{- "resource": {
- "id": "c4d5e6f7-8901-2345-bcde-f67890123456",
- "tenant": "default",
- "created_at": "2026-01-20T10:00:00Z",
- "updated_at": "2026-02-05T14:30:00Z"
}, - "name": "my-custom-evaluator",
- "title": "Custom Internal Evaluator",
- "description": "Updated evaluation adapter with improved tokenization",
- "tags": [
- "custom",
- "internal"
], - "runtime": {
- "k8s": {
- "image": "registry.internal.example.com/eval/custom-adapter:v2.0",
- "entrypoint": [
- "/opt/app-root/bin/python",
- "/opt/app-root/src/main.py"
], - "cpu_request": "500m",
- "memory_request": "1Gi",
- "cpu_limit": "2",
- "memory_limit": "4Gi"
}
}, - "benchmarks": [
- {
- "id": "domain-qa",
- "name": "Domain Q&A Accuracy",
- "description": "Measures accuracy on domain-specific question answering",
- "category": "reasoning",
- "metrics": [
- "acc",
- "f1"
], - "primary_score": {
- "metric": "acc",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.5
}
}
]
}
Partially update an existing provider.
| id required | string (Provider Id) |
| op required | string (PatchOp) Enum: "replace" "add" "remove" Patch operation type |
| path required | string JSON Pointer path |
| value | any Value for add/replace (omit for remove) |
[- {
- "op": "replace",
- "path": "/runtime/k8s/image",
- "value": "registry.internal.example.com/eval/custom-adapter:v2.1"
}, - {
- "op": "replace",
- "path": "/description",
- "value": "Updated evaluation adapter with bug fixes"
}
]
{- "resource": {
- "id": "c4d5e6f7-8901-2345-bcde-f67890123456",
- "tenant": "default",
- "created_at": "2026-01-20T10:00:00Z",
- "updated_at": "2026-02-06T09:15:00Z"
}, - "name": "my-custom-evaluator",
- "title": "Custom Internal Evaluator",
- "description": "Updated evaluation adapter with bug fixes",
- "tags": [
- "custom",
- "internal"
], - "runtime": {
- "k8s": {
- "image": "registry.internal.example.com/eval/custom-adapter:v2.1",
- "entrypoint": [
- "/opt/app-root/bin/python",
- "/opt/app-root/src/main.py"
], - "cpu_request": "500m",
- "memory_request": "1Gi",
- "cpu_limit": "2",
- "memory_limit": "4Gi"
}
}, - "benchmarks": [
- {
- "id": "domain-qa",
- "name": "Domain Q&A Accuracy",
- "description": "Measures accuracy on domain-specific question answering",
- "category": "reasoning",
- "metrics": [
- "acc",
- "f1"
], - "primary_score": {
- "metric": "acc",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.5
}
}
]
}