elastic · Jan-Kazlouski-elastic · Dec 16, 2025 · Dec 4, 2025 · Dec 11, 2025 · Dec 16, 2025
diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json
diff --git a/output/openapi/elasticsearch-serverless-openapi.json b/output/openapi/elasticsearch-serverless-openapi.json
diff --git a/output/schema/schema.json b/output/schema/schema.json
diff --git a/output/typescript/types.ts b/output/typescript/types.ts
diff --git a/specification/_doc_ids/table.csv b/specification/_doc_ids/table.csv
@@ -398,6 +398,7 @@ inference-api-put-huggingface,https://www.elastic.co/docs/api/doc/elasticsearch/
 inference-api-put-jinaai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-jinaai,,
 inference-api-put-llama,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-llama,,
 inference-api-put-mistral,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-mistral,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-mistral.html,
+inference-api-put-nvidia,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-nvidia,,
 inference-api-put-openai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-openai,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-openai.html,
 inference-api-put-openshift-ai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-openshift-ai,,
 inference-api-put-voyageai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-voyageai,,

@@ -0,0 +1,49 @@
+{
+  "inference.put_nvidia": {
+    "documentation": {
+      "url": "https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-nvidia",
+      "description": "Create an Nvidia inference endpoint"
+    },
+    "stability": "stable",
+    "visibility": "public",
+    "headers": {
+      "accept": ["application/json"],
+      "content_type": ["application/json"]
+    },
+    "url": {
+      "paths": [
+        {
+          "path": "/_inference/{task_type}/{nvidia_inference_id}",
+          "methods": ["PUT"],
+          "parts": {
+            "task_type": {
+              "type": "enum",
+              "description": "The task type",
+              "options": [
+                "chat_completion",
+                "completion",
+                "rerank",
+                "text_embedding"
+              ]
+            },
+            "nvidia_inference_id": {
+              "type": "string",
+              "description": "The inference ID"
+            }
+          }
+        }
+      ]
+    },
+    "body": {
+      "description": "The inference endpoint's task and service settings",
+      "required": true
+    },
+    "params": {
+      "timeout": {
+        "type": "time",
+        "description": "Specifies the amount of time to wait for the inference endpoint to be created.",
+        "default": "30s"
+      }
+    }
+  }
+}
diff --git a/specification/inference/_types/CommonTypes.ts b/specification/inference/_types/CommonTypes.ts
@@ -1810,6 +1810,90 @@ export enum MistralServiceType {
   mistral
 }
 
+export class NvidiaServiceSettings {
+  /**
+   * A valid API key for your Nvidia endpoint.
+   * It can be found in the `API Keys` section of your Nvidia account settings.
+   */
+  api_key: string
+  /**
+   * The URL of the Nvidia model endpoint. If not provided, the default endpoint URL is used depending on the task type:
+   *
+   * * For `text_embedding` task - `https://integrate.api.nvidia.com/v1/embeddings`.
+   * * For `completion` and `chat_completion` tasks - `https://integrate.api.nvidia.com/v1/chat/completions`.
+   * * For `rerank` task - `https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking`.
+   */
+  url?: string
+  /**
+   * The name of the model to use for the inference task. Refer to the model's documentation for the name if needed.
+   * The service has been tested and confirmed to work with the following models:
+   *
+   * * For `text_embedding` task - `nvidia/llama-3.2-nv-embedqa-1b-v2`.
+   * * For `completion` and `chat_completion` tasks - `microsoft/phi-3-mini-128k-instruct`.
+   * * For `rerank` task - `nv-rerank-qa-mistral-4b:1`.
+   *
+   * For the `text_embedding` task, the service does not support the `baai/bge-m3` and `nvidia/nvclip` models because they do not recognize the `input_type` parameter.
+   */
+  model_id: string
+  /**
+   * For a `text_embedding` task, the maximum number of tokens per input. Inputs exceeding this value are truncated prior to sending to the Nvidia API.
+   */
+  max_input_tokens?: integer
+  /**
+   * For a `text_embedding` task, the similarity measure. One of cosine, dot_product, l2_norm.
+   */
+  similarity?: NvidiaSimilarityType
+  /**
+   * This setting helps to minimize the number of rate limit errors returned from the Nvidia API.
+   * By default, the `nvidia` service sets the number of requests allowed per minute to 3000.
+   */
+  rate_limit?: RateLimitSetting
+}
+
+export enum NvidiaTaskType {
+  chat_completion,
+  completion,
+  rerank,
+  text_embedding
+}
+
+export enum NvidiaServiceType {
+  nvidia
+}
+
+export enum NvidiaSimilarityType {
+  cosine,
+  dot_product,
+  l2_norm
+}
+
+export class NvidiaTaskSettings {
+  /**
+   * For a `text_embedding` task, type of input sent to the Nvidia endpoint.
+   * Valid values are:
+   *
+   * * `ingest`: Mapped to Nvidia's `passage` value in request. Used when generating embeddings during indexing.
+   * * `search`: Mapped to Nvidia's `query` value in request. Used when generating embeddings during querying.
+   *
+   * IMPORTANT: For Nvidia endpoints, if the `input_type` field is not specified, the request defaults to Nvidia's `query` value, which corresponds to `search`.
+   */
+  input_type?: NvidiaInputType
+  /**
+   * For a `text_embedding` task, the method used by the Nvidia model to handle inputs longer than the maximum token length.
+   * Valid values are:
+   *
+   * * `END`: When the input exceeds the maximum input token length, the end of the input is discarded.
+   * * `NONE`: When the input exceeds the maximum input token length, an error is returned.
+   * * `START`: When the input exceeds the maximum input token length, the start of the input is discarded.
+   */
+  truncate?: CohereTruncateType
+}
+
+export enum NvidiaInputType {
+  ingest,
+  search
+}
+
 export class OpenAIServiceSettings {
   /**
    * A valid API key of your OpenAI account.
@@ -1908,6 +1992,7 @@ export class OpenShiftAiServiceSettings {
   max_input_tokens?: integer
   /**
    * For a `text_embedding` task, the similarity measure. One of cosine, dot_product, l2_norm.
+   * If not specified, the default dot_product value is used.
    */
   similarity?: OpenShiftAiSimilarityType
   /**

diff --git a/specification/inference/_types/Services.ts b/specification/inference/_types/Services.ts
@@ -41,6 +41,7 @@ import {
   TaskTypeJinaAi,
   TaskTypeLlama,
   TaskTypeMistral,
+  TaskTypeNvidia,
   TaskTypeOpenAI,
   TaskTypeOpenShiftAi,
   TaskTypeVoyageAI,
@@ -304,6 +305,17 @@ export class InferenceEndpointInfoMistral extends InferenceEndpoint {
   task_type: TaskTypeMistral
 }
 
+export class InferenceEndpointInfoNvidia extends InferenceEndpoint {
+  /**
+   * The inference ID
+   */
+  inference_id: string
+  /**
+   * The task type
+   */
+  task_type: TaskTypeNvidia
+}
+
 export class InferenceEndpointInfoOpenAI extends InferenceEndpoint {
   /**
    * The inference Id

diff --git a/specification/inference/_types/TaskType.ts b/specification/inference/_types/TaskType.ts
@@ -141,6 +141,13 @@ export enum TaskTypeMistral {
   completion
 }
 
+export enum TaskTypeNvidia {
+  chat_completion,
+  completion,
+  rerank,
+  text_embedding
+}
+
 export enum TaskTypeOpenAI {
   text_embedding,
   chat_completion,

diff --git a/specification/inference/put/PutRequest.ts b/specification/inference/put/PutRequest.ts
@@ -49,6 +49,7 @@ import { TaskType } from '@inference/_types/TaskType'
  * * JinaAI (`rerank`, `text_embedding`)
  * * Llama (`chat_completion`, `completion`, `text_embedding`)
  * * Mistral (`chat_completion`, `completion`, `text_embedding`)
+ * * Nvidia (`chat_completion`, `completion`, `rerank`, `text_embedding`)
  * * OpenAI (`chat_completion`, `completion`, `text_embedding`)
  * * OpenShift AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)
  * * VoyageAI (`rerank`, `text_embedding`)

diff --git a/specification/inference/put_nvidia/PutNvidiaRequest.ts b/specification/inference/put_nvidia/PutNvidiaRequest.ts
@@ -0,0 +1,90 @@
+/*
+ * Licensed to Elasticsearch B.V. under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch B.V. licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { RequestBase } from '@_types/Base'
+import { Id } from '@_types/common'
+import { Duration } from '@_types/Time'
+import {
+  NvidiaServiceSettings,
+  NvidiaServiceType,
+  NvidiaTaskSettings,
+  NvidiaTaskType
+} from '@inference/_types/CommonTypes'
+import { InferenceChunkingSettings } from '@inference/_types/Services'
+
+/**
+ * Create an Nvidia inference endpoint.
+ *
+ * Create an inference endpoint to perform an inference task with the `nvidia` service.
+ * @rest_spec_name inference.put_nvidia
+ * @availability stack since=9.3.0 stability=stable visibility=public
+ * @availability serverless stability=stable visibility=public
+ * @cluster_privileges manage_inference
+ * @doc_id inference-api-put-nvidia
+ */
+export interface Request extends RequestBase {
+  urls: [
+    {
+      path: '/_inference/{task_type}/{nvidia_inference_id}'
+      methods: ['PUT']
+    }
+  ]
+  path_parts: {
+    /**
+     * The type of the inference task that the model will perform.
+     * NOTE: The `chat_completion` task type only supports streaming and only through the `_stream` API.
+     */
+    task_type: NvidiaTaskType
+    /**
+     * The unique identifier of the inference endpoint.
+     */
+    nvidia_inference_id: Id
+  }
+  query_parameters: {
+    /**
+     * Specifies the amount of time to wait for the inference endpoint to be created.
+     * @server_default 30s
+     */
+    timeout?: Duration
+  }
+  body: {
+    /**
+     * The chunking configuration object.
+     * Applies only to the `text_embedding` task type.
+     * Not applicable to the `rerank`, `completion`, or `chat_completion` task types.
+     * @ext_doc_id inference-chunking
+     */
+    chunking_settings?: InferenceChunkingSettings
+    /**
+     * The type of service supported for the specified task type. In this case, `nvidia`.
+     */
+    service: NvidiaServiceType
+    /**
+     * Settings used to install the inference model. These settings are specific to the `nvidia` service.
+     */
+    service_settings: NvidiaServiceSettings
+    /**
+     * Settings to configure the inference task.
+     * Applies only to the `text_embedding` task type.
+     * Not applicable to the `rerank`, `completion`, or `chat_completion` task types.
+     * These settings are specific to the task type you specified.
+     */
+    task_settings?: NvidiaTaskSettings
+  }
+}
diff --git a/specification/inference/put_nvidia/PutNvidiaResponse.ts b/specification/inference/put_nvidia/PutNvidiaResponse.ts
@@ -0,0 +1,25 @@
+/*
+ * Licensed to Elasticsearch B.V. under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch B.V. licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { InferenceEndpointInfoNvidia } from '@inference/_types/Services'
+
+export class Response {
+  /** @codegen_name endpoint_info */
+  body: InferenceEndpointInfoNvidia
+}