From afbf6ef7aa87491cf2a315a871dd9b9297071183 Mon Sep 17 00:00:00 2001
From: Axel Palumbo <palumbo.axel@gmail.com>
Date: Thu, 21 May 2026 02:02:14 +0200
Subject: [PATCH] Add NVIDIA ASR NIM Sapat guide

Signed-off-by: Axel Palumbo <palumbo.axel@gmail.com>
---
 authors/absaloncrc.md                         |  10 +
 definitions/20260521_definition_asr_nim.md    |  28 ++
 .../20260521_run_nvidia_asr_nim_with_sapat.md | 284 ++++++++++++++++++
 ...521_run_nvidia_asr_nim_with_sapat_img1.svg |  43 +++
 4 files changed, 365 insertions(+)
 create mode 100644 authors/absaloncrc.md
 create mode 100644 definitions/20260521_definition_asr_nim.md
 create mode 100644 guides/20260521_run_nvidia_asr_nim_with_sapat.md
 create mode 100644 guides/assets/20260521_run_nvidia_asr_nim_with_sapat_img1.svg

diff --git a/authors/absaloncrc.md b/authors/absaloncrc.md
new file mode 100644
index 00000000..956a7b28
--- /dev/null
+++ b/authors/absaloncrc.md
@@ -0,0 +1,10 @@
+Author: absalonCRC
+Title: Open-source contributor
+Description: absalonCRC contributes practical fixes, tests, and technical writing for developer tools, with a focus on small automation workflows, reproducible environments, and AI-assisted engineering documentation.
+Author Image: <https://github.com/absalonCRC.png>
+Author LinkedIn:
+Author Twitter:
+Company Name:
+Company Description:
+Company Logo Dark:
+Company Logo White:
diff --git a/definitions/20260521_definition_asr_nim.md b/definitions/20260521_definition_asr_nim.md
new file mode 100644
index 00000000..7891d17a
--- /dev/null
+++ b/definitions/20260521_definition_asr_nim.md
@@ -0,0 +1,28 @@
+---
+title: "ASR NIM"
+description: "An ASR NIM is a packaged speech recognition inference service that exposes transcription APIs for turning recorded or streaming audio into text."
+date: 2026-05-21
+author: "absalonCRC"
+---
+
+# ASR NIM
+
+## Definition
+
+An ASR NIM is a packaged automatic speech recognition inference service. It
+serves speech-to-text models behind an API so applications can send audio and
+receive transcripts without embedding the full model runtime in the application
+itself.
+
+## Context and Usage
+
+NVIDIA ASR NIM microservices package speech recognition models, optimized
+inference components, and serving APIs into deployable services. In development
+workflows, an ASR NIM can sit behind a transcription tool such as Sapat: the tool
+prepares the audio file, sends it to the NIM endpoint, and stores the returned
+transcript beside the source media.
+
+This pattern is useful when a team wants reproducible local tooling and a
+consistent speech model endpoint. Developers can keep API keys and endpoint
+configuration in the workspace environment while the application code stays
+portable across hosted and self-managed NIM deployments.
diff --git a/guides/20260521_run_nvidia_asr_nim_with_sapat.md b/guides/20260521_run_nvidia_asr_nim_with_sapat.md
new file mode 100644
index 00000000..38163ade
--- /dev/null
+++ b/guides/20260521_run_nvidia_asr_nim_with_sapat.md
@@ -0,0 +1,284 @@
+---
+title: "Run NVIDIA ASR NIM with Sapat"
+description:
+  "Use Sapat in a Daytona workspace to transcribe videos with NVIDIA ASR NIM and
+  save reproducible text transcripts."
+date: 2026-05-21
+author: "absalonCRC"
+tags: ["daytona", "sapat", "transcription", "nvidia"]
+---
+
+# Run NVIDIA ASR NIM with Sapat
+
+## Introduction
+
+Video transcription workflows often fail for reasons that have nothing to do
+with speech recognition quality. A developer starts with a recording from a demo,
+support call, lecture, or internal review, then has to manage ffmpeg, API keys,
+temporary audio files, model choices, and transcript handoff. If those steps
+happen on a personal laptop, the next engineer may not be able to reproduce the
+same environment or rerun the transcript pass with the same settings.
+
+[Sapat](https://github.com/nkkko/sapat) is a small command-line transcription
+tool that keeps that workflow direct. It converts video files to MP3 with
+ffmpeg, sends the audio to a selected transcription provider, and writes the
+result to a `.txt` file beside the original media. This guide shows how to run a
+Sapat workflow in a Daytona workspace using a companion NVIDIA ASR NIM provider
+implementation: [nibzard/sapat#34](https://github.com/nibzard/sapat/pull/34).
+
+NVIDIA describes ASR NIM microservices as speech recognition services that turn
+spoken audio into text and support offline transcription for recorded files.
+That makes the workflow a good fit for product teams that already use NVIDIA's
+hosted NIM endpoints or self-managed NIM deployments and want a repeatable path
+from recording to transcript. Daytona keeps the tooling, configuration, and
+commands in a clean workspace so the workflow can be shared without sharing
+private API keys.
+
+The goal is not to build a large media pipeline. It is to create a small,
+inspectable loop that an AI engineer can trust: start from a known source file,
+run one command, review one text output, and keep enough context to rerun the
+same job later. That is especially useful when transcripts feed downstream
+summaries, retrieval datasets, QA notes, or internal knowledge bases.
+
+![Sapat, Daytona, and NVIDIA ASR NIM workflow](assets/20260521_run_nvidia_asr_nim_with_sapat_img1.svg)
+
+## TL;DR
+
+- Create a Daytona workspace from the Sapat repository.
+- Install Sapat in editable mode so the CLI can use the NVIDIA provider branch.
+- Store `NVIDIA_NIM_API_KEY`, `NVIDIA_NIM_BASE_URL`, and `NVIDIA_NIM_MODEL` in a
+  local `.env` file that is not committed.
+- Run `sapat path/to/video.mp4 --api nvidia --quality M --language en`.
+- Review the generated `.txt` transcript and keep the source video, command, and
+  environment names together for future reruns.
+
+## Prerequisites
+
+You need the following before starting:
+
+- A working Daytona installation.
+- An NVIDIA API key for a hosted NIM endpoint, or access to a self-hosted NVIDIA
+  [ASR NIM](../definitions/20260521_definition_asr_nim.md) endpoint.
+- A short MP4 test file. Start with a small file while validating provider
+  configuration.
+- Basic comfort with Python command-line projects.
+
+The companion provider PR uses NVIDIA's OpenAI-compatible offline transcription
+path at `/v1/audio/transcriptions`. It defaults to the hosted base URL
+`https://integrate.api.nvidia.com/v1` and the model identifier
+`nvidia/parakeet-ctc-0.6b-asr`. If your organization runs a self-hosted NIM,
+point `NVIDIA_NIM_BASE_URL` at that service instead.
+
+## Step 1: Create the Daytona workspace
+
+Start from the Sapat repository so the workspace has the same source layout as
+the upstream project:
+
+```bash
+daytona create https://github.com/nkkko/sapat --code
+```
+
+Open the workspace terminal and install the project in editable mode:
+
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+python3 -m pip install --upgrade pip
+python3 -m pip install -e .
+```
+
+Sapat also needs ffmpeg because the tool converts videos to MP3 before calling a
+provider. Confirm it is available:
+
+```bash
+ffmpeg -version
+```
+
+If the command is missing, install ffmpeg in the workspace image or use your
+team's standard dev container setup. Keeping ffmpeg inside the workspace is
+important: it removes a common source of "works on my machine" transcript
+differences.
+
+## Step 2: Use the NVIDIA provider branch
+
+Until the companion Sapat PR is merged, switch the workspace to the provider
+branch:
+
+```bash
+git remote add absaloncrc https://github.com/absalonCRC/sapat.git
+git fetch absaloncrc absaloncrc/add-nvidia-nim-provider
+git switch -c nvidia-nim-provider FETCH_HEAD
+python3 -m pip install -e .
+```
+
+After the PR is merged upstream, this branch step can be replaced with a normal
+pull from the Sapat main branch.
+
+The provider adds four pieces:
+
+- `src/sapat/transcription/nvidia.py`, which posts audio to the NIM transcription
+  endpoint.
+- `--api nvidia` in the Sapat CLI.
+- README environment variable documentation.
+- A small stdlib test file that validates missing credentials, request
+  construction, and unsupported audio extensions.
+
+This keeps the provider consistent with the existing OpenAI and Groq providers
+without adding a new runtime dependency.
+
+## Step 3: Configure NVIDIA credentials
+
+Create a `.env` file in the Sapat project root:
+
+```bash
+cat > .env <<'ENV'
+NVIDIA_NIM_API_KEY=replace-with-your-key
+NVIDIA_NIM_BASE_URL=https://integrate.api.nvidia.com/v1
+NVIDIA_NIM_MODEL=nvidia/parakeet-ctc-0.6b-asr
+ENV
+```
+
+Do not commit this file. It contains credentials and may also reveal internal
+endpoint names if you use a self-hosted NIM deployment.
+
+For self-hosted deployments, keep the same variable names and change only the
+base URL:
+
+```bash
+NVIDIA_NIM_BASE_URL=http://localhost:9000/v1
+```
+
+NVIDIA's ASR NIM documentation shows local containers exposing HTTP service
+ports such as `9000`, and the model pages document an endpoint of the form
+`/v1/audio/transcriptions`. The exact host, port, and model identifier should
+come from the deployment your team operates.
+
+## Step 4: Run a first transcription
+
+Copy a short MP4 into the workspace, for example:
+
+```bash
+mkdir -p samples
+cp ~/Downloads/product-demo.mp4 samples/product-demo.mp4
+```
+
+Run Sapat with the NVIDIA provider:
+
+```bash
+sapat samples/product-demo.mp4 --api nvidia --quality M --language en
+```
+
+Sapat will:
+
+1. Convert `samples/product-demo.mp4` to `samples/product-demo.mp3`.
+2. Send the MP3 to the NVIDIA ASR NIM transcription endpoint.
+3. Write the transcript to `samples/product-demo.txt`.
+4. Remove the temporary MP3 file after the transcript is saved.
+
+Open the transcript:
+
+```bash
+sed -n '1,80p' samples/product-demo.txt
+```
+
+For the first pass, focus on whether the full recording was processed, whether
+the transcript language matches the source audio, and whether the output file is
+created in the expected location. Once that path works, you can tune quality
+settings and model configuration.
+
+## Step 5: Tune quality and repeatability
+
+Sapat's `--quality` flag controls the MP3 conversion settings before the audio is
+sent to the provider:
+
+| Flag | Conversion target | Practical use |
+| --- | --- | --- |
+| `L` | Lower sample rate and mono audio | Quick tests, short validation runs |
+| `M` | Balanced mono audio | Default choice for most review workflows |
+| `H` | Higher bitrate stereo audio | Source recordings where channel separation matters |
+
+Start with `M`. Move to `H` when the source recording has channel detail that
+you want to preserve, or when you are diagnosing quality issues. Use `L` for
+quick endpoint checks where transcript quality is less important than validating
+credentials and routing.
+
+Record the exact command in a small run log:
+
+```bash
+cat > samples/product-demo.runlog.md <<'EOF'
+# product-demo transcription run
+
+- Source: samples/product-demo.mp4
+- Command: sapat samples/product-demo.mp4 --api nvidia --quality M --language en
+- Provider: NVIDIA ASR NIM
+- Model variable: NVIDIA_NIM_MODEL
+- Output: samples/product-demo.txt
+EOF
+```
+
+This is a small habit, but it makes transcripts easier to audit. Anyone with the
+same source media and workspace configuration can rerun the command and compare
+the output.
+
+## Step 6: Validate the provider path
+
+If you are modifying the provider code or reviewing the companion PR, run the
+focused checks from the Sapat repository:
+
+```bash
+PYTHONPATH=src python3 -m unittest discover -s tests
+PYTHONPATH=src python3 -m py_compile \
+  src/sapat/script.py \
+  src/sapat/transcription/nvidia.py \
+  tests/test_nvidia_transcription.py
+```
+
+The tests mock the network request, so they do not spend NVIDIA credits and do
+not require a real API key. They check that the provider refuses to run without
+`NVIDIA_NIM_API_KEY`, builds the expected `Authorization` header, posts to
+`/audio/transcriptions`, and rejects unsupported file extensions.
+
+## Troubleshooting
+
+**Problem: `NVIDIA_NIM_API_KEY is required for NVIDIA transcription.`**
+
+**Solution:** Confirm the `.env` file exists in the Sapat project root and that
+the key name is exactly `NVIDIA_NIM_API_KEY`. Restart the shell or rerun the
+command from the project root so `python-dotenv` can load the file.
+
+**Problem: The request reaches the wrong host.**
+
+**Solution:** Check `NVIDIA_NIM_BASE_URL`. It should include `/v1` but not
+`/audio/transcriptions`; the provider appends that path automatically.
+
+**Problem: The video converts, but no transcript is saved.**
+
+**Solution:** Rerun with a short sample and inspect the provider error. A hosted
+endpoint may reject the model name, while a self-hosted endpoint may require a
+different base URL or container tag.
+
+**Problem: The transcript is lower quality than expected.**
+
+**Solution:** Try `--quality H`, verify the source audio is clean, and confirm
+that the selected NIM model matches your language and recording type. NVIDIA's
+ASR NIM documentation lists model families and their supported modes; choose a
+model that fits recorded-file transcription rather than live streaming alone.
+
+## Conclusion
+
+Sapat is useful because it keeps the transcription workflow small: convert the
+video, call a provider, save a text file. Adding NVIDIA ASR NIM support gives AI
+engineers another provider option without changing that mental model.
+
+Daytona makes the workflow easier to share. The workspace holds the Python
+environment, ffmpeg dependency, branch, and commands, while secrets stay in a
+local `.env` file. That separation lets teams reproduce transcript runs without
+copying private keys into source control.
+
+## References
+
+- [Sapat repository](https://github.com/nkkko/sapat)
+- [Companion NVIDIA provider PR](https://github.com/nibzard/sapat/pull/34)
+- [NVIDIA ASR NIM overview](https://docs.nvidia.com/nim/speech/latest/asr/index.html)
+- [NVIDIA Parakeet RNNT ASR NIM example](https://docs.nvidia.com/nim/speech/latest/asr/deploy-asr-models/parakeet-rnnt.html)
+- [NVIDIA Parakeet CTC API reference](https://build.nvidia.com/nvidia/parakeet-ctc-0_6b-asr/api.md)
diff --git a/guides/assets/20260521_run_nvidia_asr_nim_with_sapat_img1.svg b/guides/assets/20260521_run_nvidia_asr_nim_with_sapat_img1.svg
new file mode 100644
index 00000000..99fd500c
--- /dev/null
+++ b/guides/assets/20260521_run_nvidia_asr_nim_with_sapat_img1.svg
@@ -0,0 +1,43 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="1280" height="720" viewBox="0 0 1280 720" role="img" aria-labelledby="title desc">
+  <title id="title">Sapat transcription flow with Daytona and NVIDIA ASR NIM</title>
+  <desc id="desc">A workflow diagram showing a Daytona workspace converting video to MP3, sending it to NVIDIA ASR NIM, and saving a text transcript.</desc>
+  <rect width="1280" height="720" fill="#111827"/>
+  <rect x="72" y="92" width="1136" height="536" rx="18" fill="#f8fafc"/>
+  <text x="110" y="154" fill="#111827" font-family="Arial, sans-serif" font-size="38" font-weight="700">Reproducible NVIDIA ASR workflow in Daytona</text>
+  <text x="112" y="198" fill="#4b5563" font-family="Arial, sans-serif" font-size="22">Sapat keeps the local video handling simple while the speech model runs behind a NIM endpoint.</text>
+
+  <g font-family="Arial, sans-serif">
+    <rect x="112" y="282" width="220" height="132" rx="14" fill="#dbeafe" stroke="#1d4ed8" stroke-width="3"/>
+    <text x="142" y="328" fill="#111827" font-size="24" font-weight="700">Video input</text>
+    <text x="142" y="365" fill="#374151" font-size="18">MP4 file or folder</text>
+    <text x="142" y="394" fill="#374151" font-size="18">inside workspace</text>
+
+    <rect x="404" y="282" width="220" height="132" rx="14" fill="#dcfce7" stroke="#15803d" stroke-width="3"/>
+    <text x="436" y="328" fill="#111827" font-size="24" font-weight="700">Sapat</text>
+    <text x="436" y="365" fill="#374151" font-size="18">ffmpeg MP3 pass</text>
+    <text x="436" y="394" fill="#374151" font-size="18">provider routing</text>
+
+    <rect x="696" y="282" width="220" height="132" rx="14" fill="#fef3c7" stroke="#d97706" stroke-width="3"/>
+    <text x="728" y="328" fill="#111827" font-size="24" font-weight="700">NVIDIA NIM</text>
+    <text x="728" y="365" fill="#374151" font-size="18">ASR transcription</text>
+    <text x="728" y="394" fill="#374151" font-size="18">Parakeet model</text>
+
+    <rect x="988" y="282" width="180" height="132" rx="14" fill="#fae8ff" stroke="#a21caf" stroke-width="3"/>
+    <text x="1020" y="328" fill="#111827" font-size="24" font-weight="700">Text file</text>
+    <text x="1020" y="365" fill="#374151" font-size="18">sidecar .txt</text>
+    <text x="1020" y="394" fill="#374151" font-size="18">review-ready</text>
+
+    <path d="M340 348h52" stroke="#111827" stroke-width="4" marker-end="url(#arrow)"/>
+    <path d="M632 348h52" stroke="#111827" stroke-width="4" marker-end="url(#arrow)"/>
+    <path d="M924 348h52" stroke="#111827" stroke-width="4" marker-end="url(#arrow)"/>
+
+    <rect x="210" y="482" width="860" height="74" rx="12" fill="#111827"/>
+    <text x="244" y="528" fill="#f9fafb" font-size="22">daytona create https://github.com/nkkko/sapat --code</text>
+  </g>
+
+  <defs>
+    <marker id="arrow" markerWidth="12" markerHeight="12" refX="10" refY="6" orient="auto">
+      <path d="M2 2l8 4-8 4z" fill="#111827"/>
+    </marker>
+  </defs>
+</svg>