From b0db793f4d44e934114639df7b84d192ac43a168 Mon Sep 17 00:00:00 2001
From: EnesBrt <enes.barut@gmx.com>
Date: Wed, 20 May 2026 20:58:38 +0200
Subject: [PATCH] Add IBM Watson Sapat transcription guide

Signed-off-by: EnesBrt <enes.barut@gmx.com>
---
 authors/ness.md                               |   6 +
 ...finition_synchronous_speech_recognition.md |  21 ++
 ...son_transcription_with_sapat_in_daytona.md | 287 ++++++++++++++++++
 ...anscription_with_sapat_in_daytona_img1.svg |  26 ++
 4 files changed, 340 insertions(+)
 create mode 100644 authors/ness.md
 create mode 100644 definitions/20260520_definition_synchronous_speech_recognition.md
 create mode 100644 guides/20260520_run_ibm_watson_transcription_with_sapat_in_daytona.md
 create mode 100644 guides/assets/20260520_run_ibm_watson_transcription_with_sapat_in_daytona_img1.svg

diff --git a/authors/ness.md b/authors/ness.md
new file mode 100644
index 00000000..5a49f69c
--- /dev/null
+++ b/authors/ness.md
@@ -0,0 +1,6 @@
+Author: Ness Title: Open-source contributor Description: Ness contributes
+workflow-focused developer documentation and small open-source tooling examples,
+with an interest in reproducible environments, AI-assisted engineering, and
+practical command line workflows. Author Image:
+[https://avatars.githubusercontent.com/u/52550279?v=4] Author LinkedIn: Author
+Twitter: Company Name: Independent Company Description: Independent contributor
diff --git a/definitions/20260520_definition_synchronous_speech_recognition.md b/definitions/20260520_definition_synchronous_speech_recognition.md
new file mode 100644
index 00000000..c6808a9d
--- /dev/null
+++ b/definitions/20260520_definition_synchronous_speech_recognition.md
@@ -0,0 +1,21 @@
+---
+title: 'Synchronous Speech Recognition'
+description: 'A speech recognition request pattern where one API call sends audio and waits for the transcript response.'
+date: 2026-05-20
+author: 'Ness'
+---
+
+# Synchronous Speech Recognition
+
+## Definition
+
+Synchronous speech recognition is a speech-to-text request pattern where a
+client sends audio to a recognition service and waits for the completed
+transcript response before the request finishes.
+
+## Context and Usage
+
+Synchronous speech recognition is commonly used for short recordings, smoke
+tests, batch scripts, and simple command line tools. It is easier to integrate
+than live streaming or asynchronous job polling, but it is best suited to audio
+that can be processed within the provider's request limits.
diff --git a/guides/20260520_run_ibm_watson_transcription_with_sapat_in_daytona.md b/guides/20260520_run_ibm_watson_transcription_with_sapat_in_daytona.md
new file mode 100644
index 00000000..fdec45a7
--- /dev/null
+++ b/guides/20260520_run_ibm_watson_transcription_with_sapat_in_daytona.md
@@ -0,0 +1,287 @@
+---
+title: "Run IBM Watson Transcription with Sapat"
+description:
+  "Build a reproducible Daytona workflow for transcribing video files with Sapat
+  and IBM Watson Speech to Text."
+date: 2026-05-20
+author: "Ness"
+tags: ["ai", "transcription", "daytona", "python"]
+---
+
+# Run IBM Watson Transcription with Sapat
+
+# Introduction
+
+A transcription workflow is easiest to trust when every run is boring in the
+same way. The same command should convert the source file, call the chosen
+provider, write the transcript, and leave enough notes that another engineer can
+repeat the job. That is hard to maintain when audio tools, Python dependencies,
+API keys, and generated text files are scattered across a developer laptop.
+
+[Sapat](https://github.com/nkkko/sapat) gives this workflow a small command line
+shape. It accepts video files, converts them to MP3 with `ffmpeg`, sends the
+audio to a transcription provider, and saves a `.txt` transcript beside the
+source file. Running Sapat inside a [Daytona workspace](https://www.daytona.io/)
+keeps the toolchain and secrets in an isolated development environment. Adding
+IBM Watson Speech to Text gives teams another provider option for
+[synchronous speech recognition](../definitions/20260520_definition_synchronous_speech_recognition.md),
+where one HTTP request sends the audio and returns recognition results after the
+service finishes processing the file.
+
+This guide walks through a Daytona-based Sapat workflow backed by IBM Watson
+Speech to Text. It is written around a companion Sapat implementation that adds
+`--api ibm`, environment-based IBM configuration, and mocked tests for the
+recognition request and transcript extraction path.
+
+## TL;DR
+
+- Use Daytona to run Sapat in a reproducible workspace rather than configuring
+  Python, `ffmpeg`, and credentials on your host machine.
+- Add `IBM_WATSON_STT_API_KEY`, `IBM_WATSON_STT_URL`, and optional model
+  settings in `.env`.
+- Run `sapat <file-or-directory> --api ibm` to convert MP4 files to MP3 and send
+  the generated audio to IBM Watson Speech to Text.
+- Review the generated `.txt` file before sharing it downstream.
+- Use mocked tests to verify the provider wiring without sending audio to IBM.
+
+## Workflow Overview
+
+![IBM Watson and Sapat workflow](./assets/20260520_run_ibm_watson_transcription_with_sapat_in_daytona_img1.svg)
+
+Sapat is useful because it keeps the local file workflow simple:
+
+- A source `.mp4` file is the input.
+- A temporary `.mp3` file is generated with `ffmpeg`.
+- The selected provider receives the audio.
+- A `.txt` transcript is written next to the source file.
+- The temporary MP3 is removed.
+
+The IBM Watson Speech to Text HTTP API fits this flow because its basic
+`POST /v1/recognize` request accepts an audio stream and returns a JSON response
+with transcript alternatives. The provider can send the MP3 with basic
+authentication, pass the configured IBM model as a query parameter, read
+`results[].alternatives[0].transcript`, and return a single transcript string to
+Sapat's existing file writer.
+
+## Prerequisites
+
+Before starting, make sure you have:
+
+- Daytona installed and connected to your Git provider.
+- An IBM Cloud Speech to Text service instance.
+- The service API key and service URL from IBM Cloud.
+- A short MP4 file for the first smoke test.
+- Basic comfort with Python commands and `.env` files.
+
+Start with short audio. A one minute product demo, lecture excerpt, or support
+call sample is enough to validate the pipeline. Once the workspace and provider
+configuration are proven, you can run longer recordings or directories of files.
+
+## Step 1: Create a Daytona Workspace
+
+Create a workspace from the Sapat repository:
+
+```bash
+daytona create https://github.com/nkkko/sapat --code
+```
+
+Open the workspace in your editor. If you are testing the companion IBM Watson
+branch, fetch it inside the workspace:
+
+```bash
+git remote add enes https://github.com/EnesBrt/sapat.git
+git fetch enes codex/ibm-watson-transcription-provider
+git switch -c ibm-watson-transcription enes/codex/ibm-watson-transcription-provider
+```
+
+The companion implementation is available at
+[nibzard/sapat#27](https://github.com/nibzard/sapat/pull/27).
+
+## Step 2: Install Sapat and ffmpeg
+
+Install Sapat in editable mode from the project root:
+
+```bash
+python -m pip install --upgrade pip
+python -m pip install -e .
+```
+
+Then confirm `ffmpeg` is available:
+
+```bash
+ffmpeg -version
+```
+
+If `ffmpeg` is missing and your Daytona workspace uses a Debian or Ubuntu based
+image, install it with:
+
+```bash
+sudo apt-get update
+sudo apt-get install -y ffmpeg
+```
+
+The important part is that this setup lives in the workspace. Your host machine
+does not need another Python environment, another global command line tool, or a
+set of long-lived media-processing dependencies.
+
+## Step 3: Configure IBM Watson
+
+Create a `.env` file in the Sapat project root:
+
+```bash
+IBM_WATSON_STT_API_KEY=your_ibm_watson_api_key_here
+IBM_WATSON_STT_URL=https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/your_instance_id
+IBM_WATSON_STT_MODEL=en-US_BroadbandModel
+IBM_WATSON_STT_SMART_FORMATTING=true
+```
+
+The first two variables are required. `IBM_WATSON_STT_MODEL` defaults to
+`en-US_BroadbandModel`, which matches IBM's basic recognition examples for
+English broadband audio. `IBM_WATSON_STT_SMART_FORMATTING` is passed through as
+a query parameter so the service can format dates, times, numbers, and similar
+entities when the selected model supports it.
+
+Keep `.env` out of commits. The API key belongs in the workspace environment or
+secret manager, never in guide text, PR bodies, transcripts, or screenshots.
+
+## Step 4: Run a Smoke Test
+
+Run Sapat against one short file first:
+
+```bash
+sapat ./samples/customer-demo.mp4 --api ibm --language en --quality M
+```
+
+The IBM provider currently uses the configured Watson model rather than deriving
+the model from `--language`. Keep `--language` in the command when it helps you
+document the intent of the run, but choose the actual recognition model through
+`IBM_WATSON_STT_MODEL`.
+
+During the run, Sapat will:
+
+1. Convert `customer-demo.mp4` to `customer-demo.mp3`.
+2. Send the MP3 bytes to `{IBM_WATSON_STT_URL}/v1/recognize`.
+3. Authenticate with basic auth using `apikey` and your API key.
+4. Pass the configured model and smart-formatting setting as query parameters.
+5. Extract the top transcript alternatives from IBM's JSON response.
+6. Write `customer-demo.txt`.
+7. Delete the temporary MP3 file.
+
+For a folder of recordings, point Sapat at the folder:
+
+```bash
+sapat ./recordings --api ibm --quality M
+```
+
+Sapat processes `.mp4` files in the directory and writes one `.txt` file for
+each input file.
+
+## Step 5: Review and Package the Transcript
+
+Open the generated transcript before it leaves the workspace. A good review pass
+is simple:
+
+- Confirm the transcript covers the full recording.
+- Check product names, acronyms, and customer names.
+- Remove private or irrelevant sections before sharing.
+- Add a short run note beside the transcript.
+
+Example run note:
+
+```text
+source: customer-demo.mp4
+provider: ibm-watson-speech-to-text
+model: en-US_BroadbandModel
+quality: M
+review status: names checked, roadmap details redacted
+```
+
+This note turns the transcript from a loose text file into a reviewable artifact.
+When a teammate asks where the text came from, the answer is in the workspace
+next to the output.
+
+## Step 6: Validate Without Calling IBM
+
+The companion provider includes a mocked unit test:
+
+```bash
+PYTHONPATH=src python -m unittest tests.test_ibm_watson
+```
+
+The test verifies that the provider:
+
+- Sends audio to `/v1/recognize`.
+- Uses `("apikey", IBM_WATSON_STT_API_KEY)` for basic auth.
+- Sets `Content-Type` based on the generated audio file extension.
+- Passes the Watson model and smart-formatting values as query parameters.
+- Extracts transcript text from IBM's `results` array.
+
+Run this test in CI or before opening provider changes. It catches wiring
+mistakes without sending private audio to a third-party API.
+
+## Common Issues and Troubleshooting
+
+**Problem:** `IBM_WATSON_STT_API_KEY is required for IBM Watson transcription.`
+
+**Solution:** Confirm that `.env` exists in the Sapat project root and that the
+workspace terminal is running Sapat from that root.
+
+**Problem:** `IBM_WATSON_STT_URL is required for IBM Watson transcription.`
+
+**Solution:** Copy the service URL from the IBM Cloud Speech to Text instance,
+including the `/instances/...` path, and store it in `.env`.
+
+**Problem:** IBM returns an authentication error.
+
+**Solution:** Check that the API key belongs to the same service instance as the
+service URL. IBM's examples use basic auth with username `apikey` and the API
+key as the password.
+
+**Problem:** The transcript quality is poor.
+
+**Solution:** Try `--quality H`, verify the source audio track, and choose an IBM
+model that matches the language and audio type. For noisy sources, test a short
+excerpt before running the full folder.
+
+**Problem:** The workflow works locally but not in automation.
+
+**Solution:** Keep the same variable names, but inject them through your CI or
+workspace secret manager. Run the mocked test on every change and reserve real
+provider calls for intentional smoke-test jobs.
+
+## Why Daytona Helps
+
+Transcription pipelines are easy to underestimate. They touch local media files,
+provider credentials, generated artifacts, and sometimes sensitive customer or
+internal conversation data. Daytona gives the workflow a clean boundary: create
+the workspace, install the dependencies, run Sapat, review the transcript, keep
+the useful artifacts, and remove the environment when the job is complete.
+
+That boundary makes the process easier to hand to another engineer. Instead of
+"install these tools and set these variables somewhere on your machine," the
+workflow becomes "open this workspace, check `.env`, run this command, review
+this output." It is a small operational difference, but it is the difference
+between an ad hoc transcript and a repeatable engineering intake process.
+
+## Conclusion
+
+IBM Watson Speech to Text gives Sapat another practical provider for teams that
+already use IBM Cloud or want a synchronous HTTP transcription path. Daytona
+keeps the surrounding workflow contained: Python dependencies, `ffmpeg`, media
+files, API credentials, generated transcripts, and run notes all live in the
+workspace.
+
+The habit is what matters: start with a short smoke test, configure the provider
+through `.env`, review the generated text before sharing it, and keep the source
+file and run note together. With that routine in place, Sapat can turn product
+demos, customer calls, interviews, lectures, and QA recordings into text without
+turning every transcription request into a fresh setup project.
+
+## References
+
+- [Sapat repository](https://github.com/nkkko/sapat)
+- [IBM Speech to Text getting started](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-gettingStarted)
+- [IBM Speech to Text synchronous HTTP interface](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-http)
+- [IBM Speech to Text API reference](https://cloud.ibm.com/apidocs/speech-to-text)
+- [Companion IBM Watson provider PR](https://github.com/nibzard/sapat/pull/27)
+- [Daytona](https://www.daytona.io/)
diff --git a/guides/assets/20260520_run_ibm_watson_transcription_with_sapat_in_daytona_img1.svg b/guides/assets/20260520_run_ibm_watson_transcription_with_sapat_in_daytona_img1.svg
new file mode 100644
index 00000000..faf4a346
--- /dev/null
+++ b/guides/assets/20260520_run_ibm_watson_transcription_with_sapat_in_daytona_img1.svg
@@ -0,0 +1,26 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="1200" height="520" viewBox="0 0 1200 520" role="img" aria-labelledby="title desc">
+  <title id="title">Sapat and IBM Watson transcription workflow</title>
+  <desc id="desc">A workflow diagram showing Daytona, Sapat, IBM Watson Speech to Text, and transcript output.</desc>
+  <rect width="1200" height="520" fill="#f8fafc"/>
+  <rect x="70" y="95" width="210" height="150" rx="18" fill="#dbeafe" stroke="#1d4ed8" stroke-width="3"/>
+  <text x="175" y="155" text-anchor="middle" font-family="Arial, sans-serif" font-size="24" font-weight="700" fill="#0f172a">Daytona</text>
+  <text x="175" y="192" text-anchor="middle" font-family="Arial, sans-serif" font-size="18" fill="#334155">workspace</text>
+  <rect x="360" y="95" width="210" height="150" rx="18" fill="#dcfce7" stroke="#15803d" stroke-width="3"/>
+  <text x="465" y="155" text-anchor="middle" font-family="Arial, sans-serif" font-size="24" font-weight="700" fill="#0f172a">Sapat CLI</text>
+  <text x="465" y="192" text-anchor="middle" font-family="Arial, sans-serif" font-size="18" fill="#334155">MP4 to MP3</text>
+  <rect x="650" y="95" width="250" height="150" rx="18" fill="#eef2ff" stroke="#4338ca" stroke-width="3"/>
+  <text x="775" y="155" text-anchor="middle" font-family="Arial, sans-serif" font-size="24" font-weight="700" fill="#0f172a">IBM Watson</text>
+  <text x="775" y="192" text-anchor="middle" font-family="Arial, sans-serif" font-size="18" fill="#334155">POST /v1/recognize</text>
+  <rect x="430" y="320" width="360" height="110" rx="18" fill="#ffffff" stroke="#475569" stroke-width="3"/>
+  <text x="610" y="366" text-anchor="middle" font-family="Arial, sans-serif" font-size="24" font-weight="700" fill="#0f172a">Transcript</text>
+  <text x="610" y="402" text-anchor="middle" font-family="Arial, sans-serif" font-size="18" fill="#334155">saved as .txt for review</text>
+  <path d="M280 170 H360" stroke="#0f172a" stroke-width="4" fill="none" marker-end="url(#arrow)"/>
+  <path d="M570 170 H650" stroke="#0f172a" stroke-width="4" fill="none" marker-end="url(#arrow)"/>
+  <path d="M775 245 C775 290 720 320 690 335" stroke="#0f172a" stroke-width="4" fill="none" marker-end="url(#arrow)"/>
+  <path d="M465 245 C465 285 500 315 530 335" stroke="#0f172a" stroke-width="4" fill="none" marker-end="url(#arrow)"/>
+  <defs>
+    <marker id="arrow" markerWidth="12" markerHeight="12" refX="10" refY="6" orient="auto">
+      <path d="M2 2 L10 6 L2 10 Z" fill="#0f172a"/>
+    </marker>
+  </defs>
+</svg>