From d573e341f064b420ffd844bdefd0552deb26a7d4 Mon Sep 17 00:00:00 2001
From: abridegan-bit <abridegan@gmail.com>
Date: Tue, 26 May 2026 17:07:44 -0400
Subject: [PATCH] Add STT.ai Sapat Daytona guide

Signed-off-by: abridegan-bit <abridegan@gmail.com>
---
 authors/abridegan-bit.md                      |   8 +
 definitions/20260526_definition_stt_ai.md     |  23 ++
 ..._ai_transcription_with_sapat_in_daytona.md | 275 ++++++++++++++++++
 ...20260526_stt_ai_sapat_daytona_workflow.svg |  36 +++
 4 files changed, 342 insertions(+)
 create mode 100644 authors/abridegan-bit.md
 create mode 100644 definitions/20260526_definition_stt_ai.md
 create mode 100644 guides/20260526_guide_run_stt_ai_transcription_with_sapat_in_daytona.md
 create mode 100644 guides/assets/20260526_stt_ai_sapat_daytona_workflow.svg

diff --git a/authors/abridegan-bit.md b/authors/abridegan-bit.md
new file mode 100644
index 00000000..fa0ff429
--- /dev/null
+++ b/authors/abridegan-bit.md
@@ -0,0 +1,8 @@
+Author: abridegan-bit
+Title: Open Source Contributor
+Description: abridegan-bit works on practical developer tooling, automation, and documentation workflows that make open source projects easier to test, review, and reproduce across clean environments.
+Author Image: ![abridegan-bit](https://avatars.githubusercontent.com/u/279021788?v=4)
+Author LinkedIn:
+Author Twitter:
+Company Name: Independent
+Company Description: Independent open source contributor focused on developer experience and workflow automation.
diff --git a/definitions/20260526_definition_stt_ai.md b/definitions/20260526_definition_stt_ai.md
new file mode 100644
index 00000000..4a328f49
--- /dev/null
+++ b/definitions/20260526_definition_stt_ai.md
@@ -0,0 +1,23 @@
+---
+title: 'STT.ai'
+description: 'A speech-to-text API for transcribing audio and video files with REST endpoints, diarization, and multiple output formats.'
+date: 2026-05-26
+author: 'abridegan-bit'
+---
+
+# STT.ai
+
+## Definition
+
+STT.ai is a speech-to-text service that exposes REST and WebSocket APIs for
+transcribing audio and video files. Its REST API accepts multipart file uploads
+and can return transcript text, JSON segments, subtitles, speaker labels, and
+word-level timing information depending on the request options.
+
+## Context and Usage
+
+Developers can use STT.ai when they need a transcription API that works with
+common media formats and can be called from standard HTTP clients. In a Daytona
+workspace, it can be wired into command-line tools such as Sapat so audio
+processing, API configuration, and transcript validation happen in a repeatable
+development environment.
diff --git a/guides/20260526_guide_run_stt_ai_transcription_with_sapat_in_daytona.md b/guides/20260526_guide_run_stt_ai_transcription_with_sapat_in_daytona.md
new file mode 100644
index 00000000..576b1bcc
--- /dev/null
+++ b/guides/20260526_guide_run_stt_ai_transcription_with_sapat_in_daytona.md
@@ -0,0 +1,275 @@
+---
+title: "Run STT.ai transcription with Sapat in Daytona"
+description: "Build a reproducible Daytona workspace for Sapat, add the STT.ai provider, and transcribe video or audio files with one CLI."
+date: 2026-05-26
+author: "abridegan-bit"
+tags: ["daytona", "speech-to-text", "python"]
+---
+
+# Run STT.ai transcription with Sapat in Daytona
+
+## Introduction
+
+Audio and video transcripts are easiest to trust when the workflow is
+repeatable. A local laptop can hide package versions, missing FFmpeg binaries,
+and one-off environment variables. A
+[Daytona workspace](../definitions/20240819_definition_daytona%20workspace.md)
+gives the transcription job a clean development environment that can be
+recreated when a teammate, reviewer, or future you needs to run it again.
+
+This guide shows how to run [Sapat](https://github.com/nibzard/sapat), a small
+Python CLI for converting video files to MP3 and sending them to a
+speech-to-text provider, with a new STT.ai provider implementation. STT.ai is
+useful for this workflow because its REST API accepts multipart audio or video
+uploads, supports common media formats, and can return JSON, plain text, SRT,
+or VTT output from a single `POST /v1/transcribe` endpoint.
+
+The companion provider implementation is available in [nibzard/sapat#51](https://github.com/nibzard/sapat/pull/51). It adds `--api sttai`, documents the `STTAI_*` environment variables, and includes mocked tests so the request payload can be reviewed without a live API key.
+
+![STT.ai Sapat workflow](assets/20260526_stt_ai_sapat_daytona_workflow.svg)
+
+## Why this setup works well
+
+Transcription jobs are deceptively stateful. The input format, conversion
+quality, model choice, speaker settings, API key, and output format all affect
+the final transcript. Keeping those choices inside a Daytona workspace turns
+the job into a reproducible recipe instead of a memory exercise. Reviewers can
+see the provider code, run the same tests, and decide whether a transcript
+issue comes from media quality, local conversion, or the remote transcription
+model.
+
+That separation also makes it safer to experiment. You can start with the
+anonymous STT.ai path for a tiny sample, move to a real API key when you need
+larger files, and keep every secret outside the repository. If another provider
+does better on an accent or noisy recording, Sapat still gives you the same CLI
+surface for comparison.
+
+## TL;DR
+
+- **Use Daytona for repeatability**: Open Sapat in a clean workspace instead of relying on local machine state.
+- **Use the STT.ai provider**: Select it with `--api sttai` and configure it with `STTAI_API_KEY`, `STTAI_MODEL`, and related environment variables.
+- **Keep secrets out of Git**: Store API keys in `.env` or Daytona workspace environment settings, never in commits.
+- **Validate before processing real media**: Confirm the CLI sees `sttai`, run the mocked tests, then transcribe a short sample file.
+
+## Prerequisites
+
+You will need:
+
+- A GitHub account and access to fork or clone the Sapat repository.
+- Daytona installed locally. Follow the [Daytona installation guide](https://www.daytona.io/docs/installation/installation/) if you do not already have the CLI.
+- Docker or another runtime supported by your Daytona setup.
+- An STT.ai API key for regular use. STT.ai documents limited anonymous transcription, but a key is better for repeatable work.
+- A short test video or audio file, preferably under one minute while you are validating the workflow.
+
+The commands below assume a Unix-like shell inside the workspace. The same project structure works on any host that Daytona supports because the important dependencies live inside the workspace.
+
+## Step 1: Open Sapat in Daytona
+
+Create a Daytona workspace from your fork or from the upstream repository:
+
+```bash
+daytona create https://github.com/nibzard/sapat --code
+```
+
+When the workspace opens, create a branch for the STT.ai provider work if you are applying the companion PR locally:
+
+```bash
+git checkout -b sttai-provider
+```
+
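+Alternatively, to test the submitted provider directly before it is merged, skip the previous command and fetch the PR branch instead. Git refuses to fetch into a branch that is currently checked out, and the `pull/51/head` ref exists only on the upstream repository, so `origin` must point to upstream rather than to your fork: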
+
+```bash
+git fetch origin pull/51/head:sttai-provider
+git checkout sttai-provider
+```
+
+This keeps the experiment isolated. If you later decide to compare the STT.ai path with OpenAI, Groq, or Azure, you can switch branches without rebuilding your host machine.
+
+## Step 2: Install the project dependencies
+
+Sapat is a Python package. Create a virtual environment inside the workspace and install the project in editable mode:
+
+```bash
+python -m venv .venv
+source .venv/bin/activate
+python -m pip install --upgrade pip
+python -m pip install -e .
+```
+
+Confirm that the CLI is available:
+
+```bash
+sapat --help
+```
+
+The `--api` option should include `sttai`:
+
+```text
+--api [openai|groq|azure|sttai]
+```
+
+If you do not see `sttai`, check that you are on the branch that includes the provider implementation.
+
+## Step 3: Install FFmpeg in the workspace
+
+Sapat converts media to MP3 before transcription. The conversion step depends on FFmpeg:
+
+```bash
+ffmpeg -version
+```
+
+If the command is missing in your workspace, install it through your workspace image or package manager. On Debian or Ubuntu-based images:
+
+```bash
+sudo apt-get update
+sudo apt-get install -y ffmpeg
+```
+
+For a long-lived team workflow, put the FFmpeg installation in your dev container or workspace setup so every contributor gets the same runtime automatically.
+
+## Step 4: Configure STT.ai
+
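+Create a local `.env` file and keep it untracked. Before you commit anything, confirm that `.gitignore` covers it (`git check-ignore .env` prints the path when the file is ignored):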
+
+```bash
+cat > .env <<'EOF'
+STTAI_API_KEY=replace-with-your-key
+STTAI_MODEL=large-v3-turbo
+STTAI_API_ENDPOINT=https://api.stt.ai/v1/transcribe
+STTAI_DIARIZE=true
+STTAI_SPEAKERS=0
+STTAI_RESPONSE_FORMAT=json
+EOF
+```
+
+The provider reads these values:
+
+| Variable | Purpose | Default |
+| --- | --- | --- |
+| `STTAI_API_KEY` | Optional bearer token for STT.ai | Empty |
+| `STTAI_MODEL` | Transcription model sent to the API | `large-v3-turbo` |
+| `STTAI_API_ENDPOINT` | STT.ai transcription endpoint | `https://api.stt.ai/v1/transcribe` |
+| `STTAI_DIARIZE` | Enables speaker diarization | `true` |
+| `STTAI_SPEAKERS` | Expected speaker count, where `0` means auto | `0` |
+| `STTAI_RESPONSE_FORMAT` | API output format | `json` |
+| `STTAI_MAX_FILE_SIZE_MB` | Local file-size guard before upload | `500` |
+
+STT.ai documents `json`, `txt`, `srt`, and `vtt` response formats. Keep `json` while testing because Sapat can write the returned `text` field directly to the output `.txt` file.
+
+## Step 5: Run the mocked validation
+
+Before sending any real media to STT.ai, run the unit tests for the provider:
+
+```bash
+python -m unittest discover -s tests -v
+```
+
+The STT.ai tests check that Sapat:
+
+- posts the media file as multipart form data;
+- sends model, language, diarization, speaker, and response-format options;
+- includes the bearer token only when `STTAI_API_KEY` is present;
+- rejects missing input files before making a network request.
+
+These tests do not call STT.ai. They are meant to make code review deterministic and safe.
+
+## Step 6: Transcribe a short sample
+
+Place a short video or audio file in the workspace, for example at `./samples/demo.mp4` (create the `samples/` directory if it does not exist), then run:
+
+```bash
+sapat ./samples/demo.mp4 --api sttai --language auto --quality M
+```
+
+Sapat will:
+
+1. Convert `demo.mp4` to `demo.mp3` with FFmpeg.
+2. Upload `demo.mp3` to STT.ai through `POST /v1/transcribe`.
+3. Save the returned transcript to `demo.txt`.
+4. Remove the temporary MP3 file.
+
+For a single-speaker English clip, you can make the request more specific:
+
+```bash
+STTAI_DIARIZE=false STTAI_SPEAKERS=1 sapat ./samples/demo.mp4 --api sttai --language en --quality H
+```
+
+For a folder of videos:
+
+```bash
+sapat ./recordings --api sttai --language auto --quality M
+```
+
+Sapat currently scans directories for `.mp4` files. If your source files are already audio, run them one by one or extend the directory scan in a separate patch.
+
+## Step 7: Review the output
+
+Open the generated `.txt` file:
+
+```bash
+sed -n '1,120p' ./samples/demo.txt
+```
+
+Check for:
+
+- speaker labels if diarization is enabled;
+- obvious language-detection mistakes;
+- domain-specific words that may need a follow-up correction pass;
+- timestamps or subtitle output if you decide to switch `STTAI_RESPONSE_FORMAT` to `srt` or `vtt`.
+
+If you need subtitle files, set the response format before running Sapat:
+
+```bash
+STTAI_RESPONSE_FORMAT=srt sapat ./samples/demo.mp4 --api sttai --language auto
+```
+
+In that mode, the provider returns text instead of JSON. Sapat will still save it to the `.txt` path, so rename the output to `.srt` after the run or add a future improvement that chooses the extension from the response format.
+
+## Troubleshooting
+
+**Problem:** `sapat --help` does not show `sttai`.
+
+**Solution:** Check out the branch that contains the STT.ai provider and reinstall the package in editable mode:
+
+```bash
+git checkout sttai-provider
+python -m pip install -e .
+```
+
+**Problem:** The command fails before uploading.
+
+**Solution:** Confirm the file exists, the extension is supported, and FFmpeg can read it:
+
+```bash
+ffmpeg -i ./samples/demo.mp4 -f null -
+```
+
+**Problem:** STT.ai returns `401 Unauthorized`.
+
+**Solution:** Verify `STTAI_API_KEY` in `.env` and make sure the workspace shell has loaded it. If you use Daytona-managed environment variables, restart the workspace shell after changing them.
+
+**Problem:** STT.ai returns `402 Payment Required` or `429 Too Many Requests`.
+
+**Solution:** Use a shorter sample while testing, check your account credits, or wait for the rate-limit window to reset. For CI and PR review, rely on the mocked unit tests instead of live transcription.
+
+**Problem:** The transcript is empty or in the wrong language.
+
+**Solution:** Try `--language auto` first, then specify an ISO language code such as `en`, `es`, or `fr`. Also test with a higher-quality MP3 conversion:
+
+```bash
+sapat ./samples/demo.mp4 --api sttai --language auto --quality H
+```
+
+## Conclusion
+
+You now have a reproducible Daytona workflow for running Sapat with STT.ai. The workspace holds the Python environment, FFmpeg dependency, provider code, and validation commands in one place, while secrets stay in `.env` or Daytona environment settings.
+
+The main benefit is reviewability. A teammate can fetch the same Sapat branch, run the mocked tests, inspect the exact `POST /v1/transcribe` request shape, and then try a short real sample with their own STT.ai key. That makes the transcription pipeline easier to debug than a one-off local script.
+
+## References
+
+- [Sapat repository](https://github.com/nibzard/sapat)
+- [Companion STT.ai provider PR](https://github.com/nibzard/sapat/pull/51)
+- [STT.ai API documentation](https://stt.ai/api/)
+- [Daytona installation guide](https://www.daytona.io/docs/installation/installation/)
diff --git a/guides/assets/20260526_stt_ai_sapat_daytona_workflow.svg b/guides/assets/20260526_stt_ai_sapat_daytona_workflow.svg
new file mode 100644
index 00000000..b47752a5
--- /dev/null
+++ b/guides/assets/20260526_stt_ai_sapat_daytona_workflow.svg
@@ -0,0 +1,36 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="1200" height="520" viewBox="0 0 1200 520" role="img" aria-labelledby="title desc">
+  <title id="title">STT.ai Sapat workflow in Daytona</title>
+  <desc id="desc">A workflow diagram showing media files processed in a Daytona workspace by Sapat, sent to STT.ai, and saved as transcripts.</desc>
+  <rect width="1200" height="520" fill="#f7fafc"/>
+  <rect x="70" y="70" width="1060" height="380" rx="18" fill="#ffffff" stroke="#1f2937" stroke-width="3"/>
+  <text x="100" y="125" font-family="Arial, sans-serif" font-size="30" font-weight="700" fill="#111827">Reproducible transcription workflow</text>
+  <text x="100" y="160" font-family="Arial, sans-serif" font-size="18" fill="#4b5563">Run Sapat inside Daytona, keep secrets local, and send only the prepared media file to STT.ai.</text>
+
+  <g font-family="Arial, sans-serif">
+    <rect x="105" y="225" width="185" height="110" rx="12" fill="#e0f2fe" stroke="#0369a1" stroke-width="2"/>
+    <text x="136" y="267" font-size="22" font-weight="700" fill="#0c4a6e">Media input</text>
+    <text x="132" y="296" font-size="16" fill="#075985">MP4, MP3, WAV</text>
+
+    <path d="M310 280 H405" stroke="#374151" stroke-width="4" fill="none"/>
+    <path d="M405 280 L388 268 M405 280 L388 292" stroke="#374151" stroke-width="4" fill="none"/>
+
+    <rect x="425" y="205" width="245" height="150" rx="12" fill="#ecfdf5" stroke="#047857" stroke-width="2"/>
+    <text x="547.5" y="248" text-anchor="middle" font-size="22" font-weight="700" fill="#065f46">Daytona workspace</text>
+    <text x="465" y="279" font-size="16" fill="#047857">Python venv + FFmpeg</text>
+    <text x="465" y="305" font-size="16" fill="#047857">Sapat --api sttai</text>
+
+    <path d="M690 280 H785" stroke="#374151" stroke-width="4" fill="none"/>
+    <path d="M785 280 L768 268 M785 280 L768 292" stroke="#374151" stroke-width="4" fill="none"/>
+
+    <rect x="805" y="205" width="185" height="150" rx="12" fill="#fff7ed" stroke="#c2410c" stroke-width="2"/>
+    <text x="852" y="248" font-size="22" font-weight="700" fill="#7c2d12">STT.ai</text>
+    <text x="838" y="279" font-size="16" fill="#9a3412">/v1/transcribe</text>
+    <text x="835" y="305" font-size="16" fill="#9a3412">JSON, TXT, SRT</text>
+
+    <path d="M900 358 C900 410 745 410 670 345" stroke="#374151" stroke-width="4" fill="none"/>
+    <path d="M670 345 L690 348 M670 345 L679 363" stroke="#374151" stroke-width="4" fill="none"/>
+
+    <rect x="425" y="370" width="245" height="55" rx="12" fill="#eef2ff" stroke="#4f46e5" stroke-width="2"/>
+    <text x="547.5" y="405" text-anchor="middle" font-size="19" font-weight="700" fill="#312e81">Transcript saved locally</text>
+  </g>
+</svg>