Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,8 @@ OPENAI_API_KEY=
OPENAI_MODEL=whisper-1
OPENAI_API_ENDPOINT=https://api.openai.com/v1/audio/transcriptions
OPENAI_MODEL_NAME_CHAT=gpt-4o

CLOUDFLARE_ACCOUNT_ID=
CLOUDFLARE_API_TOKEN=
CLOUDFLARE_WHISPER_MODEL=@cf/openai/whisper
CLOUDFLARE_MAX_FILE_SIZE_MB=25
22 changes: 18 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
# Video Transcription Tool

This tool automates the process of transcribing video files using multiple transcription services: Azure OpenAI, Groq, and OpenAI APIs. It converts video files to MP3 format, transcribes the audio, and saves the transcription as a text file.
This tool automates the process of transcribing video files using multiple transcription services: Azure OpenAI, Cloudflare Workers AI, Groq, and OpenAI APIs. It converts video files to MP3 format, transcribes the audio, and saves the transcription as a text file.

## Features

- Converts video files to MP3 format using ffmpeg
- Supports transcription using Azure OpenAI, Groq, and OpenAI APIs
- Supports transcription using Azure OpenAI, Cloudflare Workers AI, Groq, and OpenAI APIs
- Supports processing of individual video files or entire directories
- Cleans up temporary MP3 files after transcription
- Provides flexibility in selecting the transcription service via configuration

## Prerequisites

- Python 3.6+
- Python 3.8+
- ffmpeg installed and available in the system PATH
- API access for one or more supported services:
- Azure OpenAI API
- Cloudflare Workers AI API
- Groq Cloud API
- OpenAI API

Expand Down Expand Up @@ -53,6 +54,12 @@ This tool automates the process of transcribing video files using multiple trans
OPENAI_MODEL=whisper-1
OPENAI_API_ENDPOINT=https://api.openai.com/v1/audio/transcriptions
OPENAI_MODEL_NAME_CHAT=gpt-4o

# Cloudflare Workers AI
CLOUDFLARE_ACCOUNT_ID=your_cloudflare_account_id_here
CLOUDFLARE_API_TOKEN=your_cloudflare_api_token_here
CLOUDFLARE_WHISPER_MODEL=@cf/openai/whisper
CLOUDFLARE_MAX_FILE_SIZE_MB=25
```

## Building and Installing the Package
Expand Down Expand Up @@ -113,6 +120,7 @@ sapat <video_file_or_directory> [--language <language>] [--prompt <prompt>] [--t
- `--quality`: Quality of the MP3 audio: 'L' for low, 'M' for medium, and 'H' for high (default: 'M').
- `--api`: Specify the API to use for transcription.
- `--api azure` for Azure OpenAI API
- `--api cloudflare` for Cloudflare Workers AI
- `--api groq` for Groq Cloud API
- `--api openai` for OpenAI API

Expand All @@ -122,14 +130,20 @@ Example:
sapat my_video.mp4 --quality H --language es --prompt "This is a test prompt" --temperature 0.5 --api groq
```

Cloudflare Workers AI example:

```
sapat my_video.mp4 --quality M --api cloudflare
```

- If a file is provided, it will process that single file.
- If a directory is provided, it will process all `.mp4` files in that directory.

The script will create a `.txt` file with the same name as the input video file, containing the transcription.

## Note

This tool is designed for use with multiple APIs (Azure OpenAI, Groq, and OpenAI). Ensure you have valid API credentials configured in the `.env` file and the necessary permissions and credits for the API service you plan to use.
This tool is designed for use with multiple APIs (Azure OpenAI, Cloudflare Workers AI, Groq, and OpenAI). Ensure you have valid API credentials configured in the `.env` file and the necessary permissions and credits for the API service you plan to use.

## License

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
name = "sapat"
version = "0.1.2"
description = "Video Transcription Tool using different APIs"
requires-python = ">=3.6"
requires-python = ">=3.8"
dependencies = [
"click>=8.1.8",
"requests>=2.32.3",
Expand Down
5 changes: 4 additions & 1 deletion src/sapat/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from pathlib import Path
from .transcription.groq import GroqCloudTranscription
from .transcription.azure import AzureTranscription
from .transcription.cloudflare import CloudflareTranscription
from .transcription.openai import OpenAITranscription

@click.command()
Expand All @@ -11,7 +12,7 @@
@click.option("--temperature", "-t", type=float, default=0.3, help="Sampling temperature (default: 0.3)")
@click.option("--quality", "-q", type=click.Choice(['L', 'M', 'H'], case_sensitive=False), default='M', help="Quality of the MP3 audio: 'L' for low, 'M' for medium, and 'H' for high (default: 'M')")
@click.option("--correct", is_flag=True, help="Use LLM to correct the transcript")
@click.option("--api", "-a", type=click.Choice(['openai', 'groq', 'azure'], case_sensitive=True), required=True, help="API to use for the transcription ('openai', 'groq' or 'azure')")
@click.option("--api", "-a", type=click.Choice(['openai', 'groq', 'azure', 'cloudflare'], case_sensitive=True), required=True, help="API to use for the transcription ('openai', 'groq', 'azure' or 'cloudflare')")
def main(input_path, language, prompt, temperature, quality, correct, api):
"""
Transcribe video files using different APIs.
Expand All @@ -26,6 +27,8 @@ def main(input_path, language, prompt, temperature, quality, correct, api):
transcriber = GroqCloudTranscription(temperature=temperature)
elif api.lower() == "azure":
transcriber = AzureTranscription(temperature=temperature)
elif api.lower() == "cloudflare":
transcriber = CloudflareTranscription(temperature=temperature)
elif api.lower() == "openai":
transcriber = OpenAITranscription(temperature=temperature)
else:
Expand Down
106 changes: 106 additions & 0 deletions src/sapat/transcription/cloudflare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import os
from pathlib import Path

import requests
from dotenv import load_dotenv

from .base import TranscriptionBase

# Load environment variables
load_dotenv(".env")


class CloudflareTranscription(TranscriptionBase):
"""
Cloudflare Workers AI implementation for transcription.
"""

def __init__(self, temperature: float, response_format: str = "json"):
"""
Initializes the CloudflareTranscription class.

Parameters:
- temperature (float): Default temperature value for transcription.
- response_format (str): Default response format for transcription.
"""
self.api_token = os.getenv("CLOUDFLARE_API_TOKEN")
self.account_id = os.getenv("CLOUDFLARE_ACCOUNT_ID")
self.model = os.getenv("CLOUDFLARE_WHISPER_MODEL", "@cf/openai/whisper")
self.endpoint = os.getenv("CLOUDFLARE_API_ENDPOINT")
self.temperature = temperature
self.response_format = response_format
self.max_file_size_mb = int(os.getenv("CLOUDFLARE_MAX_FILE_SIZE_MB", "25"))

def transcribe_audio(self, audio_file: str, **kwargs):
"""
Transcribes an audio file using Cloudflare Workers AI Whisper.

Parameters:
- audio_file (str): Path to the audio file.
- kwargs: Additional parameters accepted for CLI compatibility.

Returns:
- dict or str: The transcription result.
"""
self._validate_audio_file(audio_file)
self._validate_configuration()

headers = {
"Authorization": f"Bearer {self.api_token}",
"Content-Type": self._content_type(audio_file),
}

with open(audio_file, "rb") as f:
response = requests.post(self._transcription_endpoint(), headers=headers, data=f.read())

if response.status_code != 200:
raise Exception(f"Transcription failed: {response.text}")

payload = response.json()
if isinstance(payload, dict) and "result" in payload:
return payload["result"]
return payload

def _transcription_endpoint(self):
if self.endpoint:
return self.endpoint

return f"https://api.cloudflare.com/client/v4/accounts/{self.account_id}/ai/run/{self.model}"

def _validate_configuration(self):
if not self.api_token:
raise ValueError("CLOUDFLARE_API_TOKEN is required for Cloudflare transcription.")

if not self.account_id and not self.endpoint:
raise ValueError("CLOUDFLARE_ACCOUNT_ID is required unless CLOUDFLARE_API_ENDPOINT is set.")

def _validate_audio_file(self, audio_file: str):
"""
Validates the audio file for size and format.

Parameters:
- audio_file (str): Path to the audio file.

Raises:
- Exception: If the file is invalid.
"""
if not os.path.exists(audio_file):
raise ValueError(f"File {audio_file} does not exist.")

file_size_mb = os.path.getsize(audio_file) / (1024 * 1024)
if file_size_mb > self.max_file_size_mb:
raise Exception(f"File size exceeds the maximum limit of {self.max_file_size_mb} MB.")

valid_extensions = [".mp3", ".wav", ".flac"]
if not any(str(audio_file).endswith(ext) for ext in valid_extensions):
raise ValueError(f"Unsupported audio file format: {audio_file}. Supported formats are {valid_extensions}.")

@staticmethod
def _content_type(audio_file: str):
extension = Path(audio_file).suffix.lower()
content_types = {
".flac": "audio/flac",
".mp3": "audio/mpeg",
".wav": "audio/wav",
}
return content_types.get(extension, "application/octet-stream")
71 changes: 71 additions & 0 deletions tests/test_cloudflare_transcription.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import os
import tempfile
import unittest
from unittest.mock import Mock, patch

from sapat.transcription.cloudflare import CloudflareTranscription


class CloudflareTranscriptionTest(unittest.TestCase):
def setUp(self):
self.previous_env = {
"CLOUDFLARE_ACCOUNT_ID": os.environ.get("CLOUDFLARE_ACCOUNT_ID"),
"CLOUDFLARE_API_TOKEN": os.environ.get("CLOUDFLARE_API_TOKEN"),
"CLOUDFLARE_API_ENDPOINT": os.environ.get("CLOUDFLARE_API_ENDPOINT"),
"CLOUDFLARE_WHISPER_MODEL": os.environ.get("CLOUDFLARE_WHISPER_MODEL"),
}
os.environ["CLOUDFLARE_ACCOUNT_ID"] = "account-123"
os.environ["CLOUDFLARE_API_TOKEN"] = "token-abc"
os.environ.pop("CLOUDFLARE_API_ENDPOINT", None)
os.environ.pop("CLOUDFLARE_WHISPER_MODEL", None)

def tearDown(self):
for key, value in self.previous_env.items():
if value is None:
os.environ.pop(key, None)
else:
os.environ[key] = value

@patch("sapat.transcription.cloudflare.requests.post")
def test_transcribe_audio_posts_binary_audio_to_workers_ai(self, post):
post.return_value = Mock(
status_code=200,
text="ok",
json=Mock(return_value={"success": True, "result": {"text": "hello"}}),
)

with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio:
audio.write(b"audio-bytes")
path = audio.name

try:
result = CloudflareTranscription(temperature=0.3).transcribe_audio(path)
finally:
os.unlink(path)

self.assertEqual({"text": "hello"}, result)
post.assert_called_once_with(
"https://api.cloudflare.com/client/v4/accounts/account-123/ai/run/@cf/openai/whisper",
headers={
"Authorization": "Bearer token-abc",
"Content-Type": "audio/mpeg",
},
data=b"audio-bytes",
)

def test_missing_token_raises_actionable_error(self):
os.environ.pop("CLOUDFLARE_API_TOKEN", None)

with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio:
audio.write(b"audio-bytes")
path = audio.name

try:
with self.assertRaisesRegex(ValueError, "CLOUDFLARE_API_TOKEN"):
CloudflareTranscription(temperature=0.3).transcribe_audio(path)
finally:
os.unlink(path)


if __name__ == "__main__":
unittest.main()