From 62172b290a8f89b4bfa7f2258e74fe810ca8d742 Mon Sep 17 00:00:00 2001
From: Marc Harkonen <marc@liquid.ai>
Date: Thu, 28 May 2026 18:30:59 +0800
Subject: [PATCH 1/4] Remove torchaudio.load calls

torchaudio.load may call torchcodec in later versions, so decode with soundfile instead.
---
 src/liquid_audio/data/mapper.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/liquid_audio/data/mapper.py b/src/liquid_audio/data/mapper.py
index c1804d3..d955466 100644
--- a/src/liquid_audio/data/mapper.py
+++ b/src/liquid_audio/data/mapper.py
@@ -2,6 +2,7 @@
 
 import io
 
+import soundfile
 import torch
 import torchaudio
 
@@ -233,7 +234,8 @@ def _encode_audio_out(self, *, wav: torch.Tensor, sampling_rate: int) -> torch.T
     @staticmethod
     def _load_audio_bytes(audio: bytes) -> tuple[torch.Tensor, int]:
         with io.BytesIO(audio) as stream:
-            wav, sampling_rate = torchaudio.load(stream)
+            data, sampling_rate = soundfile.read(stream, dtype="float32", always_2d=True)
+        wav = torch.from_numpy(data.T.copy())
         if wav.shape[0] > 1:
             wav = wav.mean(dim=0, keepdim=True)
         return wav, sampling_rate

From e612a7a4d4527a117a0eed94c44699c24d7c2cc5 Mon Sep 17 00:00:00 2001
From: Marc Harkonen <marc@liquid.ai>
Date: Thu, 28 May 2026 18:31:47 +0800
Subject: [PATCH 2/4] update dependencies

---
 pyproject.toml |  1 -
 uv.lock        | 18 ------------------
 2 files changed, 19 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7ee8c03..432f850 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,6 @@ dependencies = [
     "sentencepiece>=0.2.1",
     "torch>=2.8.0",
     "torchaudio>=2.8.0",
-    "torchcodec>=0.9.1",
     "transformers>=4.55.4",
 ]
 keywords = ["Liquid AI", "LFM", "LFM2", "Audio", "Speech-to-Speech"]
diff --git a/uv.lock b/uv.lock
index 2c07b5f..89e02c6 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1040,7 +1040,6 @@ dependencies = [
     { name = "sentencepiece" },
     { name = "torch" },
     { name = "torchaudio" },
-    { name = "torchcodec" },
     { name = "transformers" },
 ]
 
@@ -1066,7 +1065,6 @@ requires-dist = [
     { name = "sentencepiece", specifier = ">=0.2.1" },
     { name = "torch", specifier = ">=2.8.0" },
     { name = "torchaudio", specifier = ">=2.8.0" },
-    { name = "torchcodec", specifier = ">=0.9.1" },
     { name = "transformers", specifier = ">=4.55.4" },
 ]
 provides-extras = ["demo"]
@@ -2720,22 +2718,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/52/27/7fc2d7435af044ffbe0b9b8e98d99eac096d43f128a5cde23c04825d5dcf/torchaudio-2.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d4a715d09ac28c920d031ee1e60ecbc91e8a5079ad8c61c0277e658436c821a6", size = 2549553, upload-time = "2025-08-06T14:59:00.019Z" },
 ]
 
-[[package]]
-name = "torchcodec"
-version = "0.9.1"
-source = { registry = "https://pypi.org/simple" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/4b/60/3bfa459e09987af08e188811b191437c9d8215a74f4d418be6ff7df87b5c/torchcodec-0.9.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8996ec62b72c69545c30246df64df386d06d7ec7de0689be5d20dfc06aad6442", size = 4064264, upload-time = "2025-12-10T15:55:56.313Z" },
-    { url = "https://files.pythonhosted.org/packages/17/c8/bfb74babec98aff11ab4f239b0901f39e1a93338b3438e842d864dc46935/torchcodec-0.9.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a50568ce73b70395d113833fb07394c223f5546ef5d4fafe0fdcd91627fca270", size = 2061978, upload-time = "2025-12-10T15:55:33.415Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/12/c0bbf01b0ed52b69aaeed4af1043dc8308ccc522a47fcc082b34882e2ba2/torchcodec-0.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:d9c8efe5845bde45a428f96493b4a041511f47f5bd53b333a0ad90426be4623a", size = 2187178, upload-time = "2025-12-10T15:56:16.968Z" },
-    { url = "https://files.pythonhosted.org/packages/6f/c7/67fc8417f9efa8a25c00a44f0d674761a0bad9c45e9725e3fd116b3c48ed/torchcodec-0.9.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:5c9cdcba50c75be70ef6ec919ec1f7f14d9d5163d93cf6bd94403e134f03734c", size = 4034415, upload-time = "2025-12-10T15:56:02.04Z" },
-    { url = "https://files.pythonhosted.org/packages/68/05/06240f661e9aa08b20765305e3b88f60bff706bbe54ac35830af74612443/torchcodec-0.9.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:0643c5e9c3a51fdafdea87935d5b0a38e99626c664f47a150482d77ab370a877", size = 2067767, upload-time = "2025-12-10T15:55:37.27Z" },
-    { url = "https://files.pythonhosted.org/packages/13/a2/d78cd65863fb805d9e35fe90ae7574eab86ff0ae63438208bd07d2cf1fd2/torchcodec-0.9.1-cp313-cp313-win_amd64.whl", hash = "sha256:df0b5a15998fd7457625c2af2a6276e0e710fac158d145045340dbbcd1cfdb65", size = 2186788, upload-time = "2025-12-10T15:56:20.204Z" },
-    { url = "https://files.pythonhosted.org/packages/01/02/f8ae9443d3bcbe8a8d6d0bbc3992296e5476e5afa1f244100a3a7967a36c/torchcodec-0.9.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:b9bc5a5dff925df96d11bf90bd0ce964b8086bb11ae09adf353518192b5da483", size = 3812248, upload-time = "2025-12-10T15:56:06.382Z" },
-    { url = "https://files.pythonhosted.org/packages/59/a1/8462b55571286847ea31edb7634583125400824267db9ba8301f4ce3f137/torchcodec-0.9.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:65634bb28b3155cf99f980dac31ecedb414c07b8156f8473ec9fb74bedbd2a1f", size = 2068456, upload-time = "2025-12-10T15:55:40.577Z" },
-    { url = "https://files.pythonhosted.org/packages/f2/63/752d0fc1c6e8f799ae880ca1087510def663a7f9aa1a70074ae334c6908f/torchcodec-0.9.1-cp314-cp314-win_amd64.whl", hash = "sha256:2d01c8b3685a3a38f050ed2b526808a2938dba6f56cb9f9e967884fd858bba15", size = 2188320, upload-time = "2025-12-10T15:56:24.63Z" },
-]
-
 [[package]]
 name = "tqdm"
 version = "4.67.1"

From d33bd96284e88b072676d7a29257f9cc21c01eb0 Mon Sep 17 00:00:00 2001
From: Marc Harkonen <marc@liquid.ai>
Date: Tue, 2 Jun 2026 15:01:32 +0900
Subject: [PATCH 3/4] adapt README examples

---
 README.md | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 2f85600..1bdf5fc 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,11 @@ pip install "liquid-audio [demo]" # optional, to install demo dependencies
 pip install flash-attn --no-build-isolation  # optional, to use flash attention 2. Will fallback to torch SDPA if not installed
 ```
 
+For installation on AMD ROCm, don't forget to specify the correct `pytorch` index, e.g.
+```bash
+pip install liquid-audio --index-url https://download.pytorch.org/whl/rocm7.2
+```
+
 ## Usage
 Generation is handled by two generation modes, interleaved and sequential, accessible from the methods `LFM2AudioModel.generate_interleaved` and `LFM2AudioModel.generate_sequential` respectively. Both are generators that yield `torch.Tensor`s. Text tokens are represented by tensors with 1 entry, and audio tokens are tensors with 8 entries, corresponding to 8 [Mimi](https://huggingface.co/docs/transformers/en/model_doc/mimi) codebooks.
 
@@ -60,7 +65,7 @@ https://github.com/user-attachments/assets/d0d054b2-6d1d-49fb-94df-4aa0b6641990
 
 ```python
 import torch
-import torchaudio
+import soundfile as sf
 from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState, LFMModality
 
 # Load models
@@ -77,7 +82,8 @@ chat.add_text("Respond with interleaved text and audio.")
 chat.end_turn()
 
 chat.new_turn("user")
-wav, sampling_rate = torchaudio.load("assets/question.wav")
+wav, sampling_rate = sf.read("assets/question.wav", dtype="float32")
+wav = torch.from_numpy(wav).unsqueeze(0)
 chat.add_audio(wav, sampling_rate)
 chat.end_turn()
 
@@ -102,7 +108,7 @@ for t in model.generate_interleaved(**chat, max_new_tokens=512, audio_temperatur
 # Mimi returns audio at 24kHz
 audio_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0)
 waveform = processor.decode(audio_codes)
-torchaudio.save("answer1.wav", waveform.cpu(), 24_000)
+sf.write("answer1.wav", waveform.cpu()[0], 24_000)
 
 # Append newly generated tokens to chat history
 chat.append(
@@ -132,7 +138,7 @@ for t in model.generate_interleaved(**chat, max_new_tokens=512, audio_temperatur
 # Detokenize second turn audio, removing the last "end-of-audio" codes
 audio_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0)
 waveform = processor.decode(audio_codes)
-torchaudio.save("answer2.wav", waveform.cpu(), 24_000)
+sf.write("answer2.wav", waveform.cpu()[0], 24_000)
 ```
 
 
@@ -151,7 +157,7 @@ https://github.com/user-attachments/assets/b3cc017f-363d-49f3-8e7d-f6db9556900e
 
 ```python
 import torch
-import torchaudio
+import soundfile as sf
 from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState, LFMModality
 
 # Load models
@@ -168,7 +174,8 @@ chat.add_text("Perform ASR.")
 chat.end_turn()
 
 chat.new_turn("user")
-wav, sampling_rate = torchaudio.load("assets/asr.wav")
+wav, sampling_rate = sf.read("assets/asr.wav", dtype="float32")
+wav = torch.from_numpy(wav).unsqueeze(0)
 chat.add_audio(wav, sampling_rate)
 chat.end_turn()
 
@@ -207,7 +214,7 @@ https://github.com/user-attachments/assets/8d57c184-b92e-4e1a-983b-d1f9d16d0d92
 
 ```python
 import torch
-import torchaudio
+import soundfile as sf
 from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState, LFMModality
 
 # Load models
@@ -238,7 +245,7 @@ for t in model.generate_sequential(**chat, max_new_tokens=512, audio_temperature
 # Detokenize audio
 audio_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0)
 waveform = processor.decode(audio_codes)
-torchaudio.save("tts.wav", waveform.cpu(), 24_000)
+sf.write("tts.wav", waveform.cpu()[0], 24_000)
 ```
 
 ## Finetuning
@@ -249,12 +256,6 @@ To finetune on your own data, make use of the `ChatMessage` interface. This requ
 2. use the [`LFM2AudioChatMapper`](src/liquid_audio/data/mapper.py) to create a preprocessed dataset
 3. train a model from the preprocessed dataset with `LFM2DataLoader`
 
-First, install project dependencies:
-
-```bash
-uv sync
-```
-
 ### Preprocess
 
 Before training, convert dataset into our preprocessed training format.
@@ -274,7 +275,7 @@ See [examples/preprocess_jenny_tts.py](examples/preprocess_jenny_tts.py) for an
 Run preprocessing with:
 
 ```bash
-python -m examples.preprocess_jenny_tts
+python examples/preprocess_jenny_tts
 ```
 
 This writes a preprocessed dataset to `data/jenny_tts/train`.
@@ -287,7 +288,7 @@ For example, to finetune a model on the [Jenny TTS Dataset](https://huggingface.
 using the preprocessed dataset from before, run:
 
 ```bash
-python -m examples.train
+python examples/train
 ```
 
 

From c84eea093988ead577f599dbd4cf6f908eddab79 Mon Sep 17 00:00:00 2001
From: Marc Harkonen <marc@liquid.ai>
Date: Tue, 2 Jun 2026 15:33:18 +0900
Subject: [PATCH 4/4] README: announce finetuning, fix ROCm and example commands

---
 README.md | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 1bdf5fc..562bee3 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@ We present LFM2-Audio-1.5B, [Liquid AI](https://www.liquid.ai/)'s first end-to-e
 LFM2-Audio supports two generation modes, interleaved and sequential, to maximize performance and quality across different tasks. Interleaved generation outputs text and audio tokens in a fixed interleaved pattern. This approach minimizes time to first audio output and number of tokens generated, making it ideal for naturally flowing real-time speech-to-speech interactions on resource constrained devices. Sequential generation mode, where the model decides when to switch modalities via special tokens, is suitable for non-conversational tasks, such as speech-to-text (ASR) or text-to-speech (TTS).
 
 ### Updates
+- [Finetuning](#finetuning) is now supported in both interleaved and sequential generation modes. Version 1.2.0 introduces data preparation tools and a lightweight trainer, enabling users to fine-tune models for a broad range of tasks, from ASR and TTS to function calling and end-to-end speech-to-speech chat.
 - [LFM2.5-Audio-1.5B](https://huggingface.co/LiquidAI/LFM2.5-Audio-1.5B) is released! This model is based on the stronger LFM2.5-1.2B base, and comes with a lightning fast LFM2 based audio detokenizer, stronger ASR, and better TTS voices. To use the new detokenizer, simply use `processor.decode`, see the examples below for more details. For the improved TTS voices, see the [TTS](#tts) section.
 
 ## Installation
@@ -15,9 +16,9 @@ pip install "liquid-audio [demo]" # optional, to install demo dependencies
 pip install flash-attn --no-build-isolation  # optional, to use flash attention 2. Will fallback to torch SDPA if not installed
 ```
 
-For installation on AMD ROCm, don't forget to specify the correct `pytorch` index, e.g.
+For installation on AMD ROCm, don't forget to specify the correct `pytorch` version and index, e.g.
 ```bash
-pip install liquid-audio --index-url https://download.pytorch.org/whl/rocm7.2
+pip install liquid-audio torch==2.12.0+rocm7.2 --extra-index-url https://download.pytorch.org/whl/rocm7.2
 ```
 
 ## Usage
@@ -275,7 +276,7 @@ See [examples/preprocess_jenny_tts.py](examples/preprocess_jenny_tts.py) for an
 Run preprocessing with:
 
 ```bash
-python examples/preprocess_jenny_tts
+python examples/preprocess_jenny_tts.py
 ```
 
 This writes a preprocessed dataset to `data/jenny_tts/train`.
@@ -288,7 +289,7 @@ For example, to finetune a model on the [Jenny TTS Dataset](https://huggingface.
 using the preprocessed dataset from before, run:
 
 ```bash
-python examples/train
+python examples/train.py
 ```