From 62172b290a8f89b4bfa7f2258e74fe810ca8d742 Mon Sep 17 00:00:00 2001 From: Marc Harkonen Date: Thu, 28 May 2026 18:30:59 +0800 Subject: [PATCH 1/4] Remove torchaudio.load calls These may call torchcodec in later versions --- src/liquid_audio/data/mapper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/liquid_audio/data/mapper.py b/src/liquid_audio/data/mapper.py index c1804d3..d955466 100644 --- a/src/liquid_audio/data/mapper.py +++ b/src/liquid_audio/data/mapper.py @@ -2,6 +2,7 @@ import io +import soundfile import torch import torchaudio @@ -233,7 +234,8 @@ def _encode_audio_out(self, *, wav: torch.Tensor, sampling_rate: int) -> torch.T @staticmethod def _load_audio_bytes(audio: bytes) -> tuple[torch.Tensor, int]: with io.BytesIO(audio) as stream: - wav, sampling_rate = torchaudio.load(stream) + data, sampling_rate = soundfile.read(stream, dtype="float32", always_2d=True) + wav = torch.from_numpy(data.T.copy()) if wav.shape[0] > 1: wav = wav.mean(dim=0, keepdim=True) return wav, sampling_rate From e612a7a4d4527a117a0eed94c44699c24d7c2cc5 Mon Sep 17 00:00:00 2001 From: Marc Harkonen Date: Thu, 28 May 2026 18:31:47 +0800 Subject: [PATCH 2/4] update dependencies --- pyproject.toml | 1 - uv.lock | 18 ------------------ 2 files changed, 19 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7ee8c03..432f850 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ dependencies = [ "sentencepiece>=0.2.1", "torch>=2.8.0", "torchaudio>=2.8.0", - "torchcodec>=0.9.1", "transformers>=4.55.4", ] keywords = ["Liquid AI", "LFM", "LFM2", "Audio", "Speech-to-Speech"] diff --git a/uv.lock b/uv.lock index 2c07b5f..89e02c6 100644 --- a/uv.lock +++ b/uv.lock @@ -1040,7 +1040,6 @@ dependencies = [ { name = "sentencepiece" }, { name = "torch" }, { name = "torchaudio" }, - { name = "torchcodec" }, { name = "transformers" }, ] @@ -1066,7 +1065,6 @@ requires-dist = [ { name = "sentencepiece", specifier = ">=0.2.1" }, { name = "torch", specifier = ">=2.8.0" }, { name = "torchaudio", specifier = ">=2.8.0" }, - { name = "torchcodec", specifier = ">=0.9.1" }, { name = "transformers", specifier = ">=4.55.4" }, ] provides-extras = ["demo"] @@ -2720,22 +2718,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/52/27/7fc2d7435af044ffbe0b9b8e98d99eac096d43f128a5cde23c04825d5dcf/torchaudio-2.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d4a715d09ac28c920d031ee1e60ecbc91e8a5079ad8c61c0277e658436c821a6", size = 2549553, upload-time = "2025-08-06T14:59:00.019Z" }, ] -[[package]] -name = "torchcodec" -version = "0.9.1" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4b/60/3bfa459e09987af08e188811b191437c9d8215a74f4d418be6ff7df87b5c/torchcodec-0.9.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8996ec62b72c69545c30246df64df386d06d7ec7de0689be5d20dfc06aad6442", size = 4064264, upload-time = "2025-12-10T15:55:56.313Z" }, - { url = "https://files.pythonhosted.org/packages/17/c8/bfb74babec98aff11ab4f239b0901f39e1a93338b3438e842d864dc46935/torchcodec-0.9.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a50568ce73b70395d113833fb07394c223f5546ef5d4fafe0fdcd91627fca270", size = 2061978, upload-time = "2025-12-10T15:55:33.415Z" }, - { url = "https://files.pythonhosted.org/packages/b2/12/c0bbf01b0ed52b69aaeed4af1043dc8308ccc522a47fcc082b34882e2ba2/torchcodec-0.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:d9c8efe5845bde45a428f96493b4a041511f47f5bd53b333a0ad90426be4623a", size = 2187178, upload-time = "2025-12-10T15:56:16.968Z" }, - { url = "https://files.pythonhosted.org/packages/6f/c7/67fc8417f9efa8a25c00a44f0d674761a0bad9c45e9725e3fd116b3c48ed/torchcodec-0.9.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:5c9cdcba50c75be70ef6ec919ec1f7f14d9d5163d93cf6bd94403e134f03734c", size = 4034415, upload-time = "2025-12-10T15:56:02.04Z" }, - { url = "https://files.pythonhosted.org/packages/68/05/06240f661e9aa08b20765305e3b88f60bff706bbe54ac35830af74612443/torchcodec-0.9.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:0643c5e9c3a51fdafdea87935d5b0a38e99626c664f47a150482d77ab370a877", size = 2067767, upload-time = "2025-12-10T15:55:37.27Z" }, - { url = "https://files.pythonhosted.org/packages/13/a2/d78cd65863fb805d9e35fe90ae7574eab86ff0ae63438208bd07d2cf1fd2/torchcodec-0.9.1-cp313-cp313-win_amd64.whl", hash = "sha256:df0b5a15998fd7457625c2af2a6276e0e710fac158d145045340dbbcd1cfdb65", size = 2186788, upload-time = "2025-12-10T15:56:20.204Z" }, - { url = "https://files.pythonhosted.org/packages/01/02/f8ae9443d3bcbe8a8d6d0bbc3992296e5476e5afa1f244100a3a7967a36c/torchcodec-0.9.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:b9bc5a5dff925df96d11bf90bd0ce964b8086bb11ae09adf353518192b5da483", size = 3812248, upload-time = "2025-12-10T15:56:06.382Z" }, - { url = "https://files.pythonhosted.org/packages/59/a1/8462b55571286847ea31edb7634583125400824267db9ba8301f4ce3f137/torchcodec-0.9.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:65634bb28b3155cf99f980dac31ecedb414c07b8156f8473ec9fb74bedbd2a1f", size = 2068456, upload-time = "2025-12-10T15:55:40.577Z" }, - { url = "https://files.pythonhosted.org/packages/f2/63/752d0fc1c6e8f799ae880ca1087510def663a7f9aa1a70074ae334c6908f/torchcodec-0.9.1-cp314-cp314-win_amd64.whl", hash = "sha256:2d01c8b3685a3a38f050ed2b526808a2938dba6f56cb9f9e967884fd858bba15", size = 2188320, upload-time = "2025-12-10T15:56:24.63Z" }, -] - [[package]] name = "tqdm" version = "4.67.1" From d33bd96284e88b072676d7a29257f9cc21c01eb0 Mon Sep 17 00:00:00 2001 From: Marc Harkonen Date: Tue, 2 Jun 2026 15:01:32 +0900 Subject: [PATCH 3/4] adapt README examples --- README.md | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 2f85600..1bdf5fc 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,11 @@ pip install "liquid-audio [demo]" # optional, to install demo dependencies pip install flash-attn --no-build-isolation # optional, to use flash attention 2. Will fallback to torch SDPA if not installed ``` +For installation on AMD ROCm, don't forget to specify the correct `pytorch` index, e.g. +```bash +pip install liquid-audio --index-url https://download.pytorch.org/whl/rocm7.2 +``` + ## Usage Generation is handled by two generation modes, interleaved and sequential, accessible from the methods `LFM2AudioModel.generate_interleaved` and `LFM2AudioModel.generate_sequential` respectively. Both are generators that yield `torch.Tensor`s. Text tokens are represented by tensors with 1 entry, and audio tokens are tensors with 8 entries, corresponding to 8 [Mimi](https://huggingface.co/docs/transformers/en/model_doc/mimi) codebooks. @@ -60,7 +65,7 @@ https://github.com/user-attachments/assets/d0d054b2-6d1d-49fb-94df-4aa0b6641990 ```python import torch -import torchaudio +import soundfile as sf from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState, LFMModality # Load models @@ -77,7 +82,8 @@ chat.add_text("Respond with interleaved text and audio.") chat.end_turn() chat.new_turn("user") -wav, sampling_rate = torchaudio.load("assets/question.wav") +wav, sampling_rate = sf.read("assets/question.wav", dtype="float32") +wav = torch.from_numpy(wav).unsqueeze(0) chat.add_audio(wav, sampling_rate) chat.end_turn() @@ -102,7 +108,7 @@ for t in model.generate_interleaved(**chat, max_new_tokens=512, audio_temperatur # Mimi returns audio at 24kHz audio_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0) waveform = processor.decode(audio_codes) -torchaudio.save("answer1.wav", waveform.cpu(), 24_000) +sf.write("answer1.wav", waveform.cpu()[0], 24_000) # Append newly generated tokens to chat history chat.append( @@ -132,7 +138,7 @@ for t in model.generate_interleaved(**chat, max_new_tokens=512, audio_temperatur # Detokenize second turn audio, removing the last "end-of-audio" codes audio_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0) waveform = processor.decode(audio_codes) -torchaudio.save("answer2.wav", waveform.cpu(), 24_000) +sf.write("answer2.wav", waveform.cpu()[0], 24_000) ``` @@ -151,7 +157,7 @@ https://github.com/user-attachments/assets/b3cc017f-363d-49f3-8e7d-f6db9556900e ```python import torch -import torchaudio +import soundfile as sf from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState, LFMModality # Load models @@ -168,7 +174,8 @@ chat.add_text("Perform ASR.") chat.end_turn() chat.new_turn("user") -wav, sampling_rate = torchaudio.load("assets/asr.wav") +wav, sampling_rate = sf.read("assets/asr.wav", dtype="float32") +wav = torch.from_numpy(wav).unsqueeze(0) chat.add_audio(wav, sampling_rate) chat.end_turn() @@ -207,7 +214,7 @@ https://github.com/user-attachments/assets/8d57c184-b92e-4e1a-983b-d1f9d16d0d92 ```python import torch -import torchaudio +import soundfile as sf from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState, LFMModality # Load models @@ -238,7 +245,7 @@ for t in model.generate_sequential(**chat, max_new_tokens=512, audio_temperature # Detokenize audio audio_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0) waveform = processor.decode(audio_codes) -torchaudio.save("tts.wav", waveform.cpu(), 24_000) +sf.write("tts.wav", waveform.cpu()[0], 24_000) ``` ## Finetuning @@ -249,12 +256,6 @@ To finetune on your own data, make use of the `ChatMessage` interface. This requ 2. use the [`LFM2AudioChatMapper`](src/liquid_audio/data/mapper.py) to create a preprocessed dataset 3. train a model from the preprocessed dataset with `LFM2DataLoader` -First, install project dependencies: - -```bash -uv sync -``` - ### Preprocess Before training, convert dataset into our preprocessed training format. @@ -274,7 +275,7 @@ See [examples/preprocess_jenny_tts.py](examples/preprocess_jenny_tts.py) for an Run preprocessing with: ```bash -python -m examples.preprocess_jenny_tts +python examples/preprocess_jenny_tts ``` This writes a preprocessed dataset to `data/jenny_tts/train`. @@ -287,7 +288,7 @@ For example, to finetune a model on the [Jenny TTS Dataset](https://huggingface. using the preprocessed dataset from before, run: ```bash -python -m examples.train +python examples/train ``` From c84eea093988ead577f599dbd4cf6f908eddab79 Mon Sep 17 00:00:00 2001 From: Marc Harkonen Date: Tue, 2 Jun 2026 15:33:18 +0900 Subject: [PATCH 4/4] add to README --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1bdf5fc..562bee3 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ We present LFM2-Audio-1.5B, [Liquid AI](https://www.liquid.ai/)'s first end-to-e LFM2-Audio supports two generation modes, interleaved and sequential, to maximize performance and quality across different tasks. Interleaved generation outputs text and audio tokens in a fixed interleaved pattern. This approach minimizes time to first audio output and number of tokens generated, making it ideal for naturally flowing real-time speech-to-speech interactions on resource constrained devices. Sequential generation mode, where the model decides when to switch modalities via special tokens, is suitable for non-conversational tasks, such as speech-to-text (ASR) or text-to-speech (TTS). ### Updates +- [Finetuning](#finetuning) is now supported in both interleaved and sequential generation modes. Version 1.2.0 introduces data preparation tools and a lightweight trainer, enabling users to fine-tune models for a broad range of tasks, from ASR and TTS to function calling and end-to-end speech-to-speech chat. - [LFM2.5-Audio-1.5B](https://huggingface.co/LiquidAI/LFM2.5-Audio-1.5B) is released! This model is based on the stronger LFM2.5-1.2B base, and comes with a lightning fast LFM2 based audio detokenizer, stronger ASR, and better TTS voices. To use the new detokenizer, simply use `processor.decode`, see the examples below for more details. For the improved TTS voices, see the [TTS](#tts) section. ## Installation @@ -15,9 +16,9 @@ pip install "liquid-audio [demo]" # optional, to install demo dependencies pip install flash-attn --no-build-isolation # optional, to use flash attention 2. Will fallback to torch SDPA if not installed ``` -For installation on AMD ROCm, don't forget to specify the correct `pytorch` index, e.g. +For installation on AMD ROCm, don't forget to specify the correct `pytorch` version and index, e.g. ```bash -pip install liquid-audio --index-url https://download.pytorch.org/whl/rocm7.2 +pip install liquid-audio torch==2.12.0+rocm7.2 --extra-index-url https://download.pytorch.org/whl/rocm7.2 ``` ## Usage @@ -275,7 +276,7 @@ See [examples/preprocess_jenny_tts.py](examples/preprocess_jenny_tts.py) for an Run preprocessing with: ```bash -python examples/preprocess_jenny_tts +python examples/preprocess_jenny_tts.py ``` This writes a preprocessed dataset to `data/jenny_tts/train`. @@ -288,7 +289,7 @@ For example, to finetune a model on the [Jenny TTS Dataset](https://huggingface. using the preprocessed dataset from before, run: ```bash -python examples/train +python examples/train.py ```