diff --git a/README.md b/README.md index ded1c03..ec540da 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,9 @@ We present LFM2-Audio-1.5B, [Liquid AI](https://www.liquid.ai/)'s first end-to-e LFM2-Audio supports two generation modes, interleaved and sequential, to maximize performance and quality across different tasks. Interleaved generation outputs text and audio tokens in a fixed interleaved pattern. This approach minimizes time to first audio output and number of tokens generated, making it ideal for naturally flowing real-time speech-to-speech interactions on resource constrained devices. Sequential generation mode, where the model decides when to switch modalities via special tokens, is suitable for non-conversational tasks, such as speech-to-text (ASR) or text-to-speech (TTS). +### Updates +- [LFM2.5-Audio-1.5B](https://huggingface.co/LiquidAI/LFM2.5-Audio-1.5B) is released! This model is based on the stronger LFM2.5-1.2B base and comes with a lightning-fast LFM2-based audio detokenizer, stronger ASR, and better TTS voices. To use the new detokenizer, simply call `processor.decode`; see the examples below for more details. For the improved TTS voices, see the [TTS](#tts) section. 
+ ## Installation The package can be installed via `pip` ```bash @@ -61,7 +64,7 @@ import torchaudio from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState, LFMModality # Load models -HF_REPO = "LiquidAI/LFM2-Audio-1.5B" +HF_REPO = "LiquidAI/LFM2.5-Audio-1.5B" processor = LFM2AudioProcessor.from_pretrained(HF_REPO).eval() model = LFM2AudioModel.from_pretrained(HF_REPO).eval() @@ -97,9 +100,8 @@ for t in model.generate_interleaved(**chat, max_new_tokens=512, audio_temperatur # Detokenize audio, removing the last "end-of-audio" codes # Mimi returns audio at 24kHz -mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0) -with torch.no_grad(): - waveform = processor.mimi.decode(mimi_codes)[0] +audio_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0) +waveform = processor.decode(audio_codes) torchaudio.save("answer1.wav", waveform.cpu(), 24_000) # Append newly generated tokens to chat history @@ -128,9 +130,8 @@ for t in model.generate_interleaved(**chat, max_new_tokens=512, audio_temperatur # output: Sure thing! How about “Comfortable Chairs, Crafted with Care” or “Elegant Seats, Handcrafted for You”? Let me know if you’d like a few more options. 
# Detokenize second turn audio, removing the last "end-of-audio" codes -mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0) -with torch.no_grad(): - waveform = processor.mimi.decode(mimi_codes)[0] +audio_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0) +waveform = processor.decode(audio_codes) torchaudio.save("answer2.wav", waveform.cpu(), 24_000) ``` @@ -154,7 +155,7 @@ import torchaudio from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState, LFMModality # Load models -HF_REPO = "LiquidAI/LFM2-Audio-1.5B" +HF_REPO = "LiquidAI/LFM2.5-Audio-1.5B" processor = LFM2AudioProcessor.from_pretrained(HF_REPO).eval() model = LFM2AudioModel.from_pretrained(HF_REPO).eval() @@ -182,19 +183,25 @@ for t in model.generate_sequential(**chat, max_new_tokens=512): ``` ### TTS -For TTS, we also use sequential generation, with the fixed system prompt `Perform TTS.`. In addition, we can prompt the voice and a style using a natural language description. +For TTS, we also use sequential generation. We support four pre-defined voices, which can be selected by choosing one of the four system prompts below +``` +Perform TTS. Use the US male voice. +Perform TTS. Use the US female voice. +Perform TTS. Use the UK male voice. +Perform TTS. Use the UK female voice. +```
TTS Sample -**Voice description**: A male speaker delivers his lines with a low-pitched voice and an animated tone. The recording is of excellent quality with almost no noise and a very close-sounding atmosphere. +**System prompt**: Perform TTS. Use the UK male voice. **Input sentence**: What is this obsession people have with books? They put them in their houses—like they're trophies. What do you need it for after you read it? **Output audio** -https://github.com/user-attachments/assets/2fa953cf-d8a8-477a-b841-c4f18d9266e6 +https://github.com/user-attachments/assets/8d57c184-b92e-4e1a-983b-d1f9d16d0d92
@@ -204,7 +211,7 @@ import torchaudio from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState, LFMModality # Load models -HF_REPO = "LiquidAI/LFM2-Audio-1.5B" +HF_REPO = "LiquidAI/LFM2.5-Audio-1.5B" processor = LFM2AudioProcessor.from_pretrained(HF_REPO).eval() model = LFM2AudioModel.from_pretrained(HF_REPO).eval() @@ -213,7 +220,7 @@ model = LFM2AudioModel.from_pretrained(HF_REPO).eval() chat = ChatState(processor) chat.new_turn("system") -chat.add_text("Perform TTS.\nUse the following voice: A male speaker delivers his lines with a low-pitched voice and an animated tone. The recording is of excellent quality with almost no noise and a very close-sounding atmosphere.") +chat.add_text("Perform TTS. Use the UK male voice.") chat.end_turn() chat.new_turn("user") @@ -229,9 +236,8 @@ for t in model.generate_sequential(**chat, max_new_tokens=512, audio_temperature audio_out.append(t) # Detokenize audio -mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0) -with torch.no_grad(): - waveform = processor.mimi.decode(mimi_codes)[0] +audio_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0) +waveform = processor.decode(audio_codes) torchaudio.save("tts.wav", waveform.cpu(), 24_000) ``` diff --git a/pyproject.toml b/pyproject.toml index 9513357..27edb35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "liquid-audio" -version = "1.0.0" +version = "1.1.0" description = "Liquid Audio - Speech-to-Speech audio models" readme = "README.md" authors = [ @@ -16,6 +16,7 @@ dependencies = [ "sentencepiece>=0.2.1", "torch>=2.8.0", "torchaudio>=2.8.0", + "torchcodec>=0.9.1", "transformers>=4.55.4", ] keywords = ["Liquid AI", "LFM", "LFM2", "Audio", "Speech-to-Speech"] diff --git a/src/liquid_audio/__init__.py b/src/liquid_audio/__init__.py index c82390f..8185ca8 100644 --- a/src/liquid_audio/__init__.py +++ b/src/liquid_audio/__init__.py @@ -1,5 +1,6 @@ +from liquid_audio.detokenizer import LFM2AudioDetokenizer from 
class FusedEmbedding(nn.Module):
    """Embed multi-codebook audio codes through one shared embedding table.

    Every codebook owns a contiguous slice of ``vocab_size`` rows in a single
    ``nn.Embedding``; the per-codebook embeddings of a frame are averaged into
    one vector.

    Args:
        dim: Embedding dimension.
        codeboooks: Number of codebooks per frame. (The spelling is part of the
            public keyword interface and is kept for backward compatibility.)
        vocab_size: Number of distinct codes per codebook.
    """

    def __init__(
        self,
        dim: int,
        codeboooks: int = 8,
        vocab_size: int = 2048,
    ):
        super().__init__()
        self.emb = nn.Embedding(codeboooks * vocab_size, dim)

        self.codeboooks = codeboooks
        self.vocab_size = vocab_size

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Embed codes and average over the codebook axis.

        Args:
            x: Integer codes with the codebook axis at dim 1
                (assumed (B, codeboooks, L) -- the offsets broadcast over dim 1).
        Returns:
            Tensor of shape (B, L, dim).
        """
        # Shift codebook k by k * vocab_size so each codebook indexes its own
        # slice of the shared table. Built on x's device so the sum never mixes devices.
        offsets = torch.arange(self.codeboooks, device=x.device) * self.vocab_size
        offset_x = offsets[:, None] + x
        return self.emb(offset_x).mean(1)  # B L D


class ISTFT(nn.Module):
    """
    Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
    windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
    See issue: https://github.com/pytorch/pytorch/issues/62323
    Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
    The NOLA constraint is met as we trim padded samples anyway.

    Adapted from Vocos: https://github.com/gemelo-ai/vocos/blob/c859e3b7b534f3776a357983029d34170ddd6fc3/vocos/spectral_ops.py#L7
    Args:
        n_fft (int): Size of Fourier transform.
        hop_length (int): The distance between neighboring sliding window frames.
        win_length (int): The size of window frame and STFT filter.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
    Raises:
        ValueError: If ``padding`` is neither "center" nor "same".
    """

    def __init__(self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"):
        super().__init__()
        if padding not in ["center", "same"]:
            raise ValueError("Padding must be 'center' or 'same'.")
        self.padding = padding
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        window = torch.hann_window(win_length)
        self.register_buffer("window", window)

    def forward(self, spec: torch.Tensor) -> torch.Tensor:
        """
        Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.
        Args:
            spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
                            N is the number of frequency bins, and T is the number of time frames.
        Returns:
            Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
        Raises:
            ValueError: If ``spec`` is not 3-dimensional (or the padding mode is unknown).
            RuntimeError: If the trimmed window envelope is (near) zero anywhere, i.e. NOLA is violated.
        """
        if self.padding == "center":
            # Fallback to pytorch native implementation
            return torch.istft(
                spec,
                self.n_fft,
                self.hop_length,
                self.win_length,
                self.window,  # type: ignore[arg-type]
                center=True,
            )
        if self.padding != "same":
            raise ValueError("Padding must be 'center' or 'same'.")

        # Explicit raise instead of `assert`: asserts vanish under `python -O`.
        if spec.dim() != 3:
            raise ValueError("Expected a 3D tensor as input")
        _B, _N, T = spec.shape

        pad = (self.win_length - self.hop_length) // 2

        # Inverse FFT of every frame, then apply the synthesis window
        ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
        ifft = ifft * self.window[None, :, None]  # type: ignore[index]

        # Overlap and Add
        output_size = (T - 1) * self.hop_length + self.win_length
        # Explicit end index: `pad:-pad` would collapse to an empty slice when pad == 0.
        trim = slice(pad, output_size - pad)
        y = torch.nn.functional.fold(
            ifft,
            output_size=(1, output_size),
            kernel_size=(1, self.win_length),
            stride=(1, self.hop_length),
        )[:, 0, 0, trim]

        # Window envelope (sum of squared, shifted windows) over the same trimmed range
        window_sq = self.window.square().expand(1, T, -1).transpose(1, 2)  # type: ignore[operator]
        window_envelope = torch.nn.functional.fold(
            window_sq,
            output_size=(1, output_size),
            kernel_size=(1, self.win_length),
            stride=(1, self.hop_length),
        )[0, 0, 0, trim]

        # Normalize; refuse to divide by a vanishing envelope
        if not (window_envelope > 1e-11).all():
            raise RuntimeError("Window envelope is too close to zero; the NOLA constraint is violated.")
        return y / window_envelope


class LFM2AudioDetokenizer(nn.Module):
    """Turn audio codes into a waveform: embedding -> LFM2 backbone -> linear head -> ISTFT.

    Args:
        backbone_config: Configuration for the ``Lfm2Model`` backbone. Its optional
            ``sliding_window`` attribute sets the attention window (falls back to 30).
    """

    def __init__(self, backbone_config: "Lfm2Config"):
        super().__init__()
        self.emb = FusedEmbedding(512)
        self.lfm = Lfm2Model(backbone_config)
        # 1282 = 2 * (1280 // 2 + 1): one log-magnitude and one angle per STFT bin
        self.lin = nn.Linear(512, 1282)  # half are log-magnitude, half are angle

        self.istft = ISTFT(1280, 320, 1280, padding="same")
        self.sliding_window_size = getattr(backbone_config, "sliding_window", 30)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Decode integer audio codes into a time-domain signal.

        Args:
            x: Integer audio codes (assumed (B, 8, T) to match ``FusedEmbedding``'s
                default codebook count -- confirm against the caller).
        Returns:
            The output of the "same"-padded ISTFT, shape (B, L).
        """
        x = self.emb(x)

        # Repeat every code frame 6 times along time before the backbone.
        # NOTE(review): presumably bridges the codec frame rate to the ISTFT frame rate -- confirm.
        upsample_size = 6 * x.shape[1]
        x = nn.functional.interpolate(x.mT, upsample_size, mode="nearest-exact").mT

        # Set attn mask: position i may attend to j with i - sliding_window_size < j <= i
        idx = torch.arange(x.shape[1], device=x.device)
        d_idx = idx - idx[:, None]
        mask = torch.logical_and(d_idx <= 0, d_idx > -self.sliding_window_size)[None, None, ...]

        x = self.lfm(inputs_embeds=x, attention_mask=mask, use_cache=False).last_hidden_state
        x = self.lin(x)

        # Split channels into log-magnitude / phase and build the complex spectrogram
        log_abs, angle = torch.chunk(x.mT.contiguous(), 2, 1)
        y = torch.polar(log_abs.exp(), angle)

        return self.istft(y)
mimi_ckpt = cache_path / "tokenizer-e351c8d8-checkpoint125.safetensors" + mimi_weights_path = str(mimi_ckpt) if mimi_ckpt.exists() else None + + detok_ckpt = cache_path / "audio_detokenizer" + detokenizer_weights_path = str(detok_ckpt) if detok_ckpt.exists() else None + return cls( text_tokenizer_path=str(cache_path), audio_processor_config=PreprocessorConfig(**config["preprocessor"]), - mimi_weights_path=str(cache_path / "tokenizer-e351c8d8-checkpoint125.safetensors"), + mimi_weights_path=mimi_weights_path, + detokenizer_path=detokenizer_weights_path, + name=str(repo_id), ).to(device) def to(self, device: str | torch.device | None = None, dtype: torch.dtype | None = None) -> Self: @@ -82,15 +98,83 @@ def text(self) -> PreTrainedTokenizer: def audio(self) -> AudioToMelSpectrogramPreprocessor: return self.audio_processor - @cached_property + @property def mimi(self) -> MimiModel: - from safetensors.torch import load_file + if self.mimi_weights_path is None: + if self.name is None: + msg = "expected `mimi_weights_path` to be specified." + else: + msg = f"model {self.name} does not provide Mimi weights, use {type(self).__name__}.decode instead." + raise AttributeError(msg) - mimi_model = moshi.models.loaders.get_mimi(None, device=self.device) - mimi_weights = load_file(self.mimi_weights_path, device=str(self.device)) - mimi_model.load_state_dict(mimi_weights, strict=True) + if self._mimi is None: + from safetensors.torch import load_file - return mimi_model + mimi_model = moshi.models.loaders.get_mimi(None, device=self.device) + mimi_weights = load_file(self.mimi_weights_path, device=str(self.device)) + mimi_model.load_state_dict(mimi_weights, strict=True) + + self._mimi = mimi_model + + return self._mimi + + @property + def audio_detokenizer(self) -> LFM2AudioDetokenizer: + if self.detokenizer_path is None: + if self.name is None: + msg = "expected `detokenizer_weights_path` to be specified." 
+ else: + msg = ( + f"model {self.name} does not provide LFM based audio detokenizer, use {type(self).__name__}.mimi instead." + ) + raise AttributeError(msg) + + if self._audio_detokenizer is None: + detok_config_path = Path(self.detokenizer_path) / "config.json" + detok_config = Lfm2Config.from_pretrained(detok_config_path) + + # Make llama.cpp config compatible with transformers Lfm2Model + def rename_layer( + layer: Literal["conv", "sliding_attention", "full_attention"], + ) -> Literal["conv", "full_attention"]: + match layer: + case "conv" | "full_attention": + return layer + case "sliding_attention": + return "full_attention" + case _: + assert_never(layer) + + assert isinstance(detok_config.layer_types, list) + detok_config.layer_types = [rename_layer(layer) for layer in detok_config.layer_types] # type: ignore[arg-type] + + detok = LFM2AudioDetokenizer(detok_config).eval().cuda() + + detok_weights_path = Path(self.detokenizer_path) / "model.safetensors" + from safetensors.torch import load_file + + detok_weights = load_file(detok_weights_path) + detok.load_state_dict(detok_weights) + + detok.eval() + + self._audio_detokenizer = detok + + return self._audio_detokenizer + + @torch.no_grad() + def decode(self, audio_codes: torch.Tensor) -> torch.Tensor: + """Detokenize audio codes into waveform with LFM2 based detokenizer + + Args: + audio_codes: (1, 8, T) shaped integer tensor, with values in [0, 2047] + Returns: + waveform: (1, T') float tensor, 24kHz mono waveform + """ + if torch.any(audio_codes >= 2048) or torch.any(audio_codes < 0): + raise RuntimeError("expected audio codes in range ") + + return self.audio_detokenizer(audio_codes) @property def device(self) -> torch.device: diff --git a/uv.lock b/uv.lock index b05d193..23ae556 100644 --- a/uv.lock +++ b/uv.lock @@ -806,7 +806,7 @@ wheels = [ [[package]] name = "liquid-audio" -version = "1.0.0" +version = "1.1.0" source = { editable = "." 
} dependencies = [ { name = "accelerate" }, @@ -815,6 +815,7 @@ dependencies = [ { name = "sentencepiece" }, { name = "torch" }, { name = "torchaudio" }, + { name = "torchcodec" }, { name = "transformers" }, ] @@ -839,6 +840,7 @@ requires-dist = [ { name = "sentencepiece", specifier = ">=0.2.1" }, { name = "torch", specifier = ">=2.8.0" }, { name = "torchaudio", specifier = ">=2.8.0" }, + { name = "torchcodec", specifier = ">=0.9.1" }, { name = "transformers", specifier = ">=4.55.4" }, ] provides-extras = ["demo"] @@ -2249,6 +2251,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/52/27/7fc2d7435af044ffbe0b9b8e98d99eac096d43f128a5cde23c04825d5dcf/torchaudio-2.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d4a715d09ac28c920d031ee1e60ecbc91e8a5079ad8c61c0277e658436c821a6", size = 2549553, upload-time = "2025-08-06T14:59:00.019Z" }, ] +[[package]] +name = "torchcodec" +version = "0.9.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4b/60/3bfa459e09987af08e188811b191437c9d8215a74f4d418be6ff7df87b5c/torchcodec-0.9.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8996ec62b72c69545c30246df64df386d06d7ec7de0689be5d20dfc06aad6442", size = 4064264, upload-time = "2025-12-10T15:55:56.313Z" }, + { url = "https://files.pythonhosted.org/packages/17/c8/bfb74babec98aff11ab4f239b0901f39e1a93338b3438e842d864dc46935/torchcodec-0.9.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a50568ce73b70395d113833fb07394c223f5546ef5d4fafe0fdcd91627fca270", size = 2061978, upload-time = "2025-12-10T15:55:33.415Z" }, + { url = "https://files.pythonhosted.org/packages/b2/12/c0bbf01b0ed52b69aaeed4af1043dc8308ccc522a47fcc082b34882e2ba2/torchcodec-0.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:d9c8efe5845bde45a428f96493b4a041511f47f5bd53b333a0ad90426be4623a", size = 2187178, upload-time = "2025-12-10T15:56:16.968Z" }, + { url = 
"https://files.pythonhosted.org/packages/6f/c7/67fc8417f9efa8a25c00a44f0d674761a0bad9c45e9725e3fd116b3c48ed/torchcodec-0.9.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:5c9cdcba50c75be70ef6ec919ec1f7f14d9d5163d93cf6bd94403e134f03734c", size = 4034415, upload-time = "2025-12-10T15:56:02.04Z" }, + { url = "https://files.pythonhosted.org/packages/68/05/06240f661e9aa08b20765305e3b88f60bff706bbe54ac35830af74612443/torchcodec-0.9.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:0643c5e9c3a51fdafdea87935d5b0a38e99626c664f47a150482d77ab370a877", size = 2067767, upload-time = "2025-12-10T15:55:37.27Z" }, + { url = "https://files.pythonhosted.org/packages/13/a2/d78cd65863fb805d9e35fe90ae7574eab86ff0ae63438208bd07d2cf1fd2/torchcodec-0.9.1-cp313-cp313-win_amd64.whl", hash = "sha256:df0b5a15998fd7457625c2af2a6276e0e710fac158d145045340dbbcd1cfdb65", size = 2186788, upload-time = "2025-12-10T15:56:20.204Z" }, + { url = "https://files.pythonhosted.org/packages/01/02/f8ae9443d3bcbe8a8d6d0bbc3992296e5476e5afa1f244100a3a7967a36c/torchcodec-0.9.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:b9bc5a5dff925df96d11bf90bd0ce964b8086bb11ae09adf353518192b5da483", size = 3812248, upload-time = "2025-12-10T15:56:06.382Z" }, + { url = "https://files.pythonhosted.org/packages/59/a1/8462b55571286847ea31edb7634583125400824267db9ba8301f4ce3f137/torchcodec-0.9.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:65634bb28b3155cf99f980dac31ecedb414c07b8156f8473ec9fb74bedbd2a1f", size = 2068456, upload-time = "2025-12-10T15:55:40.577Z" }, + { url = "https://files.pythonhosted.org/packages/f2/63/752d0fc1c6e8f799ae880ca1087510def663a7f9aa1a70074ae334c6908f/torchcodec-0.9.1-cp314-cp314-win_amd64.whl", hash = "sha256:2d01c8b3685a3a38f050ed2b526808a2938dba6f56cb9f9e967884fd858bba15", size = 2188320, upload-time = "2025-12-10T15:56:24.63Z" }, +] + [[package]] name = "tqdm" version = "4.67.1"