diff --git a/.gitattributes b/.gitattributes
index af70e23..5371240 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -62,4 +62,11 @@ onnx/language_model_q4.onnx_data filter=lfs diff=lfs merge=lfs -text
onnx/language_model_q4f16.onnx_data filter=lfs diff=lfs merge=lfs -text
onnx/language_model_q4.onnx filter=lfs diff=lfs merge=lfs -text
onnx/language_model_q4f16.onnx filter=lfs diff=lfs merge=lfs -text
-onnx/language_model_fp16.onnx filter=lfs diff=lfs merge=lfs -text
\ No newline at end of file
+onnx/language_model_fp16.onnx filter=lfs diff=lfs merge=lfs -text
+
+onnx/conditional_decoder.onnx_data filter=lfs diff=lfs merge=lfs -text
+onnx/embed_tokens.onnx_data filter=lfs diff=lfs merge=lfs -text
+onnx/speech_encoder.onnx filter=lfs diff=lfs merge=lfs -text
+onnx/speech_encoder.onnx_data filter=lfs diff=lfs merge=lfs -text
+onnx/embed_tokens.onnx filter=lfs diff=lfs merge=lfs -text
+onnx/conditional_decoder.onnx filter=lfs diff=lfs merge=lfs -text
\ No newline at end of file
diff --git a/README.md b/README.md
index f85ab78..0948396 100644
--- a/README.md
+++ b/README.md
@@ -1,48 +1,495 @@
---
-license: Apache License 2.0
-tags: []
-
-#model-type:
-##如 gpt、phi、llama、chatglm、baichuan 等
-#- gpt
-
-#domain:
-##如 nlp、cv、audio、multi-modal
-#- nlp
-
-#language:
-##语言代码列表 https://help.aliyun.com/document_detail/215387.html?spm=a2c4g.11186623.0.0.9f8d7467kni6Aa
-#- cn
-
-#metrics:
-##如 CIDEr、Blue、ROUGE 等
-#- CIDEr
-
-#tags:
-##各种自定义,包括 pretrained、fine-tuned、instruction-tuned、RL-tuned 等训练方法和其他
-#- pretrained
-
-#tools:
-##如 vllm、fastchat、llamacpp、AdaSeq 等
-#- vllm
+license: mit
+language:
+- ar
+- da
+- de
+- el
+- en
+- es
+- fi
+- fr
+- he
+- hi
+- it
+- ja
+- ko
+- ms
+- nl
+- no
+- pl
+- pt
+- ru
+- sv
+- sw
+- tr
+- zh
+pipeline_tag: text-to-speech
+tags:
+- text-to-speech
+- speech
+- speech-generation
+- voice-cloning
+- multilingual-tts
+library_name: chatterbox
---
-### 当前模型的贡献者未提供更加详细的模型介绍。模型文件和权重,可浏览“模型文件”页面获取。
-#### 您可以通过如下git clone命令,或者ModelScope SDK来下载模型
-SDK下载
-```bash
-#安装ModelScope
-pip install modelscope
-```
+
+
+
Chatterbox TTS
+
+
+
+
+

+
+
+**Chatterbox Multilingual** is [Resemble AI's](https://resemble.ai) production-grade open source TTS model. Chatterbox Multilingual supports **Arabic**, **Danish**, **German**, **Greek**, **English**, **Spanish**, **Finnish**, **French**, **Hebrew**, **Hindi**, **Italian**, **Japanese**, **Korean**, **Malay**, **Dutch**, **Norwegian**, **Polish**, **Portuguese**, **Russian**, **Swedish**, **Swahili**, **Turkish**, **Chinese** out of the box. Licensed under MIT, Chatterbox has been benchmarked against leading closed-source systems like ElevenLabs, and is consistently preferred in side-by-side evaluations.
+
+Whether you're working on memes, videos, games, or AI agents, Chatterbox brings your content to life. It's also the first open source TTS model to support **emotion exaggeration control**, a powerful feature that makes your voices stand out.
+
+Chatterbox is provided in an exported ONNX format, enabling fast and portable inference with ONNX Runtime across platforms.
+
+# Key Details
+- SoTA zeroshot English TTS
+- 0.5B Llama backbone
+- Unique exaggeration/intensity control
+- Ultra-stable with alignment-informed inference
+- Trained on 0.5M hours of cleaned data
+- Watermarked outputs (optional)
+- Easy voice conversion script using onnxruntime
+- [Outperforms ElevenLabs](https://podonos.com/resembleai/chatterbox)
+
+# Tips
+- **General Use (TTS and Voice Agents):**
+ - The default settings (`exaggeration=0.5`, `cfg=0.5`) work well for most prompts.
+
+- **Expressive or Dramatic Speech:**
+ - Try increasing `exaggeration` to around `0.7` or higher.
+ - Higher `exaggeration` tends to speed up speech.
+
+
+# Usage
+[Link to GitHub ONNX Export and Inference script](https://github.com/VladOS95-cyber/onnx_conversion_scripts/tree/main/chatterbox)
+
```python
-#SDK模型下载
-from modelscope import snapshot_download
-model_dir = snapshot_download('onnx-community/chatterbox-multilingual-ONNX')
-```
-Git下载
-```
-#Git模型下载
-git clone https://www.modelscope.cn/onnx-community/chatterbox-multilingual-ONNX.git
+# !pip install --upgrade onnxruntime==1.22.1 huggingface_hub==0.34.4 transformers==4.46.3 numpy==2.2.6 tqdm==4.67.1 librosa==0.11.0 soundfile==0.13.1 perth==1.0.0
+# for Chinese, Japanese additionally pip install pkuseg==0.0.25 pykakasi==2.3.0
+
+import onnxruntime
+
+from huggingface_hub import hf_hub_download
+from transformers import AutoTokenizer
+
+import numpy as np
+from tqdm import tqdm
+import librosa
+import soundfile as sf
+from unicodedata import category
+import json
+
+S3GEN_SR = 24000
+START_SPEECH_TOKEN = 6561
+STOP_SPEECH_TOKEN = 6562
+SUPPORTED_LANGUAGES = {
+ "ar": "Arabic",
+ "da": "Danish",
+ "de": "German",
+ "el": "Greek",
+ "en": "English",
+ "es": "Spanish",
+ "fi": "Finnish",
+ "fr": "French",
+ "he": "Hebrew",
+ "hi": "Hindi",
+ "it": "Italian",
+ "ja": "Japanese",
+ "ko": "Korean",
+ "ms": "Malay",
+ "nl": "Dutch",
+ "no": "Norwegian",
+ "pl": "Polish",
+ "pt": "Portuguese",
+ "ru": "Russian",
+ "sv": "Swedish",
+ "sw": "Swahili",
+ "tr": "Turkish",
+ "zh": "Chinese",
+}
+
+
+class RepetitionPenaltyLogitsProcessor:
+ def __init__(self, penalty: float):
+ if not isinstance(penalty, float) or not (penalty > 0):
+ raise ValueError(f"`penalty` must be a strictly positive float, but is {penalty}")
+ self.penalty = penalty
+
+ def __call__(self, input_ids: np.ndarray, scores: np.ndarray) -> np.ndarray:
+ score = np.take_along_axis(scores, input_ids, axis=1)
+ score = np.where(score < 0, score * self.penalty, score / self.penalty)
+ scores_processed = scores.copy()
+ np.put_along_axis(scores_processed, input_ids, score, axis=1)
+ return scores_processed
+
+
+class ChineseCangjieConverter:
+ """Converts Chinese characters to Cangjie codes for tokenization."""
+
+ def __init__(self):
+ self.word2cj = {}
+ self.cj2word = {}
+ self.segmenter = None
+ self._load_cangjie_mapping()
+ self._init_segmenter()
+
+ def _load_cangjie_mapping(self):
+ """Load Cangjie mapping from HuggingFace model repository."""
+ try:
+ cangjie_file = hf_hub_download(
+ repo_id="onnx-community/chatterbox-multilingual-ONNX",
+ filename="Cangjie5_TC.json",
+ )
+
+ with open(cangjie_file, "r", encoding="utf-8") as fp:
+ data = json.load(fp)
+
+ for entry in data:
+ word, code = entry.split("\t")[:2]
+ self.word2cj[word] = code
+ if code not in self.cj2word:
+ self.cj2word[code] = [word]
+ else:
+ self.cj2word[code].append(word)
+
+ except Exception as e:
+ print(f"Could not load Cangjie mapping: {e}")
+
+ def _init_segmenter(self):
+ """Initialize pkuseg segmenter."""
+ try:
+ from pkuseg import pkuseg
+ self.segmenter = pkuseg()
+ except ImportError:
+ print("pkuseg not available - Chinese segmentation will be skipped")
+ self.segmenter = None
+
+ def _cangjie_encode(self, glyph: str):
+ """Encode a single Chinese glyph to Cangjie code."""
+ normed_glyph = glyph
+ code = self.word2cj.get(normed_glyph, None)
+ if code is None: # e.g. Japanese hiragana
+ return None
+ index = self.cj2word[code].index(normed_glyph)
+ index = str(index) if index > 0 else ""
+ return code + str(index)
+
+
+
+ def __call__(self, text):
+ """Convert Chinese characters in text to Cangjie tokens."""
+ output = []
+ if self.segmenter is not None:
+ segmented_words = self.segmenter.cut(text)
+ full_text = " ".join(segmented_words)
+ else:
+ full_text = text
+
+ for t in full_text:
+ if category(t) == "Lo":
+ cangjie = self._cangjie_encode(t)
+ if cangjie is None:
+ output.append(t)
+ continue
+ code = []
+ for c in cangjie:
+ code.append(f"[cj_{c}]")
+ code.append("[cj_.]")
+ code = "".join(code)
+ output.append(code)
+ else:
+ output.append(t)
+ return "".join(output)
+
+
+def is_kanji(c: str) -> bool:
+ """Check if character is kanji."""
+ return 19968 <= ord(c) <= 40959
+
+
+def is_katakana(c: str) -> bool:
+ """Check if character is katakana."""
+ return 12449 <= ord(c) <= 12538
+
+
+def hiragana_normalize(text: str) -> str:
+ """Japanese text normalization: converts kanji to hiragana; katakana remains the same."""
+ global _kakasi
+
+ try:
+ if _kakasi is None:
+ import pykakasi
+ _kakasi = pykakasi.kakasi()
+
+ result = _kakasi.convert(text)
+ out = []
+
+ for r in result:
+ inp = r['orig']
+ hira = r["hira"]
+
+ # Any kanji in the phrase
+ if any([is_kanji(c) for c in inp]):
+ if hira and hira[0] in ["は", "へ"]: # Safety check for empty hira
+ hira = " " + hira
+ out.append(hira)
+
+ # All katakana
+ elif all([is_katakana(c) for c in inp]) if inp else False: # Safety check for empty inp
+ out.append(r['orig'])
+
+ else:
+ out.append(inp)
+
+ normalized_text = "".join(out)
+
+ # Decompose Japanese characters for tokenizer compatibility
+ import unicodedata
+ normalized_text = unicodedata.normalize('NFKD', normalized_text)
+
+ return normalized_text
+
+ except ImportError:
+ print("pykakasi not available - Japanese text processing skipped")
+ return text
+
+
+def add_hebrew_diacritics(text: str) -> str:
+ """Hebrew text normalization: adds diacritics to Hebrew text."""
+ global _dicta
+
+ try:
+ if _dicta is None:
+ from dicta_onnx import Dicta
+ _dicta = Dicta()
+
+ return _dicta.add_diacritics(text)
+
+ except ImportError:
+ print("dicta_onnx not available - Hebrew text processing skipped")
+ return text
+ except Exception as e:
+ print(f"Hebrew diacritization failed: {e}")
+ return text
+
+
+def korean_normalize(text: str) -> str:
+ """Korean text normalization: decompose syllables into Jamo for tokenization."""
+
+ def decompose_hangul(char):
+ """Decompose Korean syllable into Jamo components."""
+ if not ('\uac00' <= char <= '\ud7af'):
+ return char
+
+ # Hangul decomposition formula
+ base = ord(char) - 0xAC00
+ initial = chr(0x1100 + base // (21 * 28))
+ medial = chr(0x1161 + (base % (21 * 28)) // 28)
+ final = chr(0x11A7 + base % 28) if base % 28 > 0 else ''
+
+ return initial + medial + final
+
+ # Decompose syllables and normalize punctuation
+ result = ''.join(decompose_hangul(char) for char in text)
+ return result.strip()
+
+
+def prepare_language(txt, language_id):
+ # Language-specific text processing
+ cangjie_converter = ChineseCangjieConverter()
+ if language_id == 'zh':
+ txt = cangjie_converter(txt)
+ elif language_id == 'ja':
+ txt = hiragana_normalize(txt)
+ elif language_id == 'he':
+ txt = add_hebrew_diacritics(txt)
+ elif language_id == 'ko':
+ txt = korean_normalize(txt)
+
+ # Prepend language token
+ if language_id:
+ txt = f"[{language_id.lower()}]{txt}"
+ return txt
+
+
+def run_inference(
+ text="The Lord of the Rings is the greatest work of literature.",
+ language_id="en",
+ target_voice_path=None,
+ max_new_tokens=256,
+ exaggeration=0.5,
+ output_dir="converted",
+ output_file_name="output.wav",
+ apply_watermark=True,
+):
+ # Validate language_id
+ if language_id and language_id.lower() not in SUPPORTED_LANGUAGES:
+ supported_langs = ", ".join(SUPPORTED_LANGUAGES.keys())
+ raise ValueError(
+ f"Unsupported language_id '{language_id}'. "
+ f"Supported languages: {supported_langs}"
+ )
+ model_id = "onnx-community/chatterbox-multilingual-ONNX"
+ if not target_voice_path:
+ target_voice_path = hf_hub_download(repo_id=model_id, filename="default_voice.wav", local_dir=output_dir)
+
+ ## Load model
+ speech_encoder_path = hf_hub_download(repo_id=model_id, filename="speech_encoder.onnx", local_dir=output_dir, subfolder='onnx')
+ hf_hub_download(repo_id=model_id, filename="speech_encoder.onnx_data", local_dir=output_dir, subfolder='onnx')
+ embed_tokens_path = hf_hub_download(repo_id=model_id, filename="embed_tokens.onnx", local_dir=output_dir, subfolder='onnx')
+ hf_hub_download(repo_id=model_id, filename="embed_tokens.onnx_data", local_dir=output_dir, subfolder='onnx')
+ conditional_decoder_path = hf_hub_download(repo_id=model_id, filename="conditional_decoder.onnx", local_dir=output_dir, subfolder='onnx')
+ hf_hub_download(repo_id=model_id, filename="conditional_decoder.onnx_data", local_dir=output_dir, subfolder='onnx')
+ language_model_path = hf_hub_download(repo_id=model_id, filename="language_model.onnx", local_dir=output_dir, subfolder='onnx')
+ hf_hub_download(repo_id=model_id, filename="language_model.onnx_data", local_dir=output_dir, subfolder='onnx')
+
+ # Start inference sessions
+ speech_encoder_session = onnxruntime.InferenceSession(speech_encoder_path)
+ embed_tokens_session = onnxruntime.InferenceSession(embed_tokens_path)
+ llama_with_past_session = onnxruntime.InferenceSession(language_model_path)
+ cond_decoder_session = onnxruntime.InferenceSession(conditional_decoder_path)
+
+ def execute_text_to_audio_inference(text):
+ print("Start inference script...")
+
+ audio_values, _ = librosa.load(target_voice_path, sr=S3GEN_SR)
+ audio_values = audio_values[np.newaxis, :].astype(np.float32)
+
+ ## Prepare input
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ text = prepare_language(text, language_id)
+ input_ids = tokenizer(text, return_tensors="np")["input_ids"].astype(np.int64)
+
+ position_ids = np.where(
+ input_ids >= START_SPEECH_TOKEN,
+ 0,
+ np.arange(input_ids.shape[1])[np.newaxis, :] - 1
+ )
+
+ ort_embed_tokens_inputs = {
+ "input_ids": input_ids,
+ "position_ids": position_ids.astype(np.int64),
+ "exaggeration": np.array([exaggeration], dtype=np.float32)
+ }
+
+ ## Instantiate the logits processors.
+ repetition_penalty = 1.2
+ repetition_penalty_processor = RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)
+
+ num_hidden_layers = 30
+ num_key_value_heads = 16
+ head_dim = 64
+
+ generate_tokens = np.array([[START_SPEECH_TOKEN]])
+
+ # ---- Generation Loop using kv_cache ----
+ for i in tqdm(range(max_new_tokens), desc="Sampling", dynamic_ncols=True):
+
+ inputs_embeds = embed_tokens_session.run(None, ort_embed_tokens_inputs)[0]
+ if i == 0:
+ ort_speech_encoder_input = {
+ "audio_values": audio_values,
+ }
+ cond_emb, prompt_token, ref_x_vector, prompt_feat = speech_encoder_session.run(None, ort_speech_encoder_input)
+ inputs_embeds = np.concatenate((cond_emb, inputs_embeds), axis=1)
+
+ ## Prepare llm inputs
+ batch_size, seq_len, _ = inputs_embeds.shape
+ past_key_values = {
+ f"past_key_values.{layer}.{kv}": np.zeros([batch_size, num_key_value_heads, 0, head_dim], dtype=np.float32)
+ for layer in range(num_hidden_layers)
+ for kv in ("key", "value")
+ }
+ attention_mask = np.ones((batch_size, seq_len), dtype=np.int64)
+ logits, *present_key_values = llama_with_past_session.run(None, dict(
+ inputs_embeds=inputs_embeds,
+ attention_mask=attention_mask,
+ **past_key_values,
+ ))
+
+ logits = logits[:, -1, :]
+ next_token_logits = repetition_penalty_processor(generate_tokens, logits)
+
+ next_token = np.argmax(next_token_logits, axis=-1, keepdims=True).astype(np.int64)
+ generate_tokens = np.concatenate((generate_tokens, next_token), axis=-1)
+ if (next_token.flatten() == STOP_SPEECH_TOKEN).all():
+ break
+
+ # Get embedding for the new token.
+ position_ids = np.full(
+ (input_ids.shape[0], 1),
+ i + 1,
+ dtype=np.int64,
+ )
+ ort_embed_tokens_inputs["input_ids"] = next_token
+ ort_embed_tokens_inputs["position_ids"] = position_ids
+
+ ## Update values for next generation loop
+ attention_mask = np.concatenate([attention_mask, np.ones((batch_size, 1), dtype=np.int64)], axis=1)
+ for j, key in enumerate(past_key_values):
+ past_key_values[key] = present_key_values[j]
+
+ speech_tokens = generate_tokens[:, 1:-1]
+ speech_tokens = np.concatenate([prompt_token, speech_tokens], axis=1)
+ return speech_tokens, ref_x_vector, prompt_feat
+
+ speech_tokens, speaker_embeddings, speaker_features = execute_text_to_audio_inference(text)
+ cond_incoder_input = {
+ "speech_tokens": speech_tokens,
+ "speaker_embeddings": speaker_embeddings,
+ "speaker_features": speaker_features,
+ }
+ wav = cond_decoder_session.run(None, cond_incoder_input)[0]
+ wav = np.squeeze(wav, axis=0)
+
+ # Optional: Apply watermark
+ if apply_watermark:
+ import perth
+ watermarker = perth.PerthImplicitWatermarker()
+ wav = watermarker.apply_watermark(wav, sample_rate=S3GEN_SR)
+
+ sf.write(output_file_name, wav, S3GEN_SR)
+ print(f"{output_file_name} was successfully saved")
+
+if __name__ == "__main__":
+ run_inference(
+ text="Bonjour, comment ça va? Ceci est le modèle de synthèse vocale multilingue Chatterbox, il prend en charge 23 langues.",
+ language_id="fr",
+ exaggeration=0.5,
+ output_file_name="output.wav",
+ apply_watermark=False,
+ )
+
```
-如果您是本模型的贡献者,我们邀请您根据模型贡献文档,及时完善模型卡片内容。
\ No newline at end of file
+
+# Acknowledgements
+- [Xenova](https://huggingface.co/Xenova)
+- [Vladislav Bronzov](https://github.com/VladOS95-cyber)
+- [Resemble AI](https://github.com/resemble-ai/chatterbox)
+
+# Built-in PerTh Watermarking for Responsible AI
+
+Every audio file generated by Chatterbox includes [Resemble AI's Perth (Perceptual Threshold) Watermarker](https://github.com/resemble-ai/perth) - imperceptible neural watermarks that survive MP3 compression, audio editing, and common manipulations while maintaining nearly 100% detection accuracy.
+
+# Disclaimer
+Don't use this model to do bad things. Prompts are sourced from freely available data on the internet.
\ No newline at end of file
diff --git a/onnx/conditional_decoder.onnx b/onnx/conditional_decoder.onnx
new file mode 100644
index 0000000..cbed412
--- /dev/null
+++ b/onnx/conditional_decoder.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1656d0d31332bae1854839959a3139300ebb67c178651dfa3f8c5fbfa5351351
+size 6350448
diff --git a/onnx/conditional_decoder.onnx_data b/onnx/conditional_decoder.onnx_data
new file mode 100644
index 0000000..e07b222
--- /dev/null
+++ b/onnx/conditional_decoder.onnx_data
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51d58345a272747665ec9d5bb61e01835258a940e321a288582ac4c18cf01b5a
+size 533970816
diff --git a/onnx/embed_tokens.onnx b/onnx/embed_tokens.onnx
new file mode 100644
index 0000000..8d275d4
--- /dev/null
+++ b/onnx/embed_tokens.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f785819ca4f6271262d5bb8971d62796c3a909e3b031982c113dbe83a4c3b854
+size 13286
diff --git a/onnx/embed_tokens.onnx_data b/onnx/embed_tokens.onnx_data
new file mode 100644
index 0000000..6a0c7af
--- /dev/null
+++ b/onnx/embed_tokens.onnx_data
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a15f7dd73b2ee47f6edf87740324011594b5a528ed6471ae55e327ed6cad68c
+size 68390912
diff --git a/onnx/speech_encoder.onnx b/onnx/speech_encoder.onnx
new file mode 100644
index 0000000..6a03835
--- /dev/null
+++ b/onnx/speech_encoder.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f1c8a0f89b77bf9cd5dd8f2e034eb2c79dc00fe70d41196b28c257643b00ccb
+size 1184608
diff --git a/onnx/speech_encoder.onnx_data b/onnx/speech_encoder.onnx_data
new file mode 100644
index 0000000..130bc07
--- /dev/null
+++ b/onnx/speech_encoder.onnx_data
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92f8f290fc9720e169bc2412c507209e20b03f6564bc3243739e25c56f7dfb8f
+size 591274880