TTS: CosyVoice2 High-Quality Speech Generation
CosyVoice2 can clone a voice from only 3–10 seconds of reference audio and synthesize speech across languages: Chinese, English, Japanese, Korean, and Chinese dialects (Cantonese, Sichuanese, Shanghainese, Tianjin, Wuhan, Changsha, Zhengzhou dialects, etc.). It can also generate speech with a range of emotions (happy, excited, sad, angry, and more) and control tone and emotion through natural-language instructions.
【Paper】CosyVoice: A Scalable Multilingual Zero-shot Text-to-speech Synthesizer based on Supervised Semantic Tokens
【Paper】CosyVoice 2: Scalable Streaming Speech Synthesis with Large Language Models
【Project】https://github.com/FunAudioLLM/CosyVoice
pip install wetext==0.0.4 inflect==7.3.1 numpy==1.25.0 gradio HyperPyYAML==1.2.2 librosa==0.10.2 onnxruntime-gpu==1.19.0 openai-whisper==20231117 transformers==4.40.1 omegaconf==2.3.0 conformer==0.3.2 diffusers==0.29.0 hydra-core==1.3.2 lightning==2.2.4 gdown==5.1.0 matplotlib==3.7.5 wget==3.2 pyarrow==18.1.0 pyworld==0.3.4 pydantic==2.10.6 tensorrt==10.9.0.34 pynini==2.1.5 openpyxl
conda install -c conda-forge pynini
pip install WeTextProcessing --no-deps
conda install -c conda-forge pyface
【CosyVoice2-0.5B】
E:\\modelscope_cache\\hub\\iic\\CosyVoice2-0___5B
【CosyVoice-300M】
E:\\modelscope_cache\\hub\\iic\\CosyVoice-300M
【CosyVoice-300M-Instruct】
E:\\modelscope_cache\\hub\\iic\\CosyVoice-300M-Instruct
【CosyVoice-300M-SFT】
E:\\modelscope_cache\\hub\\iic\\CosyVoice-300M-SFT
1,CosyVoice
1.1,inference_zero_shot
【Server】CosyVoice/runtime/python/fastapi/server.py; add the following environment variables:
PYTHONUNBUFFERED=1;PYTHONPATH=D:\PyCharmWorkSpace\VH\Linly-Talker\CosyVoice\third_party\Matcha-TTS
【Error】TypeError: expected str, bytes or os.PathLike object, not MultiplexedPath
【Fix】MultiplexedPath is not handled on Windows; pass an explicit cache directory manually:
self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False, overwrite_cache=True)
self.en_tn_model = EnNormalizer()
👇
self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False, overwrite_cache=True, cache_dir="tn")
self.en_tn_model = EnNormalizer(cache_dir="tn")
【inference_zero_shot】
import torch
import random
from openpyxl import load_workbook
from cosyvoice.cli.cosyvoice import CosyVoice2, CosyVoice
from cosyvoice.utils.file_utils import load_wav
from cosyvoice.utils.common import set_all_random_seed
import librosa
import soundfile as sf

message = "****。"
model_dir = 'E:\\modelscope_cache\\hub\\iic\\CosyVoice-300M'
prompt_audio_path = 'yuan.WAV'
prompt_text = '今天可以处理吗?'
seed = random.randint(1, 100000000)
stream = False
speeds = [0.97]

cosyvoice = CosyVoice(model_dir)
prompt_sr = 16000
max_val = 0.8


def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    speech, _ = librosa.effects.trim(
        speech, top_db=top_db, frame_length=win_length, hop_length=hop_length
    )
    if speech.abs().max() > max_val:
        speech = speech / speech.abs().max() * max_val
    speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1)
    return speech


prompt_speech_16k = postprocess(load_wav(prompt_audio_path, prompt_sr))
set_all_random_seed(seed)
tts_wav = "C:\\Users\\shao\\Desktop\\"
for i in cosyvoice.inference_zero_shot(tts_text=message, prompt_text=prompt_text,
                                       prompt_speech_16k=prompt_speech_16k, stream=stream, speed=1.0):
    speech = i['tts_speech'].numpy().flatten()
    sf.write(tts_wav + "CosyVoice-300B.wav", speech, samplerate=cosyvoice.sample_rate)
    print(f"生成成功,音频保存至:{tts_wav}")
【Timing】Total time from text in 👉 .wav out: 2.8 s
- receiving the speech data in the response: 40 ms
- concatenating and assembling the response: 2400 ms
【Batch speech generation】
import logging
import os
import time

import numpy as np
import requests
import torch
import torchaudio

from cosyvoice.cli.cosyvoice import CosyVoice2, CosyVoice

messages = []

cosyvoice = CosyVoice2(r'D:\modelscope_cache\hub\iic\CosyVoice2-0___5B')
# cosyvoice = CosyVoice(r'D:\modelscope_cache\hub\iic\CosyVoice-300M-SFT')
# cosyvoice = CosyVoice(r'D:\modelscope_cache\hub\iic\CosyVoice-300M')
# cosyvoice = CosyVoice(r'D:\modelscope_cache\hub\iic\CosyVoice-300M-Instruct')

url = "http://127.0.0.1:50000/inference_zero_shot"
prompt_wav = "D:\\PyCharmWorkSpace\\TTS\\CosyVoice2\\cosyvoice\\data\\test.mp3"

index = 1
for message in messages:
    tts_wav = str(index) + ".wav"
    payload = {
        'tts_text': message,
        'prompt_text': "XXXX"
    }
    files = [('prompt_wav', ('prompt_wav', open(prompt_wav, 'rb'), 'application/octet-stream'))]
    response = requests.request("GET", url, data=payload, files=files, stream=True)
    tts_audio = b''
    for r in response.iter_content(chunk_size=16000):
        tts_audio += r
    tts_speech = torch.from_numpy(np.array(np.frombuffer(tts_audio, dtype=np.int16))).unsqueeze(dim=0)
    logging.info('save response to {}'.format(tts_wav))
    torchaudio.save(tts_wav, tts_speech, 22050)
    logging.info('get response')
    index += 1
import random

import librosa
import soundfile as sf
import torch
from openpyxl import load_workbook

from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.common import set_all_random_seed
from cosyvoice.utils.file_utils import load_wav


def read_excel_as_list(filename, sheet_name=None):
    wb = load_workbook(filename)
    ws = wb[sheet_name] if sheet_name else wb.active
    data = [[cell.value for cell in row] for row in ws.iter_rows()]
    return data


# configuration
model_dir = 'D:/modelscope_cache/hub/iic/CosyVoice2-0___5B'  # model path
prompt_audio_path = 'D:\\PyCharmWorkSpace\\TTS\\CosyVoice2\\cosyvoice\\data\\test2.wav'
prompt_text = ''
seed = random.randint(1, 100000000)
stream = False
speed = 1.0

# initialize the model
cosyvoice = CosyVoice2(model_dir)
prompt_sr = 16000
max_val = 0.8


# preprocessing
def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    speech, _ = librosa.effects.trim(
        speech, top_db=top_db, frame_length=win_length, hop_length=hop_length
    )
    if speech.abs().max() > max_val:
        speech = speech / speech.abs().max() * max_val
    speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1)
    return speech


# load the prompt audio
prompt_speech_16k = postprocess(load_wav(prompt_audio_path, prompt_sr))

# fix the random seed
set_all_random_seed(seed)

data = read_excel_as_list("example.xlsx")
for message in data:
    tts_wav = "C:\\Users\\shao\\Desktop\\output2\\" + str(message[0]) + ".wav"
    for i in cosyvoice.inference_zero_shot(message[1], prompt_text, prompt_speech_16k, stream=stream, speed=speed):
        speech = i['tts_speech'].numpy().flatten()
        sf.write(tts_wav, speech, samplerate=cosyvoice.sample_rate)
        print(f"生成成功,音频保存至:{tts_wav}")
【Long text & speech-rate-too-fast issue】For long inputs the model returns several segments; write each one out and merge them afterwards:
import os

import soundfile as sf
from pydub import AudioSegment

index = 1
wav_files = []
for i in cosyvoice.inference_zero_shot(message, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
    print(index)
    tts_wav = ""
    speech = i['tts_speech'].numpy().flatten()
    sf.write(tts_wav, speech, samplerate=cosyvoice.sample_rate)
    print(f"生成成功,音频保存至:{tts_wav}")
    wav_files.append(tts_wav)
    index += 1

existing_wavs = [f for f in wav_files if os.path.isfile(f)]
if len(existing_wavs) > 1:
    combined = AudioSegment.empty()
    for wav_file in existing_wavs:
        audio = AudioSegment.from_wav(wav_file)
        combined += audio
    output_file = ''
    combined.export(output_file, format='wav')
    print(f"合并完成,保存为:{output_file}")
1.2,inference_sft
【Saving a speaker file】See the CosyVoice2 section (2.5) for details.
data = load_spk_from_wav("C:\\Users\\shao\\Desktop\\5.wav", cosyvoice)
torch.save(data, f'speakers/xijun.pt')
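load_spk_from_wav is not part of the CosyVoice API; it is a small local helper. A minimal sketch of what it might look like, assuming the frontend's internal _extract_spk_embedding helper is used to build the 'embedding' entry that inference_sft reads from spk2info (an illustration only, not official API):

import torch
from cosyvoice.utils.file_utils import load_wav

def load_spk_from_wav(wav_path, cosyvoice, prompt_sr=16000):
    # extract the speaker embedding from a 16 kHz reference recording
    prompt_speech_16k = load_wav(wav_path, prompt_sr)
    embedding = cosyvoice.frontend._extract_spk_embedding(prompt_speech_16k)
    # sft mode only needs the embedding; it is used for both the LLM and the flow model
    return {'embedding': embedding}

# register it so inference_sft(spk_id="xijun") can find it
data = load_spk_from_wav("C:\\Users\\shao\\Desktop\\5.wav", cosyvoice)
cosyvoice.frontend.spk2info['xijun'] = data
torch.save(data, 'speakers/xijun.pt')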
【Direct generation】Time: 4 ms
import random
from cosyvoice.utils.common import set_all_random_seed
import soundfile as sf
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2

message = "****"
model_dir = 'E:\\modelscope_cache\\hub\\iic\\CosyVoice-300M-SFT'
seed = random.randint(1, 100000000)
stream = False
speeds = [0.97]

cosyvoice = CosyVoice(model_dir)
prompt_sr = 16000
max_val = 0.8

set_all_random_seed(seed)
tts_wav = "C:\\Users\\shao\\Desktop\\"
for i in cosyvoice.inference_sft(tts_text=message, spk_id="xijun", stream=stream, speed=1.0):
    speech = i['tts_speech'].numpy().flatten()
    sf.write(tts_wav + "CosyVoice-300B.wav", speech, samplerate=cosyvoice.sample_rate)
    print(f"生成成功,音频保存至:{tts_wav}")
1.3,inference_instruct
【Direct generation】Time: 6 ms. Your own cloned voice cannot be used in instruct mode, reportedly for safety reasons.
model_dir = 'E:\\modelscope_cache\\hub\\iic\\CosyVoice-300M-Instruct'

for i in cosyvoice.inference_instruct(tts_text=message, spk_id="xijun", instruct_text="用开心的语气说",
                                      stream=stream, speed=1.0):
    speech = i['tts_speech'].numpy().flatten()
    sf.write(tts_wav + "CosyVoice-300B.wav", speech, samplerate=cosyvoice.sample_rate)
    print(f"生成成功,音频保存至:{tts_wav}")
2,CosyVoice2
2.1,inference_zero_shot
【Server】CosyVoice/runtime/python/fastapi/server.py; add the following environment variables:
PYTHONUNBUFFERED=1;PYTHONPATH=D:\PyCharmWorkSpace\Linly-Talker\CosyVoice\third_party\Matcha-TTS
【Error】ZeroDivisionError: 0.0 cannot be raised to a negative power
【Fix】The installed diffusers version is too new; downgrade to 0.29.0.
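The downgrade is a single pip command:

pip install diffusers==0.29.0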
【Error】Pre-trained voices cannot be found
【Fix】Manually download spk2info.pt and copy it into pretrained_models/CosyVoice2-0.5B, then rerun webui.py and the pre-trained voices will show up. See the related GitHub issue.
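A quick way to check whether the file was picked up (a sketch; adjust the model path to your local directory):

from cosyvoice.cli.cosyvoice import CosyVoice2

cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B')
# if spk2info.pt was found, the pre-trained speaker ids are listed here
print(cosyvoice.list_available_spks())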
import torch
import random
from openpyxl import load_workbook
from cosyvoice.cli.cosyvoice import CosyVoice2, CosyVoice
from cosyvoice.utils.file_utils import load_wav
from cosyvoice.utils.common import set_all_random_seed
import librosa
import soundfile as sf

message = ""
model_dir = 'E:\\modelscope_cache\\hub\\iic\\CosyVoice2-0___5B'
prompt_audio_path = 'Test.mp3'
prompt_text = ''
seed = random.randint(1, 100000000)
stream = False
speeds = [0.97]

cosyvoice = CosyVoice2(model_dir)
prompt_sr = 16000
max_val = 0.8


def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    speech, _ = librosa.effects.trim(
        speech, top_db=top_db, frame_length=win_length, hop_length=hop_length
    )
    if speech.abs().max() > max_val:
        speech = speech / speech.abs().max() * max_val
    speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1)
    return speech


prompt_speech_16k = postprocess(load_wav(prompt_audio_path, prompt_sr))
set_all_random_seed(seed)
tts_wav = "C:\\Users\\shao\\Desktop\\"
for i in cosyvoice.inference_zero_shot(tts_text=message, prompt_text=prompt_text,
                                       prompt_speech_16k=prompt_speech_16k, stream=stream, speed=1.0):
    speech = i['tts_speech'].numpy().flatten()
    sf.write(tts_wav + "CosyVoice2.wav", speech, samplerate=cosyvoice.sample_rate)
    print(f"生成成功,音频保存至:{tts_wav}")
【Timing】Total time from text in 👉 .wav out: 2.8 s, essentially the same as offline CosyVoice.
- receiving the speech data in the response: 40 ms
- concatenating and assembling the response: 2400 ms
2.2,inference_instruct2
【Cantonese output】Takes about 15 s
import random

import librosa
import soundfile as sf
import torch

from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.common import set_all_random_seed
from cosyvoice.utils.file_utils import load_wav

message = "******"
model_dir = 'E:\\modelscope_cache\\hub\\iic\\CosyVoice2-0___5B'
prompt_audio_path = 'C:\\Users\\shao\\Desktop\\Test.mp3'
seed = random.randint(1, 100000000)
stream = False
speeds = [0.97]

cosyvoice = CosyVoice2(model_dir)
prompt_sr = 16000
max_val = 0.8


def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    speech, _ = librosa.effects.trim(
        speech, top_db=top_db, frame_length=win_length, hop_length=hop_length
    )
    if speech.abs().max() > max_val:
        speech = speech / speech.abs().max() * max_val
    speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1)
    return speech


prompt_speech_16k = postprocess(load_wav(prompt_audio_path, prompt_sr))
set_all_random_seed(seed)
tts_wav = "C:\\Users\\shao\\Desktop\\"
for i in cosyvoice.inference_instruct2(tts_text=message, instruct_text="用粤语说这句话",
                                       prompt_speech_16k=prompt_speech_16k, stream=stream, speed=1.0):
    speech = i['tts_speech'].numpy().flatten()
    sf.write(tts_wav + "CosyVoice2.wav", speech, samplerate=cosyvoice.sample_rate)
    print(f"生成成功,音频保存至:{tts_wav}")
2.3,Streaming output
【Timing】First run: 8 s; second run: 6 s.
【Why the latency is high】The advertised 150 ms latency requires the Alibaba Cloud service, and only applies to pre-trained voices where no token/embedding extraction is needed. The open-source version runs zero-shot inference in Python, so a gap to 150 ms is expected even though the model design is identical; with some engineering work you can narrow it, and the open-source repo may later add vLLM-based inference for the LLM.
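A rough way to measure first-packet latency of the streaming path (a sketch; it reuses the cosyvoice, message, prompt_text and prompt_speech_16k objects from the snippets above):

import time

start = time.time()
first_chunk_time = None
for chunk in cosyvoice.inference_zero_shot(tts_text=message, prompt_text=prompt_text,
                                           prompt_speech_16k=prompt_speech_16k, stream=True):
    if first_chunk_time is None:
        first_chunk_time = time.time()
        # first-packet latency: time until the first audio chunk is available
        print(f"first audio chunk after {first_chunk_time - start:.2f} s")
print(f"total generation time: {time.time() - start:.2f} s")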
【Streaming output - API request】
import io import os import sys import base64 import uvicorn import requests import torchaudio import numpy as np import soundfile as sf from datetime import datetime from fastapi import FastAPI from fastapi.responses import StreamingResponse from pydantic import BaseModel from typing import Optional from pydub import AudioSegment, silence from cosyvoice.cli.cosyvoice import CosyVoice2 from cosyvoice.utils.file_utils import load_wav cosyvoice = CosyVoice2('E:\\modelscope_cache\\hub\\iic\\CosyVoice2-0___5B', load_jit=True, load_trt=False, use_flow_cache=True) app = FastAPI() # 定义一个数据模型,用于接收POST请求中的数据 class TTSRequest(BaseModel): spk_id: Optional[str] = None # 预训练语音id ref_audio: Optional[str] = None # 参考语音 base64编码的音频文件 ref_text: Optional[str] = None # 参考语音的文本 ref_tag: Optional[str] = None # 参考语音的标签 tts_text: Optional[str] = None # 待合成的文本 instruct_text: Optional[str] = None # 指令文本 stream: Optional[bool] = False # 是否使用流式合成 speed: Optional[float] = 1.0 # 语速 mode: Optional[str] = 'zero_shot' # 合成模式,默认模式为 'sft',可选模式为 'sft' 'zero_shot' 'cross_lingual' 'instruct' def save_audio(reference_audio, reference_tag): # base64编码的音频文件转为音频文件并保存到本地 try: audio_data = base64.b64decode(reference_audio) reference_audio = io.BytesIO(audio_data) query_audio = AudioSegment.from_file(reference_audio, format="wav") query_audio = query_audio.set_channels(1) query_audio = query_audio.set_frame_rate(16000) query_audio = query_audio.set_sample_width(2) save_path = f"C:/Users/shao/Desktop/audios/{reference_tag}.wav" query_audio.export(save_path, format="wav") return save_path except base64.binascii.Error: print("Error: base64 decoding failed") return '' @app.post("/cosyvoice") async def tts(request: TTSRequest): headers = { "Content-Type": "audio/pcm", "X-Sample-Rate": "24000", # 假设采样率是 24kHz "X-Channel-Count": "1" # 假设单声道 } if request.mode == 'sft': if not request.spk_id: return {'message': 'spk_id is required for sft mode'} async def generate(): for out in cosyvoice.inference_sft(tts_text=request.tts_text, spk_id=request.spk_id, stream=request.stream, speed=request.speed): raw = (out['tts_speech'].numpy() * 32767).astype( np.int16).flatten() # 原始输出 [-1, 1] 之间的float32,需要转为 16 位 PCM yield raw.tobytes() return StreamingResponse(generate(), media_type="audio/pcm", headers=headers) else: ref_audio_path = f"C:/Users/shao/Desktop/audios/{request.ref_tag}.wav" ref_text_path = f"C:/Users/shao/Desktop/audios/{request.ref_tag}.txt" if not os.path.exists(ref_audio_path): ref_audio_path = save_audio(request.ref_audio, request.ref_tag) if not os.path.exists(ref_text_path): ref_text = request.ref_text with open(ref_text_path, 'w') as f: f.write(ref_text) else: ref_text = open(ref_text_path, 'r').read() if request.mode == 'zero_shot': async def generate(): for out in cosyvoice.inference_zero_shot(tts_text=request.tts_text, prompt_text=ref_text, prompt_speech_16k=load_wav(ref_audio_path, 16000), stream=request.stream, speed=request.speed): raw = (out['tts_speech'].numpy() * 32767).astype( np.int16).flatten() # 原始输出 [-1, 1] 之间的float32,需要转为 16 位 PCM yield raw.tobytes() return StreamingResponse(generate(), media_type="audio/pcm", headers=headers) elif request.mode == 'cross_lingual': async def generate(): for out in cosyvoice.inference_cross_lingual(tts_text=request.tts_text, prompt_speech_16k=load_wav(ref_audio_path, 16000), stream=request.stream, speed=request.speed): raw = (out['tts_speech'].numpy() * 32767).astype( np.int16).flatten() # 原始输出 [-1, 1] 之间的float32,需要转为 16 位 PCM yield raw.tobytes() return StreamingResponse(generate(), 
media_type="audio/pcm", headers=headers) elif request.mode == 'instruct': if not request.instruct_text: return {'message': 'instruct_text is required for instruct mode'} async def generate(): for out in cosyvoice.inference_instruct2(tts_text=request.tts_text, instruct_text=request.instruct_text, prompt_speech_16k=load_wav(ref_audio_path), stream=request.stream, speed=request.speed): raw = (out['tts_speech'].numpy() * 32767).astype( np.int16).flatten() # 原始输出 [-1, 1] 之间的float32,需要转为 16 位 PCM yield raw.tobytes() return StreamingResponse(generate(), media_type="audio/pcm", headers=headers) else: return {'message': 'Invalid mode'} if __name__ == '__main__': uvicorn.run(app, host='0.0.0.0', port=3005)
import base64 import datetime import time import requests import wave import pyaudio import io def wav_to_base64(file_path): with open(file_path, 'rb') as f: wav_data = f.read() base64_data = base64.b64encode(wav_data).decode('utf-8') return base64_data def play_and_save_pcm_stream(pcm_generator, save_path='C:/Users/shao/Desktop/audios/gen_audio.wav', sample_rate=24000, channels=1, sampwidth=2): p = pyaudio.PyAudio() stream = p.open(format=p.get_format_from_width(sampwidth), channels=channels, rate=sample_rate, output=True) # 存储所有PCM数据 all_pcm_data = bytearray() first_chunk = True try: for chunk in pcm_generator: if chunk: if first_chunk: playback_start_time = datetime.datetime.now() print(f"开始播放时间:{playback_start_time.strftime('%Y-%m-%d %H:%M:%S.%f')}") first_chunk = False stream.write(chunk) all_pcm_data.extend(chunk) finally: stream.stop_stream() stream.close() p.terminate() # 保存为WAV文件 with wave.open(save_path, 'wb') as wf: wf.setnchannels(channels) wf.setsampwidth(sampwidth) wf.setframerate(sample_rate) wf.writeframes(all_pcm_data) print(f"音频已保存到: {save_path}") def test_tts(): spk_id = "中文男" ref_text = "***" tts_text = "***" stream_mode = True data = { "tts_text": tts_text, "stream": stream_mode, "ref_audio": wav_to_base64(r'C:\Users\shao\Desktop\1.wav'), "ref_text": ref_text, "ref_tag": "test", "speed": 1.0, "mode": "zero_shot" } st = time.time() if stream_mode: playback_start_time = datetime.datetime.now() print(f"开始请求时间:{playback_start_time.strftime('%Y-%m-%d %H:%M:%S.%f')}") response = requests.post("http://localhost:3005/cosyvoice", json=data, stream=True) print(f"请求耗时:{time.time() - st}s") play_and_save_pcm_stream(response.iter_content(chunk_size=1024)) else: response = requests.post("http://localhost:3005/cosyvoice", json=data) print(f"请求耗时:{time.time() - st}s") audio_content = base64.b64decode(response.content) pcm2wav(audio_content) def pcm2wav(pcm_data, save_path='C:/Users/shao/Desktop/audios/gen_audio.wav'): with wave.open(save_path, "wb") as wav: wav.setnchannels(1) wav.setsampwidth(2) wav.setframerate(24000) wav.writeframes(pcm_data) print(f"音频已保存到: {save_path}") if __name__ == '__main__': test_tts()
【Streaming output - local code】
import os
import random

import librosa
import sounddevice as sd  # real-time playback
import soundfile as sf
import torch

from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.common import set_all_random_seed
from cosyvoice.utils.file_utils import load_wav

prompt_text = "***"
model_dir = 'E:\\modelscope_cache\\hub\\iic\\CosyVoice2-0___5B'
prompt_audio_path = r'C:\Users\shao\Desktop\1.wav'
message = '***'
seed = random.randint(1, 100000000)
stream = True

cosyvoice = CosyVoice2(model_dir, load_jit=False, load_trt=True, fp16=True, use_flow_cache=True)
prompt_sr = 16000
max_val = 0.8


def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    speech, _ = librosa.effects.trim(
        speech, top_db=top_db, frame_length=win_length, hop_length=hop_length
    )
    if speech.abs().max() > max_val:
        speech = speech / speech.abs().max() * max_val
    speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1)
    return speech


prompt_speech_16k = postprocess(load_wav(prompt_audio_path, prompt_sr))
set_all_random_seed(seed)

output_path = "C:\\Users\\shao\\Desktop\\CosyVoice2_stream.wav"
if os.path.exists(output_path):
    os.remove(output_path)

# play each chunk as it arrives and append it to the output file
with sf.SoundFile(output_path, mode='w', samplerate=cosyvoice.sample_rate, channels=1, format='WAV') as f:
    print("开始流式生成语音...")
    for i, chunk in enumerate(cosyvoice.inference_zero_shot(tts_text=message, prompt_text=prompt_text,
                                                            prompt_speech_16k=prompt_speech_16k,
                                                            stream=stream, speed=1.0)):
        speech = chunk['tts_speech'].numpy().flatten()
        sd.play(speech, samplerate=cosyvoice.sample_rate, blocking=True)  # real-time playback
        f.write(speech)  # append this chunk to the WAV file
        print(f"第 {i + 1} 段语音写入完毕")

print(f"全部生成完成,音频保存至:{output_path}")
2.4,Tone control
【Supported markers】https://funaudiollm.github.io/cosyvoice2/
在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。
追求卓越不是终点,它需要你每天都<strong>付出</strong>和<strong>精进</strong>,最终才能达到巅峰。
当你用心去倾听一首音乐时[breath],你会开始注意到那些细微的音符变化[breath],并通过它们感受到音乐背后的情感。
[breath]: breathing
[quick_breath]: rapid breathing
[laughter]: laughter
[cough]: coughing
[clucking]: tongue clicking
[accent]: accented speech
[noise]: background noise
[hissing]: hissing
[sigh]: sighing
[vocalized-noise]: vocalized noise
[lipsmack]: lip-smacking (e.g. kissing, smacking sounds)
[mn]: an indistinct "mm" sound
<|im_start|>, <|im_end|>, <|endofprompt|> → mark the start/end of a turn or the end of the prompt
<strong>, </strong> → emphasized text
<laughter>, </laughter> → a span spoken while laughing
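Usage follows the fine-grained-control example in the official README: the markers are simply embedded in tts_text (a sketch, reusing the cosyvoice and prompt_speech_16k objects from the earlier snippets):

import torchaudio

text = '在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。'
for i, j in enumerate(cosyvoice.inference_cross_lingual(text, prompt_speech_16k, stream=False)):
    torchaudio.save(f'fine_grained_control_{i}.wav', j['tts_speech'], cosyvoice.sample_rate)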
【Tone instructions】Only available through the inference_instruct2() mode (in client.py, set the mode as below).
parser.add_argument('--mode', default='instruct2', choices=['sft', 'zero_shot', 'cross_lingual', 'instruct'],
用惊讶的语气说<|endofprompt|>走进家门,看见墙上挂满了我的照片,我惊讶得愣住了。原来家人悄悄为我准备了一个惊喜的纪念墙。
用伤心的语气说<|endofprompt|>收到拒信的那一刻,我感到无比伤心。虽然知道失败是成长的一部分,但仍然难以掩饰心中的失落。
用开心的语气说<|endofprompt|>参加朋友的婚礼,看着新人幸福的笑脸,我感到无比开心。这样的爱与承诺,总是令人心生向往。
【Error】ValueError: buffer size must be a multiple of element size
【Fix】In client.py, build the request as follows:
else:
    payload = {
        'tts_text': args.tts_text,
        'instruct_text': args.instruct_text
    }
    files = [('prompt_wav', ('prompt_wav', open(args.prompt_wav, 'rb'), 'application/octet-stream'))]
    response = requests.request("GET", url, data=payload, files=files, stream=True)
【Error】Mono data must have shape (samples,). Received shape=(1, 77760)
【Fix】
pip install librosa==0.10.2
2.5,Saving & loading voices
CosyVoice2 can save and reload a cloned voice. However, if you want Instruct control over your own voice, none of the current CosyVoice models supports it: instruct mode simply discards the speaker embedding and falls back to an internal preset voice ("中文女"), so the method below cannot adjust the tone of your own voice; only the inline markers still work.
Summary: tone/emotion instructions cannot be applied to your own cloned voice; only inline markers can be used.
【Reference material】
【Saving a voice】To save your own voice and reuse it as a pre-trained speaker, first make a recording, then convert the .wav into a pre-trained speaker .pt:
import torch
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav

model_dir = r'E:\modelscope_cache\hub\iic\CosyVoice2-0___5B'
cosyvoice = CosyVoice2(model_dir)

spk_id = "xijun"
audio_path = "C:\\Users\\shaoqisun\\Desktop\\1.wav"
sample_text = "****"

prompt_speech_16k = load_wav(audio_path, 16000)
cosyvoice.add_zero_shot_spk(sample_text, prompt_speech_16k, spk_id)
print("注册成功,当前可用说话人:", cosyvoice.list_available_spks())
cosyvoice.save_spkinfo()

spk2info = torch.load(model_dir + '/spk2info.pt')
print("所有可用spk_id:", list(spk2info.keys()))
for spk_id in spk2info.keys():
    print(spk_id)
    print(spk2info[spk_id].keys())
【Loading a voice】Because the voice is not obtained through training, the quality is limited!
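Loading then just means passing the saved spk_id at inference time. A sketch following the usage in the official README (the zero_shot_spk_id argument is available in recent CosyVoice2 versions; replace '****' with your text):

import torchaudio

# prompt text and prompt audio can be left empty when a saved speaker id is given
for i, j in enumerate(cosyvoice.inference_zero_shot('****', '', '', zero_shot_spk_id='xijun', stream=False)):
    torchaudio.save(f'zero_shot_spk_{i}.wav', j['tts_speech'], cosyvoice.sample_rate)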
3,CosyVoice2 vllm
3.1,Installation errors
【Speed】Slower than the official version
【Project】https://github.com/qi-hua/async_cosyvoice
【Old code】
git clone -b dev/Comet --single-branch https://github.com/FunAudioLLM/CosyVoice.git
【Old weights】
conda install -c conda-forge git-lfs
git lfs install
git clone https://www.modelscope.cn/iic/CosyVoice2-0.5B.git
cd CosyVoice2-0.5B
git checkout 9bd5b08
git checkout ca7f2c63
cp async_cosyvoice/CosyVoice2-0.5B/* pretrained_models/CosyVoice2-0.5B/
【Error】ERROR Server encountered an error: Error(s) in loading state_dict for CausalMaskedDiffWithXvec
【Fix】Use absolute paths.
【Error】DEBUG dealloc called on running server <grpc._cython.cygrpc.AioServer object at 0x7f853bcdb7f0> with status 1
- https://github.com/qi-hua/async_cosyvoice/issues/22
- https://github.com/qi-hua/async_cosyvoice/issues/35
- https://github.com/qi-hua/async_cosyvoice/issues/74
【Fix】Comment out causal=True in cosyvoice2.yaml and use the old weights!
【Error】flow.decoder: ImportError: cannot import name 'EstimatorWrapper' from 'cosyvoice.flow.flow_matching'
【Fix】Replace cosyvoice/flow/flow_matching.py with the following:
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import queue import threading import torch import torch.nn.functional as F from matcha.models.components.flow_matching import BASECFM class EstimatorWrapper: def __init__(self, estimator_engine, estimator_count=2, ): self.estimators = queue.Queue() self.estimator_engine = estimator_engine for _ in range(estimator_count): estimator = estimator_engine.create_execution_context() if estimator is not None: self.estimators.put(estimator) if self.estimators.empty(): raise Exception("No available estimator") def acquire_estimator(self): return self.estimators.get(), self.estimator_engine def release_estimator(self, estimator): self.estimators.put(estimator) return class ConditionalCFM(BASECFM): def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None): super().__init__( n_feats=in_channels, cfm_params=cfm_params, n_spks=n_spks, spk_emb_dim=spk_emb_dim, ) self.t_scheduler = cfm_params.t_scheduler self.training_cfg_rate = cfm_params.training_cfg_rate self.inference_cfg_rate = cfm_params.inference_cfg_rate in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0) # Just change the architecture of the estimator here self.estimator = estimator self.lock = threading.Lock() @torch.inference_mode() def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, prompt_len=0, flow_cache=torch.zeros(1, 80, 0, 2)): """Forward diffusion Args: mu (torch.Tensor): output of encoder shape: (batch_size, n_feats, mel_timesteps) mask (torch.Tensor): output_mask shape: (batch_size, 1, mel_timesteps) n_timesteps (int): number of diffusion steps temperature (float, optional): temperature for scaling noise. Defaults to 1.0. spks (torch.Tensor, optional): speaker ids. Defaults to None. shape: (batch_size, spk_emb_dim) cond: Not used but kept for future purposes Returns: sample: generated mel-spectrogram shape: (batch_size, n_feats, mel_timesteps) """ z = torch.randn_like(mu).to(mu.device).to(mu.dtype) * temperature cache_size = flow_cache.shape[2] # fix prompt and overlap part mu and z if cache_size != 0: z[:, :, :cache_size] = flow_cache[:, :, :, 0] mu[:, :, :cache_size] = flow_cache[:, :, :, 1] z_cache = torch.concat([z[:, :, :prompt_len], z[:, :, -34:]], dim=2) mu_cache = torch.concat([mu[:, :, :prompt_len], mu[:, :, -34:]], dim=2) flow_cache = torch.stack([z_cache, mu_cache], dim=-1) t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype) if self.t_scheduler == 'cosine': t_span = 1 - torch.cos(t_span * 0.5 * torch.pi) return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), flow_cache def solve_euler(self, x, t_span, mu, mask, spks, cond): """ Fixed euler solver for ODEs. 
Args: x (torch.Tensor): random noise t_span (torch.Tensor): n_timesteps interpolated shape: (n_timesteps + 1,) mu (torch.Tensor): output of encoder shape: (batch_size, n_feats, mel_timesteps) mask (torch.Tensor): output_mask shape: (batch_size, 1, mel_timesteps) spks (torch.Tensor, optional): speaker ids. Defaults to None. shape: (batch_size, spk_emb_dim) cond: Not used but kept for future purposes """ t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0] t = t.unsqueeze(dim=0) # I am storing this because I can later plot it by putting a debugger here and saving it to a file # Or in future might add like a return_all_steps flag sol = [] # Do not use concat, it may cause memory format changed and trt infer with wrong results! x_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype) mask_in = torch.zeros([2, 1, x.size(2)], device=x.device, dtype=x.dtype) mu_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype) t_in = torch.zeros([2], device=x.device, dtype=x.dtype) spks_in = torch.zeros([2, 80], device=x.device, dtype=x.dtype) cond_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype) for step in range(1, len(t_span)): # Classifier-Free Guidance inference introduced in VoiceBox x_in[:] = x mask_in[:] = mask mu_in[0] = mu t_in[:] = t.unsqueeze(0) spks_in[0] = spks cond_in[0] = cond dphi_dt = self.forward_estimator( x_in, mask_in, mu_in, t_in, spks_in, cond_in ) dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0) dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt) x = x + dt * dphi_dt t = t + dt sol.append(x) if step < len(t_span) - 1: dt = t_span[step + 1] - t return sol[-1].float() def forward_estimator(self, x, mask, mu, t, spks, cond): if isinstance(self.estimator, torch.nn.Module): return self.estimator.forward(x, mask, mu, t, spks, cond) else: if isinstance(self.estimator, EstimatorWrapper): estimator, engine = self.estimator.acquire_estimator() estimator.set_input_shape('x', (2, 80, x.size(2))) estimator.set_input_shape('mask', (2, 1, x.size(2))) estimator.set_input_shape('mu', (2, 80, x.size(2))) estimator.set_input_shape('t', (2,)) estimator.set_input_shape('spks', (2, 80)) estimator.set_input_shape('cond', (2, 80, x.size(2))) data_ptrs = [x.contiguous().data_ptr(), mask.contiguous().data_ptr(), mu.contiguous().data_ptr(), t.contiguous().data_ptr(), spks.contiguous().data_ptr(), cond.contiguous().data_ptr(), x.data_ptr()] for idx, data_ptr in enumerate(data_ptrs): estimator.set_tensor_address(engine.get_tensor_name(idx), data_ptr) # run trt engine estimator.execute_async_v3(torch.cuda.current_stream().cuda_stream) torch.cuda.current_stream().synchronize() self.estimator.release_estimator(estimator) return x else: with self.lock: self.estimator.set_input_shape('x', (2, 80, x.size(2))) self.estimator.set_input_shape('mask', (2, 1, x.size(2))) self.estimator.set_input_shape('mu', (2, 80, x.size(2))) self.estimator.set_input_shape('t', (2,)) self.estimator.set_input_shape('spks', (2, 80)) self.estimator.set_input_shape('cond', (2, 80, x.size(2))) # run trt engine self.estimator.execute_v2([x.contiguous().data_ptr(), mask.contiguous().data_ptr(), mu.contiguous().data_ptr(), t.contiguous().data_ptr(), spks.contiguous().data_ptr(), cond.contiguous().data_ptr(), x.data_ptr()]) return x def compute_loss(self, x1, mask, mu, spks=None, cond=None): """Computes diffusion loss Args: x1 (torch.Tensor): Target shape: (batch_size, n_feats, mel_timesteps) mask (torch.Tensor): target mask 
shape: (batch_size, 1, mel_timesteps) mu (torch.Tensor): output of encoder shape: (batch_size, n_feats, mel_timesteps) spks (torch.Tensor, optional): speaker embedding. Defaults to None. shape: (batch_size, spk_emb_dim) Returns: loss: conditional flow matching loss y: conditional flow shape: (batch_size, n_feats, mel_timesteps) """ b, _, t = mu.shape # random timestep t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype) if self.t_scheduler == 'cosine': t = 1 - torch.cos(t * 0.5 * torch.pi) # sample noise p(x_0) z = torch.randn_like(x1) y = (1 - (1 - self.sigma_min) * t) * z + t * x1 u = x1 - (1 - self.sigma_min) * z # during training, we randomly drop condition to trade off mode coverage and sample fidelity if self.training_cfg_rate > 0: cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate mu = mu * cfg_mask.view(-1, 1, 1) spks = spks * cfg_mask.view(-1, 1) cond = cond * cfg_mask.view(-1, 1, 1) pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond) loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1]) return loss, y class CausalConditionalCFM(ConditionalCFM): def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None): super().__init__(in_channels, cfm_params, n_spks, spk_emb_dim, estimator) self.rand_noise = torch.randn([1, 80, 50 * 300]) @torch.inference_mode() def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None): """Forward diffusion Args: mu (torch.Tensor): output of encoder shape: (batch_size, n_feats, mel_timesteps) mask (torch.Tensor): output_mask shape: (batch_size, 1, mel_timesteps) n_timesteps (int): number of diffusion steps temperature (float, optional): temperature for scaling noise. Defaults to 1.0. spks (torch.Tensor, optional): speaker ids. Defaults to None. shape: (batch_size, spk_emb_dim) cond: Not used but kept for future purposes Returns: sample: generated mel-spectrogram shape: (batch_size, n_feats, mel_timesteps) """ z = self.rand_noise[:, :, :mu.size(2)].to(mu.device).to(mu.dtype) * temperature # fix prompt and overlap part mu and z t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype) if self.t_scheduler == 'cosine': t_span = 1 - torch.cos(t_span * 0.5 * torch.pi) return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), None
3.2,Launching the project
【GRPC】
cd runtime/async_grpc
python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. cosyvoice.proto
python server.py --load_jit --load_trt --fp16
【Error】ImportError: libre2.so.11: cannot open shared object file: No such file or directory
sudo apt install git cmake build-essential
git clone https://github.com/google/re2.git
cd re2
make -j$(nproc)
sudo make install
sudo ldconfig
【Error】re2/dfa.cc:37:10: fatal error: absl/base/call_once.h: No such file or directory
   37 | #include "absl/base/call_once.h"
      |          ^~~~~~~~~~~~~~~~~~~~~~~
【Fix】Build and install abseil-cpp, then rebuild re2:
git clone https://github.com/abseil/abseil-cpp.git
cd abseil-cpp
mkdir build && cd build
cmake .. -DCMAKE_POSITION_INDEPENDENT_CODE=ON
make -j$(nproc)
sudo make install
sudo ldconfig
# back in the re2 source tree, rebuild re2 against abseil
cd ..
cd ..
make clean
make -j$(nproc)
sudo make install
sudo ldconfig
【Error】Package 'gtest', required by 'virtual:world', not found
sudo apt-get install libbenchmark-dev
sudo apt-get install libgtest-dev
【Error】ImportError: /root/miniconda3/envs/ac2/bin/../lib/libstdc++.so.6: version `GLIBCXX_3.4.32' not found (required by /usr/local/lib/libre2.so.11)
See: https://github.com/pybind/pybind11/discussions/3453
rm /home/xx/anaconda3/bin/../lib/libstdc++.so.6
cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.29 /home/xx/anaconda3/bin/../lib
ln -s /home/xx/anaconda3/bin/../lib/libstdc++.so.6.0.29 /home/xx/anaconda3/bin/../lib/libstdc++.so.6
【Error】ImportError: /root/autodl-tmp/miniconda3/envs/cosyvoice2/lib/python3.10/site-packages/ttsfrd.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZN3re212re2_internal5ParseISsEEbPKcmPT_
export LD_LIBRARY_PATH=/root/autodl-tmp/CosyVoice/async_cosyvoice/runtime/async_grpc/re2/lib:$LD_LIBRARY_PATH
pip install /root/CosyVoice/pretrained_models/CosyVoice2-ttsfrd/ttsfrd_dependency-0.1-py3-none-any.whl
pip install /root/CosyVoice/pretrained_models/CosyVoice2-ttsfrd/ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl
【Error】ERROR Server encountered an error: failed to initialize ttsfrd resource
【Fix】Copy the 'CosyVoice-ttsfrd' folder under 'pretrained_models' and rename the copy to 'speech_kantts_ttsfrd', then run: unzip resource.zip -d .
3.3,Saving a voice
from async_cosyvoice.async_cosyvoice import AsyncCosyVoice2
from cosyvoice.utils.file_utils import load_wav

if __name__ == '__main__':
    model_dir = r'/root/CosyVoice/pretrained_models/CosyVoice2-0.5B'
    cosyvoice = AsyncCosyVoice2(model_dir, load_jit=True, load_trt=True, fp16=True)

    spk_id = "xijun"
    audio_path = "1.wav"
    prompt_text = "我这边现在确实是还不上,然后前一阵你们这边一直来电话各种骚扰,还跟我家人打电话。"
    prompt_speech_16k = load_wav(audio_path, 16000)
    cosyvoice.frontend.generate_spk_info(spk_id, prompt_text, prompt_speech_16k, 24000, "A")
4,CosyVoice2 Sglang
4.1,Model conversion
Export the Qwen LLM embedded in CosyVoice2 as a standalone Qwen2ForCausalLM checkpoint (i.e. replace CosyVoice2's nested Qwen with a plain Qwen model that Sglang can load):
import torch
from transformers import Qwen2ForCausalLM, AutoConfig
from safetensors.torch import load_file


class Qwen2ForCausalLM_tts(Qwen2ForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        llm_input_size = 896
        llm_output_size = 896
        speech_token_size = 151936
        # adjust these dimensions to your own (locally modified) model
        self.llm_decoder = torch.nn.Linear(llm_output_size, speech_token_size + 3)
        self.speech_embedding = torch.nn.Embedding(speech_token_size + 3, llm_input_size)

    def forward(self, *args, **kwargs):
        outputs = super().forward(*args, **kwargs)
        return outputs


config = AutoConfig.from_pretrained("/root/CosyVoice-sglang/pretrained_models/CosyVoice2-0.5B/CosyVoice-BlankEN")
custom_model = Qwen2ForCausalLM_tts(config)

state_dict = load_file("/root/CosyVoice-sglang/pretrained_models/Qwen2.5-0.5B/model.safetensors")
# pull the actual Qwen weights out of CosyVoice's nested 'llm.model.' prefix
new_state_dict = {
    k.replace('llm.model.', '') if 'llm.model.' in k else k: v
    for k, v in state_dict.items()
}
custom_model.load_state_dict(new_state_dict, strict=False)
custom_model.save_pretrained("/root/CosyVoice-sglang/pretrained_models/CosyVoice-Sglang", safe_serialization=True)
Register the Qwen variant with Sglang:
vim /root/miniconda3/envs/cv2-sglang/lib/python3.10/site-packages/sglang/srt/models/qwen2_5.py
from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn from sglang.srt.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear, ) from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.pooler import Pooler, PoolingType from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.layers.rotary_embedding import get_rope from sglang.srt.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, ) from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import ( default_weight_loader, kv_cache_scales_loader, ) from sglang.srt.utils import make_layers Qwen2Config = None class Qwen2MLP(nn.Module): def __init__( self, hidden_size: int, intermediate_size: int, hidden_act: str, quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config, ) self.down_proj = RowParallelLinear( intermediate_size, hidden_size, bias=False, quant_config=quant_config, ) if hidden_act != "silu": raise ValueError( f"Unsupported activation: {hidden_act}. " "Only silu is supported for now." ) self.act_fn = SiluAndMul() def forward(self, x): gate_up, _ = self.gate_up_proj(x) x = self.act_fn(gate_up) x, _ = self.down_proj(x) return x class Qwen2Attention(nn.Module): def __init__( self, hidden_size: int, num_heads: int, num_kv_heads: int, layer_id: int = 0, rope_theta: float = 1000000, rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 32768, quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() self.hidden_size = hidden_size tp_size = get_tensor_model_parallel_world_size() self.total_num_heads = num_heads assert self.total_num_heads % tp_size == 0 self.num_heads = self.total_num_heads // tp_size self.total_num_kv_heads = num_kv_heads if self.total_num_kv_heads >= tp_size: # Number of KV heads is greater than TP size, so we partition # the KV heads across multiple tensor parallel GPUs. assert self.total_num_kv_heads % tp_size == 0 else: # Number of KV heads is less than TP size, so we replicate # the KV heads across multiple tensor parallel GPUs. 
assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) self.head_dim = hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim ** -0.5 self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( hidden_size, self.head_dim, self.total_num_heads, self.total_num_kv_heads, bias=True, quant_config=quant_config, ) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, hidden_size, bias=False, quant_config=quant_config, ) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, rope_scaling=rope_scaling, ) self.attn = RadixAttention( self.num_heads, self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, layer_id=layer_id, ) def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, forward_batch: ForwardBatch, ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v, forward_batch) output, _ = self.o_proj(attn_output) return output class Qwen2DecoderLayer(nn.Module): def __init__( self, config: Qwen2Config, layer_id: int = 0, quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 1000000) rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 32768) self.self_attn = Qwen2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, layer_id=layer_id, rope_theta=rope_theta, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, ) self.mlp = Qwen2MLP( hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm( config.hidden_size, eps=config.rms_norm_eps ) def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, forward_batch: ForwardBatch, residual: Optional[torch.Tensor], ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) else: hidden_states, residual = self.input_layernorm(hidden_states, residual) hidden_states = self.self_attn( positions=positions, hidden_states=hidden_states, forward_batch=forward_batch, ) # Fully Connected hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) hidden_states = self.mlp(hidden_states) return hidden_states, residual class Qwen2Model(nn.Module): def __init__( self, config: Qwen2Config, quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, quant_config=quant_config, ) self.layers = make_layers( config.num_hidden_layers, lambda idx, prefix: Qwen2DecoderLayer( layer_id=idx, config=config, quant_config=quant_config, ), ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) def 
get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: if hasattr(self.config, "scale_emb"): return self.embed_tokens(input_ids) * self.config.scale_emb else: return self.embed_tokens(input_ids) def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, forward_batch: ForwardBatch, input_embeds: torch.Tensor = None, ) -> torch.Tensor: if input_embeds is None: hidden_states = self.embed_tokens(input_ids) else: hidden_states = input_embeds residual = None for i in range(len(self.layers)): layer = self.layers[i] hidden_states, residual = layer( positions, hidden_states, forward_batch, residual, ) hidden_states, _ = self.norm(hidden_states, residual) return hidden_states # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). Thus, handled exceptions should # make sure to leave KV cache scale factors in a known good (dummy) state def load_kv_cache_scales(self, quantization_param_path: str) -> None: tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() for layer_idx, scaling_factor in kv_cache_scales_loader( quantization_param_path, tp_rank, tp_size, self.config.num_hidden_layers, self.config.__class__.model_type, ): if not isinstance(self.layers[layer_idx], nn.Identity): layer_self_attn = self.layers[layer_idx].self_attn if hasattr(layer_self_attn.attn, "k_scale"): layer_self_attn.attn.k_scale = scaling_factor layer_self_attn.attn.v_scale = scaling_factor else: raise RuntimeError( "Self attention has no KV cache scaling " "factor attribute!" ) class Qwen2ForCausalLM_tts(nn.Module): # BitandBytes specific attributes default_bitsandbytes_target_modules = [ ".gate_proj.", ".down_proj.", ".up_proj.", ".q_proj.", ".k_proj.", ".v_proj.", ".o_proj.", ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), "k_proj": ("qkv_proj", 1), "v_proj": ("qkv_proj", 2), "gate_proj": ("gate_up_proj", 0), "up_proj": ("gate_up_proj", 1), } def __init__( self, config: Qwen2Config, quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() self.config = config self.quant_config = quant_config self.model = Qwen2Model(config, quant_config=quant_config) if config.tie_word_embeddings: self.lm_head = self.model.embed_tokens else: self.lm_head = ParallelLMHead( config.vocab_size, config.hidden_size, quant_config=quant_config ) self.logits_processor = LogitsProcessor(config) self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True) # 克隆的大概这么写: llm_input_size = 896 llm_output_size = 896 speech_token_size = 151936 self.speech_embedding = VocabParallelEmbedding( speech_token_size + 3, llm_input_size, quant_config=quant_config, ) # torch.nn.Embedding(speech_token_size + 3, llm_input_size) self.llm_decoder = ParallelLMHead( speech_token_size + 3, llm_output_size, quant_config=quant_config ) # torch.nn.Linear(llm_output_size, speech_token_size + 3) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @torch.no_grad() def qwen_forward( self, input_ids: torch.Tensor, positions: torch.Tensor, forward_batch: ForwardBatch, input_embeds: torch.Tensor = None, ) -> torch.Tensor: if input_embeds is None: hidden_states = self.speech_embedding(input_ids) else: hidden_states = input_embeds residual = None for i in range(len(self.model.layers)): layer = self.model.layers[i] hidden_states, residual = layer( positions, hidden_states, forward_batch, residual, ) hidden_states, _ = 
self.model.norm(hidden_states, residual) return hidden_states @torch.no_grad() def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, forward_batch: ForwardBatch, input_embeds: torch.Tensor = None, get_embedding: bool = False, ) -> torch.Tensor: hidden_states = self.qwen_forward(input_ids, positions, forward_batch, input_embeds) if not get_embedding: return self.logits_processor( input_ids, hidden_states, self.llm_decoder, forward_batch ) else: return self.pooler(hidden_states, forward_batch) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), ("qkv_proj", "v_proj", "v"), ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name or "projector" in name: continue if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue if name.startswith("model.vision_tower") and name not in params_dict: continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break else: # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) def get_embed_and_head(self): # return self.model.embed_tokens.weight, self.lm_head.weight return self.speech_embedding.weight, self.llm_decoder.weight def set_embed_and_head(self, embed, head): # del self.model.embed_tokens.weight # del self.lm_head.weight # self.model.embed_tokens.weight = embed # self.lm_head.weight = head del self.speech_embedding.weight del self.llm_decoder.weight self.speech_embedding.weight = embed self.llm_decoder.weight = head torch.cuda.empty_cache() torch.cuda.synchronize() def load_kv_cache_scales(self, quantization_param_path: str) -> None: self.model.load_kv_cache_scales(quantization_param_path) EntryClass = Qwen2ForCausalLM_tts
4.2,Starting the server
python -m sglang.launch_server --model-path /root/CosyVoice-sglang/pretrained_models/CosyVoice-Sglang --tensor-parallel-size 1 --disable-radix --skip-tokenizer-init --random-seed 1234 --mem-fraction-static 0.3 --dtype bfloat16 --base-gpu-id 0
Check GPU memory usage:
nvitop -1
【Error】ImportError: cannot import name 'AutoModel' from partially initialized module 'transformers' (most likely due to a circular import) (/root/miniconda3/envs/cv2-sglang/lib/python3.10/site-packages/sglang/srt/models/transformers.py)
mv /root/miniconda3/envs/cv2-sglang/lib/python3.10/site-packages/sglang/srt/models/transformers.py /root/miniconda3/envs/cv2-sglang/lib/python3.10/site-packages/sglang/srt/models/transformers2.py
【Error】/bin/sh: 1: /usr/local/cuda/bin/nvcc: not found
ninja: build stopped: subcommand failed.
【Error】
Error analysis:
In file included from /usr/include/crt/math_functions.h:10551,
                 from /usr/include/crt/common_functions.h:303,
                 from /usr/include/cuda_runtime.h:118,
                 from <command-line>:
/usr/include/c++/12/cmath:45:15: fatal error: math.h: No such file or directory
   45 | #include_next <math.h>
      |               ^~~~~~~~
compilation terminated.
fatal   : Could not open input file /tmp/tmpxft_000031b8_00000000-7_batch_prefill_jit_pybind.cpp1.ii
ninja: build stopped: subcommand failed.
【Fix】Match the GCC and CUDA versions:
https://stackoverflow.com/questions/6622454/cuda-incompatible-with-gcc-version
echo 'export PATH=/usr/local/cuda-12.8/bin:$PATH' >> ~/.bashrc
echo 'export LD_LIBRARY_PATH=/usr/local/cuda-12.8/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc
source ~/.bashrc
【Error】Outdated CUDA / NVIDIA driver; typical symptoms:
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda-12.8' → Python/Triton cannot find a working CUDA runtime even though CUDA 12.8 is installed.
RuntimeError: 0 active drivers ([]). There should only be one. → Triton found no usable NVIDIA driver during initialization.
Torch Warning: system has unsupported display driver / cuda driver combination → PyTorch detected a mismatch between the display driver and the CUDA driver.
Can't initialize NVML → the NVIDIA Management Library (used for GPU state queries) cannot start, indicating a driver problem.
【Fix】Update the driver:
sudo ubuntu-drivers autoinstall
4.3,Starting the client
Model conversion: run onnx2trt.py and select fp16 precision.
Specify the model: in cosyvoice_ca.py (around line 200), point the estimator at the .plan file produced in the previous step, e.g.:
f"flow.decoder.estimator.fp16.4070.plan"
Launch cosyvoice_ca.py with the model directory /root/CosyVoice-sglang/pretrained_models/CosyVoice2-0.5B.
Save a voice:
import torch
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav

model_dir = r'E:\modelscope_cache\hub\iic\CosyVoice2-0___5B'
cosyvoice = CosyVoice2(model_dir)

spk_id = "xijun"
audio_path = "C:\\Users\\shaoqisun\\Desktop\\1.wav"
sample_text = "****"

prompt_speech_16k = load_wav(audio_path, 16000)
cosyvoice.add_zero_shot_spk(sample_text, prompt_speech_16k, spk_id)
print("注册成功,当前可用说话人:", cosyvoice.list_available_spks())
cosyvoice.save_spkinfo()

spk2info = torch.load(model_dir + '/spk2info.pt')
print("所有可用spk_id:", list(spk2info.keys()))
for spk_id in spk2info.keys():
    print(spk_id)
    print(spk2info[spk_id].keys())
【Error】AttributeError: 'NoneType' object has no attribute 'additional_stop_token_ids'
【Fix】Temporary workaround: disable max_new_tokens & min_new_tokens. See: https://github.com/sgl-project/sglang/issues/9039
【Error】
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [87,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [87,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
... (the same assertion is repeated for every CUDA thread)
【Fix】Check whether any generated token id exceeds the index range allowed by the LLM's embedding table.
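A quick sanity check (a sketch: token_ids here stands for whatever id sequence is about to be embedded, and the sizes match the conversion script above):

speech_token_size = 151936
vocab = speech_token_size + 3   # size of speech_embedding / llm_decoder used above
bad = [t for t in token_ids if t < 0 or t >= vocab]
if bad:
    print(f"{len(bad)} out-of-range ids, max id = {max(token_ids)}, allowed < {vocab}")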
5,CosyVoice2 Triton
【Pull the image】
docker pull docker-0.unsee.tech/soar97/triton-cosyvoice:25.06
【Download the models】
# configure proxy environment variables
export HTTP_PROXY=http://127.0.0.1:15732
export HTTPS_PROXY=http://127.0.0.1:15732
huggingface_model_local_dir=./cosyvoice2_llm
model_scope_model_local_dir=./CosyVoice2-0.5B
huggingface-cli download --local-dir $huggingface_model_local_dir yuekai/cosyvoice2_llm
modelscope download --model iic/CosyVoice2-0.5B --local_dir $model_scope_model_local_dir
【Start the container】
docker run -it --name "cosyvoice-server" --gpus all --net host -v /root/PycharmProjects:/workspace --shm-size=2g docker-0.unsee.tech/soar97/triton-cosyvoice:25.06
【Re-enter the container】
docker start cosyvoice-server
docker exec -it cosyvoice-server /bin/bash
【Run the stages in order】
bash run.sh 1 1
bash run.sh 2 2
bash run.sh 3 3
bash run.sh 4 4
bash run.sh 5 5
【Error】error: creating server: Internal - failed to load all models
git submodule update --init --recursive
【Error】ConnectionError: Couldn't reach 'yuekai/seed_tts_cosy2' on the Hub (LocalEntryNotFoundError)
【Fix】Manually download yuekai/seed_tts_cosy2.
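For example with huggingface-cli (assuming yuekai/seed_tts_cosy2 is hosted as a dataset repo; adjust --local-dir to wherever the run script expects it):

huggingface-cli download --repo-type dataset --local-dir ./seed_tts_cosy2 yuekai/seed_tts_cosy2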