TTS: CosyVoice2 High-Quality Speech Generation
CosyVoice2 can clone a voice from only 3–10 seconds of reference audio and synthesize speech across languages: Chinese, English, Japanese, Korean, and Chinese dialects (Cantonese, Sichuanese, Shanghainese, Tianjin, Wuhan, Changsha, Zhengzhou dialects, etc.). It can also generate speech with a range of emotions (happy, excited, sad, angry, and more) and control tone and emotion through natural-language instructions.
【Paper】CosyVoice: A Scalable Multilingual Zero-shot Text-to-speech Synthesizer based on Supervised Semantic Tokens
【Paper】CosyVoice 2: Scalable Streaming Speech Synthesis with Large Language Models
【Project】https://github.com/FunAudioLLM/CosyVoice
pip install wetext==0.0.4 inflect==7.3.1 numpy==1.25.0 gradio HyperPyYAML==1.2.2 librosa==0.10.2 onnxruntime-gpu==1.19.0 openai-whisper==20231117 transformers==4.40.1 omegaconf==2.3.0 conformer==0.3.2 diffusers==0.29.0 hydra-core==1.3.2 lightning==2.2.4 gdown==5.1.0 matplotlib==3.7.5 wget==3.2 pyarrow==18.1.0 pyworld==0.3.4 pydantic==2.10.6 tensorrt==10.9.0.34 pynini==2.1.5 openpyxl
conda install -c conda-forge pynini
pip install WeTextProcessing --no-deps
conda install -c conda-forge pyface
【CosyVoice2-0.5B】
E:\\modelscope_cache\\hub\\iic\\CosyVoice2-0___5B
【CosyVoice-300M】
E:\\modelscope_cache\\hub\\iic\\CosyVoice-300M
【CosyVoice-300M-Instruct】
E:\\modelscope_cache\\hub\\iic\\CosyVoice-300M-Instruct
【CosyVoice-300M-SFT】
E:\\modelscope_cache\\hub\\iic\\CosyVoice-300M-SFT
1,CosyVoice
1.1,inference_zero_shot
【Server】CosyVoice/runtime/python/fastapi/server.py; add the following environment variables:
PYTHONUNBUFFERED=1;PYTHONPATH=D:\PyCharmWorkSpace\VH\Linly-Talker\CosyVoice\third_party\Matcha-TTS
【Error】TypeError: expected str, bytes or os.PathLike object, not MultiplexedPath
【Fix】MultiplexedPath is not handled on Windows; pass an explicit cache directory manually:
self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False, overwrite_cache=True)
self.en_tn_model = EnNormalizer()
👇
self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False, overwrite_cache=True, cache_dir="tn")
self.en_tn_model = EnNormalizer(cache_dir="tn")
【inference_zero_shot】
import torch
import random
from openpyxl import load_workbook
from cosyvoice.cli.cosyvoice import CosyVoice2, CosyVoice
from cosyvoice.utils.file_utils import load_wav
from cosyvoice.utils.common import set_all_random_seed
import librosa
import soundfile as sf

message = "****。"
model_dir = 'E:\\modelscope_cache\\hub\\iic\\CosyVoice-300M'
prompt_audio_path = 'yuan.WAV'
prompt_text = '今天可以处理吗?'
seed = random.randint(1, 100000000)
stream = False
speeds = [0.97]

cosyvoice = CosyVoice(model_dir)
prompt_sr = 16000
max_val = 0.8


def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    speech, _ = librosa.effects.trim(
        speech, top_db=top_db, frame_length=win_length, hop_length=hop_length
    )
    if speech.abs().max() > max_val:
        speech = speech / speech.abs().max() * max_val
    speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1)
    return speech


prompt_speech_16k = postprocess(load_wav(prompt_audio_path, prompt_sr))
set_all_random_seed(seed)
tts_wav = "C:\\Users\\shao\\Desktop\\"
for i in cosyvoice.inference_zero_shot(tts_text=message, prompt_text=prompt_text,
                                       prompt_speech_16k=prompt_speech_16k, stream=stream, speed=1.0):
    speech = i['tts_speech'].numpy().flatten()
    sf.write(tts_wav + "CosyVoice-300B.wav", speech, samplerate=cosyvoice.sample_rate)
    print(f"生成成功,音频保存至:{tts_wav}")
【Timing】Total time from text in 👉 .wav out: 2.8 s
- receiving the speech data in the response: 40 ms
- concatenating and assembling the response: 2400 ms
【Batch speech generation】
import logging
import os
import time

import numpy as np
import requests
import torch
import torchaudio

from cosyvoice.cli.cosyvoice import CosyVoice2, CosyVoice

messages = []

cosyvoice = CosyVoice2(r'D:\modelscope_cache\hub\iic\CosyVoice2-0___5B')
# cosyvoice = CosyVoice(r'D:\modelscope_cache\hub\iic\CosyVoice-300M-SFT')
# cosyvoice = CosyVoice(r'D:\modelscope_cache\hub\iic\CosyVoice-300M')
# cosyvoice = CosyVoice(r'D:\modelscope_cache\hub\iic\CosyVoice-300M-Instruct')

url = "http://127.0.0.1:50000/inference_zero_shot"
prompt_wav = "D:\\PyCharmWorkSpace\\TTS\\CosyVoice2\\cosyvoice\\data\\test.mp3"

index = 1
for message in messages:
    tts_wav = str(index) + ".wav"
    payload = {
        'tts_text': message,
        'prompt_text': "XXXX"
    }
    files = [('prompt_wav', ('prompt_wav', open(prompt_wav, 'rb'), 'application/octet-stream'))]
    response = requests.request("GET", url, data=payload, files=files, stream=True)
    tts_audio = b''
    for r in response.iter_content(chunk_size=16000):
        tts_audio += r
    tts_speech = torch.from_numpy(np.array(np.frombuffer(tts_audio, dtype=np.int16))).unsqueeze(dim=0)
    logging.info('save response to {}'.format(tts_wav))
    torchaudio.save(tts_wav, tts_speech, 22050)
    logging.info('get response')
    index += 1
import random

import librosa
import soundfile as sf
import torch
from openpyxl import load_workbook

from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.common import set_all_random_seed
from cosyvoice.utils.file_utils import load_wav


def read_excel_as_list(filename, sheet_name=None):
    wb = load_workbook(filename)
    ws = wb[sheet_name] if sheet_name else wb.active
    data = [[cell.value for cell in row] for row in ws.iter_rows()]
    return data


# configuration
model_dir = 'D:/modelscope_cache/hub/iic/CosyVoice2-0___5B'  # model path
prompt_audio_path = 'D:\\PyCharmWorkSpace\\TTS\\CosyVoice2\\cosyvoice\\data\\test2.wav'
prompt_text = ''
seed = random.randint(1, 100000000)
stream = False
speed = 1.0

# initialize the model
cosyvoice = CosyVoice2(model_dir)
prompt_sr = 16000
max_val = 0.8


# preprocessing
def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    speech, _ = librosa.effects.trim(
        speech, top_db=top_db, frame_length=win_length, hop_length=hop_length
    )
    if speech.abs().max() > max_val:
        speech = speech / speech.abs().max() * max_val
    speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1)
    return speech


# load the prompt audio
prompt_speech_16k = postprocess(load_wav(prompt_audio_path, prompt_sr))

# fix the random seed
set_all_random_seed(seed)

data = read_excel_as_list("example.xlsx")
for message in data:
    tts_wav = "C:\\Users\\shao\\Desktop\\output2\\" + str(message[0]) + ".wav"
    for i in cosyvoice.inference_zero_shot(message[1], prompt_text, prompt_speech_16k, stream=stream, speed=speed):
        speech = i['tts_speech'].numpy().flatten()
        sf.write(tts_wav, speech, samplerate=cosyvoice.sample_rate)
        print(f"生成成功,音频保存至:{tts_wav}")
【Long text & speech-rate-too-fast issue】For long inputs the model returns several segments; write each one out and merge them afterwards:
import os

import soundfile as sf
from pydub import AudioSegment

index = 1
wav_files = []
for i in cosyvoice.inference_zero_shot(message, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
    print(index)
    tts_wav = ""
    speech = i['tts_speech'].numpy().flatten()
    sf.write(tts_wav, speech, samplerate=cosyvoice.sample_rate)
    print(f"生成成功,音频保存至:{tts_wav}")
    wav_files.append(tts_wav)
    index += 1

existing_wavs = [f for f in wav_files if os.path.isfile(f)]
if len(existing_wavs) > 1:
    combined = AudioSegment.empty()
    for wav_file in existing_wavs:
        audio = AudioSegment.from_wav(wav_file)
        combined += audio
    output_file = ''
    combined.export(output_file, format='wav')
    print(f"合并完成,保存为:{output_file}")
1.2,inference_sft
【Saving a speaker file】See the CosyVoice2 section (2.5) for details.
data = load_spk_from_wav("C:\\Users\\shao\\Desktop\\5.wav", cosyvoice)
torch.save(data, f'speakers/xijun.pt')
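load_spk_from_wav is not part of the CosyVoice API; it is a small local helper. A minimal sketch of what it might look like, assuming the frontend's internal _extract_spk_embedding helper is used to build the 'embedding' entry that inference_sft reads from spk2info (an illustration only, not official API):

import torch
from cosyvoice.utils.file_utils import load_wav

def load_spk_from_wav(wav_path, cosyvoice, prompt_sr=16000):
    # extract the speaker embedding from a 16 kHz reference recording
    prompt_speech_16k = load_wav(wav_path, prompt_sr)
    embedding = cosyvoice.frontend._extract_spk_embedding(prompt_speech_16k)
    # sft mode only needs the embedding; it is used for both the LLM and the flow model
    return {'embedding': embedding}

# register it so inference_sft(spk_id="xijun") can find it
data = load_spk_from_wav("C:\\Users\\shao\\Desktop\\5.wav", cosyvoice)
cosyvoice.frontend.spk2info['xijun'] = data
torch.save(data, 'speakers/xijun.pt')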
【Direct generation】Time: 4 ms
import random
from cosyvoice.utils.common import set_all_random_seed
import soundfile as sf
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2

message = "****"
model_dir = 'E:\\modelscope_cache\\hub\\iic\\CosyVoice-300M-SFT'
seed = random.randint(1, 100000000)
stream = False
speeds = [0.97]

cosyvoice = CosyVoice(model_dir)
prompt_sr = 16000
max_val = 0.8

set_all_random_seed(seed)
tts_wav = "C:\\Users\\shao\\Desktop\\"
for i in cosyvoice.inference_sft(tts_text=message, spk_id="xijun", stream=stream, speed=1.0):
    speech = i['tts_speech'].numpy().flatten()
    sf.write(tts_wav + "CosyVoice-300B.wav", speech, samplerate=cosyvoice.sample_rate)
    print(f"生成成功,音频保存至:{tts_wav}")
1.3,inference_instruct
【Direct generation】Time: 6 ms. Your own cloned voice cannot be used in instruct mode, reportedly for safety reasons.
model_dir = 'E:\\modelscope_cache\\hub\\iic\\CosyVoice-300M-Instruct'

for i in cosyvoice.inference_instruct(tts_text=message, spk_id="xijun", instruct_text="用开心的语气说",
                                      stream=stream, speed=1.0):
    speech = i['tts_speech'].numpy().flatten()
    sf.write(tts_wav + "CosyVoice-300B.wav", speech, samplerate=cosyvoice.sample_rate)
    print(f"生成成功,音频保存至:{tts_wav}")
2,CosyVoice2
2.1,inference_zero_shot
【Server】CosyVoice/runtime/python/fastapi/server.py; add the following environment variables:
PYTHONUNBUFFERED=1;PYTHONPATH=D:\PyCharmWorkSpace\Linly-Talker\CosyVoice\third_party\Matcha-TTS
【Error】ZeroDivisionError: 0.0 cannot be raised to a negative power
【Fix】The installed diffusers version is too new; downgrade to 0.29.0.
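The downgrade is a single pip command:

pip install diffusers==0.29.0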
【Error】Pre-trained voices cannot be found
【Fix】Manually download spk2info.pt and copy it into pretrained_models/CosyVoice2-0.5B, then rerun webui.py and the pre-trained voices will show up. See the related GitHub issue.
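A quick way to check whether the file was picked up (a sketch; adjust the model path to your local directory):

from cosyvoice.cli.cosyvoice import CosyVoice2

cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B')
# if spk2info.pt was found, the pre-trained speaker ids are listed here
print(cosyvoice.list_available_spks())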
import torch
import random
from openpyxl import load_workbook
from cosyvoice.cli.cosyvoice import CosyVoice2, CosyVoice
from cosyvoice.utils.file_utils import load_wav
from cosyvoice.utils.common import set_all_random_seed
import librosa
import soundfile as sf

message = ""
model_dir = 'E:\\modelscope_cache\\hub\\iic\\CosyVoice2-0___5B'
prompt_audio_path = 'Test.mp3'
prompt_text = ''
seed = random.randint(1, 100000000)
stream = False
speeds = [0.97]

cosyvoice = CosyVoice2(model_dir)
prompt_sr = 16000
max_val = 0.8


def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    speech, _ = librosa.effects.trim(
        speech, top_db=top_db, frame_length=win_length, hop_length=hop_length
    )
    if speech.abs().max() > max_val:
        speech = speech / speech.abs().max() * max_val
    speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1)
    return speech


prompt_speech_16k = postprocess(load_wav(prompt_audio_path, prompt_sr))
set_all_random_seed(seed)
tts_wav = "C:\\Users\\shao\\Desktop\\"
for i in cosyvoice.inference_zero_shot(tts_text=message, prompt_text=prompt_text,
                                       prompt_speech_16k=prompt_speech_16k, stream=stream, speed=1.0):
    speech = i['tts_speech'].numpy().flatten()
    sf.write(tts_wav + "CosyVoice2.wav", speech, samplerate=cosyvoice.sample_rate)
    print(f"生成成功,音频保存至:{tts_wav}")
【Timing】Total time from text in 👉 .wav out: 2.8 s, essentially the same as offline CosyVoice.
- receiving the speech data in the response: 40 ms
- concatenating and assembling the response: 2400 ms
2.2,inference_instruct2
【Cantonese output】Takes about 15 s
import random

import librosa
import soundfile as sf
import torch

from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.common import set_all_random_seed
from cosyvoice.utils.file_utils import load_wav

message = "******"
model_dir = 'E:\\modelscope_cache\\hub\\iic\\CosyVoice2-0___5B'
prompt_audio_path = 'C:\\Users\\shao\\Desktop\\Test.mp3'
seed = random.randint(1, 100000000)
stream = False
speeds = [0.97]

cosyvoice = CosyVoice2(model_dir)
prompt_sr = 16000
max_val = 0.8


def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    speech, _ = librosa.effects.trim(
        speech, top_db=top_db, frame_length=win_length, hop_length=hop_length
    )
    if speech.abs().max() > max_val:
        speech = speech / speech.abs().max() * max_val
    speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1)
    return speech


prompt_speech_16k = postprocess(load_wav(prompt_audio_path, prompt_sr))
set_all_random_seed(seed)
tts_wav = "C:\\Users\\shao\\Desktop\\"
for i in cosyvoice.inference_instruct2(tts_text=message, instruct_text="用粤语说这句话",
                                       prompt_speech_16k=prompt_speech_16k, stream=stream, speed=1.0):
    speech = i['tts_speech'].numpy().flatten()
    sf.write(tts_wav + "CosyVoice2.wav", speech, samplerate=cosyvoice.sample_rate)
    print(f"生成成功,音频保存至:{tts_wav}")
2.3,Streaming output
【Timing】First run: 8 s; second run: 6 s.
【Why the latency is high】The advertised 150 ms latency requires the Alibaba Cloud service, and only applies to pre-trained voices where no token/embedding extraction is needed. The open-source version runs zero-shot inference in Python, so a gap to 150 ms is expected even though the model design is identical; with some engineering work you can narrow it, and the open-source repo may later add vLLM-based inference for the LLM.
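A rough way to measure first-packet latency of the streaming path (a sketch; it reuses the cosyvoice, message, prompt_text and prompt_speech_16k objects from the snippets above):

import time

start = time.time()
first_chunk_time = None
for chunk in cosyvoice.inference_zero_shot(tts_text=message, prompt_text=prompt_text,
                                           prompt_speech_16k=prompt_speech_16k, stream=True):
    if first_chunk_time is None:
        first_chunk_time = time.time()
        # first-packet latency: time until the first audio chunk is available
        print(f"first audio chunk after {first_chunk_time - start:.2f} s")
print(f"total generation time: {time.time() - start:.2f} s")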
【Streaming output - API request】
import io import os import sys import base64 import uvicorn import requests import torchaudio import numpy as np import soundfile as sf from datetime import datetime from fastapi import FastAPI from fastapi.responses import StreamingResponse from pydantic import BaseModel from typing import Optional from pydub import AudioSegment, silence from cosyvoice.cli.cosyvoice import CosyVoice2 from cosyvoice.utils.file_utils import load_wav cosyvoice = CosyVoice2('E:\\modelscope_cache\\hub\\iic\\CosyVoice2-0___5B', load_jit=True, load_trt=False, use_flow_cache=True) app = FastAPI() # 定义一个数据模型,用于接收POST请求中的数据 class TTSRequest(BaseModel): spk_id: Optional[str] = None # 预训练语音id ref_audio: Optional[str] = None # 参考语音 base64编码的音频文件 ref_text: Optional[str] = None # 参考语音的文本 ref_tag: Optional[str] = None # 参考语音的标签 tts_text: Optional[str] = None # 待合成的文本 instruct_text: Optional[str] = None # 指令文本 stream: Optional[bool] = False # 是否使用流式合成 speed: Optional[float] = 1.0 # 语速 mode: Optional[str] = 'zero_shot' # 合成模式,默认模式为 'sft',可选模式为 'sft' 'zero_shot' 'cross_lingual' 'instruct' def save_audio(reference_audio, reference_tag): # base64编码的音频文件转为音频文件并保存到本地 try: audio_data = base64.b64decode(reference_audio) reference_audio = io.BytesIO(audio_data) query_audio = AudioSegment.from_file(reference_audio, format="wav") query_audio = query_audio.set_channels(1) query_audio = query_audio.set_frame_rate(16000) query_audio = query_audio.set_sample_width(2) save_path = f"C:/Users/shao/Desktop/audios/{reference_tag}.wav" query_audio.export(save_path, format="wav") return save_path except base64.binascii.Error: print("Error: base64 decoding failed") return '' @app.post("/cosyvoice") async def tts(request: TTSRequest): headers = { "Content-Type": "audio/pcm", "X-Sample-Rate": "24000", # 假设采样率是 24kHz "X-Channel-Count": "1" # 假设单声道 } if request.mode == 'sft': if not request.spk_id: return {'message': 'spk_id is required for sft mode'} async def generate(): for out in cosyvoice.inference_sft(tts_text=request.tts_text, spk_id=request.spk_id, stream=request.stream, speed=request.speed): raw = (out['tts_speech'].numpy() * 32767).astype( np.int16).flatten() # 原始输出 [-1, 1] 之间的float32,需要转为 16 位 PCM yield raw.tobytes() return StreamingResponse(generate(), media_type="audio/pcm", headers=headers) else: ref_audio_path = f"C:/Users/shao/Desktop/audios/{request.ref_tag}.wav" ref_text_path = f"C:/Users/shao/Desktop/audios/{request.ref_tag}.txt" if not os.path.exists(ref_audio_path): ref_audio_path = save_audio(request.ref_audio, request.ref_tag) if not os.path.exists(ref_text_path): ref_text = request.ref_text with open(ref_text_path, 'w') as f: f.write(ref_text) else: ref_text = open(ref_text_path, 'r').read() if request.mode == 'zero_shot': async def generate(): for out in cosyvoice.inference_zero_shot(tts_text=request.tts_text, prompt_text=ref_text, prompt_speech_16k=load_wav(ref_audio_path, 16000), stream=request.stream, speed=request.speed): raw = (out['tts_speech'].numpy() * 32767).astype( np.int16).flatten() # 原始输出 [-1, 1] 之间的float32,需要转为 16 位 PCM yield raw.tobytes() return StreamingResponse(generate(), media_type="audio/pcm", headers=headers) elif request.mode == 'cross_lingual': async def generate(): for out in cosyvoice.inference_cross_lingual(tts_text=request.tts_text, prompt_speech_16k=load_wav(ref_audio_path, 16000), stream=request.stream, speed=request.speed): raw = (out['tts_speech'].numpy() * 32767).astype( np.int16).flatten() # 原始输出 [-1, 1] 之间的float32,需要转为 16 位 PCM yield raw.tobytes() return StreamingResponse(generate(), 
media_type="audio/pcm", headers=headers) elif request.mode == 'instruct': if not request.instruct_text: return {'message': 'instruct_text is required for instruct mode'} async def generate(): for out in cosyvoice.inference_instruct2(tts_text=request.tts_text, instruct_text=request.instruct_text, prompt_speech_16k=load_wav(ref_audio_path), stream=request.stream, speed=request.speed): raw = (out['tts_speech'].numpy() * 32767).astype( np.int16).flatten() # 原始输出 [-1, 1] 之间的float32,需要转为 16 位 PCM yield raw.tobytes() return StreamingResponse(generate(), media_type="audio/pcm", headers=headers) else: return {'message': 'Invalid mode'} if __name__ == '__main__': uvicorn.run(app, host='0.0.0.0', port=3005)
import base64 import datetime import time import requests import wave import pyaudio import io def wav_to_base64(file_path): with open(file_path, 'rb') as f: wav_data = f.read() base64_data = base64.b64encode(wav_data).decode('utf-8') return base64_data def play_and_save_pcm_stream(pcm_generator, save_path='C:/Users/shao/Desktop/audios/gen_audio.wav', sample_rate=24000, channels=1, sampwidth=2): p = pyaudio.PyAudio() stream = p.open(format=p.get_format_from_width(sampwidth), channels=channels, rate=sample_rate, output=True) # 存储所有PCM数据 all_pcm_data = bytearray() first_chunk = True try: for chunk in pcm_generator: if chunk: if first_chunk: playback_start_time = datetime.datetime.now() print(f"开始播放时间:{playback_start_time.strftime('%Y-%m-%d %H:%M:%S.%f')}") first_chunk = False stream.write(chunk) all_pcm_data.extend(chunk) finally: stream.stop_stream() stream.close() p.terminate() # 保存为WAV文件 with wave.open(save_path, 'wb') as wf: wf.setnchannels(channels) wf.setsampwidth(sampwidth) wf.setframerate(sample_rate) wf.writeframes(all_pcm_data) print(f"音频已保存到: {save_path}") def test_tts(): spk_id = "中文男" ref_text = "***" tts_text = "***" stream_mode = True data = { "tts_text": tts_text, "stream": stream_mode, "ref_audio": wav_to_base64(r'C:\Users\shao\Desktop\1.wav'), "ref_text": ref_text, "ref_tag": "test", "speed": 1.0, "mode": "zero_shot" } st = time.time() if stream_mode: playback_start_time = datetime.datetime.now() print(f"开始请求时间:{playback_start_time.strftime('%Y-%m-%d %H:%M:%S.%f')}") response = requests.post("http://localhost:3005/cosyvoice", json=data, stream=True) print(f"请求耗时:{time.time() - st}s") play_and_save_pcm_stream(response.iter_content(chunk_size=1024)) else: response = requests.post("http://localhost:3005/cosyvoice", json=data) print(f"请求耗时:{time.time() - st}s") audio_content = base64.b64decode(response.content) pcm2wav(audio_content) def pcm2wav(pcm_data, save_path='C:/Users/shao/Desktop/audios/gen_audio.wav'): with wave.open(save_path, "wb") as wav: wav.setnchannels(1) wav.setsampwidth(2) wav.setframerate(24000) wav.writeframes(pcm_data) print(f"音频已保存到: {save_path}") if __name__ == '__main__': test_tts()
【Streaming output - local code】
import os
import random

import librosa
import sounddevice as sd  # real-time playback
import soundfile as sf
import torch

from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.common import set_all_random_seed
from cosyvoice.utils.file_utils import load_wav

prompt_text = "***"
model_dir = 'E:\\modelscope_cache\\hub\\iic\\CosyVoice2-0___5B'
prompt_audio_path = r'C:\Users\shao\Desktop\1.wav'
message = '***'
seed = random.randint(1, 100000000)
stream = True

cosyvoice = CosyVoice2(model_dir, load_jit=False, load_trt=True, fp16=True, use_flow_cache=True)
prompt_sr = 16000
max_val = 0.8


def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    speech, _ = librosa.effects.trim(
        speech, top_db=top_db, frame_length=win_length, hop_length=hop_length
    )
    if speech.abs().max() > max_val:
        speech = speech / speech.abs().max() * max_val
    speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1)
    return speech


prompt_speech_16k = postprocess(load_wav(prompt_audio_path, prompt_sr))
set_all_random_seed(seed)

output_path = "C:\\Users\\shao\\Desktop\\CosyVoice2_stream.wav"
if os.path.exists(output_path):
    os.remove(output_path)

# play each chunk as it arrives and append it to the output file
with sf.SoundFile(output_path, mode='w', samplerate=cosyvoice.sample_rate, channels=1, format='WAV') as f:
    print("开始流式生成语音...")
    for i, chunk in enumerate(cosyvoice.inference_zero_shot(tts_text=message, prompt_text=prompt_text,
                                                            prompt_speech_16k=prompt_speech_16k,
                                                            stream=stream, speed=1.0)):
        speech = chunk['tts_speech'].numpy().flatten()
        sd.play(speech, samplerate=cosyvoice.sample_rate, blocking=True)  # real-time playback
        f.write(speech)  # append this chunk to the WAV file
        print(f"第 {i + 1} 段语音写入完毕")

print(f"全部生成完成,音频保存至:{output_path}")
2.4,Tone control
【Supported markers】https://funaudiollm.github.io/cosyvoice2/
在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。
追求卓越不是终点,它需要你每天都<strong>付出</strong>和<strong>精进</strong>,最终才能达到巅峰。
当你用心去倾听一首音乐时[breath],你会开始注意到那些细微的音符变化[breath],并通过它们感受到音乐背后的情感。
[breath]: breathing
[quick_breath]: rapid breathing
[laughter]: laughter
[cough]: coughing
[clucking]: tongue clicking
[accent]: accented speech
[noise]: background noise
[hissing]: hissing
[sigh]: sighing
[vocalized-noise]: vocalized noise
[lipsmack]: lip-smacking (e.g. kissing, smacking sounds)
[mn]: an indistinct "mm" sound
<|im_start|>, <|im_end|>, <|endofprompt|> → mark the start/end of a turn or the end of the prompt
<strong>, </strong> → emphasized text
<laughter>, </laughter> → a span spoken while laughing
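Usage follows the fine-grained-control example in the official README: the markers are simply embedded in tts_text (a sketch, reusing the cosyvoice and prompt_speech_16k objects from the earlier snippets):

import torchaudio

text = '在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。'
for i, j in enumerate(cosyvoice.inference_cross_lingual(text, prompt_speech_16k, stream=False)):
    torchaudio.save(f'fine_grained_control_{i}.wav', j['tts_speech'], cosyvoice.sample_rate)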
【Tone instructions】Only available through the inference_instruct2() mode (in client.py, set the mode as below).
parser.add_argument('--mode', default='instruct2', choices=['sft', 'zero_shot', 'cross_lingual', 'instruct'],
用惊讶的语气说<|endofprompt|>走进家门,看见墙上挂满了我的照片,我惊讶得愣住了。原来家人悄悄为我准备了一个惊喜的纪念墙。
用伤心的语气说<|endofprompt|>收到拒信的那一刻,我感到无比伤心。虽然知道失败是成长的一部分,但仍然难以掩饰心中的失落。
用开心的语气说<|endofprompt|>参加朋友的婚礼,看着新人幸福的笑脸,我感到无比开心。这样的爱与承诺,总是令人心生向往。
【Error】ValueError: buffer size must be a multiple of element size
【Fix】In client.py, build the request as follows:
else:
    payload = {
        'tts_text': args.tts_text,
        'instruct_text': args.instruct_text
    }
    files = [('prompt_wav', ('prompt_wav', open(args.prompt_wav, 'rb'), 'application/octet-stream'))]
    response = requests.request("GET", url, data=payload, files=files, stream=True)
【Error】Mono data must have shape (samples,). Received shape=(1, 77760)
【Fix】
pip install librosa==0.10.2
2.5,Saving & loading voices
CosyVoice2 can save and reload a cloned voice. However, if you want Instruct control over your own voice, none of the current CosyVoice models supports it: instruct mode simply discards the speaker embedding and falls back to an internal preset voice ("中文女"), so the method below cannot adjust the tone of your own voice; only the inline markers still work.
Summary: tone/emotion instructions cannot be applied to your own cloned voice; only inline markers can be used.
【Reference material】
【Saving a voice】To save your own voice and reuse it as a pre-trained speaker, first make a recording, then convert the .wav into a pre-trained speaker .pt:
import torch
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav

model_dir = r'E:\modelscope_cache\hub\iic\CosyVoice2-0___5B'
cosyvoice = CosyVoice2(model_dir)

spk_id = "xijun"
audio_path = "C:\\Users\\shaoqisun\\Desktop\\1.wav"
sample_text = "****"

prompt_speech_16k = load_wav(audio_path, 16000)
cosyvoice.add_zero_shot_spk(sample_text, prompt_speech_16k, spk_id)
print("注册成功,当前可用说话人:", cosyvoice.list_available_spks())
cosyvoice.save_spkinfo()

spk2info = torch.load(model_dir + '/spk2info.pt')
print("所有可用spk_id:", list(spk2info.keys()))
for spk_id in spk2info.keys():
    print(spk_id)
    print(spk2info[spk_id].keys())
【Loading a voice】Because the voice is not obtained through training, the quality is limited!
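Loading then just means passing the saved spk_id at inference time. A sketch following the usage in the official README (the zero_shot_spk_id argument is available in recent CosyVoice2 versions; replace '****' with your text):

import torchaudio

# prompt text and prompt audio can be left empty when a saved speaker id is given
for i, j in enumerate(cosyvoice.inference_zero_shot('****', '', '', zero_shot_spk_id='xijun', stream=False)):
    torchaudio.save(f'zero_shot_spk_{i}.wav', j['tts_speech'], cosyvoice.sample_rate)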
3,CosyVoice2 vllm
3.1,Installation errors
【Speed】Slower than the official version
【Project】https://github.com/qi-hua/async_cosyvoice
【Old code】
git clone -b dev/Comet --single-branch https://github.com/FunAudioLLM/CosyVoice.git
【Old weights】
conda install -c conda-forge git-lfs
git lfs install
git clone https://www.modelscope.cn/iic/CosyVoice2-0.5B.git
cd CosyVoice2-0.5B
git checkout 9bd5b08
git checkout ca7f2c63
cp async_cosyvoice/CosyVoice2-0.5B/* pretrained_models/CosyVoice2-0.5B/
【Error】ERROR Server encountered an error: Error(s) in loading state_dict for CausalMaskedDiffWithXvec
【Fix】Use absolute paths.
【Error】DEBUG dealloc called on running server <grpc._cython.cygrpc.AioServer object at 0x7f853bcdb7f0> with status 1
- https://github.com/qi-hua/async_cosyvoice/issues/22
- https://github.com/qi-hua/async_cosyvoice/issues/35
- https://github.com/qi-hua/async_cosyvoice/issues/74
【Fix】Comment out causal=True in cosyvoice2.yaml and use the old weights!
【Error】flow.decoder: ImportError: cannot import name 'EstimatorWrapper' from 'cosyvoice.flow.flow_matching'
【Fix】Replace cosyvoice/flow/flow_matching.py with the following:
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import queue import threading import torch import torch.nn.functional as F from matcha.models.components.flow_matching import BASECFM class EstimatorWrapper: def __init__(self, estimator_engine, estimator_count=2, ): self.estimators = queue.Queue() self.estimator_engine = estimator_engine for _ in range(estimator_count): estimator = estimator_engine.create_execution_context() if estimator is not None: self.estimators.put(estimator) if self.estimators.empty(): raise Exception("No available estimator") def acquire_estimator(self): return self.estimators.get(), self.estimator_engine def release_estimator(self, estimator): self.estimators.put(estimator) return class ConditionalCFM(BASECFM): def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None): super().__init__( n_feats=in_channels, cfm_params=cfm_params, n_spks=n_spks, spk_emb_dim=spk_emb_dim, ) self.t_scheduler = cfm_params.t_scheduler self.training_cfg_rate = cfm_params.training_cfg_rate self.inference_cfg_rate = cfm_params.inference_cfg_rate in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0) # Just change the architecture of the estimator here self.estimator = estimator self.lock = threading.Lock() @torch.inference_mode() def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, prompt_len=0, flow_cache=torch.zeros(1, 80, 0, 2)): """Forward diffusion Args: mu (torch.Tensor): output of encoder shape: (batch_size, n_feats, mel_timesteps) mask (torch.Tensor): output_mask shape: (batch_size, 1, mel_timesteps) n_timesteps (int): number of diffusion steps temperature (float, optional): temperature for scaling noise. Defaults to 1.0. spks (torch.Tensor, optional): speaker ids. Defaults to None. shape: (batch_size, spk_emb_dim) cond: Not used but kept for future purposes Returns: sample: generated mel-spectrogram shape: (batch_size, n_feats, mel_timesteps) """ z = torch.randn_like(mu).to(mu.device).to(mu.dtype) * temperature cache_size = flow_cache.shape[2] # fix prompt and overlap part mu and z if cache_size != 0: z[:, :, :cache_size] = flow_cache[:, :, :, 0] mu[:, :, :cache_size] = flow_cache[:, :, :, 1] z_cache = torch.concat([z[:, :, :prompt_len], z[:, :, -34:]], dim=2) mu_cache = torch.concat([mu[:, :, :prompt_len], mu[:, :, -34:]], dim=2) flow_cache = torch.stack([z_cache, mu_cache], dim=-1) t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype) if self.t_scheduler == 'cosine': t_span = 1 - torch.cos(t_span * 0.5 * torch.pi) return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), flow_cache def solve_euler(self, x, t_span, mu, mask, spks, cond): """ Fixed euler solver for ODEs. 
Args: x (torch.Tensor): random noise t_span (torch.Tensor): n_timesteps interpolated shape: (n_timesteps + 1,) mu (torch.Tensor): output of encoder shape: (batch_size, n_feats, mel_timesteps) mask (torch.Tensor): output_mask shape: (batch_size, 1, mel_timesteps) spks (torch.Tensor, optional): speaker ids. Defaults to None. shape: (batch_size, spk_emb_dim) cond: Not used but kept for future purposes """ t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0] t = t.unsqueeze(dim=0) # I am storing this because I can later plot it by putting a debugger here and saving it to a file # Or in future might add like a return_all_steps flag sol = [] # Do not use concat, it may cause memory format changed and trt infer with wrong results! x_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype) mask_in = torch.zeros([2, 1, x.size(2)], device=x.device, dtype=x.dtype) mu_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype) t_in = torch.zeros([2], device=x.device, dtype=x.dtype) spks_in = torch.zeros([2, 80], device=x.device, dtype=x.dtype) cond_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype) for step in range(1, len(t_span)): # Classifier-Free Guidance inference introduced in VoiceBox x_in[:] = x mask_in[:] = mask mu_in[0] = mu t_in[:] = t.unsqueeze(0) spks_in[0] = spks cond_in[0] = cond dphi_dt = self.forward_estimator( x_in, mask_in, mu_in, t_in, spks_in, cond_in ) dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0) dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt) x = x + dt * dphi_dt t = t + dt sol.append(x) if step < len(t_span) - 1: dt = t_span[step + 1] - t return sol[-1].float() def forward_estimator(self, x, mask, mu, t, spks, cond): if isinstance(self.estimator, torch.nn.Module): return self.estimator.forward(x, mask, mu, t, spks, cond) else: if isinstance(self.estimator, EstimatorWrapper): estimator, engine = self.estimator.acquire_estimator() estimator.set_input_shape('x', (2, 80, x.size(2))) estimator.set_input_shape('mask', (2, 1, x.size(2))) estimator.set_input_shape('mu', (2, 80, x.size(2))) estimator.set_input_shape('t', (2,)) estimator.set_input_shape('spks', (2, 80)) estimator.set_input_shape('cond', (2, 80, x.size(2))) data_ptrs = [x.contiguous().data_ptr(), mask.contiguous().data_ptr(), mu.contiguous().data_ptr(), t.contiguous().data_ptr(), spks.contiguous().data_ptr(), cond.contiguous().data_ptr(), x.data_ptr()] for idx, data_ptr in enumerate(data_ptrs): estimator.set_tensor_address(engine.get_tensor_name(idx), data_ptr) # run trt engine estimator.execute_async_v3(torch.cuda.current_stream().cuda_stream) torch.cuda.current_stream().synchronize() self.estimator.release_estimator(estimator) return x else: with self.lock: self.estimator.set_input_shape('x', (2, 80, x.size(2))) self.estimator.set_input_shape('mask', (2, 1, x.size(2))) self.estimator.set_input_shape('mu', (2, 80, x.size(2))) self.estimator.set_input_shape('t', (2,)) self.estimator.set_input_shape('spks', (2, 80)) self.estimator.set_input_shape('cond', (2, 80, x.size(2))) # run trt engine self.estimator.execute_v2([x.contiguous().data_ptr(), mask.contiguous().data_ptr(), mu.contiguous().data_ptr(), t.contiguous().data_ptr(), spks.contiguous().data_ptr(), cond.contiguous().data_ptr(), x.data_ptr()]) return x def compute_loss(self, x1, mask, mu, spks=None, cond=None): """Computes diffusion loss Args: x1 (torch.Tensor): Target shape: (batch_size, n_feats, mel_timesteps) mask (torch.Tensor): target mask 
shape: (batch_size, 1, mel_timesteps) mu (torch.Tensor): output of encoder shape: (batch_size, n_feats, mel_timesteps) spks (torch.Tensor, optional): speaker embedding. Defaults to None. shape: (batch_size, spk_emb_dim) Returns: loss: conditional flow matching loss y: conditional flow shape: (batch_size, n_feats, mel_timesteps) """ b, _, t = mu.shape # random timestep t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype) if self.t_scheduler == 'cosine': t = 1 - torch.cos(t * 0.5 * torch.pi) # sample noise p(x_0) z = torch.randn_like(x1) y = (1 - (1 - self.sigma_min) * t) * z + t * x1 u = x1 - (1 - self.sigma_min) * z # during training, we randomly drop condition to trade off mode coverage and sample fidelity if self.training_cfg_rate > 0: cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate mu = mu * cfg_mask.view(-1, 1, 1) spks = spks * cfg_mask.view(-1, 1) cond = cond * cfg_mask.view(-1, 1, 1) pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond) loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1]) return loss, y class CausalConditionalCFM(ConditionalCFM): def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None): super().__init__(in_channels, cfm_params, n_spks, spk_emb_dim, estimator) self.rand_noise = torch.randn([1, 80, 50 * 300]) @torch.inference_mode() def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None): """Forward diffusion Args: mu (torch.Tensor): output of encoder shape: (batch_size, n_feats, mel_timesteps) mask (torch.Tensor): output_mask shape: (batch_size, 1, mel_timesteps) n_timesteps (int): number of diffusion steps temperature (float, optional): temperature for scaling noise. Defaults to 1.0. spks (torch.Tensor, optional): speaker ids. Defaults to None. shape: (batch_size, spk_emb_dim) cond: Not used but kept for future purposes Returns: sample: generated mel-spectrogram shape: (batch_size, n_feats, mel_timesteps) """ z = self.rand_noise[:, :, :mu.size(2)].to(mu.device).to(mu.dtype) * temperature # fix prompt and overlap part mu and z t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype) if self.t_scheduler == 'cosine': t_span = 1 - torch.cos(t_span * 0.5 * torch.pi) return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), None
3.2,Launching the project
【GRPC】
cd runtime/async_grpc
python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. cosyvoice.proto
python server.py --load_jit --load_trt --fp16
【Error】ImportError: libre2.so.11: cannot open shared object file: No such file or directory
sudo apt install git cmake build-essential
git clone https://github.com/google/re2.git
cd re2
make -j$(nproc)
sudo make install
sudo ldconfig
【Error】re2/dfa.cc:37:10: fatal error: absl/base/call_once.h: No such file or directory
   37 | #include "absl/base/call_once.h"
      |          ^~~~~~~~~~~~~~~~~~~~~~~
【Fix】Build and install abseil-cpp, then rebuild re2:
git clone https://github.com/abseil/abseil-cpp.git
cd abseil-cpp
mkdir build && cd build
cmake .. -DCMAKE_POSITION_INDEPENDENT_CODE=ON
make -j$(nproc)
sudo make install
sudo ldconfig
# back in the re2 source tree, rebuild re2 against abseil
cd ..
cd ..
make clean
make -j$(nproc)
sudo make install
sudo ldconfig
【Error】Package 'gtest', required by 'virtual:world', not found
sudo apt-get install libbenchmark-dev
sudo apt-get install libgtest-dev
【Error】ImportError: /root/miniconda3/envs/ac2/bin/../lib/libstdc++.so.6: version `GLIBCXX_3.4.32' not found (required by /usr/local/lib/libre2.so.11)
See: https://github.com/pybind/pybind11/discussions/3453
rm /home/xx/anaconda3/bin/../lib/libstdc++.so.6
cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.29 /home/xx/anaconda3/bin/../lib
ln -s /home/xx/anaconda3/bin/../lib/libstdc++.so.6.0.29 /home/xx/anaconda3/bin/../lib/libstdc++.so.6
【Error】ImportError: /root/autodl-tmp/miniconda3/envs/cosyvoice2/lib/python3.10/site-packages/ttsfrd.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZN3re212re2_internal5ParseISsEEbPKcmPT_
export LD_LIBRARY_PATH=/root/autodl-tmp/CosyVoice/async_cosyvoice/runtime/async_grpc/re2/lib:$LD_LIBRARY_PATH
pip install /root/CosyVoice/pretrained_models/CosyVoice2-ttsfrd/ttsfrd_dependency-0.1-py3-none-any.whl
pip install /root/CosyVoice/pretrained_models/CosyVoice2-ttsfrd/ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl
【Error】ERROR Server encountered an error: failed to initialize ttsfrd resource
【Fix】Copy the 'CosyVoice-ttsfrd' folder under 'pretrained_models' and rename the copy to 'speech_kantts_ttsfrd', then run: unzip resource.zip -d .
3.3,Saving a voice
from async_cosyvoice.async_cosyvoice import AsyncCosyVoice2
from cosyvoice.utils.file_utils import load_wav

if __name__ == '__main__':
    model_dir = r'/root/CosyVoice/pretrained_models/CosyVoice2-0.5B'
    cosyvoice = AsyncCosyVoice2(model_dir, load_jit=True, load_trt=True, fp16=True)

    spk_id = "xijun"
    audio_path = "1.wav"
    prompt_text = "我这边现在确实是还不上,然后前一阵你们这边一直来电话各种骚扰,还跟我家人打电话。"
    prompt_speech_16k = load_wav(audio_path, 16000)
    cosyvoice.frontend.generate_spk_info(spk_id, prompt_text, prompt_speech_16k, 24000, "A")
4,CosyVoice2 Sglang
4.1,Model conversion
Export the Qwen LLM embedded in CosyVoice2 as a standalone Qwen2ForCausalLM checkpoint (i.e. replace CosyVoice2's nested Qwen with a plain Qwen model that Sglang can load):
import torch
from transformers import Qwen2ForCausalLM, AutoConfig
from safetensors.torch import load_file


class Qwen2ForCausalLM_tts(Qwen2ForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        llm_input_size = 896
        llm_output_size = 896
        speech_token_size = 151936
        # adjust these dimensions to your own (locally modified) model
        self.llm_decoder = torch.nn.Linear(llm_output_size, speech_token_size + 3)
        self.speech_embedding = torch.nn.Embedding(speech_token_size + 3, llm_input_size)

    def forward(self, *args, **kwargs):
        outputs = super().forward(*args, **kwargs)
        return outputs


config = AutoConfig.from_pretrained("/root/CosyVoice-sglang/pretrained_models/CosyVoice2-0.5B/CosyVoice-BlankEN")
custom_model = Qwen2ForCausalLM_tts(config)

state_dict = load_file("/root/CosyVoice-sglang/pretrained_models/Qwen2.5-0.5B/model.safetensors")
# pull the actual Qwen weights out of CosyVoice's nested 'llm.model.' prefix
new_state_dict = {
    k.replace('llm.model.', '') if 'llm.model.' in k else k: v
    for k, v in state_dict.items()
}
custom_model.load_state_dict(new_state_dict, strict=False)
custom_model.save_pretrained("/root/CosyVoice-sglang/pretrained_models/CosyVoice-Sglang", safe_serialization=True)
Register the Qwen variant with Sglang:
vim /root/miniconda3/envs/cv2-sglang/lib/python3.10/site-packages/sglang/srt/models/qwen2_5.py
from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn from sglang.srt.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear, ) from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.pooler import Pooler, PoolingType from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.layers.rotary_embedding import get_rope from sglang.srt.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, ) from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import ( default_weight_loader, kv_cache_scales_loader, ) from sglang.srt.utils import make_layers Qwen2Config = None class Qwen2MLP(nn.Module): def __init__( self, hidden_size: int, intermediate_size: int, hidden_act: str, quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config, ) self.down_proj = RowParallelLinear( intermediate_size, hidden_size, bias=False, quant_config=quant_config, ) if hidden_act != "silu": raise ValueError( f"Unsupported activation: {hidden_act}. " "Only silu is supported for now." ) self.act_fn = SiluAndMul() def forward(self, x): gate_up, _ = self.gate_up_proj(x) x = self.act_fn(gate_up) x, _ = self.down_proj(x) return x class Qwen2Attention(nn.Module): def __init__( self, hidden_size: int, num_heads: int, num_kv_heads: int, layer_id: int = 0, rope_theta: float = 1000000, rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 32768, quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() self.hidden_size = hidden_size tp_size = get_tensor_model_parallel_world_size() self.total_num_heads = num_heads assert self.total_num_heads % tp_size == 0 self.num_heads = self.total_num_heads // tp_size self.total_num_kv_heads = num_kv_heads if self.total_num_kv_heads >= tp_size: # Number of KV heads is greater than TP size, so we partition # the KV heads across multiple tensor parallel GPUs. assert self.total_num_kv_heads % tp_size == 0 else: # Number of KV heads is less than TP size, so we replicate # the KV heads across multiple tensor parallel GPUs. 
assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) self.head_dim = hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim ** -0.5 self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( hidden_size, self.head_dim, self.total_num_heads, self.total_num_kv_heads, bias=True, quant_config=quant_config, ) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, hidden_size, bias=False, quant_config=quant_config, ) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, rope_scaling=rope_scaling, ) self.attn = RadixAttention( self.num_heads, self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, layer_id=layer_id, ) def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, forward_batch: ForwardBatch, ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v, forward_batch) output, _ = self.o_proj(attn_output) return output class Qwen2DecoderLayer(nn.Module): def __init__( self, config: Qwen2Config, layer_id: int = 0, quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 1000000) rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 32768) self.self_attn = Qwen2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, layer_id=layer_id, rope_theta=rope_theta, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, ) self.mlp = Qwen2MLP( hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm( config.hidden_size, eps=config.rms_norm_eps ) def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, forward_batch: ForwardBatch, residual: Optional[torch.Tensor], ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) else: hidden_states, residual = self.input_layernorm(hidden_states, residual) hidden_states = self.self_attn( positions=positions, hidden_states=hidden_states, forward_batch=forward_batch, ) # Fully Connected hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) hidden_states = self.mlp(hidden_states) return hidden_states, residual class Qwen2Model(nn.Module): def __init__( self, config: Qwen2Config, quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, quant_config=quant_config, ) self.layers = make_layers( config.num_hidden_layers, lambda idx, prefix: Qwen2DecoderLayer( layer_id=idx, config=config, quant_config=quant_config, ), ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) def 
get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: if hasattr(self.config, "scale_emb"): return self.embed_tokens(input_ids) * self.config.scale_emb else: return self.embed_tokens(input_ids) def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, forward_batch: ForwardBatch, input_embeds: torch.Tensor = None, ) -> torch.Tensor: if input_embeds is None: hidden_states = self.embed_tokens(input_ids) else: hidden_states = input_embeds residual = None for i in range(len(self.layers)): layer = self.layers[i] hidden_states, residual = layer( positions, hidden_states, forward_batch, residual, ) hidden_states, _ = self.norm(hidden_states, residual) return hidden_states # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). Thus, handled exceptions should # make sure to leave KV cache scale factors in a known good (dummy) state def load_kv_cache_scales(self, quantization_param_path: str) -> None: tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() for layer_idx, scaling_factor in kv_cache_scales_loader( quantization_param_path, tp_rank, tp_size, self.config.num_hidden_layers, self.config.__class__.model_type, ): if not isinstance(self.layers[layer_idx], nn.Identity): layer_self_attn = self.layers[layer_idx].self_attn if hasattr(layer_self_attn.attn, "k_scale"): layer_self_attn.attn.k_scale = scaling_factor layer_self_attn.attn.v_scale = scaling_factor else: raise RuntimeError( "Self attention has no KV cache scaling " "factor attribute!" ) class Qwen2ForCausalLM_tts(nn.Module): # BitandBytes specific attributes default_bitsandbytes_target_modules = [ ".gate_proj.", ".down_proj.", ".up_proj.", ".q_proj.", ".k_proj.", ".v_proj.", ".o_proj.", ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), "k_proj": ("qkv_proj", 1), "v_proj": ("qkv_proj", 2), "gate_proj": ("gate_up_proj", 0), "up_proj": ("gate_up_proj", 1), } def __init__( self, config: Qwen2Config, quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() self.config = config self.quant_config = quant_config self.model = Qwen2Model(config, quant_config=quant_config) if config.tie_word_embeddings: self.lm_head = self.model.embed_tokens else: self.lm_head = ParallelLMHead( config.vocab_size, config.hidden_size, quant_config=quant_config ) self.logits_processor = LogitsProcessor(config) self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True) # 克隆的大概这么写: llm_input_size = 896 llm_output_size = 896 speech_token_size = 151936 self.speech_embedding = VocabParallelEmbedding( speech_token_size + 3, llm_input_size, quant_config=quant_config, ) # torch.nn.Embedding(speech_token_size + 3, llm_input_size) self.llm_decoder = ParallelLMHead( speech_token_size + 3, llm_output_size, quant_config=quant_config ) # torch.nn.Linear(llm_output_size, speech_token_size + 3) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @torch.no_grad() def qwen_forward( self, input_ids: torch.Tensor, positions: torch.Tensor, forward_batch: ForwardBatch, input_embeds: torch.Tensor = None, ) -> torch.Tensor: if input_embeds is None: hidden_states = self.speech_embedding(input_ids) else: hidden_states = input_embeds residual = None for i in range(len(self.model.layers)): layer = self.model.layers[i] hidden_states, residual = layer( positions, hidden_states, forward_batch, residual, ) hidden_states, _ = 
self.model.norm(hidden_states, residual) return hidden_states @torch.no_grad() def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, forward_batch: ForwardBatch, input_embeds: torch.Tensor = None, get_embedding: bool = False, ) -> torch.Tensor: hidden_states = self.qwen_forward(input_ids, positions, forward_batch, input_embeds) if not get_embedding: return self.logits_processor( input_ids, hidden_states, self.llm_decoder, forward_batch ) else: return self.pooler(hidden_states, forward_batch) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), ("qkv_proj", "v_proj", "v"), ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name or "projector" in name: continue if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue if name.startswith("model.vision_tower") and name not in params_dict: continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break else: # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) def get_embed_and_head(self): # return self.model.embed_tokens.weight, self.lm_head.weight return self.speech_embedding.weight, self.llm_decoder.weight def set_embed_and_head(self, embed, head): # del self.model.embed_tokens.weight # del self.lm_head.weight # self.model.embed_tokens.weight = embed # self.lm_head.weight = head del self.speech_embedding.weight del self.llm_decoder.weight self.speech_embedding.weight = embed self.llm_decoder.weight = head torch.cuda.empty_cache() torch.cuda.synchronize() def load_kv_cache_scales(self, quantization_param_path: str) -> None: self.model.load_kv_cache_scales(quantization_param_path) EntryClass = Qwen2ForCausalLM_tts
4.2,Starting the server
python -m sglang.launch_server --model-path /root/CosyVoice-sglang/pretrained_models/CosyVoice-Sglang --tensor-parallel-size 1 --disable-radix --skip-tokenizer-init --random-seed 1234 --mem-fraction-static 0.3 --dtype bfloat16 --base-gpu-id 0
Check GPU memory usage:
nvitop -1
【Error】ImportError: cannot import name 'AutoModel' from partially initialized module 'transformers' (most likely due to a circular import) (/root/miniconda3/envs/cv2-sglang/lib/python3.10/site-packages/sglang/srt/models/transformers.py)
mv /root/miniconda3/envs/cv2-sglang/lib/python3.10/site-packages/sglang/srt/models/transformers.py /root/miniconda3/envs/cv2-sglang/lib/python3.10/site-packages/sglang/srt/models/transformers2.py
【Error】/bin/sh: 1: /usr/local/cuda/bin/nvcc: not found
ninja: build stopped: subcommand failed.
【Error】
Error analysis:
In file included from /usr/include/crt/math_functions.h:10551,
                 from /usr/include/crt/common_functions.h:303,
                 from /usr/include/cuda_runtime.h:118,
                 from <command-line>:
/usr/include/c++/12/cmath:45:15: fatal error: math.h: No such file or directory
   45 | #include_next <math.h>
      |               ^~~~~~~~
compilation terminated.
fatal   : Could not open input file /tmp/tmpxft_000031b8_00000000-7_batch_prefill_jit_pybind.cpp1.ii
ninja: build stopped: subcommand failed.
【Fix】Match the GCC and CUDA versions:
https://stackoverflow.com/questions/6622454/cuda-incompatible-with-gcc-version
echo 'export PATH=/usr/local/cuda-12.8/bin:$PATH' >> ~/.bashrc
echo 'export LD_LIBRARY_PATH=/usr/local/cuda-12.8/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc
source ~/.bashrc
【Error】Outdated CUDA / NVIDIA driver; typical symptoms:
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda-12.8' → Python/Triton cannot find a working CUDA runtime even though CUDA 12.8 is installed.
RuntimeError: 0 active drivers ([]). There should only be one. → Triton found no usable NVIDIA driver during initialization.
Torch Warning: system has unsupported display driver / cuda driver combination → PyTorch detected a mismatch between the display driver and the CUDA driver.
Can't initialize NVML → the NVIDIA Management Library (used for GPU state queries) cannot start, indicating a driver problem.
【Fix】Update the driver:
sudo ubuntu-drivers autoinstall
4.3,Starting the client
Model conversion: run onnx2trt.py and select fp16 precision.
Specify the model: in cosyvoice_ca.py (around line 200), point the estimator at the .plan file produced in the previous step, e.g.:
f"flow.decoder.estimator.fp16.4070.plan"
Launch cosyvoice_ca.py with the model directory /root/CosyVoice-sglang/pretrained_models/CosyVoice2-0.5B.
Save a voice:
import torch
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav

model_dir = r'E:\modelscope_cache\hub\iic\CosyVoice2-0___5B'
cosyvoice = CosyVoice2(model_dir)

spk_id = "xijun"
audio_path = "C:\\Users\\shaoqisun\\Desktop\\1.wav"
sample_text = "****"

prompt_speech_16k = load_wav(audio_path, 16000)
cosyvoice.add_zero_shot_spk(sample_text, prompt_speech_16k, spk_id)
print("注册成功,当前可用说话人:", cosyvoice.list_available_spks())
cosyvoice.save_spkinfo()

spk2info = torch.load(model_dir + '/spk2info.pt')
print("所有可用spk_id:", list(spk2info.keys()))
for spk_id in spk2info.keys():
    print(spk_id)
    print(spk2info[spk_id].keys())
【Error】AttributeError: 'NoneType' object has no attribute 'additional_stop_token_ids'
【Fix】Temporary workaround: disable max_new_tokens & min_new_tokens. See: https://github.com/sgl-project/sglang/issues/9039
【Error】
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [87,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1553: indexSelectLargeIndex: block: [87,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
... (the same assertion is repeated for every CUDA thread)
【Fix】Check whether any generated token id exceeds the index range allowed by the LLM's embedding table.
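A quick sanity check (a sketch: token_ids here stands for whatever id sequence is about to be embedded, and the sizes match the conversion script above):

speech_token_size = 151936
vocab = speech_token_size + 3   # size of speech_embedding / llm_decoder used above
bad = [t for t in token_ids if t < 0 or t >= vocab]
if bad:
    print(f"{len(bad)} out-of-range ids, max id = {max(token_ids)}, allowed < {vocab}")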
5,CosyVoice2 Triton
【Pull the image】
docker pull docker-0.unsee.tech/soar97/triton-cosyvoice:25.06
【Download the models】
# configure proxy environment variables
export HTTP_PROXY=http://127.0.0.1:15732
export HTTPS_PROXY=http://127.0.0.1:15732
huggingface_model_local_dir=./cosyvoice2_llm
model_scope_model_local_dir=./CosyVoice2-0.5B
huggingface-cli download --local-dir $huggingface_model_local_dir yuekai/cosyvoice2_llm
modelscope download --model iic/CosyVoice2-0.5B --local_dir $model_scope_model_local_dir
【Start the container】
docker run -it --name "cosyvoice-server" --gpus all --net host -v /root/PycharmProjects:/workspace --shm-size=2g docker-0.unsee.tech/soar97/triton-cosyvoice:25.06
【Re-enter the container】
docker start cosyvoice-server
docker exec -it cosyvoice-server /bin/bash
【Run the stages in order】
bash run.sh 1 1
bash run.sh 2 2
bash run.sh 3 3
bash run.sh 4 4
bash run.sh 5 5
【Error】error: creating server: Internal - failed to load all models
git submodule update --init --recursive
【Error】ConnectionError: Couldn't reach 'yuekai/seed_tts_cosy2' on the Hub (LocalEntryNotFoundError)
【Fix】Manually download yuekai/seed_tts_cosy2.
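For example with huggingface-cli (assuming yuekai/seed_tts_cosy2 is hosted as a dataset repo; adjust --local-dir to wherever the run script expects it):

huggingface-cli download --repo-type dataset --local-dir ./seed_tts_cosy2 yuekai/seed_tts_cosy2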