学习日志|结合GPT-SoVITS与LLM(2)

在上一篇学习日志中,我介绍了GSV的API食用方法,本篇我们研究如何将LLM与GSV对接。

这乍一听很简单,只需要将大模型返回的结果简单处理之后交由GSV生成音频就行了……对吗?思路没问题,确实是可行的,不过由于GSV处理文字的时间过长,这导致实际延迟时间过长,体验极差。

我们真正想要的效果是这样的:让GSV处理完一部分音频后就立即播放,剩下的边播放边处理。要实现这个效果,我第一个想到的是流式输出,不过可惜的是目前的GSV似乎还不支持这样的输出方式。不过github上的其他大佬们使用另一种方式解决了这个问题:将大段内容按句子拆分,这样便可以充分利用播放前一段音频的时间处理下一段内容,达到无缝衔接。

那么现在的实现思路如下:

由LLM生成回答→处理文字(切分、去除多余符号)→利用多线程和队列机制让GSV生成音频的同时播放音频,减少等待时间。

作为小白来说,多线程机制还是有些太复杂了()不过幸好有deepseek小姐的帮助,咱最后还是缝合完成了这个项目(‾◡◝)

import requests
import json
import re

from pydub import AudioSegment
from pydub.playback import play
import io

import threading
import queue

# Split text into sentence fragments and drop empty pieces #
def split_text(text):
    """Split *text* on Chinese sentence-ending punctuation.

    Splits on fullwidth 。!?;: and returns the non-empty,
    whitespace-stripped fragments in order.
    """
    fragments = re.split(r'[。!?;:]', text)
    return [fragment.strip() for fragment in fragments if fragment.strip()]

# Call the local GPT-SoVITS HTTP API to synthesize speech #
def tts(txt):
    """Synthesize *txt* to speech via the local GPT-SoVITS server.

    POSTs the request to the server at 127.0.0.1:9880 and returns the raw
    audio bytes (WAV) on success, or None if the request failed.
    """
    url = "http://127.0.0.1:9880/tts"

    data = {
        "text": f"{txt}",                   # str.(required) text to be synthesized
        "text_lang": "zh",               # str.(required) language of the text to be synthesized
        "ref_audio_path": "D:/Workspace/RVC_GSV/干员报到.wav",         # str.(required) reference audio path
        "aux_ref_audio_paths": [],    # list.(optional) auxiliary reference audio paths for multi-speaker synthesis
        "prompt_text": "星象学者,星极,以近卫干员身份任职,今后就由您差遣了,博士。",     # str.(optional) prompt text for the reference audio
        "prompt_lang": "zh",            # str.(required) language of the prompt text for the reference audio
        "top_k": 5,                   # int. top k sampling
        "top_p": 1,                   # float. top p sampling
        "temperature": 1,             # float. temperature for sampling
        "text_split_method": "cut5",  # str. text split method, see text_segmentation_method.py for details.
        "batch_size": 1,              # int. batch size for inference
        "batch_threshold": 0.75,      # float. threshold for batch splitting.
        "split_bucket": True,          # bool. whether to split the batch into multiple buckets.
        "speed_factor": 1.0,          # float. control the speed of the synthesized audio.
        "fragment_interval": 0.3,     # float. to control the interval of the audio fragment.
        "seed": -1,                   # int. random seed for reproducibility.
        "media_type": "wav",          # str. media type of the output audio, support "wav", "raw", "ogg", "aac".
        "streaming_mode": False,      # bool. whether to return a streaming response.
        "parallel_infer": True,       # bool.(optional) whether to use parallel inference.
        "repetition_penalty": 1.35    # float.(optional) repetition penalty for T2S model.
    }

    headers = {"Content-Type": "application/json"}

    response = requests.post(url, data=json.dumps(data), headers=headers)

    if response.status_code == 200:
        print("音频生成成功!")
        return response.content
    print(f"请求失败,状态码:{response.status_code}, 错误信息:{response.text}")
    # Explicit None so callers (process_data) can distinguish a failed
    # synthesis from valid audio bytes.
    return None




# Query the LLM and prepare its answer for TTS #
def ai(content):
    """Send *content* to the Qianfan chat-completion API (deepseek-v3).

    Strips all whitespace from the model's reply (so the TTS step does not
    pause on stray spaces/newlines), writes the full text to result.txt,
    and returns the reply split into sentence fragments via split_text().

    Raises requests.HTTPError if the API call fails.
    """
    url = "https://qianfan.baidubce.com/v2/chat/completions"

    payload = json.dumps({
        "model": "deepseek-v3",
        "messages": [
            {
                "role": "user",
                "content": content
            }
        ],
        "disable_search": False,
        "enable_citation": False
    }, ensure_ascii=False)
    headers = {
        'Content-Type': 'application/json',
        'appid': '',
        'Authorization': 'YOUR KEY'   # TODO: load the API key from env/config, not source
    }

    response = requests.request("POST", url, headers=headers, data=payload.encode("utf-8"))

    print(response.text)

    # Fail fast with a clear HTTP error instead of crashing later on a
    # malformed/missing "choices" key when the request was rejected.
    response.raise_for_status()

    data = json.loads(response.text)
    # Remove ALL whitespace (including newlines) from the reply.
    text = re.sub(r'\s+', '', data["choices"][0]["message"]["content"])
    text_list = split_text(text)

    with open('result.txt', 'w', encoding='utf-8') as f:
        f.write(text)
    return text_list


def wav_play(result_wav):
    """Decode WAV bytes and play them; blocks until playback finishes."""
    audio = AudioSegment.from_file(io.BytesIO(result_wav), format="wav")
    # Announce BEFORE the blocking play() call — the original printed this
    # after playback had already finished, which made the message misleading.
    print(f"正在播放:{audio}")
    play(audio)

def process_data(data_list, audio_queue):
    """Producer thread: synthesize each text fragment and enqueue the audio.

    Enqueues a trailing None as the end-of-stream sentinel for play_audio.
    """
    for text in data_list:
        audio = tts(text)
        # tts() returns None on failure; dropping it here prevents a failed
        # request from being mistaken for the end-of-stream sentinel, which
        # would prematurely stop the player thread.
        if audio is not None:
            audio_queue.put(audio)
    audio_queue.put(None)  # end-of-stream sentinel

def play_audio(audio_queue):
    """Consumer thread: play queued audio chunks until the None sentinel."""
    while True:
        chunk = audio_queue.get()
        if chunk is None:
            # End-of-stream sentinel: acknowledge it and stop consuming.
            audio_queue.task_done()
            return
        wav_play(chunk)
        audio_queue.task_done()

if __name__ == '__main__':

    sentences = ai("你是一袋猫粮")
    q = queue.Queue()

    # Producer synthesizes audio; consumer plays it as it arrives.
    producer = threading.Thread(target=process_data, args=(sentences, q))
    consumer = threading.Thread(target=play_audio, args=(q,))

    producer.start()
    consumer.start()

    # Wait for all fragments to be synthesized and enqueued.
    producer.join()
    # Wait until every queued chunk has been played (task_done'd).
    q.join()
    # Let the consumer thread exit after it sees the sentinel.
    consumer.join()

    print("完成")

本着能跑就行的原则,代码写得很粗糙,谨慎参考〒▽〒

另外写这个的时候才发现deepseek居然把充值渠道给关了……只好先用其他平台的代替了。又以及,在使用这个pydub时可能会遇到Permission denied的报错,只需要pip install pyaudio安装pyaudio库就好了[1]

  1. https://blog.csdn.net/weixin_41568999/article/details/105235853 ↩︎

上一篇
下一篇