无论我做什么,我都无法通过SSML增加音高

问题描述 投票:0回答:1

我似乎无法通过 SSML 增加音高，目前完全没有头绪。如果这个问题的提法不够规范，我深表歉意——这是我第一次在 Stack Overflow 上提问。我正在开发一个聊天机器人，它利用 Microsoft Azure 文本转语音（TTS）服务来合成语音。我正在尝试使用 SSML（语音合成标记语言）调整生成语音的音调。尽管尝试了不同的方法，音高调整似乎都不起作用，服务似乎直接忽略了它，我不确定原因。

import json
import os
import time

import nltk
from azure.cognitiveservices import speech as speechsdk
from twitchio.ext import commands

from chat import *

output_file_name_with_path = '{0}\\output.wav'.format(os.path.dirname(__file__))


def get_value_from_json_key(key_name):
    """Return the value stored under *key_name* in config.json, as a string.

    Returns None when the key is absent (matching the original's implicit
    fall-through behaviour).

    NOTE: this relies on `import json` at the top of the file — the original
    module never imported it, so the first call raised NameError.
    """
    with open("config.json", "r", encoding="utf-8") as file:
        json_data = json.load(file)
    # Direct dict lookup instead of the original O(n) scan over every key.
    key = str(key_name)
    if key in json_data:
        return str(json_data[key])
    return None


def get_audio_or_return_error(result):
    """Save a successful synthesis result to the output wav file.

    On cancellation, print the cancellation reason and, for errors,
    any available error details.
    """
    reason = result.reason
    if reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        # Persist the synthesized audio next to this module and stop here.
        speechsdk.AudioDataStream(result).save_to_wav_file(output_file_name_with_path)
        return
    if reason == speechsdk.ResultReason.Canceled:
        details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(details.reason))
        if details.reason == speechsdk.CancellationReason.Error and details.error_details:
            print("Error details: {}".format(details.error_details))
            print("Did you set the speech resource key and region values?")


def get_output_audio_file(text):
    """Synthesize *text* with Azure TTS and hand the result to
    get_audio_or_return_error for saving/error reporting.

    If *text* is an SSML document (starts with '<speak'), it is sent via
    speak_ssml_async so prosody/pitch markup is honoured.  Plain text keeps
    the original speak_text_async path.  The original always used
    speak_text_async, which is why SSML pitch markup was silently ignored.
    """
    # This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION"
    speech_config = speechsdk.SpeechConfig(
        subscription=get_value_from_json_key("microsoft-azure-api-key"),
        region=get_value_from_json_key("microsoft-azure-speech-region"))
    # The language of the voice that speaks.
    speech_config.speech_synthesis_voice_name = get_value_from_json_key("voice-name")
    audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

    print("<Speaking...>")
    # Keep a transcript of everything spoken.
    with open("output.txt", "a", encoding="utf-8") as out:
        out.write(str(text) + "\n")
    payload = str(text)
    if payload.lstrip().startswith("<speak"):
        # SSML input: use the SSML endpoint so <prosody> etc. take effect.
        speech_synthesis_result = speech_synthesizer.speak_ssml_async(payload).get()
    else:
        speech_synthesis_result = speech_synthesizer.speak_text_async(text).get()
    get_audio_or_return_error(speech_synthesis_result)


def generate_conversation(message_content, message_author):
    """Append the chatter's message to the shared conversation history,
    prompt GPT-3 with it, record the bot's reply, and return the reply."""
    print('------------------------------------------------------')
    for item in (message_content, message_author, Bot.conversation):
        print(item)

    Bot.conversation.append(f'CHATTER: {message_content}')
    history = '\n'.join(Bot.conversation)
    bot_name = get_value_from_json_key("bot-name")
    prompt = open_file('prompt_chat.txt').replace('<<BLOCK>>', history)
    prompt = prompt + '\n' + bot_name + ':'
    print(prompt)
    response = gpt3_completion(prompt)
    print(bot_name + ': ', response)
    # Avoid recording the identical bot line twice in the history.
    reply_line = bot_name + ': ' + response
    if reply_line not in Bot.conversation:
        Bot.conversation.append(reply_line)
    return response


def generate_ssml(response, voice_name='en-US-JennyNeural', pitch='+15.00%'):
    """Wrap *response* in a valid Azure SSML document with a pitch adjustment.

    The original markup was malformed — it closed a </voice> element that was
    never opened, and omitted the version/xmlns/xml:lang attributes and the
    <voice> wrapper that Azure requires — so the service ignored the prosody.

    Args:
        response: plain text to speak.
        voice_name: Azure neural voice to render the text with.
        pitch: relative pitch adjustment, e.g. '+15.00%' or '-10%'.

    Returns:
        A complete SSML string suitable for speak_ssml_async.
    """
    return (
        "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' "
        "xml:lang='en-US'>"
        f"<voice name='{voice_name}'>"
        f"<prosody pitch='{pitch}'>{response}</prosody>"
        "</voice></speak>"
    )


def get_audio_and_text(message_content, message_author):
    """Generate a chat response, synthesize it to audio, then clean up the
    transcript and the temporary wav file.

    NOTE(review): the original called generate_ssml(response) and discarded
    the return value, so the pitch markup never reached the synthesizer —
    this was the reported bug.  To actually change the pitch, the SSML
    string must be passed to a synthesizer call that uses speak_ssml_async.
    """
    response = generate_conversation(message_content, message_author)
    get_output_audio_file(str(response))
    audio_file = output_file_name_with_path
    # Give playback a moment before cleaning up.
    time.sleep(2)
    # Truncate the transcript and remove the temporary wav file.
    open('output.txt', 'w').close()
    print('------------------------------------------------------')
    os.remove(audio_file)


class Bot(commands.Bot):
    # Shared running conversation history (class-level by design; also read
    # and mutated by generate_conversation).
    conversation = list()
    # Cached set of English corpus words.  The original re-ran
    # nltk.download('words') AND re-materialized the whole corpus list on
    # every single incoming message — both are now done once, lazily.
    _english_words = None

    def __init__(self):
        # Initialise our Bot with our access token, prefix and a list of channels to join on boot...
        # prefix can be a callable, which returns a list of strings or a string...
        # initial_channels can also be a callable which returns a list of strings...
        super().__init__(token=get_value_from_json_key("twitch-access-key"), prefix='!',
                         initial_channels=[get_value_from_json_key("twitch-account-name")])

    @classmethod
    def _get_english_words(cls):
        # Download the words corpus on first use only and cache it as a set.
        if cls._english_words is None:
            nltk.download('words')
            cls._english_words = set(nltk.corpus.words.words())
        return cls._english_words

    async def event_ready(self):
        # Notify us when everything is ready!
        # We are logged in and ready to chat and use commands...
        print(f'Logged in as | {self.nick}')

    async def event_message(self, message):
        # Messages with echo set to True are messages sent by the bot...
        # For now, we just want to ignore them...
        if not message.echo:
            # Check if the message contains english words.
            # NOTE(review): this preserves the original substring semantics
            # (any corpus word appearing anywhere in the text matches), which
            # is very permissive — single letters like 'a' count as words.
            if any(word in message.content for word in self._get_english_words()):
                # Check if the message is too long
                if len(message.content) <= 100:
                    get_audio_and_text(message.content, message.author.name)
        # Since we override the default `event_message`, we must let the bot
        # know we want to handle and invoke our commands...
        await self.handle_commands(message)

    @commands.command()
    async def hello(self, ctx: commands.Context):
        # Command `!hello`: send a greeting back to the channel.
        await ctx.send(f'Hello {ctx.author.name}!')


# Entry point: construct and run the bot.  bot.run() is blocking and will
# stop execution of any code below until the bot is stopped or closed.
# The __main__ guard keeps the bot from starting when this module is
# imported (e.g. by tests or tooling).
if __name__ == "__main__":
    bot = Bot()
    bot.run()
python azure text-to-speech azure-cognitive-services azure-speech
1个回答
0
投票

您可以使用以下 SSML 代码来调整文本转语音代码中的音调：

<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts'><voice name='en-US-JennyNeural'><prosody pitch='+50%'>Hello, world!</prosody></voice></speak>

尝试把 <prosody pitch='+50%'> 中的值从 50 改为 10，您就会听到音高的明显差异——我在语音工作室（Speech Studio）中用相同的值运行时也观察到了同样的效果。

enter image description here

我的python texttospeech.py代码:-

import os
import azure.cognitiveservices.speech as speechsdk

# Replace these variables with your own values
subscription_key = 'xxxxxc57f4a81feff3'  # Azure Speech resource key
region = 'eastus'                        # Azure region of the Speech resource
voice_name = 'en-US-GuyRUS'              # voice used inside the <voice> element
text = 'Hello, this is a pitch test.'
pitch_percentage = '+50%'  # Adjust the pitch by +50%. Use '-' for a lower pitch.

# Create the SSML with pitch adjustment.  Azure requires the version, xmlns
# and xml:lang attributes on <speak>, plus a <voice> wrapper, for <prosody>
# to be honoured.
ssml = f"""
<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>
    <voice name='{voice_name}'>
        <prosody pitch='{pitch_percentage}'>
            {text}
        </prosody>
    </voice>
</speak>
"""

# Speech configuration: build the config and synthesizer from the
# subscription key and region defined above.
speech_config = speechsdk.SpeechConfig(subscription=subscription_key, region=region)
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

# Synthesize speech from the SSML document (blocks until finished).
result = speech_synthesizer.speak_ssml_async(ssml).get()

# Persist the audio on success; otherwise report the failure reason.
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    audio_filename = 'output.wav'
    with open(audio_filename, 'wb') as audio_file:
        audio_file.write(result.audio_data)
    print(f'Audio saved as {audio_filename}')
else:
    print(f'Synthesis failed: {result.reason}')

输出:-

enter image description here

© www.soinside.com 2019 - 2024. All rights reserved.