使用 Azure TTS 读取字节 AudioDataStream

问题描述 投票:0回答:1

我不明白如何在 Python 中读取 Azure TTS 服务返回的音频字节流。

来自文档:https://learn.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.audiodatastream?view=azure-python

bool = can_read_data(requested_bytes: int, pos: int) 和 int = read_data(audio_buffer: bytes, pos: int | None = None)

好..

import azure.cognitiveservices.speech as speechsdk
import pygame

# 'key' and 'region' are placeholders for a real Speech resource key/region.
speech_config = speechsdk.SpeechConfig(subscription='key', region='region')
# Ask for headerless 16 kHz / 16-bit / mono PCM so the bytes read from the
# stream are plain samples that can be given to pygame as a raw buffer.
speech_config.set_speech_synthesis_output_format(
    speechsdk.SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm
)
# audio_config=None: the synthesizer does not play to the default speaker;
# the audio is only available through the result / AudioDataStream.
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)

text = "Hello, world!"
# Synthesize the speech (blocks until the service has answered).
result = speech_synthesizer.speak_text_async(text).get()
if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
    # Fail early instead of trying to play an empty or invalid stream.
    raise RuntimeError("Speech synthesis did not complete: {}".format(result.reason))

# Create an AudioDataStream from the synthesized result
stream = speechsdk.AudioDataStream(result)

# Initialize Pygame - we will use this to play the audio.
# The mixer format must match the synthesized PCM (16 kHz, signed 16-bit,
# mono); with the mixer defaults the raw samples would be misinterpreted.
pygame.mixer.init(frequency=16000, size=-16, channels=1)

def play_audio(stream, chunk_size=1024):
    """Read all audio from an Azure ``AudioDataStream`` and play it with pygame.

    Args:
        stream: An ``AudioDataStream``-like object whose
            ``read_data(buffer)`` fills ``buffer`` and returns the number of
            bytes written (0 once the stream is exhausted).
        chunk_size: Size in bytes of the buffer used for each read.

    The collected bytes are passed to ``pygame.mixer.Sound`` as-is, so the
    mixer must already be initialised with a format matching the stream's
    audio (sample rate, sample width, channel count).

    Read errors are logged and whatever was received so far is still played;
    a ``pygame.error`` during playback is reported on stdout.
    """
    # Local import: the surrounding script never imports logging.
    import logging

    # read_data() wants a bytes buffer to fill and returns how many bytes it
    # filled; it does not return the data itself.
    read_buffer = bytes(chunk_size)
    # Accumulate into a bytearray: `+=` copies the slice contents, which
    # matters because read_buffer is refilled in place on every call.
    collected = bytearray()
    try:
        filled = stream.read_data(read_buffer)
        # Loop on the returned count rather than can_read_data(chunk_size),
        # so a final chunk shorter than chunk_size is not dropped.
        while filled > 0:
            collected += read_buffer[:filled]
            filled = stream.read_data(read_buffer)
    except Exception as e:
        logging.error("[play_audio] Error during playback: {}".format(str(e)))
    audio_buffer = bytes(collected)
    # Play the audio with Pygame
    try:
        pygame.mixer.Sound(audio_buffer).play()
        print(audio_buffer)
    except pygame.error:
        print("Error playing sound")

# Play the synthesized speech: hand the AudioDataStream built above to play_audio.
play_audio(stream)

注意:我可以取巧直接调用 stream.save_to_wav_file 保存为文件,但我想以流的方式处理这段音频(这样我就可以对它进行播放、暂停等操作)。

我无法弄清楚这一点。我觉得文档只是回避展示它的实际使用..

但是也许我没有正确阅读文档!

azure text-to-speech azure-cognitive-services
1个回答
0
投票

这仍然有一些不正确的地方,但它已经接近了很多..

我之前完全误解了文档。但我仍然不明白的是:为什么从流中读取的声音不能正常播放——播放出来的只是 "ttt pfft" 这样的杂音。

import azure.cognitiveservices.speech as speechsdk
from pydub import AudioSegment
from pydub.playback import play

# 'key' is a placeholder for a real Speech resource key.
speech_config = speechsdk.SpeechConfig(subscription='key', region='uksouth')
speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm)
# audio_config=None: do not play on the default speaker; read the audio from the result instead.
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)


text = "Hello, world!"
# Synthesize the speech
result = speech_synthesizer.speak_text_async(text).get()

# Create an AudioDataStream from the synthesized result
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    print("Speech synthesized for text [{}]".format(text))
    audio_data_stream = speechsdk.AudioDataStream(result)
    audio_data_stream.save_to_wav_file("output.wav")
    # Reset the stream position to the beginning since saving to file puts the position to end.
    audio_data_stream.position = 0

    # Reads data from the stream.
    # read_data() refills the same buffer on every call, so each chunk has to
    # be copied out before the next read. Playing the buffer itself would only
    # yield the last chunk mixed with stale bytes (heard as short noise).
    read_buffer = bytes(16000)
    audio_data = bytearray()
    filled_size = audio_data_stream.read_data(read_buffer)
    while filled_size > 0:
        print("{} bytes received.".format(filled_size))
        # `+=` on a bytearray copies the slice contents.
        audio_data += read_buffer[:filled_size]
        filled_size = audio_data_stream.read_data(read_buffer)
    total_size = len(audio_data)
    print("Totally {} bytes received for text [{}].".format(total_size, text))

    # NOTE(review): Riff16Khz16BitMonoPcm is a WAV-container format. If the
    # stream bytes start with a RIFF header, it is interpreted below as a few
    # samples of audio; confirm against the SDK and switch to
    # Raw16Khz16BitMonoPcm (or strip the header) if that is audible.
    audio_segment = AudioSegment(
        data=bytes(audio_data),  # All PCM bytes received from the stream
        sample_width=2,  # Bytes per sample (16-bit)
        frame_rate=16000,  # Sampling frequency
        channels=1  # Mono
    )

    play(audio_segment)

elif result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = result.cancellation_details
    print("Speech synthesis canceled: {}".format(cancellation_details.reason))
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        print("Error details: {}".format(cancellation_details.error_details))
        
© www.soinside.com 2019 - 2024. All rights reserved.