我不明白如何在 python 中读取 TTS azure 服务的字节流。
bool = can_read_data(requested_bytes: int, pos: int) 和 int = read_data(audio_buffer: bytes, pos: int | None = None)
好..
import azure.cognitiveservices.speech as speechsdk
speech_config = speechsdk.SpeechConfig(subscription='key', region='region')
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
text = "Hello, world!"
# Synthesize the speech
result = speech_synthesizer.speak_text_async(text).get()
# Create an AudioDataStream from the synthesized result
stream = speechsdk.AudioDataStream(result)
# Initialize Pygame - we will use this to play the audio
import pygame
pygame.mixer.init()
def play_audio(stream):
chunk_size = 1024 # Size of each chunk to read
audio_buffer = bytes() # Bytes object to store audio data
# Read and append audio data in chunks
try:
while stream.can_read_data(chunk_size):
bytes_read = stream.read_data(chunk_size) # Read a chunk of data as bytes
audio_buffer += bytes_read # Append the chunk to the bytes object
except Exception as e:
logging.error("[play_audio] Error during playback: {}".format(str(e)))
# Play the audio with Pygame
try:
pygame.mixer.Sound(audio_buffer).play()
print(audio_buffer)
except pygame.error:
print("Error playing sound")
# Call the play_audio function with the audio stream
play_audio(stream)
注意:我可以作弊并执行stream.save_to_wav_file,但我想流式传输这个..(这样我就可以播放和暂停它等..)
我无法弄清楚这一点。我觉得文档只是回避展示它的实际使用..
但是也许我没有正确阅读文档!
这仍然有一些不正确的地方,但它已经接近了很多..
我完全不理解文档..但我不明白的是为什么声音没有从流中播放..它是 ttt pfft 声音
import azure.cognitiveservices.speech as speechsdk
speech_config = speechsdk.SpeechConfig(subscription='key', region='uksouth')
speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm)
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
text = "Hello, world!"
# Synthesize the speech
result = speech_synthesizer.speak_text_async(text).get()
# Create an AudioDataStream from the synthesized result
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
print("Speech synthesized for text [{}]".format(text))
audio_data_stream = speechsdk.AudioDataStream(result)
audio_data_stream.save_to_wav_file("output.wav")
# Reset the stream position to the beginning since saving to file puts the position to end.
audio_data_stream.position = 0
# Reads data from the stream
audio_buffer = bytes(16000)
total_size = 0
filled_size = audio_data_stream.read_data(audio_buffer)
while filled_size > 0:
print("{} bytes received.".format(filled_size))
total_size += filled_size
filled_size = audio_data_stream.read_data(audio_buffer)
print("Totally {} bytes received for text [{}].".format(total_size, text))
# Initialize Pygame
from pydub import AudioSegment
import io
audio_segment = AudioSegment(
data=audio_buffer, # The raw audio data you received
sample_width=2, # Bytes per sample
frame_rate=16000, # Sampling frequency
channels=1 # Mono
)
from pydub.playback import play
play(audio_segment)
elif result.reason == speechsdk.ResultReason.Canceled:
cancellation_details = result.cancellation_details
print("Speech synthesis canceled: {}".format(cancellation_details.reason))
if cancellation_details.reason == speechsdk.CancellationReason.Error:
print("Error details: {}".format(cancellation_details.error_details))