我正在尝试使用 Google Cloud Speech-to-text API 将采访中的音频转录为文本。我设置了一个重试方法,如果发现错误,则允许重试 3 次。问题是我不断收到这些“400 音频超时错误:没有音频的持续时间很长。音频应该接近实时发送”,我不确定它来自哪里。
控制台输出:
Converted retries value: 3 -> Retry(total=3, connect=None, read=None, redirect=None, status=None)
Making request: POST https://oauth2.googleapis.com/token
Starting new HTTPS connection (1): oauth2.googleapis.com:443
https://oauth2.googleapis.com:443 "POST /token HTTP/1.1" 200 None
Starting new HTTPS connection (1): storage.googleapis.com:443
https://storage.googleapis.com:443 "GET /storage/v1/b/nicarg?projection=noAcl&prettyPrint=false HTTP/1.1" 200 727
https://storage.googleapis.com:443 "GET /download/storage/v1/b/*****/o/******.m4a?alt=media HTTP/1.1" 200 56660932
subprocess.call(['ffmpeg', '-y', '-f', 'mp4', '-i', 'audio\\******.m4a', '-acodec', 'pcm_s16le', '-vn', '-f', 'wav', '-'])
Authenticating credentials...
Authentication successful!
Creating a streaming recognizer...
Attempting to transcribe. Maximum retries: 3.
Loading stream...
Transcribing...
Transcribing Segments: 0%| | 0/12 [00:00<?, ?it/s]
Error during Google Cloud Speech-to-Text API request: 400 Audio Timeout Error: Long duration elapsed without audio. Audio should be sent close to real time.
Loading stream...
Transcribing...
Transcribing Segments: 0%| | 0/12 [00:00<?, ?it/s]
Error during Google Cloud Speech-to-Text API request: 400 Audio Timeout Error: Long duration elapsed without audio. Audio should be sent close to real time.
Loading stream...
Transcribing...
Transcribing Segments: 0%| | 0/12 [00:00<?, ?it/s]
Error during Google Cloud Speech-to-Text API request: 400 Audio Timeout Error: Long duration elapsed without audio. Audio should be sent close to real time.
Maximum retry attempts reached. Unable to transcribe the segments.
完整代码:
import os
import sys
from pydub import AudioSegment
from pydub.silence import split_on_silence
from transcriber import Transcriber
import logging
from google.cloud import storage
from google.cloud import speech
from tqdm import tqdm
import time
from google.api_core.exceptions import DeadlineExceeded, ServiceUnavailable
import random
# Constants
GOOGLE_CREDENTIALS_PATH = "********.json"
BUCKET_NAME = "*****"
AUDIO_FILE_NAME = "*****.m4a"
AUDIO_PATH = os.path.join("audio", AUDIO_FILE_NAME)
REMOTE_AUDIO_PATH = AUDIO_FILE_NAME
SEGMENT_DURATION = 5 * 60000
LANGUAGE = "es-419"
class Transcriptor:
"""
Functions to process audio into transcribable text.
"""
@staticmethod
def download_audio_from_cloud(audio_path, remote_path):
client = storage.Client.from_service_account_json(GOOGLE_CREDENTIALS_PATH)
bucket = client.get_bucket(BUCKET_NAME)
blob = bucket.blob(remote_path)
blob.download_to_filename(audio_path)
@staticmethod
def split_and_transcribe(audio_path, segment_duration=SEGMENT_DURATION, language=LANGUAGE):
"""
Splits the audio into segments and then transcribes each segment.
Parameters:
- audio_path (path): Audio file to process.
- segment_duration (int): Segment duration.
- language: Language of processing.
Returns:
- Audio transcripted.
"""
Transcriptor.download_audio_from_cloud(audio_path, REMOTE_AUDIO_PATH)
# Load audio file
audio = AudioSegment.from_file(audio_path, format="m4a")
# Calculate the number of segments based on the specified duration
num_segments = int(len(audio) / segment_duration) + 1
# Split the audio into segments
audio_segments = [audio[i * segment_duration:(i + 1) * segment_duration] for i in range(num_segments)]
# Transcribe the segments using Google Cloud Speech-to-Text API
result_transcription = Transcriptor.transcribe_cloud(
f"gs://{BUCKET_NAME}/{AUDIO_FILE_NAME}", language, audio_segments, total=num_segments
)
return result_transcription
@staticmethod
def transcribe_cloud(gcs_uri, language=LANGUAGE, audio_segments=None, total=100, max_retries=3):
"""
Transcribes audio segments using Google Cloud Speech-to-Text API.
Parameters:
- gcs_uri (str): Google Cloud storage URI for the audio file.
- language: Language of processing (always set as es-419)
- audio_segments: List of audio segments to transcribe.
Returns:
- Transcript of the audio segments.
"""
# Authenticate credentials
try:
print("Authenticating credentials...")
client = speech.SpeechClient.from_service_account_json(GOOGLE_CREDENTIALS_PATH)
print("Authentication successful!")
except Exception as e:
logging.error(f"Authentication error: {e}. Verify Google credentials and try again.", exc_info=True)
return None
# Create a streaming recognizer with the given config
print("Creating a streaming recognizer...")
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000,
language_code=language,
enable_automatic_punctuation=True,
)
streaming_config = speech.StreamingRecognitionConfig(
config=config,
interim_results=True
)
num_tries = 0
# Attempting to transcribe
print(f"Attempting to transcribe. Maximum retries: {max_retries}.")
while num_tries < max_retries:
try:
# Loading stream
print("Loading stream...")
requests = (
speech.StreamingRecognizeRequest(audio_content=segment.raw_data if segment else b'')
for segment in audio_segments
)
# Call the streaming_recognize method with the generator of requests
responses = client.streaming_recognize(config=streaming_config, requests=requests)
# Transcribing
print("Transcribing...")
transcript_builder = []
# Process interim and final results
with tqdm(total=total, desc="Transcribing Segments") as pbar:
for response in responses:
for result in response.results:
for alternative in result.alternatives:
transcript_builder.append(f"\nTranscript: {alternative.transcript}")
transcript_builder.append(f"\nConfidence: {alternative.confidence}\n")
# Update the progress bar
pbar.update(1)
transcript = "".join(transcript_builder)
if transcript:
logging.info("Transcription successful!")
print(transcript)
return transcript
else:
logging.error("Transcribing result is empty. Retrying the segments.")
num_tries += 1
except DeadlineExceeded as timeout_error:
logging.warning(f"Timeout error: {timeout_error}. Retrying the segments. Retry attempt {num_tries}/{max_retries}.")
num_tries += 1
except ServiceUnavailable as service_unavailable_error:
logging.warning(f"Service Unavailable: {service_unavailable_error}. Retrying the segments.")
num_tries += 1
except Exception as e:
logging.error(f"Error during Google Cloud Speech-to-Text API request: {e}")
num_tries += 1
# Exponential backoff: wait for a random time between 2^nums_tries seconds
wait_time = random.uniform(0, 2**num_tries)
time.sleep(wait_time)
logging.error("Maximum retry attempts reached. Unable to transcribe the segments.")
sys.exit(1)
if __name__ == "__main__":
# Set up logging
logging.basicConfig(filename="logs/transcriptor.log", level=logging.DEBUG)
# Set up console logging
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
logging.getLogger().addHandler(console_handler)
# Set Google Cloud credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_CREDENTIALS_PATH
# Transcribe audio and pass the text to the Transcriber
try:
result_transcription = Transcriptor.split_and_transcribe(AUDIO_PATH, SEGMENT_DURATION, LANGUAGE)
app = Transcriber(result_transcription)
app.run()
except Exception as e:
logging.exception(f"An error occurred: {e}")
我尝试了 ChatGPT 向我建议的一切,但没有任何实际结果。请帮忙!
我能够重现您的问题。这可能是此处定义的语音到文本限制问题。根据谷歌文档:
- 超过 1 分钟的音频必须使用 uri 字段来引用 Google Cloud Storage 中的音频文件。 ** 如果您需要流式传输内容超过 5 分钟,请参阅无限流式传输教程。
从您的日志中,
https://storage.googleapis.com:443 "GET /download/storage/v1/b/*****/o/******.m4a?alt=media HTTP/1.1" 200 56660932
代码正在从本地文件路径传输约 50MB 的音频文件,每个传输片段可能超过 5 分钟 (
SEGMENT_DURATION = 5 * 60000
)
验证是否达到 API 限制的快速测试是将 SEGMENT_DURATION 调整为较小的值。例如
SEGMENT_DURATION = 5 * 30000
。您可能会遇到音频大小超出限制的错误,但它会确认音频长度限制导致了原始错误。
您可以使用异步语音识别以及GCS存储桶URL(而不是下载到本地,将存储桶URL传递给API)以批处理模式转录长音频文件。
如果您仍想观看超过 5 分钟的流媒体内容,可以参考无尽的流媒体示例。