Google Speech V2 从麦克风实时流式传输

Question

我似乎无法在文档中找到如何使用谷歌语音V2 API的任何地方。由于某种原因，V2 似乎比 V1 便宜（根据 google 的语音定价表——尽管我不知道为什么过时的版本似乎更贵），而且 V2 还支持自动语言检测。

这是一个使用 V1 从麦克风“收听”并实时转录的工作示例：

import queue
import re
import sys
import os

from google.cloud import speech


import pyaudio

# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100ms

os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="key_google.json"

class MicrophoneStream:
    """Opens a recording stream as a generator yielding the audio chunks."""

    def __init__(self: object, rate: int = RATE, chunk: int = CHUNK) -> None:
        """The audio -- and generator -- is guaranteed to be on the main thread."""
        self._rate = rate
        self._chunk = chunk

        # Create a thread-safe buffer of audio data
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self: object) -> object:
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            # The API currently only supports 1-channel (mono) audio
            channels=1,
            rate=self._rate,
            input=True,
            frames_per_buffer=self._chunk,
            # Run the audio stream asynchronously to fill the buffer object.
            # This is necessary so that the input device's buffer doesn't
            # overflow while the calling thread makes network requests, etc.
            stream_callback=self._fill_buffer,
        )

        self.closed = False

        return self

    def __exit__(
        self: object,
        type: object,
        value: object,
        traceback: object,
    ) -> None:
        """Closes the stream, regardless of whether the connection was lost or not."""
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        # Signal the generator to terminate so that the client's
        # streaming_recognize method will not block the process termination.
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(
        self: object,
        in_data: object,
        frame_count: int,
        time_info: object,
        status_flags: object,
    ) -> object:
        """Continuously collect data from the audio stream, into the buffer.

        Args:
            in_data: The audio data as a bytes object
            frame_count: The number of frames captured
            time_info: The time information
            status_flags: The status flags

        Returns:
            The audio data as a bytes object
        """
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self: object) -> object:
        """Generates audio chunks from the stream of audio data in chunks.

        Args:
            self: The MicrophoneStream object

        Returns:
            A generator that outputs audio chunks.
        """
        while not self.closed:
            # Use a blocking get() to ensure there's at least one chunk of
            # data, and stop iteration if the chunk is None, indicating the
            # end of the audio stream.
            chunk = self._buff.get()
            if chunk is None:
                return
            data = [chunk]

            # Now consume whatever other data's still buffered.
            while True:
                try:
                    chunk = self._buff.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break

            yield b"".join(data)


def listen_print_loop(responses: object) -> None:  # Changed the return type to None
    num_chars_printed = 0
    all_transcripts = []  # To store all transcripts

    for response in responses:
        if not response.results:
            continue

        result = response.results[0]
        if not result.alternatives:
            continue

        transcript = result.alternatives[0].transcript
        overwrite_chars = " " * (num_chars_printed - len(transcript))

        if not result.is_final:
            sys.stdout.write(transcript + overwrite_chars + "\r")
            sys.stdout.flush()
            num_chars_printed = len(transcript)
        else:
            print(transcript + overwrite_chars)
            all_transcripts.append(transcript)  # Storing the transcript

            if re.search(r"\b(exit|quit)\b", transcript, re.I):
                print("Exiting..")
                print("All Transcripts: ", all_transcripts)  # Print all transcripts if needed
                break

            num_chars_printed = 0


def main() -> None:
    """Transcribe speech from audio file."""
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = "es"  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
    )

    streaming_config = speech.StreamingRecognitionConfig(
        config=config, interim_results=True
    )

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in audio_generator
        )

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)


if __name__ == "__main__":
    main()

但是，我不知道如何将相同的代码转换为使用 V2。这是谷歌唯一的例子，但那里的代码基本上是读取 .wav 文件或任何其他录制的文件，但我需要实时读取。有人知道如何使用V2进行实时流转录吗？

这是我的尝试（但根本不起作用）：

import os
import queue
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech
import pyaudio

# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100ms

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "key_google.json"


class MicrophoneStream:
    def __init__(self, rate, chunk):
        self._rate = rate
        self._chunk = chunk
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self):
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self._rate,
            input=True,
            frames_per_buffer=self._chunk,
            stream_callback=self._fill_buffer,
        )
        self.closed = False
        return self

    def __exit__(self, type, value, traceback):
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self):
        while not self.closed:
            chunk = self._buff.get()
            if chunk is None:
                return
            data = [chunk]
            while True:
                try:
                    chunk = self._buff.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break
            yield b"".join(data)


def main():
    project_id = "stellar-cumulus-379717"

    client = SpeechClient()

    recognition_config = cloud_speech.RecognitionConfig(
        auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        model="long",
    )

    streaming_config = cloud_speech.StreamingRecognitionConfig(config=recognition_config)
    config_request = cloud_speech.StreamingRecognizeRequest(
        recognizer=f"projects/{project_id}/locations/global/recognizers/_",
        streaming_config=streaming_config,
    )

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        audio_requests = (
            cloud_speech.StreamingRecognizeRequest(audio_content=content)
            for content in audio_generator
        )

        def requests():
            yield config_request
            yield from audio_requests

        responses = client.streaming_recognize(requests=requests())

        for response in responses:
            for result in response.results:
                print(f"Transcript: {result.alternatives[0].transcript}")


if __name__ == "__main__":
    main()

我收到了一堆错误：

Traceback (most recent call last):
  File "C:\Python3\Lib\site-packages\google\api_core\grpc_helpers.py", line 162, in error_remapped_callable
    return _StreamingResponseIterator(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python3\Lib\site-packages\google\api_core\grpc_helpers.py", line 88, in __init__
    self._stored_first_result = next(self._wrapped)
                                ^^^^^^^^^^^^^^^^^^^
  File "C:\Python3\Lib\site-packages\grpc\_channel.py", line 541, in __next__
    return self._next()
           ^^^^^^^^^^^^
  File "C:\Python3\Lib\site-packages\grpc\_channel.py", line 967, in _next
    raise self
grpc._channel._MultiThreadedRendezvous: <_MultiThreadedRendezvous of RPC that terminated with:
        status = StatusCode.UNKNOWN
        details = "Exception iterating requests!"
        debug_error_string = "None"
>

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "D:\AI_workers\pocketchat\stt2.py", line 98, in <module>
    main()
  File "D:\AI_workers\pocketchat\stt2.py", line 90, in main
    responses = client.streaming_recognize(requests=requests())
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python3\Lib\site-packages\google\cloud\speech_v2\services\speech\client.py", line 1639, in streaming_recognize
    response = rpc(
               ^^^^
  File "C:\Python3\Lib\site-packages\google\api_core\gapic_v1\method.py", line 113, in __call__
    return wrapped_func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python3\Lib\site-packages\google\api_core\grpc_helpers.py", line 166, in error_remapped_callable
    raise exceptions.from_grpc_error(exc) from exc
google.api_core.exceptions.Unknown: None Exception iterating requests!

Answer 1

你成功使用de V2版本了吗？我正在尝试做同样的事情，但遇到了同样的错误。谢谢你

Google Speech V2 从麦克风实时流式传输

问题描述投票：0回答：1

1个回答

最新问题

Google Speech V2 从麦克风实时流式传输

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1