Google Speech V2 从麦克风实时流式传输

问题描述 投票:0回答:1

我似乎无法在文档中找到如何使用谷歌语音V2 API的任何地方。由于某种原因,V2 似乎比 V1 便宜(根据 google 的语音定价表——尽管我不知道为什么过时的版本似乎更贵),而且 V2 还支持自动语言检测。

这是一个使用 V1 从麦克风“收听”并实时转录的工作示例:

import queue
import re
import sys
import os

from google.cloud import speech


import pyaudio

# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100ms

os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="key_google.json"

class MicrophoneStream:
    """Opens a recording stream as a generator yielding the audio chunks."""

    def __init__(self: object, rate: int = RATE, chunk: int = CHUNK) -> None:
        """The audio -- and generator -- is guaranteed to be on the main thread."""
        self._rate = rate
        self._chunk = chunk

        # Create a thread-safe buffer of audio data
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self: object) -> object:
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            # The API currently only supports 1-channel (mono) audio
            channels=1,
            rate=self._rate,
            input=True,
            frames_per_buffer=self._chunk,
            # Run the audio stream asynchronously to fill the buffer object.
            # This is necessary so that the input device's buffer doesn't
            # overflow while the calling thread makes network requests, etc.
            stream_callback=self._fill_buffer,
        )

        self.closed = False

        return self

    def __exit__(
        self: object,
        type: object,
        value: object,
        traceback: object,
    ) -> None:
        """Closes the stream, regardless of whether the connection was lost or not."""
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        # Signal the generator to terminate so that the client's
        # streaming_recognize method will not block the process termination.
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(
        self: object,
        in_data: object,
        frame_count: int,
        time_info: object,
        status_flags: object,
    ) -> object:
        """Continuously collect data from the audio stream, into the buffer.

        Args:
            in_data: The audio data as a bytes object
            frame_count: The number of frames captured
            time_info: The time information
            status_flags: The status flags

        Returns:
            The audio data as a bytes object
        """
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self: object) -> object:
        """Generates audio chunks from the stream of audio data in chunks.

        Args:
            self: The MicrophoneStream object

        Returns:
            A generator that outputs audio chunks.
        """
        while not self.closed:
            # Use a blocking get() to ensure there's at least one chunk of
            # data, and stop iteration if the chunk is None, indicating the
            # end of the audio stream.
            chunk = self._buff.get()
            if chunk is None:
                return
            data = [chunk]

            # Now consume whatever other data's still buffered.
            while True:
                try:
                    chunk = self._buff.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break

            yield b"".join(data)


def listen_print_loop(responses: object) -> None:  # Changed the return type to None
    num_chars_printed = 0
    all_transcripts = []  # To store all transcripts

    for response in responses:
        if not response.results:
            continue

        result = response.results[0]
        if not result.alternatives:
            continue

        transcript = result.alternatives[0].transcript
        overwrite_chars = " " * (num_chars_printed - len(transcript))

        if not result.is_final:
            sys.stdout.write(transcript + overwrite_chars + "\r")
            sys.stdout.flush()
            num_chars_printed = len(transcript)
        else:
            print(transcript + overwrite_chars)
            all_transcripts.append(transcript)  # Storing the transcript

            if re.search(r"\b(exit|quit)\b", transcript, re.I):
                print("Exiting..")
                print("All Transcripts: ", all_transcripts)  # Print all transcripts if needed
                break

            num_chars_printed = 0


def main() -> None:
    """Transcribe speech from audio file."""
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = "es"  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
    )

    streaming_config = speech.StreamingRecognitionConfig(
        config=config, interim_results=True
    )

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in audio_generator
        )

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)


if __name__ == "__main__":
    main()

但是,我不知道如何将相同的代码转换为使用 V2。 这是谷歌唯一的例子,但那里的代码基本上是读取 .wav 文件或任何其他录制的文件,但我需要实时读取。有人知道如何使用V2进行实时流转录吗?

这是我的尝试(但根本不起作用):

import os
import queue
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech
import pyaudio

# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100ms

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "key_google.json"


class MicrophoneStream:
    def __init__(self, rate, chunk):
        self._rate = rate
        self._chunk = chunk
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self):
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self._rate,
            input=True,
            frames_per_buffer=self._chunk,
            stream_callback=self._fill_buffer,
        )
        self.closed = False
        return self

    def __exit__(self, type, value, traceback):
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self):
        while not self.closed:
            chunk = self._buff.get()
            if chunk is None:
                return
            data = [chunk]
            while True:
                try:
                    chunk = self._buff.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break
            yield b"".join(data)


def main():
    project_id = "stellar-cumulus-379717"

    client = SpeechClient()

    recognition_config = cloud_speech.RecognitionConfig(
        auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        model="long",
    )

    streaming_config = cloud_speech.StreamingRecognitionConfig(config=recognition_config)
    config_request = cloud_speech.StreamingRecognizeRequest(
        recognizer=f"projects/{project_id}/locations/global/recognizers/_",
        streaming_config=streaming_config,
    )

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        audio_requests = (
            cloud_speech.StreamingRecognizeRequest(audio_content=content)
            for content in audio_generator
        )

        def requests():
            yield config_request
            yield from audio_requests

        responses = client.streaming_recognize(requests=requests())

        for response in responses:
            for result in response.results:
                print(f"Transcript: {result.alternatives[0].transcript}")


if __name__ == "__main__":
    main()

我收到了一堆错误:

Traceback (most recent call last):
  File "C:\Python3\Lib\site-packages\google\api_core\grpc_helpers.py", line 162, in error_remapped_callable
    return _StreamingResponseIterator(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python3\Lib\site-packages\google\api_core\grpc_helpers.py", line 88, in __init__
    self._stored_first_result = next(self._wrapped)
                                ^^^^^^^^^^^^^^^^^^^
  File "C:\Python3\Lib\site-packages\grpc\_channel.py", line 541, in __next__
    return self._next()
           ^^^^^^^^^^^^
  File "C:\Python3\Lib\site-packages\grpc\_channel.py", line 967, in _next
    raise self
grpc._channel._MultiThreadedRendezvous: <_MultiThreadedRendezvous of RPC that terminated with:
        status = StatusCode.UNKNOWN
        details = "Exception iterating requests!"
        debug_error_string = "None"
>

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "D:\AI_workers\pocketchat\stt2.py", line 98, in <module>
    main()
  File "D:\AI_workers\pocketchat\stt2.py", line 90, in main
    responses = client.streaming_recognize(requests=requests())
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python3\Lib\site-packages\google\cloud\speech_v2\services\speech\client.py", line 1639, in streaming_recognize
    response = rpc(
               ^^^^
  File "C:\Python3\Lib\site-packages\google\api_core\gapic_v1\method.py", line 113, in __call__
    return wrapped_func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python3\Lib\site-packages\google\api_core\grpc_helpers.py", line 166, in error_remapped_callable
    raise exceptions.from_grpc_error(exc) from exc
google.api_core.exceptions.Unknown: None Exception iterating requests!
python google-speech-api google-speech-to-text-api
1个回答
0
投票

你成功使用de V2版本了吗?我正在尝试做同样的事情,但遇到了同样的错误。谢谢你

© www.soinside.com 2019 - 2024. All rights reserved.