Azure Recognize_once_async().get() 在使用我的 UI 运行时未接收输入

问题描述 投票:0回答:1

总而言之,当translator.recognize_once_async().get()运行时,它会阻止我的代码,但是,无论我对着麦克风说多少话,它都不起作用。

我已经针对实验室练习对此进行了测试,其本身不会返回任何错误,并且能够检测音频输入。但是当使用 tkinter 运行时,它会导致它拒绝接受任何语音输入并花费了整整 15 秒。

我尝试过使用线程和队列,认为这种方法可以让我在后台运行语音输入检测。然而还是不行。

我已经检查了我的麦克风输入访问权限,并根据 azure 提供的练习实验室对其进行了测试

以下是我用来测试的代码

from dotenv import load_dotenv
import os
import time as timer
import azure.cognitiveservices.speech as speech_sdk
import tkinter as tk
from tkinter import ttk

# note: dropdown boxes work fine in my code testing, but for code simplicity its preset to english input, chinese output

# initialize azure service
load_dotenv()
ai_key = os.getenv('SPEECH_KEY')
ai_region = os.getenv('SPEECH_REGION')

speech_config = speech_sdk.SpeechConfig(ai_key, ai_region)
# Configure translation
translation_config = speech_sdk.translation.SpeechTranslationConfig(ai_key, ai_region)
translation_config.add_target_language('en')
translation_config.add_target_language('zh-Hans')
translation_config.add_target_language('ta')
translation_config.add_target_language('yue')
translation_config.add_target_language('ms')


def reset_application():
    # Reset the combobox values
    input_lang_combobox.set('')
    output_lang_combobox.set('')

    # Reset the combobox options to all languages
    input_lang_combobox['values'] = languages
    output_lang_combobox['values'] = languages

    # Reset the status label to its initial state
    status_label.config(text="Status: Waiting for both input and output language to be selected")
    # Re-enable the run button and disable it until new selections are made
    run_button.config(state=tk.DISABLED)

def update_languages(*args):
    # Get the currently selected languages
    input_language = input_lang_combobox.get()
    output_language = output_lang_combobox.get()

    # Update output language options based on input selection
    if input_language:
        output_options = [lang for lang in languages if lang != input_language]
        output_lang_combobox['values'] = output_options
        if output_language == input_language:
            output_lang_combobox.set('')
    else:
        output_lang_combobox['values'] = languages

    # Update input language options based on output selection
    if output_language:
        input_options = [lang for lang in languages if lang != output_language]
        input_lang_combobox['values'] = input_options
        if input_language == output_language:
            input_lang_combobox.set('')
    else:
        input_lang_combobox['values'] = languages

    # Check if both languages are selected to update the status label and button state
    if input_language and output_language:
        style.configure('Custom.TButton', 
                        foreground='orange',  # Text color
                        background='white')  # Background color      
        status_label.config(text=f"Status: Ready to translate from {input_language} to {output_language}\n      "
                                    f"Click button to begin translation service",style='Custom.TButton')

        run_button.config(state=tk.NORMAL)  # Enable the button
    else:
        status_label.config(text="Status: Waiting for both input and output language to be selected")
        run_button.config(state=tk.DISABLED)  # Keep the button disabled

def start_translation():
    global translation_config
    run_button.config(state=tk.DISABLED)
    # configure input language
    translation_config.speech_recognition_language = 'en-US'
    
    status_label.config(text="Status: Please speak into the microphone now")
    def translate(targetLanguage):

        def recognition(targetLanguage):
            global translation_config
            audio_config = speech_sdk.AudioConfig(use_default_microphone=True)
            translator = speech_sdk.translation.TranslationRecognizer(translation_config, audio_config=audio_config)
            result = translator.recognize_once_async().get() #UI is supposed to be unresponsive when detecing input, but here it doesnt get any input back, elapsing the whole 15s
            print(result.text)
            print(result.translations['zh-Hans'])
            if result.reason == speech_sdk.ResultReason.TranslatedSpeech:
                root.after(1000, translate_process, targetLanguage, result)
            elif result.reason == speech_sdk.ResultReason.NoMatch:
                new_window2 = tk.Toplevel(root)
                new_window2.title("Error Pop-Up")
                translated_text_label = tk.Label(new_window2, text=f"Status: An error occurred {result.no_match_details}, event 'matching speech with input' failed. Restarting Process\nClose the pop-up to proceed", font=("Arial", 12,))
                translated_text_label.pack()
                reset_application()
            elif result.reason == speech_sdk.ResultReason.Canceled:
                cancellation_details = result.cancellation_details
                new_window2 = tk.Toplevel(root)
                new_window2.title("Error Pop-Up")
                translated_text_label = tk.Label(new_window2, text=f"Status: An error occurred {cancellation_details.reason}, event 'input speech' failed. Restarting Process\nClose the pop-up to proceed", font=("Arial", 12,))
                translated_text_label.pack()
                reset_application()   

        def translate_process(targetLanguage,result):
            status_label.config(text="Status: Valid Input Speech Detected. Translating In Process")
            try:
                translation = result.translations[targetLanguage]
            except Exception as e:
                new_window2 = tk.Toplevel(root)
                new_window2.title("Error Pop-Up")
                translated_text_label = tk.Label(new_window2, text=f"Status: An error occurred: {e}, event 'Translation' failed. Restarting Process\nClose the pop-up to proceed", font=("Arial", 12,))
                translated_text_label.pack()
                reset_application()
            root.after(1000, complete_translation, targetLanguage, translation)

        def complete_translation(targetLanguage, translation):
            global speech_config
            status_label.config(text="Status: Translation Complete")

            # Synthesize translation
            voices = {
                "en": "en-SG-WayneNeural",
                "yue": "yue-CN-YunSongNeural",
                "ta": "ta-SG-AnbuNeural",
                "zh-Hans": "zh-CN-YunxiNeural",
                "ms": "ms-MY-OsmanNeural"
            }

            speech_config.speech_synthesis_voice_name = voices.get(targetLanguage)
            speech_synthesizer = speech_sdk.SpeechSynthesizer(speech_config)
            root.after(1000, synthesize_output, translation, speech_synthesizer)

        def synthesize_output(translation, speech_synthesizer):
            status_label.config(text="Status: Producing Output")
            speak = speech_synthesizer.speak_text_async(translation).get()
            if speak.reason != speech_sdk.ResultReason.SynthesizingAudioCompleted:
                print(speak.reason)
            reset_application()

        # Start the translation after a short delay to allow the label to update
        root.after(500, recognition, targetLanguage)

    # Configure output language + begin translation
    translate(targetLanguage='zh-Hans')

languages = ["English", "Chinese", "Cantonese", "Malay", "Tamil"]

# Assuming root is your Tkinter root window
root = tk.Tk()
root.geometry("1200x400")  # Adjust the size as needed

# Add the Translation App label at the top
app_title_label = ttk.Label(root, text="Translation App", font=("LG Smart UI Bold", 56, "bold"), foreground='orange')
app_title_label.grid(row=0, column=0, padx=10, pady=5, columnspan=2)  # Span across 2 columns if needed

# Dropdown Frame
dropdown_frame = ttk.Frame(root)
dropdown_frame.grid(row=1, column=0, padx=10, pady=10)

# Input Language ComboBox setup
input_lang_label = ttk.Label(dropdown_frame, text="Select Input Language", foreground="white",background="orange" ,font=("LG Smart UI Bold", 24, "bold"))
input_lang_label.pack(side=tk.LEFT)
input_lang_combobox = ttk.Combobox(dropdown_frame, values=languages)
input_lang_combobox.pack(side=tk.LEFT, padx=10)
input_lang_combobox.bind('<<ComboboxSelected>>', update_languages)

# Output Language ComboBox setup
output_lang_label = ttk.Label(dropdown_frame, text="Select Output Language", foreground="white",background="orange",font=("LG Smart UI Bold", 24, "bold"))
output_lang_label.pack(side=tk.LEFT)
output_lang_combobox = ttk.Combobox(dropdown_frame, values=languages)
output_lang_combobox.pack(side=tk.LEFT)
output_lang_combobox.bind('<<ComboboxSelected>>', update_languages)

# Button Frame
button_frame = ttk.Frame(root)
button_frame.grid(row=2, column=0, pady=30)

# Configure style for buttons
style = ttk.Style()
style.configure('Custom.TButton', foreground='orange', background='white', font=("LG Smart UI Bold", 12, "bold"), padding=10)

# Run and Reset Buttons
run_button = ttk.Button(button_frame, text="Begin Translation", style='Custom.TButton', command=start_translation, state=tk.DISABLED)
run_button.pack(side=tk.LEFT, padx=10)

reset_button = ttk.Button(button_frame, text="Reset", style='Custom.TButton', command=reset_application)
reset_button.pack(side=tk.LEFT, padx=10)

# Status Bar
status_bar = ttk.Frame(root, relief=tk.SUNKEN)
status_bar.grid(row=3, column=0, sticky=tk.W+tk.E, padx=20, pady=50)
status_label = ttk.Label(status_bar, text="Status: Waiting for both input and output language to be selected", anchor=tk.W)
status_label.pack(fill=tk.X)

root.mainloop()
python azure tkinter speech-recognition translation
1个回答
0
投票

我尝试了以下示例代码,使用 tkinter 在对着麦克风说话时将语音转换为文本。

代码:

import tkinter as tk
from tkinter import ttk
import azure.cognitiveservices.speech as speechsdk

class AudioToTextTranslator:
    def __init__(self, root):
        self.root = root
        self.root.title("Speech to Text Translator")

        dropdown_frame = ttk.Frame(root)
        dropdown_frame.grid(row=0, column=0, padx=10, pady=5)

        input_lang_label = ttk.Label(dropdown_frame, text="Select Input Language", foreground="white", background="orange", font=("LG Smart UI Bold", 24, "bold"))
        input_lang_label.pack(side=tk.LEFT)
        self.input_lang_combobox = ttk.Combobox(dropdown_frame, values=list(languages.keys()))
        self.input_lang_combobox.pack(side=tk.LEFT, padx=10)
        self.input_lang_combobox.bind('<<ComboboxSelected>>', self.update_languages)

        output_lang_label = ttk.Label(dropdown_frame, text="Select Output Language", foreground="white", background="orange", font=("LG Smart UI Bold", 24, "bold"))
        output_lang_label.pack(side=tk.LEFT)
        self.output_lang_combobox = ttk.Combobox(dropdown_frame, values=list(languages.keys()))
        self.output_lang_combobox.pack(side=tk.LEFT)
        self.output_lang_combobox.bind('<<ComboboxSelected>>', self.update_languages)

        button_frame = ttk.Frame(root)
        button_frame.grid(row=1, column=0, pady=30)

        style = ttk.Style()
        style.configure('Custom.TButton', foreground='orange', background='white', font=("LG Smart UI Bold", 12, "bold"), padding=10)

        self.run_button = ttk.Button(button_frame, text="Begin Translation", style='Custom.TButton', command=self.start_translation, state=tk.DISABLED)
        self.run_button.pack(side=tk.LEFT, padx=10)

        self.reset_button = ttk.Button(button_frame, text="Reset", style='Custom.TButton', command=self.reset_application)
        self.reset_button.pack(side=tk.LEFT, padx=10)

    def update_languages(self, event):
        self.run_button.config(state=tk.NORMAL)

    def start_translation(self):
        output_language_code = languages[self.output_lang_combobox.get()]

        speech_config = speechsdk.SpeechConfig(subscription="<speech_key>", region="<speech_region>")
        speech_config.speech_recognition_language = output_language_code

        audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
        recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

        print("Listening for audio...")
        result = recognizer.recognize_once()

        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            print("Recognized: {}".format(result.text))
        elif result.reason == speechsdk.ResultReason.NoMatch:
            print("No speech could be recognized")
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech Recognition canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))

    def reset_application(self):
        self.input_lang_combobox.current(0)
        self.output_lang_combobox.current(0)
        self.run_button.config(state=tk.DISABLED)

if __name__ == "__main__":
    languages = {"English": "en-US", "Chinese (Simplified)": "zh-CN", "Tamil": "ta-IN", "Malay": "ms-MY"}
    
    root = tk.Tk()
    app = AudioToTextTranslator(root)
    root.mainloop()

输出:

它运行成功,将语音翻译转换为文本输出,如下所示。

C:\Users\xxxxxxx\Documents\xxxxxxx>python kam.py
Listening for audio...
Recognized: Hi Kamali. How are you?
Listening for audio...
Recognized: 嗨。
Listening for audio...
Recognized: வெல்கம்.

enter image description here

© www.soinside.com 2019 - 2024. All rights reserved.