Init

2025-01-06 18:04:01 +01:00 · 2025-01-06 18:04:01 +01:00 · cf935a16bc
commit cf935a16bc
1 changed files with 93 additions and 0 deletions
--- a/stt_test.py
+++ b/stt_test.py
@ -0,0 +1,93 @@
 import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import sounddevice as sd
 import numpy as np
 import wave
 import gradio as gr
 # Choose the compute device together with a matching precision:
 # float16 on the first CUDA GPU when one is available, float32 on the CPU.
 if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
 else:
    device, torch_dtype = "cpu", torch.float32
 # Load the Whisper checkpoint and move it onto the selected device.
 model_id = "openai/whisper-large-v3-turbo"
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
 )
 model.to(device)
 # The processor supplies both the tokenizer and the feature extractor
 # that the speech-recognition pipeline is assembled from.
 processor = AutoProcessor.from_pretrained(model_id)
 pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
 )
 # Audio capture configuration and the recording state shared by the handlers.
 FILENAME = "recorded_audio.wav"  # WAV file the recording is saved to and transcribed from
 SAMPLE_RATE = 16000  # Whisper prefers 16 kHz
 recorded_audio = None  # capture buffer of the latest recording; None until one is started
 is_recording = False  # True while a recording is in progress
 # Longest single recording, in seconds; the capture buffer is sized for it.
 MAX_RECORD_SECONDS = 60
 # time.monotonic() reading taken when the current recording began; None when idle.
 _recording_started_at = None
 def start_recording():
    """Start a non-blocking microphone recording.

    Allocates a zero-filled mono float32 buffer large enough for
    MAX_RECORD_SECONDS of audio and lets sounddevice fill it in the
    background. The start time is remembered so stop_recording() can trim
    the buffer to the part that was actually recorded.

    Returns:
        A status message for the UI.
    """
    import time  # function-scope import: not part of the file's import block
    global is_recording, recorded_audio, _recording_started_at
    print("Recording started...")
    # Pre-zeroed buffer: anything the stream never writes is plain silence
    # rather than whatever happened to be in freshly allocated memory.
    capture = np.zeros((int(SAMPLE_RATE * MAX_RECORD_SECONDS), 1), dtype=np.float32)
    # With `out` given, sounddevice takes frames/channels/dtype from the array.
    recorded_audio = sd.rec(samplerate=SAMPLE_RATE, out=capture)
    _recording_started_at = time.monotonic()
    # Flip the flag only once the stream really started, so a failing
    # sd.rec() does not leave the app stuck in the "recording" state.
    is_recording = True
    return "Recording... Click 'Stop Recording' to finish."
 def stop_recording():
    """Stop the running recording, trim it to its real length and save it.

    sd.rec() hands back the full-size buffer immediately, so without
    trimming every saved file would last MAX_RECORD_SECONDS no matter when
    the user clicked stop. The number of recorded frames is estimated from
    the elapsed wall-clock time and capped at the buffer length.

    Returns:
        A status message for the UI ("Not recording!" when nothing is running).
    """
    import time  # function-scope import: not part of the file's import block
    global is_recording, recorded_audio, _recording_started_at
    if not is_recording:
        return "Not recording!"
    elapsed = time.monotonic() - _recording_started_at
    sd.stop()
    is_recording = False
    _recording_started_at = None
    print("Recording stopped.")
    # Keep only the frames that can have been captured in the elapsed time.
    captured_frames = min(len(recorded_audio), max(0, int(elapsed * SAMPLE_RATE)))
    recorded_audio = recorded_audio[:captured_frames]
    save_audio_to_wav(recorded_audio, FILENAME)
    return "Recording stopped. Click 'Transcribe' to see the result."
 def save_audio_to_wav(audio, filename, sample_rate=None):
    """Write floating-point audio to a mono 16-bit PCM WAV file.

    Samples are expected in the range [-1.0, 1.0]. Values outside that
    range are clipped instead of wrapping around in the int16 conversion,
    and NaN / infinite samples are replaced by 0 / full scale.

    Args:
        audio: Array-like of float samples (e.g. shape (frames, 1)).
        filename: Path of the WAV file to create or overwrite.
        sample_rate: Frame rate written to the WAV header. Defaults to the
            module-level SAMPLE_RATE when omitted.
    """
    if sample_rate is None:
        sample_rate = SAMPLE_RATE
    samples = np.nan_to_num(np.asarray(audio), nan=0.0, posinf=1.0, neginf=-1.0)
    samples = np.clip(samples, -1.0, 1.0)
    # WAV PCM data is little-endian; "<i2" makes that explicit on any host.
    pcm = (samples * 32767).astype("<i2")  # Convert to 16-bit PCM format
    with wave.open(filename, "wb") as wf:
        wf.setnchannels(1)  # Mono
        wf.setsampwidth(2)  # 2 bytes per sample
        wf.setframerate(sample_rate)
        wf.writeframes(pcm.tobytes())
 def transcribe_audio():
    """Transcribe the saved recording with the Whisper pipeline.

    Returns:
        The recognized text, or a hint message when no recording file
        exists yet.
    """
    import os  # function-scope import: not part of the file's import block
    if not os.path.isfile(FILENAME):
        # Nothing has been recorded and saved yet; tell the user instead of
        # letting the pipeline fail on a missing file.
        return "No recording found. Record something first!"
    print("Transcribing...")
    # Recordings can exceed Whisper's 30-second window; long-form generation
    # needs timestamp prediction enabled. The result still carries "text".
    result = pipe(FILENAME, return_timestamps=True)
    print("Transcription complete.")
    return result["text"]
 # Gradio Interface: three buttons that all report into one shared textbox.
 with gr.Blocks() as interface:
    gr.Markdown("# Voice to Text App")
    gr.Markdown("Click 'Start Recording' to record your voice, 'Stop Recording' to save, and 'Transcribe' to convert speech to text.")
    start_button = gr.Button("Start Recording")
    stop_button = gr.Button("Stop Recording")
    transcribe_button = gr.Button("Transcribe")
    output = gr.Textbox(label="Output")
    # Register each handler on its button; every handler returns the text
    # that is shown in the output box.
    for button, handler in (
        (start_button, start_recording),
        (stop_button, stop_recording),
        (transcribe_button, transcribe_audio),
    ):
        button.click(handler, outputs=output)
 # Start the web UI only when the file is executed as a script.
 if __name__ == "__main__":
    interface.launch()