Init

2025-01-06 18:04:01 +01:00 · 2025-01-06 18:04:01 +01:00 · cf935a16bc
commit cf935a16bc
1 changed files with 93 additions and 0 deletions
--- a/stt_test.py
+++ b/stt_test.py
@ -0,0 +1,93 @@
+import torch
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+import sounddevice as sd
+import numpy as np
+import wave
+import gradio as gr
+
+# Choose the compute target and matching precision: half precision on CUDA
+# device 0 when CUDA is available, full precision on the CPU otherwise.
+if torch.cuda.is_available():
+    device, torch_dtype = "cuda:0", torch.float16
+else:
+    device, torch_dtype = "cpu", torch.float32
+
+# Whisper checkpoint used for both the model and its processor.
+model_id = "openai/whisper-large-v3-turbo"
+
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    model_id,
+    torch_dtype=torch_dtype,
+    low_cpu_mem_usage=True,
+    use_safetensors=True,
+)
+model.to(device)
+
+# The processor supplies the tokenizer and feature extractor handed to the pipeline.
+processor = AutoProcessor.from_pretrained(model_id)
+
+# Speech-recognition pipeline assembled from the pieces loaded above.
+_asr_components = {
+    "model": model,
+    "tokenizer": processor.tokenizer,
+    "feature_extractor": processor.feature_extractor,
+    "torch_dtype": torch_dtype,
+    "device": device,
+}
+pipe = pipeline("automatic-speech-recognition", **_asr_components)
+
+# Audio recording settings
+FILENAME = "recorded_audio.wav"  # WAV file written on stop and read on transcribe
+SAMPLE_RATE = 16000  # Hz; Whisper prefers 16 kHz
+
+# Mutable recording state shared by the start/stop handlers.
+recorded_audio = None  # buffer returned by sd.rec(); None until a recording starts
+is_recording = False  # True while a recording is in progress
+
+
+MAX_RECORDING_SECONDS = 60  # length of the buffer handed to sd.rec()
+_recording_started_at = None  # time.monotonic() reading taken when the current take began
+
+
+def start_recording():
+    """Start capturing microphone audio into a fresh buffer.
+
+    ``sd.rec`` is called in its default non-blocking mode, so this returns
+    immediately. Capture ends when ``stop_recording`` is called or, at the
+    latest, when the ``MAX_RECORDING_SECONDS`` buffer is full.
+
+    Returns:
+        Status message for the UI.
+    """
+    import time
+
+    global is_recording, recorded_audio, _recording_started_at
+    frames = int(SAMPLE_RATE * MAX_RECORDING_SECONDS)
+    # Hand sd.rec a zero-filled buffer so any part it never writes is silence.
+    # When `out` is given, sounddevice takes frames/channels/dtype from it.
+    buffer = np.zeros((frames, 1), dtype=np.float32)
+    recorded_audio = sd.rec(frames, samplerate=SAMPLE_RATE, channels=1, dtype=np.float32, out=buffer)
+    _recording_started_at = time.monotonic()
+    # Set the flag only once the stream has started, so a device error above
+    # does not leave the app believing a recording is in progress.
+    is_recording = True
+    print("Recording started...")
+    return "Recording... Click 'Stop Recording' to finish."
+
+
+def stop_recording():
+    """Stop the active recording and save the captured part to ``FILENAME``.
+
+    Only the samples that fit in the time elapsed since ``start_recording``
+    are written, rather than the whole pre-allocated buffer.
+
+    Returns:
+        Status message for the UI; "Not recording!" when nothing is active.
+    """
+    import time
+
+    global is_recording
+    if not is_recording:
+        return "Not recording!"
+    elapsed = time.monotonic() - _recording_started_at
+    sd.stop()
+    is_recording = False
+    print("Recording stopped.")
+    # Wall-clock time approximates how many samples were captured; cap it at
+    # the buffer length for recordings that ran until the buffer was full.
+    captured = min(len(recorded_audio), int(elapsed * SAMPLE_RATE))
+    save_audio_to_wav(recorded_audio[:captured], FILENAME)
+    return "Recording stopped. Click 'Transcribe' to see the result."
+
+
+def save_audio_to_wav(audio, filename):
+    """Write floating-point audio samples to a mono 16-bit PCM WAV file.
+
+    The file is written at ``SAMPLE_RATE``.
+
+    Args:
+        audio: NumPy array of samples, nominally in the range [-1.0, 1.0].
+        filename: Path of the WAV file to create or overwrite.
+    """
+    # Clamp before scaling: a sample outside [-1.0, 1.0] would otherwise
+    # overflow the int16 conversion instead of saturating at full scale.
+    pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
+    with wave.open(filename, 'w') as wf:
+        wf.setnchannels(1)  # Mono
+        wf.setsampwidth(2)  # 2 bytes per sample
+        wf.setframerate(SAMPLE_RATE)
+        wf.writeframes(pcm.tobytes())
+
+
+def transcribe_audio():
+    """Transcribe the recording saved at ``FILENAME`` with the Whisper pipeline.
+
+    Returns:
+        The transcribed text, or a status message when no recording has
+        been saved yet.
+    """
+    import os
+
+    # Without this guard, clicking 'Transcribe' before anything was recorded
+    # hands the pipeline a path that does not exist.
+    if not os.path.exists(FILENAME):
+        return "No recording found. Record something first."
+    print("Transcribing...")
+    # Recordings can run longer than Whisper's 30-second window. Requesting
+    # timestamps explicitly selects long-form decoding; the "text" key is
+    # still present in the result.
+    # NOTE(review): some transformers releases reject long input without
+    # this flag - confirm against the installed version.
+    result = pipe(FILENAME, return_timestamps=True)
+    print("Transcription complete.")
+    return result["text"]
+
+
+# Gradio UI: a heading, usage instructions, one button per action and a
+# single textbox that shows both status messages and the transcription.
+with gr.Blocks() as interface:
+    gr.Markdown("# Voice to Text App")
+    gr.Markdown("Click 'Start Recording' to record your voice, 'Stop Recording' to save, and 'Transcribe' to convert speech to text.")
+
+    start_button = gr.Button("Start Recording")
+    stop_button = gr.Button("Stop Recording")
+    transcribe_button = gr.Button("Transcribe")
+
+    output = gr.Textbox(label="Output")
+
+    # Each handler's return value is displayed in the shared textbox.
+    for button, handler in (
+        (start_button, start_recording),
+        (stop_button, stop_recording),
+        (transcribe_button, transcribe_audio),
+    ):
+        button.click(handler, outputs=output)
+
+# Launch the app only when this file is run as a script.
+if __name__ == "__main__":
+    interface.launch()