# Source: stt_test.py, added in commit cf935a16bc5662d4eb43ed1f9e6740f8daf67c46
# ("Init", Christian Rute, Mon Jan 6 18:04:01 2025 +0100).
"""Voice-to-text demo: record from the microphone and transcribe with Whisper."""

import time
import wave

import gradio as gr
import numpy as np
import sounddevice as sd
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Setup device
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load Whisper model
model_id = "openai/whisper-large-v3-turbo"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

# Audio recording settings
SAMPLE_RATE = 16000  # Whisper prefers 16 kHz
MAX_RECORDING_SECONDS = 60  # Size of the pre-allocated capture buffer
FILENAME = "recorded_audio.wav"
is_recording = False
recorded_audio = None
# time.monotonic() value taken right after the capture stream was started;
# used by stop_recording() to work out how much of the buffer holds real audio.
_recording_started_at = None


def start_recording():
    """Start a background microphone recording of up to MAX_RECORDING_SECONDS.

    Returns:
        A status message for the UI.
    """
    global is_recording, recorded_audio, _recording_started_at
    print("Recording started...")
    # Zero-initialised so that any part of the buffer the stream never reaches
    # is plain silence rather than whatever sd.rec() would have allocated.
    buffer = np.zeros((int(SAMPLE_RATE * MAX_RECORDING_SECONDS), 1), dtype=np.float32)
    # With `out=` given, sd.rec() takes frames/channels/dtype from the array.
    recorded_audio = sd.rec(out=buffer, samplerate=SAMPLE_RATE)
    _recording_started_at = time.monotonic()
    # Only flag the recording as active once the stream has really started, so
    # a failure in sd.rec() cannot leave the app stuck in a "recording" state.
    is_recording = True
    return "Recording... Click 'Stop Recording' to finish."


def stop_recording():
    """Stop the active recording, trim it to the captured length and save it.

    Returns:
        A status message for the UI ("Not recording!" if nothing is active).
    """
    global is_recording, recorded_audio
    if not is_recording:
        return "Not recording!"
    # Measure before stopping: slightly underestimating the captured length is
    # preferable to including frames the stream never wrote.
    elapsed = time.monotonic() - _recording_started_at
    sd.stop()
    is_recording = False
    print("Recording stopped.")
    # Keep only the audio that was actually captured; the buffer is always
    # MAX_RECORDING_SECONDS long, and the untouched tail would otherwise be
    # written to disk and fed to the model.
    captured_frames = min(len(recorded_audio), max(0, int(elapsed * SAMPLE_RATE)))
    recorded_audio = recorded_audio[:captured_frames]
    save_audio_to_wav(recorded_audio, FILENAME)
    return "Recording stopped. Click 'Transcribe' to see the result."
def save_audio_to_wav(audio, filename):
    """Write float audio samples to a mono 16-bit PCM WAV file.

    Args:
        audio: Array of float samples, nominally in the range [-1.0, 1.0].
        filename: Path of the WAV file to create or overwrite.
    """
    # Clip before scaling: a sample outside [-1, 1] would otherwise wrap
    # around when cast to int16 and produce loud clicks.
    pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    with wave.open(filename, "wb") as wf:
        wf.setnchannels(1)  # Mono
        wf.setsampwidth(2)  # 2 bytes per sample
        wf.setframerate(SAMPLE_RATE)
        wf.writeframes(pcm.tobytes())


def transcribe_audio():
    """Transcribe the saved recording with the Whisper pipeline.

    Returns:
        The transcribed text, or a hint for the user when no recording file
        exists yet.
    """
    import os  # Local import: the file's import block is outside this block.

    if not os.path.isfile(FILENAME):
        # Without this guard the pipeline fails on the missing file and the UI
        # only shows a generic error.
        return "No recording found. Record something first."
    print("Transcribing...")
    result = pipe(FILENAME)
    print("Transcription complete.")
    return result["text"]


# Gradio Interface
with gr.Blocks() as interface:
    gr.Markdown("# Voice to Text App")
    gr.Markdown("Click 'Start Recording' to record your voice, 'Stop Recording' to save, and 'Transcribe' to convert speech to text.")

    start_button = gr.Button("Start Recording")
    stop_button = gr.Button("Stop Recording")
    transcribe_button = gr.Button("Transcribe")

    output = gr.Textbox(label="Output")

    # Each button writes the status/result string of its handler to the textbox.
    start_button.click(start_recording, outputs=output)
    stop_button.click(stop_recording, outputs=output)
    transcribe_button.click(transcribe_audio, outputs=output)

if __name__ == "__main__":
    interface.launch()