import os
import time
import wave

import numpy as np
import sounddevice as sd
import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Setup device: use GPU with half precision when available, otherwise CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load the Whisper model and wrap it in an ASR pipeline.
model_id = "openai/whisper-large-v3-turbo"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

# Audio recording settings
SAMPLE_RATE = 16000  # Whisper expects 16 kHz input
MAX_SECONDS = 60     # upper bound on a single recording
FILENAME = "recorded_audio.wav"
is_recording = False
start_time = None    # track when the recording started
recorded_audio = None


def start_recording():
    """Starts recording audio into a preallocated 60-second buffer."""
    global is_recording, recorded_audio, start_time
    if is_recording:
        return "Already recording!"
    is_recording = True
    start_time = time.time()
    print("Recording started...")
    # sd.rec() returns immediately and fills the buffer in the background.
    recorded_audio = sd.rec(
        int(SAMPLE_RATE * MAX_SECONDS),
        samplerate=SAMPLE_RATE,
        channels=1,
        dtype=np.float32,
    )
    return "Recording... Click 'Stop Recording' to finish."


def stop_recording():
    """Stops recording audio and saves it to a WAV file."""
    global is_recording, start_time
    if not is_recording:
        return "Not recording!"
    sd.stop()
    is_recording = False
    elapsed_time = time.time() - start_time
    print(f"Recording stopped. Duration: {elapsed_time:.2f} seconds.")
    # Truncate the buffer to the actual duration (slicing past the end is
    # harmless if the recording hit the 60-second cap).
    save_audio_to_wav(recorded_audio[: int(SAMPLE_RATE * elapsed_time)], FILENAME)
    return "Recording stopped. Click 'Transcribe' to see the result."


def save_audio_to_wav(audio, filename):
    """Saves float32 audio data to a 16-bit PCM WAV file."""
    # Clip to [-1, 1] first so out-of-range samples don't wrap around
    # when converted to int16.
    audio = np.clip(audio, -1.0, 1.0)
    audio = (audio * 32767).astype(np.int16)
    with wave.open(filename, "wb") as wf:
        wf.setnchannels(1)    # mono
        wf.setsampwidth(2)    # 2 bytes per sample (16-bit)
        wf.setframerate(SAMPLE_RATE)
        wf.writeframes(audio.tobytes())


def transcribe_audio():
    """Transcribes the saved audio file using Whisper."""
    if not os.path.exists(FILENAME):
        return "No recording found. Record something first!"
    print("Transcribing...")
    result = pipe(FILENAME)
    print("Transcription complete.")
    return result["text"]


# Gradio interface
with gr.Blocks() as interface:
    gr.Markdown("# Voice to Text App")
    gr.Markdown(
        "Click 'Start Recording' to record your voice, 'Stop Recording' to save, "
        "and 'Transcribe' to convert speech to text."
    )
    start_button = gr.Button("Start Recording")
    stop_button = gr.Button("Stop Recording")
    transcribe_button = gr.Button("Transcribe")
    output = gr.Textbox(label="Output")

    start_button.click(start_recording, outputs=output)
    stop_button.click(stop_recording, outputs=output)
    transcribe_button.click(transcribe_audio, outputs=output)

if __name__ == "__main__":
    interface.launch()
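
# Note (an optional sketch, not part of the original app): Whisper models
# transcribe audio in 30-second windows, so recordings approaching the
# 60-second buffer cap above may be handled more gracefully by letting the
# transformers ASR pipeline chunk the input itself. The pipeline accepts a
# `chunk_length_s` argument at call time, e.g.:
#
#     result = pipe(FILENAME, chunk_length_s=30)
#
# Swapping this into transcribe_audio() is a judgment call; for clips well
# under 30 seconds the plain pipe(FILENAME) call used above is sufficient.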