import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import sounddevice as sd
import numpy as np
import wave
import gradio as gr
import time
from pydub import AudioSegment
import os

# Setup device
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load Whisper model
model_id = "openai/whisper-large-v3-turbo"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

# Audio recording settings
SAMPLE_RATE = 16000  # Whisper prefers 16 kHz
FILENAME = "recorded_audio.wav"

is_recording = False
start_time = None  # Track the recording start time
recorded_audio = None


def start_recording():
    """Starts recording audio."""
    global is_recording, recorded_audio, start_time
    is_recording = True
    start_time = time.time()  # Record the start time
    print("Recording started...")
    recorded_audio = sd.rec(int(SAMPLE_RATE * 60), samplerate=SAMPLE_RATE, channels=1, dtype=np.float32)
    return "Recording... Click 'Stop Recording' to finish."


def stop_recording():
    """Stops recording audio and saves it."""
    global is_recording, recorded_audio, start_time
    if not is_recording:
        return "Not recording!"
    sd.stop()
    is_recording = False
    elapsed_time = time.time() - start_time  # Calculate elapsed time
    print(f"Recording stopped. Duration: {elapsed_time:.2f} seconds.")
    save_audio_to_wav(recorded_audio[:int(SAMPLE_RATE * elapsed_time)], FILENAME)  # Truncate to actual duration
    return "Recording stopped. Click 'Transcribe' to see the result."


def save_audio_to_wav(audio, filename):
    """Saves audio data to a WAV file."""
    audio = (audio * 32767).astype(np.int16)  # Convert to 16-bit PCM format
    with wave.open(filename, 'w') as wf:
        wf.setnchannels(1)   # Mono
        wf.setsampwidth(2)   # 2 bytes per sample
        wf.setframerate(SAMPLE_RATE)
        wf.writeframes(audio.tobytes())


def get_audio_duration(filename):
    """Returns the duration of the audio file in seconds."""
    audio = AudioSegment.from_wav(filename)
    return len(audio) / 1000


def split_audio(filename, chunk_length_ms=30000):
    """Splits an audio file into chunks."""
    audio = AudioSegment.from_wav(filename)
    chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
    return chunks


def transcribe_audio():
    """Transcribes the audio file using Whisper."""
    print("Checking audio duration...")
    duration = get_audio_duration(FILENAME)
    if duration > 30:
        print(f"Audio is too long ({duration:.2f} seconds). Splitting into chunks...")
        chunks = split_audio(FILENAME)
        transcription = []
        for i, chunk in enumerate(chunks):
            chunk_filename = f"chunk_{i}.wav"
            chunk.export(chunk_filename, format="wav")
            print(f"Transcribing chunk {i + 1}/{len(chunks)}...")
            result = pipe(chunk_filename)
            transcription.append(result["text"])
            # Clean up temporary chunk file
            os.remove(chunk_filename)
        print("Transcription complete.")
        return " ".join(transcription)
    else:
        print(f"Audio is short enough ({duration:.2f} seconds). Transcribing directly...")
        result = pipe(FILENAME)
        print("Transcription complete.")
        return result["text"]


# Gradio Interface
with gr.Blocks() as interface:
    gr.Markdown("# Voice to Text App")
    gr.Markdown("Click 'Start Recording' to record your voice, 'Stop Recording' to save, and 'Transcribe' to convert speech to text.")

    start_button = gr.Button("Start Recording")
    stop_button = gr.Button("Stop Recording")
    transcribe_button = gr.Button("Transcribe")
    output = gr.Textbox(label="Output")

    start_button.click(start_recording, outputs=output)
    stop_button.click(stop_recording, outputs=output)
    transcribe_button.click(transcribe_audio, outputs=output)

if __name__ == "__main__":
    interface.launch()
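
# ------------------------------------------------------------------------------
# Optional alternative to the manual pydub splitting above (a sketch, not part of
# the app's control flow): the Hugging Face ASR pipeline can also chunk long
# audio internally when given `chunk_length_s`. The 30 s value mirrors Whisper's
# input window; `batch_size=8` is only an illustrative choice and should be tuned
# to available GPU memory.
#
#     result = pipe(FILENAME, chunk_length_s=30, batch_size=8)
#     print(result["text"])
# ------------------------------------------------------------------------------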