import os
import tempfile

from pydub import AudioSegment

# Longest clip, in milliseconds, that is handed to the pipeline in one call.
# Used both as the "needs splitting" threshold and as the chunk size so the
# two values cannot drift apart.
_MAX_CHUNK_MS = 30000


def get_audio_duration(filename):
    """Return the duration of a WAV file in seconds.

    Args:
        filename: Path of the WAV file to inspect.

    Returns:
        The length reported by pydub (milliseconds) divided by 1000.
    """
    audio = AudioSegment.from_wav(filename)
    return len(audio) / 1000  # Convert milliseconds to seconds


def _slice_segment(audio, chunk_length_ms):
    """Cut an already-loaded AudioSegment into consecutive fixed-length pieces."""
    if chunk_length_ms <= 0:
        # range() would raise an unrelated-looking error for 0 and silently
        # yield nothing for negative steps; fail loudly instead.
        raise ValueError(
            f"chunk_length_ms must be positive, got {chunk_length_ms}"
        )
    return [
        audio[start:start + chunk_length_ms]
        for start in range(0, len(audio), chunk_length_ms)
    ]


def split_audio(filename, chunk_length_ms=30000):
    """Split a WAV file into consecutive chunks.

    Args:
        filename: Path of the WAV file to split.
        chunk_length_ms: Length of each chunk in milliseconds. The final
            chunk is shorter when the audio length is not an exact multiple.

    Returns:
        A list of AudioSegment slices in playback order (empty for
        zero-length audio).

    Raises:
        ValueError: If ``chunk_length_ms`` is not positive.
    """
    return _slice_segment(AudioSegment.from_wav(filename), chunk_length_ms)


def transcribe_audio():
    """Transcribe the recording stored at ``FILENAME`` using Whisper.

    Audio of at most 30 seconds is passed to ``pipe`` directly. Longer audio
    is cut into 30-second chunks, each chunk is written to a temporary WAV
    file and transcribed on its own, and the chunk texts are joined with
    single spaces.

    Returns:
        The transcribed text.
    """
    print("Checking audio duration...")
    # Decode the file once and reuse it for both the length check and the
    # split, instead of loading it from disk twice.
    audio = AudioSegment.from_wav(FILENAME)
    duration = len(audio) / 1000

    if len(audio) <= _MAX_CHUNK_MS:
        print(f"Audio is short enough ({duration:.2f} seconds). Transcribing directly...")
        result = pipe(FILENAME)
        print("Transcription complete.")
        return result["text"]

    print(f"Audio is too long ({duration:.2f} seconds). Splitting into chunks...")
    chunks = _slice_segment(audio, _MAX_CHUNK_MS)
    transcription = []

    # A private temporary directory keeps chunk files out of the working
    # directory (no clobbering of an existing chunk_N.wav, no clashes between
    # concurrent runs) and is removed even if export or transcription raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for i, chunk in enumerate(chunks):
            chunk_filename = os.path.join(tmp_dir, f"chunk_{i}.wav")
            # export() returns the output file object; close it right away so
            # the data is flushed and the handle is released before reading.
            chunk.export(chunk_filename, format="wav").close()
            print(f"Transcribing chunk {i + 1}/{len(chunks)}...")
            result = pipe(chunk_filename)
            transcription.append(result["text"])

    print("Transcription complete.")
    return " ".join(transcription)