sprachText/stt_test.py
2025-01-06 18:22:54 +01:00

132 lines
4.3 KiB
Python

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import sounddevice as sd
import numpy as np
import wave
import gradio as gr
import time
from pydub import AudioSegment
import os
# Choose the compute device and a matching precision: half precision on the
# first CUDA GPU when one is available, full precision on the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

# Load the Whisper checkpoint and move it onto the selected device.
model_id = "openai/whisper-large-v3-turbo"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies the tokenizer and feature extractor that the
# speech-recognition pipeline is built from.
processor = AutoProcessor.from_pretrained(model_id)
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

# Audio recording settings.
SAMPLE_RATE = 16000  # Whisper prefers 16 kHz
FILENAME = "recorded_audio.wav"  # where the finished recording is saved

# Mutable recorder state shared by start_recording() and stop_recording().
is_recording = False
start_time = None  # time.time() value captured when recording starts
recorded_audio = None  # buffer returned by sd.rec() once recording starts
def start_recording():
    """Begin capturing microphone audio into the module-level buffer.

    Flags the recorder as active, stores the current wall-clock time in
    ``start_time`` and asks sounddevice for a mono float32 buffer large
    enough for 60 seconds of audio at ``SAMPLE_RATE``. The buffer is kept
    in ``recorded_audio`` so that ``stop_recording`` can trim and save it.

    Returns:
        A status message for the UI.
    """
    global is_recording, recorded_audio, start_time

    is_recording = True
    start_time = time.time()
    print("Recording started...")

    # The buffer has a fixed size: 60 seconds' worth of frames.
    max_frames = int(SAMPLE_RATE * 60)
    recorded_audio = sd.rec(
        max_frames,
        samplerate=SAMPLE_RATE,
        channels=1,
        dtype=np.float32,
    )
    return "Recording... Click 'Stop Recording' to finish."
def stop_recording():
    """Stop the active recording and write the captured audio to FILENAME.

    Does nothing but report when no recording is in progress. Otherwise it
    stops sounddevice, clears the ``is_recording`` flag, measures the time
    elapsed since ``start_time`` and saves only the leading part of the
    buffer that corresponds to that elapsed time.

    Returns:
        A status message for the UI.
    """
    global is_recording, recorded_audio, start_time

    if not is_recording:
        return "Not recording!"

    sd.stop()
    is_recording = False

    elapsed_time = time.time() - start_time
    print(f"Recording stopped. Duration: {elapsed_time:.2f} seconds.")

    # The buffer was allocated for the maximum duration; keep only the
    # frames covering the time that actually elapsed.
    frames_captured = int(SAMPLE_RATE * elapsed_time)
    captured = recorded_audio[:frames_captured]
    save_audio_to_wav(captured, FILENAME)

    return "Recording stopped. Click 'Transcribe' to see the result."
def save_audio_to_wav(audio, filename, sample_rate=None):
    """Write floating-point audio samples to a mono 16-bit PCM WAV file.

    Args:
        audio: NumPy array of float samples, nominally in [-1.0, 1.0].
            Samples outside that range are clipped to it; without clipping
            the int16 conversion would overflow and wrap around, turning
            loud peaks into harsh artifacts.
        filename: Destination path of the WAV file.
        sample_rate: Frame rate in Hz written to the WAV header. Defaults
            to the module-level ``SAMPLE_RATE`` when omitted.
    """
    if sample_rate is None:
        sample_rate = SAMPLE_RATE

    # Clamp before scaling so the cast to 16-bit integers cannot overflow.
    clipped = np.clip(audio, -1.0, 1.0)
    # WAV PCM data is little-endian; "<i2" makes that explicit instead of
    # relying on the machine's native byte order.
    pcm = (clipped * 32767).astype("<i2")

    with wave.open(filename, "wb") as wf:
        wf.setnchannels(1)  # Mono
        wf.setsampwidth(2)  # 2 bytes per sample
        wf.setframerate(sample_rate)
        wf.writeframes(pcm.tobytes())
def get_audio_duration(filename):
    """Return the length of the WAV file at ``filename`` in seconds.

    The file is loaded with pydub's ``AudioSegment.from_wav`` and its
    length is divided by 1000 to obtain seconds.
    """
    segment = AudioSegment.from_wav(filename)
    length_ms = len(segment)
    return length_ms / 1000
def split_audio(filename, chunk_length_ms=30000):
    """Cut a WAV file into consecutive pieces of ``chunk_length_ms`` each.

    Args:
        filename: Path of the WAV file to load.
        chunk_length_ms: Length of every piece; the last piece holds
            whatever remains and may be shorter.

    Returns:
        A list of the audio slices, in order.
    """
    segment = AudioSegment.from_wav(filename)
    pieces = []
    for offset in range(0, len(segment), chunk_length_ms):
        pieces.append(segment[offset:offset + chunk_length_ms])
    return pieces
def transcribe_audio():
    """Transcribe the saved recording (FILENAME) with the Whisper pipeline.

    Recordings of at most 30 seconds are passed to the pipeline directly.
    Longer recordings are split into pieces with ``split_audio``; each
    piece is exported to a temporary WAV file, transcribed, and the texts
    are joined with single spaces.

    Returns:
        The transcribed text, or a short notice when no recording has been
        saved yet (for example when 'Transcribe' is clicked before any
        recording was made).
    """
    # Without this guard a missing file surfaces as an exception in the UI.
    if not os.path.exists(FILENAME):
        return "No recording found. Record and stop a recording first."

    print("Checking audio duration...")
    duration = get_audio_duration(FILENAME)

    if duration <= 30:
        print(f"Audio is short enough ({duration:.2f} seconds). Transcribing directly...")
        result = pipe(FILENAME)
        print("Transcription complete.")
        return result["text"]

    print(f"Audio is too long ({duration:.2f} seconds). Splitting into chunks...")
    chunks = split_audio(FILENAME)
    transcription = []
    for i, chunk in enumerate(chunks):
        chunk_filename = f"chunk_{i}.wav"
        try:
            chunk.export(chunk_filename, format="wav")
            print(f"Transcribing chunk {i + 1}/{len(chunks)}...")
            result = pipe(chunk_filename)
        finally:
            # Remove the temporary chunk file even if export or
            # transcription failed, so stale files do not pile up.
            if os.path.exists(chunk_filename):
                os.remove(chunk_filename)
        transcription.append(result["text"])

    print("Transcription complete.")
    return " ".join(transcription)
# Gradio interface: three buttons drive the record -> save -> transcribe flow,
# and every handler reports its status or result into one shared textbox.
with gr.Blocks() as interface:
    gr.Markdown("# Voice to Text App")
    gr.Markdown(
        "Click 'Start Recording' to record your voice, "
        "'Stop Recording' to save, "
        "and 'Transcribe' to convert speech to text."
    )

    start_button = gr.Button("Start Recording")
    stop_button = gr.Button("Stop Recording")
    transcribe_button = gr.Button("Transcribe")
    output = gr.Textbox(label="Output")

    # Wire each button to its handler; none of the handlers take inputs.
    start_button.click(fn=start_recording, outputs=output)
    stop_button.click(fn=stop_recording, outputs=output)
    transcribe_button.click(fn=transcribe_audio, outputs=output)

# Launch the web UI only when the file is run as a script.
if __name__ == "__main__":
    interface.launch()