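Add support for dynamic speech length: record until stopped instead of into a fixed 60-second buffer, and stream intermediate transcription results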

This commit is contained in:
Christian Rute 2025-01-16 17:00:39 +01:00
parent 19145ca1a4
commit dba250b798

31
app.py
View File

@ -45,7 +45,17 @@ def start_recording():
is_recording = True is_recording = True
start_time = time.time() # Record the start time start_time = time.time() # Record the start time
print("Recording started...") print("Recording started...")
recorded_audio = sd.rec(int(SAMPLE_RATE * 60), samplerate=SAMPLE_RATE, channels=1, dtype=np.float32) recorded_audio = [] # Start with an empty list to store audio chunks
# Start recording continuously
def callback(indata, frames, time, status):
if is_recording: # Append audio data only while recording
recorded_audio.append(indata.copy())
stream = sd.InputStream(
samplerate=SAMPLE_RATE, channels=1, dtype=np.float32, callback=callback
)
stream.start() # Start the stream
return "Recording... Click 'Stop Recording' to finish." return "Recording... Click 'Stop Recording' to finish."
@ -54,12 +64,15 @@ def stop_recording():
global is_recording, recorded_audio, start_time global is_recording, recorded_audio, start_time
if not is_recording: if not is_recording:
return "Not recording!" return "Not recording!"
sd.stop()
is_recording = False is_recording = False
elapsed_time = time.time() - start_time # Calculate elapsed time elapsed_time = time.time() - start_time # Calculate elapsed time
print(f"Recording stopped. Duration: {elapsed_time:.2f} seconds.") print(f"Recording stopped. Duration: {elapsed_time:.2f} seconds.")
save_audio_to_wav(recorded_audio[:int(SAMPLE_RATE * elapsed_time)], FILENAME) # Truncate to actual duration
return "Recording stopped. Click 'Transcribe' to see the result." # Combine all recorded chunks into a single array
audio_data = np.concatenate(recorded_audio, axis=0)
save_audio_to_wav(audio_data[: int(SAMPLE_RATE * elapsed_time)], FILENAME)
return f"Recording stopped. Duration: {elapsed_time:.2f} seconds. Click 'Transcribe' to see the result."
def save_audio_to_wav(audio, filename): def save_audio_to_wav(audio, filename):
@ -99,19 +112,25 @@ def transcribe_audio():
chunk_filename = f"chunk_{i}.wav" chunk_filename = f"chunk_{i}.wav"
chunk.export(chunk_filename, format="wav") chunk.export(chunk_filename, format="wav")
print(f"Transcribing chunk {i + 1}/{len(chunks)}...") print(f"Transcribing chunk {i + 1}/{len(chunks)}...")
# Transcribe the chunk
result = pipe(chunk_filename) result = pipe(chunk_filename)
transcription.append(result["text"]) transcription.append(result["text"])
# Clean up temporary chunk file # Clean up temporary chunk file
os.remove(chunk_filename) os.remove(chunk_filename)
# Stream intermediate transcription
yield f"{' '.join(transcription)}"
print("Transcription complete.") print("Transcription complete.")
return " ".join(transcription) yield f"{' '.join(transcription)}"
else: else:
print(f"Audio is short enough ({duration:.2f} seconds). Transcribing directly...") print(f"Audio is short enough ({duration:.2f} seconds). Transcribing directly...")
result = pipe(FILENAME) result = pipe(FILENAME)
print("Transcription complete.") print("Transcription complete.")
return result["text"] yield f"{result['text']}"
# Gradio Interface # Gradio Interface