From 33e6f229d348779aab623f518148c30079ecbe36 Mon Sep 17 00:00:00 2001
From: mrsobakin <68982655+mrsobakin@users.noreply.github.com>
Date: Tue, 24 Feb 2026 16:49:04 +0300
Subject: [PATCH] feat: audio normalization

Add Transcriber._normalize_rms and call it from _encode_opus on the
float samples before they are written to the output buffer.

The helper scales the signal so that its RMS equals target_rms
(default 0.1) and clips the scaled samples to [-1.0, 1.0]. Input whose
RMS is not above 1e-8 is returned unscaled, which avoids dividing by a
near-zero RMS.

Bump the package version from 1.2.6 to 1.2.7.

---
 pyproject.toml            | 2 +-
 src/speechd/transcribe.py | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 78c993f..9ac9ea0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "speechd"
-version = "1.2.6"
+version = "1.2.7"
 description = "Speech-to-Text daemon with Groq Whisper API"
 readme = "README.md"
 requires-python = ">=3.11"
diff --git a/src/speechd/transcribe.py b/src/speechd/transcribe.py
index a5ab261..d3d6eb0 100644
--- a/src/speechd/transcribe.py
+++ b/src/speechd/transcribe.py
@@ -47,8 +47,16 @@ class Transcriber:
             logger.error(f"Transcription failed: {e}")
             return TranscriptionResult(text="", success=False, error=str(e))
 
+    def _normalize_rms(self, audio: np.ndarray, target_rms: float = 0.1) -> np.ndarray:
+        rms = np.sqrt(np.mean(audio**2))
+        if rms > 1e-8:
+            audio = audio * (target_rms / rms)
+            np.clip(audio, -1.0, 1.0, out=audio)
+        return audio
+
     def _encode_opus(self, audio_data: np.ndarray) -> bytes:
         audio_float = audio_data.astype(np.float32) / 32768.0
+        audio_float = self._normalize_rms(audio_float)
         buf = io.BytesIO()
         sf.write(
             buf,