From 617d9cc0fc40141af828c6d813160fbafcfdc8e8 Mon Sep 17 00:00:00 2001 From: "Arthur K." Date: Sat, 28 Feb 2026 11:24:34 +0300 Subject: [PATCH] feat: compressor, more status verbosity, multiple ways to stop --- README.md | 1 + pyproject.toml | 1 + scripts/speechd-toggle | 26 +++++---- src/speechd/audio.py | 1 + src/speechd/audio_processor.py | 100 +++++++++++++++++++++++++++++++++ src/speechd/daemon.py | 76 +++++++++++++++++++------ src/speechd/transcribe.py | 13 ++--- uv.lock | 13 ++++- 8 files changed, 195 insertions(+), 36 deletions(-) create mode 100644 README.md create mode 100644 src/speechd/audio_processor.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..fb6ac8f --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +https://github.com/mrsobakin/speechd/ diff --git a/pyproject.toml b/pyproject.toml index d6fa7d8..8b7ae45 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ license = { text = "MIT" } dependencies = [ "groq>=0.4.0", "numpy>=1.24.0", + "packaging>=26.0", "sounddevice>=0.4.6", "soundfile>=0.12.0", "torch>=2.0.0", diff --git a/scripts/speechd-toggle b/scripts/speechd-toggle index 5e8aca9..b37212c 100755 --- a/scripts/speechd-toggle +++ b/scripts/speechd-toggle @@ -1,15 +1,21 @@ #!/bin/bash + PIDFILE="${XDG_RUNTIME_DIR:-/tmp}/speechd.pid" -if [ -f "$PIDFILE" ]; then - PID=$(cat "$PIDFILE") - if kill -0 "$PID" 2>/dev/null; then - kill -USR1 "$PID" - else - echo "Daemon not running (stale pidfile)" >&2 - exit 1 - fi -else +[ -f "$PIDFILE" ] || { echo "Daemon not running (no pidfile)" >&2 exit 1 -fi +} + +PID=$(cat "$PIDFILE") + +kill -0 "$PID" 2>/dev/null || { + echo "Daemon not running (stale pidfile)" >&2 + exit 1 +} + +case "$1" in + "-n") kill -USR2 "$PID";; + "-a") kill -ALRM "$PID";; + *) kill -USR1 "$PID";; +esac diff --git a/src/speechd/audio.py b/src/speechd/audio.py index a4af034..a6e35ab 100644 --- a/src/speechd/audio.py +++ b/src/speechd/audio.py @@ -21,6 +21,7 @@ class VoiceActivityDetector: def process(self, 
audio_data: np.ndarray) -> np.ndarray: if len(audio_data) == 0: + print("No audio data!") return audio_data audio_float = audio_data.astype(np.float32) / 32768.0 diff --git a/src/speechd/audio_processor.py b/src/speechd/audio_processor.py new file mode 100644 index 0000000..4a6d652 --- /dev/null +++ b/src/speechd/audio_processor.py @@ -0,0 +1,100 @@ +import logging +import subprocess + +import numpy as np + + +logger = logging.getLogger(__name__) + + +class AudioProcessor: + def __init__(self, peak_target: float = 1.0, silence_threshold: float = 1e-8): + if not 0 < peak_target <= 1.0: + raise ValueError("peak_target must be in range (0.0, 1.0]") + self.peak_target = peak_target + self.silence_threshold = silence_threshold + + @staticmethod + def pcm16_to_float(audio_data: np.ndarray) -> np.ndarray: + return audio_data.astype(np.float32) / 32768.0 + + def normalize_peak(self, audio_data: np.ndarray) -> np.ndarray: + if audio_data.size == 0: + return audio_data + + peak = np.max(np.abs(audio_data)) + if peak <= self.silence_threshold: + return audio_data + + normalized = audio_data * (self.peak_target / peak) + np.clip(normalized, -1.0, 1.0, out=normalized) + return normalized + + def normalize_pcm16_peak(self, audio_data: np.ndarray) -> np.ndarray: + if audio_data.size == 0: + return audio_data + + audio_int32 = audio_data.astype(np.int32) + peak = np.max(np.abs(audio_int32)) + if peak <= 0: + return audio_data + + target_peak = int(32767 * self.peak_target) + normalized = np.rint(audio_int32 * (target_peak / peak)).astype(np.int32) + np.clip(normalized, -32768, 32767, out=normalized) + return normalized.astype(np.int16) + + def compress_pcm16(self, audio_data: np.ndarray, sample_rate: int) -> np.ndarray: + if audio_data.size == 0: + return audio_data + + filter_chain = ( + "acompressor=threshold=0.1:ratio=3:attack=10:release=120:" + "makeup=2.0:link=average:detection=rms," + "alimiter=limit=0.98" + ) + ffmpeg_cmd = [ + "ffmpeg", + "-hide_banner", + "-loglevel", + 
"error", + "-f", + "s16le", + "-ar", + str(sample_rate), + "-ac", + "1", + "-i", + "pipe:0", + "-af", + filter_chain, + "-f", + "s16le", + "-acodec", + "pcm_s16le", + "pipe:1", + ] + + try: + result = subprocess.run( + ffmpeg_cmd, + input=audio_data.tobytes(), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, + ) + except FileNotFoundError: + logger.warning("ffmpeg not found; skipping compression") + return audio_data + except subprocess.CalledProcessError as exc: + err = exc.stderr.decode(errors="replace").strip() + logger.warning("ffmpeg compression failed; using raw audio: %s", err) + return audio_data + + compressed = np.frombuffer(result.stdout, dtype=np.int16) + if compressed.size == 0: + return audio_data + return compressed.copy() + + def prepare_for_transcription(self, audio_data: np.ndarray) -> np.ndarray: + return self.pcm16_to_float(audio_data) diff --git a/src/speechd/daemon.py b/src/speechd/daemon.py index 53670b0..3267a9d 100644 --- a/src/speechd/daemon.py +++ b/src/speechd/daemon.py @@ -7,11 +7,13 @@ import signal import subprocess import time from pathlib import Path +from typing import Literal import numpy as np import sounddevice as sd from speechd.audio import VoiceActivityDetector +from speechd.audio_processor import AudioProcessor from speechd.config import Config from speechd.transcribe import Transcriber @@ -23,6 +25,7 @@ class SpeechDaemon: self.config = config logger.info("Loading VAD model...") self.vad = VoiceActivityDetector(sample_rate=config.sample_rate) + self.audio_processor = AudioProcessor() self.transcriber = Transcriber( api_key=config.groq_api_key, model=config.model, @@ -35,6 +38,7 @@ class SpeechDaemon: self.stream: sd.InputStream | None = None self.recording_start_time: float | None = None self.timeout_cancelled = False + self.skip_enter = False self.runtime_dir = Path(config.runtime_dir) self.indicator_file = self.runtime_dir / "speechd.recording" @@ -72,13 +76,36 @@ class SpeechDaemon: else: 
self._stop_recording() + def toggle_without_enter(self): + self.skip_enter = True + self.toggle() + + def cancel(self): + if not self.recording: + self.skip_enter = False + return + self._stop_recording(cancel=True) + + def _update_status(self, status: Literal["off", "rec", "trans", "type"]): + match status: + case "off": + try: + self.indicator_file.unlink(missing_ok=True) + except Exception: + pass + case "rec": + self.indicator_file.write_text("Recording...") + case "trans": + self.indicator_file.write_text("Transcribing...") + case "type": + self.indicator_file.write_text("Typing...") + subprocess.call(["pkill", "-39", "dwmblocks"]) + def _start_recording(self): logger.info("Recording started") self.frames = [] self.recording = True self.timeout_cancelled = False - self.recording_start_time = time.monotonic() - self.indicator_file.touch() self.stream = sd.InputStream( samplerate=self.config.sample_rate, channels=1, @@ -86,14 +113,13 @@ class SpeechDaemon: callback=self._audio_callback, ) self.stream.start() + self._update_status("rec") - def _stop_recording(self, timeout: bool = False): + def _stop_recording(self, timeout: bool = False, cancel: bool = False): self.recording = False self.recording_start_time = None - try: - self.indicator_file.unlink(missing_ok=True) - except Exception: - pass + append_enter = self.skip_enter + self.skip_enter = False if self.stream: self.stream.stop() @@ -102,12 +128,21 @@ class SpeechDaemon: if timeout: logger.info(f"Recording cancelled: exceeded {self.config.timeout_seconds}s timeout") + self._update_status("off") + return + + if cancel: + logger.info("Recording cancelled") + self._update_status("off") return if not self.frames: + self._update_status("off") return audio_data = np.concatenate(self.frames) + audio_data = self.audio_processor.normalize_pcm16_peak(audio_data) + audio_data = self.audio_processor.compress_pcm16(audio_data, self.config.sample_rate) duration = len(audio_data) / self.config.sample_rate 
logger.info(f"Processing {duration:.1f}s of audio...") @@ -117,8 +152,10 @@ class SpeechDaemon: if len(audio_clean) == 0: logger.info(f"No speech detected (VAD: {vad_time:.2f}s)") + self._update_status("off") return + self._update_status("trans") logger.info(f"Transcribing... (VAD: {vad_time:.2f}s)") t1 = time.monotonic() result = self.transcriber.transcribe(audio_clean) @@ -126,7 +163,9 @@ class SpeechDaemon: if result.success and result.text: logger.info(f"[{transcribe_time:.2f}s] {result.text}") - self._type_text(result.text) + self._update_status("type") + self._type_text(result.text, skip_enter=append_enter) + self._update_status("off") def _audio_callback(self, indata, _frames, _time, _status): if not self.recording: @@ -148,15 +187,18 @@ class SpeechDaemon: text = re.sub(r"(\s)-(\s)", r"\1--\2", text) return text - def _type_text(self, text: str): + def _type_text(self, text: str, skip_enter: bool = False): text = self._postprocess_text(text).strip().replace("\n", " ") - if text: - try: - subprocess.run(["wtype", "-"], input=text.encode(), check=True) - except subprocess.CalledProcessError as e: - logger.error(f"Failed to type text: {e}") - except FileNotFoundError: - logger.error("wtype not found - cannot type text") + if not text: + return + + text = text + ('\n' if not skip_enter else '') + try: + subprocess.run(["xtype", self.config.language], input=text.encode(), check=True) + except subprocess.CalledProcessError as e: + logger.error(f"Failed to type text: {e}") + except FileNotFoundError: + logger.error("wtype not found - cannot type text") def run(self): if not self._acquire_pidfile(): @@ -164,6 +206,8 @@ class SpeechDaemon: raise SystemExit(1) signal.signal(signal.SIGUSR1, lambda *_: self.toggle()) + signal.signal(signal.SIGUSR2, lambda *_: self.toggle_without_enter()) + signal.signal(signal.SIGALRM, lambda *_: self.cancel()) signal.signal(signal.SIGTERM, lambda *_: (self.cleanup(), exit(0))) signal.signal(signal.SIGINT, lambda *_: (self.cleanup(), 
exit(0))) atexit.register(self.cleanup) diff --git a/src/speechd/transcribe.py b/src/speechd/transcribe.py index d3d6eb0..0f18432 100644 --- a/src/speechd/transcribe.py +++ b/src/speechd/transcribe.py @@ -6,6 +6,8 @@ import numpy as np import soundfile as sf from groq import Groq +from speechd.audio_processor import AudioProcessor + logger = logging.getLogger(__name__) @@ -25,6 +27,7 @@ class Transcriber: self.language = language self.sample_rate = sample_rate self.audio_quality = audio_quality + self.audio_processor = AudioProcessor() def transcribe(self, audio_data: np.ndarray) -> TranscriptionResult: if len(audio_data) == 0: @@ -47,16 +50,8 @@ class Transcriber: logger.error(f"Transcription failed: {e}") return TranscriptionResult(text="", success=False, error=str(e)) - def _normalize_rms(self, audio: np.ndarray, target_rms: float = 0.1) -> np.ndarray: - rms = np.sqrt(np.mean(audio**2)) - if rms > 1e-8: - audio = audio * (target_rms / rms) - np.clip(audio, -1.0, 1.0, out=audio) - return audio - def _encode_opus(self, audio_data: np.ndarray) -> bytes: - audio_float = audio_data.astype(np.float32) / 32768.0 - audio_float = self._normalize_rms(audio_float) + audio_float = self.audio_processor.prepare_for_transcription(audio_data) buf = io.BytesIO() sf.write( buf, diff --git a/uv.lock b/uv.lock index aae076e..0549d89 100644 --- a/uv.lock +++ b/uv.lock @@ -538,6 +538,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" }, ] +[[package]] +name = "packaging" +version = "26.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", 
hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, +] + [[package]] name = "pycparser" version = "3.0" @@ -714,11 +723,12 @@ wheels = [ [[package]] name = "speechd" -version = "1.1.0" +version = "1.2.8" source = { editable = "." } dependencies = [ { name = "groq" }, { name = "numpy" }, + { name = "packaging" }, { name = "sounddevice" }, { name = "soundfile" }, { name = "torch" }, @@ -729,6 +739,7 @@ dependencies = [ requires-dist = [ { name = "groq", specifier = ">=0.4.0" }, { name = "numpy", specifier = ">=1.24.0" }, + { name = "packaging", specifier = ">=26.0" }, { name = "sounddevice", specifier = ">=0.4.6" }, { name = "soundfile", specifier = ">=0.12.0" }, { name = "torch", specifier = ">=2.0.0" },