1
0
Fork 0

feat: compressor, more status verbosity, multiple ways to stop

This commit is contained in:
Arthur K. 2026-02-28 11:24:34 +03:00
parent 8f61e0d37e
commit 617d9cc0fc
Signed by: wzray
GPG key ID: B97F30FDC4636357
8 changed files with 195 additions and 36 deletions

1
README.md Normal file
View file

@ -0,0 +1 @@
https://github.com/mrsobakin/speechd/

View file

@ -8,6 +8,7 @@ license = { text = "MIT" }
dependencies = [ dependencies = [
"groq>=0.4.0", "groq>=0.4.0",
"numpy>=1.24.0", "numpy>=1.24.0",
"packaging>=26.0",
"sounddevice>=0.4.6", "sounddevice>=0.4.6",
"soundfile>=0.12.0", "soundfile>=0.12.0",
"torch>=2.0.0", "torch>=2.0.0",

View file

@ -1,15 +1,21 @@
#!/bin/bash #!/bin/bash
PIDFILE="${XDG_RUNTIME_DIR:-/tmp}/speechd.pid" PIDFILE="${XDG_RUNTIME_DIR:-/tmp}/speechd.pid"
if [ -f "$PIDFILE" ]; then [ -f "$PIDFILE" ] || {
PID=$(cat "$PIDFILE")
if kill -0 "$PID" 2>/dev/null; then
kill -USR1 "$PID"
else
echo "Daemon not running (stale pidfile)" >&2
exit 1
fi
else
echo "Daemon not running (no pidfile)" >&2 echo "Daemon not running (no pidfile)" >&2
exit 1 exit 1
fi }
PID=$(cat "$PIDFILE")
kill -0 "$PID" 2>/dev/null || {
echo "Daemon not running (stale pidfile)" >&2
exit 1
}
case "$1" in
"-n") kill -USR2 "$PID";;
"-a") kill -ALRM "$PID";;
*) kill -USR1 "$PID";;
esac

View file

@ -21,6 +21,7 @@ class VoiceActivityDetector:
def process(self, audio_data: np.ndarray) -> np.ndarray: def process(self, audio_data: np.ndarray) -> np.ndarray:
if len(audio_data) == 0: if len(audio_data) == 0:
print("No audio data!")
return audio_data return audio_data
audio_float = audio_data.astype(np.float32) / 32768.0 audio_float = audio_data.astype(np.float32) / 32768.0

View file

@ -0,0 +1,100 @@
import logging
import subprocess
import numpy as np
logger = logging.getLogger(__name__)
class AudioProcessor:
    """Level-conditioning helpers for mono 16-bit PCM audio.

    Provides peak normalization (for float and int16 buffers), an optional
    ffmpeg-based compressor/limiter pass, and int16 -> float32 conversion.
    """

    # ffmpeg audio filter graph: a compressor stage followed by a limiter
    # capped at 0.98. Parameter semantics are those of ffmpeg's
    # ``acompressor`` and ``alimiter`` filters.
    _FILTER_CHAIN = (
        "acompressor=threshold=0.1:ratio=3:attack=10:release=120:"
        "makeup=2.0:link=average:detection=rms,"
        "alimiter=limit=0.98"
    )

    def __init__(self, peak_target: float = 1.0, silence_threshold: float = 1e-8):
        """Configure the processor.

        Args:
            peak_target: Fraction of full scale the loudest sample is scaled
                to; must satisfy ``0 < peak_target <= 1.0``.
            silence_threshold: Float buffers whose peak is at or below this
                value are left untouched by :meth:`normalize_peak`.

        Raises:
            ValueError: If ``peak_target`` is outside ``(0.0, 1.0]``.
        """
        if not 0 < peak_target <= 1.0:
            raise ValueError("peak_target must be in range (0.0, 1.0]")
        self.peak_target = peak_target
        self.silence_threshold = silence_threshold

    @staticmethod
    def pcm16_to_float(audio_data: np.ndarray) -> np.ndarray:
        """Return ``audio_data`` cast to float32 and divided by 32768.0."""
        return audio_data.astype(np.float32) / 32768.0

    def normalize_peak(self, audio_data: np.ndarray) -> np.ndarray:
        """Scale a buffer so its largest absolute sample is ``peak_target``.

        Empty buffers and buffers whose peak is at or below
        ``silence_threshold`` are returned as-is (the same object, not a
        copy). Otherwise a new array is returned, clipped to ``[-1.0, 1.0]``.
        """
        if audio_data.size == 0:
            return audio_data

        peak = np.max(np.abs(audio_data))
        if peak <= self.silence_threshold:
            # Avoid amplifying what is effectively silence.
            return audio_data

        normalized = audio_data * (self.peak_target / peak)
        np.clip(normalized, -1.0, 1.0, out=normalized)
        return normalized

    def normalize_pcm16_peak(self, audio_data: np.ndarray) -> np.ndarray:
        """Scale an integer PCM buffer so its peak is ``32767 * peak_target``.

        Empty or all-zero buffers are returned as-is (the same object).
        Otherwise a new ``int16`` array is returned.
        """
        if audio_data.size == 0:
            return audio_data

        # Widen first: abs(-32768) does not fit in int16.
        audio_int32 = audio_data.astype(np.int32)
        peak = np.max(np.abs(audio_int32))
        if peak <= 0:
            return audio_data

        target_peak = int(32767 * self.peak_target)
        normalized = np.rint(audio_int32 * (target_peak / peak)).astype(np.int32)
        np.clip(normalized, -32768, 32767, out=normalized)
        return normalized.astype(np.int16)

    def compress_pcm16(self, audio_data: np.ndarray, sample_rate: int) -> np.ndarray:
        """Run the buffer through ffmpeg's compressor + limiter chain.

        The samples are piped to ``ffmpeg`` as mono little-endian signed
        16-bit PCM at ``sample_rate`` and the filtered stream is read back in
        the same format.

        This is best-effort: if ``ffmpeg`` is not installed, exits with an
        error, or produces no complete sample, a warning is logged (for the
        first two cases) and the original ``audio_data`` is returned.

        Args:
            audio_data: PCM samples; expected to be 16-bit integers.
            sample_rate: Sample rate passed to ffmpeg via ``-ar``.

        Returns:
            A new 1-D ``int16`` array with the processed samples, or
            ``audio_data`` itself when compression was skipped.
        """
        if audio_data.size == 0:
            return audio_data

        ffmpeg_cmd = [
            "ffmpeg",
            "-hide_banner",
            "-loglevel",
            "error",
            "-f",
            "s16le",
            "-ar",
            str(sample_rate),
            "-ac",
            "1",
            "-i",
            "pipe:0",
            "-af",
            self._FILTER_CHAIN,
            "-f",
            "s16le",
            "-acodec",
            "pcm_s16le",
            "pipe:1",
        ]

        # ffmpeg is told the stream is s16le, so make the byte layout match
        # that regardless of the array's dtype width or host byte order.
        # For native little-endian int16 input this is a no-op.
        pcm_bytes = audio_data.astype("<i2", copy=False).tobytes()

        try:
            result = subprocess.run(
                ffmpeg_cmd,
                input=pcm_bytes,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=True,
            )
        except FileNotFoundError:
            logger.warning("ffmpeg not found; skipping compression")
            return audio_data
        except subprocess.CalledProcessError as exc:
            err = exc.stderr.decode(errors="replace").strip()
            logger.warning("ffmpeg compression failed; using raw audio: %s", err)
            return audio_data

        # Only decode whole samples: a stray trailing byte would otherwise
        # make np.frombuffer raise ValueError.
        sample_count = len(result.stdout) // 2
        if sample_count == 0:
            return audio_data

        compressed = np.frombuffer(result.stdout, dtype="<i2", count=sample_count)
        # astype copies, yielding a writable native-endian array that does
        # not keep the subprocess output buffer alive.
        return compressed.astype(np.int16)

    def prepare_for_transcription(self, audio_data: np.ndarray) -> np.ndarray:
        """Convert int16 PCM to the float32 form handed to the encoder."""
        return self.pcm16_to_float(audio_data)

View file

@ -7,11 +7,13 @@ import signal
import subprocess import subprocess
import time import time
from pathlib import Path from pathlib import Path
from typing import Literal
import numpy as np import numpy as np
import sounddevice as sd import sounddevice as sd
from speechd.audio import VoiceActivityDetector from speechd.audio import VoiceActivityDetector
from speechd.audio_processor import AudioProcessor
from speechd.config import Config from speechd.config import Config
from speechd.transcribe import Transcriber from speechd.transcribe import Transcriber
@ -23,6 +25,7 @@ class SpeechDaemon:
self.config = config self.config = config
logger.info("Loading VAD model...") logger.info("Loading VAD model...")
self.vad = VoiceActivityDetector(sample_rate=config.sample_rate) self.vad = VoiceActivityDetector(sample_rate=config.sample_rate)
self.audio_processor = AudioProcessor()
self.transcriber = Transcriber( self.transcriber = Transcriber(
api_key=config.groq_api_key, api_key=config.groq_api_key,
model=config.model, model=config.model,
@ -35,6 +38,7 @@ class SpeechDaemon:
self.stream: sd.InputStream | None = None self.stream: sd.InputStream | None = None
self.recording_start_time: float | None = None self.recording_start_time: float | None = None
self.timeout_cancelled = False self.timeout_cancelled = False
self.skip_enter = False
self.runtime_dir = Path(config.runtime_dir) self.runtime_dir = Path(config.runtime_dir)
self.indicator_file = self.runtime_dir / "speechd.recording" self.indicator_file = self.runtime_dir / "speechd.recording"
@ -72,13 +76,36 @@ class SpeechDaemon:
else: else:
self._stop_recording() self._stop_recording()
def toggle_without_enter(self):
self.skip_enter = True
self.toggle()
def cancel(self):
if not self.recording:
self.skip_enter = False
return
self._stop_recording(cancel=True)
def _update_status(self, status: Literal["off", "rec", "trans", "type"]):
match status:
case "off":
try:
self.indicator_file.unlink(missing_ok=True)
except Exception:
pass
case "rec":
self.indicator_file.write_text("Recording...")
case "trans":
self.indicator_file.write_text("Transcribing...")
case "type":
self.indicator_file.write_text("Typing...")
subprocess.call(["pkill", "-39", "dwmblocks"])
def _start_recording(self): def _start_recording(self):
logger.info("Recording started") logger.info("Recording started")
self.frames = [] self.frames = []
self.recording = True self.recording = True
self.timeout_cancelled = False self.timeout_cancelled = False
self.recording_start_time = time.monotonic()
self.indicator_file.touch()
self.stream = sd.InputStream( self.stream = sd.InputStream(
samplerate=self.config.sample_rate, samplerate=self.config.sample_rate,
channels=1, channels=1,
@ -86,14 +113,13 @@ class SpeechDaemon:
callback=self._audio_callback, callback=self._audio_callback,
) )
self.stream.start() self.stream.start()
self._update_status("rec")
def _stop_recording(self, timeout: bool = False): def _stop_recording(self, timeout: bool = False, cancel: bool = False):
self.recording = False self.recording = False
self.recording_start_time = None self.recording_start_time = None
try: append_enter = self.skip_enter
self.indicator_file.unlink(missing_ok=True) self.skip_enter = False
except Exception:
pass
if self.stream: if self.stream:
self.stream.stop() self.stream.stop()
@ -102,12 +128,21 @@ class SpeechDaemon:
if timeout: if timeout:
logger.info(f"Recording cancelled: exceeded {self.config.timeout_seconds}s timeout") logger.info(f"Recording cancelled: exceeded {self.config.timeout_seconds}s timeout")
self._update_status("off")
return
if cancel:
logger.info("Recording cancelled")
self._update_status("off")
return return
if not self.frames: if not self.frames:
self._update_status("off")
return return
audio_data = np.concatenate(self.frames) audio_data = np.concatenate(self.frames)
audio_data = self.audio_processor.normalize_pcm16_peak(audio_data)
audio_data = self.audio_processor.compress_pcm16(audio_data, self.config.sample_rate)
duration = len(audio_data) / self.config.sample_rate duration = len(audio_data) / self.config.sample_rate
logger.info(f"Processing {duration:.1f}s of audio...") logger.info(f"Processing {duration:.1f}s of audio...")
@ -117,8 +152,10 @@ class SpeechDaemon:
if len(audio_clean) == 0: if len(audio_clean) == 0:
logger.info(f"No speech detected (VAD: {vad_time:.2f}s)") logger.info(f"No speech detected (VAD: {vad_time:.2f}s)")
self._update_status("off")
return return
self._update_status("trans")
logger.info(f"Transcribing... (VAD: {vad_time:.2f}s)") logger.info(f"Transcribing... (VAD: {vad_time:.2f}s)")
t1 = time.monotonic() t1 = time.monotonic()
result = self.transcriber.transcribe(audio_clean) result = self.transcriber.transcribe(audio_clean)
@ -126,7 +163,9 @@ class SpeechDaemon:
if result.success and result.text: if result.success and result.text:
logger.info(f"[{transcribe_time:.2f}s] {result.text}") logger.info(f"[{transcribe_time:.2f}s] {result.text}")
self._type_text(result.text) self._update_status("type")
self._type_text(result.text, skip_enter=append_enter)
self._update_status("off")
def _audio_callback(self, indata, _frames, _time, _status): def _audio_callback(self, indata, _frames, _time, _status):
if not self.recording: if not self.recording:
@ -148,15 +187,18 @@ class SpeechDaemon:
text = re.sub(r"(\s)-(\s)", r"\1--\2", text) text = re.sub(r"(\s)-(\s)", r"\1--\2", text)
return text return text
def _type_text(self, text: str): def _type_text(self, text: str, skip_enter: bool = False):
text = self._postprocess_text(text).strip().replace("\n", " ") text = self._postprocess_text(text).strip().replace("\n", " ")
if text: if not text:
try: return
subprocess.run(["wtype", "-"], input=text.encode(), check=True)
except subprocess.CalledProcessError as e: text = text + ('\n' if not skip_enter else '')
logger.error(f"Failed to type text: {e}") try:
except FileNotFoundError: subprocess.run(["xtype", self.config.language], input=text.encode(), check=True)
logger.error("wtype not found - cannot type text") except subprocess.CalledProcessError as e:
logger.error(f"Failed to type text: {e}")
except FileNotFoundError:
logger.error("wtype not found - cannot type text")
def run(self): def run(self):
if not self._acquire_pidfile(): if not self._acquire_pidfile():
@ -164,6 +206,8 @@ class SpeechDaemon:
raise SystemExit(1) raise SystemExit(1)
signal.signal(signal.SIGUSR1, lambda *_: self.toggle()) signal.signal(signal.SIGUSR1, lambda *_: self.toggle())
signal.signal(signal.SIGUSR2, lambda *_: self.toggle_without_enter())
signal.signal(signal.SIGALRM, lambda *_: self.cancel())
signal.signal(signal.SIGTERM, lambda *_: (self.cleanup(), exit(0))) signal.signal(signal.SIGTERM, lambda *_: (self.cleanup(), exit(0)))
signal.signal(signal.SIGINT, lambda *_: (self.cleanup(), exit(0))) signal.signal(signal.SIGINT, lambda *_: (self.cleanup(), exit(0)))
atexit.register(self.cleanup) atexit.register(self.cleanup)

View file

@ -6,6 +6,8 @@ import numpy as np
import soundfile as sf import soundfile as sf
from groq import Groq from groq import Groq
from speechd.audio_processor import AudioProcessor
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -25,6 +27,7 @@ class Transcriber:
self.language = language self.language = language
self.sample_rate = sample_rate self.sample_rate = sample_rate
self.audio_quality = audio_quality self.audio_quality = audio_quality
self.audio_processor = AudioProcessor()
def transcribe(self, audio_data: np.ndarray) -> TranscriptionResult: def transcribe(self, audio_data: np.ndarray) -> TranscriptionResult:
if len(audio_data) == 0: if len(audio_data) == 0:
@ -47,16 +50,8 @@ class Transcriber:
logger.error(f"Transcription failed: {e}") logger.error(f"Transcription failed: {e}")
return TranscriptionResult(text="", success=False, error=str(e)) return TranscriptionResult(text="", success=False, error=str(e))
def _normalize_rms(self, audio: np.ndarray, target_rms: float = 0.1) -> np.ndarray:
rms = np.sqrt(np.mean(audio**2))
if rms > 1e-8:
audio = audio * (target_rms / rms)
np.clip(audio, -1.0, 1.0, out=audio)
return audio
def _encode_opus(self, audio_data: np.ndarray) -> bytes: def _encode_opus(self, audio_data: np.ndarray) -> bytes:
audio_float = audio_data.astype(np.float32) / 32768.0 audio_float = self.audio_processor.prepare_for_transcription(audio_data)
audio_float = self._normalize_rms(audio_float)
buf = io.BytesIO() buf = io.BytesIO()
sf.write( sf.write(
buf, buf,

13
uv.lock generated
View file

@ -538,6 +538,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" }, { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" },
] ]
[[package]]
name = "packaging"
version = "26.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" },
]
[[package]] [[package]]
name = "pycparser" name = "pycparser"
version = "3.0" version = "3.0"
@ -714,11 +723,12 @@ wheels = [
[[package]] [[package]]
name = "speechd" name = "speechd"
version = "1.1.0" version = "1.2.8"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "groq" }, { name = "groq" },
{ name = "numpy" }, { name = "numpy" },
{ name = "packaging" },
{ name = "sounddevice" }, { name = "sounddevice" },
{ name = "soundfile" }, { name = "soundfile" },
{ name = "torch" }, { name = "torch" },
@ -729,6 +739,7 @@ dependencies = [
requires-dist = [ requires-dist = [
{ name = "groq", specifier = ">=0.4.0" }, { name = "groq", specifier = ">=0.4.0" },
{ name = "numpy", specifier = ">=1.24.0" }, { name = "numpy", specifier = ">=1.24.0" },
{ name = "packaging", specifier = ">=26.0" },
{ name = "sounddevice", specifier = ">=0.4.6" }, { name = "sounddevice", specifier = ">=0.4.6" },
{ name = "soundfile", specifier = ">=0.12.0" }, { name = "soundfile", specifier = ">=0.12.0" },
{ name = "torch", specifier = ">=2.0.0" }, { name = "torch", specifier = ">=2.0.0" },