1
0
Fork 0

feat: Initial commit

This commit is contained in:
mrsobakin 2026-02-24 13:14:57 +03:00
commit 1e5654e591
No known key found for this signature in database
GPG key ID: 325CBF665E4FFD6E
10 changed files with 1336 additions and 0 deletions

4
src/speechd/__init__.py Normal file
View file

@ -0,0 +1,4 @@
from speechd.config import Config
from speechd.daemon import SpeechDaemon
__all__ = ["Config", "SpeechDaemon"]

70
src/speechd/__main__.py Normal file
View file

@ -0,0 +1,70 @@
from __future__ import annotations
import argparse
import logging
from speechd.config import Config
from speechd.daemon import SpeechDaemon
# Contents of the systemd user unit that install_service() writes to disk.
# "%h" is systemd's specifier for the user's home directory, so the unit
# expects the daemon executable at ~/.local/bin/speechd.
SERVICE_UNIT = """[Unit]
Description=Speech-to-Text daemon
After=graphical-session.target
[Service]
Type=simple
ExecStart=%h/.local/bin/speechd
Restart=on-failure
RestartSec=3
[Install]
WantedBy=default.target
"""
def install_service():
    """Write the speechd systemd user unit and print the follow-up steps.

    The unit is stored as ``~/.config/systemd/user/speechd.service``. The
    directory is created when missing and an existing unit file is
    overwritten.
    """
    from pathlib import Path

    unit_path = Path.home().joinpath(".config", "systemd", "user", "speechd.service")
    unit_path.parent.mkdir(parents=True, exist_ok=True)
    unit_path.write_text(SERVICE_UNIT)

    print(f"Installed: {unit_path}")
    print("\nNext steps:")
    next_steps = (
        " 1. Run 'speechd' to create config file",
        " 2. Edit ~/.config/speechd/config.toml and add your API key",
        " 3. Run: systemctl --user enable --now speechd",
        " 4. Toggle recording: speechd-toggle",
    )
    for step in next_steps:
        print(step)
def main():
    """Command-line entry point for the speechd daemon.

    With ``--install-service`` the systemd user unit is installed and the
    function returns. Otherwise logging is configured (DEBUG with
    ``-v/--verbose``, INFO by default), the configuration is loaded and the
    daemon is run.

    Raises:
        SystemExit: with status 1 when loading the configuration raises
            ``RuntimeError``; the message is logged first.
    """
    parser = argparse.ArgumentParser(description="Speech-to-Text daemon")
    parser.add_argument(
        "--install-service",
        action="store_true",
        help="Install systemd user service",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable debug logging",
    )
    options = parser.parse_args()

    # Installing the unit is a one-shot action; the daemon is not started.
    if options.install_service:
        install_service()
        return

    log_level = logging.DEBUG if options.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S",
    )

    try:
        config = Config.load()
    except RuntimeError as exc:
        logging.error(str(exc))
        raise SystemExit(1)

    SpeechDaemon(config).run()
# Run the CLI only when this module is executed as a script
# (e.g. ``python -m speechd``), not when it is imported.
if __name__ == "__main__":
    main()

46
src/speechd/audio.py Normal file
View file

@ -0,0 +1,46 @@
import logging
import numpy as np
import torch
logger = logging.getLogger(__name__)
class VoiceActivityDetector:
    """Silences the non-speech parts of a recording using the Silero VAD model."""

    def __init__(self, sample_rate: int = 16000):
        """Load the Silero VAD model through ``torch.hub``.

        Args:
            sample_rate: Sampling rate, in Hz, passed to the detector for
                every processed recording.
        """
        self.sample_rate = sample_rate

        logger.info("Loading Silero VAD model...")
        loaded = torch.hub.load(
            repo_or_dir="snakers4/silero-vad",
            model="silero_vad",
            force_reload=False,
            trust_repo=True,
        )
        self.model, self.utils = loaded
        self.model.eval()
        logger.info("VAD model loaded")

    def process(self, audio_data: np.ndarray) -> np.ndarray:
        """Return ``audio_data`` with everything outside detected speech zeroed.

        Args:
            audio_data: Recorded samples. They are scaled by 1/32768 before
                detection, so 16-bit PCM is assumed — TODO confirm for
                callers other than the daemon.

        Returns:
            * ``audio_data`` itself when it is empty;
            * an empty ``int16`` array when no speech was detected;
            * otherwise an array shaped like ``audio_data`` that keeps the
              samples inside each detected segment and is zero elsewhere.
        """
        if len(audio_data) == 0:
            return audio_data

        samples = torch.from_numpy(audio_data.astype(np.float32) / 32768.0)
        # The first entry of the hub utilities is used as the
        # speech-timestamp function.
        detect_speech = self.utils[0]

        with torch.no_grad():
            segments = detect_speech(
                samples,
                self.model,
                sampling_rate=self.sample_rate,
                threshold=0.5,
                min_speech_duration_ms=250,
                min_silence_duration_ms=100,
            )

        if not segments:
            return np.array([], dtype=np.int16)

        # Copy only the speech spans into an all-zero buffer so the overall
        # length of the recording is preserved.
        masked = np.zeros_like(audio_data)
        for segment in segments:
            span = slice(segment["start"], segment["end"])
            masked[span] = audio_data[span]
        return masked

50
src/speechd/config.py Normal file
View file

@ -0,0 +1,50 @@
import os
from dataclasses import dataclass
from pathlib import Path
DEFAULT_CONFIG = """api_key = "your-grok-api-key"
model = "whisper-large-v3-turbo"
language = "ru"
sample_rate = 16000
timeout = 300
"""
@dataclass(frozen=True)
class Config:
groq_api_key: str
model: str
language: str
sample_rate: int
timeout_seconds: int
runtime_dir: str
@classmethod
def load(cls) -> "Config":
import tomllib
config_path = Path.home() / ".config" / "speechd" / "config.toml"
if not config_path.exists():
config_path.parent.mkdir(parents=True, exist_ok=True)
config_path.write_text(DEFAULT_CONFIG)
config_path.chmod(0o600)
raise RuntimeError(
f"Config created at {config_path}\nPlease edit and add your Groq API key"
)
with open(config_path, "rb") as f:
data = tomllib.load(f)
api_key = data.get("api_key", "")
if not api_key or api_key == "your-api-key-here":
raise RuntimeError(f"Please set api_key in {config_path}")
return cls(
groq_api_key=api_key,
model=data.get("model", "whisper-large-v3-turbo"),
language=data.get("language", "ru"),
sample_rate=data.get("sample_rate", 16000),
timeout_seconds=data.get("timeout", 300),
runtime_dir=os.environ.get("XDG_RUNTIME_DIR", "/tmp"),
)

175
src/speechd/daemon.py Normal file
View file

@ -0,0 +1,175 @@
import atexit
import fcntl
import logging
import os
import re
import signal
import subprocess
import time
from pathlib import Path
import numpy as np
import sounddevice as sd
from speechd.audio import VoiceActivityDetector
from speechd.config import Config
from speechd.transcribe import Transcriber
logger = logging.getLogger(__name__)
class SpeechDaemon:
    """Signal-driven daemon that records speech, transcribes it and types it.

    ``SIGUSR1`` toggles recording. When a recording is stopped, the captured
    audio is run through the voice-activity detector, sent to the
    transcriber, and the resulting text is typed with ``wtype``. A locked
    pidfile in the runtime directory keeps a second instance from starting,
    and an indicator file exists there while a recording is in progress.
    """

    def __init__(self, config: "Config"):
        """Build the VAD and transcriber and initialise the recording state.

        Args:
            config: Loaded daemon configuration.
        """
        self.config = config

        logger.info("Loading VAD model...")
        self.vad = VoiceActivityDetector(sample_rate=config.sample_rate)
        self.transcriber = Transcriber(
            api_key=config.groq_api_key,
            model=config.model,
            language=config.language,
            sample_rate=config.sample_rate,
        )

        self.recording = False
        self.frames: list[np.ndarray] = []
        self.stream: sd.InputStream | None = None
        self.recording_start_time: float | None = None
        # Set when a recording was aborted by the timeout; reset on start.
        self.timeout_cancelled = False

        self.runtime_dir = Path(config.runtime_dir)
        self.indicator_file = self.runtime_dir / "speechd.recording"
        self.pidfile = self.runtime_dir / "speechd.pid"
        self._pidfile_fd: int | None = None

    def _acquire_pidfile(self) -> bool:
        """Take an exclusive, non-blocking lock on the pidfile.

        Returns:
            True when the lock was obtained (the file then holds this
            process's PID and stays open), False when locking failed, e.g.
            because another instance holds the lock.
        """
        fd = os.open(self.pidfile, os.O_RDWR | os.O_CREAT, 0o644)
        try:
            fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except OSError:  # BlockingIOError is a subclass of OSError
            os.close(fd)
            return False
        os.truncate(fd, 0)
        os.write(fd, f"{os.getpid()}\n".encode())
        self._pidfile_fd = fd
        return True

    def _remove_indicator(self):
        """Best-effort removal of the "recording in progress" indicator file."""
        try:
            self.indicator_file.unlink(missing_ok=True)
        except OSError:
            pass

    def _close_stream(self):
        """Stop and close the input stream, if there is one."""
        stream, self.stream = self.stream, None
        if stream:
            stream.stop()
            stream.close()

    def cleanup(self):
        """Release the pidfile lock and remove the indicator file.

        Safe to call more than once: it runs both from the termination
        signal handler and from ``atexit``.
        """
        if self._pidfile_fd is not None:
            # Forget the descriptor first so a repeated call is a no-op.
            fd, self._pidfile_fd = self._pidfile_fd, None
            try:
                fcntl.flock(fd, fcntl.LOCK_UN)
                os.close(fd)
            except OSError:
                pass
        self._remove_indicator()

    def toggle(self):
        """Start recording when idle, otherwise stop and process the audio."""
        if not self.recording:
            self._start_recording()
        else:
            self._stop_recording()

    def _start_recording(self):
        """Reset the capture state and open a mono int16 input stream."""
        # A recording aborted by the timeout leaves its (inactive) stream
        # behind, because it must not be closed from the audio callback.
        self._close_stream()

        logger.info("Recording started")
        self.frames = []
        self.recording = True
        self.timeout_cancelled = False
        self.recording_start_time = time.monotonic()
        self.indicator_file.touch()
        self.stream = sd.InputStream(
            samplerate=self.config.sample_rate,
            channels=1,
            dtype=np.int16,
            callback=self._audio_callback,
        )
        self.stream.start()

    def _stop_recording(self, timeout: bool = False):
        """Stop capturing and, unless cancelled, transcribe and type the audio.

        Must not be called from the audio callback: it stops and closes the
        stream.

        Args:
            timeout: When True the recording is discarded and only the
                cancellation is logged.
        """
        self.recording = False
        self.recording_start_time = None
        self._remove_indicator()
        self._close_stream()

        if timeout:
            logger.info(
                "Recording cancelled: exceeded %ss timeout",
                self.config.timeout_seconds,
            )
            return
        if not self.frames:
            return

        audio_data = np.concatenate(self.frames)
        duration = len(audio_data) / self.config.sample_rate
        logger.info("Processing %.1fs of audio...", duration)

        t0 = time.monotonic()
        audio_clean = self.vad.process(audio_data)
        vad_time = time.monotonic() - t0
        if len(audio_clean) == 0:
            logger.info("No speech detected (VAD: %.2fs)", vad_time)
            return

        logger.info("Transcribing... (VAD: %.2fs)", vad_time)
        t1 = time.monotonic()
        result = self.transcriber.transcribe(audio_clean)
        transcribe_time = time.monotonic() - t1
        if result.success and result.text:
            logger.info("[%.2fs] %s", transcribe_time, result.text)
            self._type_text(result.text)

    def _audio_callback(self, indata, _frames, _time, _status):
        """Stream callback: collect audio blocks and enforce the timeout.

        Once the recording is longer than ``config.timeout_seconds`` it is
        cancelled by raising ``sd.CallbackAbort``. The stream itself is not
        stopped or closed here, since PortAudio functions must not be called
        from inside the stream callback; the stale stream is closed the next
        time a recording starts.
        """
        if not self.recording:
            return

        started = self.recording_start_time
        if (
            started is not None
            and time.monotonic() - started > self.config.timeout_seconds
        ):
            self.timeout_cancelled = True
            self.recording = False
            self.recording_start_time = None
            self._remove_indicator()
            logger.info(
                "Recording cancelled: exceeded %ss timeout",
                self.config.timeout_seconds,
            )
            raise sd.CallbackAbort

        # ``indata`` is reused by the stream, so keep a flattened copy.
        self.frames.append(indata.copy().flatten())

    @staticmethod
    def _postprocess_text(text: str) -> str:
        """Normalise dashes so the text can be typed with plain ASCII.

        Em and en dashes become ``-``, and a hyphen standing alone between
        whitespace becomes ``--``.
        """
        # NOTE: the dash literals are written as escapes on purpose. As bare
        # characters they are easily lost, and replacing "" would insert a
        # hyphen between every character of the text.
        for dash in ("\u2014", "\u2013"):
            text = text.replace(dash, "-")
        return re.sub(r"(\s)-(\s)", r"\1--\2", text)

    def _type_text(self, text: str):
        """Type ``text`` into the focused window by piping it to ``wtype -``.

        The text is post-processed, stripped and flattened to a single line
        first; nothing is typed when the result is empty. Failures of
        ``wtype`` are logged, not raised.
        """
        text = self._postprocess_text(text).strip().replace("\n", " ")
        if not text:
            return
        try:
            subprocess.run(["wtype", "-"], input=text.encode(), check=True)
        except subprocess.CalledProcessError as e:
            logger.error("Failed to type text: %s", e)
        except FileNotFoundError:
            logger.error("wtype not found - cannot type text")

    def _handle_exit(self, *_):
        """Signal handler for SIGTERM/SIGINT: clean up and exit with status 0."""
        self.cleanup()
        # The ``exit`` builtin is injected by ``site`` and is not guaranteed
        # to exist; raising SystemExit directly always works.
        raise SystemExit(0)

    def run(self):
        """Acquire the pidfile, install signal handlers and wait for signals.

        Raises:
            SystemExit: with status 1 when the pidfile lock cannot be taken,
                and with status 0 on SIGTERM/SIGINT.
        """
        if not self._acquire_pidfile():
            logger.error("Another instance is already running")
            raise SystemExit(1)

        signal.signal(signal.SIGUSR1, lambda *_: self.toggle())
        signal.signal(signal.SIGTERM, self._handle_exit)
        signal.signal(signal.SIGINT, self._handle_exit)
        atexit.register(self.cleanup)

        logger.info("Ready. PID: %s", os.getpid())
        logger.info(
            "Model: %s, Language: %s", self.config.model, self.config.language
        )
        logger.info("Timeout: %ss", self.config.timeout_seconds)

        while True:
            signal.pause()

58
src/speechd/transcribe.py Normal file
View file

@ -0,0 +1,58 @@
import io
import logging
from dataclasses import dataclass
import numpy as np
import soundfile as sf
from groq import Groq
logger = logging.getLogger(__name__)
@dataclass
class TranscriptionResult:
text: str
success: bool
error: str | None = None
class Transcriber:
    """Sends recorded audio to the Groq transcription API as OGG/Opus."""

    def __init__(self, api_key: str, model: str, language: str, sample_rate: int):
        """Create the API client and remember the request parameters.

        Args:
            api_key: Key passed to the ``Groq`` client.
            model: Transcription model name sent with each request.
            language: Language code sent with each request.
            sample_rate: Sample rate, in Hz, of the audio given to
                ``transcribe``.
        """
        self.client = Groq(api_key=api_key)
        self.model, self.language, self.sample_rate = model, language, sample_rate

    def transcribe(self, audio_data: np.ndarray) -> TranscriptionResult:
        """Transcribe ``audio_data`` and report the outcome.

        Never raises: any exception during encoding or the API call is
        logged and returned as a failed result.

        Returns:
            A successful result with empty text for empty input, a
            successful result carrying the API response otherwise, or a
            failed result whose ``error`` is the exception message.
        """
        if len(audio_data) == 0:
            return TranscriptionResult(text="", success=True)

        try:
            upload = io.BytesIO(self._encode_opus(audio_data))
            # The buffer is given a file name with an .ogg extension for the
            # upload.
            upload.name = "audio.ogg"

            seconds = len(audio_data) / self.sample_rate
            logger.debug(f"Transcribing {seconds:.1f}s of audio")

            text = self.client.audio.transcriptions.create(
                file=upload,
                model=self.model,
                language=self.language,
                response_format="text",
            )
        except Exception as exc:
            logger.error(f"Transcription failed: {exc}")
            return TranscriptionResult(text="", success=False, error=str(exc))

        return TranscriptionResult(text=text, success=True)

    def _encode_opus(self, audio_data: np.ndarray) -> bytes:
        """Encode samples as an in-memory OGG/Opus file and return its bytes.

        The samples are scaled by 1/32768 to float32 before encoding, so
        16-bit PCM input is assumed — TODO confirm for other callers.
        """
        encoded = io.BytesIO()
        pcm = audio_data.astype(np.float32) / 32768.0
        sf.write(
            encoded,
            pcm,
            self.sample_rate,
            format="OGG",
            subtype="OPUS",
            compression_level=0.8,
        )
        return encoded.getvalue()