feat: Initial commit
This commit is contained in:
commit
1e5654e591
10 changed files with 1336 additions and 0 deletions
4
src/speechd/__init__.py
Normal file
4
src/speechd/__init__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
from speechd.config import Config
|
||||
from speechd.daemon import SpeechDaemon
|
||||
|
||||
__all__ = ["Config", "SpeechDaemon"]
|
||||
70
src/speechd/__main__.py
Normal file
70
src/speechd/__main__.py
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
|
||||
from speechd.config import Config
|
||||
from speechd.daemon import SpeechDaemon
|
||||
|
||||
SERVICE_UNIT = """[Unit]
|
||||
Description=Speech-to-Text daemon
|
||||
After=graphical-session.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=%h/.local/bin/speechd
|
||||
Restart=on-failure
|
||||
RestartSec=3
|
||||
|
||||
[Install]
|
||||
WantedBy=default.target
|
||||
"""
|
||||
|
||||
|
||||
def install_service():
    """Write the systemd user unit for speechd and print the follow-up steps.

    The unit is stored as ``~/.config/systemd/user/speechd.service``; missing
    parent directories are created and an existing unit file is overwritten.
    """
    from pathlib import Path

    unit_path = Path.home() / ".config" / "systemd" / "user" / "speechd.service"
    unit_path.parent.mkdir(parents=True, exist_ok=True)
    unit_path.write_text(SERVICE_UNIT)

    print(f"Installed: {unit_path}")
    print("\nNext steps:")
    next_steps = (
        " 1. Run 'speechd' to create config file",
        " 2. Edit ~/.config/speechd/config.toml and add your API key",
        " 3. Run: systemctl --user enable --now speechd",
        " 4. Toggle recording: speechd-toggle",
    )
    for step in next_steps:
        print(step)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    With ``--install-service`` the systemd user unit is installed and the
    function returns. Otherwise logging is configured (DEBUG with
    ``-v/--verbose``, INFO by default), the configuration is loaded and the
    daemon is run. A ``RuntimeError`` from ``Config.load()`` is logged and
    turned into exit status 1.
    """
    cli = argparse.ArgumentParser(description="Speech-to-Text daemon")
    cli.add_argument(
        "--install-service", action="store_true", help="Install systemd user service"
    )
    cli.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging")
    options = cli.parse_args()

    # Installing the unit is a one-shot action; the daemon is not started.
    if options.install_service:
        install_service()
        return

    log_level = logging.INFO
    if options.verbose:
        log_level = logging.DEBUG
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S",
    )

    try:
        settings = Config.load()
    except RuntimeError as err:
        logging.error(str(err))
        raise SystemExit(1)

    SpeechDaemon(settings).run()


if __name__ == "__main__":
    main()
|
||||
46
src/speechd/audio.py
Normal file
46
src/speechd/audio.py
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
import logging
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class VoiceActivityDetector:
    """Silences the non-speech parts of a recording using the Silero VAD model."""

    def __init__(self, sample_rate: int = 16000):
        """Load the ``silero_vad`` model from the ``snakers4/silero-vad`` hub repo.

        Args:
            sample_rate: Sampling rate, in Hz, passed to the detector on every
                call to :meth:`process`.
        """
        self.sample_rate = sample_rate
        logger.info("Loading Silero VAD model...")
        hub_result = torch.hub.load(
            repo_or_dir="snakers4/silero-vad",
            model="silero_vad",
            force_reload=False,
            trust_repo=True,
        )
        self.model, self.utils = hub_result
        self.model.eval()
        logger.info("VAD model loaded")

    def process(self, audio_data: np.ndarray) -> np.ndarray:
        """Return ``audio_data`` with everything outside detected speech zeroed.

        Args:
            audio_data: Samples to analyse. They are scaled by 1/32768 before
                detection, so presumably 16-bit PCM -- confirm with the caller.

        Returns:
            ``audio_data`` itself when it is empty; an empty ``int16`` array
            when no speech segment is found; otherwise an array with the shape
            and dtype of ``audio_data`` in which only the detected speech
            segments keep their samples.
        """
        if len(audio_data) == 0:
            return audio_data

        waveform = torch.from_numpy(audio_data.astype(np.float32) / 32768.0)

        # The first entry of the hub utilities is the timestamp extractor.
        detect_speech = self.utils[0]
        with torch.no_grad():
            segments = detect_speech(
                waveform,
                self.model,
                sampling_rate=self.sample_rate,
                threshold=0.5,
                min_speech_duration_ms=250,
                min_silence_duration_ms=100,
            )

        if not segments:
            return np.array([], dtype=np.int16)

        # Mark the speech samples, then copy only those into a zeroed buffer.
        keep = np.zeros(len(audio_data), dtype=bool)
        for segment in segments:
            keep[segment["start"] : segment["end"]] = True

        masked = np.zeros_like(audio_data)
        masked[keep] = audio_data[keep]
        return masked
|
||||
50
src/speechd/config.py
Normal file
50
src/speechd/config.py
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
import os
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
DEFAULT_CONFIG = """api_key = "your-grok-api-key"
|
||||
model = "whisper-large-v3-turbo"
|
||||
language = "ru"
|
||||
sample_rate = 16000
|
||||
timeout = 300
|
||||
"""
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Config:
|
||||
groq_api_key: str
|
||||
model: str
|
||||
language: str
|
||||
sample_rate: int
|
||||
timeout_seconds: int
|
||||
runtime_dir: str
|
||||
|
||||
@classmethod
|
||||
def load(cls) -> "Config":
|
||||
import tomllib
|
||||
|
||||
config_path = Path.home() / ".config" / "speechd" / "config.toml"
|
||||
|
||||
if not config_path.exists():
|
||||
config_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
config_path.write_text(DEFAULT_CONFIG)
|
||||
config_path.chmod(0o600)
|
||||
raise RuntimeError(
|
||||
f"Config created at {config_path}\nPlease edit and add your Groq API key"
|
||||
)
|
||||
|
||||
with open(config_path, "rb") as f:
|
||||
data = tomllib.load(f)
|
||||
|
||||
api_key = data.get("api_key", "")
|
||||
if not api_key or api_key == "your-api-key-here":
|
||||
raise RuntimeError(f"Please set api_key in {config_path}")
|
||||
|
||||
return cls(
|
||||
groq_api_key=api_key,
|
||||
model=data.get("model", "whisper-large-v3-turbo"),
|
||||
language=data.get("language", "ru"),
|
||||
sample_rate=data.get("sample_rate", 16000),
|
||||
timeout_seconds=data.get("timeout", 300),
|
||||
runtime_dir=os.environ.get("XDG_RUNTIME_DIR", "/tmp"),
|
||||
)
|
||||
175
src/speechd/daemon.py
Normal file
175
src/speechd/daemon.py
Normal file
|
|
@ -0,0 +1,175 @@
|
|||
import atexit
|
||||
import fcntl
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import sounddevice as sd
|
||||
|
||||
from speechd.audio import VoiceActivityDetector
|
||||
from speechd.config import Config
|
||||
from speechd.transcribe import Transcriber
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SpeechDaemon:
    """Signal-driven daemon that records speech, transcribes it and types it.

    SIGUSR1 toggles recording. When a recording is stopped, the captured audio
    is passed through the voice activity detector, sent to the transcriber and
    the resulting text is typed with ``wtype``. A pidfile lock in the runtime
    directory keeps a second instance from starting, and an indicator file
    exists in the same directory while a recording is in progress.
    """

    def __init__(self, config: Config):
        """Create the VAD and transcriber and initialise the recording state.

        Args:
            config: Settings for the model, language, sample rate, recording
                timeout and runtime directory.
        """
        self.config = config
        logger.info("Loading VAD model...")
        self.vad = VoiceActivityDetector(sample_rate=config.sample_rate)
        self.transcriber = Transcriber(
            api_key=config.groq_api_key,
            model=config.model,
            language=config.language,
            sample_rate=config.sample_rate,
        )
        self.recording = False
        self.frames: list[np.ndarray] = []
        self.stream: sd.InputStream | None = None
        self.recording_start_time: float | None = None
        # Set when the audio callback cancelled a recording that ran too long.
        self.timeout_cancelled = False

        self.runtime_dir = Path(config.runtime_dir)
        self.indicator_file = self.runtime_dir / "speechd.recording"
        self.pidfile = self.runtime_dir / "speechd.pid"
        self._pidfile_fd: int | None = None

    def _acquire_pidfile(self) -> bool:
        """Take an exclusive, non-blocking lock on the pidfile.

        Returns:
            True when the lock was obtained and the current PID was written;
            False when the lock could not be taken (another instance holds it).
        """
        fd = os.open(self.pidfile, os.O_RDWR | os.O_CREAT, 0o644)
        try:
            fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except OSError:  # includes BlockingIOError raised for a held lock
            os.close(fd)
            return False
        os.ftruncate(fd, 0)
        os.write(fd, f"{os.getpid()}\n".encode())
        # The descriptor stays open for the daemon's lifetime to hold the lock.
        self._pidfile_fd = fd
        return True

    def cleanup(self):
        """Release the pidfile lock and remove the recording indicator.

        Best effort and safe to call more than once; it is used both by the
        termination signal handlers and as an ``atexit`` hook.
        """
        fd, self._pidfile_fd = self._pidfile_fd, None
        if fd is not None:
            try:
                fcntl.flock(fd, fcntl.LOCK_UN)
            except OSError as err:
                logger.debug(f"Could not unlock pidfile: {err}")
            # Always close the descriptor, even when unlocking failed.
            try:
                os.close(fd)
            except OSError as err:
                logger.debug(f"Could not close pidfile: {err}")
        self._remove_indicator()

    def _remove_indicator(self):
        """Delete the recording indicator file, ignoring filesystem errors."""
        try:
            self.indicator_file.unlink(missing_ok=True)
        except OSError as err:
            logger.debug(f"Could not remove {self.indicator_file}: {err}")

    def _close_stream(self):
        """Stop and close the current input stream, if there is one."""
        stream, self.stream = self.stream, None
        if stream is not None:
            stream.stop()
            stream.close()

    def toggle(self):
        """Start recording when idle, otherwise stop and process the recording."""
        if not self.recording:
            self._start_recording()
        else:
            self._stop_recording()

    def _start_recording(self):
        """Open the input stream and begin collecting audio frames.

        If the audio device cannot be opened the error is logged and the
        daemon returns to the idle state instead of claiming to record.
        """
        # A recording cancelled by the timeout aborts its stream from inside
        # the audio callback, where it cannot be closed; release it here.
        self._close_stream()

        self.frames = []
        self.timeout_cancelled = False
        # The callback reads both fields, so set them before the stream starts.
        self.recording_start_time = time.monotonic()
        self.recording = True
        try:
            self.stream = sd.InputStream(
                samplerate=self.config.sample_rate,
                channels=1,
                dtype=np.int16,
                callback=self._audio_callback,
            )
            self.stream.start()
        except sd.PortAudioError:
            logger.exception("Failed to start audio input stream")
            self.recording = False
            self.recording_start_time = None
            self._close_stream()
            return

        self.indicator_file.touch()
        logger.info("Recording started")

    def _stop_recording(self, timeout: bool = False):
        """Stop recording and, unless cancelled, transcribe and type the audio.

        Must not be called from the audio callback, because it stops and
        closes the stream.

        Args:
            timeout: When True the recording is discarded and only a
                cancellation message is logged.
        """
        self.recording = False
        self.recording_start_time = None
        self._remove_indicator()
        self._close_stream()

        if timeout:
            logger.info(f"Recording cancelled: exceeded {self.config.timeout_seconds}s timeout")
            return

        if not self.frames:
            return

        audio_data = np.concatenate(self.frames)
        duration = len(audio_data) / self.config.sample_rate
        logger.info(f"Processing {duration:.1f}s of audio...")

        t0 = time.monotonic()
        audio_clean = self.vad.process(audio_data)
        vad_time = time.monotonic() - t0

        if len(audio_clean) == 0:
            logger.info(f"No speech detected (VAD: {vad_time:.2f}s)")
            return

        logger.info(f"Transcribing... (VAD: {vad_time:.2f}s)")
        t1 = time.monotonic()
        result = self.transcriber.transcribe(audio_clean)
        transcribe_time = time.monotonic() - t1

        if result.success and result.text:
            logger.info(f"[{transcribe_time:.2f}s] {result.text}")
            self._type_text(result.text)

    def _audio_callback(self, indata, _frames, _time, _status):
        """Stream callback: collect one block of audio or cancel on timeout.

        Raises:
            sd.CallbackAbort: When the recording has run longer than the
                configured timeout. The stream must not be stopped or closed
                from within its own callback, so the callback aborts it and
                the stream object is closed later from the main thread.
        """
        if not self.recording:
            return

        # Read once: the main thread may reset the field concurrently.
        started = self.recording_start_time
        if started is not None and time.monotonic() - started > self.config.timeout_seconds:
            self.timeout_cancelled = True
            self.recording = False
            self.recording_start_time = None
            self._remove_indicator()
            logger.info(f"Recording cancelled: exceeded {self.config.timeout_seconds}s timeout")
            raise sd.CallbackAbort

        # flatten() already returns a copy, so the buffer handed to the
        # callback is not retained.
        self.frames.append(indata.flatten())

    @staticmethod
    def _postprocess_text(text: str) -> str:
        """Normalise dashes: em/en dashes become ``-``, a free-standing ``-`` becomes ``--``.

        Lookarounds are used so the surrounding whitespace is not consumed and
        consecutive free-standing dashes (``a - - b``) are all converted.
        """
        text = text.replace("—", "-")
        text = text.replace("–", "-")
        return re.sub(r"(?<=\s)-(?=\s)", "--", text)

    def _type_text(self, text: str):
        """Type ``text`` into the focused window by piping it to ``wtype -``.

        The text is post-processed, stripped and flattened to a single line;
        nothing is typed when the result is empty. Failures are logged.
        """
        text = self._postprocess_text(text).strip().replace("\n", " ")
        if not text:
            return
        try:
            subprocess.run(["wtype", "-"], input=text.encode(), check=True)
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to type text: {e}")
        except FileNotFoundError:
            logger.error("wtype not found - cannot type text")

    def _handle_toggle(self, _signum, _frame):
        """SIGUSR1 handler: toggle recording."""
        self.toggle()

    def _handle_terminate(self, _signum, _frame):
        """SIGTERM/SIGINT handler: clean up and exit with status 0."""
        self.cleanup()
        # Raise SystemExit directly; the `exit` builtin is only injected by
        # the `site` module and is not guaranteed to exist.
        raise SystemExit(0)

    def run(self):
        """Acquire the pidfile lock, install signal handlers and wait forever.

        Raises:
            SystemExit: With status 1 when another instance holds the lock,
                or with status 0 after SIGTERM/SIGINT.
        """
        if not self._acquire_pidfile():
            logger.error("Another instance is already running")
            raise SystemExit(1)

        signal.signal(signal.SIGUSR1, self._handle_toggle)
        signal.signal(signal.SIGTERM, self._handle_terminate)
        signal.signal(signal.SIGINT, self._handle_terminate)
        atexit.register(self.cleanup)

        logger.info(f"Ready. PID: {os.getpid()}")
        logger.info(f"Model: {self.config.model}, Language: {self.config.language}")
        logger.info(f"Timeout: {self.config.timeout_seconds}s")

        # All work happens inside the signal handlers.
        while True:
            signal.pause()
|
||||
58
src/speechd/transcribe.py
Normal file
58
src/speechd/transcribe.py
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
import io
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
from groq import Groq
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class TranscriptionResult:
    """Outcome of a single transcription attempt."""

    # Transcribed text; empty when nothing was transcribed or the call failed.
    text: str
    # True when the attempt finished without raising an error.
    success: bool
    # Error description for a failed attempt, otherwise None.
    error: str | None = None
|
||||
|
||||
|
||||
class Transcriber:
    """Encodes recorded audio as Ogg/Opus and transcribes it through the Groq API."""

    def __init__(self, api_key: str, model: str, language: str, sample_rate: int):
        """Create the API client and remember the request parameters.

        Args:
            api_key: Key used to construct the Groq client.
            model: Model name sent with every transcription request.
            language: Language code sent with every transcription request.
            sample_rate: Sampling rate, in Hz, of the audio passed to
                :meth:`transcribe`.
        """
        self.client = Groq(api_key=api_key)
        self.model = model
        self.language = language
        self.sample_rate = sample_rate

    def transcribe(self, audio_data: np.ndarray) -> TranscriptionResult:
        """Transcribe ``audio_data`` and report the outcome.

        Empty input yields a successful result with empty text without any
        request being made. Any exception raised while encoding or calling the
        API is logged and returned as a failed result instead of propagating.
        """
        if len(audio_data) == 0:
            return TranscriptionResult(text="", success=True)

        try:
            upload = io.BytesIO(self._encode_opus(audio_data))
            # The file-like object carries a name with an .ogg extension.
            upload.name = "audio.ogg"

            logger.debug(f"Transcribing {len(audio_data) / self.sample_rate:.1f}s of audio")
            transcript = self.client.audio.transcriptions.create(
                file=upload,
                model=self.model,
                language=self.language,
                response_format="text",
            )
        except Exception as e:
            logger.error(f"Transcription failed: {e}")
            return TranscriptionResult(text="", success=False, error=str(e))
        else:
            return TranscriptionResult(text=transcript, success=True)

    def _encode_opus(self, audio_data: np.ndarray) -> bytes:
        """Return ``audio_data`` encoded as an in-memory Ogg/Opus file.

        Samples are scaled by 1/32768 to float32 before encoding (presumably
        from 16-bit PCM -- confirm with the caller).
        """
        encoded = io.BytesIO()
        normalized = audio_data.astype(np.float32) / 32768.0
        sf.write(
            encoded,
            normalized,
            self.sample_rate,
            format="OGG",
            subtype="OPUS",
            compression_level=0.8,
        )
        return encoded.getvalue()
|
||||
Loading…
Add table
Add a link
Reference in a new issue