Source code for scitex_audio._engines._base

#!/usr/bin/env python3
# Timestamp: "2025-12-11 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex-code/src/scitex/audio/engines/base.py
# ----------------------------------------

"""
Base TTS class defining the common interface for all TTS backends.
"""

from __future__ import annotations

import subprocess
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, Optional

__all__ = ["BaseTTS", "TTSBackend"]


class TTSBackend:
    """Namespace of string identifiers for the known TTS backends.

    This is a plain class used like an enum: every member is an ordinary
    ``str`` class attribute, so members compare equal to their literal value.
    """

    ELEVENLABS = "elevenlabs"
    GTTS = "gtts"
    PYTTSX3 = "pyttsx3"
    LUXTTS = "luxtts"
    EDGE = "edge"  # Future: edge-tts

    @classmethod
    def available(cls) -> List[str]:
        """Probe the environment and list the backends that can be used.

        Only gTTS, pyttsx3 and ElevenLabs are probed, in that order; the
        other identifiers declared on the class are never reported here.

        Returns
        -------
        List of backend identifiers whose package imports successfully.
        ElevenLabs is additionally listed only when one of the environment
        variables ``SCITEX_AUDIO_ELEVENLABS_API_KEY`` or
        ``ELEVENLABS_API_KEY`` is set to a non-empty value.
        """
        found: List[str] = []

        # gTTS: usable whenever the package is installed (needs internet).
        try:
            import gtts  # noqa: F401
        except ImportError:
            pass
        else:
            found.append(cls.GTTS)

        # pyttsx3: usable whenever the package is installed.
        try:
            import pyttsx3  # noqa: F401
        except ImportError:
            pass
        else:
            found.append(cls.PYTTSX3)

        # ElevenLabs: the package must import AND an API key must be set.
        try:
            import elevenlabs  # noqa: F401
        except ImportError:
            pass
        else:
            import os

            api_key = os.environ.get(
                "SCITEX_AUDIO_ELEVENLABS_API_KEY"
            ) or os.environ.get("ELEVENLABS_API_KEY")
            if api_key:
                found.append(cls.ELEVENLABS)

        return found
class BaseTTS(ABC):
    """Abstract base class for TTS implementations.

    Subclasses provide :meth:`synthesize`, :meth:`get_voices` and the
    :attr:`name` property.  :meth:`speak` and :meth:`to_bytes` are built on
    top of :meth:`synthesize`.  Keyword arguments passed to the constructor
    are stored unchanged in ``self.config``.
    """

    def __init__(self, **kwargs):
        self.config = kwargs

    @abstractmethod
    def synthesize(self, text: str, output_path: str) -> Path:
        """Synthesize text to audio file.

        Args:
            text: Text to convert to speech.
            output_path: Path to save the audio file.

        Returns
        -------
        Path to the generated audio file.
        """
        pass

    @abstractmethod
    def get_voices(self) -> List[dict]:
        """Get available voices for this backend.

        Returns
        -------
        List of voice dictionaries with 'name' and 'id' keys.
        """
        pass

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the backend name."""
        pass

    @property
    def requires_api_key(self) -> bool:
        """Whether this backend requires an API key."""
        return False

    @property
    def requires_internet(self) -> bool:
        """Whether this backend requires internet connection."""
        return False

    def speak(
        self,
        text: str,
        output_path: Optional[str] = None,
        play: bool = True,
        voice: Optional[str] = None,
    ) -> dict:
        """Synthesize and optionally play text.

        When ``output_path`` is not given the audio is written to a
        temporary ``.mp3`` file.  That file is removed before returning,
        except when playback was requested and did not succeed: it is then
        left in place so the path printed by the "no audio player" warning
        stays valid.

        Args:
            text: Text to speak.
            output_path: Optional path to save audio.
            play: Whether to play the audio.
            voice: Optional voice name/id.  When given it is stored in
                ``self.config["voice"]`` and therefore also applies to
                later calls.

        Returns
        -------
        Dict with keys: success (bool), played (bool), play_requested
        (bool) and, only when ``output_path`` was given, path.
        """
        import os
        import tempfile

        # Determine output path; remember whether we own a temp file.
        tmp_path: Optional[str] = None
        if output_path:
            out_path = Path(output_path)
        else:
            fd, tmp_path = tempfile.mkstemp(suffix=".mp3", prefix="scitex_tts_")
            os.close(fd)
            out_path = Path(tmp_path)

        # Set voice if provided
        if voice:
            self.config["voice"] = voice

        played = False
        keep_tmp = False
        try:
            result_path = self.synthesize(text, str(out_path))
            if play:
                played = self._play_audio(result_path)
                # Keep the file only when the user may still need it to
                # play the audio manually.
                keep_tmp = not played
        finally:
            # The temp path is never returned to the caller, so leaving it
            # behind would leak one file per call (also when synthesis or
            # playback raises).
            if tmp_path is not None and not keep_tmp:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass

        result = {"success": True, "played": played, "play_requested": play}
        if output_path:
            result["path"] = result_path
        return result

    def to_bytes(
        self,
        text: str,
        voice: Optional[str] = None,
    ) -> bytes:
        """Synthesize text and return raw audio bytes (MP3).

        Does not play audio — caller is responsible for playback.
        Useful for streaming audio to a browser or returning via HTTP.

        Args:
            text: Text to convert to speech.
            voice: Optional voice name/id.  When given it is stored in
                ``self.config["voice"]`` and therefore also applies to
                later calls.

        Returns
        -------
        MP3 audio bytes.
        """
        import os
        import tempfile

        if voice:
            self.config["voice"] = voice

        fd, tmp_path = tempfile.mkstemp(suffix=".mp3", prefix="scitex_tts_")
        os.close(fd)
        try:
            self.synthesize(text, tmp_path)
            with open(tmp_path, "rb") as f:
                return f.read()
        finally:
            # Best-effort removal of the scratch file.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

    def _play_audio(self, path: Path, runner=None) -> bool:
        """Play audio file using available system player.

        Includes Windows fallback for WSL environments where PulseAudio
        may be unstable.

        Args:
            path: Audio file to play.
            runner: Injectable subprocess runner (testing). A callable with
                the ``subprocess.run`` signature; defaults to the real
                ``subprocess.run`` when ``None``.  It is used for the
                Linux/macOS players only, not for the Windows fallback.

        Returns
        -------
        True if playback succeeded, False otherwise.
        """
        import os

        run = runner if runner is not None else subprocess.run

        # Check if we're in WSL - if so, prefer Windows playback directly
        # to avoid double playback issues with Linux audio hanging
        if os.path.exists("/mnt/c/Windows"):
            if self._play_audio_windows(path):
                return True
            # Fall through to Linux players if Windows playback fails

        players = [
            ["ffplay", "-nodisp", "-autoexit", str(path)],
            ["mpv", "--no-video", str(path)],
            ["aplay", str(path)],
            ["afplay", str(path)],  # macOS
        ]

        for player_cmd in players:
            try:
                run(
                    player_cmd,
                    check=True,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    timeout=30,
                )
                return True
            except subprocess.TimeoutExpired:
                # Audio playback hung - might have played but can't confirm.
                # Do not try further players to avoid playing twice.
                return False
            except (subprocess.CalledProcessError, FileNotFoundError):
                # Player missing or failed: try the next candidate.
                continue

        print(f"Warning: No audio player found. Audio saved to: {path}")
        return False

    def _play_audio_windows(self, path: Path) -> bool:
        """Play audio via Windows PowerShell SoundPlayer (WSL fallback).

        This is useful when WSLg PulseAudio connection is unstable.
        Uses System.Media.SoundPlayer which is headless (no GUI).

        Args:
            path: Path to audio file (in WSL filesystem)

        Returns
        -------
        True if PowerShell ran and exited with status 0, False otherwise
        (not in WSL, ``powershell.exe`` missing, path conversion failed,
        conversion/playback error or timeout).
        """
        import os
        import shutil
        import tempfile

        # Check if we're in WSL
        if not os.path.exists("/mnt/c/Windows"):
            return False

        # Check if powershell.exe is available
        powershell = shutil.which("powershell.exe")
        if not powershell:
            return False

        tmp_wav: Optional[Path] = None
        try:
            # SoundPlayer only supports WAV, so convert if needed
            wav_path = path
            if path.suffix.lower() in (".mp3", ".ogg", ".m4a"):
                try:
                    from pydub import AudioSegment
                except ImportError:
                    # pydub not available, try direct playback anyway
                    pass
                else:
                    fd, tmp_name = tempfile.mkstemp(suffix=".wav", prefix="scitex_")
                    os.close(fd)
                    tmp_wav = Path(tmp_name)
                    audio = AudioSegment.from_file(str(path))
                    audio.export(str(tmp_wav), format="wav")
                    wav_path = tmp_wav

            # Convert WSL path to Windows path
            converted = subprocess.run(
                ["wslpath", "-w", str(wav_path)],
                capture_output=True,
                text=True,
                timeout=5,
            )
            if converted.returncode != 0:
                return False
            windows_path = converted.stdout.strip()

            # Embed the path as a single-quoted PowerShell literal: nothing
            # is expanded inside it, and every quote character PowerShell
            # accepts as a single quote is escaped by doubling.  A
            # double-quoted string would expand `$(...)`, `$var` and
            # backticks taken from the file name.
            quoted = windows_path
            for quote in ("'", "\u2018", "\u2019", "\u201a", "\u201b"):
                quoted = quoted.replace(quote, quote * 2)

            # Play using PowerShell's SoundPlayer (headless, no GUI)
            ps_command = f"""
            $player = New-Object System.Media.SoundPlayer
            $player.SoundLocation = '{quoted}'
            $player.PlaySync()
            """
            completed = subprocess.run(
                [powershell, "-NoProfile", "-Command", ps_command],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=60,
            )
            # A non-zero status means playback failed; report it so the
            # caller can fall back to the Linux players.
            return completed.returncode == 0
        except Exception:
            # Deliberately broad: this is a best-effort fallback and any
            # failure (timeout, OS error, decode error raised by pydub)
            # simply means "could not play through Windows".
            return False
        finally:
            # Clean up temp WAV if created, on every exit path.
            if tmp_wav is not None:
                try:
                    tmp_wav.unlink()
                except OSError:
                    pass
# EOF