#!/usr/bin/env python3
# Timestamp: "2025-12-11 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex-code/src/scitex/audio/engines/base.py
# ----------------------------------------
"""
Base TTS class defining the common interface for all TTS backends.
"""
from __future__ import annotations
import subprocess
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, Optional
__all__ = ["BaseTTS", "TTSBackend"]
class TTSBackend:
    """Namespace of string identifiers for the known TTS backends.

    Behaves like a lightweight enum: each attribute is the plain string used
    to refer to a backend.
    """

    ELEVENLABS = "elevenlabs"
    GTTS = "gtts"
    PYTTSX3 = "pyttsx3"
    LUXTTS = "luxtts"
    EDGE = "edge"  # Future: edge-tts

    @classmethod
    def available(cls) -> List[str]:
        """Return the backends that can be used in the current environment.

        Only gTTS, pyttsx3 and ElevenLabs are probed, in that order. A backend
        is reported when its package imports successfully; ElevenLabs also
        needs a non-empty API key in the environment.

        Returns
        -------
        List of backend identifier strings (possibly empty).
        """
        import os

        usable: List[str] = []

        # gTTS: usable whenever the package imports (it needs internet at
        # synthesis time, which is not checked here).
        try:
            import gtts  # noqa: F401
        except ImportError:
            pass
        else:
            usable.append(cls.GTTS)

        # pyttsx3: usable whenever the package imports.
        try:
            import pyttsx3  # noqa: F401
        except ImportError:
            pass
        else:
            usable.append(cls.PYTTSX3)

        # ElevenLabs: the package must import AND one of the API-key
        # environment variables must hold a non-empty value.
        try:
            import elevenlabs  # noqa: F401
        except ImportError:
            pass
        else:
            key_names = ("SCITEX_AUDIO_ELEVENLABS_API_KEY", "ELEVENLABS_API_KEY")
            if any(os.environ.get(key) for key in key_names):
                usable.append(cls.ELEVENLABS)

        return usable
class BaseTTS(ABC):
    """Abstract base class for TTS implementations.

    Concrete backends implement :meth:`synthesize`, :meth:`get_voices` and
    the :attr:`name` property. This class supplies the shared workflow on top
    of them: :meth:`speak` (synthesize, then optionally play), :meth:`to_bytes`
    (synthesize and return the file contents) and system-player playback.
    """

    # Characters PowerShell accepts as single-quote delimiters (ASCII
    # apostrophe plus the Unicode "smart" variants). Each must be doubled to
    # appear literally inside a single-quoted PowerShell string.
    _PS_SINGLE_QUOTES = "'\u2018\u2019\u201a\u201b"

    def __init__(self, **kwargs):
        # Keyword options are stored verbatim. ``speak`` and ``to_bytes``
        # write the ``"voice"`` key here when a voice is passed to them.
        self.config = kwargs

    @abstractmethod
    def synthesize(self, text: str, output_path: str) -> Path:
        """Synthesize text to audio file.

        Parameters
        ----------
        text
            Text to convert to speech.
        output_path
            Path to save the audio file.

        Returns
        -------
        Path to the generated audio file.
        """
        pass

    @abstractmethod
    def get_voices(self) -> List[dict]:
        """Get available voices for this backend.

        Returns
        -------
        List of voice dictionaries with 'name' and 'id' keys.
        """
        pass

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the backend name."""
        pass

    @property
    def requires_api_key(self) -> bool:
        """Whether this backend requires an API key."""
        return False

    @property
    def requires_internet(self) -> bool:
        """Whether this backend requires internet connection."""
        return False

    def speak(
        self,
        text: str,
        output_path: Optional[str] = None,
        play: bool = True,
        voice: Optional[str] = None,
    ) -> dict:
        """Synthesize and optionally play text.

        When no ``output_path`` is given the audio is written to a temporary
        ``.mp3`` file. That file is removed before returning, except when
        playback was requested but did not succeed: it is then left on disk
        so the audio can still be played manually.

        Parameters
        ----------
        text
            Text to speak.
        output_path
            Optional path to save audio.
        play
            Whether to play the audio.
        voice
            Optional voice name/id. When given it is stored in
            ``self.config["voice"]`` and therefore stays in effect for later
            calls as well.

        Returns
        -------
        Dict with keys ``success`` (always True), ``played`` (result of the
        playback attempt, False when ``play`` is False), ``play_requested``
        (the ``play`` argument) and, only when ``output_path`` was given,
        ``path`` (the value returned by :meth:`synthesize`).
        """
        import os
        import tempfile

        # Determine output path; an empty/None ``output_path`` means "use a
        # throwaway temp file that this method owns".
        owns_temp = not output_path
        if output_path:
            out_path = Path(output_path)
        else:
            fd, tmp_path = tempfile.mkstemp(suffix=".mp3", prefix="scitex_tts_")
            os.close(fd)
            out_path = Path(tmp_path)

        # Set voice if provided
        if voice:
            self.config["voice"] = voice

        played = False
        keep_temp = False
        try:
            result_path = self.synthesize(text, str(out_path))
            if play:
                played = self._play_audio(result_path)
            # Keep the temp file only when the caller wanted to hear it and
            # playback failed (the "no player found" warning points at it).
            keep_temp = bool(play) and not played
        finally:
            if owns_temp and not keep_temp:
                self._discard_file(out_path)

        result = {"success": True, "played": played, "play_requested": play}
        if output_path:
            result["path"] = result_path
        return result

    def to_bytes(
        self,
        text: str,
        voice: Optional[str] = None,
    ) -> bytes:
        """Synthesize text and return raw audio bytes (MP3).

        Does not play audio — caller is responsible for playback.
        Useful for streaming audio to a browser or returning via HTTP.

        Parameters
        ----------
        text
            Text to convert to speech.
        voice
            Optional voice name/id. When given it is stored in
            ``self.config["voice"]`` and stays in effect for later calls.

        Returns
        -------
        Contents of the temporary ``.mp3`` file passed to :meth:`synthesize`.
        """
        import os
        import tempfile

        if voice:
            self.config["voice"] = voice

        fd, tmp_path = tempfile.mkstemp(suffix=".mp3", prefix="scitex_tts_")
        os.close(fd)
        try:
            self.synthesize(text, tmp_path)
            return Path(tmp_path).read_bytes()
        finally:
            # The temp file is removed whether or not synthesis succeeded.
            self._discard_file(tmp_path)

    @staticmethod
    def _discard_file(path) -> None:
        """Delete *path*, ignoring filesystem errors (best-effort cleanup)."""
        import os

        try:
            os.unlink(path)
        except OSError:
            pass

    @classmethod
    def _powershell_quote(cls, value: str) -> str:
        """Return *value* as a single-quoted PowerShell string literal.

        Single-quoted PowerShell strings perform no ``$``/backtick expansion;
        the only characters needing escape are the quote characters, which
        are doubled.
        """
        escaped = "".join(
            ch * 2 if ch in cls._PS_SINGLE_QUOTES else ch for ch in value
        )
        return f"'{escaped}'"

    def _play_audio(self, path: Path, runner=None) -> bool:
        """Play audio file using available system player.

        Includes Windows fallback for WSL environments where PulseAudio
        may be unstable.

        Parameters
        ----------
        path
            Audio file to play.
        runner
            Injectable subprocess runner (testing). A callable with the
            ``subprocess.run`` signature; defaults to the real
            ``subprocess.run`` when ``None``. When supplied it is also used
            for the WSL/Windows playback attempt.

        Returns
        -------
        True if playback succeeded, False otherwise.
        """
        import os

        run = runner if runner is not None else subprocess.run

        # Check if we're in WSL - if so, prefer Windows playback directly
        # to avoid double playback issues with Linux audio hanging
        if os.path.exists("/mnt/c/Windows"):
            # Forward the runner only when one was injected so the default
            # call stays ``_play_audio_windows(path)``.
            extra = {} if runner is None else {"runner": runner}
            if self._play_audio_windows(path, **extra):
                return True
            # Fall through to Linux players if Windows playback fails

        players = [
            ["ffplay", "-nodisp", "-autoexit", str(path)],
            ["mpv", "--no-video", str(path)],
            ["aplay", str(path)],
            ["afplay", str(path)],  # macOS
        ]
        for player_cmd in players:
            try:
                run(
                    player_cmd,
                    check=True,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    timeout=30,
                )
            except subprocess.TimeoutExpired:
                # Audio playback hung - might have played but can't confirm
                return False
            except (subprocess.CalledProcessError, OSError):
                # Player missing, not executable, or exited non-zero:
                # try the next candidate.
                continue
            return True

        print(f"Warning: No audio player found. Audio saved to: {path}")
        return False

    def _play_audio_windows(self, path: Path, runner=None) -> bool:
        """Play audio via Windows PowerShell SoundPlayer (WSL fallback).

        This is useful when WSLg PulseAudio connection is unstable.
        Uses System.Media.SoundPlayer which is headless (no GUI).

        Parameters
        ----------
        path
            Path to audio file (in WSL filesystem).
        runner
            Optional callable with the ``subprocess.run`` signature used for
            the ``wslpath`` and PowerShell invocations; defaults to
            ``subprocess.run``.

        Returns
        -------
        True if PowerShell ran the playback command and exited with status
        0, False otherwise (not in WSL, ``powershell.exe`` missing, path
        conversion failed, timeout, or any other error).
        """
        import os
        import shutil
        import tempfile

        # Check if we're in WSL
        if not os.path.exists("/mnt/c/Windows"):
            return False

        # Check if powershell.exe is available
        powershell = shutil.which("powershell.exe")
        if not powershell:
            return False

        run = runner if runner is not None else subprocess.run
        temp_wav = None  # set only when a converted WAV has been created
        try:
            path = Path(path)

            # SoundPlayer only supports WAV, so convert if needed
            wav_path = path
            if path.suffix.lower() in (".mp3", ".ogg", ".m4a"):
                try:
                    from pydub import AudioSegment
                except ImportError:
                    # pydub not available, try direct playback anyway
                    pass
                else:
                    fd, tmp_wav = tempfile.mkstemp(suffix=".wav", prefix="scitex_")
                    os.close(fd)
                    temp_wav = Path(tmp_wav)
                    audio = AudioSegment.from_file(str(path))
                    audio.export(str(temp_wav), format="wav")
                    wav_path = temp_wav

            # Convert WSL path to Windows path
            result = run(
                ["wslpath", "-w", str(wav_path)],
                capture_output=True,
                text=True,
                timeout=5,
            )
            if result.returncode != 0:
                return False
            windows_path = result.stdout.strip()

            # Play using PowerShell's SoundPlayer (headless, no GUI). The
            # path is embedded as a single-quoted literal so characters such
            # as ``$`` or backticks in file names are not interpreted.
            ps_command = (
                "$ErrorActionPreference = 'Stop'\n"
                "$player = New-Object System.Media.SoundPlayer\n"
                f"$player.SoundLocation = {self._powershell_quote(windows_path)}\n"
                "$player.PlaySync()\n"
            )
            # check=True turns a failing PowerShell run into an exception so
            # the caller can fall back to another player.
            run(
                [powershell, "-NoProfile", "-Command", ps_command],
                check=True,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=60,
            )
            return True
        except Exception:
            # Best-effort fallback path: any failure (timeout, non-zero exit,
            # conversion error, ...) is reported as "did not play".
            return False
        finally:
            # Clean up temp WAV if created, on every exit path
            if temp_wav is not None:
                self._discard_file(temp_wav)
# EOF