espeak-loader broken link fix, invalid pipeline state

2025-09-18 21:39:23 +00:00 · 2025-02-08 20:36:50 -07:00 · 2025-02-08 20:36:50 -07:00 · af0e6dad6e
commit af0e6dad6e
parent 425c7d0eac
5 changed files with 249 additions and 113 deletions
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@ -307,15 +307,15 @@ class TTSService:
                pipeline_lang_code = lang_code if lang_code else voice[:1].lower()
                logger.info(f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in text chunking")

+                # Get pipelines from backend for proper device management
+                try:
                    # Initialize quiet pipeline for text chunking
-                quiet_pipeline = KPipeline(lang_code=pipeline_lang_code, model=False)
-                
-                # Split text into chunks and get initial tokens
                    text_chunks = []
                    current_offset = 0.0  # Track time offset for timestamps
                    
                    logger.debug("Splitting text into chunks...")
-                for result in quiet_pipeline(text):
+                    # Use backend's pipeline management
+                    for result in backend._get_pipeline(pipeline_lang_code)(text):
                        if result.graphemes and result.phonemes:
                            text_chunks.append((result.graphemes, result.phonemes))
                    logger.debug(f"Split text into {len(text_chunks)} chunks")
@ -324,10 +324,8 @@ class TTSService:
                    for chunk_idx, (chunk_text, chunk_phonemes) in enumerate(text_chunks):
                        logger.debug(f"Processing chunk {chunk_idx + 1}/{len(text_chunks)}: '{chunk_text[:50]}...'")
                        
-                    # Create a new pipeline with the lang_code
-                    generation_pipeline = KPipeline(lang_code=pipeline_lang_code, model=backend._model)
-                    logger.info(f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in generation pipeline")
-                    for result in generation_pipeline(
+                        # Use backend's pipeline for generation
+                        for result in backend._get_pipeline(pipeline_lang_code)(
                            chunk_text,
                            voice=voice_path,
                            speed=speed
@ -418,6 +416,10 @@ class TTSService:
                                    logger.warning(f"Error processing token: {e}")
                                    continue

+                except Exception as e:
+                    logger.error(f"Failed to process text with pipeline: {e}")
+                    raise RuntimeError(f"Pipeline processing failed: {e}")
+
                if not chunks:
                    raise ValueError("No audio chunks were generated successfully")

@ -512,9 +514,9 @@ class TTSService:
                pipeline_lang_code = lang_code if lang_code else voice[:1].lower()
                logger.info(f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in phoneme pipeline")
                
-                # Create a new pipeline with the lang_code
-                phoneme_pipeline = KPipeline(lang_code=pipeline_lang_code, model=backend._model)
-                for r in phoneme_pipeline.generate_from_tokens(
+                try:
+                    # Use backend's pipeline management
+                    for r in backend._get_pipeline(pipeline_lang_code).generate_from_tokens(
                        tokens=phonemes,  # Pass raw phonemes string
                        voice=voice_path,
                        speed=speed
@ -522,6 +524,9 @@ class TTSService:
                        if r.audio is not None:
                            result = r
                            break
+                except Exception as e:
+                    logger.error(f"Failed to generate from phonemes: {e}")
+                    raise RuntimeError(f"Phoneme generation failed: {e}")
                
                if result is None or result.audio is None:
                    raise ValueError("No audio generated")
--- a/docker/cpu/Dockerfile
+++ b/docker/cpu/Dockerfile
@ -48,7 +48,8 @@ ENV PYTHONUNBUFFERED=1 \
    UV_LINK_MODE=copy \
    USE_GPU=false \
    PHONEMIZER_ESPEAK_PATH=/usr/bin \
-    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data
+    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
+    ESPEAK_DATA_PATH=/usr/share/espeak-ng-data

 ENV DOWNLOAD_MODEL=true
 # Download model if enabled
--- a/docker/gpu/Dockerfile
+++ b/docker/gpu/Dockerfile
@ -50,7 +50,8 @@ ENV PYTHONUNBUFFERED=1 \
    UV_LINK_MODE=copy \
    USE_GPU=true \
    PHONEMIZER_ESPEAK_PATH=/usr/bin \
-    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data
+    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
+    ESPEAK_DATA_PATH=/usr/share/espeak-ng-data
    
 ENV DOWNLOAD_MODEL=true
 # Download model if enabled
--- a/docs/architecture/espeak_setup_fix.md
+++ b/docs/architecture/espeak_setup_fix.md
@ -0,0 +1,128 @@
+# ESpeak-NG Setup Fix
+
+## Issue Description
+
+Users are reporting two distinct errors:
+
+1. Missing espeak-ng-data/phontab file:
+```
+Error processing file '/home/runner/work/espeakng-loader/espeakng-loader/espeak-ng/_dynamic/share/espeak-ng-data/phontab': No such file or directory.
+```
+
+2. Invalid pipeline state:
+```
+Error generating speech: The object is in an invalid state.
+```
+
+## Root Cause Analysis
+
+### 1. ESpeak-NG Data Issue
+
+The dependency chain has changed:
+```
+Before:
+kokoro-fastapi (phonemizer 3.3.0) -> kokoro -> misaki -> phonemizer
+
+After:
+kokoro-fastapi -> kokoro -> misaki -> phonemizer-fork + espeakng-loader
+```
+
+The issue arises because:
+1. misaki now uses espeakng-loader to manage espeak paths
+2. espeakng-loader looks for data in its package directory
+3. We have a direct dependency on phonemizer 3.3.0 that conflicts with the phonemizer-fork that misaki now pulls in
+
+### 2. Pipeline State Issue
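+The "invalid state" error occurs due to a device mismatch in pipeline creation: `tts_service.py` constructed new `KPipeline` instances directly instead of reusing the pipelines managed by the backend.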
+
+## Solution
+
+### 1. For ESpeak-NG Data
+
+Update dependencies and environment:
+
+1. Remove direct phonemizer dependency:
+```diff
+- "phonemizer==3.3.0",  # Remove this
+```
+
+2. Let misaki handle phonemizer-fork and espeakng-loader
+
+3. Add the `ESPEAK_DATA_PATH` environment variable (the last line below) to the Dockerfile:
+```dockerfile
+ENV PHONEMIZER_ESPEAK_PATH=/usr/bin \
+    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
+    ESPEAK_DATA_PATH=/usr/share/espeak-ng-data
+```
+
+This approach:
+- Works with misaki's new dependencies
+- Maintains our working espeak setup
+- Avoids complex file copying or path manipulation
+
+### 2. For Pipeline State
+
+Use kokoro_v1's pipeline management:
+```python
+# Instead of creating pipelines directly:
+# pipeline = KPipeline(...)
+
+# Use backend's pipeline management:
+pipeline = backend._get_pipeline(pipeline_lang_code)
+```
+
+## Implementation Steps
+
+1. Update pyproject.toml:
+   - Remove direct phonemizer dependency
+   - Keep misaki dependency as is
+
+2. Update Dockerfiles:
+   - Add ESPEAK_DATA_PATH environment variable
+   - Keep existing espeak-ng setup
+
+3. Update tts_service.py:
+   - Use backend's pipeline management
+   - Add proper error handling
+
+## Testing
+
+1. Test espeak-ng functionality:
+   ```bash
+   # Verify environment variables
+   echo $ESPEAK_DATA_PATH
+   echo $PHONEMIZER_ESPEAK_DATA
+   
+   # Check data directory
+   ls /usr/share/espeak-ng-data
+   ```
+
+2. Test pipeline state:
+   - Test on both CPU and GPU
+   - Verify no invalid state errors
+   - Test with different voice models
+
+## Success Criteria
+
+1. No espeak-ng-data/phontab file errors
+2. No invalid state errors
+3. Consistent behavior across platforms
+4. Successful CI/CD pipeline runs
+
+## Future Considerations
+
+1. Potential PR to misaki:
+   - Add fallback mechanism if espeakng-loader fails
+   - Make path configuration more flexible
+   - Add better error messages
+
+2. Environment Variable Documentation:
+   - Document ESPEAK_DATA_PATH requirement
+   - Explain interaction with espeakng-loader
+   - Provide platform-specific setup instructions
+
+## Notes
+
+- This solution works with misaki's new dependencies while maintaining our setup
+- Environment variable approach is simpler than file copying
+- May want to contribute improvements back to misaki later
--- a/web/index.html
+++ b/web/index.html
@ -101,6 +101,7 @@
                            <option value="e">Spanish</option>
                            <option value="a">English</option>
                            <option value="f">French</option>
+                            <option value="h">Hindi</option>
                            <option value="i">Italian</option>
                            <option value="p">Portuguese</option>
                            <option value="j">Japanese</option>