espeak-loader broken link fix, invalid pipeline state

This commit is contained in:
remsky 2025-02-08 20:36:50 -07:00
parent 425c7d0eac
commit af0e6dad6e
5 changed files with 249 additions and 113 deletions


@@ -307,116 +307,118 @@ class TTSService:
         pipeline_lang_code = lang_code if lang_code else voice[:1].lower()
         logger.info(f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in text chunking")

-        # Initialize quiet pipeline for text chunking
-        quiet_pipeline = KPipeline(lang_code=pipeline_lang_code, model=False)
-
-        # Split text into chunks and get initial tokens
-        text_chunks = []
-        current_offset = 0.0  # Track time offset for timestamps
-        logger.debug("Splitting text into chunks...")
-        for result in quiet_pipeline(text):
-            if result.graphemes and result.phonemes:
-                text_chunks.append((result.graphemes, result.phonemes))
-        logger.debug(f"Split text into {len(text_chunks)} chunks")
-
-        # Process each chunk
-        for chunk_idx, (chunk_text, chunk_phonemes) in enumerate(text_chunks):
-            logger.debug(f"Processing chunk {chunk_idx + 1}/{len(text_chunks)}: '{chunk_text[:50]}...'")
-
-            # Create a new pipeline with the lang_code
-            generation_pipeline = KPipeline(lang_code=pipeline_lang_code, model=backend._model)
-            logger.info(f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in generation pipeline")
-            for result in generation_pipeline(
-                chunk_text,
-                voice=voice_path,
-                speed=speed
-            ):
+        # Get pipelines from backend for proper device management
+        try:
+            # Initialize quiet pipeline for text chunking
+            text_chunks = []
+            current_offset = 0.0  # Track time offset for timestamps
+
+            logger.debug("Splitting text into chunks...")
+            # Use backend's pipeline management
+            for result in backend._get_pipeline(pipeline_lang_code)(text):
+                if result.graphemes and result.phonemes:
+                    text_chunks.append((result.graphemes, result.phonemes))
+            logger.debug(f"Split text into {len(text_chunks)} chunks")
+
+            # Process each chunk
+            for chunk_idx, (chunk_text, chunk_phonemes) in enumerate(text_chunks):
+                logger.debug(f"Processing chunk {chunk_idx + 1}/{len(text_chunks)}: '{chunk_text[:50]}...'")
+
+                # Use backend's pipeline for generation
+                for result in backend._get_pipeline(pipeline_lang_code)(
+                    chunk_text,
+                    voice=voice_path,
+                    speed=speed
+                ):
                     # Collect audio chunks
                     if result.audio is not None:
                         chunks.append(result.audio.numpy())

                     # Process timestamps for this chunk
                     if return_timestamps and hasattr(result, 'tokens') and result.tokens:
                         logger.debug(f"Processing chunk timestamps with {len(result.tokens)} tokens")
                         if result.pred_dur is not None:
                             try:
                                 # Join timestamps for this chunk's tokens
                                 KPipeline.join_timestamps(result.tokens, result.pred_dur)

                                 # Add timestamps with offset
                                 for token in result.tokens:
                                     if not all(hasattr(token, attr) for attr in ['text', 'start_ts', 'end_ts']):
                                         continue
                                     if not token.text or not token.text.strip():
                                         continue

                                     # Apply offset to timestamps
                                     start_time = float(token.start_ts) + current_offset
                                     end_time = float(token.end_ts) + current_offset

                                     word_timestamps.append({
                                         'word': str(token.text).strip(),
                                         'start_time': start_time,
                                         'end_time': end_time
                                     })
                                     logger.debug(f"Added timestamp for word '{token.text}': {start_time:.3f}s - {end_time:.3f}s")

                                 # Update offset for next chunk based on pred_dur
                                 chunk_duration = float(result.pred_dur.sum()) / 80  # Convert frames to seconds
                                 current_offset = max(current_offset + chunk_duration, end_time)
                                 logger.debug(f"Updated time offset to {current_offset:.3f}s")
                             except Exception as e:
                                 logger.error(f"Failed to process timestamps for chunk: {e}")

                             logger.debug(f"Processing timestamps with pred_dur shape: {result.pred_dur.shape}")
                             try:
                                 # Join timestamps for this chunk's tokens
                                 KPipeline.join_timestamps(result.tokens, result.pred_dur)
                                 logger.debug("Successfully joined timestamps for chunk")
                             except Exception as e:
                                 logger.error(f"Failed to join timestamps for chunk: {e}")
                                 continue

                         # Convert tokens to timestamps
                         for token in result.tokens:
                             try:
                                 # Skip tokens without required attributes
                                 if not all(hasattr(token, attr) for attr in ['text', 'start_ts', 'end_ts']):
                                     logger.debug(f"Skipping token missing attributes: {dir(token)}")
                                     continue

                                 # Get and validate text
                                 text = str(token.text).strip() if token.text is not None else ''
                                 if not text:
                                     logger.debug("Skipping empty token")
                                     continue

                                 # Get and validate timestamps
                                 start_ts = getattr(token, 'start_ts', None)
                                 end_ts = getattr(token, 'end_ts', None)
                                 if start_ts is None or end_ts is None:
                                     logger.debug(f"Skipping token with None timestamps: {text}")
                                     continue

                                 # Convert timestamps to float
                                 try:
                                     start_time = float(start_ts)
                                     end_time = float(end_ts)
                                 except (TypeError, ValueError):
                                     logger.debug(f"Skipping token with invalid timestamps: {text}")
                                     continue

                                 # Add timestamp
                                 word_timestamps.append({
                                     'word': text,
                                     'start_time': start_time,
                                     'end_time': end_time
                                 })
                                 logger.debug(f"Added timestamp for word '{text}': {start_time:.3f}s - {end_time:.3f}s")
                             except Exception as e:
                                 logger.warning(f"Error processing token: {e}")
                                 continue
+        except Exception as e:
+            logger.error(f"Failed to process text with pipeline: {e}")
+            raise RuntimeError(f"Pipeline processing failed: {e}")

         if not chunks:
             raise ValueError("No audio chunks were generated successfully")
@@ -512,16 +514,19 @@ class TTSService:
         pipeline_lang_code = lang_code if lang_code else voice[:1].lower()
         logger.info(f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in phoneme pipeline")

-        # Create a new pipeline with the lang_code
-        phoneme_pipeline = KPipeline(lang_code=pipeline_lang_code, model=backend._model)
-        for r in phoneme_pipeline.generate_from_tokens(
-            tokens=phonemes,  # Pass raw phonemes string
-            voice=voice_path,
-            speed=speed
-        ):
-            if r.audio is not None:
-                result = r
-                break
+        try:
+            # Use backend's pipeline management
+            for r in backend._get_pipeline(pipeline_lang_code).generate_from_tokens(
+                tokens=phonemes,  # Pass raw phonemes string
+                voice=voice_path,
+                speed=speed
+            ):
+                if r.audio is not None:
+                    result = r
+                    break
+        except Exception as e:
+            logger.error(f"Failed to generate from phonemes: {e}")
+            raise RuntimeError(f"Phoneme generation failed: {e}")

         if result is None or result.audio is None:
             raise ValueError("No audio generated")


@@ -48,7 +48,8 @@ ENV PYTHONUNBUFFERED=1 \
     UV_LINK_MODE=copy \
     USE_GPU=false \
     PHONEMIZER_ESPEAK_PATH=/usr/bin \
-    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data
+    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
+    ESPEAK_DATA_PATH=/usr/share/espeak-ng-data

 ENV DOWNLOAD_MODEL=true
 # Download model if enabled


@@ -50,7 +50,8 @@ ENV PYTHONUNBUFFERED=1 \
     UV_LINK_MODE=copy \
     USE_GPU=true \
     PHONEMIZER_ESPEAK_PATH=/usr/bin \
-    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data
+    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
+    ESPEAK_DATA_PATH=/usr/share/espeak-ng-data

 ENV DOWNLOAD_MODEL=true
 # Download model if enabled


@@ -0,0 +1,128 @@
# ESpeak-NG Setup Fix
## Issue Description
Users are reporting two distinct errors:
1. Missing espeak-ng-data/phontab file:
```
Error processing file '/home/runner/work/espeakng-loader/espeakng-loader/espeak-ng/_dynamic/share/espeak-ng-data/phontab': No such file or directory.
```
2. Invalid pipeline state:
```
Error generating speech: The object is in an invalid state.
```
## Root Cause Analysis
### 1. ESpeak-NG Data Issue
The dependency chain has changed:
```
Before:
kokoro-fastapi (phonemizer 3.3.0) -> kokoro -> misaki -> phonemizer
After:
kokoro-fastapi -> kokoro -> misaki -> phonemizer-fork + espeakng-loader
```
The issue arises because:
1. misaki now uses espeakng-loader to manage espeak paths
2. espeakng-loader looks for espeak data inside its own package directory, which is where the missing phontab error comes from
3. Our direct dependency on phonemizer 3.3.0 conflicts with the phonemizer-fork that misaki now pulls in
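For reference, the way misaki wires espeakng-loader into phonemizer-fork is roughly the following (illustrative sketch; the exact calls are assumptions based on the two packages' current interfaces, not verbatim misaki code):
```python
# Rough shape of misaki's espeak setup (assumed, for illustration)
import espeakng_loader
from phonemizer.backend.espeak.wrapper import EspeakWrapper

# espeakng-loader resolves the library and data bundled inside its own
# package directory -- if that data is missing, phonemizer later fails
# with the "phontab: No such file or directory" error shown above.
EspeakWrapper.set_library(espeakng_loader.get_library_path())
EspeakWrapper.set_data_path(espeakng_loader.get_data_path())
```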
### 2. Pipeline State Issue
The "invalid state" error occurs due to device mismatch in pipeline creation.
## Solution
### 1. For ESpeak-NG Data
Update dependencies and environment:
1. Remove direct phonemizer dependency:
```diff
- "phonemizer==3.3.0", # Remove this
```
2. Let misaki handle phonemizer-fork and espeakng-loader
3. Set the new `ESPEAK_DATA_PATH` environment variable in the Dockerfile (an inline `# Add this` comment would be parsed as part of the value, so it is omitted here):
```dockerfile
ENV PHONEMIZER_ESPEAK_PATH=/usr/bin \
    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
    ESPEAK_DATA_PATH=/usr/share/espeak-ng-data
```
This approach:
- Works with misaki's new dependencies
- Maintains our working espeak setup
- Avoids complex file copying or path manipulation
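A small startup check can also make a bad path fail loudly instead of surfacing later as the phontab error. This helper is hypothetical (not part of the current codebase), using only the standard library:
```python
import os
from pathlib import Path

def check_espeak_data() -> None:
    """Fail fast if espeak-ng data is not where ESPEAK_DATA_PATH points."""
    data_dir = os.environ.get("ESPEAK_DATA_PATH", "/usr/share/espeak-ng-data")
    # phontab is the first file espeak-ng tries to load, per the reported error
    if not (Path(data_dir) / "phontab").exists():
        raise RuntimeError(
            f"espeak-ng data not found in {data_dir}; "
            "check the ESPEAK_DATA_PATH environment variable"
        )
```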
### 2. For Pipeline State
Use kokoro_v1's pipeline management:
```python
# Instead of creating pipelines directly:
# pipeline = KPipeline(...)
# Use backend's pipeline management:
pipeline = backend._get_pipeline(pipeline_lang_code)
```
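For illustration, the backend-side management can be as simple as caching one pipeline per language, always bound to the already-loaded model so every pipeline shares that model's device. This sketch is an assumption about what kokoro_v1 does, not its actual code:
```python
from kokoro import KPipeline

class KokoroV1:
    """Sketch of backend pipeline management (assumed, for illustration)."""

    def __init__(self, model):
        self._model = model  # loaded once, on the chosen device
        self._pipelines: dict[str, KPipeline] = {}

    def _get_pipeline(self, lang_code: str) -> KPipeline:
        # One pipeline per language, all sharing the backend's model,
        # so no request ever builds a pipeline on the wrong device.
        if lang_code not in self._pipelines:
            self._pipelines[lang_code] = KPipeline(
                lang_code=lang_code, model=self._model
            )
        return self._pipelines[lang_code]
```
Because pipelines are reused, repeated requests also skip the cost of constructing a new KPipeline per call.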
## Implementation Steps
1. Update pyproject.toml:
- Remove direct phonemizer dependency
- Keep misaki dependency as is
2. Update Dockerfiles:
- Add ESPEAK_DATA_PATH environment variable
- Keep existing espeak-ng setup
3. Update tts_service.py:
- Use backend's pipeline management
- Add proper error handling
## Testing
1. Test espeak-ng functionality (see also the smoke test after this list):
```bash
# Verify environment variables
echo $ESPEAK_DATA_PATH
echo $PHONEMIZER_ESPEAK_DATA
# Check data directory
ls /usr/share/espeak-ng-data
```
2. Test pipeline state:
- Test on both CPU and GPU
- Verify no invalid state errors
- Test with different voice models
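For the espeak-ng check in item 1, a short Python smoke test inside the container exercises the same path the service uses; it relies only on phonemizer's public `phonemize` function:
```python
# smoke_test_espeak.py -- fails with the phontab error if data is missing
import os

from phonemizer import phonemize

# These should match the Dockerfile ENV block above
print("ESPEAK_DATA_PATH:", os.environ.get("ESPEAK_DATA_PATH"))
print("PHONEMIZER_ESPEAK_DATA:", os.environ.get("PHONEMIZER_ESPEAK_DATA"))

# Any successful output means espeak-ng and its data were found
print(phonemize("hello world", language="en-us", backend="espeak"))
```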
## Success Criteria
1. No espeak-ng-data/phontab file errors
2. No invalid state errors
3. Consistent behavior across platforms
4. Successful CI/CD pipeline runs
## Future Considerations
1. Potential PR to misaki:
- Add a fallback mechanism if espeakng-loader fails (see the sketch after this list)
- Make path configuration more flexible
- Add better error messages
2. Environment Variable Documentation:
- Document ESPEAK_DATA_PATH requirement
- Explain interaction with espeakng-loader
- Provide platform-specific setup instructions
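The fallback in item 1 could take roughly this shape (hypothetical; `get_data_path` is assumed from espeakng-loader's interface):
```python
import os
from pathlib import Path

def resolve_espeak_data() -> str:
    """Prefer espeakng-loader's bundled data, else the system install."""
    try:
        import espeakng_loader
        data = Path(espeakng_loader.get_data_path())
        if (data / "phontab").exists():
            return str(data)
    except Exception:
        pass  # loader missing or its data incomplete
    # Fall back to the environment/system installation
    return os.environ.get("ESPEAK_DATA_PATH", "/usr/share/espeak-ng-data")
```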
## Notes
- This solution works with misaki's new dependencies while maintaining our setup
- Environment variable approach is simpler than file copying
- May want to contribute improvements back to misaki later


@@ -101,6 +101,7 @@
         <option value="e">Spanish</option>
         <option value="a">English</option>
         <option value="f">French</option>
+        <option value="h">Hindi</option>
         <option value="i">Italian</option>
         <option value="p">Portuguese</option>
         <option value="j">Japanese</option>