From af0e6dad6e07a69a0ec399684177fa38ae074e22 Mon Sep 17 00:00:00 2001 From: remsky Date: Sat, 8 Feb 2025 20:36:50 -0700 Subject: [PATCH] espeak-loader broken link fix, invalid pipeline state --- api/src/services/tts_service.py | 227 +++++++++++++------------- docker/cpu/Dockerfile | 3 +- docker/gpu/Dockerfile | 3 +- docs/architecture/espeak_setup_fix.md | 128 +++++++++++++++ web/index.html | 1 + 5 files changed, 249 insertions(+), 113 deletions(-) create mode 100644 docs/architecture/espeak_setup_fix.md diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py index dcd9535..74b20a2 100644 --- a/api/src/services/tts_service.py +++ b/api/src/services/tts_service.py @@ -307,116 +307,118 @@ class TTSService: pipeline_lang_code = lang_code if lang_code else voice[:1].lower() logger.info(f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in text chunking") - # Initialize quiet pipeline for text chunking - quiet_pipeline = KPipeline(lang_code=pipeline_lang_code, model=False) - - # Split text into chunks and get initial tokens - text_chunks = [] - current_offset = 0.0 # Track time offset for timestamps - - logger.debug("Splitting text into chunks...") - for result in quiet_pipeline(text): - if result.graphemes and result.phonemes: - text_chunks.append((result.graphemes, result.phonemes)) - logger.debug(f"Split text into {len(text_chunks)} chunks") - - # Process each chunk - for chunk_idx, (chunk_text, chunk_phonemes) in enumerate(text_chunks): - logger.debug(f"Processing chunk {chunk_idx + 1}/{len(text_chunks)}: '{chunk_text[:50]}...'") + # Get pipelines from backend for proper device management + try: + # Initialize quiet pipeline for text chunking + text_chunks = [] + current_offset = 0.0 # Track time offset for timestamps - # Create a new pipeline with the lang_code - generation_pipeline = KPipeline(lang_code=pipeline_lang_code, model=backend._model) - logger.info(f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in 
generation pipeline") - for result in generation_pipeline( - chunk_text, - voice=voice_path, - speed=speed - ): - # Collect audio chunks - if result.audio is not None: - chunks.append(result.audio.numpy()) + logger.debug("Splitting text into chunks...") + # Use backend's pipeline management + for result in backend._get_pipeline(pipeline_lang_code)(text): + if result.graphemes and result.phonemes: + text_chunks.append((result.graphemes, result.phonemes)) + logger.debug(f"Split text into {len(text_chunks)} chunks") + + # Process each chunk + for chunk_idx, (chunk_text, chunk_phonemes) in enumerate(text_chunks): + logger.debug(f"Processing chunk {chunk_idx + 1}/{len(text_chunks)}: '{chunk_text[:50]}...'") - # Process timestamps for this chunk - if return_timestamps and hasattr(result, 'tokens') and result.tokens: - logger.debug(f"Processing chunk timestamps with {len(result.tokens)} tokens") - if result.pred_dur is not None: + # Use backend's pipeline for generation + for result in backend._get_pipeline(pipeline_lang_code)( + chunk_text, + voice=voice_path, + speed=speed + ): + # Collect audio chunks + if result.audio is not None: + chunks.append(result.audio.numpy()) + + # Process timestamps for this chunk + if return_timestamps and hasattr(result, 'tokens') and result.tokens: + logger.debug(f"Processing chunk timestamps with {len(result.tokens)} tokens") + if result.pred_dur is not None: + try: + # Join timestamps for this chunk's tokens + KPipeline.join_timestamps(result.tokens, result.pred_dur) + + # Add timestamps with offset + for token in result.tokens: + if not all(hasattr(token, attr) for attr in ['text', 'start_ts', 'end_ts']): + continue + if not token.text or not token.text.strip(): + continue + + # Apply offset to timestamps + start_time = float(token.start_ts) + current_offset + end_time = float(token.end_ts) + current_offset + + word_timestamps.append({ + 'word': str(token.text).strip(), + 'start_time': start_time, + 'end_time': end_time + }) + 
logger.debug(f"Added timestamp for word '{token.text}': {start_time:.3f}s - {end_time:.3f}s") + + # Update offset for next chunk based on pred_dur + chunk_duration = float(result.pred_dur.sum()) / 80 # Convert frames to seconds + current_offset = max(current_offset + chunk_duration, end_time) + logger.debug(f"Updated time offset to {current_offset:.3f}s") + + except Exception as e: + logger.error(f"Failed to process timestamps for chunk: {e}") + logger.debug(f"Processing timestamps with pred_dur shape: {result.pred_dur.shape}") try: # Join timestamps for this chunk's tokens KPipeline.join_timestamps(result.tokens, result.pred_dur) - - # Add timestamps with offset - for token in result.tokens: - if not all(hasattr(token, attr) for attr in ['text', 'start_ts', 'end_ts']): - continue - if not token.text or not token.text.strip(): - continue - - # Apply offset to timestamps - start_time = float(token.start_ts) + current_offset - end_time = float(token.end_ts) + current_offset - - word_timestamps.append({ - 'word': str(token.text).strip(), - 'start_time': start_time, - 'end_time': end_time - }) - logger.debug(f"Added timestamp for word '{token.text}': {start_time:.3f}s - {end_time:.3f}s") - - # Update offset for next chunk based on pred_dur - chunk_duration = float(result.pred_dur.sum()) / 80 # Convert frames to seconds - current_offset = max(current_offset + chunk_duration, end_time) - logger.debug(f"Updated time offset to {current_offset:.3f}s") - + logger.debug("Successfully joined timestamps for chunk") except Exception as e: - logger.error(f"Failed to process timestamps for chunk: {e}") - logger.debug(f"Processing timestamps with pred_dur shape: {result.pred_dur.shape}") - try: - # Join timestamps for this chunk's tokens - KPipeline.join_timestamps(result.tokens, result.pred_dur) - logger.debug("Successfully joined timestamps for chunk") - except Exception as e: - logger.error(f"Failed to join timestamps for chunk: {e}") - continue - - # Convert tokens to 
timestamps - for token in result.tokens: - try: - # Skip tokens without required attributes - if not all(hasattr(token, attr) for attr in ['text', 'start_ts', 'end_ts']): - logger.debug(f"Skipping token missing attributes: {dir(token)}") + logger.error(f"Failed to join timestamps for chunk: {e}") continue - - # Get and validate text - text = str(token.text).strip() if token.text is not None else '' - if not text: - logger.debug("Skipping empty token") - continue - - # Get and validate timestamps - start_ts = getattr(token, 'start_ts', None) - end_ts = getattr(token, 'end_ts', None) - if start_ts is None or end_ts is None: - logger.debug(f"Skipping token with None timestamps: {text}") - continue - - # Convert timestamps to float + + # Convert tokens to timestamps + for token in result.tokens: try: - start_time = float(start_ts) - end_time = float(end_ts) - except (TypeError, ValueError): - logger.debug(f"Skipping token with invalid timestamps: {text}") + # Skip tokens without required attributes + if not all(hasattr(token, attr) for attr in ['text', 'start_ts', 'end_ts']): + logger.debug(f"Skipping token missing attributes: {dir(token)}") + continue + + # Get and validate text + text = str(token.text).strip() if token.text is not None else '' + if not text: + logger.debug("Skipping empty token") + continue + + # Get and validate timestamps + start_ts = getattr(token, 'start_ts', None) + end_ts = getattr(token, 'end_ts', None) + if start_ts is None or end_ts is None: + logger.debug(f"Skipping token with None timestamps: {text}") + continue + + # Convert timestamps to float + try: + start_time = float(start_ts) + end_time = float(end_ts) + except (TypeError, ValueError): + logger.debug(f"Skipping token with invalid timestamps: {text}") + continue + + # Add timestamp + word_timestamps.append({ + 'word': text, + 'start_time': start_time, + 'end_time': end_time + }) + logger.debug(f"Added timestamp for word '{text}': {start_time:.3f}s - {end_time:.3f}s") + except 
Exception as e: + logger.warning(f"Error processing token: {e}") continue - - # Add timestamp - word_timestamps.append({ - 'word': text, - 'start_time': start_time, - 'end_time': end_time - }) - logger.debug(f"Added timestamp for word '{text}': {start_time:.3f}s - {end_time:.3f}s") - except Exception as e: - logger.warning(f"Error processing token: {e}") - continue + + except Exception as e: + logger.error(f"Failed to process text with pipeline: {e}") + raise RuntimeError(f"Pipeline processing failed: {e}") if not chunks: raise ValueError("No audio chunks were generated successfully") @@ -512,16 +514,19 @@ class TTSService: pipeline_lang_code = lang_code if lang_code else voice[:1].lower() logger.info(f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in phoneme pipeline") - # Create a new pipeline with the lang_code - phoneme_pipeline = KPipeline(lang_code=pipeline_lang_code, model=backend._model) - for r in phoneme_pipeline.generate_from_tokens( - tokens=phonemes, # Pass raw phonemes string - voice=voice_path, - speed=speed - ): - if r.audio is not None: - result = r - break + try: + # Use backend's pipeline management + for r in backend._get_pipeline(pipeline_lang_code).generate_from_tokens( + tokens=phonemes, # Pass raw phonemes string + voice=voice_path, + speed=speed + ): + if r.audio is not None: + result = r + break + except Exception as e: + logger.error(f"Failed to generate from phonemes: {e}") + raise RuntimeError(f"Phoneme generation failed: {e}") if result is None or result.audio is None: raise ValueError("No audio generated") diff --git a/docker/cpu/Dockerfile b/docker/cpu/Dockerfile index 7106a6e..5d6f99e 100644 --- a/docker/cpu/Dockerfile +++ b/docker/cpu/Dockerfile @@ -48,7 +48,8 @@ ENV PYTHONUNBUFFERED=1 \ UV_LINK_MODE=copy \ USE_GPU=false \ PHONEMIZER_ESPEAK_PATH=/usr/bin \ - PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data + PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \ + ESPEAK_DATA_PATH=/usr/share/espeak-ng-data ENV 
DOWNLOAD_MODEL=true # Download model if enabled diff --git a/docker/gpu/Dockerfile b/docker/gpu/Dockerfile index 040cca5..ce0f646 100644 --- a/docker/gpu/Dockerfile +++ b/docker/gpu/Dockerfile @@ -50,7 +50,8 @@ ENV PYTHONUNBUFFERED=1 \ UV_LINK_MODE=copy \ USE_GPU=true \ PHONEMIZER_ESPEAK_PATH=/usr/bin \ - PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data + PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \ + ESPEAK_DATA_PATH=/usr/share/espeak-ng-data ENV DOWNLOAD_MODEL=true # Download model if enabled diff --git a/docs/architecture/espeak_setup_fix.md b/docs/architecture/espeak_setup_fix.md new file mode 100644 index 0000000..f20997d --- /dev/null +++ b/docs/architecture/espeak_setup_fix.md @@ -0,0 +1,128 @@ +# ESpeak-NG Setup Fix + +## Issue Description + +Users are reporting two distinct errors: + +1. Missing espeak-ng-data/phontab file: +``` +Error processing file '/home/runner/work/espeakng-loader/espeakng-loader/espeak-ng/_dynamic/share/espeak-ng-data/phontab': No such file or directory. +``` + +2. Invalid pipeline state: +``` +Error generating speech: The object is in an invalid state. +``` + +## Root Cause Analysis + +### 1. ESpeak-NG Data Issue + +The dependency chain has changed: +``` +Before: +kokoro-fastapi (phonemizer 3.3.0) -> kokoro -> misaki -> phonemizer + +After: +kokoro-fastapi -> kokoro -> misaki -> phonemizer-fork + espeakng-loader +``` + +The issue arises because: +1. misaki now uses espeakng-loader to manage espeak paths +2. espeakng-loader looks for data in its package directory +3. We have a direct dependency on phonemizer 3.3.0 that conflicts + +### 2. Pipeline State Issue +The "invalid state" error occurs due to device mismatch in pipeline creation. + +## Solution + +### 1. For ESpeak-NG Data + +Update dependencies and environment: + +1. Remove direct phonemizer dependency: +```diff +- "phonemizer==3.3.0", # Remove this +``` + +2. Let misaki handle phonemizer-fork and espeakng-loader + +3. 
Set environment variable in Dockerfile (the final `ESPEAK_DATA_PATH` line is the new addition): +```dockerfile +ENV PHONEMIZER_ESPEAK_PATH=/usr/bin \ + PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \ + ESPEAK_DATA_PATH=/usr/share/espeak-ng-data +``` + +This approach: +- Works with misaki's new dependencies +- Maintains our working espeak setup +- Avoids complex file copying or path manipulation + +### 2. For Pipeline State + +Use kokoro_v1's pipeline management: +```python +# Instead of creating pipelines directly: +# pipeline = KPipeline(...) + +# Use backend's pipeline management: +pipeline = backend._get_pipeline(pipeline_lang_code) +``` + +## Implementation Steps + +1. Update pyproject.toml: + - Remove direct phonemizer dependency + - Keep misaki dependency as is + +2. Update Dockerfiles: + - Add ESPEAK_DATA_PATH environment variable + - Keep existing espeak-ng setup + +3. Update tts_service.py: + - Use backend's pipeline management + - Add proper error handling + +## Testing + +1. Test espeak-ng functionality: + ```bash + # Verify environment variables + echo $ESPEAK_DATA_PATH + echo $PHONEMIZER_ESPEAK_DATA + + # Check data directory + ls /usr/share/espeak-ng-data + ``` + +2. Test pipeline state: + - Test on both CPU and GPU + - Verify no invalid state errors + - Test with different voice models + +## Success Criteria + +1. No espeak-ng-data/phontab file errors +2. No invalid state errors +3. Consistent behavior across platforms +4. Successful CI/CD pipeline runs + +## Future Considerations + +1. Potential PR to misaki: + - Add fallback mechanism if espeakng-loader fails + - Make path configuration more flexible + - Add better error messages + +2. 
Environment Variable Documentation: + - Document ESPEAK_DATA_PATH requirement + - Explain interaction with espeakng-loader + - Provide platform-specific setup instructions + +## Notes + +- This solution works with misaki's new dependencies while maintaining our setup +- Environment variable approach is simpler than file copying +- May want to contribute improvements back to misaki later \ No newline at end of file diff --git a/web/index.html b/web/index.html index 1736f94..eb99816 100644 --- a/web/index.html +++ b/web/index.html @@ -101,6 +101,7 @@ +