Mirror of https://github.com/remsky/Kokoro-FastAPI.git (synced 2025-08-05 16:48:53 +00:00)

Commit af0e6dad6e (parent 425c7d0eac): espeak-loader broken link fix, invalid pipeline state

5 changed files with 249 additions and 113 deletions
@@ -307,116 +307,118 @@ class TTSService:
pipeline_lang_code = lang_code if lang_code else voice[:1].lower()
logger.info(f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in text chunking")

# Initialize quiet pipeline for text chunking
quiet_pipeline = KPipeline(lang_code=pipeline_lang_code, model=False)
# Get pipelines from backend for proper device management
try:
# Initialize quiet pipeline for text chunking
text_chunks = []
current_offset = 0.0  # Track time offset for timestamps

# Split text into chunks and get initial tokens
text_chunks = []
current_offset = 0.0  # Track time offset for timestamps
logger.debug("Splitting text into chunks...")
# Use backend's pipeline management
for result in backend._get_pipeline(pipeline_lang_code)(text):
if result.graphemes and result.phonemes:
text_chunks.append((result.graphemes, result.phonemes))
logger.debug(f"Split text into {len(text_chunks)} chunks")

logger.debug("Splitting text into chunks...")
for result in quiet_pipeline(text):
if result.graphemes and result.phonemes:
text_chunks.append((result.graphemes, result.phonemes))
logger.debug(f"Split text into {len(text_chunks)} chunks")
# Process each chunk
for chunk_idx, (chunk_text, chunk_phonemes) in enumerate(text_chunks):
logger.debug(f"Processing chunk {chunk_idx + 1}/{len(text_chunks)}: '{chunk_text[:50]}...'")

# Process each chunk
for chunk_idx, (chunk_text, chunk_phonemes) in enumerate(text_chunks):
logger.debug(f"Processing chunk {chunk_idx + 1}/{len(text_chunks)}: '{chunk_text[:50]}...'")
# Use backend's pipeline for generation
for result in backend._get_pipeline(pipeline_lang_code)(
chunk_text,
voice=voice_path,
speed=speed
):
# Collect audio chunks
if result.audio is not None:
chunks.append(result.audio.numpy())

# Create a new pipeline with the lang_code
generation_pipeline = KPipeline(lang_code=pipeline_lang_code, model=backend._model)
logger.info(f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in generation pipeline")
for result in generation_pipeline(
chunk_text,
voice=voice_path,
speed=speed
):
# Collect audio chunks
if result.audio is not None:
chunks.append(result.audio.numpy())
# Process timestamps for this chunk
if return_timestamps and hasattr(result, 'tokens') and result.tokens:
logger.debug(f"Processing chunk timestamps with {len(result.tokens)} tokens")
if result.pred_dur is not None:
try:
# Join timestamps for this chunk's tokens
KPipeline.join_timestamps(result.tokens, result.pred_dur)

# Process timestamps for this chunk
if return_timestamps and hasattr(result, 'tokens') and result.tokens:
logger.debug(f"Processing chunk timestamps with {len(result.tokens)} tokens")
if result.pred_dur is not None:
# Add timestamps with offset
for token in result.tokens:
if not all(hasattr(token, attr) for attr in ['text', 'start_ts', 'end_ts']):
continue
if not token.text or not token.text.strip():
continue

# Apply offset to timestamps
start_time = float(token.start_ts) + current_offset
end_time = float(token.end_ts) + current_offset

word_timestamps.append({
'word': str(token.text).strip(),
'start_time': start_time,
'end_time': end_time
})
logger.debug(f"Added timestamp for word '{token.text}': {start_time:.3f}s - {end_time:.3f}s")

# Update offset for next chunk based on pred_dur
chunk_duration = float(result.pred_dur.sum()) / 80  # Convert frames to seconds
current_offset = max(current_offset + chunk_duration, end_time)
logger.debug(f"Updated time offset to {current_offset:.3f}s")

except Exception as e:
logger.error(f"Failed to process timestamps for chunk: {e}")
logger.debug(f"Processing timestamps with pred_dur shape: {result.pred_dur.shape}")
try:
# Join timestamps for this chunk's tokens
KPipeline.join_timestamps(result.tokens, result.pred_dur)

# Add timestamps with offset
for token in result.tokens:
if not all(hasattr(token, attr) for attr in ['text', 'start_ts', 'end_ts']):
continue
if not token.text or not token.text.strip():
continue

# Apply offset to timestamps
start_time = float(token.start_ts) + current_offset
end_time = float(token.end_ts) + current_offset

word_timestamps.append({
'word': str(token.text).strip(),
'start_time': start_time,
'end_time': end_time
})
logger.debug(f"Added timestamp for word '{token.text}': {start_time:.3f}s - {end_time:.3f}s")

# Update offset for next chunk based on pred_dur
chunk_duration = float(result.pred_dur.sum()) / 80  # Convert frames to seconds
current_offset = max(current_offset + chunk_duration, end_time)
logger.debug(f"Updated time offset to {current_offset:.3f}s")

logger.debug("Successfully joined timestamps for chunk")
except Exception as e:
logger.error(f"Failed to process timestamps for chunk: {e}")
logger.debug(f"Processing timestamps with pred_dur shape: {result.pred_dur.shape}")
try:
# Join timestamps for this chunk's tokens
KPipeline.join_timestamps(result.tokens, result.pred_dur)
logger.debug("Successfully joined timestamps for chunk")
except Exception as e:
logger.error(f"Failed to join timestamps for chunk: {e}")
continue

# Convert tokens to timestamps
for token in result.tokens:
try:
# Skip tokens without required attributes
if not all(hasattr(token, attr) for attr in ['text', 'start_ts', 'end_ts']):
logger.debug(f"Skipping token missing attributes: {dir(token)}")
logger.error(f"Failed to join timestamps for chunk: {e}")
continue

# Get and validate text
text = str(token.text).strip() if token.text is not None else ''
if not text:
logger.debug("Skipping empty token")
continue

# Get and validate timestamps
start_ts = getattr(token, 'start_ts', None)
end_ts = getattr(token, 'end_ts', None)
if start_ts is None or end_ts is None:
logger.debug(f"Skipping token with None timestamps: {text}")
continue

# Convert timestamps to float
# Convert tokens to timestamps
for token in result.tokens:
try:
start_time = float(start_ts)
end_time = float(end_ts)
except (TypeError, ValueError):
logger.debug(f"Skipping token with invalid timestamps: {text}")
# Skip tokens without required attributes
if not all(hasattr(token, attr) for attr in ['text', 'start_ts', 'end_ts']):
logger.debug(f"Skipping token missing attributes: {dir(token)}")
continue

# Get and validate text
text = str(token.text).strip() if token.text is not None else ''
if not text:
logger.debug("Skipping empty token")
continue

# Get and validate timestamps
start_ts = getattr(token, 'start_ts', None)
end_ts = getattr(token, 'end_ts', None)
if start_ts is None or end_ts is None:
logger.debug(f"Skipping token with None timestamps: {text}")
continue

# Convert timestamps to float
try:
start_time = float(start_ts)
end_time = float(end_ts)
except (TypeError, ValueError):
logger.debug(f"Skipping token with invalid timestamps: {text}")
continue

# Add timestamp
word_timestamps.append({
'word': text,
'start_time': start_time,
'end_time': end_time
})
logger.debug(f"Added timestamp for word '{text}': {start_time:.3f}s - {end_time:.3f}s")
except Exception as e:
logger.warning(f"Error processing token: {e}")
continue

# Add timestamp
word_timestamps.append({
'word': text,
'start_time': start_time,
'end_time': end_time
})
logger.debug(f"Added timestamp for word '{text}': {start_time:.3f}s - {end_time:.3f}s")
except Exception as e:
logger.warning(f"Error processing token: {e}")
continue
except Exception as e:
logger.error(f"Failed to process text with pipeline: {e}")
raise RuntimeError(f"Pipeline processing failed: {e}")

if not chunks:
raise ValueError("No audio chunks were generated successfully")
@@ -512,16 +514,19 @@ class TTSService:
pipeline_lang_code = lang_code if lang_code else voice[:1].lower()
logger.info(f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in phoneme pipeline")

# Create a new pipeline with the lang_code
phoneme_pipeline = KPipeline(lang_code=pipeline_lang_code, model=backend._model)
for r in phoneme_pipeline.generate_from_tokens(
tokens=phonemes,  # Pass raw phonemes string
voice=voice_path,
speed=speed
):
if r.audio is not None:
result = r
break
try:
# Use backend's pipeline management
for r in backend._get_pipeline(pipeline_lang_code).generate_from_tokens(
tokens=phonemes,  # Pass raw phonemes string
voice=voice_path,
speed=speed
):
if r.audio is not None:
result = r
break
except Exception as e:
logger.error(f"Failed to generate from phonemes: {e}")
raise RuntimeError(f"Phoneme generation failed: {e}")

if result is None or result.audio is None:
raise ValueError("No audio generated")
@@ -48,7 +48,8 @@ ENV PYTHONUNBUFFERED=1 \
    UV_LINK_MODE=copy \
    USE_GPU=false \
    PHONEMIZER_ESPEAK_PATH=/usr/bin \
-   PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data
+   PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
+   ESPEAK_DATA_PATH=/usr/share/espeak-ng-data

ENV DOWNLOAD_MODEL=true
# Download model if enabled
@@ -50,7 +50,8 @@ ENV PYTHONUNBUFFERED=1 \
    UV_LINK_MODE=copy \
    USE_GPU=true \
    PHONEMIZER_ESPEAK_PATH=/usr/bin \
-   PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data
+   PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
+   ESPEAK_DATA_PATH=/usr/share/espeak-ng-data

ENV DOWNLOAD_MODEL=true
# Download model if enabled
docs/architecture/espeak_setup_fix.md (new file, 128 lines)
@@ -0,0 +1,128 @@
# ESpeak-NG Setup Fix

## Issue Description

Users are reporting two distinct errors:

1. Missing espeak-ng-data/phontab file:
```
Error processing file '/home/runner/work/espeakng-loader/espeakng-loader/espeak-ng/_dynamic/share/espeak-ng-data/phontab': No such file or directory.
```

2. Invalid pipeline state:
```
Error generating speech: The object is in an invalid state.
```

## Root Cause Analysis

### 1. ESpeak-NG Data Issue

The dependency chain has changed:
```
Before:
kokoro-fastapi (phonemizer 3.3.0) -> kokoro -> misaki -> phonemizer

After:
kokoro-fastapi -> kokoro -> misaki -> phonemizer-fork + espeakng-loader
```

The issue arises because:
1. misaki now uses espeakng-loader to manage espeak paths
2. espeakng-loader looks for data in its own package directory, which may never have been populated
3. Our direct dependency on phonemizer 3.3.0 conflicts with the phonemizer-fork that misaki pulls in

A sketch of the resulting path resolution is shown below.

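The lookup behaviour described above can be illustrated with a short diagnostic. This is a hedged sketch, not misaki's or espeakng-loader's actual code; the `espeakng_loader.get_data_path()` call is an assumption about the loader's API:

```python
import os
from pathlib import Path

def resolve_espeak_data() -> Path | None:
    """Illustrative lookup order only: an explicit env var should win, otherwise
    espeakng-loader points at its own package directory, which is the path that
    shows up in the phontab error above."""
    for var in ("ESPEAK_DATA_PATH", "PHONEMIZER_ESPEAK_DATA"):
        value = os.environ.get(var)
        if value and (Path(value) / "phontab").is_file():
            return Path(value)
    try:
        import espeakng_loader  # assumed API; the real loader may differ
        return Path(espeakng_loader.get_data_path())
    except Exception:
        return None

if __name__ == "__main__":
    data_dir = resolve_espeak_data()
    print(f"espeak-ng data resolved to: {data_dir}")
    if data_dir is None or not (data_dir / "phontab").is_file():
        print("phontab not found -- this is the 'No such file or directory' failure")
```
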
### 2. Pipeline State Issue

The "invalid state" error occurs due to device mismatch in pipeline creation.

## Solution

### 1. For ESpeak-NG Data

Update dependencies and environment:

1. Remove the direct phonemizer dependency:
```diff
- "phonemizer==3.3.0", # Remove this
```

2. Let misaki handle phonemizer-fork and espeakng-loader

3. Set the environment variable in the Dockerfiles:
```dockerfile
ENV PHONEMIZER_ESPEAK_PATH=/usr/bin \
    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
    ESPEAK_DATA_PATH=/usr/share/espeak-ng-data  # Add this
```

This approach:
- Works with misaki's new dependencies
- Maintains our working espeak setup
- Avoids complex file copying or path manipulation

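For local (non-Docker) runs, the same variables can be set before anything imports kokoro or misaki. A minimal sketch, assuming the same Debian/Ubuntu espeak-ng locations used in the Dockerfiles:

```python
import os

# Assumed system paths (the Debian/Ubuntu espeak-ng package locations used in the
# Dockerfiles); adjust for other platforms.
os.environ.setdefault("PHONEMIZER_ESPEAK_PATH", "/usr/bin")
os.environ.setdefault("PHONEMIZER_ESPEAK_DATA", "/usr/share/espeak-ng-data")
os.environ.setdefault("ESPEAK_DATA_PATH", "/usr/share/espeak-ng-data")

# Only import modules that touch espeak (kokoro / misaki) after the variables are set,
# so espeakng-loader and phonemizer-fork pick them up.
```
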
### 2. For Pipeline State

Use kokoro_v1's pipeline management instead of constructing pipelines in the service layer:
```python
# Instead of creating pipelines directly:
# pipeline = KPipeline(...)

# Use the backend's pipeline management:
pipeline = backend._get_pipeline(pipeline_lang_code)
```

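Why this fixes the invalid-state error: the backend owns one pipeline per lang_code, built against the model it already loaded, so nothing is constructed on the wrong device. A minimal sketch of that idea, assuming a hypothetical backend shape (the real kokoro_v1 backend may differ):

```python
from kokoro import KPipeline

class KokoroV1Backend:  # hypothetical shape, for illustration only
    def __init__(self, model):
        self._model = model  # model already loaded on the correct device
        self._pipelines: dict[str, KPipeline] = {}

    def _get_pipeline(self, lang_code: str) -> KPipeline:
        # Reuse one pipeline per language; creating fresh KPipelines per request
        # is what led to the device mismatch / "invalid state" errors.
        if lang_code not in self._pipelines:
            self._pipelines[lang_code] = KPipeline(lang_code=lang_code, model=self._model)
        return self._pipelines[lang_code]
```
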
## Implementation Steps

1. Update pyproject.toml:
   - Remove the direct phonemizer dependency
   - Keep the misaki dependency as is

2. Update the Dockerfiles:
   - Add the ESPEAK_DATA_PATH environment variable
   - Keep the existing espeak-ng setup

3. Update tts_service.py:
   - Use the backend's pipeline management
   - Add proper error handling (see the sketch below)

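A condensed sketch of the tts_service.py change (the full diff is above; names mirror the diff): every pipeline call goes through `backend._get_pipeline()` and is wrapped so failures surface as a `RuntimeError` instead of leaving the request half-processed.

```python
def generate_chunk_audio(backend, chunk_text: str, voice_path: str, speed: float,
                         pipeline_lang_code: str) -> list:
    """Condensed sketch of the new flow in tts_service.py (names mirror the diff above)."""
    chunks = []
    try:
        # One pipeline per lang_code, owned by the backend.
        pipeline = backend._get_pipeline(pipeline_lang_code)
        for result in pipeline(chunk_text, voice=voice_path, speed=speed):
            if result.audio is not None:
                chunks.append(result.audio.numpy())
    except Exception as e:
        raise RuntimeError(f"Pipeline processing failed: {e}") from e
    return chunks
```
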
## Testing

1. Test espeak-ng functionality:
```bash
# Verify environment variables
echo $ESPEAK_DATA_PATH
echo $PHONEMIZER_ESPEAK_DATA

# Check data directory
ls /usr/share/espeak-ng-data
```

2. Test pipeline state (an end-to-end smoke test is sketched below):
   - Test on both CPU and GPU
   - Verify no invalid state errors
   - Test with different voice models

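A quick end-to-end smoke test can back up the manual checks. This is a sketch that assumes the service is running locally with its OpenAI-compatible `/v1/audio/speech` route on port 8880, and uses an example voice name; adjust the URL, payload, and voice to your deployment. It only checks that audio bytes come back without a pipeline error:

```python
import requests

def smoke_test(base_url: str = "http://localhost:8880") -> None:
    # Assumed endpoint, port, and example payload; adjust for your deployment.
    resp = requests.post(
        f"{base_url}/v1/audio/speech",
        json={"model": "kokoro", "input": "Hello world.", "voice": "af_heart"},
        timeout=60,
    )
    resp.raise_for_status()
    assert len(resp.content) > 0, "no audio bytes returned"
    print(f"OK: received {len(resp.content)} audio bytes")

if __name__ == "__main__":
    smoke_test()
```
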
## Success Criteria

1. No espeak-ng-data/phontab file errors
2. No invalid state errors
3. Consistent behavior across platforms
4. Successful CI/CD pipeline runs

## Future Considerations

1. Potential PR to misaki:
   - Add fallback mechanism if espeakng-loader fails
   - Make path configuration more flexible
   - Add better error messages

2. Environment Variable Documentation:
   - Document ESPEAK_DATA_PATH requirement
   - Explain interaction with espeakng-loader
   - Provide platform-specific setup instructions

## Notes

- This solution works with misaki's new dependencies while maintaining our setup
- The environment variable approach is simpler than file copying
- We may want to contribute improvements back to misaki later

@@ -101,6 +101,7 @@
<option value="e">Spanish</option>
<option value="a">English</option>
<option value="f">French</option>
<option value="h">Hindi</option>
<option value="i">Italian</option>
<option value="p">Portuguese</option>
<option value="j">Japanese</option>