Mirror of https://github.com/remsky/Kokoro-FastAPI.git, synced 2025-08-05 16:48:53 +00:00

Commit 7772dbc2e4 ("fixed no stream file writing"), parent dbf2b99026
4 changed files with 28 additions and 28 deletions
@@ -21,7 +21,7 @@ class AudioChunk:
         output=AudioChunk(audio_chunk_list[0].audio,audio_chunk_list[0].word_timestamps)

         for audio_chunk in audio_chunk_list[1:]:
-            output.audio=np.concatenate((output.audio,audio_chunk.audio))
+            output.audio=np.concatenate((output.audio,audio_chunk.audio),dtype=np.int16)
             if output.word_timestamps is not None:
                 output.word_timestamps+=output.word_timestamps
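
For orientation, a minimal sketch of the combine path this hunk touches, with the AudioChunk shape assumed from the context lines (the real class lives in ..inference.base). Pinning dtype=np.int16 keeps NumPy from silently widening the running buffer if chunk dtypes ever disagree. As an aside, the unchanged output.word_timestamps+=output.word_timestamps line appends output's own timestamps to themselves; audio_chunk.word_timestamps is presumably what was intended, and the sketch writes it that way.

    # Sketch only: AudioChunk's real definition is in ..inference.base.
    from dataclasses import dataclass
    from typing import Optional

    import numpy as np

    @dataclass
    class AudioChunk:
        audio: np.ndarray
        word_timestamps: Optional[list] = None

        @staticmethod
        def combine(audio_chunk_list: list) -> "AudioChunk":
            output = AudioChunk(audio_chunk_list[0].audio, audio_chunk_list[0].word_timestamps)
            for audio_chunk in audio_chunk_list[1:]:
                # dtype=np.int16 pins the result; mixed int dtypes would
                # otherwise promote the whole buffer on concatenation
                output.audio = np.concatenate((output.audio, audio_chunk.audio), dtype=np.int16)
                if output.word_timestamps is not None:
                    # the committed line doubles output's own list; appending
                    # the incoming chunk's timestamps is presumably the intent
                    output.word_timestamps += audio_chunk.word_timestamps or []
            return output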
@@ -7,6 +7,7 @@ import re
 import tempfile
 from typing import AsyncGenerator, Dict, List, Union, Tuple
 from urllib import response
+import numpy as np

 import aiofiles
 from ..inference.base import AudioChunk
@@ -259,24 +260,31 @@ async def create_speech(
             )
         else:
             # Generate complete audio using public interface
-            audio, audio_data = await tts_service.generate_audio(
+            _, audio_data = await tts_service.generate_audio(
                 text=request.input,
                 voice=voice_name,
                 speed=request.speed,
                 lang_code=request.lang_code,
             )

-            # Convert to requested format with proper finalization
             content, audio_data = await AudioService.convert_audio(
                 audio_data,
                 24000,
                 request.response_format,
                 is_first_chunk=True,
+                is_last_chunk=False,
+            )
+            # Convert to requested format with proper finalization
+            final, _ = await AudioService.convert_audio(
+                AudioChunk(np.array([], dtype=np.int16)),
+                24000,
+                request.response_format,
+                is_first_chunk=False,
                 is_last_chunk=True,
             )
-            print(content,request.response_format)
+            output=content+final
             return Response(
-                content=content,
+                content=output,
                 media_type=content_type,
                 headers={
                     "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
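
This is the heart of the fix. Format writers hold encoder state, and (as the unchanged return in a later hunk shows) convert_audio returns only the finalize bytes on a last chunk, so the old single call with is_last_chunk=True handed back just the trailer and the written file lost its payload. The route now converts the whole buffer with is_last_chunk=False, then sends an empty int16 chunk with is_last_chunk=True to flush the trailer, and concatenates the two byte strings. A runnable toy with a stubbed converter, purely to show the byte flow:

    import asyncio

    import numpy as np

    async def convert_audio(samples: np.ndarray, finalize: bool) -> bytes:
        # Stub for AudioService.convert_audio: payload bytes on data passes,
        # container trailer bytes on the finalize pass (real writers buffer).
        return b"TRAILER" if finalize else samples.tobytes()

    async def main() -> None:
        pcm = np.zeros(4, dtype=np.int16)
        content = await convert_audio(pcm, finalize=False)                         # payload
        final = await convert_audio(np.array([], dtype=np.int16), finalize=True)   # flush
        output = content + final   # complete file: payload plus trailer
        print(len(content), len(final), len(output))   # 8 7 15

    asyncio.run(main())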
@@ -72,7 +72,7 @@ class AudioNormalizer:

         return max(non_silent_index_start - self.samples_to_pad_start,0), min(non_silent_index_end + math.ceil(samples_to_pad_end / speed),len(audio_data))

-    async def normalize(self, audio_data: np.ndarray) -> np.ndarray:
+    def normalize(self, audio_data: np.ndarray) -> np.ndarray:
         """Convert audio data to int16 range

         Args:
@@ -80,12 +80,10 @@ class AudioNormalizer:
         Returns:
             Normalized audio data
         """
-        if len(audio_data) == 0:
-            raise ValueError("Empty audio data")
-
-        # Scale directly to int16 range with clipping
-        return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
+        if audio_data.dtype != np.int16:
+            # Scale directly to int16 range with clipping
+            return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
+        return audio_data


 class AudioService:
     """Service for audio format conversions with streaming support"""
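
normalize is now synchronous (nothing in it ever awaited) and idempotent: float input is scaled and clipped into int16, int16 input passes through untouched. The guard is what lets both convert_audio and trim_audio call it without double-scaling, and dropping the empty-input ValueError matters too, since the route's new finalize pass sends an empty chunk through this very function. A runnable check:

    import numpy as np

    def normalize(audio_data: np.ndarray) -> np.ndarray:
        if audio_data.dtype != np.int16:
            # scale [-1.0, 1.0] float audio into int16, clipping outliers
            return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
        return audio_data

    floats = np.array([-1.0, 0.0, 0.5, 2.0], dtype=np.float32)
    once = normalize(floats)    # [-32767, 0, 16383, 32767]
    twice = normalize(once)     # dtype guard: already int16, returned untouched
    assert np.array_equal(once, twice)
    assert normalize(np.array([], dtype=np.int16)).size == 0   # empty no longer raises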
@@ -148,11 +146,9 @@ class AudioService:
         if normalizer is None:
             normalizer = AudioNormalizer()

-        print(len(audio_chunk.audio),"1")
-        audio_chunk.audio = await normalizer.normalize(audio_chunk.audio)
-        print(len(audio_chunk.audio),"2")
+        audio_chunk.audio = normalizer.normalize(audio_chunk.audio)
         audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer)
-        print(len(audio_chunk.audio),"3")

         # Get or create format-specific writer
         writer_key = f"{output_format}_{sample_rate}"
         if is_first_chunk or writer_key not in AudioService._writers:
@@ -169,7 +165,6 @@ class AudioService:
             if is_last_chunk:
                 final_data = writer.write_chunk(finalize=True)
                 del AudioService._writers[writer_key]
-                print(audio_chunk.audio)
                 return final_data if final_data else b"", audio_chunk

         return chunk_data if chunk_data else b"", audio_chunk
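
Taken together with the previous hunk, convert_audio's writer lifecycle is: create a writer per format/sample-rate key on the first chunk, stream payload through it, and on the last chunk return only the finalize bytes and retire the writer. That trailer-only return on the last call is exactly why the non-streaming route now calls convert_audio twice. A toy registry sketch (class and helper names here are illustrative, not the project's):

    _writers: dict = {}

    class ToyWriter:
        # stand-in for the project's streaming format writer
        def write_chunk(self, data: bytes = b"", finalize: bool = False) -> bytes:
            return b"TRAILER" if finalize else data

    def convert(data: bytes, key: str, is_first: bool, is_last: bool) -> bytes:
        if is_first or key not in _writers:
            _writers[key] = ToyWriter()              # fresh writer per stream
        writer = _writers[key]
        chunk_data = writer.write_chunk(data)
        if is_last:
            final_data = writer.write_chunk(finalize=True)
            del _writers[key]                        # retire the writer
            return final_data or b""                 # trailer only; payload not re-sent
        return chunk_data or b""

    assert convert(b"pcm", "wav_24000", True, False) == b"pcm"
    assert convert(b"", "wav_24000", False, True) == b"TRAILER"

One thing the commit leaves as-is: the registry is keyed only on format and sample rate, so two concurrent requests with the same key would share a writer.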
@@ -196,6 +191,7 @@ class AudioService:
         if normalizer is None:
             normalizer = AudioNormalizer()

+        audio_chunk.audio=normalizer.normalize(audio_chunk.audio)
         # Trim start and end if enough samples
         if len(audio_chunk.audio) > (2 * normalizer.samples_to_trim):
             audio_chunk.audio = audio_chunk.audio[normalizer.samples_to_trim : -normalizer.samples_to_trim]
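
trim_audio normalizes its own input now because TTSService calls it directly when output_format is None (the hunks below also drop the stray await on it). Thanks to the int16 guard, audio that reaches trim_audio via convert_audio, which has already normalized, is not scaled twice. A small check of that interplay, with samples_to_trim assumed as an illustrative constant:

    import numpy as np

    SAMPLES_TO_TRIM = 240   # illustrative; the real value comes from AudioNormalizer

    def normalize(audio: np.ndarray) -> np.ndarray:
        if audio.dtype != np.int16:
            return np.clip(audio * 32767, -32768, 32767).astype(np.int16)
        return audio

    def trim(audio: np.ndarray) -> np.ndarray:
        audio = normalize(audio)   # harmless if the caller normalized already
        if len(audio) > 2 * SAMPLES_TO_TRIM:
            audio = audio[SAMPLES_TO_TRIM:-SAMPLES_TO_TRIM]
        return audio

    raw = np.random.uniform(-1, 1, 24000).astype(np.float32)
    assert np.array_equal(trim(raw), trim(normalize(raw)))   # same either way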
@@ -62,7 +62,7 @@ class TTSService:
             if is_last:
                 # Skip format conversion for raw audio mode
                 if not output_format:
-                    yield np.array([], dtype=np.float32)
+                    yield np.array([], dtype=np.int16), AudioChunk(np.array([], dtype=np.int16))
                     return
                 result, chunk_data = await AudioService.convert_audio(
                     AudioChunk(np.array([0], dtype=np.float32)),  # Dummy data for type checking
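
Every other yield in this generator is a (payload, AudioChunk) pair, so the raw-audio early exit that yielded a bare float32 array would break any consumer doing two-value unpacking, including generate_audio further down. The fix makes the final sentinel the same shape, and int16 to match the normalized stream. In miniature:

    import numpy as np

    class AudioChunk:
        def __init__(self, audio):
            self.audio = audio

    def stream():
        yield b"payload", AudioChunk(np.ones(3, dtype=np.int16))
        # final sentinel now matches the pair shape of every other yield
        yield np.array([], dtype=np.int16), AudioChunk(np.array([], dtype=np.int16))

    for payload, chunk in stream():   # unpacking works for every item
        pass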
@@ -111,7 +111,7 @@ class TTSService:
             except Exception as e:
                 logger.error(f"Failed to convert audio: {str(e)}")
         else:
-            chunk_data = await AudioService.trim_audio(chunk_data,
+            chunk_data = AudioService.trim_audio(chunk_data,
                 chunk_text,
                 speed,
                 is_last,
@@ -152,7 +152,7 @@ class TTSService:
             except Exception as e:
                 logger.error(f"Failed to convert audio: {str(e)}")
         else:
-            trimmed = await AudioService.trim_audio(chunk_data,
+            trimmed = AudioService.trim_audio(chunk_data,
                 chunk_text,
                 speed,
                 is_last,
@@ -288,7 +288,6 @@ class TTSService:
                     current_offset+=len(chunk_data.audio) / 24000

                     if result is not None:
-                        print(chunk_data.word_timestamps)
                         yield result,chunk_data
                         chunk_index += 1
                     else:
@@ -342,17 +341,14 @@ class TTSService:
         audio_data_chunks=[]

         try:
-            async for audio_stream,audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,return_timestamps=return_timestamps,lang_code=lang_code):
+            async for audio_stream,audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,return_timestamps=return_timestamps,lang_code=lang_code,output_format=None):
                 audio_chunks.append(audio_stream_data.audio)
                 audio_data_chunks.append(audio_stream_data)

-            print(audio_data_chunks[0].audio.shape)
-
-            combined_audio=np.concatenate(audio_chunks)
-            print("1")
+            combined_audio=np.concatenate(audio_chunks,dtype=np.int16)
             combined_audio_data=AudioChunk.combine(audio_data_chunks)
-            print("2")
-            print(len(combined_audio_data.audio))
             return combined_audio,combined_audio_data
         """
         # Get backend and voice path
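
The last hunk is the consumer side: generate_audio now asks generate_audio_stream for raw chunks (output_format=None) instead of format-encoded bytes, and pins the concatenation dtype while clearing out the leftover debug prints. The pin matters because np.concatenate widens silently whenever chunk dtypes disagree, as a quick check shows:

    import numpy as np

    chunks = [np.zeros(3, dtype=np.int16), np.ones(3, dtype=np.int32)]
    print(np.concatenate(chunks).dtype)                   # int32 -- silent widening
    print(np.concatenate(chunks, dtype=np.int16).dtype)   # int16 -- pinned, as committed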