diff --git a/api/src/inference/base.py b/api/src/inference/base.py
index b9e75d9..9ae466f 100644
--- a/api/src/inference/base.py
+++ b/api/src/inference/base.py
@@ -21,7 +21,7 @@ class AudioChunk:
         output=AudioChunk(audio_chunk_list[0].audio,audio_chunk_list[0].word_timestamps)
         for audio_chunk in audio_chunk_list[1:]:
-            output.audio=np.concatenate((output.audio,audio_chunk.audio))
+            output.audio=np.concatenate((output.audio,audio_chunk.audio),dtype=np.int16)
             if output.word_timestamps is not None:
-                output.word_timestamps+=output.word_timestamps
+                output.word_timestamps+=audio_chunk.word_timestamps
diff --git a/api/src/routers/openai_compatible.py b/api/src/routers/openai_compatible.py
index 46a0ee9..0d06547 100644
--- a/api/src/routers/openai_compatible.py
+++ b/api/src/routers/openai_compatible.py
@@ -7,6 +7,7 @@ import re
 import tempfile
 from typing import AsyncGenerator, Dict, List, Union, Tuple
 from urllib import response
+import numpy as np
 
 import aiofiles
 from ..inference.base import AudioChunk
@@ -259,24 +260,31 @@ async def create_speech(
             )
         else:
             # Generate complete audio using public interface
-            audio, audio_data = await tts_service.generate_audio(
+            _, audio_data = await tts_service.generate_audio(
                 text=request.input,
                 voice=voice_name,
                 speed=request.speed,
                 lang_code=request.lang_code,
             )
-
-            # Convert to requested format with proper finalization
             content, audio_data = await AudioService.convert_audio(
                 audio_data,
                 24000,
                 request.response_format,
                 is_first_chunk=True,
+                is_last_chunk=False,
+            )
+
+            # Convert to requested format with proper finalization
+            final, _ = await AudioService.convert_audio(
+                AudioChunk(np.array([], dtype=np.int16)),
+                24000,
+                request.response_format,
+                is_first_chunk=False,
                 is_last_chunk=True,
             )
-            print(content,request.response_format)
+            output=content+final
             return Response(
-                content=content,
+                content=output,
                 media_type=content_type,
                 headers={
                     "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
diff --git a/api/src/services/audio.py b/api/src/services/audio.py
index f453a0c..c713c09 100644
--- a/api/src/services/audio.py
+++ b/api/src/services/audio.py
@@ -72,7 +72,7 @@ class AudioNormalizer:
         return max(non_silent_index_start - self.samples_to_pad_start,0), min(non_silent_index_end + math.ceil(samples_to_pad_end / speed),len(audio_data))
 
-    async def normalize(self, audio_data: np.ndarray) -> np.ndarray:
+    def normalize(self, audio_data: np.ndarray) -> np.ndarray:
         """Convert audio data to int16 range
 
         Args:
@@ -80,12 +80,10 @@ class AudioNormalizer:
             audio_data: Input audio data
 
         Returns:
             Normalized audio data
         """
-        if len(audio_data) == 0:
-            raise ValueError("Empty audio data")
-
-        # Scale directly to int16 range with clipping
-        return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
-
+        if audio_data.dtype != np.int16:
+            # Scale directly to int16 range with clipping
+            return np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
+        return audio_data
 
 class AudioService:
     """Service for audio format conversions with streaming support"""
@@ -148,11 +146,9 @@ class AudioService:
             if normalizer is None:
                 normalizer = AudioNormalizer()
-            print(len(audio_chunk.audio),"1")
-            audio_chunk.audio = await normalizer.normalize(audio_chunk.audio)
-            print(len(audio_chunk.audio),"2")
+            audio_chunk.audio = normalizer.normalize(audio_chunk.audio)
             audio_chunk = AudioService.trim_audio(audio_chunk,chunk_text,speed,is_last_chunk,normalizer)
-            print(len(audio_chunk.audio),"3")
+
             # Get or create format-specific writer
             writer_key = f"{output_format}_{sample_rate}"
             if is_first_chunk or writer_key not in AudioService._writers:
@@ -169,7 +165,6 @@ class AudioService:
             if is_last_chunk:
                 final_data = writer.write_chunk(finalize=True)
                 del AudioService._writers[writer_key]
-                print(audio_chunk.audio)
                 return final_data if final_data else b"", audio_chunk
 
             return chunk_data if chunk_data else b"", audio_chunk
@@ -196,6 +191,7 @@ class AudioService:
         if normalizer is None:
             normalizer = AudioNormalizer()
 
+        audio_chunk.audio=normalizer.normalize(audio_chunk.audio)
         # Trim start and end if enough samples
         if len(audio_chunk.audio) > (2 * normalizer.samples_to_trim):
             audio_chunk.audio = audio_chunk.audio[normalizer.samples_to_trim : -normalizer.samples_to_trim]
diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
index 1e49671..b08f64d 100644
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@@ -62,7 +62,7 @@ class TTSService:
             if is_last:
                 # Skip format conversion for raw audio mode
                 if not output_format:
-                    yield np.array([], dtype=np.float32)
+                    yield np.array([], dtype=np.int16), AudioChunk(np.array([], dtype=np.int16))
                     return
                 result, chunk_data = await AudioService.convert_audio(
                     AudioChunk(np.array([0], dtype=np.float32)),  # Dummy data for type checking
@@ -111,7 +111,7 @@ class TTSService:
                 except Exception as e:
                     logger.error(f"Failed to convert audio: {str(e)}")
             else:
-                chunk_data = await AudioService.trim_audio(chunk_data,
+                chunk_data = AudioService.trim_audio(chunk_data,
                     chunk_text,
                     speed,
                     is_last,
@@ -152,7 +152,7 @@ class TTSService:
                 except Exception as e:
                     logger.error(f"Failed to convert audio: {str(e)}")
             else:
-                trimmed = await AudioService.trim_audio(chunk_data,
+                trimmed = AudioService.trim_audio(chunk_data,
                     chunk_text,
                     speed,
                     is_last,
@@ -288,7 +288,6 @@ class TTSService:
                     current_offset+=len(chunk_data.audio) / 24000
 
                     if result is not None:
-                        print(chunk_data.word_timestamps)
                         yield result,chunk_data
                         chunk_index += 1
                     else:
@@ -342,17 +341,14 @@ class TTSService:
         audio_data_chunks=[]
 
         try:
-            async for audio_stream,audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,return_timestamps=return_timestamps,lang_code=lang_code):
+            async for audio_stream,audio_stream_data in self.generate_audio_stream(text,voice,speed=speed,return_timestamps=return_timestamps,lang_code=lang_code,output_format=None):
                 audio_chunks.append(audio_stream_data.audio)
                 audio_data_chunks.append(audio_stream_data)
-            print(audio_data_chunks[0].audio.shape)
+
-            combined_audio=np.concatenate(audio_chunks)
-            print("1")
+            combined_audio=np.concatenate(audio_chunks,dtype=np.int16)
             combined_audio_data=AudioChunk.combine(audio_data_chunks)
-            print("2")
-            print(len(combined_audio_data.audio))
             return combined_audio,combined_audio_data
         """
         # Get backend and voice path
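Taken together, these changes make AudioNormalizer.normalize a synchronous pass-through for data that is already int16 (rather than raising on empty input), which is what lets the non-streaming path in openai_compatible.py finish the file with an empty finalization chunk: one convert_audio call encodes the samples and leaves the format writer open, and a second call with is_last_chunk=True flushes the writer's trailing container data. A minimal sketch of that two-pass pattern, using only the call shapes visible in this diff (the helper name encode_full_file is hypothetical):

import numpy as np

from ..inference.base import AudioChunk
from ..services.audio import AudioService


async def encode_full_file(audio_data: AudioChunk, response_format: str) -> bytes:
    # Pass 1: encode the complete utterance, opening the writer with
    # is_first_chunk=True but leaving it unfinalized (is_last_chunk=False).
    content, _ = await AudioService.convert_audio(
        audio_data,
        24000,
        response_format,
        is_first_chunk=True,
        is_last_chunk=False,
    )
    # Pass 2: an empty int16 chunk with is_last_chunk=True makes the writer
    # emit its trailing container bytes and drops it from the writer cache.
    final, _ = await AudioService.convert_audio(
        AudioChunk(np.array([], dtype=np.int16)),
        24000,
        response_format,
        is_first_chunk=False,
        is_last_chunk=True,
    )
    # The finished file is the encoded body plus the finalization bytes.
    return content + final

Splitting encode and finalize this way keeps the non-streaming endpoint on the same writer path the streaming endpoint already uses, instead of maintaining a separate one-shot conversion.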