diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
index 9196798..51f7346 100644
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@@ -103,7 +103,7 @@ class TTSService:
         if isinstance(backend, KokoroV1):
             internal_chunk_index = 0
             async for chunk_data in self.model_manager.generate(
-                text_for_model.strip(),  # Pass cleaned text to model
+                chunk_text,
                 (voice_name, voice_path),
                 speed=speed,
                 lang_code=lang_code,
@@ -189,7 +189,7 @@ class TTSService:
         """Get voice path, handling combined voices.
 
         Args:
-            voice: Voice name or combined voice names (e.g., 'af_jadzia(0.7)+af_jessica(0.3)')
+            voice: Voice name or combined voice names (e.g., 'af_jadzia+af_jessica')
 
         Returns:
             Tuple of (voice name to use, voice path to use)
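The next hunk swaps the regex-based parser for a plain `re.split` on the `+`/`-` operators. A minimal standalone sketch of what that split produces (voice names taken from the docstring above):

```python
# re.split with a capturing group keeps the operators in the result,
# so voice tokens land at even indices and operators at odd indices.
import re

for s in ["af_jadzia", "af_jadzia+af_jessica", "af_jadzia(0.7)+af_jessica(0.3)"]:
    print(re.split(r"([-+])", s))
# ['af_jadzia']
# ['af_jadzia', '+', 'af_jessica']
# ['af_jadzia(0.7)', '+', 'af_jessica(0.3)']
```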
@@ -198,84 +198,70 @@ class TTSService:
             RuntimeError: If voice not found
         """
         try:
-            # Regex to handle names, weights, and operators: af_name(weight)[+-]af_other(weight)...
-            pattern = re.compile(r"([a-zA-Z0-9_]+)(?:\((\d+(?:\.\d+)?)\))?([+-]?)")
-            matches = pattern.findall(voice.replace(" ", ""))  # Remove spaces
+            # Split the voice on + and - and keep the operators in the list, e.g. hi+bob = ["hi", "+", "bob"]
+            split_voice = re.split(r"([-+])", voice)
 
-            if not matches:
-                raise ValueError(f"Could not parse voice string: {voice}")
+            # If it is only one voice, there is no point in loading it, doing nothing with it, then saving it
+            if len(split_voice) == 1:
+                # Since it's a single voice, the weight only matters if voice_weight_normalization is off
+                if (
+                    "(" not in voice and ")" not in voice
+                ) or settings.voice_weight_normalization == True:
+                    path = await self._voice_manager.get_voice_path(voice)
+                    if not path:
+                        raise RuntimeError(f"Voice not found: {voice}")
+                    logger.debug(f"Using single voice path: {path}")
+                    return voice, path
 
-            # If only one voice and no explicit weight or operators, handle directly
-            if len(matches) == 1 and not matches[0][1] and not matches[0][2]:
-                voice_name = matches[0][0]
-                path = await self._voice_manager.get_voice_path(voice_name)
-                if not path:
-                    raise RuntimeError(f"Voice not found: {voice_name}")
-                logger.debug(f"Using single voice path: {path}")
-                return voice_name, path
-
-            # Process combinations
-            voice_parts = []
             total_weight = 0
 
-            for name, weight_str, operator in matches:
-                weight = float(weight_str) if weight_str else 1.0
-                voice_parts.append({"name": name, "weight": weight, "op": operator})
-                # Use weight directly for total, normalization happens later if enabled
-                total_weight += weight  # Summing base weights before potential normalization
-
-            # Check base voices exist
-            available_voices = await self._voice_manager.list_voices()
-            for part in voice_parts:
-                if part["name"] not in available_voices:
-                    raise ValueError(f"Base voice '{part['name']}' not found in combined string '{voice}'. Available: {available_voices}")
+            for voice_index in range(0, len(split_voice), 2):
+                voice_object = split_voice[voice_index]
+                if "(" in voice_object and ")" in voice_object:
+                    voice_name = voice_object.split("(")[0].strip()
+                    voice_weight = float(voice_object.split("(")[1].split(")")[0])
+                else:
+                    voice_name = voice_object
+                    voice_weight = 1
 
-            # Determine normalization factor
-            norm_factor = total_weight if settings.voice_weight_normalization and total_weight > 0 else 1.0
-            if settings.voice_weight_normalization:
-                logger.debug(f"Normalizing combined voice weights by factor: {norm_factor:.2f}")
-            else:
-                logger.debug("Voice weight normalization disabled, using raw weights.")
+                total_weight += voice_weight
+                split_voice[voice_index] = (voice_name, voice_weight)
 
+            # If voice_weight_normalization is false, prevent normalizing the weights by setting total_weight to 1 so each weight is divided by 1
+            if settings.voice_weight_normalization == False:
+                total_weight = 1
 
-            # Load and combine tensors
-            first_part = voice_parts[0]
-            base_path = await self._voice_manager.get_voice_path(first_part["name"])
-            combined_tensor = await self._load_voice_from_path(base_path, first_part["weight"] / norm_factor)
+            # Load the first voice as the starting point for the others to be combined onto
+            path = await self._voice_manager.get_voice_path(split_voice[0][0])
+            combined_tensor = await self._load_voice_from_path(
+                path, split_voice[0][1] / total_weight
+            )
 
-            current_op = "+"  # Implicitly start with addition for the first voice
+            # Loop through each + or - in split_voice so they can be applied to the combined voice
+            for operation_index in range(1, len(split_voice) - 1, 2):
+                # Get the path of the voice one index ahead of the operator
+                path = await self._voice_manager.get_voice_path(
+                    split_voice[operation_index + 1][0]
+                )
+                voice_tensor = await self._load_voice_from_path(
+                    path, split_voice[operation_index + 1][1] / total_weight
+                )
 
-            for i in range(len(voice_parts) - 1):
-                current_part = voice_parts[i]
-                next_part = voice_parts[i+1]
-
-                # Determine the operation based on the *current* part's operator
-                op_symbol = current_part["op"] if current_part["op"] else "+"  # Default to '+' if no operator
-
-                path = await self._voice_manager.get_voice_path(next_part["name"])
-                voice_tensor = await self._load_voice_from_path(path, next_part["weight"] / norm_factor)
-
-                if op_symbol == "+":
+                # Either add or subtract the voice from the current combined voice
+                if split_voice[operation_index] == "+":
                     combined_tensor += voice_tensor
-                    logger.debug(f"Adding voice {next_part['name']} (weight {next_part['weight']/norm_factor:.2f})")
-                elif op_symbol == "-":
+                else:
                     combined_tensor -= voice_tensor
-                    logger.debug(f"Subtracting voice {next_part['name']} (weight {next_part['weight']/norm_factor:.2f})")
-
-            # Save the new combined voice so it can be loaded later
-            # Use a safe filename based on the original input string
-            safe_filename = re.sub(r'[^\w+-]', '_', voice) + ".pt"
+            # Save the new combined voice so it can be loaded later
             temp_dir = tempfile.gettempdir()
-            combined_path = os.path.join(temp_dir, safe_filename)
-            logger.debug(f"Saving combined voice '{voice}' to temporary path: {combined_path}")
-            # Save the tensor to the device specified by settings for model loading consistency
-            target_device = settings.get_device()
-            torch.save(combined_tensor.to(target_device), combined_path)
-            return voice, combined_path  # Return original name and temp path
-
+            combined_path = os.path.join(temp_dir, f"{voice}.pt")
+            logger.debug(f"Saving combined voice to: {combined_path}")
+            torch.save(combined_tensor, combined_path)
+            return voice, combined_path
         except Exception as e:
-            logger.error(f"Failed to get or combine voice path: '{voice}': {e}")
+            logger.error(f"Failed to get voice path: {e}")
             raise
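A hedged sketch of the weight math the hunk above implements (the helper name is hypothetical): weights default to 1, are summed into `total_weight`, and each voice tensor is scaled by `weight / total_weight` unless normalization is disabled, in which case the divisor is pinned to 1. This also explains the single-voice fast path: with normalization on, `af_jadzia(0.7)` would be scaled by 0.7/0.7 = 1, so loading and re-saving the tensor changes nothing.

```python
# Hypothetical helper mirroring the parsing/normalization loop above.
def parse_weights(split_voice: list, normalize: bool = True) -> list:
    parsed, total_weight = [], 0.0
    for token in split_voice[::2]:  # voice tokens sit at even indices
        if "(" in token and ")" in token:
            name = token.split("(")[0].strip()
            weight = float(token.split("(")[1].split(")")[0])
        else:
            name, weight = token, 1.0
        total_weight += weight
        parsed.append((name, weight))
    if not normalize:
        total_weight = 1.0  # dividing by 1 leaves the raw weights untouched
    return [(name, weight / total_weight) for name, weight in parsed]

print(parse_weights(["af_jadzia(0.7)", "+", "af_jessica(0.3)"]))
# [('af_jadzia', 0.7), ('af_jessica', 0.3)]  -- weights already sum to 1
print(parse_weights(["af_jadzia", "+", "af_jessica"]))
# [('af_jadzia', 0.5), ('af_jessica', 0.5)]  -- defaults of 1 normalized
```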
logger.debug(f"Saving combined voice to: {combined_path}") + torch.save(combined_tensor, combined_path) + return voice, combined_path except Exception as e: - logger.error(f"Failed to get or combine voice path for '{voice}': {e}") + logger.error(f"Failed to get voice path: {e}") raise @@ -462,7 +448,7 @@ class TTSService: self, text: str, voice: str, - writer: StreamingAudioWriter, # Writer needed even for non-streaming internally + writer: StreamingAudioWriter, speed: float = 1.0, return_timestamps: bool = False, normalization_options: Optional[NormalizationOptions] = NormalizationOptions(), @@ -470,54 +456,26 @@ class TTSService: ) -> AudioChunk: """Generate complete audio for text using streaming internally.""" audio_data_chunks = [] - output_format = None # Signal raw audio mode for internal streaming - combined_chunk = None - try: - # Pass a dummy writer if none provided, as generate_audio_stream requires one - # Although in raw mode (output_format=None), it shouldn't be heavily used for formatting - internal_writer = writer if writer else StreamingAudioWriter(format='wav', sample_rate=settings.sample_rate) + try: async for audio_stream_data in self.generate_audio_stream( text, voice, - internal_writer, # Pass the writer instance + writer, speed=speed, normalization_options=normalization_options, - return_timestamps=return_timestamps, # Pass this down + return_timestamps=return_timestamps, lang_code=lang_code, - output_format=output_format, # Explicitly None for raw audio + output_format=None, ): - # Ensure we only append chunks with actual audio data - # Raw silence chunks generated for pauses will have audio data (zeros) - if audio_stream_data.audio is not None and len(audio_stream_data.audio) > 0: - # Ensure timestamps are preserved if requested - if return_timestamps and not audio_stream_data.word_timestamps: - audio_stream_data.word_timestamps = [] # Initialize if needed + if len(audio_stream_data.audio) > 0: audio_data_chunks.append(audio_stream_data) - if not audio_data_chunks: - logger.warning("No valid audio chunks generated.") - combined_chunk = AudioChunk(audio=np.array([], dtype=np.int16), word_timestamps=[]) - - else: - combined_chunk = AudioChunk.combine(audio_data_chunks) - # Ensure the combined audio is int16 before returning, as downstream expects this raw format. - if combined_chunk.audio.dtype != np.int16: - logger.warning(f"Combined audio dtype is {combined_chunk.audio.dtype}, converting to int16.") - # Assuming normalization happened, scale from float [-1, 1] to int16 - if np.issubdtype(combined_chunk.audio.dtype, np.floating): - combined_chunk.audio = np.clip(combined_chunk.audio * 32767, -32768, 32767).astype(np.int16) - else: - # If it's another type, attempt direct conversion (might be lossy) - combined_chunk.audio = combined_chunk.audio.astype(np.int16) - - - return combined_chunk + combined_audio_data = AudioChunk.combine(audio_data_chunks) + return combined_audio_data except Exception as e: - logger.error(f"Error in combined audio generation: {str(e)}") - raise # Re-raise after logging - # Removed finally block that closed the writer prematurely - # The caller is now responsible for closing the writer after final conversion. 
+ logger.error(f"Error in audio generation: {str(e)}") + raise async def combine_voices(self, voices: List[str]) -> torch.Tensor: @@ -555,49 +513,38 @@ class TTSService: try: # Get backend and voice path backend = self.model_manager.get_backend() - # Use _get_voices_path to handle potential combined voice names passed here too voice_name, voice_path = await self._get_voices_path(voice) if isinstance(backend, KokoroV1): # For Kokoro V1, use generate_from_tokens with raw phonemes - result_audio = None - # Determine language code - first_base_voice_match = re.match(r"([a-zA-Z0-9_]+)", voice_name) - first_base_voice = first_base_voice_match.group(1) if first_base_voice_match else "a" - pipeline_lang_code = lang_code if lang_code else (settings.default_voice_code if settings.default_voice_code else first_base_voice[:1].lower()) - + result = None + # Use provided lang_code or determine from voice name + pipeline_lang_code = lang_code if lang_code else voice[:1].lower() logger.info( - f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in phoneme generation" + f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in phoneme pipeline" ) - # Use backend's pipeline management and iterate through potential chunks - full_audio_list = [] - async for r in backend.generate_from_tokens( # generate_from_tokens is now async - tokens=phonemes, # Pass raw phonemes string - voice=(voice_name, voice_path), # Pass tuple - speed=speed, - lang_code=pipeline_lang_code, - ): - if r is not None and len(r) > 0: - # r is directly the numpy array chunk - full_audio_list.append(r) + try: + # Use backend's pipeline management + for r in backend._get_pipeline( + pipeline_lang_code + ).generate_from_tokens( + tokens=phonemes, # Pass raw phonemes string + voice=voice_path, + speed=speed, + ): + if r.audio is not None: + result = r + break + except Exception as e: + logger.error(f"Failed to generate from phonemes: {e}") + raise RuntimeError(f"Phoneme generation failed: {e}") - - if not full_audio_list: - raise ValueError("No audio generated from phonemes") - - # Combine chunks if necessary - result_audio = np.concatenate(full_audio_list) if len(full_audio_list) > 1 else full_audio_list[0] + if result is None or result.audio is None: + raise ValueError("No audio generated") processing_time = time.time() - start_time - # Normalize the final audio before returning - normalizer = AudioNormalizer() - normalized_audio = normalizer.normalize(result_audio) - # Return as int16 for consistency - if normalized_audio.dtype != np.int16: - normalized_audio = np.clip(normalized_audio * 32767, -32768, 32767).astype(np.int16) - - return normalized_audio, processing_time + return result.audio.numpy(), processing_time else: raise ValueError( "Phoneme generation only supported with Kokoro V1 backend"