From 9c1ced237b1bdf1be6e230e777f75098e5a1a94e Mon Sep 17 00:00:00 2001
From: Fireblade
Date: Fri, 14 Feb 2025 14:36:17 -0500
Subject: [PATCH] Cleaned up some code and fixed an error in the readme

---
 README.md                       |   2 +-
 api/src/routers/development.py  | 157 +-------------------
 api/src/services/tts_service.py | 244 --------------------------------
 3 files changed, 3 insertions(+), 400 deletions(-)

diff --git a/README.md b/README.md
index e7106ec..50dd23d 100644
--- a/README.md
+++ b/README.md
@@ -342,7 +342,7 @@ Key Performance Metrics:
 GPU Vs. CPU
 
 ```bash
-# GPU: Requires NVIDIA GPU with CUDA 12.1 support (~35x-100x realtime speed)
+# GPU: Requires NVIDIA GPU with CUDA 12.8 support (~35x-100x realtime speed)
 cd docker/gpu
 docker compose up --build
 
diff --git a/api/src/routers/development.py b/api/src/routers/development.py
index 1087243..3b7d38b 100644
--- a/api/src/routers/development.py
+++ b/api/src/routers/development.py
@@ -321,7 +321,7 @@ async def create_captioned_speech(
         )
     except Exception as e:
         # Handle unexpected errors
-        logger.error(f"Unexpected error in speech generation: {str(e)}")
+        logger.error(f"Unexpected error in captioned speech generation: {str(e)}")
         raise HTTPException(
             status_code=500,
             detail={
@@ -329,157 +329,4 @@ async def create_captioned_speech(
                 "message": str(e),
                 "type": "server_error",
             },
-        )
-
-    """
-    try:
-        # Set content type based on format
-        content_type = {
-            "mp3": "audio/mpeg",
-            "opus": "audio/opus",
-            "aac": "audio/aac",
-            "flac": "audio/flac",
-            "wav": "audio/wav",
-            "pcm": "audio/pcm",
-        }.get(request.response_format, f"audio/{request.response_format}")
-
-        # Create streaming audio writer and normalizer
-        writer = StreamingAudioWriter(
-            format=request.response_format, sample_rate=24000, channels=1
-        )
-        normalizer = AudioNormalizer()
-
-        # Get voice path
-        voice_name, voice_path = await tts_service._get_voice_path(request.voice)
-
-        # Use provided lang_code or determine from voice name
-        pipeline_lang_code = request.lang_code if request.lang_code else request.voice[0].lower()
-        logger.info(
-            f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in text chunking"
-        )
-
-        # Get backend and pipeline
-        backend = tts_service.model_manager.get_backend()
-        pipeline = backend._get_pipeline(pipeline_lang_code)
-
-        # Create temp file writer for timestamps
-        temp_writer = TempFileWriter("json")
-        await temp_writer.__aenter__()  # Initialize temp file
-        # Get just the filename without the path
-        timestamps_filename = Path(temp_writer.download_path).name
-
-        # Initialize variables for timestamps
-        word_timestamps = []
-        current_offset = 0.0
-
-        async def generate_chunks():
-            nonlocal current_offset, word_timestamps
-            try:
-                # Process text in chunks with smart splitting
-                async for chunk_text, tokens in smart_split(request.input):
-                    # Process chunk with pipeline
-                    for result in pipeline(chunk_text, voice=voice_path, speed=request.speed):
-                        if result.audio is not None:
-                            # Process timestamps for this chunk
-                            if hasattr(result, "tokens") and result.tokens and result.pred_dur is not None:
-                                try:
-                                    # Join timestamps for this chunk's tokens
-                                    KPipeline.join_timestamps(result.tokens, result.pred_dur)
-
-                                    # Add timestamps with offset
-                                    for token in result.tokens:
-                                        if not all(
-                                            hasattr(token, attr)
-                                            for attr in ["text", "start_ts", "end_ts"]
-                                        ):
-                                            continue
-                                        if not token.text or not token.text.strip():
-                                            continue
-
-                                        # Apply offset to timestamps
-                                        start_time = float(token.start_ts) + current_offset
-                                        end_time = float(token.end_ts) + current_offset
-
-                                        word_timestamps.append(
-                                            {
-                                                "word": str(token.text).strip(),
-                                                "start_time": start_time,
-                                                "end_time": end_time,
-                                            }
-                                        )
-
-                                    # Update offset for next chunk
-                                    chunk_duration = float(result.pred_dur.sum()) / 80  # Convert frames to seconds
-                                    current_offset = max(current_offset + chunk_duration, end_time)
-
-                                except Exception as e:
-                                    logger.error(f"Failed to process timestamps for chunk: {e}")
-
-                            # Process audio
-                            audio_chunk = result.audio.numpy()
-                            normalized_audio = await normalizer.normalize(audio_chunk)
-                            chunk_bytes = writer.write_chunk(normalized_audio)
-                            if chunk_bytes:
-                                yield chunk_bytes
-
-                # Write timestamps to temp file
-                timestamps_json = json.dumps(word_timestamps)
-                await temp_writer.write(timestamps_json.encode())
-                await temp_writer.finalize()
-
-                # Finalize audio
-                final_bytes = writer.write_chunk(finalize=True)
-                if final_bytes:
-                    yield final_bytes
-
-            except Exception as e:
-                logger.error(f"Error in audio generation: {str(e)}")
-                # Clean up writer on error
-                writer.write_chunk(finalize=True)
-                await temp_writer.__aexit__(type(e), e, e.__traceback__)
-                # Re-raise the original exception
-                raise
-
-        return StreamingResponse(
-            generate_chunks(),
-            media_type=content_type,
-            headers={
-                "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
-                "X-Accel-Buffering": "no",
-                "Cache-Control": "no-cache",
-                "Transfer-Encoding": "chunked",
-                "X-Timestamps-Path": timestamps_filename,
-            },
-        )
-
-    except ValueError as e:
-        logger.warning(f"Invalid request: {str(e)}")
-        raise HTTPException(
-            status_code=400,
-            detail={
-                "error": "validation_error",
-                "message": str(e),
-                "type": "invalid_request_error",
-            },
-        )
-    except RuntimeError as e:
-        logger.error(f"Processing error: {str(e)}")
-        raise HTTPException(
-            status_code=500,
-            detail={
-                "error": "processing_error",
-                "message": str(e),
-                "type": "server_error",
-            },
-        )
-    except Exception as e:
-        logger.error(f"Unexpected error in speech generation: {str(e)}")
-        raise HTTPException(
-            status_code=500,
-            detail={
-                "error": "processing_error",
-                "message": str(e),
-                "type": "server_error",
-            },
-        )
-    """
\ No newline at end of file
+        )
\ No newline at end of file
diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
index 733c275..cb94f94 100644
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@@ -346,250 +346,6 @@ class TTSService:
             combined_audio_data=AudioChunk.combine(audio_data_chunks)
             return combined_audio_data.audio,combined_audio_data
 
-            """
-            # Get backend and voice path
-            backend = self.model_manager.get_backend()
-            voice_name, voice_path = await self._get_voice_path(voice)
-
-            if isinstance(backend, KokoroV1):
-                # Use provided lang_code or determine from voice name
-                pipeline_lang_code = lang_code if lang_code else voice[:1].lower()
-                logger.info(
-                    f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in text chunking"
-                )
-
-                # Get pipelines from backend for proper device management
-                try:
-                    # Initialize quiet pipeline for text chunking
-                    text_chunks = []
-                    current_offset = 0.0  # Track time offset for timestamps
-
-                    logger.debug("Splitting text into chunks...")
-                    # Use backend's pipeline management
-                    for result in backend._get_pipeline(pipeline_lang_code)(text):
-                        if result.graphemes and result.phonemes:
-                            text_chunks.append((result.graphemes, result.phonemes))
-                    logger.debug(f"Split text into {len(text_chunks)} chunks")
-
-                    # Process each chunk
-                    for chunk_idx, (chunk_text, chunk_phonemes) in enumerate(
-                        text_chunks
-                    ):
-                        logger.debug(
-                            f"Processing chunk {chunk_idx + 1}/{len(text_chunks)}: '{chunk_text[:50]}...'"
-                        )
-
-                        # Use backend's pipeline for generation
-                        for result in backend._get_pipeline(pipeline_lang_code)(
-                            chunk_text, voice=voice_path, speed=speed
-                        ):
-                            # Collect audio chunks
-                            if result.audio is not None:
-                                chunks.append(result.audio.numpy())
-
-                            # Process timestamps for this chunk
-                            if (
-                                return_timestamps
-                                and hasattr(result, "tokens")
-                                and result.tokens
-                            ):
-                                logger.debug(
-                                    f"Processing chunk timestamps with {len(result.tokens)} tokens"
-                                )
-                                if result.pred_dur is not None:
-                                    try:
-                                        # Join timestamps for this chunk's tokens
-                                        KPipeline.join_timestamps(
-                                            result.tokens, result.pred_dur
-                                        )
-
-                                        # Add timestamps with offset
-                                        for token in result.tokens:
-                                            if not all(
-                                                hasattr(token, attr)
-                                                for attr in [
-                                                    "text",
-                                                    "start_ts",
-                                                    "end_ts",
-                                                ]
-                                            ):
-                                                continue
-                                            if not token.text or not token.text.strip():
-                                                continue
-
-                                            # Apply offset to timestamps
-                                            start_time = (
-                                                float(token.start_ts) + current_offset
-                                            )
-                                            end_time = (
-                                                float(token.end_ts) + current_offset
-                                            )
-
-                                            word_timestamps.append(
-                                                {
-                                                    "word": str(token.text).strip(),
-                                                    "start_time": start_time,
-                                                    "end_time": end_time,
-                                                }
-                                            )
-                                            logger.debug(
-                                                f"Added timestamp for word '{token.text}': {start_time:.3f}s - {end_time:.3f}s"
-                                            )
-
-                                        # Update offset for next chunk based on pred_dur
-                                        chunk_duration = (
-                                            float(result.pred_dur.sum()) / 80
-                                        )  # Convert frames to seconds
-                                        current_offset = max(
-                                            current_offset + chunk_duration, end_time
-                                        )
-                                        logger.debug(
-                                            f"Updated time offset to {current_offset:.3f}s"
-                                        )
-
-                                    except Exception as e:
-                                        logger.error(
-                                            f"Failed to process timestamps for chunk: {e}"
-                                        )
-                                logger.debug(
-                                    f"Processing timestamps with pred_dur shape: {result.pred_dur.shape}"
-                                )
-                                try:
-                                    # Join timestamps for this chunk's tokens
-                                    KPipeline.join_timestamps(
-                                        result.tokens, result.pred_dur
-                                    )
-                                    logger.debug(
-                                        "Successfully joined timestamps for chunk"
-                                    )
-                                except Exception as e:
-                                    logger.error(
-                                        f"Failed to join timestamps for chunk: {e}"
-                                    )
-                                    continue
-
-                                # Convert tokens to timestamps
-                                for token in result.tokens:
-                                    try:
-                                        # Skip tokens without required attributes
-                                        if not all(
-                                            hasattr(token, attr)
-                                            for attr in ["text", "start_ts", "end_ts"]
-                                        ):
-                                            logger.debug(
-                                                f"Skipping token missing attributes: {dir(token)}"
-                                            )
-                                            continue
-
-                                        # Get and validate text
-                                        text = (
-                                            str(token.text).strip()
-                                            if token.text is not None
-                                            else ""
-                                        )
-                                        if not text:
-                                            logger.debug("Skipping empty token")
-                                            continue
-
-                                        # Get and validate timestamps
-                                        start_ts = getattr(token, "start_ts", None)
-                                        end_ts = getattr(token, "end_ts", None)
-                                        if start_ts is None or end_ts is None:
-                                            logger.debug(
-                                                f"Skipping token with None timestamps: {text}"
-                                            )
-                                            continue
-
-                                        # Convert timestamps to float
-                                        try:
-                                            start_time = float(start_ts)
-                                            end_time = float(end_ts)
-                                        except (TypeError, ValueError):
-                                            logger.debug(
-                                                f"Skipping token with invalid timestamps: {text}"
-                                            )
-                                            continue
-
-                                        # Add timestamp
-                                        word_timestamps.append(
-                                            {
-                                                "word": text,
-                                                "start_time": start_time,
-                                                "end_time": end_time,
-                                            }
-                                        )
-                                        logger.debug(
-                                            f"Added timestamp for word '{text}': {start_time:.3f}s - {end_time:.3f}s"
-                                        )
-                                    except Exception as e:
-                                        logger.warning(f"Error processing token: {e}")
-                                        continue
-
-                except Exception as e:
-                    logger.error(f"Failed to process text with pipeline: {e}")
-                    raise RuntimeError(f"Pipeline processing failed: {e}")
-
-                if not chunks:
-                    raise ValueError("No audio chunks were generated successfully")
-
-                # Combine chunks
-                audio = np.concatenate(chunks) if len(chunks) > 1 else chunks[0]
-                processing_time = time.time() - start_time
-
-                if return_timestamps:
-                    # Validate timestamps before returning
-                    if not word_timestamps:
-                        logger.warning("No valid timestamps were generated")
-                    else:
-                        # Sort timestamps by start time to ensure proper order
-                        word_timestamps.sort(key=lambda x: x["start_time"])
-                        # Validate timestamp sequence
-                        for i in range(1, len(word_timestamps)):
-                            prev = word_timestamps[i - 1]
-                            curr = word_timestamps[i]
-                            if curr["start_time"] < prev["end_time"]:
-                                logger.warning(
-                                    f"Overlapping timestamps detected: '{prev['word']}' ({prev['start_time']:.3f}-{prev['end_time']:.3f}) and '{curr['word']}' ({curr['start_time']:.3f}-{curr['end_time']:.3f})"
-                                )
-
-                        logger.debug(
-                            f"Returning {len(word_timestamps)} word timestamps"
-                        )
-                        logger.debug(
-                            f"First timestamp: {word_timestamps[0]['word']} at {word_timestamps[0]['start_time']:.3f}s"
-                        )
-                        logger.debug(
-                            f"Last timestamp: {word_timestamps[-1]['word']} at {word_timestamps[-1]['end_time']:.3f}s"
-                        )
-
-                        return audio, processing_time, word_timestamps
-                return audio, processing_time
-
-            else:
-                # For legacy backends
-                async for chunk in self.generate_audio_stream(
-                    text,
-                    voice,
-                    speed,  # Default to WAV for raw audio
-                ):
-                    if chunk is not None:
-                        chunks.append(chunk)
-
-                if not chunks:
-                    raise ValueError("No audio chunks were generated successfully")
-
-                # Combine chunks
-                audio = np.concatenate(chunks) if len(chunks) > 1 else chunks[0]
-                processing_time = time.time() - start_time
-
-                if return_timestamps:
-                    return (
-                        audio,
-                        processing_time,
-                        [],
-                    )  # Empty timestamps for legacy backends
-                return audio, processing_time
-            """
         except Exception as e:
             logger.error(f"Error in audio generation: {str(e)}")
             raise