diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
index 45163c6..e465f73 100644
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@@ -270,7 +270,7 @@ class TTSService:
             # Save the new combined voice so it can be loaded later
             # Use a safe filename based on the original input string
-            safe_filename = re.sub(r'[^\w+-]', '_', voice) + ".pt"
+            safe_filename = re.sub(r'[^\w+-.\(\)]', '_', voice) + ".pt"  # Allow weights in filename
             temp_dir = tempfile.gettempdir()
             combined_path = os.path.join(temp_dir, safe_filename)
             logger.debug(f"Saving combined voice '{voice}' to temporary path: {combined_path}")
@@ -328,6 +328,7 @@ class TTSService:
                     try:
                         logger.debug(f"Generating {pause_duration_s}s silence chunk")
                         silence_samples = int(pause_duration_s * settings.sample_rate)
+                        # Create silence appropriate for AudioService (float32)
                         silence_audio = np.zeros(silence_samples, dtype=np.float32)
                         pause_chunk = AudioChunk(audio=silence_audio, word_timestamps=[])  # Empty timestamps for silence
@@ -340,13 +341,14 @@ class TTSService:
                             if formatted_pause_chunk.output:
                                 yield formatted_pause_chunk
                         else:  # Raw audio mode
+                            # Normalize to int16 for raw output consistency
                             pause_chunk.audio = stream_normalizer.normalize(pause_chunk.audio)
                             if len(pause_chunk.audio) > 0:
                                 yield pause_chunk
 
                         # Update offset based on silence duration
                         current_offset += pause_duration_s
-                        chunk_index += 1
+                        chunk_index += 1  # Count pause as a yielded chunk
 
                     except Exception as e:
                         logger.error(f"Failed to process pause chunk: {str(e)}")
@@ -368,8 +370,8 @@ class TTSService:
                         speed,
                         writer,
                         output_format,
-                        is_first=(chunk_index == 0),
-                        is_last=False,
+                        is_first=(chunk_index == 0),  # Check if this is the very first *audio* chunk
+                        is_last=False,  # is_last is handled separately after the loop
                         normalizer=stream_normalizer,
                         lang_code=pipeline_lang_code,
                         return_timestamps=return_timestamps,
@@ -381,7 +383,6 @@ class TTSService:
                                 timestamp.end_time += current_offset
 
                         # Update offset based on the *actual duration* of the generated audio chunk
-                        # Check if audio data exists before calculating duration
                         chunk_duration = 0
                         if chunk_data.audio is not None and len(chunk_data.audio) > 0:
                             chunk_duration = len(chunk_data.audio) / settings.sample_rate
@@ -397,7 +398,6 @@ class TTSService:
                             f"No audio generated or output for text chunk: '{text_chunk_for_model[:50]}...'"
                         )
 
-                # --- Add pause after newline (if applicable) ---
                 if has_trailing_newline:
                     newline_pause_s = 0.5
@@ -445,20 +445,21 @@ class TTSService:
                     "", [], voice_name, voice_path, speed, writer, output_format,
                     is_first=False, is_last=True, normalizer=stream_normalizer, lang_code=pipeline_lang_code
                 ):
+                    # Yield final formatted chunk or raw empty chunk
                     if output_format and final_chunk_data.output:
                         yield final_chunk_data
-                    elif not output_format and final_chunk_data.audio is not None and len(final_chunk_data.audio) > 0:
-                        yield final_chunk_data  # Should yield empty chunk in raw mode upon finalize
+                    elif not output_format:  # Raw mode: Finalize yields empty chunk signal
+                        yield final_chunk_data  # Yields empty AudioChunk
             except Exception as e:
                 logger.error(f"Failed to finalize audio stream: {str(e)}")
 
         except Exception as e:
             logger.exception(f"Error during audio stream generation: {str(e)}")  # Use exception for traceback
-            # Ensure writer is closed on error
-            try:
-                writer.close()
-            except Exception as close_e:
-                logger.error(f"Error closing writer during exception handling: {close_e}")
+            # Ensure writer is closed on error - moved to caller (e.g., route handler)
+            # try:
+            #     writer.close()
+            # except Exception as close_e:
+            #     logger.error(f"Error closing writer during exception handling: {close_e}")
             raise e  # Re-raise the original exception
@@ -477,38 +478,51 @@ class TTSService:
         output_format = None  # Signal raw audio mode for internal streaming
         combined_chunk = None
         try:
+            # Pass a dummy writer if none provided, as generate_audio_stream requires one
+            # Although in raw mode (output_format=None), it shouldn't be heavily used for formatting
+            internal_writer = writer if writer else StreamingAudioWriter(format='wav', sample_rate=settings.sample_rate)
+
             async for audio_stream_data in self.generate_audio_stream(
                 text,
                 voice,
-                writer,  # Pass writer, although it won't be used for formatting here
+                internal_writer,  # Pass the writer instance
                 speed=speed,
                 normalization_options=normalization_options,
-                return_timestamps=return_timestamps,
+                return_timestamps=return_timestamps,  # Pass this down
                 lang_code=lang_code,
                 output_format=output_format,  # Explicitly None for raw audio
             ):
                 # Ensure we only append chunks with actual audio data
                 # Raw silence chunks generated for pauses will have audio data (zeros)
                 if audio_stream_data.audio is not None and len(audio_stream_data.audio) > 0:
+                    # Ensure timestamps are preserved if requested
+                    if return_timestamps and not audio_stream_data.word_timestamps:
+                        audio_stream_data.word_timestamps = []  # Initialize if needed
                     audio_data_chunks.append(audio_stream_data)
 
             if not audio_data_chunks:
                 logger.warning("No valid audio chunks generated.")
                 combined_chunk = AudioChunk(audio=np.array([], dtype=np.int16), word_timestamps=[])
+
             else:
                 combined_chunk = AudioChunk.combine(audio_data_chunks)
+                # Ensure the combined audio is int16 before returning, as downstream expects this raw format.
+                if combined_chunk.audio.dtype != np.int16:
+                    logger.warning(f"Combined audio dtype is {combined_chunk.audio.dtype}, converting to int16.")
+                    # Assuming normalization happened, scale from float [-1, 1] to int16
+                    if np.issubdtype(combined_chunk.audio.dtype, np.floating):
+                        combined_chunk.audio = np.clip(combined_chunk.audio * 32767, -32768, 32767).astype(np.int16)
+                    else:
+                        # If it's another type, attempt direct conversion (might be lossy)
+                        combined_chunk.audio = combined_chunk.audio.astype(np.int16)
+
             return combined_chunk
         except Exception as e:
             logger.error(f"Error in combined audio generation: {str(e)}")
             raise  # Re-raise after logging
-        finally:
-            # Explicitly close the writer if it was passed, though it shouldn't hold resources in raw mode
-            try:
-                writer.close()
-            except Exception:
-                pass  # Ignore errors during cleanup
-
+        # Removed finally block that closed the writer prematurely
+        # The caller is now responsible for closing the writer after final conversion.
 
     async def combine_voices(self, voices: List[str]) -> torch.Tensor:
@@ -584,6 +598,10 @@ class TTSService:
                 # Normalize the final audio before returning
                 normalizer = AudioNormalizer()
                 normalized_audio = normalizer.normalize(result_audio)
+                # Return as int16 for consistency
+                if normalized_audio.dtype != np.int16:
+                    normalized_audio = np.clip(normalized_audio * 32767, -32768, 32767).astype(np.int16)
+
                 return normalized_audio, processing_time
             else:
                 raise ValueError(
diff --git a/run-tests.sh b/run-tests.sh
new file mode 100755
index 0000000..9cc671e
--- /dev/null
+++ b/run-tests.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Get project root directory
+PROJECT_ROOT=$(pwd)
+
+# Set environment variables
+export USE_GPU=false
+export USE_ONNX=false
+export PYTHONPATH=$PROJECT_ROOT:$PROJECT_ROOT/api
+export MODEL_DIR=src/models
+export VOICES_DIR=src/voices/v1_0
+export WEB_PLAYER_PATH=$PROJECT_ROOT/web
+# Set the espeak-ng data path to your location
+export ESPEAK_DATA_PATH=/usr/lib/x86_64-linux-gnu/espeak-ng-data
+
+# Install test and CPU dependencies, fetch the model, then run the test suite.
+# Note: espeak-ng itself may still require manual installation.
+uv pip install -e ".[test,cpu]"
+uv run --no-sync python docker/scripts/download_model.py --output api/src/models/v1_0
+
+uv run pytest api/tests/ --asyncio-mode=auto --cov=api --cov-report=term-missing
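
The float-to-int16 conversion now appears in three places in this patch (combined chunks, silence handling, and final normalization). A minimal standalone sketch of that shared pattern, assuming float input is already normalized to [-1.0, 1.0]; the helper name `to_int16` is illustrative, not part of the codebase:

```python
import numpy as np

def to_int16(audio: np.ndarray) -> np.ndarray:
    """Convert audio to int16 PCM, assuming floats are normalized to [-1.0, 1.0]."""
    if audio.dtype == np.int16:
        return audio
    if np.issubdtype(audio.dtype, np.floating):
        # Scale to the int16 range; clip first to avoid wrap-around on overflow
        return np.clip(audio * 32767, -32768, 32767).astype(np.int16)
    # Any other dtype: direct cast (may be lossy)
    return audio.astype(np.int16)

# e.g. to_int16(np.zeros(24000, dtype=np.float32)) yields a silent int16 chunk
```

Factoring the three near-identical blocks into one such helper would keep the clipping and scaling behavior consistent across call sites.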
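Since the patch removes both writer-closing sites from the service, callers now own the writer lifecycle. A hedged sketch of the caller side under that assumption; `tts_service.generate_audio` and `synthesize` are illustrative names, only `StreamingAudioWriter` and `settings.sample_rate` appear in the diff itself:

```python
# Sketch only: tts_service, StreamingAudioWriter, and settings are assumed to be
# imported from the application; generate_audio is a hypothetical method name.
async def synthesize(text: str, voice: str) -> bytes:
    writer = StreamingAudioWriter(format="wav", sample_rate=settings.sample_rate)
    try:
        chunk = await tts_service.generate_audio(text, voice, writer)
        return chunk.audio.tobytes()  # int16 PCM after the conversions above
    finally:
        writer.close()  # the caller, not the service, now closes the writer
```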