diff --git a/api/src/inference/base.py b/api/src/inference/base.py
index e25c2b5..fe70639 100644
--- a/api/src/inference/base.py
+++ b/api/src/inference/base.py
@@ -12,12 +12,12 @@ class AudioChunk:
 
     def __init__(
         self,
-        audio: np.ndarray,
-        word_timestamps: Optional[List] = [],
+        audio: np.ndarray,  # dtype: np.int16
+        word_timestamps: Optional[List] = None,  # Use None instead of `[]` to avoid sharing one mutable default list across instances.
         output: Optional[Union[bytes, np.ndarray]] = b"",
    ):
         self.audio = audio
-        self.word_timestamps = word_timestamps
+        self.word_timestamps = word_timestamps if word_timestamps is not None else []
         self.output = output
 
     @staticmethod
diff --git a/api/src/services/tts_service.py b/api/src/services/tts_service.py
index 962d42d..7a8aaa3 100644
--- a/api/src/services/tts_service.py
+++ b/api/src/services/tts_service.py
@@ -65,7 +65,7 @@ class TTSService:
             # Handle silence tags, e.g. `[silent](0.5s)`
             if match := SILENCE_TAG.match(chunk_text):
                 silence_duration = float(match.group(1))
-                silence_audio = np.zeros(int(silence_duration * 24000), dtype=np.float32)
+                silence_audio = np.zeros(int(silence_duration * 24000), dtype=np.int16)
                 if not output_format:
                     yield AudioChunk(silence_audio, output=b"")
                     return
@@ -89,7 +89,7 @@ class TTSService:
                 return
             chunk_data = await AudioService.convert_audio(
                 AudioChunk(
-                    np.array([], dtype=np.float32)
+                    np.array([], dtype=np.int16)
                 ),  # Dummy data for type checking
                 output_format,
                 writer,
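Why the `word_timestamps` change matters: in Python, a mutable default like `[]` is evaluated once at function definition time, so every `AudioChunk` constructed without explicit timestamps would share (and mutate) the same list object. A minimal standalone sketch of the pitfall and the `None`-sentinel fix, not taken from this codebase:

```python
# Sketch of the mutable-default bug fixed in AudioChunk.__init__ (illustrative only).
class Broken:
    def __init__(self, word_timestamps=[]):  # one list object, shared by every call
        self.word_timestamps = word_timestamps

a = Broken()
b = Broken()
a.word_timestamps.append("hello")
print(b.word_timestamps)  # ['hello'] -- b sees a's mutation

class Fixed:
    def __init__(self, word_timestamps=None):  # None sentinel, fresh list per instance
        self.word_timestamps = word_timestamps if word_timestamps is not None else []

c = Fixed()
d = Fixed()
c.word_timestamps.append("hello")
print(d.word_timestamps)  # [] -- each instance owns its own list
```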
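On the dtype change: with `AudioChunk.audio` now documented as int16 PCM, silence and dummy arrays built with `np.float32` would silently promote any concatenated output to float32 and be misread by consumers expecting 16-bit samples. A hedged sketch of the invariant under that assumption; the `float_to_int16` helper and the test tone are illustrative, not code from this repo:

```python
import numpy as np

# Hypothetical helper (not in the PR): convert model output in float32 [-1.0, 1.0]
# to the int16 PCM that AudioChunk.audio is now annotated with.
def float_to_int16(audio: np.ndarray) -> np.ndarray:
    clipped = np.clip(audio, -1.0, 1.0)          # guard against overshoot
    return (clipped * 32767.0).astype(np.int16)  # scale to the int16 range

SAMPLE_RATE = 24000  # from the diff: silence length is duration * 24000

speech = float_to_int16(np.sin(np.linspace(0, 440 * 2 * np.pi, SAMPLE_RATE)))
silence = np.zeros(int(0.5 * SAMPLE_RATE), dtype=np.int16)  # matches the fix

combined = np.concatenate([speech, silence])  # uniform dtype, no silent upcast
assert combined.dtype == np.int16
```

Had `silence` stayed `np.float32`, NumPy's type promotion would make `combined` float32, which is exactly the mismatch the diff removes.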