mirror of
https://github.com/remsky/Kokoro-FastAPI.git
synced 2025-08-05 16:48:53 +00:00
Refactor audio service chunk size, remove unused help icon, and optimize text processing limits
This commit is contained in:
parent
5d48688ab0
commit
444491defe
5 changed files with 63 additions and 226 deletions
|
@ -9,9 +9,9 @@ from .normalizer import normalize_text
|
||||||
from .vocabulary import tokenize
|
from .vocabulary import tokenize
|
||||||
|
|
||||||
# Target token ranges
|
# Target token ranges
|
||||||
TARGET_MIN = 200
|
TARGET_MIN = 175
|
||||||
TARGET_MAX = 350
|
TARGET_MAX = 250
|
||||||
ABSOLUTE_MAX = 500
|
ABSOLUTE_MAX = 450
|
||||||
|
|
||||||
def process_text_chunk(text: str, language: str = "a", skip_phonemize: bool = False) -> List[int]:
|
def process_text_chunk(text: str, language: str = "a", skip_phonemize: bool = False) -> List[int]:
|
||||||
"""Process a chunk of text through normalization, phonemization, and tokenization.
|
"""Process a chunk of text through normalization, phonemization, and tokenization.
|
||||||
|
@ -27,12 +27,24 @@ def process_text_chunk(text: str, language: str = "a", skip_phonemize: bool = Fa
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
if skip_phonemize:
|
if skip_phonemize:
|
||||||
|
# Input is already phonemes, just tokenize
|
||||||
|
t0 = time.time()
|
||||||
tokens = tokenize(text)
|
tokens = tokenize(text)
|
||||||
|
t1 = time.time()
|
||||||
else:
|
else:
|
||||||
# Normal text processing pipeline
|
# Normal text processing pipeline
|
||||||
|
t0 = time.time()
|
||||||
normalized = normalize_text(text)
|
normalized = normalize_text(text)
|
||||||
|
t1 = time.time()
|
||||||
|
|
||||||
|
|
||||||
|
t0 = time.time()
|
||||||
phonemes = phonemize(normalized, language, normalize=False) # Already normalized
|
phonemes = phonemize(normalized, language, normalize=False) # Already normalized
|
||||||
|
t1 = time.time()
|
||||||
|
|
||||||
|
t0 = time.time()
|
||||||
tokens = tokenize(phonemes)
|
tokens = tokenize(phonemes)
|
||||||
|
t1 = time.time()
|
||||||
|
|
||||||
total_time = time.time() - start_time
|
total_time = time.time() - start_time
|
||||||
logger.debug(f"Total processing took {total_time*1000:.2f}ms for chunk: '{text[:50]}...'")
|
logger.debug(f"Total processing took {total_time*1000:.2f}ms for chunk: '{text[:50]}...'")
|
||||||
|
@ -83,55 +95,13 @@ def get_sentence_info(text: str) -> List[Tuple[str, List[int], int]]:
|
||||||
return results
|
return results
|
||||||
|
|
||||||
async def smart_split(text: str, max_tokens: int = ABSOLUTE_MAX) -> AsyncGenerator[Tuple[str, List[int]], None]:
|
async def smart_split(text: str, max_tokens: int = ABSOLUTE_MAX) -> AsyncGenerator[Tuple[str, List[int]], None]:
|
||||||
"""Build optimal chunks targeting 300-400 tokens, never exceeding max_tokens.
|
"""Build optimal chunks targeting 300-400 tokens, never exceeding max_tokens."""
|
||||||
Special symbols:
|
|
||||||
- <<>> : Forces a break between chunks
|
|
||||||
"""
|
|
||||||
CHUNK_BREAK = "<<>>"
|
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
chunk_count = 0
|
chunk_count = 0
|
||||||
logger.info(f"Starting smart split for {len(text)} chars")
|
logger.info(f"Starting smart split for {len(text)} chars")
|
||||||
|
|
||||||
# First split on forced break symbol
|
# Process all sentences
|
||||||
forced_chunks = [chunk.strip() for chunk in text.split(CHUNK_BREAK) if chunk.strip()]
|
sentences = get_sentence_info(text)
|
||||||
|
|
||||||
# If no forced breaks, process normally
|
|
||||||
if len(forced_chunks) <= 1:
|
|
||||||
sentences = get_sentence_info(text)
|
|
||||||
else:
|
|
||||||
# Process each forced chunk separately
|
|
||||||
for forced_chunk in forced_chunks:
|
|
||||||
# Process sentences within this forced chunk
|
|
||||||
chunk_sentences = get_sentence_info(forced_chunk)
|
|
||||||
|
|
||||||
# Process and yield all sentences in this chunk before moving to next
|
|
||||||
current_chunk = []
|
|
||||||
current_tokens = []
|
|
||||||
current_count = 0
|
|
||||||
|
|
||||||
for sentence, tokens, count in chunk_sentences:
|
|
||||||
if current_count + count <= TARGET_MAX:
|
|
||||||
current_chunk.append(sentence)
|
|
||||||
current_tokens.extend(tokens)
|
|
||||||
current_count += count
|
|
||||||
else:
|
|
||||||
if current_chunk:
|
|
||||||
chunk_text = " ".join(current_chunk)
|
|
||||||
chunk_count += 1
|
|
||||||
yield chunk_text, current_tokens
|
|
||||||
current_chunk = [sentence]
|
|
||||||
current_tokens = tokens
|
|
||||||
current_count = count
|
|
||||||
|
|
||||||
# Yield remaining sentences in this forced chunk
|
|
||||||
if current_chunk:
|
|
||||||
chunk_text = " ".join(current_chunk)
|
|
||||||
chunk_count += 1
|
|
||||||
yield chunk_text, current_tokens
|
|
||||||
|
|
||||||
# Skip the rest of the processing since we've handled all chunks
|
|
||||||
return
|
|
||||||
|
|
||||||
current_chunk = []
|
current_chunk = []
|
||||||
current_tokens = []
|
current_tokens = []
|
||||||
|
|
|
@ -45,17 +45,6 @@
|
||||||
</header>
|
</header>
|
||||||
|
|
||||||
<main>
|
<main>
|
||||||
<div class="help-icon" title="Tips">
|
|
||||||
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg">
|
|
||||||
<path d="M12 22C6.477 22 2 17.523 2 12S6.477 2 12 2s10 4.477 10 10-4.477 10-10 10zm-1-7v2h2v-2h-2zm2-1.645A3.502 3.502 0 0012 6.5a3.501 3.501 0 00-3.433 2.813l1.962.393A1.5 1.5 0 1112 11.5a1 1 0 00-1 1V14h2v-.645z" fill="currentColor"/>
|
|
||||||
</svg>
|
|
||||||
<div class="tooltip-content">
|
|
||||||
<h4>Tips</h4>
|
|
||||||
<ul>
|
|
||||||
<li>Use <code><<>></code> to add an intentional break between chunks</li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
<div id="text-editor"></div>
|
<div id="text-editor"></div>
|
||||||
<div class="controls">
|
<div class="controls">
|
||||||
<div class="voice-select-container">
|
<div class="voice-select-container">
|
||||||
|
|
|
@ -8,7 +8,7 @@ export class AudioService {
|
||||||
this.minimumPlaybackSize = 50000; // 50KB minimum before playback
|
this.minimumPlaybackSize = 50000; // 50KB minimum before playback
|
||||||
this.textLength = 0;
|
this.textLength = 0;
|
||||||
this.shouldAutoplay = false;
|
this.shouldAutoplay = false;
|
||||||
this.CHARS_PER_CHUNK = 300; // Estimated chars per chunk
|
this.CHARS_PER_CHUNK = 150; // Estimated chars per chunk
|
||||||
this.serverDownloadPath = null; // Server-side download path
|
this.serverDownloadPath = null; // Server-side download path
|
||||||
this.pendingOperations = []; // Queue for buffer operations
|
this.pendingOperations = []; // Queue for buffer operations
|
||||||
}
|
}
|
||||||
|
|
|
@ -77,7 +77,7 @@
|
||||||
color: var(--text-light);
|
color: var(--text-light);
|
||||||
padding: 0.5rem 1rem;
|
padding: 0.5rem 1rem;
|
||||||
border-radius: 0.5rem;
|
border-radius: 0.5rem;
|
||||||
cursor: pointer;
|
cursor: pointer;
|
||||||
transition: all 0.2s ease;
|
transition: all 0.2s ease;
|
||||||
width: auto;
|
width: auto;
|
||||||
font-size: 0.875rem;
|
font-size: 0.875rem;
|
||||||
|
@ -181,111 +181,17 @@
|
||||||
box-shadow: none;
|
box-shadow: none;
|
||||||
}
|
}
|
||||||
|
|
||||||
.help-icon {
|
|
||||||
position: absolute;
|
|
||||||
top: 1rem;
|
|
||||||
right: 1rem;
|
|
||||||
color: var(--text-light);
|
|
||||||
cursor: pointer;
|
|
||||||
z-index: 2;
|
|
||||||
opacity: 0.7;
|
|
||||||
transition: opacity 0.3s ease;
|
|
||||||
}
|
|
||||||
|
|
||||||
.help-icon:hover {
|
|
||||||
opacity: 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
.tooltip-content {
|
|
||||||
visibility: hidden;
|
|
||||||
opacity: 0;
|
|
||||||
position: absolute;
|
|
||||||
top: calc(100% + 10px);
|
|
||||||
right: 0;
|
|
||||||
background: var(--surface);
|
|
||||||
border: 1px solid var(--border);
|
|
||||||
border-radius: 0.5rem;
|
|
||||||
padding: 1rem;
|
|
||||||
width: 300px;
|
|
||||||
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1),
|
|
||||||
0 2px 4px -1px rgba(0, 0, 0, 0.06);
|
|
||||||
z-index: 1000;
|
|
||||||
transition: visibility 0s linear 0.3s, opacity 0.3s ease;
|
|
||||||
}
|
|
||||||
|
|
||||||
.help-icon:hover .tooltip-content {
|
|
||||||
visibility: visible;
|
|
||||||
opacity: 1;
|
|
||||||
transition: visibility 0s linear 0.2s, opacity 0.3s ease 0.2s;
|
|
||||||
}
|
|
||||||
|
|
||||||
.tooltip-content h4 {
|
|
||||||
margin: 0 0 0.5rem 0;
|
|
||||||
color: var(--text);
|
|
||||||
}
|
|
||||||
|
|
||||||
.tooltip-content ul {
|
|
||||||
margin: 0;
|
|
||||||
padding-left: 1.25rem;
|
|
||||||
color: var(--text-light);
|
|
||||||
}
|
|
||||||
|
|
||||||
.tooltip-content code {
|
|
||||||
background: rgba(99, 102, 241, 0.1);
|
|
||||||
padding: 0.125rem 0.25rem;
|
|
||||||
border-radius: 0.25rem;
|
|
||||||
font-family: monospace;
|
|
||||||
}
|
|
||||||
|
|
||||||
main {
|
main {
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: 1fr 300px;
|
grid-template-columns: 1fr 300px;
|
||||||
grid-template-rows: 1fr auto;
|
gap: 1.5rem;
|
||||||
gap: 1.25rem;
|
row-gap: 0;
|
||||||
height: auto;
|
|
||||||
min-height: calc(100vh - 3rem);
|
|
||||||
max-width: 1200px;
|
max-width: 1200px;
|
||||||
margin: 0 auto;
|
margin: 0 auto;
|
||||||
padding: 1rem 1rem 2rem 1rem;
|
padding: 1.5rem;
|
||||||
align-items: start;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
.text-editor,
|
|
||||||
.controls,
|
|
||||||
.player-container {
|
|
||||||
margin-bottom: 0.5rem;
|
|
||||||
}
|
|
||||||
|
|
||||||
.generation-progress {
|
|
||||||
width: 100%;
|
|
||||||
height: 4px;
|
|
||||||
background: rgba(99, 102, 241, 0.1);
|
|
||||||
border-radius: 2px;
|
|
||||||
margin: 0.75rem 0;
|
|
||||||
overflow: hidden;
|
|
||||||
position: relative;
|
|
||||||
}
|
|
||||||
|
|
||||||
.generation-progress::after {
|
|
||||||
content: '';
|
|
||||||
position: absolute;
|
|
||||||
top: 0;
|
|
||||||
left: 0;
|
|
||||||
height: 100%;
|
|
||||||
width: 30%;
|
|
||||||
background: var(--fg-color);
|
|
||||||
border-radius: 2px;
|
|
||||||
animation: progress 1.5s ease-in-out infinite;
|
|
||||||
}
|
|
||||||
|
|
||||||
@keyframes progress {
|
|
||||||
0% {
|
|
||||||
left: -30%;
|
|
||||||
}
|
|
||||||
100% {
|
|
||||||
left: 100%;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Custom scrollbar styles */
|
/* Custom scrollbar styles */
|
||||||
::-webkit-scrollbar {
|
::-webkit-scrollbar {
|
||||||
|
@ -309,33 +215,30 @@ main {
|
||||||
|
|
||||||
.text-editor {
|
.text-editor {
|
||||||
grid-column: 1;
|
grid-column: 1;
|
||||||
grid-row: 1;
|
min-height: 400px;
|
||||||
min-height: 0;
|
max-height: 600px;
|
||||||
height: calc(100vh - 14rem);
|
|
||||||
overflow: auto;
|
overflow: auto;
|
||||||
scrollbar-width: thin;
|
scrollbar-width: thin;
|
||||||
scrollbar-color: rgba(99, 102, 241, 0.2) transparent;
|
scrollbar-color: rgba(99, 102, 241, 0.2) transparent;
|
||||||
margin: 0;
|
margin-bottom: 1.5rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
.controls {
|
.controls {
|
||||||
grid-column: 2;
|
grid-column: 2;
|
||||||
grid-row: 1;
|
|
||||||
min-height: 0;
|
|
||||||
height: calc(100vh - 14rem);
|
|
||||||
display: flex;
|
display: flex;
|
||||||
flex-direction: column;
|
flex-direction: column;
|
||||||
gap: 1.25rem;
|
gap: 1rem;
|
||||||
background: var(--surface);
|
background: var(--surface);
|
||||||
border: 1px solid var(--border);
|
border: 1px solid var(--border);
|
||||||
border-radius: 0.5rem;
|
border-radius: 0.5rem;
|
||||||
padding: 1.25rem;
|
padding: 1rem;
|
||||||
overflow-y: auto;
|
overflow-y: auto;
|
||||||
overflow-x: hidden;
|
overflow-x: hidden;
|
||||||
scrollbar-width: thin;
|
scrollbar-width: thin;
|
||||||
scrollbar-color: rgba(99, 102, 241, 0.2) transparent;
|
scrollbar-color: rgba(99, 102, 241, 0.2) transparent;
|
||||||
margin: 0;
|
height: fit-content;
|
||||||
position: relative;
|
margin-bottom: 1rem;
|
||||||
|
max-height: 600px;
|
||||||
}
|
}
|
||||||
|
|
||||||
.voice-select-container {
|
.voice-select-container {
|
||||||
|
@ -347,8 +250,8 @@ main {
|
||||||
border-radius: 0.5rem;
|
border-radius: 0.5rem;
|
||||||
padding: 1rem;
|
padding: 1rem;
|
||||||
height: auto;
|
height: auto;
|
||||||
min-height: 120px;
|
min-height: 160px;
|
||||||
max-height: 200px;
|
max-height: 240px;
|
||||||
flex-shrink: 0;
|
flex-shrink: 0;
|
||||||
margin: 0.5rem 0;
|
margin: 0.5rem 0;
|
||||||
overflow: visible;
|
overflow: visible;
|
||||||
|
@ -356,13 +259,13 @@ main {
|
||||||
|
|
||||||
.selected-voices {
|
.selected-voices {
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: repeat(auto-fill, minmax(120px, 1fr));
|
grid-template-columns: repeat(auto-fill, minmax(120px, 4fr));
|
||||||
gap: 0.5rem;
|
gap: 0.5rem;
|
||||||
margin-top: 0.25rem;
|
margin-top: 0.25rem;
|
||||||
height: auto;
|
height: auto;
|
||||||
min-height: 60px;
|
min-height: 50px;
|
||||||
max-height: none;
|
max-height: 100%;
|
||||||
overflow-y: visible;
|
overflow-y: auto;
|
||||||
padding: 0.75rem;
|
padding: 0.75rem;
|
||||||
background: rgba(15, 23, 42, 0.3);
|
background: rgba(15, 23, 42, 0.3);
|
||||||
border-radius: 0.25rem;
|
border-radius: 0.25rem;
|
||||||
|
@ -435,22 +338,36 @@ main {
|
||||||
padding: 0.375rem 0.75rem;
|
padding: 0.375rem 0.75rem;
|
||||||
border-radius: 1rem;
|
border-radius: 1rem;
|
||||||
font-size: 0.75rem;
|
font-size: 0.75rem;
|
||||||
display: inline-flex;
|
display: grid;
|
||||||
|
grid-template-columns: auto auto 1fr;
|
||||||
align-items: center;
|
align-items: center;
|
||||||
gap: 0.375rem;
|
gap: 0.75rem;
|
||||||
border: 1px solid rgba(99, 102, 241, 0.3);
|
border: 1px solid rgba(99, 102, 241, 0.3);
|
||||||
white-space: nowrap;
|
white-space: nowrap;
|
||||||
flex-shrink: 0;
|
flex-shrink: 0;
|
||||||
|
min-width: 180px;
|
||||||
|
max-width: 100%;
|
||||||
|
}
|
||||||
|
.remove-voice {
|
||||||
|
font-size: 1.2em;
|
||||||
}
|
}
|
||||||
|
|
||||||
.selected-voice-tag input {
|
.selected-voice-tag input {
|
||||||
width: 2.5em;
|
width: 100%;
|
||||||
padding: 0.1rem;
|
padding: 0.25rem;
|
||||||
min-height: 1.25em;
|
min-height: 1.5em;
|
||||||
background: transparent;
|
background: transparent;
|
||||||
border: none;
|
border: none;
|
||||||
color: inherit;
|
color: inherit;
|
||||||
font-size: inherit;
|
font-size: inherit;
|
||||||
|
text-align: center;
|
||||||
|
border-radius: 0.25rem;
|
||||||
|
transition: background-color 0.2s;
|
||||||
|
}
|
||||||
|
|
||||||
|
.selected-voice-tag input:hover,
|
||||||
|
.selected-voice-tag input:focus {
|
||||||
|
background: rgba(99, 102, 241, 0.1);
|
||||||
}
|
}
|
||||||
|
|
||||||
.remove-voice {
|
.remove-voice {
|
||||||
|
@ -569,29 +486,23 @@ main {
|
||||||
display: flex;
|
display: flex;
|
||||||
flex-direction: column;
|
flex-direction: column;
|
||||||
height: 100vh;
|
height: 100vh;
|
||||||
padding: 0.75rem 1.25rem 1.5rem 1.25rem;
|
padding: 0.75rem 1rem 1rem 1rem;
|
||||||
gap: 1rem;
|
gap: 1rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
.player-container {
|
.player-container {
|
||||||
grid-column: 1 / -1;
|
grid-column: 1 / -1;
|
||||||
grid-row: 2;
|
|
||||||
background: var(--surface);
|
background: var(--surface);
|
||||||
border: 1px solid var(--border);
|
border: 1px solid var(--border);
|
||||||
border-radius: 0.5rem;
|
border-radius: 0.5rem;
|
||||||
padding: 1.25rem 1.5rem;
|
padding: 1.25rem 1.5rem;
|
||||||
height: auto;
|
|
||||||
min-height: 90px;
|
min-height: 90px;
|
||||||
display: flex;
|
display: flex;
|
||||||
flex-direction: column;
|
flex-direction: column;
|
||||||
justify-content: center;
|
justify-content: center;
|
||||||
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1),
|
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1),
|
||||||
0 2px 4px -1px rgba(0, 0, 0, 0.06);
|
0 2px 4px -1px rgba(0, 0, 0, 0.06);
|
||||||
margin: 1rem 0;
|
|
||||||
align-self: start;
|
|
||||||
width: 100%;
|
width: 100%;
|
||||||
position: relative;
|
|
||||||
z-index: 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
.options {
|
.options {
|
||||||
|
@ -613,7 +524,7 @@ main {
|
||||||
align-items: center;
|
align-items: center;
|
||||||
gap: 1rem;
|
gap: 1rem;
|
||||||
padding: 0.5rem 0;
|
padding: 0.5rem 0;
|
||||||
margin-top: 0.5rem;
|
margin-top: auto;
|
||||||
border-top: 1px solid var(--border);
|
border-top: 1px solid var(--border);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -686,4 +597,8 @@ button:disabled {
|
||||||
|
|
||||||
.loading .btn-text {
|
.loading .btn-text {
|
||||||
display: none;
|
display: none;
|
||||||
}
|
}
|
||||||
|
.text-editor,
|
||||||
|
.controls {
|
||||||
|
height: 600px;
|
||||||
|
}
|
||||||
|
|
|
@ -133,43 +133,6 @@
|
||||||
border-left: 1px solid rgba(99, 102, 241, 0.2);
|
border-left: 1px solid rgba(99, 102, 241, 0.2);
|
||||||
}
|
}
|
||||||
|
|
||||||
.generation-progress {
|
|
||||||
-webkit-appearance: none;
|
|
||||||
appearance: none;
|
|
||||||
width: 100%;
|
|
||||||
height: 6px;
|
|
||||||
border: none;
|
|
||||||
background: rgba(99, 102, 241, 0.1);
|
|
||||||
border-radius: 3px;
|
|
||||||
margin: 0.5rem 0;
|
|
||||||
display: block;
|
|
||||||
}
|
|
||||||
|
|
||||||
.generation-progress::-webkit-progress-bar {
|
|
||||||
background: rgba(99, 102, 241, 0.1);
|
|
||||||
border-radius: 3px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.generation-progress::-webkit-progress-value {
|
|
||||||
background: var(--fg-color);
|
|
||||||
border-radius: 3px;
|
|
||||||
transition: width 0.2s ease;
|
|
||||||
box-shadow: 0 0 10px rgba(99, 102, 241, 0.3);
|
|
||||||
}
|
|
||||||
|
|
||||||
.generation-progress::-moz-progress-bar {
|
|
||||||
background: var(--fg-color);
|
|
||||||
border-radius: 3px;
|
|
||||||
transition: width 0.2s ease;
|
|
||||||
box-shadow: 0 0 10px rgba(99, 102, 241, 0.3);
|
|
||||||
}
|
|
||||||
|
|
||||||
.generation-progress::-ms-fill {
|
|
||||||
background: var(--fg-color);
|
|
||||||
border-radius: 3px;
|
|
||||||
transition: width 0.2s ease;
|
|
||||||
box-shadow: 0 0 10px rgba(99, 102, 241, 0.3);
|
|
||||||
}
|
|
||||||
|
|
||||||
.wave-container canvas {
|
.wave-container canvas {
|
||||||
position: absolute;
|
position: absolute;
|
||||||
|
|
Loading…
Add table
Reference in a new issue