Spaces:
Running
on
Zero
Running
on
Zero
Upload app.py
Browse files
app.py
CHANGED
@@ -100,8 +100,19 @@ phonemizers = dict(
|
|
100 |
j=Katsu(),
|
101 |
)
|
102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
def phonemize(text, voice, norm=True):
|
104 |
-
lang = voice[0]
|
105 |
if norm:
|
106 |
text = normalize(text)
|
107 |
ps = phonemizers[lang].phonemize([text])
|
@@ -182,8 +193,8 @@ VOICES = {device: {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt')
|
|
182 |
SAMPLE_RATE = 24000
|
183 |
|
184 |
@torch.no_grad()
|
185 |
-
def forward(tokens,
|
186 |
-
ref_s = VOICES[device][
|
187 |
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
|
188 |
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
|
189 |
text_mask = length_to_mask(input_lengths).to(device)
|
@@ -207,8 +218,8 @@ def forward(tokens, voice, speed, device='cpu'):
|
|
207 |
return models[device].decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
|
208 |
|
209 |
@spaces.GPU(duration=10)
|
210 |
-
def forward_gpu(tokens,
|
211 |
-
return forward(tokens,
|
212 |
|
213 |
def clamp_speed(speed):
|
214 |
if not isinstance(speed, float) and not isinstance(speed, int):
|
@@ -221,7 +232,7 @@ def clamp_speed(speed):
|
|
221 |
|
222 |
# Must be backwards compatible with https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
|
223 |
def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto'):
|
224 |
-
|
225 |
ps = ps or phonemize(text, voice)
|
226 |
speed = clamp_speed(speed)
|
227 |
trim = trim if isinstance(trim, int) else 3000
|
@@ -235,14 +246,14 @@ def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto'):
|
|
235 |
use_gpu = len(ps) > 99 if use_gpu == 'auto' else use_gpu
|
236 |
try:
|
237 |
if use_gpu:
|
238 |
-
out = forward_gpu(tokens,
|
239 |
else:
|
240 |
-
out = forward(tokens,
|
241 |
except gr.exceptions.Error as e:
|
242 |
if use_gpu:
|
243 |
gr.Warning(str(e))
|
244 |
gr.Info('GPU failover to CPU')
|
245 |
-
out = forward(tokens,
|
246 |
else:
|
247 |
raise gr.Error(e)
|
248 |
return (None, '')
|
@@ -265,12 +276,15 @@ USE_GPU_INFOS = {
|
|
265 |
def change_use_gpu(value):
|
266 |
return gr.Dropdown(USE_GPU_CHOICES, value=value, label='Hardware', info=USE_GPU_INFOS[value], interactive=CUDA_AVAILABLE)
|
267 |
|
|
|
|
|
|
|
268 |
with gr.Blocks() as basic_tts:
|
269 |
with gr.Row():
|
270 |
with gr.Column():
|
271 |
text = gr.Textbox(label='Input Text', info='Generate speech for one segment of text using Kokoro, a TTS model with 80 million parameters')
|
272 |
with gr.Row():
|
273 |
-
voice = gr.Dropdown(list(CHOICES.items()), value='af', label='Voice', info='Starred voices are more stable')
|
274 |
use_gpu = gr.Dropdown(
|
275 |
USE_GPU_CHOICES,
|
276 |
value='auto' if CUDA_AVAILABLE else False,
|
@@ -298,12 +312,21 @@ with gr.Blocks() as basic_tts:
|
|
298 |
trim = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='✂️ Trim', info='Cut from both ends')
|
299 |
with gr.Accordion('Output Tokens', open=True):
|
300 |
out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
301 |
text.submit(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
|
302 |
generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
|
303 |
|
304 |
@torch.no_grad()
|
305 |
-
def lf_forward(token_lists,
|
306 |
-
voicepack = VOICES[device][
|
307 |
outs = []
|
308 |
for tokens in token_lists:
|
309 |
ref_s = voicepack[len(tokens)]
|
@@ -331,8 +354,8 @@ def lf_forward(token_lists, voice, speed, device='cpu'):
|
|
331 |
return outs
|
332 |
|
333 |
@spaces.GPU
|
334 |
-
def lf_forward_gpu(token_lists,
|
335 |
-
return lf_forward(token_lists,
|
336 |
|
337 |
def resplit_strings(arr):
|
338 |
# Handle edge cases
|
@@ -388,6 +411,8 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
|
|
388 |
|
389 |
def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True):
|
390 |
token_lists = list(map(tokenize, segments['Tokens']))
|
|
|
|
|
391 |
wavs = []
|
392 |
trim = int(trim / speed)
|
393 |
pad_between = int(pad_between / speed)
|
@@ -438,7 +463,7 @@ with gr.Blocks() as lf_tts:
|
|
438 |
text = gr.Textbox(label='Input Text', info='Generate speech in batches of 100 text segments and automatically join them together')
|
439 |
file_input.upload(fn=extract_text, inputs=[file_input], outputs=[text])
|
440 |
with gr.Row():
|
441 |
-
voice = gr.Dropdown(list(CHOICES.items()), value='af', label='Voice', info='Starred voices are more stable')
|
442 |
use_gpu = gr.Dropdown(
|
443 |
[('ZeroGPU 🚀', True), ('CPU 🐌', False)],
|
444 |
value=CUDA_AVAILABLE,
|
@@ -515,20 +540,26 @@ Random Japanese texts: CC0 public domain from [Common Voice](https://github.com/
|
|
515 |
|
516 |
with gr.Blocks() as changelog:
|
517 |
gr.Markdown("""
|
518 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
519 |
🔀 Hardware switching between CPU and GPU<br/>
|
520 |
🗣️ Restored old voices, back up to 32 total
|
521 |
|
522 |
-
|
523 |
🚀 Model v0.19<br/>
|
524 |
🧪 Validation losses: 0.261 mel, 0.627 dur, 1.897 f0<br/>
|
525 |
📄 https://hf.co/blog/hexgrad/kokoro-short-burst-upgrade
|
526 |
|
527 |
-
|
528 |
🚀 Model v0.16<br/>
|
529 |
🧪 Validation losses: 0.263 mel, 0.646 dur, 1.934 f0
|
530 |
|
531 |
-
|
532 |
🚀 Model v0.14<br/>
|
533 |
🧪 Validation losses: 0.262 mel, 0.642 dur, 1.889 f0
|
534 |
""")
|
|
|
100 |
j=Katsu(),
|
101 |
)
|
102 |
|
103 |
+
def resolve_voices(voice, warn=True):
    """Parse a voice spec string into a list of known voice names.

    Separators '+', ',', and ' ' all split the spec ('/' is mapped to '_'),
    and everything is lowercased. Unknown names are dropped (optionally with
    a gr.Warning); when nothing valid remains, fall back to ['af'].
    """
    if not isinstance(voice, str):
        return ['af']
    normalized = voice.lower().replace('/', '_').replace(' ', '+').replace(',', '+')
    requested = normalized.split('+')
    known = VOICES['cpu']
    if warn:
        unknown = {name for name in requested if name and name not in known}
        if unknown:
            plural = 's' if len(unknown) > 1 else ''
            gr.Warning(f"Unknown voice{plural}: {','.join(unknown)}")
    valid = [name for name in requested if name in known]
    return valid or ['af']
|
113 |
+
|
114 |
def phonemize(text, voice, norm=True):
|
115 |
+
lang = resolve_voices(voice)[0][0]
|
116 |
if norm:
|
117 |
text = normalize(text)
|
118 |
ps = phonemizers[lang].phonemize([text])
|
|
|
193 |
SAMPLE_RATE = 24000
|
194 |
|
195 |
@torch.no_grad()
|
196 |
+
def forward(tokens, voices, speed, device='cpu'):
|
197 |
+
ref_s = torch.mean(torch.stack([VOICES[device][v][len(tokens)] for v in voices]), dim=0)
|
198 |
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
|
199 |
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
|
200 |
text_mask = length_to_mask(input_lengths).to(device)
|
|
|
218 |
return models[device].decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
|
219 |
|
220 |
@spaces.GPU(duration=10)
|
221 |
+
def forward_gpu(tokens, voices, speed):
    """CUDA entry point: delegate synthesis to forward() on the GPU device."""
    return forward(tokens, voices, speed, device='cuda')
|
223 |
|
224 |
def clamp_speed(speed):
|
225 |
if not isinstance(speed, float) and not isinstance(speed, int):
|
|
|
232 |
|
233 |
# Must be backwards compatible with https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
|
234 |
def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto'):
|
235 |
+
voices = resolve_voices(voice, warn=ps)
|
236 |
ps = ps or phonemize(text, voice)
|
237 |
speed = clamp_speed(speed)
|
238 |
trim = trim if isinstance(trim, int) else 3000
|
|
|
246 |
use_gpu = len(ps) > 99 if use_gpu == 'auto' else use_gpu
|
247 |
try:
|
248 |
if use_gpu:
|
249 |
+
out = forward_gpu(tokens, voices, speed)
|
250 |
else:
|
251 |
+
out = forward(tokens, voices, speed)
|
252 |
except gr.exceptions.Error as e:
|
253 |
if use_gpu:
|
254 |
gr.Warning(str(e))
|
255 |
gr.Info('GPU failover to CPU')
|
256 |
+
out = forward(tokens, voices, speed)
|
257 |
else:
|
258 |
raise gr.Error(e)
|
259 |
return (None, '')
|
|
|
276 |
def change_use_gpu(value):
    """Rebuild the Hardware dropdown so its info text matches the new selection."""
    return gr.Dropdown(
        USE_GPU_CHOICES,
        value=value,
        label='Hardware',
        info=USE_GPU_INFOS[value],
        interactive=CUDA_AVAILABLE,
    )
|
278 |
|
279 |
+
def update_voice(voice, btn):
    """Voice-mixer button handler.

    When the clicked button shares the current mix's two-character
    language/gender prefix, append it as another '+'-joined component;
    otherwise restart the mix with just the clicked voice.
    """
    if voice.startswith(btn[:2]):
        return f'{voice}+{btn}'
    return btn
|
281 |
+
|
282 |
with gr.Blocks() as basic_tts:
|
283 |
with gr.Row():
|
284 |
with gr.Column():
|
285 |
text = gr.Textbox(label='Input Text', info='Generate speech for one segment of text using Kokoro, a TTS model with 80 million parameters')
|
286 |
with gr.Row():
|
287 |
+
voice = gr.Dropdown(list(CHOICES.items()), value='af', allow_custom_value=True, label='Voice', info='Starred voices are more stable')
|
288 |
use_gpu = gr.Dropdown(
|
289 |
USE_GPU_CHOICES,
|
290 |
value='auto' if CUDA_AVAILABLE else False,
|
|
|
312 |
trim = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='✂️ Trim', info='Cut from both ends')
|
313 |
with gr.Accordion('Output Tokens', open=True):
|
314 |
out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
|
315 |
+
with gr.Accordion('Voice Mixer', open=False):
|
316 |
+
gr.Markdown('Create a custom voice by mixing and matching other voices. Click an orange button to add one part to your mix, or click a gray button to start over. Free text input also allowed.')
|
317 |
+
for i in range(8):
|
318 |
+
with gr.Row():
|
319 |
+
for j in range(4):
|
320 |
+
with gr.Column():
|
321 |
+
btn = gr.Button(list(CHOICES.values())[i*4+j], variant='primary' if i*4+j < 10 else 'secondary')
|
322 |
+
btn.click(update_voice, inputs=[voice, btn], outputs=[voice])
|
323 |
+
voice.change(lambda v, b: gr.Button(b, variant='primary' if v.startswith(b[:2]) else 'secondary'), inputs=[voice, btn], outputs=[btn])
|
324 |
text.submit(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
|
325 |
generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
|
326 |
|
327 |
@torch.no_grad()
|
328 |
+
def lf_forward(token_lists, voices, speed, device='cpu'):
|
329 |
+
voicepack = torch.mean(torch.stack([VOICES[device][v] for v in voices]), dim=0)
|
330 |
outs = []
|
331 |
for tokens in token_lists:
|
332 |
ref_s = voicepack[len(tokens)]
|
|
|
354 |
return outs
|
355 |
|
356 |
@spaces.GPU
|
357 |
+
def lf_forward_gpu(token_lists, voices, speed):
    """CUDA entry point: delegate batched long-form synthesis to lf_forward()."""
    return lf_forward(token_lists, voices, speed, device='cuda')
|
359 |
|
360 |
def resplit_strings(arr):
|
361 |
# Handle edge cases
|
|
|
411 |
|
412 |
def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True):
|
413 |
token_lists = list(map(tokenize, segments['Tokens']))
|
414 |
+
voices = resolve_voices(voice)
|
415 |
+
speed = clamp_speed(speed)
|
416 |
wavs = []
|
417 |
trim = int(trim / speed)
|
418 |
pad_between = int(pad_between / speed)
|
|
|
463 |
text = gr.Textbox(label='Input Text', info='Generate speech in batches of 100 text segments and automatically join them together')
|
464 |
file_input.upload(fn=extract_text, inputs=[file_input], outputs=[text])
|
465 |
with gr.Row():
|
466 |
+
voice = gr.Dropdown(list(CHOICES.items()), value='af', allow_custom_value=True, label='Voice', info='Starred voices are more stable')
|
467 |
use_gpu = gr.Dropdown(
|
468 |
[('ZeroGPU 🚀', True), ('CPU 🐌', False)],
|
469 |
value=CUDA_AVAILABLE,
|
|
|
540 |
|
541 |
with gr.Blocks() as changelog:
|
542 |
gr.Markdown("""
|
543 |
+
**25 Nov 2024**<br/>
|
544 |
+
🎨 Voice Mixer added
|
545 |
+
|
546 |
+
**24 Nov 2024**<br/>
|
547 |
+
🛑 Model training halted, v0.19 is the current stable version
|
548 |
+
|
549 |
+
**23 Nov 2024**<br/>
|
550 |
🔀 Hardware switching between CPU and GPU<br/>
|
551 |
🗣️ Restored old voices, back up to 32 total
|
552 |
|
553 |
+
**22 Nov 2024**<br/>
|
554 |
🚀 Model v0.19<br/>
|
555 |
🧪 Validation losses: 0.261 mel, 0.627 dur, 1.897 f0<br/>
|
556 |
📄 https://hf.co/blog/hexgrad/kokoro-short-burst-upgrade
|
557 |
|
558 |
+
**15 Nov 2024**<br/>
|
559 |
🚀 Model v0.16<br/>
|
560 |
🧪 Validation losses: 0.263 mel, 0.646 dur, 1.934 f0
|
561 |
|
562 |
+
**12 Nov 2024**<br/>
|
563 |
🚀 Model v0.14<br/>
|
564 |
🧪 Validation losses: 0.262 mel, 0.642 dur, 1.889 f0
|
565 |
""")
|