hexgrad committed on
Commit
f5504a3
·
verified ·
1 Parent(s): 2986a1b

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -19
app.py CHANGED
@@ -100,8 +100,19 @@ phonemizers = dict(
100
  j=Katsu(),
101
  )
102
 
 
 
 
 
 
 
 
 
 
 
 
103
  def phonemize(text, voice, norm=True):
104
- lang = voice[0]
105
  if norm:
106
  text = normalize(text)
107
  ps = phonemizers[lang].phonemize([text])
@@ -182,8 +193,8 @@ VOICES = {device: {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt')
182
  SAMPLE_RATE = 24000
183
 
184
  @torch.no_grad()
185
- def forward(tokens, voice, speed, device='cpu'):
186
- ref_s = VOICES[device][voice][len(tokens)]
187
  tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
188
  input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
189
  text_mask = length_to_mask(input_lengths).to(device)
@@ -207,8 +218,8 @@ def forward(tokens, voice, speed, device='cpu'):
207
  return models[device].decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
208
 
209
  @spaces.GPU(duration=10)
210
- def forward_gpu(tokens, voice, speed):
211
- return forward(tokens, voice, speed, device='cuda')
212
 
213
  def clamp_speed(speed):
214
  if not isinstance(speed, float) and not isinstance(speed, int):
@@ -221,7 +232,7 @@ def clamp_speed(speed):
221
 
222
  # Must be backwards compatible with https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
223
  def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto'):
224
- voice = voice if voice in VOICES['cpu'] else 'af'
225
  ps = ps or phonemize(text, voice)
226
  speed = clamp_speed(speed)
227
  trim = trim if isinstance(trim, int) else 3000
@@ -235,14 +246,14 @@ def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto'):
235
  use_gpu = len(ps) > 99 if use_gpu == 'auto' else use_gpu
236
  try:
237
  if use_gpu:
238
- out = forward_gpu(tokens, voice, speed)
239
  else:
240
- out = forward(tokens, voice, speed)
241
  except gr.exceptions.Error as e:
242
  if use_gpu:
243
  gr.Warning(str(e))
244
  gr.Info('GPU failover to CPU')
245
- out = forward(tokens, voice, speed)
246
  else:
247
  raise gr.Error(e)
248
  return (None, '')
@@ -265,12 +276,15 @@ USE_GPU_INFOS = {
265
  def change_use_gpu(value):
266
  return gr.Dropdown(USE_GPU_CHOICES, value=value, label='Hardware', info=USE_GPU_INFOS[value], interactive=CUDA_AVAILABLE)
267
 
 
 
 
268
  with gr.Blocks() as basic_tts:
269
  with gr.Row():
270
  with gr.Column():
271
  text = gr.Textbox(label='Input Text', info='Generate speech for one segment of text using Kokoro, a TTS model with 80 million parameters')
272
  with gr.Row():
273
- voice = gr.Dropdown(list(CHOICES.items()), value='af', label='Voice', info='Starred voices are more stable')
274
  use_gpu = gr.Dropdown(
275
  USE_GPU_CHOICES,
276
  value='auto' if CUDA_AVAILABLE else False,
@@ -298,12 +312,21 @@ with gr.Blocks() as basic_tts:
298
  trim = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='✂️ Trim', info='Cut from both ends')
299
  with gr.Accordion('Output Tokens', open=True):
300
  out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
 
 
 
 
 
 
 
 
 
301
  text.submit(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
302
  generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
303
 
304
  @torch.no_grad()
305
- def lf_forward(token_lists, voice, speed, device='cpu'):
306
- voicepack = VOICES[device][voice]
307
  outs = []
308
  for tokens in token_lists:
309
  ref_s = voicepack[len(tokens)]
@@ -331,8 +354,8 @@ def lf_forward(token_lists, voice, speed, device='cpu'):
331
  return outs
332
 
333
  @spaces.GPU
334
- def lf_forward_gpu(token_lists, voice, speed):
335
- return lf_forward(token_lists, voice, speed, device='cuda')
336
 
337
  def resplit_strings(arr):
338
  # Handle edge cases
@@ -388,6 +411,8 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
388
 
389
  def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True):
390
  token_lists = list(map(tokenize, segments['Tokens']))
 
 
391
  wavs = []
392
  trim = int(trim / speed)
393
  pad_between = int(pad_between / speed)
@@ -438,7 +463,7 @@ with gr.Blocks() as lf_tts:
438
  text = gr.Textbox(label='Input Text', info='Generate speech in batches of 100 text segments and automatically join them together')
439
  file_input.upload(fn=extract_text, inputs=[file_input], outputs=[text])
440
  with gr.Row():
441
- voice = gr.Dropdown(list(CHOICES.items()), value='af', label='Voice', info='Starred voices are more stable')
442
  use_gpu = gr.Dropdown(
443
  [('ZeroGPU 🚀', True), ('CPU 🐌', False)],
444
  value=CUDA_AVAILABLE,
@@ -515,20 +540,26 @@ Random Japanese texts: CC0 public domain from [Common Voice](https://github.com/
515
 
516
  with gr.Blocks() as changelog:
517
  gr.Markdown("""
518
- ### 23 Nov 2024
 
 
 
 
 
 
519
  🔀 Hardware switching between CPU and GPU<br/>
520
  🗣️ Restored old voices, back up to 32 total
521
 
522
- ### 22 Nov 2024
523
  🚀 Model v0.19<br/>
524
  🧪 Validation losses: 0.261 mel, 0.627 dur, 1.897 f0<br/>
525
  📄 https://hf.co/blog/hexgrad/kokoro-short-burst-upgrade
526
 
527
- ### 15 Nov 2024
528
  🚀 Model v0.16<br/>
529
  🧪 Validation losses: 0.263 mel, 0.646 dur, 1.934 f0
530
 
531
- ### 12 Nov 2024
532
  🚀 Model v0.14<br/>
533
  🧪 Validation losses: 0.262 mel, 0.642 dur, 1.889 f0
534
  """)
 
100
  j=Katsu(),
101
  )
102
 
103
def resolve_voices(voice, warn=True):
    """Parse a voice selection string into a list of known voice names.

    Accepts mixed-voice input where '+', ',' or ' ' separate names and '/'
    is normalized to '_'. Names not present in VOICES['cpu'] are dropped
    (optionally surfacing a gr.Warning), and ['af'] is the fallback when
    nothing valid remains or the input is not a string.
    """
    if not isinstance(voice, str):
        return ['af']
    normalized = voice.lower().replace('/', '_').replace(' ', '+').replace(',', '+')
    requested = normalized.split('+')
    if warn:
        unknown = {name for name in requested if name and name not in VOICES['cpu']}
        if unknown:
            plural = 's' if len(unknown) > 1 else ''
            gr.Warning(f"Unknown voice{plural}: {','.join(unknown)}")
    known = [name for name in requested if name in VOICES['cpu']]
    return known if known else ['af']
113
+
114
  def phonemize(text, voice, norm=True):
115
+ lang = resolve_voices(voice)[0][0]
116
  if norm:
117
  text = normalize(text)
118
  ps = phonemizers[lang].phonemize([text])
 
193
  SAMPLE_RATE = 24000
194
 
195
  @torch.no_grad()
196
+ def forward(tokens, voices, speed, device='cpu'):
197
+ ref_s = torch.mean(torch.stack([VOICES[device][v][len(tokens)] for v in voices]), dim=0)
198
  tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
199
  input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
200
  text_mask = length_to_mask(input_lengths).to(device)
 
218
  return models[device].decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
219
 
220
@spaces.GPU(duration=10)
def forward_gpu(tokens, voices, speed):
    # Thin GPU wrapper: identical synthesis to forward(), dispatched to CUDA.
    # duration=10 presumably caps the ZeroGPU allocation at 10s — confirm
    # against the spaces.GPU API.
    return forward(tokens, voices, speed, device='cuda')
223
 
224
  def clamp_speed(speed):
225
  if not isinstance(speed, float) and not isinstance(speed, int):
 
232
 
233
  # Must be backwards compatible with https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
234
  def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto'):
235
+ voices = resolve_voices(voice, warn=ps)
236
  ps = ps or phonemize(text, voice)
237
  speed = clamp_speed(speed)
238
  trim = trim if isinstance(trim, int) else 3000
 
246
  use_gpu = len(ps) > 99 if use_gpu == 'auto' else use_gpu
247
  try:
248
  if use_gpu:
249
+ out = forward_gpu(tokens, voices, speed)
250
  else:
251
+ out = forward(tokens, voices, speed)
252
  except gr.exceptions.Error as e:
253
  if use_gpu:
254
  gr.Warning(str(e))
255
  gr.Info('GPU failover to CPU')
256
+ out = forward(tokens, voices, speed)
257
  else:
258
  raise gr.Error(e)
259
  return (None, '')
 
276
def change_use_gpu(value):
    """Rebuild the Hardware dropdown so its info text tracks the chosen value.

    The control stays non-interactive when CUDA is unavailable, so a
    CPU-only host cannot switch to GPU.
    """
    hardware_info = USE_GPU_INFOS[value]
    return gr.Dropdown(
        USE_GPU_CHOICES,
        value=value,
        label='Hardware',
        info=hardware_info,
        interactive=CUDA_AVAILABLE,
    )
278
 
279
def update_voice(voice, btn):
    """Return the new Voice value after a mixer button click.

    When the current value starts with the first two characters of the
    clicked button's label, the button is appended with '+' to extend the
    mix; otherwise the mix restarts from the clicked button alone.
    """
    extends_current_mix = voice.startswith(btn[:2])
    if extends_current_mix:
        return f'{voice}+{btn}'
    return btn
281
+
282
  with gr.Blocks() as basic_tts:
283
  with gr.Row():
284
  with gr.Column():
285
  text = gr.Textbox(label='Input Text', info='Generate speech for one segment of text using Kokoro, a TTS model with 80 million parameters')
286
  with gr.Row():
287
+ voice = gr.Dropdown(list(CHOICES.items()), value='af', allow_custom_value=True, label='Voice', info='Starred voices are more stable')
288
  use_gpu = gr.Dropdown(
289
  USE_GPU_CHOICES,
290
  value='auto' if CUDA_AVAILABLE else False,
 
312
  trim = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='✂️ Trim', info='Cut from both ends')
313
  with gr.Accordion('Output Tokens', open=True):
314
  out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
315
+ with gr.Accordion('Voice Mixer', open=False):
316
+ gr.Markdown('Create a custom voice by mixing and matching other voices. Click an orange button to add one part to your mix, or click a gray button to start over. Free text input also allowed.')
317
+ for i in range(8):
318
+ with gr.Row():
319
+ for j in range(4):
320
+ with gr.Column():
321
+ btn = gr.Button(list(CHOICES.values())[i*4+j], variant='primary' if i*4+j < 10 else 'secondary')
322
+ btn.click(update_voice, inputs=[voice, btn], outputs=[voice])
323
+ voice.change(lambda v, b: gr.Button(b, variant='primary' if v.startswith(b[:2]) else 'secondary'), inputs=[voice, btn], outputs=[btn])
324
  text.submit(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
325
  generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
326
 
327
  @torch.no_grad()
328
+ def lf_forward(token_lists, voices, speed, device='cpu'):
329
+ voicepack = torch.mean(torch.stack([VOICES[device][v] for v in voices]), dim=0)
330
  outs = []
331
  for tokens in token_lists:
332
  ref_s = voicepack[len(tokens)]
 
354
  return outs
355
 
356
  @spaces.GPU
357
+ def lf_forward_gpu(token_lists, voices, speed):
358
+ return lf_forward(token_lists, voices, speed, device='cuda')
359
 
360
  def resplit_strings(arr):
361
  # Handle edge cases
 
411
 
412
  def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True):
413
  token_lists = list(map(tokenize, segments['Tokens']))
414
+ voices = resolve_voices(voice)
415
+ speed = clamp_speed(speed)
416
  wavs = []
417
  trim = int(trim / speed)
418
  pad_between = int(pad_between / speed)
 
463
  text = gr.Textbox(label='Input Text', info='Generate speech in batches of 100 text segments and automatically join them together')
464
  file_input.upload(fn=extract_text, inputs=[file_input], outputs=[text])
465
  with gr.Row():
466
+ voice = gr.Dropdown(list(CHOICES.items()), value='af', allow_custom_value=True, label='Voice', info='Starred voices are more stable')
467
  use_gpu = gr.Dropdown(
468
  [('ZeroGPU 🚀', True), ('CPU 🐌', False)],
469
  value=CUDA_AVAILABLE,
 
540
 
541
  with gr.Blocks() as changelog:
542
  gr.Markdown("""
543
+ **25 Nov 2024**<br/>
544
+ 🎨 Voice Mixer added
545
+
546
+ **24 Nov 2024**<br/>
547
+ 🛑 Model training halted, v0.19 is the current stable version
548
+
549
+ **23 Nov 2024**<br/>
550
  🔀 Hardware switching between CPU and GPU<br/>
551
  🗣️ Restored old voices, back up to 32 total
552
 
553
+ **22 Nov 2024**<br/>
554
  🚀 Model v0.19<br/>
555
  🧪 Validation losses: 0.261 mel, 0.627 dur, 1.897 f0<br/>
556
  📄 https://hf.co/blog/hexgrad/kokoro-short-burst-upgrade
557
 
558
+ **15 Nov 2024**<br/>
559
  🚀 Model v0.16<br/>
560
  🧪 Validation losses: 0.263 mel, 0.646 dur, 1.934 f0
561
 
562
+ **12 Nov 2024**<br/>
563
  🚀 Model v0.14<br/>
564
  🧪 Validation losses: 0.262 mel, 0.642 dur, 1.889 f0
565
  """)