hexgrad committed on
Commit bbacd9b · verified · 1 Parent(s): 0b561c5

Upload app.py

Files changed (1)
  1. app.py +26 -71
app.py CHANGED
@@ -156,16 +156,6 @@ CHOICES = {
 }
 VOICES = {device: {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt'), weights_only=True).to(device) for k in CHOICES.values()} for device in models}
 
-np_log_99 = np.log(99)
-def s_curve(p):
-    if p <= 0:
-        return 0
-    elif p >= 1:
-        return 1
-    s = 1 / (1 + np.exp((1-p*2)*np_log_99))
-    s = (s-0.01) * 50/49
-    return s
-
 SAMPLE_RATE = 24000
 
 @torch.no_grad()
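For reference, the deleted s_curve helper is a logistic ease that the old fade-in/fade-out loops (removed further down) used as an envelope; the (s-0.01) * 50/49 rescale stretches the clamped logistic so it passes exactly through (0, 0) and (1, 1). A minimal standalone sketch restating the removed helper with its endpoint checks, for illustration only:

import numpy as np

np_log_99 = np.log(99)

def s_curve(p):
    # Logistic ease: roughly 0.01 near p=0 and 0.99 near p=1 before rescaling.
    if p <= 0:
        return 0
    elif p >= 1:
        return 1
    s = 1 / (1 + np.exp((1 - p * 2) * np_log_99))
    s = (s - 0.01) * 50 / 49
    return s

# The rescale makes the endpoints and midpoint land where a fade envelope needs them.
assert s_curve(0) == 0 and s_curve(1) == 1
assert abs(s_curve(0.5) - 0.5) < 1e-12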
@@ -198,10 +188,10 @@ def forward_gpu(tokens, voice, speed):
     return forward(tokens, voice, speed, device='cuda')
 
 # Must be backwards compatible with https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
-def generate(text, voice, ps, speed, _reduce_noise, opening_cut, closing_cut, ease_in, ease_out, _pad_before, _pad_after, use_gpu):
-    return _generate(text, voice, ps, speed, opening_cut, closing_cut, ease_in, ease_out, use_gpu)
+def generate(text, voice, ps, speed, _reduce_noise, trim, _closing_cut, _ease_in, _ease_out, _pad_before, _pad_after, use_gpu):
+    return _generate(text, voice, ps, speed, trim, use_gpu)
 
-def _generate(text, voice, ps, speed, opening_cut, closing_cut, ease_in, ease_out, use_gpu):
+def _generate(text, voice, ps, speed, trim, use_gpu):
     if voice not in VOICES['cpu']:
         voice = 'af'
     ps = ps or phonemize(text, voice)
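The exported generate keeps the old 12-parameter signature for the linked Arena space: parameters the simplified pipeline no longer uses stay in position, renamed with a leading underscore, and are dropped before calling _generate. A toy sketch of that compatibility-shim pattern, with hypothetical names not taken from app.py:

def _render(text, speed, trim):
    # Stand-in for the real worker; only the arguments it still needs.
    return f'{text!r} @ speed={speed}, trim={trim}'

def render(text, speed, _reduce_noise, trim, _ease_in, _ease_out):
    # Old callers keep passing six positional arguments; the unused ones are ignored.
    return _render(text, speed, trim)

assert render('hi', 1.0, False, 4000, 3000, 1000) == _render('hi', 1.0, 4000)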
@@ -219,18 +209,11 @@ def _generate(text, voice, ps, speed, opening_cut, closing_cut, ease_in, ease_out, use_gpu):
     except gr.exceptions.Error as e:
         raise gr.Error(e)
         return (None, '')
-    opening_cut = int(opening_cut / speed)
-    if opening_cut > 0:
-        out = out[opening_cut:]
-    closing_cut = int(closing_cut / speed)
-    if closing_cut > 0:
-        out = out[:-closing_cut]
-    ease_in = min(int(ease_in / speed), len(out)//2)
-    for i in range(ease_in):
-        out[i] *= s_curve(i / ease_in)
-    ease_out = min(int(ease_out / speed), len(out)//2)
-    for i in range(ease_out):
-        out[-i-1] *= s_curve(i / ease_out)
+    trim = int(trim / speed)
+    if trim > 0:
+        if trim * 2 >= len(out):
+            return (None, '')
+        out = out[trim:-trim]
     return ((SAMPLE_RATE, out), ps)
 
 def toggle_autoplay(autoplay):
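The per-end cuts and eased fades collapse into one symmetric, speed-scaled trim, and a clip that trimming would consume entirely is now rejected rather than faded. A standalone sketch of the new post-processing; trim_clip is an illustrative name, not a function in app.py:

import numpy as np

SAMPLE_RATE = 24000

def trim_clip(out, trim, speed):
    """Mirror of the new logic: trim is given in samples at 1x speed,
    rescaled by the speaking speed, then cut from both ends.
    Returns None when the trim would consume the whole clip."""
    trim = int(trim / speed)
    if trim > 0:
        if trim * 2 >= len(out):
            return None
        out = out[trim:-trim]
    return out

clip = np.random.randn(SAMPLE_RATE)          # one second of fake audio
assert trim_clip(clip, 4000, 1.0).shape == (SAMPLE_RATE - 8000,)
assert trim_clip(clip, 24000, 2.0) is None   # 12000 samples per side >= half the clip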
@@ -271,25 +254,15 @@ with gr.Blocks() as basic_tts:
             phonemize_btn.click(phonemize, inputs=[text, voice], outputs=[in_ps])
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
-            autoplay = gr.Checkbox(value=True, label='Autoplay')
-            autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+            with gr.Accordion('Audio Settings', open=False):
+                autoplay = gr.Checkbox(value=True, label='Autoplay')
+                autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+                speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
+                trim = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='✂️ Trim', info='Cut from both ends')
             with gr.Accordion('Output Tokens', open=True):
                 out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
-            with gr.Accordion('Audio Settings', open=False):
-                with gr.Row():
-                    speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speed of the audio; the settings below are auto-scaled by speed')
-                with gr.Row():
-                    with gr.Column():
-                        opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='✂️ Opening Cut', info='Cut samples from the start')
-                    with gr.Column():
-                        closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='🎬 Closing Cut', info='Cut samples from the end')
-                with gr.Row():
-                    with gr.Column():
-                        ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='🎢 Ease In', info='Ease in samples, after opening cut')
-                    with gr.Column():
-                        ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='🛝 Ease Out', info='Ease out samples, before closing cut')
-    text.submit(_generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, use_gpu], outputs=[audio, out_ps])
-    generate_btn.click(_generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, use_gpu], outputs=[audio, out_ps])
+    text.submit(_generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
+    generate_btn.click(_generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
 
 @torch.no_grad()
 def lf_forward(token_lists, voice, speed, device='cpu'):
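Both basic_tts event handlers now feed the shortened component list straight into _generate. The same Blocks wiring pattern in a self-contained toy, with all names hypothetical:

import gradio as gr

def shout(text, times):
    return text.upper() * int(times)

with gr.Blocks() as demo:
    text = gr.Textbox(label='Input Text')
    times = gr.Slider(minimum=1, maximum=5, value=1, step=1, label='Repeat')
    btn = gr.Button('Generate', variant='primary')
    out = gr.Textbox(label='Output', interactive=False)
    # Component values are passed positionally to the function, in inputs order.
    text.submit(shout, inputs=[text, times], outputs=[out])
    btn.click(shout, inputs=[text, times], outputs=[out])

if __name__ == '__main__':
    demo.launch()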
@@ -376,11 +349,10 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
     segments = [row for t in texts for row in recursive_split(t, voice)]
     return [(i, *row) for i, row in enumerate(segments)]
 
-def lf_generate(segments, voice, speed, opening_cut, closing_cut, ease_in, ease_out, pad_between, use_gpu):
+def lf_generate(segments, voice, speed, trim, pad_between, use_gpu):
     token_lists = list(map(tokenize, segments['Tokens']))
     wavs = []
-    opening_cut = int(opening_cut / speed)
-    closing_cut = int(closing_cut / speed)
+    trim = int(trim / speed)
     pad_between = int(pad_between / speed)
     batch_size = 100
     for i in range(0, len(token_lists), batch_size):
@@ -396,16 +368,10 @@ def lf_generate(segments, voice, speed, opening_cut, closing_cut, ease_in, ease_out, pad_between, use_gpu):
             raise gr.Error(e)
             break
         for out in outs:
-            if opening_cut > 0:
-                out = out[opening_cut:]
-            if closing_cut > 0:
-                out = out[:-closing_cut]
-            ease_in = min(int(ease_in / speed), len(out)//2)
-            for i in range(ease_in):
-                out[i] *= s_curve(i / ease_in)
-            ease_out = min(int(ease_out / speed), len(out)//2)
-            for i in range(ease_out):
-                out[-i-1] *= s_curve(i / ease_out)
+            if trim > 0:
+                if trim * 2 >= len(out):
+                    continue
+                out = out[trim:-trim]
             if wavs and pad_between > 0:
                 wavs.append(np.zeros(pad_between))
             wavs.append(out)
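The long-form path applies the same trim per segment, skipping segments that would be trimmed away, then still collects the clips into wavs with optional silence between them; the final concatenation happens outside this hunk. A sketch of that collection step, with an illustrative helper name not present in app.py:

import numpy as np

def join_segments(clips, pad_between):
    """Concatenate per-segment audio, inserting pad_between samples of
    silence before every clip except the first, as lf_generate does."""
    wavs = []
    for out in clips:
        if wavs and pad_between > 0:
            wavs.append(np.zeros(pad_between))
        wavs.append(out)
    return np.concatenate(wavs) if wavs else None

a, b = np.ones(100), np.ones(200)
assert len(join_segments([a, b], pad_between=50)) == 350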
@@ -451,26 +417,15 @@ with gr.Blocks() as lf_tts:
             generate_btn = gr.Button('Generate x0', variant='secondary', interactive=False)
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio')
-            with gr.Accordion('Audio Settings', open=False):
-                with gr.Row():
-                    speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speed of the audio; the settings below are auto-scaled by speed')
-                with gr.Row():
-                    with gr.Column():
-                        opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='✂️ Opening Cut', info='Cut samples from the start')
-                    with gr.Column():
-                        closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='🎬 Closing Cut', info='Cut samples from the end')
-                with gr.Row():
-                    with gr.Column():
-                        ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='🎢 Ease In', info='Ease in samples, after opening cut')
-                    with gr.Column():
-                        ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='🛝 Ease Out', info='Ease out samples, before closing cut')
-                with gr.Row():
-                    pad_between = gr.Slider(minimum=0, maximum=24000, value=10000, step=1000, label='🔇 Pad Between', info='How many samples of silence to insert between segments')
+            with gr.Accordion('Audio Settings', open=True):
+                speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
+                trim = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='✂️ Trim', info='Cut from both ends')
+                pad_between = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='🔇 Pad Between', info='How much silence to insert between segments')
     with gr.Row():
         segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
     segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
     segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
-    generate_btn.click(lf_generate, inputs=[segments, voice, speed, opening_cut, closing_cut, ease_in, ease_out, pad_between, use_gpu], outputs=[audio])
+    generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu], outputs=[audio])
 
 with gr.Blocks() as about:
     gr.Markdown("""
 