inference.py (forked from cyhuang-tw/AdaIN-VC)
import soundfile as sf
import torch
import torchaudio

from data import Wav2Mel

# Default paths to the pretrained TorchScript checkpoints.
PRETRAINED_VC_MODEL_PATH = "pretrained/vc_model.pt"
PRETRAINED_VOCODER_PATH = "pretrained/vocoder.pt"


def convert_voice(src, tgt, model, vocoder):
    """Convert the source mel spectrogram to the target speaker's voice."""
    with torch.no_grad():
        # The model returns the converted mel plus auxiliary outputs we discard.
        cvt, _, _ = model.convert(src, tgt)
        # Drop the batch dim and transpose to (time, n_mels) for the vocoder.
        wav = vocoder.generate([cvt.squeeze(0).data.T])
    return wav


def main(
    source: str,
    target: str,
    output: str,
    model_path: str = PRETRAINED_VC_MODEL_PATH,
    vocoder_path: str = PRETRAINED_VOCODER_PATH,
):
    """
    Perform one-shot voice conversion.

    Args:
        source: The utterance providing the linguistic content.
        target: The utterance providing the target speaker's timbre.
        output: The path to save the converted utterance.
        model_path: The path of the model file.
        vocoder_path: The path of the vocoder file.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Both the conversion model and the vocoder are TorchScript archives.
    model = torch.jit.load(model_path, map_location=device)
    vocoder = torch.jit.load(vocoder_path, map_location=device)
    wav2mel = Wav2Mel()

    src, src_sr = torchaudio.load(source)
    tgt, tgt_sr = torchaudio.load(target)

    with torch.no_grad():
        # Compute mel spectrograms and add a batch dimension.
        src = wav2mel(src, src_sr)[None, :].to(device)
        tgt = wav2mel(tgt, tgt_sr)[None, :].to(device)
        wav = convert_voice(src, tgt, model, vocoder)

    # The vocoder returns a batch of waveforms; take the first one.
    wav = wav[0].data.cpu().numpy()
    sf.write(output, wav, wav2mel.sample_rate)
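

# A minimal, assumed CLI entry point so the script can be run directly; the
# snippet above ends at sf.write, and the original file may wire this up
# differently. argparse and the flag names below are illustrative only.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="One-shot voice conversion.")
    parser.add_argument("source", help="wav providing the linguistic content")
    parser.add_argument("target", help="wav providing the target speaker timbre")
    parser.add_argument("output", help="path to save the converted wav")
    parser.add_argument("--model-path", default=PRETRAINED_VC_MODEL_PATH)
    parser.add_argument("--vocoder-path", default=PRETRAINED_VOCODER_PATH)
    args = parser.parse_args()
    main(args.source, args.target, args.output, args.model_path, args.vocoder_path)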