Running Tacotron2 Experiments on Huawei ModelArts and Deploying DiffSinger for Online Training and Inference
[Abstract] Name: Xi Yuchen  Student ID: 1120202444  This report records all of the tasks completed for the final course project. It organizes the background knowledge needed to finish them and the material collected during the experiments, documents in detail my improvements to and refactoring of several model/project code bases, and lists the errors encountered along the way together with their fixes. The report consists of two parts, which appear in order in the following sections: two TTS experiments related to Huawei's architecture and models, the first of which uses Huawei's machine-learning platform and framework...
OBS service
Ascend
MindSpore 1.7.0
Tacotron2
TTS Frontend
Acoustic model
Vocoder
Tacotron
Wavenet
Griffin-Lim
CBHG
End-to-end
.yaml
tacotron2/
├── eval.py                          // evaluation entry
├── generate_hdf5.py                 // generate an hdf5 file from the dataset
├── ljspeech_config.yaml
├── model_utils
│   ├── config.py                    // parse parameters
│   ├── device_adapter.py            // device adapter for ModelArts
│   ├── __init__.py                  // init file
│   ├── local_adapter.py             // local adapter
│   └── moxing_adapter.py            // Moxing adapter for ModelArts
├── README.md                        // description of Tacotron2
├── requirements.txt                 // required packages
├── scripts
│   ├── run_distribute_train.sh      // launch distributed training
│   ├── run_eval.sh                  // launch evaluation
│   └── run_standalone_train.sh      // launch standalone training
├── src
│   ├── callback.py                  // callbacks to monitor training
│   ├── dataset.py                   // define dataset and sampler
│   ├── hparams.py                   // Tacotron2 configuration
│   ├── rnn_cells.py                 // rnn cell implementations
│   ├── rnns.py                      // lstm implementation with length masks
│   ├── tacotron2.py                 // Tacotron2 network
│   ├── text
│   │   ├── cleaners.py              // clean text sequences
│   │   ├── cmudict.py               // define cmudict
│   │   ├── __init__.py              // process text sequences
│   │   ├── numbers.py               // normalize numbers
│   │   └── symbols.py               // symbols for encoding
│   └── utils
│       ├── audio.py                 // extract audio features
│       └── convert.py               // normalize mel spectrograms by mean/variance
└── train.py                         // training entry
https://gitee.com/mindspore/models/tree/master/official/audio/Tacotron2#tacotron2-description
train.py
ljspeech_config.yaml
generate_hdf5.py
Codelab
data_url: "" # set on the page
train_url: "" # set on the page
checkpoint_url: "" # set on the page
# Path for local
data_path: "/cache/data" # download data to data_path from data_url(obs address)
output_path: "/cache/train" # upload output data from output_path dirs to train_url(obs address)
load_path: "/cache/checkpoint_path" # download checkpoint to load_path from checkpoint_url(obs address)
device_target: "Ascend"
need_modelarts_dataset_unzip: False # unzipping the dataset at this step causes problems in the workflow
modelarts_dataset_unzip_name: "LJSpeech-1.1"
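These path settings work because the ModelArts training container only sees local disk: the dataset is first synced from OBS (data_url) into /cache/data, and results written to /cache/train are synced back to train_url afterwards. As a rough sketch of what that sync amounts to (my own illustration, not the exact code in model_utils/moxing_adapter.py; the bucket name is made up):

import moxing as mox

# Download the dataset from OBS (data_url) into the local cache (data_path);
# the training job itself only reads from local disk.
mox.file.copy_parallel('obs://my-bucket/LJSpeech-1.1', '/cache/data/LJSpeech-1.1')

# After training, upload everything under output_path back to OBS (train_url).
mox.file.copy_parallel('/cache/train', 'obs://my-bucket/train_output')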
'pretrain_ckpt': '/path/to/model.ckpt'   # use pretrained ckpt at training phase
'model_ckpt': '/path/to/model.ckpt'      # use pretrained ckpt at inference phase
'lr': 0.002                              # initial learning rate
'batch_size': 16                         # training batch size
'epoch_num': 2000                        # total training epochs
'warmup_epochs': 30                      # warmup lr epochs
'save_ckpt_dir': './ckpt'                # specify ckpt saving dir
'keep_checkpoint_max': 10                # only keep the last keep_checkpoint_max checkpoints
'text': 'text to synthesize'             # specify text to synthesize at inference
'dataset_path': '/dir/to/hdf5'           # specify dir to hdf5 file
'data_name': 'ljspeech'                  # specify dataset name
'audioname': 'text2speech'               # specify filename for generated audio
'run_distribute': False                  # whether to run distributed training
'device_id': 0                           # specify which device to use
Do either a or b.
a. Set "enable_modelarts=True" in the [DATASET_NAME]_config.yaml file.
Set "dataset_path='/cache/data/[DATASET_NAME]'" in the [DATASET_NAME]_config.yaml file.
Set "data_name='[DATASET_NAME]'" in the [DATASET_NAME]_config.yaml file.
(Optional) Set any other parameters you need in the [DATASET_NAME]_config.yaml file.
b. Add "enable_modelarts=True" on the web UI.
Add "dataset_path='/cache/data/[DATASET_NAME]'" on the web UI.
Add "data_name='[DATASET_NAME]'" on the web UI.
(Optional) Set other parameters on the web UI.
Ascend-Powered-Engine
Do either a or b.
a. Set "enable_modelarts=True" in the [DATASET_NAME]_config.yaml file.
Set "data_name='[DATASET_NAME]'" in the [DATASET_NAME]_config.yaml file.
Set "model_ckpt='/cache/checkpoint_path/model.ckpt'" in the [DATASET_NAME]_config.yaml file.
Set "text='text to synthesize'" in the [DATASET_NAME]_config.yaml file.
Set "checkpoint_url='s3://dir_to_trained_ckpt/'" in the [DATASET_NAME]_config.yaml file.
(Optional) Set any other parameters you need in the [DATASET_NAME]_config.yaml file.
b. Add "enable_modelarts=True" on the web UI.
Add "data_name='[DATASET_NAME]'" on the web UI.
Add "model_ckpt='/cache/checkpoint_path/model.ckpt'" on the web UI.
Add "text='text to synthesize'" on the web UI.
Add "checkpoint_url='s3://dir_to_trained_ckpt/'" on the web UI.
(Optional) Add other parameters on the web UI.
.py
eval.py
.yaml
def prepare_dataloaders(dataset_path, rank_id, group_size):
    '''prepare dataloaders'''
    dataset = ljdataset(dataset_path, group_size)

    ds_dataset = ds.GeneratorDataset(dataset,
                                     ['text_padded',
                                      'input_lengths',
                                      'mel_padded',
                                      'gate_padded',
                                      'text_mask',
                                      'mel_mask',
                                      'rnn_mask'],
                                     num_parallel_workers=4,
                                     sampler=Sampler(dataset.sample_nums,
                                                     rank_id,
                                                     group_size))
    ds_dataset = ds_dataset.batch(hps.batch_size)

    return ds_dataset
.py
import os
import argparse
import random
import h5py
from tqdm import tqdm
import numpy as np
import librosa

from src.utils.audio import load_wav, melspectrogram
from src.hparams import hparams as hps
from src.text import text_to_sequence
from src.utils import audio

random.seed(0)


def files_to_list(fdir):
    ''' collect text and filepath to list'''
    f_list = []
    with open(os.path.join(fdir, 'metadata.csv'), encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('|')
            wav_path = os.path.join(fdir, 'wavs', '%s.wav' % parts[0])
            f_list.append([wav_path, parts[1]])
    return f_list


def get_mel_text_pair(filename_and_text):
    '''preprocessing mel and text '''
    filename, text = filename_and_text[0], filename_and_text[1]
    text += '~'
    text = get_text(text)
    mel = produce_mel_features(filename)
    print(mel.shape)
    return (text, mel)


def get_text(text):
    '''encode text to sequence'''
    return text_to_sequence(text, hps.text_cleaners)


def get_mel(filename):
    '''extract mel spectrogram'''
    wav = load_wav(filename)
    trim_wav, _ = librosa.effects.trim(
        wav, top_db=60, frame_length=2048, hop_length=512)
    wav = np.concatenate(
        (trim_wav,
         np.zeros(
             (5 * hps.hop_length),
             np.float32)),
        0)
    mel = melspectrogram(wav).astype(np.float32)
    return mel


def produce_mel_features(filename):
    '''produce Mel-Frequency features'''
    wav, fs = librosa.load(filename, sr=22050)
    wav = librosa.resample(wav, fs, 16000)
    # pad the waveform so its length aligns with the hop grid of the mel-spectrogram
    wav = audio.wav_padding(wav, hps)
    assert len(wav) % hps.hop_size == 0
    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hps.preemphasis, hps.preemphasize)
    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.mel_spectrogram(preem_wav, hps).astype(np.float32)
    mel = (mel_spectrogram + hps.max_abs_value) / (2 * hps.max_abs_value)
    return mel.astype(np.float32)


def generate_hdf5_derictcall(fdir):
    '''generate hdf5 file'''
    f_list = files_to_list(fdir)
    random.shuffle(f_list)
    max_text, max_mel = 0, 0
    for idx, filename_and_text in tqdm(enumerate(f_list)):
        text, mel = get_mel_text_pair(filename_and_text)
        max_text = max(max_text, len(text))
        max_mel = max(max_mel, mel.shape[1])
        with h5py.File('ljdataset.hdf5', 'a') as hf:
            hf.create_dataset('{}_mel'.format(idx), data=mel)
            hf.create_dataset('{}_text'.format(idx), data=text)
# Modified prepare_dataloaders in train.py: build the hdf5 file on the fly before constructing the dataset.
def prepare_dataloaders(dataset_path, rank_id, group_size):
    '''prepare dataloaders'''
    generate_hdf5_derictcall(dataset_path)  # generate the hdf5 file first
    dataset = ljdataset(dataset_path, group_size)

    ds_dataset = ds.GeneratorDataset(dataset,
                                     ['text_padded',
                                      'input_lengths',
                                      'mel_padded',
                                      'gate_padded',
                                      'text_mask',
                                      'mel_mask',
                                      'rnn_mask'],
                                     num_parallel_workers=4,
                                     sampler=Sampler(dataset.sample_nums,
                                                     rank_id,
                                                     group_size))
    ds_dataset = ds_dataset.batch(hps.batch_size)

    return ds_dataset
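A minimal usage sketch of the modified function (assuming the single-device case with rank_id=0 and group_size=1; the real train.py wraps the returned dataset in MindSpore's Model/callback training loop):

ds_train = prepare_dataloaders(config.dataset_path, rank_id=0, group_size=1)  # also regenerates ljdataset.hdf5
print('batches per epoch:', ds_train.get_dataset_size())

# Pull one batch to sanity-check shapes; the keys match the column names declared above.
for batch in ds_train.create_dict_iterator(output_numpy=True, num_epochs=1):
    print(batch['text_padded'].shape, batch['mel_padded'].shape)
    break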
tar.bz2
def modelarts_pre_process():
    '''modelarts pre process function.'''
    def unzip(zip_file, save_dir):
        import zipfile
        s_time = time.time()
        if not os.path.exists(os.path.join(save_dir, config.modelarts_dataset_unzip_name)):
            zip_isexist = zipfile.is_zipfile(zip_file)
            if zip_isexist:
                fz = zipfile.ZipFile(zip_file, 'r')
                data_num = len(fz.namelist())
                print("Extract Start...")
                print("unzip file num: {}".format(data_num))
                data_print = int(data_num / 100) if data_num > 100 else 1
                i = 0
                for file in fz.namelist():
                    if i % data_print == 0:
                        print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
                    i += 1
                    fz.extract(file, save_dir)
                print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
                                                     int(int(time.time() - s_time) % 60)))
                print("Extract Done.")
            else:
                print(zip_file, " is not zip.")
        else:
            print("Zip has been extracted.")

    if config.need_modelarts_dataset_unzip:
        zip_file_1 = os.path.join(config.data_path, config.modelarts_dataset_unzip_name + ".zip")
        save_dir_1 = os.path.join(config.data_path)

        sync_lock = "/tmp/unzip_sync.lock"

        # Each server contains at most 8 devices.
        if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
            print("Zip file path: ", zip_file_1)
            print("Unzip file save dir: ", save_dir_1)
            unzip(zip_file_1, save_dir_1)
            print("===Finish extract data synchronization===")
            try:
                os.mknod(sync_lock)
            except IOError:
                pass

        while True:
            if os.path.exists(sync_lock):
                break
            time.sleep(1)

        print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))

    config.save_ckpt_dir = config.save_ckpt_dir
path = config.modelarts_dataset_unzip_name + ".zip"
tarfile
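Because the LJSpeech archive is distributed as a .tar.bz2 rather than a .zip, the zipfile-based helper above cannot extract it. A minimal sketch with the tarfile module (assuming the archive sits under config.data_path and is named after modelarts_dataset_unzip_name):

import os
import tarfile

archive = os.path.join(config.data_path, config.modelarts_dataset_unzip_name + ".tar.bz2")
target_dir = os.path.join(config.data_path, config.modelarts_dataset_unzip_name)
if not os.path.exists(target_dir):
    # 'r:bz2' opens a bzip2-compressed tar archive; extractall unpacks it into /cache/data
    with tarfile.open(archive, 'r:bz2') as tf:
        tf.extractall(config.data_path)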
import moxing as mox
mox.file.copy('/home/ma-user/work/xxx.hdf5', 'obs://obsname/xxx.hdf5')
class DiffusionDecoder(nn.Module):
def __init__(self,
unet_channels=64,
unet_in_channels=2,
unet_out_channels=1,
dim_mults=(1, 2, 4),
groups=8,
with_time_emb=True,
beta_0=0.05,
beta_1=20,
N=1000,
T=1):
super().__init__()
self.beta_0 = beta_0
self.beta_1 = beta_1
self.N = N
self.T = T
self.delta_t = T*1.0 / N
self.discrete_betas = torch.linspace(beta_0, beta_1, N)
self.unet = unet.Unet(dim=unet_channels, out_dim=unet_out_channels, dim_mults=dim_mults, groups=groups, channels=unet_in_channels, with_time_emb=with_time_emb)
def marginal_prob(self, mu, x, t):
log_mean_coeff = -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
mean = torch.exp(log_mean_coeff[:, None, None]) * x + (1-torch.exp(log_mean_coeff[:, None, None]) ) * mu
std = torch.sqrt(1. - torch.exp(2. * log_mean_coeff))
return mean, std
def cal_loss(self, x, mu, t, z, std, g=None):
time_steps = t * (self.N - 1)
if g:
x = torch.stack([x, mu, g], 1)
else:
x = torch.stack([x, mu], 1)
grad = self.unet(x, time_steps)
loss = torch.square(grad + z / std[:, None, None]) * torch.square(std[:, None, None])
return loss
def forward(self, mu, y=None, g=None, gen=False):
if not gen:
t = torch.FloatTensor(y.shape[0]).uniform_(0, self.T-self.delta_t).to(y.device)+self.delta_t # sample a random t
mean, std = self.marginal_prob(mu, y, t)
z = torch.randn_like(y)
x = mean + std[:, None, None] * z
loss = self.cal_loss(x, mu, t, z, std, g)
return loss
else:
with torch.no_grad():
y_T = torch.randn_like(mu) + mu
y_t_plus_one = y_T
y_t = None
for n in tqdm(range(self.N - 1, 0, -1)):
t = torch.FloatTensor(1).fill_(n).to(mu.device)
if g:
x = torch.stack([y_t_plus_one, mu, g], 1)
else:
x = torch.stack([y_t_plus_one, mu], 1)
grad = self.unet(x, t)
y_t = y_t_plus_one-0.5*self.delta_t*self.discrete_betas[n]*(mu-y_t_plus_one-grad)
y_t_plus_one = y_t
return y_t
class FlowSpecDecoder(nn.Module):
def __init__(self,
in_channels,
hidden_channels,
kernel_size,
dilation_rate,
n_blocks,
n_layers,
p_dropout=0.,
n_split=4,
n_sqz=2,
sigmoid_scale=False,
gin_channels=0):
super().__init__()
self.in_channels = in_channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.n_blocks = n_blocks
self.n_layers = n_layers
self.p_dropout = p_dropout
self.n_split = n_split
self.n_sqz = n_sqz
self.sigmoid_scale = sigmoid_scale
self.gin_channels = gin_channels
self.flows = nn.ModuleList()
for b in range(n_blocks):
self.flows.append(modules.ActNorm(channels=in_channels * n_sqz))
self.flows.append(modules.InvConvNear(channels=in_channels * n_sqz, n_split=n_split))
self.flows.append(
attentions.CouplingBlock(
in_channels * n_sqz,
hidden_channels,
kernel_size=kernel_size,
dilation_rate=dilation_rate,
n_layers=n_layers,
gin_channels=gin_channels,
p_dropout=p_dropout,
sigmoid_scale=sigmoid_scale))
def forward(self, x, x_mask, g=None, reverse=False):
if not reverse:
flows = self.flows
logdet_tot = 0
else:
flows = reversed(self.flows)
logdet_tot = None
if self.n_sqz > 1:
x, x_mask = commons.squeeze(x, x_mask, self.n_sqz)
for f in flows:
if not reverse:
x, logdet = f(x, x_mask, g=g, reverse=reverse)
logdet_tot += logdet
else:
x, logdet = f(x, x_mask, g=g, reverse=reverse)
if self.n_sqz > 1:
x, x_mask = commons.unsqueeze(x, x_mask, self.n_sqz)
return x, logdet_tot
def store_inverse(self):
for f in self.flows:
f.store_inverse()
https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS
class Diffusion(BaseModule):
def __init__(self, n_feats, dim,
n_spks=1, spk_emb_dim=64,
beta_min=0.05, beta_max=20, pe_scale=1000):
super(Diffusion, self).__init__()
self.n_feats = n_feats
self.dim = dim
self.n_spks = n_spks
self.spk_emb_dim = spk_emb_dim
self.beta_min = beta_min
self.beta_max = beta_max
self.pe_scale = pe_scale
self.estimator = GradLogPEstimator2d(dim, n_spks=n_spks,
spk_emb_dim=spk_emb_dim,
pe_scale=pe_scale)
def forward_diffusion(self, x0, mask, mu, t):
time = t.unsqueeze(-1).unsqueeze(-1)
cum_noise = get_noise(time, self.beta_min, self.beta_max, cumulative=True)
mean = x0*torch.exp(-0.5*cum_noise) + mu*(1.0 - torch.exp(-0.5*cum_noise))
variance = 1.0 - torch.exp(-cum_noise)
z = torch.randn(x0.shape, dtype=x0.dtype, device=x0.device,
requires_grad=False)
xt = mean + z * torch.sqrt(variance)
return xt * mask, z * mask
@torch.no_grad()
def reverse_diffusion(self, z, mask, mu, n_timesteps, stoc=False, spk=None):
h = 1.0 / n_timesteps
xt = z * mask
for i in range(n_timesteps):
t = (1.0 - (i + 0.5)*h) * torch.ones(z.shape[0], dtype=z.dtype,
device=z.device)
time = t.unsqueeze(-1).unsqueeze(-1)
noise_t = get_noise(time, self.beta_min, self.beta_max,
cumulative=False)
if stoc: # adds stochastic term
dxt_det = 0.5 * (mu - xt) - self.estimator(xt, mask, mu, t, spk)
dxt_det = dxt_det * noise_t * h
dxt_stoc = torch.randn(z.shape, dtype=z.dtype, device=z.device,
requires_grad=False)
dxt_stoc = dxt_stoc * torch.sqrt(noise_t * h)
dxt = dxt_det + dxt_stoc
else:
dxt = 0.5 * (mu - xt - self.estimator(xt, mask, mu, t, spk))
dxt = dxt * noise_t * h
xt = (xt - dxt) * mask
return xt
@torch.no_grad()
def forward(self, z, mask, mu, n_timesteps, stoc=False, spk=None):
return self.reverse_diffusion(z, mask, mu, n_timesteps, stoc, spk)
def loss_t(self, x0, mask, mu, t, spk=None):
xt, z = self.forward_diffusion(x0, mask, mu, t)
time = t.unsqueeze(-1).unsqueeze(-1)
cum_noise = get_noise(time, self.beta_min, self.beta_max, cumulative=True)
noise_estimation = self.estimator(xt, mask, mu, t, spk)
noise_estimation *= torch.sqrt(1.0 - torch.exp(-cum_noise))
loss = torch.sum((noise_estimation + z)**2) / (torch.sum(mask)*self.n_feats)
return loss, xt
def compute_loss(self, x0, mask, mu, spk=None, offset=1e-5):
t = torch.rand(x0.shape[0], dtype=x0.dtype, device=x0.device,
requires_grad=False)
t = torch.clamp(t, offset, 1.0 - offset)
return self.loss_t(x0, mask, mu, t, spk)
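For reference, forward_diffusion above samples from the closed-form perturbation kernel of the Grad-TTS forward SDE. Writing $B_t = \int_0^t \beta_s\,ds$ for the cumulative noise returned by get_noise(..., cumulative=True),

$$x_t \mid x_0 \sim \mathcal{N}\!\Big(e^{-\frac{1}{2}B_t}x_0 + \big(1 - e^{-\frac{1}{2}B_t}\big)\mu,\ \big(1 - e^{-B_t}\big)I\Big),$$

and reverse_diffusion takes Euler steps of size $h = 1/n_{\text{timesteps}}$ of the form $x_{t-h} = x_t - \tfrac{1}{2}\beta_t h\,(\mu - x_t - s_\theta(x_t, t))$, with an extra noise term added when stoc=True.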
generator = GradTTS(len(symbols)+1, N_SPKS, params.spk_emb_dim, params.n_enc_channels, params.filter_channels,
params.filter_channels_dp, params.n_heads, params.n_enc_layers,
params.enc_kernel, params.enc_dropout, params.window_size,
params.n_feats, params.dec_dim, params.beta_min, params.beta_max,
pe_scale=1000) # pe_scale=1 for `grad-tts-old.pt`
generator.load_state_dict(torch.load('./Grad-TTS/checkpts/grad-tts-libri-tts.pt', map_location=lambda loc, storage: loc))
_ = generator.cuda().eval()
cmu = cmudict.CMUDict('./Grad-TTS/resources/cmu_dictionary')
with open('./Grad-TTS/checkpts/hifigan-config.json') as f:
h = AttrDict(json.load(f))
hifigan = HiFiGAN(h)
hifigan.load_state_dict(torch.load('./Grad-TTS/checkpts/hifigan.pt',
map_location=lambda loc, storage: loc)['generator'])
_ = hifigan.cuda().eval()
hifigan.remove_weight_norm()
%matplotlib inline
text = "Here are the match lineups for the Colombia Haiti match."
x = torch.LongTensor(intersperse(text_to_sequence(text, dictionary=cmu), len(symbols))).cuda()[None]
x_lengths = torch.LongTensor([x.shape[-1]]).cuda()
x.shape, x_lengths
SPEAKER_ID = 15
t = dt.datetime.now()
y_enc, y_dec, attn = generator.forward(x, x_lengths, n_timesteps=10, temperature=1.5,
stoc=False, spk=torch.LongTensor([SPEAKER_ID]).cuda() if N_SPKS > 1 else None,
length_scale=0.91)
t = (dt.datetime.now() - t).total_seconds()
print(f'Grad-TTS RTF: {t * 22050 / (y_dec.shape[-1] * 256)}')
plt.figure(figsize=(15, 4))
plt.subplot(1, 3, 1)
plt.title('Encoder outputs')
plt.imshow(y_enc.cpu().squeeze(), aspect='auto', origin='lower')
plt.colorbar()
plt.subplot(1, 3, 2)
plt.title('Decoder outputs')
plt.imshow(y_dec.cpu().squeeze(), aspect='auto', origin='lower')
plt.colorbar()
plt.subplot(1, 3, 3)
plt.title('Alignment')
plt.imshow(attn.cpu().squeeze(), aspect='auto', origin='lower');
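The real-time factor printed above is inference time divided by the duration of the synthesized audio; with the vocoder's 256-sample hop at 22050 Hz,

$$\mathrm{RTF} = \frac{t_{\text{infer}}}{N_{\text{frames}}\cdot 256 / 22050} = \frac{22050\, t_{\text{infer}}}{256\, N_{\text{frames}}},$$

so values below 1 mean faster-than-real-time synthesis.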
Dry vocals
Unimodal distribution
DiffSinger
Diff
Diffusion
Markov chain
Shallow diffusion
Feed-forward Transformer
class OpencpopBinarizer(MidiSingingBinarizer):
def split_train_test_set(self, item_names):
item_names = deepcopy(item_names)
test_item_names = [x for x in item_names if any([x.startswith(ts) for ts in hparams['test_prefixes']])]
train_item_names = [x for x in item_names if x not in set(test_item_names)]
logging.info("train {}".format(len(train_item_names)))
logging.info("test {}".format(len(test_item_names)))
return train_item_names, test_item_names
def load_meta_data(self, processed_data_dir, ds_id):
from preprocessing.opencpop import File2Batch
self.items = File2Batch.file2temporary_dict()
import os
import torch
from modules.nsf_hifigan.models import load_model
from modules.nsf_hifigan.nvSTFT import load_wav_to_torch, STFT
from src.vocoders.base_vocoder import BaseVocoder, register_vocoder
from utils.hparams import hparams
@register_vocoder
class NsfHifiGAN(BaseVocoder):
def __init__(self, device=None):
if device is None:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
self.device = device
model_path = hparams['vocoder_ckpt']
assert os.path.exists(model_path), 'HifiGAN model file is not found!'
print('| Load HifiGAN: ', model_path)
self.model, self.h = load_model(model_path, device=self.device)
def spec2wav_torch(self, mel, **kwargs): # mel: [B, T, bins]
if self.h.sampling_rate != hparams['audio_sample_rate']:
print('Mismatch parameters: hparams[\'audio_sample_rate\']=', hparams['audio_sample_rate'], '!=',
self.h.sampling_rate, '(vocoder)')
if self.h.num_mels != hparams['audio_num_mel_bins']:
print('Mismatch parameters: hparams[\'audio_num_mel_bins\']=', hparams['audio_num_mel_bins'], '!=',
self.h.num_mels, '(vocoder)')
if self.h.n_fft != hparams['fft_size']:
print('Mismatch parameters: hparams[\'fft_size\']=', hparams['fft_size'], '!=', self.h.n_fft, '(vocoder)')
if self.h.win_size != hparams['win_size']:
print('Mismatch parameters: hparams[\'win_size\']=', hparams['win_size'], '!=', self.h.win_size,
'(vocoder)')
if self.h.hop_size != hparams['hop_size']:
print('Mismatch parameters: hparams[\'hop_size\']=', hparams['hop_size'], '!=', self.h.hop_size,
'(vocoder)')
if self.h.fmin != hparams['fmin']:
print('Mismatch parameters: hparams[\'fmin\']=', hparams['fmin'], '!=', self.h.fmin, '(vocoder)')
if self.h.fmax != hparams['fmax']:
print('Mismatch parameters: hparams[\'fmax\']=', hparams['fmax'], '!=', self.h.fmax, '(vocoder)')
with torch.no_grad():
c = mel.transpose(2, 1) # [B, T, bins]
# log10 to log mel
c = 2.30259 * c
f0 = kwargs.get('f0') # [B, T]
if f0 is not None and hparams.get('use_nsf'):
y = self.model(c, f0).view(-1)
else:
y = self.model(c).view(-1)
return y
def spec2wav(self, mel, **kwargs):
if self.h.sampling_rate != hparams['audio_sample_rate']:
print('Mismatch parameters: hparams[\'audio_sample_rate\']=', hparams['audio_sample_rate'], '!=',
self.h.sampling_rate, '(vocoder)')
if self.h.num_mels != hparams['audio_num_mel_bins']:
print('Mismatch parameters: hparams[\'audio_num_mel_bins\']=', hparams['audio_num_mel_bins'], '!=',
self.h.num_mels, '(vocoder)')
if self.h.n_fft != hparams['fft_size']:
print('Mismatch parameters: hparams[\'fft_size\']=', hparams['fft_size'], '!=', self.h.n_fft, '(vocoder)')
if self.h.win_size != hparams['win_size']:
print('Mismatch parameters: hparams[\'win_size\']=', hparams['win_size'], '!=', self.h.win_size,
'(vocoder)')
if self.h.hop_size != hparams['hop_size']:
print('Mismatch parameters: hparams[\'hop_size\']=', hparams['hop_size'], '!=', self.h.hop_size,
'(vocoder)')
if self.h.fmin != hparams['fmin']:
print('Mismatch parameters: hparams[\'fmin\']=', hparams['fmin'], '!=', self.h.fmin, '(vocoder)')
if self.h.fmax != hparams['fmax']:
print('Mismatch parameters: hparams[\'fmax\']=', hparams['fmax'], '!=', self.h.fmax, '(vocoder)')
with torch.no_grad():
c = torch.FloatTensor(mel).unsqueeze(0).transpose(2, 1).to(self.device)
# log10 to log mel
c = 2.30259 * c
f0 = kwargs.get('f0')
if f0 is not None and hparams.get('use_nsf'):
f0 = torch.FloatTensor(f0[None, :]).to(self.device)
y = self.model(c, f0).view(-1)
else:
y = self.model(c).view(-1)
wav_out = y.cpu().numpy()
return wav_out
@staticmethod
def wav2spec(inp_path, device=None):
if device is None:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
sampling_rate = hparams['audio_sample_rate']
num_mels = hparams['audio_num_mel_bins']
n_fft = hparams['fft_size']
win_size = hparams['win_size']
hop_size = hparams['hop_size']
fmin = hparams['fmin']
fmax = hparams['fmax']
stft = STFT(sampling_rate, num_mels, n_fft, win_size, hop_size, fmin, fmax)
with torch.no_grad():
wav_torch, _ = load_wav_to_torch(inp_path, target_sr=stft.target_sr)
mel_torch = stft.get_mel(wav_torch.unsqueeze(0).to(device)).squeeze(0).T
# log mel to log10 mel
mel_torch = 0.434294 * mel_torch
return wav_torch.cpu().numpy(), mel_torch.cpu().numpy()
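The constants 2.30259 and 0.434294 in spec2wav/wav2spec are simply the change-of-base factors between natural-log and log10 mel amplitudes:

$$\ln x = (\ln 10)\,\log_{10} x \approx 2.30259\,\log_{10} x, \qquad \log_{10} x = \frac{\ln x}{\ln 10} \approx 0.434294\,\ln x.$$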
class GaussianDiffusion(nn.Module):
def __init__(self, phone_encoder, out_dims, denoise_fn,
timesteps=1000, K_step=1000, loss_type=hparams.get('diff_loss_type', 'l1'), betas=None, spec_min=None,
spec_max=None):
super().__init__()
self.denoise_fn = denoise_fn
if hparams.get('use_midi') is not None and hparams['use_midi']:
self.fs2 = FastSpeech2MIDI(phone_encoder, out_dims)
else:
#self.fs2 = FastSpeech2(phone_encoder, out_dims)
self.fs2 = ParameterEncoder(phone_encoder)
self.mel_bins = out_dims
if exists(betas):
betas = betas.detach().cpu().numpy() if isinstance(betas, torch.Tensor) else betas
else:
if 'schedule_type' in hparams.keys():
betas = beta_schedule[hparams['schedule_type']](timesteps)
else:
betas = cosine_beta_schedule(timesteps)
alphas = 1. - betas
alphas_cumprod = np.cumprod(alphas, axis=0)
alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])
timesteps, = betas.shape
self.num_timesteps = int(timesteps)
self.K_step = K_step
self.loss_type = loss_type
self.noise_list = deque(maxlen=4)
to_torch = partial(torch.tensor, dtype=torch.float32)
self.register_buffer('betas', to_torch(betas))
self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev))
# calculations for diffusion q(x_t | x_{t-1}) and others
self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))
# calculations for posterior q(x_{t-1} | x_t, x_0)
posterior_variance = betas * (1. - alphas_cumprod_prev) / (1. - alphas_cumprod)
# above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
self.register_buffer('posterior_variance', to_torch(posterior_variance))
# below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
self.register_buffer('posterior_mean_coef1', to_torch(
betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
self.register_buffer('posterior_mean_coef2', to_torch(
(1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))
self.register_buffer('spec_min', torch.FloatTensor(spec_min)[None, None, :hparams['keep_bins']])
self.register_buffer('spec_max', torch.FloatTensor(spec_max)[None, None, :hparams['keep_bins']])
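The buffers registered above are the standard DDPM quantities: with $\alpha_t = 1 - \beta_t$ and $\bar\alpha_t = \prod_{s \le t} \alpha_s$,

$$q(x_t \mid x_0) = \mathcal{N}\big(\sqrt{\bar\alpha_t}\,x_0,\ (1 - \bar\alpha_t)I\big), \qquad q(x_{t-1} \mid x_t, x_0) = \mathcal{N}\!\Big(\tfrac{\beta_t\sqrt{\bar\alpha_{t-1}}}{1 - \bar\alpha_t}x_0 + \tfrac{(1 - \bar\alpha_{t-1})\sqrt{\alpha_t}}{1 - \bar\alpha_t}x_t,\ \tfrac{(1 - \bar\alpha_{t-1})\beta_t}{1 - \bar\alpha_t}I\Big),$$

which is exactly what posterior_mean_coef1/2 and posterior_variance encode; as I understand it, K_step bounds how many of these steps DiffSinger's shallow-diffusion mechanism actually uses.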
# Install PyTorch manually (1.8.2 LTS recommended)
# See instructions at https://pytorch.org/get-started/locally/
# Below is an example for CUDA 11.1
pip3 install torch==1.8.2 torchvision==0.9.2 torchaudio==0.8.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cu111
# Install other requirements
pip install -r requirements.txt
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/binarize.py --config configs/midi/cascade/opencs/ds1000.yaml
run.py
CUDA_VISIBLE_DEVICES=0 python run.py --config configs/midi/cascade/opencs/ds1000.yaml --exp_name $MY_DS_EXP_NAME --reset
inp = {
'text': '小酒窝长睫毛AP是你最美的记号',
'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
'input_type': 'word'
}
target = "/content/DiffSinger/infer_out/小酒窝.wav"
ds.DiffSingerE2EInfer.example_run(inp, target=target)
Audio(filename=target)
OPENSVIP
"ph_seq": "AP sh ir zh e SP j v y i b a x in ch en SP z ai sh ou x in SP",
"note_seq": "rest D#3 D#3 C4 C4 rest D#4 D#4 C4 C4 A#3 A#3 C4 C4 C4 C4 rest D#3 D#3 G3 G3 G#3 G#3 rest",
"note_dur_seq": "0.6 0.4 0.4 0.6 0.6 0.1999999 0.4000001 0.4000001 0.3999999 0.3999999 0.4000001 0.4000001 0.2 0.2 0.3999999 0.3999999 0.2 0.3999999 0.3999999 0.6000004 0.6000004 1 1 0.05",
"ph_seq": "AP q ve zh e SP zh u m i l ian y ao y van d e y En j in SP",
"note_seq": "rest C4 C4 F4 F4 rest D#4 D#4 D#4 D#4 C4 C4 A#3 A#3 A#3 A#3 G#3 G#3 A#3 A#3 C4 C4 rest",
"note_dur_seq": "0.6 0.4 0.4 1 1 0.2 0.2 0.2 0.1999998 0.1999998 0.4000001 0.4000001 0.4000001 0.4000001 0.5999999 0.5999999 0.1999998 0.1999998 0.4000001 0.4000001 1.4 1.4 0.05"
CUDA_VISIBLE_DEVICES=0 python run.py --exp_name $MY_DS_EXP_NAME --infer
[Copyright Notice] This article is original content by a Huawei Cloud community user. When reposting, you must credit the source (Huawei Cloud community), the article link, the author, and other basic information; otherwise the author and this community reserve the right to pursue liability. If you find suspected plagiarism in this community, please report it by email with supporting evidence; once verified, the community will immediately remove the infringing content. Report email:
cloudbbs@huaweicloud.com