# Notebook setup: path hacks, warning suppression, imports, and global config.
# Ignore a bunch of deprecation warnings
import sys
# Make the repo root and the parent directory importable (notebook-relative paths).
sys.path.append('../../..')
sys.path.append('../.')
import warnings
warnings.filterwarnings("ignore")
import copy
import os
import time
from tqdm import tqdm
import math
import ddsp
import ddsp.training
from data_handling.ddspdataset import DDSPDataset
from utils.training_utils import print_hparams, set_seed, save_results, str2bool
from hparams_midiae_interp_cond import hparams as hp
from midiae_interp_cond.get_model import get_model, get_fake_data
import librosa
import matplotlib.pyplot as plt
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
import pandas as pd
import qgrid
from notebook_utils import *
# Fix RNG seeds so notebook runs are reproducible.
set_seed(1234)
# Helper Functions
# Audio sample rate in Hz used throughout this notebook.
sample_rate = 16000
print('Done!')
I tried using a softmax postnet to predict the diff in harmonic distribution and noise magnitudes.
The training loss is pretty low (~4.9), but the generated samples are not perceptually good. They sound "noisy", similar to what I have seen in mixture density models (mixtures of Gaussians and mixtures of logistics):
# Compare reference vs. predicted audio for samples 0-3. The original cell
# repeated the same load-and-plot section four times with only the sample
# index changed; loop over the index instead.
import utils.audio_io

for idx in range(4):
    # Load the reference and predicted waveforms at 16 kHz.
    ref_wav = utils.audio_io.load_audio(rf'/data/6.28/{idx}_ref.wav', 16000)
    pred_wav = utils.audio_io.load_audio(rf'/data/6.28/{idx}_pred.wav', 16000)
    # Side-by-side spectrograms: original (left) vs. prediction (right).
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    plot_spec(ref_wav, hp.sample_rate, title='original')
    plt.subplot(1, 2, 2)
    plot_spec(pred_wav, hp.sample_rate, title='prediction')
Below is the result of the autoregressive model that predicts loudness and f0 using softmax with random sampling, and "drives" the DDSP inference module to synthesize audio.
TLDR: 1. the DDSP inference module is also suffering from oversmoothing because it only inputs f0 and loudness but not mel-spectrogram. 2. the model is not stable, artifacts of note attack can appear in the middle of a note.
The input to the model remains the same as the MIDI note and interpretable conditioning. The output is f0 and loudness categorical distribution. The f0 is quantized in cents of midi scale diff. The loudness is quantized in integer dB ranging from -120 to 0.
Below are the reconstruction and generation results when the expression control parameters are changed. Note the artifacts, especially the attack and f0 glitches.
Also, the control seems less effective here; see the "all decrescendo" case.
# Unpack the normalized synth params and convert f0 from Hz to MIDI scale
# so the plotting helper shows pitch on a musical axis.
f0, amps, hd, noise = synth_params_normalized
f0_midi = ddsp.core.hz_to_midi(f0)
synth_params_normalized = (f0_midi, amps, hd, noise)
# Compare the ground-truth audio against the synthesized audio and its
# (denormalized) synth params; zero-f0 frames are masked from the plot.
# NOTE(review): assumes sample['audio'] and synth_audio are batched tensors
# (first element taken) — confirm against the cells that produced them.
plot_pred_acoustic_feature(sample['audio'].numpy()[0], synth_audio.numpy()[0], get_synth_params(synth_params_normalized), mask_zero_f0=True)
Note how the "synth coder" output is also oversmoothed. This is because the "synth coder" here takes only loudness and f0 as input, whereas the "synth coder" in the dilated conv model can additionally take the mel-spectrogram as input. So the latter produces a more "realistic" timbre.
The result of the synth coder with mel-spectrogram as input, in comparison:
# Spectrogram of the mel-conditioned synth coder's output, for comparison
# with the loudness/f0-only results above.
ref_wav = utils.audio_io.load_audio(r'/data/6.28/synth_coder_mel.wav', 16000)
plot_spec(ref_wav, hp.sample_rate)
# The source repeated this identical plotting call six times (one notebook
# cell per expression-control variation); collapse the duplicates into a loop.
# NOTE(review): in the original notebook, `params_changed` and
# `midi_audio_changed` were presumably recomputed between cells — as exported
# here all six calls use the same values, which the loop preserves. Confirm
# before relying on this for per-variation plots.
for _ in range(6):
    plot_pred_acoustic_feature(
        sample['audio'].numpy()[0],
        midi_audio_changed.numpy()[0],
        get_synth_params(params_changed),
        mask_zero_f0=True)