In [1]:
# Add local packages to the import path and silence deprecation warnings
import sys
sys.path.append('../../..')
sys.path.append('../.')
import warnings
warnings.filterwarnings("ignore")

import copy
import os
import time
from tqdm import tqdm
import math

import ddsp
import ddsp.training

from data_handling.ddspdataset import DDSPDataset
from utils.training_utils import print_hparams, set_seed, save_results, str2bool
from hparams_midiae_interp_cond import hparams as hp
from midiae_interp_cond.get_model import get_model, get_fake_data

import librosa
import matplotlib.pyplot as plt
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
import pandas as pd

from notebook_utils import *

set_seed(1234)

# Global audio settings
sample_rate = 16000


print('Done!')
Done!
In [2]:
model_path = r'/data/ddsp-experiment/logs/logs/logs_interp_cond_5.28_rnn_synth_params/2021-05-28-10-48-16/50000'
hp_dict = get_hp(os.path.join(os.path.dirname(model_path), 'train.log'))
for k, v in hp_dict.items():
    setattr(hp, k, v)
In [3]:
model = get_model(hp)
_ = model._build(get_fake_data(hp))
model.load_weights(model_path)
Out[3]:
<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fa1046c2050>
In [4]:
from data_handling.urmp_tfrecord_dataloader import UrmpMidi
from data_handling.get_tfrecord_length import get_tfrecord_length
from midiae_interp_cond.recon_loss import ReconLossHelper
In [5]:
from data_handling.get_dataset import get_batch
In [6]:
from data_handling.instrument_name_utils import INST_ABB_TO_NAME_DICT, INST_NAME_TO_ABB_DICT, INST_ABB_LIST
In [7]:
log_dir = '/data/ddsp-experiment/urmp_single_instrument_recon'
os.makedirs(log_dir, exist_ok=True)
In [8]:
data_dir = r'/data/music_dataset/urmp_dataset/tfrecord_ddsp/batched/solo_instrument'
test_data_loader = UrmpMidi(data_dir, instrument_key='vn', split='test')
evaluation_data = get_batch(test_data_loader, batch_size=1, shuffle=True, repeats=1, drop_remainder=False)
evaluation_data = iter(evaluation_data)
In [14]:
data = next(evaluation_data)
In [15]:
outputs = model(data, training=False)
In [16]:
plot_spec(outputs['midi_audio'][0].numpy(), sample_rate)
In [17]:
plot_spec(outputs['synth_audio'][0].numpy(), sample_rate)
In [23]:
params_pred = outputs['params_pred']
midi_synth_params = {
    'amplitudes': params_pred['amplitudes'],
    'harmonic_distribution': params_pred['harmonic_distribution'],
    'noise_magnitudes': params_pred['noise_magnitudes'],
    'f0_hz': params_pred['f0_hz'],
}
synth_params = outputs['synth_params']

Synthesizer Parameter Switching

Swap parameters between the DDSP inference (autoencoder) and the synthesizer parameters generator, one at a time, to isolate what causes the quality degradation.

TL;DR: The perceptual quality degradation is mainly caused by the harmonic distribution. The synthesizer parameters generator produces oversmoothed harmonic distributions and noise magnitudes.
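
To cut down on copy-paste in the cells that follow, here is a small helper (a sketch, reusing the model.processor_group and model.reverb_module calls from this notebook; the function name is mine):

In [ ]:
def resynthesize_with_swap(base_params, donor_params, swap_key, instrument_id):
    """Resynthesize from base_params, with one entry swapped in from donor_params."""
    params = dict(base_params)  # shallow copy is enough; values are tensors
    params[swap_key] = donor_params[swap_key]
    audio = model.processor_group(params)
    return model.reverb_module(audio, reverb_number=instrument_id)

# e.g., MIDI-decoder parameters with amplitudes taken from the DDSP inference:
# audio = resynthesize_with_swap(midi_synth_params, synth_params, 'amplitudes', data['instrument_id'])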

Switch amplitudes

In [25]:
synth_params_mix = {
    'amplitudes': synth_params['amplitudes'],
    'harmonic_distribution': midi_synth_params['harmonic_distribution'],
    'noise_magnitudes': midi_synth_params['noise_magnitudes'],
    'f0_hz': midi_synth_params['f0_hz'],
}

audio = model.processor_group(synth_params_mix)
audio = model.reverb_module(audio, reverb_number=data['instrument_id'])
plot_spec(audio[0].numpy(), sample_rate)

Switch harmonic distribution

In [26]:
synth_params_mix = {
    'amplitudes': midi_synth_params['amplitudes'],
    'harmonic_distribution': synth_params['harmonic_distribution'],
    'noise_magnitudes': midi_synth_params['noise_magnitudes'],
    'f0_hz': midi_synth_params['f0_hz'],
}

audio = model.processor_group(synth_params_mix)
audio = model.reverb_module(audio, reverb_number=data['instrument_id'])
plot_spec(audio[0].numpy(), sample_rate)

Switch noise magnitudes

In [27]:
synth_params_mix = {
    'amplitudes': midi_synth_params['amplitudes'],
    'harmonic_distribution': midi_synth_params['harmonic_distribution'],
    'noise_magnitudes': synth_params['noise_magnitudes'],
    'f0_hz': midi_synth_params['f0_hz'],
}

audio = model.processor_group(synth_params_mix)
audio = model.reverb_module(audio, reverb_number=data['instrument_id'])
plot_spec(audio[0].numpy(), sample_rate)

So the harmonic distribution is indeed the problem. Below I plot the harmonic distributions from both sources.
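
Before plotting, a crude way to quantify the smoothing (an illustrative metric I'm adding here, not part of the original pipeline) is the mean absolute frame-to-frame difference; an oversmoothed parameter track should score noticeably lower:

In [ ]:
def temporal_roughness(x):
    """Mean absolute difference between consecutive frames of a [batch, time, dims] tensor."""
    return float(np.mean(np.abs(np.diff(x.numpy()[0], axis=0))))

for name in ['harmonic_distribution', 'noise_magnitudes']:
    print(name,
          '| ddsp inference:', temporal_roughness(synth_params[name]),
          '| param generator:', temporal_roughness(midi_synth_params[name]))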

In [29]:
import librosa.display  # librosa itself was imported above

The harmonic distribution of the DDSP inference (autoencoder):

In [35]:
plt.figure(figsize=(20,4))
librosa.display.specshow(synth_params['harmonic_distribution'].numpy()[0].T)
plt.colorbar()
Out[35]:
<matplotlib.colorbar.Colorbar at 0x7fa0ec463210>

The harmonic distribution of the synthesizer parameters generator:

In [36]:
plt.figure(figsize=(20,4))
librosa.display.specshow(midi_synth_params['harmonic_distribution'].numpy()[0].T)
plt.colorbar()
Out[36]:
<matplotlib.colorbar.Colorbar at 0x7fa1047ac0d0>

Yeah, it's definitely oversmoothed.

Here are the noise magnitudes, for comparison.

The noise magnitudes of the DDSP inference (autoencoder):

In [37]:
plt.figure(figsize=(20,4))
librosa.display.specshow(synth_params['noise_magnitudes'].numpy()[0].T)
plt.colorbar()
Out[37]:
<matplotlib.colorbar.Colorbar at 0x7fa1046fe510>

The noise magnitudes of the synthesizer parameters generator:

In [38]:
plt.figure(figsize=(20,4))
librosa.display.specshow(midi_synth_params['noise_magnitudes'].numpy()[0].T)
plt.colorbar()
Out[38]:
<matplotlib.colorbar.Colorbar at 0x7fa1042ff690>

Yeah, it's also oversmoothed.

Here I'm showing samples from the test set. I also tried the training set and found that the oversmoothing problem still exists, so it's not a train-test gap.

My guess is that the dilated conv stack is to blame. For comparison, I will try another network that is bidirectional but not autoregressive.
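
For reference, a minimal sketch of what I mean by "bidirectional but not autoregressive" (Keras-style; the layer sizes are placeholders, not the actual model config):

In [ ]:
def make_bidirectional_decoder(hidden_size=256, output_dims=128):
    """Non-autoregressive decoder: every frame sees both past and future context."""
    return tf.keras.Sequential([
        tf.keras.layers.Bidirectional(
            tf.keras.layers.GRU(hidden_size, return_sequences=True)),
        tf.keras.layers.Dense(output_dims),
    ])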
