In [1]:
# Ignore a bunch of deprecation warnings
import sys
sys.path.append('../..')
import warnings
warnings.filterwarnings("ignore")

import copy
import os
import time
from tqdm import tqdm
import math

import ddsp
import ddsp.training

from data_handling.ddspdataset import DDSPDataset
from utils.training_utils import print_hparams, set_seed, save_results, str2bool
from hparams_midiae_interp_cond import hparams as hp
from midiae_interp_cond.get_model import get_model, get_fake_data

import librosa
import matplotlib.pyplot as plt
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
import pandas as pd
import qgrid

from notebook_utils import *

set_seed(1234)

# Global constants
sample_rate = 16000


print('Done!')
Done!
In [2]:
model_path = r'/data/ddsp-experiment/logs/logs/logs_interp_cond_5.9_rnn_synth_params_no_pv_mean/2021-05-09-11-43-12/120000'
# Restore the hyperparameters recorded in this run's training log.
hp_dict = get_hp(os.path.join(os.path.dirname(model_path), 'train.log'))
for k, v in hp_dict.items():
    setattr(hp, k, v)
hp.sequence_length = 1000  # evaluate on full 1000-frame sequences
In [3]:
from data_handling.urmp_tfrecord_dataloader import UrmpMidi
from data_handling.get_tfrecord_length import get_tfrecord_length
data_dir = r'/data/music_dataset/urmp_dataset/tfrecord_ddsp/batched/solo_instrument'
test_data_loader = UrmpMidi(data_dir, instrument_key='vn', split='test')
evaluation_data = test_data_loader.get_batch(batch_size=1, shuffle=True, repeats=1)

# from data_handling.google_solo_inst_dataloader import GoogleSoloInstrument
# test_data_loader = GoogleSoloInstrument(base_dir=r'/data/music_dataset/solo_performance_google/solo-inst_midi_features', instrument_key='vn', split='test')
# evaluation_data = test_data_loader.get_batch(batch_size=1, shuffle=True, repeats=1)
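Before iterating, it can help to peek at one batch and confirm the fields this notebook relies on later ('audio', 'onsets', 'offsets', 'midi', 'instrument_id'); a quick check:

# Inspect the shapes of the fields used later in this notebook.
peek = next(iter(test_data_loader.get_batch(batch_size=1, shuffle=False, repeats=1)))
for key in ('audio', 'onsets', 'offsets', 'midi', 'instrument_id'):
    print(key, peek[key].shape)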
In [4]:
evaluation_data = iter(evaluation_data)
In [5]:
model = get_model(hp)
# Build the model's variables on fake data, then restore the trained checkpoint.
_ = model._build(get_fake_data(hp))
model.load_weights(model_path)
Out[5]:
<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f622c791e10>
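load_weights returns a CheckpointLoadStatus; a standard TensorFlow check to confirm the restore actually matched the built variables:

# Optional sanity check on the restore status.
status = model.load_weights(model_path)
status.assert_existing_objects_matched()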
In [6]:
# Advance the iterator to take the third sample from the shuffled test split.
sample = next(evaluation_data)
sample = next(evaluation_data)
sample = next(evaluation_data)
In [7]:
from midiae_interp_cond.interpretable_conditioning import midi_to_hz, get_interpretable_conditioning, extract_harm_controls
In [8]:
plot_spec(sample['audio'][0].numpy(), sr=16000)
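plot_spec is a notebook_utils helper; a rough librosa-based equivalent, assuming a log-magnitude STFT display (the helper's actual settings may differ):

import librosa.display

def plot_spec_sketch(audio, sr=16000):
    # Log-magnitude STFT as a stand-in for the notebook_utils helper.
    S = librosa.amplitude_to_db(np.abs(librosa.stft(audio)), ref=np.max)
    librosa.display.specshow(S, sr=sr, x_axis='time', y_axis='linear')
    plt.colorbar(format='%+2.0f dB')
    plt.show()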
In [9]:
# Synth-coder: predict synthesis parameters from acoustic features (loudness, f0, mel).
synth_params, control_params, synth_audio = model.run_synth_coder(sample, training=False)
# Extract note-level interpretable conditioning from the same sample.
synth_params_normalized, midi_features, conditioning_dict = model.gen_cond_dict_from_feature(sample, training=False)
In [10]:
midi_audio, params = model.gen_audio_from_cond_dict(conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
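For listening comparisons (not captured in this text export), IPython's Audio widget plays the tensors directly:

from IPython.display import Audio, display

# Original recording vs. synth-coder reconstruction vs. MIDI-conditioned resynthesis.
display(Audio(sample['audio'][0].numpy(), rate=sample_rate))
display(Audio(synth_audio[0].numpy(), rate=sample_rate))
display(Audio(midi_audio[0].numpy(), rate=sample_rate))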

Synth-coder Prediction (loudness, f0, mel -> synth params)

In [11]:
# Unpack normalized synth params: f0, amplitudes, harmonic distribution, noise magnitudes.
f0, amps, hd, noise = synth_params_normalized
f0_midi = ddsp.core.hz_to_midi(f0)
synth_params_normalized = (f0_midi, amps, hd, noise)
plot_pred_acoustic_feature(sample['audio'].numpy()[0], synth_audio.numpy()[0], get_synth_params(synth_params_normalized), mask_zero_f0=True)
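ddsp.core.hz_to_midi applies the standard MIDI mapping, so the conversion above is equivalent to the following (ignoring ddsp's safe handling of zero-frequency frames):

# MIDI note number from frequency: A4 = 440 Hz is MIDI 69, 12 semitones per octave.
f0_midi_manual = 69.0 + 12.0 * np.log2(f0 / 440.0)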

Conditioning

The function of each conditioning feature:

Loudness:

  • loudness_mean: overall volume of a note
  • loudness_std: the extent of volume change within a note (crescendo & decrescendo)
  • amplitudes_max_pos: relative position (0-1) inside a note where the amplitude reaches its maximum (0 = decrescendo, 1 = crescendo)

Attack:

  • attack_level: the strength of the note attack (the average amount of noise in the first 10 frames of each note)

Timbre:

  • brightness: the average timbre of a note (centroid of the harmonic distribution)

Pitch:

  • pitch_variation_std: the extent of vibrato, taken from the amplitude of the rFFT of the pitch curve (it should really be called "vibrato extent", but the name is kept for compatibility); see the sketch after this list
  • vibrato_rate: the rate of the vibrato (also taken from the rFFT)

Conditioning features are pooled per note; for rest notes they are masked to 0.
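As a rough illustration of the rFFT-based pitch features described above (the model's actual extraction lives in midiae_interp_cond.interpretable_conditioning and may differ in detail; the 250 frames-per-second frame rate is an assumption):

def vibrato_features_sketch(f0_midi_note, frame_rate=250):
    # Hypothetical sketch: estimate vibrato rate and extent for a single note
    # from the rFFT of its detrended pitch curve (per-frame f0 in MIDI).
    deviation = f0_midi_note - np.mean(f0_midi_note)
    spectrum = np.abs(np.fft.rfft(deviation))
    freqs = np.fft.rfftfreq(len(deviation), d=1.0 / frame_rate)
    peak = np.argmax(spectrum[1:]) + 1                       # skip the DC bin
    vibrato_rate = freqs[peak]                               # in Hz
    vibrato_extent = 2.0 * spectrum[peak] / len(deviation)   # peak deviation in semitones
    return vibrato_rate, vibrato_extent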

In [12]:
synth_params_normalized, midi_features, conditioning_dict = model.gen_cond_dict_from_feature(sample, training=False)
In [13]:
conditioning_df = conditioining_dict_to_df(conditioning_dict, sample['onsets'], sample['offsets'], sample['midi'])
# Edit conditioning values interactively in the grid; edits are read back in the next cell.
qgrid_widget = qgrid.show_grid(conditioning_df, show_toolbar=True)
qgrid_widget
In [14]:
conditioning_df_changed = qgrid_widget.get_changed_df()
conditioning_dict = conditioning_df_to_dict(conditioning_df_changed)
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [15]:
conditioning_df_changed
Out[15]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.535996 0.016886 0.565990 0.064784 0.072786 6.506506 0.653333 73 0 74 74
1 0.535169 0.054680 0.000000 0.048733 0.150693 0.000000 0.750000 71 76 191 115
2 0.484832 0.037266 0.613795 0.076973 0.088458 5.005005 0.524038 69 192 400 208
3 0.484967 0.031815 0.000000 0.072595 0.131842 0.000000 0.574713 67 401 487 86
4 0.426444 0.057836 0.870234 0.071446 0.053127 5.255255 0.310345 69 488 777 289
5 0.484034 0.076221 0.000000 0.080374 0.087149 0.000000 0.781513 62 794 912 118
6 0.523222 0.030999 0.444527 0.063658 0.072319 6.006006 0.987342 66 921 1000 79

Default Values (Reconstruction)

In [16]:
plot_pred_acoustic_feature(sample['audio'].numpy()[0], midi_audio_changed.numpy()[0], get_synth_params(params_changed), mask_zero_f0=True)

All vibrato

In [17]:
synth_params_normalized, midi_features, conditioning_dict = model.gen_cond_dict_from_feature(sample, training=False)
conditioning_df = conditioining_dict_to_df(conditioning_dict, sample['onsets'], sample['offsets'], sample['midi'])
conditioning_df['vibrato_rate'] = np.ones_like(conditioning_df['vibrato_rate'].values)*5.25
conditioning_df['pitch_variation_std'] = np.ones_like(conditioning_df['pitch_variation_std'].values)
conditioning_df
Out[17]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.535996 0.016886 1.0 0.064784 0.072786 5.25 0.653333 73 0 74 74
1 0.535169 0.054680 1.0 0.048733 0.150693 5.25 0.750000 71 76 191 115
2 0.484832 0.037266 1.0 0.076973 0.088458 5.25 0.524038 69 192 400 208
3 0.484967 0.031815 1.0 0.072595 0.131842 5.25 0.574713 67 401 487 86
4 0.426444 0.057836 1.0 0.071446 0.053127 5.25 0.310345 69 488 777 289
5 0.484034 0.076221 1.0 0.080374 0.087149 5.25 0.781513 62 794 912 118
6 0.523222 0.030999 1.0 0.063658 0.072319 5.25 0.987342 66 921 1000 79
In [18]:
conditioning_dict = conditioning_df_to_dict(conditioning_df)
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [19]:
plot_pred_acoustic_feature(sample['audio'].numpy()[0], midi_audio_changed.numpy()[0], get_synth_params(params_changed), mask_zero_f0=True)

No vibrato

In [20]:
synth_params_normalized, midi_features, conditioning_dict = model.gen_cond_dict_from_feature(sample, training=False)
conditioning_df = conditioining_dict_to_df(conditioning_dict, sample['onsets'], sample['offsets'], sample['midi'])
conditioning_df['vibrato_rate'] = np.zeros_like(conditioning_df['vibrato_rate'].values)
conditioning_df['pitch_variation_std'] = np.zeros_like(conditioning_df['pitch_variation_std'].values)
conditioning_df
Out[20]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.535996 0.016886 0.0 0.064784 0.072786 0.0 0.653333 73 0 74 74
1 0.535169 0.054680 0.0 0.048733 0.150693 0.0 0.750000 71 76 191 115
2 0.484832 0.037266 0.0 0.076973 0.088458 0.0 0.524038 69 192 400 208
3 0.484967 0.031815 0.0 0.072595 0.131842 0.0 0.574713 67 401 487 86
4 0.426444 0.057836 0.0 0.071446 0.053127 0.0 0.310345 69 488 777 289
5 0.484034 0.076221 0.0 0.080374 0.087149 0.0 0.781513 62 794 912 118
6 0.523222 0.030999 0.0 0.063658 0.072319 0.0 0.987342 66 921 1000 79
In [21]:
conditioning_dict = conditioning_df_to_dict(conditioning_df)
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [22]:
plot_pred_acoustic_feature(sample['audio'].numpy()[0], midi_audio_changed.numpy()[0], get_synth_params(params_changed), mask_zero_f0=True)

All crescendo

In [23]:
synth_params_normalized, midi_features, conditioning_dict = model.gen_cond_dict_from_feature(sample, training=False)
conditioning_df = conditioining_dict_to_df(conditioning_dict, sample['onsets'], sample['offsets'], sample['midi'])
conditioning_df['amplitudes_max_pos'] = np.ones_like(conditioning_df['amplitudes_max_pos'].values)
conditioning_df['loudness_std'] = conditioning_df['loudness_std'].values * 1.5
conditioning_df
Out[23]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.535996 0.025329 0.565990 0.064784 0.072786 6.506506 1.0 73 0 74 74
1 0.535169 0.082021 0.000000 0.048733 0.150693 0.000000 1.0 71 76 191 115
2 0.484832 0.055899 0.613795 0.076973 0.088458 5.005005 1.0 69 192 400 208
3 0.484967 0.047723 0.000000 0.072595 0.131842 0.000000 1.0 67 401 487 86
4 0.426444 0.086753 0.870234 0.071446 0.053127 5.255255 1.0 69 488 777 289
5 0.484034 0.114332 0.000000 0.080374 0.087149 0.000000 1.0 62 794 912 118
6 0.523222 0.046498 0.444527 0.063658 0.072319 6.006006 1.0 66 921 1000 79
In [24]:
conditioning_dict = conditioning_df_to_dict(conditioning_df)
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [25]:
plot_pred_acoustic_feature(sample['audio'].numpy()[0], midi_audio_changed.numpy()[0], get_synth_params(params_changed), mask_zero_f0=True)

All decrescendo

In [26]:
synth_params_normalized, midi_features, conditioning_dict = model.gen_cond_dict_from_feature(sample, training=False)
conditioning_df = conditioining_dict_to_df(conditioning_dict, sample['onsets'], sample['offsets'], sample['midi'])
conditioning_df['amplitudes_max_pos'] = np.zeros_like(conditioning_df['amplitudes_max_pos'].values)
conditioning_df['loudness_std'] = conditioning_df['loudness_std'].values * 1.5
conditioning_df
Out[26]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.535996 0.025329 0.565990 0.064784 0.072786 6.506506 0.0 73 0 74 74
1 0.535169 0.082021 0.000000 0.048733 0.150693 0.000000 0.0 71 76 191 115
2 0.484832 0.055899 0.613795 0.076973 0.088458 5.005005 0.0 69 192 400 208
3 0.484967 0.047723 0.000000 0.072595 0.131842 0.000000 0.0 67 401 487 86
4 0.426444 0.086753 0.870234 0.071446 0.053127 5.255255 0.0 69 488 777 289
5 0.484034 0.114332 0.000000 0.080374 0.087149 0.000000 0.0 62 794 912 118
6 0.523222 0.046498 0.444527 0.063658 0.072319 6.006006 0.0 66 921 1000 79
In [27]:
conditioning_dict = conditioning_df_to_dict(conditioning_df)
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [28]:
plot_pred_acoustic_feature(sample['audio'].numpy()[0], midi_audio_changed.numpy()[0], get_synth_params(params_changed), mask_zero_f0=True)

All staccato

In [34]:
synth_params_normalized, midi_features, conditioning_dict = model.gen_cond_dict_from_feature(sample, training=False)
conditioning_df = conditioining_dict_to_df(conditioning_dict, sample['onsets'], sample['offsets'], sample['midi'])
conditioning_df['amplitudes_max_pos'] = np.zeros_like(conditioning_df['amplitudes_max_pos'].values)
conditioning_df['loudness_std'] = np.ones_like(conditioning_df['loudness_std'].values) * 0.08
conditioning_df['brightness'] = np.ones_like(conditioning_df['brightness'].values) * 0.05
conditioning_df
Out[34]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.535996 0.08 0.565990 0.05 0.072786 6.506506 0.0 73 0 74 74
1 0.535169 0.08 0.000000 0.05 0.150693 0.000000 0.0 71 76 191 115
2 0.484832 0.08 0.613795 0.05 0.088458 5.005005 0.0 69 192 400 208
3 0.484967 0.08 0.000000 0.05 0.131842 0.000000 0.0 67 401 487 86
4 0.426444 0.08 0.870234 0.05 0.053127 5.255255 0.0 69 488 777 289
5 0.484034 0.08 0.000000 0.05 0.087149 0.000000 0.0 62 794 912 118
6 0.523222 0.08 0.444527 0.05 0.072319 6.006006 0.0 66 921 1000 79
In [35]:
conditioning_dict = conditioning_df_to_dict(conditioning_df)
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [36]:
plot_pred_acoustic_feature(sample['audio'].numpy()[0], midi_audio_changed.numpy()[0], get_synth_params(params_changed), mask_zero_f0=True)
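To keep any of these renders for side-by-side listening outside the notebook, soundfile can write the tensors to disk (the filename here is illustrative):

import soundfile as sf

# Write the edited render as a 16 kHz mono WAV file.
sf.write('midi_audio_staccato.wav', midi_audio_changed[0].numpy(), sample_rate)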