In [1]:
# Suppress deprecation and other warnings
import sys
sys.path.append('../..')
import warnings
warnings.filterwarnings("ignore")

import copy
import os
import time
from tqdm import tqdm
import math

import ddsp
import ddsp.training

from data_handling.ddspdataset import DDSPDataset
from utils.training_utils import print_hparams, set_seed, save_results, str2bool
from hparams_midiae_interp_cond import hparams as hp
from midiae_interp_cond.get_model import get_model, get_fake_data

import librosa
import matplotlib.pyplot as plt
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
import pandas as pd
import qgrid

from notebook_utils import *

set_seed(1234)

# Helper constants
sample_rate = 16000


print('Done!')
Done!
In [2]:
model_path = r'/data/ddsp-experiment/logs/5.13_samples/150000'
hp_dict = get_hp(os.path.join(os.path.dirname(model_path), 'train.log'))
for k, v in hp_dict.items():
    setattr(hp, k, v)
hp.sequence_length = 1000  # override the sequence length used for evaluation
In [3]:
# from data_handling.urmp_tfrecord_dataloader import UrmpMidi
# from data_handling.get_tfrecord_length import get_tfrecord_length
# data_dir = r'/data/music_dataset/urmp_dataset/tfrecord_ddsp/batched/solo_instrument'
# test_data_loader = UrmpMidi(data_dir, instrument_key='vn', split='test')
# evaluation_data = test_data_loader.get_batch(batch_size=1, shuffle=True, repeats=1)

from data_handling.google_solo_inst_dataloader import GoogleSoloInstrument
test_data_loader = GoogleSoloInstrument(base_dir=r'/data/music_dataset/solo_performance_google/solo-inst_midi_features', instrument_key='sax', split='test')
evaluation_data = test_data_loader.get_batch(batch_size=1, shuffle=True, repeats=1)
In [4]:
evaluation_data = iter(evaluation_data)
In [5]:
model = get_model(hp)
_ = model._build(get_fake_data(hp))
model.load_weights(model_path)
Out[5]:
<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f416445bf90>
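
Since load_weights returns a CheckpointLoadStatus here, a quick sanity check can confirm that the restore matched the model built above. A minimal sketch; it assumes the checkpoint covers every variable created by _build:

# Optional sanity check on the restore (assumes a fully matching checkpoint).
status = model.load_weights(model_path)
# Raises if any variable built above was not found in the checkpoint.
status.assert_existing_objects_matched()
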
In [6]:
sample = next(evaluation_data)
In [7]:
from midiae_interp_cond.interpretable_conditioning import midi_to_hz, get_interpretable_conditioning, extract_harm_controls
In [8]:
plot_spec(sample['audio'][0].numpy(), sr=16000)
In [9]:
synth_params, control_params, synth_audio = model.run_synth_coder(sample, training=False)
synth_params_normalized, midi_features, conditioning_dict = model.gen_cond_dict_from_feature(sample, training=False)
In [10]:
midi_audio, params = model.gen_audio_from_cond_dict(conditioning_dict, midi_features, instrument_id=sample['instrument_id'])

Synth-coder prediction (loudness, f0, mel -> synth params)

In [11]:
f0, amps, hd, noise = synth_params_normalized
f0_midi = ddsp.core.hz_to_midi(f0)
synth_params_normalized = (f0_midi, amps, hd, noise)
plot_pred_acoustic_feature(sample['audio'].numpy()[0], synth_audio.numpy()[0], get_synth_params(synth_params_normalized), mask_zero_f0=True)

Conditioning

What each conditioning feature controls:

Loudness:

  • loudness_mean: the overall volume of a note
  • loudness_std: the extent of volume change within a note (crescendo and decrescendo)
  • amplitudes_max_pos: the relative position (0-1) within a note at which the amplitude reaches its maximum (0 = decrescendo, 1 = crescendo)

Attack:

  • attack_level: the strength of the note attack (the average amount of noise in the first 10 frames of each note)

Timbre:

  • brightness: the average timbre of a note (the centroid of the harmonic distribution)

Pitch:

  • pitch_variation_std: the extent of the vibrato, taken from the amplitude of the rFFT of the pitch curve (a more accurate name would be "vibrato extent", but the original name is kept for compatibility)
  • vibrato_rate: the rate of the vibrato (also taken from the rFFT)

All conditioning features are pooled to one value per note; the values for rests are masked to 0.
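
As a rough illustration of how two of these note-pooled features could be derived, the sketch below computes a brightness-like harmonic centroid and a vibrato rate/extent from the rFFT of a note's pitch curve. This is not the model's actual extraction code (see midiae_interp_cond.interpretable_conditioning for that); the array shapes, the 250 Hz frame rate, and the normalization are assumptions.

import numpy as np

FRAME_RATE = 250  # assumed frame rate: 1000 frames ~ 4 s of 16 kHz audio

def note_brightness(harmonic_distribution):
    # Centroid of the harmonic distribution, averaged over a note's frames.
    # harmonic_distribution: [n_frames, n_harmonics], each row summing to 1.
    harmonic_numbers = np.arange(1, harmonic_distribution.shape[-1] + 1)
    centroid_per_frame = (harmonic_distribution * harmonic_numbers).sum(axis=-1)
    return centroid_per_frame.mean()

def note_vibrato(f0_midi):
    # Vibrato extent and rate from the rFFT of the detrended pitch curve.
    # f0_midi: [n_frames] pitch of a single note, in MIDI units.
    deviation = f0_midi - f0_midi.mean()
    spectrum = np.abs(np.fft.rfft(deviation))
    freqs = np.fft.rfftfreq(len(deviation), d=1.0 / FRAME_RATE)
    peak = spectrum[1:].argmax() + 1  # skip the DC bin
    vibrato_extent = spectrum[peak] / len(deviation)
    vibrato_rate_hz = freqs[peak]
    return vibrato_extent, vibrato_rate_hz
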

In [12]:
# sample = next(evaluation_data)
In [13]:
synth_params_normalized, midi_features, conditioning_dict = model.gen_cond_dict_from_feature(sample, training=False)
In [14]:
conditioning_df = conditioining_dict_to_df(conditioning_dict, sample['onsets'], sample['offsets'], sample['midi'])
qgrid_widget = qgrid.show_grid(conditioning_df, show_toolbar=True)
qgrid_widget
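
The qgrid widget is optional; the same edits can be made directly on the DataFrame with pandas. A minimal sketch (the edited values are illustrative, reusing the column names shown in the table output below):

# Programmatic alternative to the qgrid widget: edit the DataFrame in place.
conditioning_df_changed = conditioning_df.copy()
# Illustrative edit: add vibrato to the longest note.
longest = conditioning_df_changed['note_length'].idxmax()
conditioning_df_changed.loc[longest, 'vibrato_rate'] = 5.25
conditioning_df_changed.loc[longest, 'pitch_variation_std'] = 1.0
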
In [15]:
conditioning_df_changed = qgrid_widget.get_changed_df()
conditioning_dict = conditioning_df_to_dict(conditioning_df_changed, length=1000)
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [16]:
conditioning_df_changed
Out[16]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.311018 0.072201 0.000000 0.078191 0.266380 0.000000 0.000000 62 0 39 39
1 0.359263 0.124708 0.000000 0.056880 0.174391 0.000000 0.928571 60 40 53 13
2 0.645508 0.044678 0.709127 0.117619 0.376115 6.256256 0.050505 58 54 152 98
3 0.532684 0.003021 0.000000 0.054896 0.213675 0.000000 0.000000 57 153 154 1
4 0.386477 0.099897 0.000000 0.050006 0.231448 0.000000 0.000000 58 155 187 32
5 0.209911 0.010479 0.000000 0.061585 0.206175 0.000000 0.000000 59 188 191 3
6 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0 192 234 42
7 0.093225 0.019836 0.000000 0.093652 0.161212 0.000000 0.103448 58 235 263 28
8 0.093819 0.007367 0.000000 0.084131 0.054407 0.000000 0.500000 57 264 265 1
9 0.092428 0.000000 0.000000 0.099066 0.082864 0.000000 0.000000 58 266 281 15
10 0.660671 0.083924 0.000000 0.124992 0.323896 0.000000 0.593750 58 282 313 31
11 0.599143 0.007453 0.000000 0.144421 0.465159 0.000000 0.500000 59 314 315 1
12 0.666812 0.042902 0.000000 0.114269 0.433716 0.000000 0.666667 60 316 336 20
13 0.680449 0.033706 0.000000 0.116891 0.403745 0.000000 0.666667 61 337 366 29
14 0.659927 0.030461 0.000000 0.109713 0.398930 0.000000 0.370370 60 367 393 26
15 0.675361 0.025488 0.000000 0.151527 0.384414 0.000000 0.242857 58 394 463 69
16 0.621580 0.026350 0.000000 0.158175 0.270096 0.000000 0.000000 59 464 469 5
17 0.640321 0.067031 0.413912 0.144639 0.272773 7.257257 0.177632 57 470 621 151
18 0.457197 0.014537 0.000000 0.039420 0.234420 0.000000 0.000000 54 622 626 4
19 0.700031 0.059286 0.000000 0.187541 0.283101 0.000000 0.180328 55 627 687 60
20 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0 688 694 6
21 0.540478 0.045907 0.000000 0.068476 0.280638 0.000000 0.675676 50 695 731 36
22 0.473436 0.098042 0.000000 0.044233 0.274693 0.000000 0.928571 52 732 745 13
23 0.720140 0.017563 0.000000 0.221445 0.369850 0.000000 0.067797 53 746 804 58
24 0.632591 0.010688 0.000000 0.155975 0.388803 0.000000 0.000000 54 805 806 1
25 0.681596 0.061116 0.105454 0.182880 0.364497 4.004004 0.237705 55 807 928 121
26 0.517686 0.013825 0.000000 0.102918 0.383765 0.000000 0.000000 54 929 931 2
27 0.565969 0.010929 0.000000 0.117668 0.312407 0.000000 0.153846 53 932 944 12
28 0.505904 0.050271 0.000000 0.083352 0.280559 0.000000 0.000000 54 945 952 7
29 0.405626 0.012881 0.000000 0.035917 0.136089 0.000000 0.200000 55 953 957 4
30 0.353328 0.010900 0.000000 0.030772 0.177271 0.000000 0.000000 56 958 964 6
31 0.351082 0.007010 0.000000 0.033235 0.260542 0.000000 0.666667 55 965 970 5
32 0.495271 0.128915 0.000000 0.045689 0.271796 0.000000 0.944444 54 971 988 17
33 0.718423 0.012859 0.000000 0.214997 0.355494 0.000000 0.090909 55 989 999 10

Default Value (Reconstruction)

In [17]:
plot_pred_acoustic_feature(sample['audio'].numpy()[0], midi_audio_changed.numpy()[0], get_synth_params(params_changed), mask_zero_f0=True)

All vibrato

In [18]:
synth_params_normalized, midi_features, conditioning_dict = model.gen_cond_dict_from_feature(sample, training=False)
conditioning_df = conditioining_dict_to_df(conditioning_dict, sample['onsets'], sample['offsets'], sample['midi'])
conditioning_df['vibrato_rate'] = np.ones_like(conditioning_df['vibrato_rate'].values)*5.25
conditioning_df['pitch_variation_std'] = np.ones_like(conditioning_df['pitch_variation_std'].values)
conditioning_df
Out[18]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.311018 0.072201 1.0 0.078191 0.266380 5.25 0.000000 62 0 39 39
1 0.359263 0.124708 1.0 0.056880 0.174391 5.25 0.928571 60 40 53 13
2 0.645508 0.044678 1.0 0.117619 0.376115 5.25 0.050505 58 54 152 98
3 0.532684 0.003021 1.0 0.054896 0.213675 5.25 0.000000 57 153 154 1
4 0.386477 0.099897 1.0 0.050006 0.231448 5.25 0.000000 58 155 187 32
5 0.209911 0.010479 1.0 0.061585 0.206175 5.25 0.000000 59 188 191 3
6 0.000000 0.000000 1.0 0.000000 0.000000 5.25 0.000000 0 192 234 42
7 0.093225 0.019836 1.0 0.093652 0.161212 5.25 0.103448 58 235 263 28
8 0.093819 0.007367 1.0 0.084131 0.054407 5.25 0.500000 57 264 265 1
9 0.092428 0.000000 1.0 0.099066 0.082864 5.25 0.000000 58 266 281 15
10 0.660671 0.083924 1.0 0.124992 0.323896 5.25 0.593750 58 282 313 31
11 0.599143 0.007453 1.0 0.144421 0.465159 5.25 0.500000 59 314 315 1
12 0.666812 0.042902 1.0 0.114269 0.433716 5.25 0.666667 60 316 336 20
13 0.680449 0.033706 1.0 0.116891 0.403745 5.25 0.666667 61 337 366 29
14 0.659927 0.030461 1.0 0.109713 0.398930 5.25 0.370370 60 367 393 26
15 0.675361 0.025488 1.0 0.151527 0.384414 5.25 0.242857 58 394 463 69
16 0.621580 0.026350 1.0 0.158175 0.270096 5.25 0.000000 59 464 469 5
17 0.640321 0.067031 1.0 0.144639 0.272773 5.25 0.177632 57 470 621 151
18 0.457197 0.014537 1.0 0.039420 0.234420 5.25 0.000000 54 622 626 4
19 0.700031 0.059286 1.0 0.187541 0.283101 5.25 0.180328 55 627 687 60
20 0.000000 0.000000 1.0 0.000000 0.000000 5.25 0.000000 0 688 694 6
21 0.540478 0.045907 1.0 0.068476 0.280638 5.25 0.675676 50 695 731 36
22 0.473436 0.098042 1.0 0.044233 0.274693 5.25 0.928571 52 732 745 13
23 0.720140 0.017563 1.0 0.221445 0.369850 5.25 0.067797 53 746 804 58
24 0.632591 0.010688 1.0 0.155975 0.388803 5.25 0.000000 54 805 806 1
25 0.681596 0.061116 1.0 0.182880 0.364497 5.25 0.237705 55 807 928 121
26 0.517686 0.013825 1.0 0.102918 0.383765 5.25 0.000000 54 929 931 2
27 0.565969 0.010929 1.0 0.117668 0.312407 5.25 0.153846 53 932 944 12
28 0.505904 0.050271 1.0 0.083352 0.280559 5.25 0.000000 54 945 952 7
29 0.405626 0.012881 1.0 0.035917 0.136089 5.25 0.200000 55 953 957 4
30 0.353328 0.010900 1.0 0.030772 0.177271 5.25 0.000000 56 958 964 6
31 0.351082 0.007010 1.0 0.033235 0.260542 5.25 0.666667 55 965 970 5
32 0.495271 0.128915 1.0 0.045689 0.271796 5.25 0.944444 54 971 988 17
33 0.718423 0.012859 1.0 0.214997 0.355494 5.25 0.090909 55 989 999 10
In [19]:
conditioning_dict = conditioning_df_to_dict(conditioning_df, length=1000)
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [20]:
plot_pred_acoustic_feature(sample['audio'].numpy()[0], midi_audio_changed.numpy()[0], get_synth_params(params_changed), mask_zero_f0=True)

No vibrato

In [21]:
synth_params_normalized, midi_features, conditioning_dict = model.gen_cond_dict_from_feature(sample, training=False)
conditioning_df = conditioining_dict_to_df(conditioning_dict, sample['onsets'], sample['offsets'], sample['midi'])
conditioning_df['vibrato_rate'] = np.zeros_like(conditioning_df['vibrato_rate'].values)
conditioning_df['pitch_variation_std'] = np.zeros_like(conditioning_df['pitch_variation_std'].values)
conditioning_df
Out[21]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.311018 0.072201 0.0 0.078191 0.266380 0.0 0.000000 62 0 39 39
1 0.359263 0.124708 0.0 0.056880 0.174391 0.0 0.928571 60 40 53 13
2 0.645508 0.044678 0.0 0.117619 0.376115 0.0 0.050505 58 54 152 98
3 0.532684 0.003021 0.0 0.054896 0.213675 0.0 0.000000 57 153 154 1
4 0.386477 0.099897 0.0 0.050006 0.231448 0.0 0.000000 58 155 187 32
5 0.209911 0.010479 0.0 0.061585 0.206175 0.0 0.000000 59 188 191 3
6 0.000000 0.000000 0.0 0.000000 0.000000 0.0 0.000000 0 192 234 42
7 0.093225 0.019836 0.0 0.093652 0.161212 0.0 0.103448 58 235 263 28
8 0.093819 0.007367 0.0 0.084131 0.054407 0.0 0.500000 57 264 265 1
9 0.092428 0.000000 0.0 0.099066 0.082864 0.0 0.000000 58 266 281 15
10 0.660671 0.083924 0.0 0.124992 0.323896 0.0 0.593750 58 282 313 31
11 0.599143 0.007453 0.0 0.144421 0.465159 0.0 0.500000 59 314 315 1
12 0.666812 0.042902 0.0 0.114269 0.433716 0.0 0.666667 60 316 336 20
13 0.680449 0.033706 0.0 0.116891 0.403745 0.0 0.666667 61 337 366 29
14 0.659927 0.030461 0.0 0.109713 0.398930 0.0 0.370370 60 367 393 26
15 0.675361 0.025488 0.0 0.151527 0.384414 0.0 0.242857 58 394 463 69
16 0.621580 0.026350 0.0 0.158175 0.270096 0.0 0.000000 59 464 469 5
17 0.640321 0.067031 0.0 0.144639 0.272773 0.0 0.177632 57 470 621 151
18 0.457197 0.014537 0.0 0.039420 0.234420 0.0 0.000000 54 622 626 4
19 0.700031 0.059286 0.0 0.187541 0.283101 0.0 0.180328 55 627 687 60
20 0.000000 0.000000 0.0 0.000000 0.000000 0.0 0.000000 0 688 694 6
21 0.540478 0.045907 0.0 0.068476 0.280638 0.0 0.675676 50 695 731 36
22 0.473436 0.098042 0.0 0.044233 0.274693 0.0 0.928571 52 732 745 13
23 0.720140 0.017563 0.0 0.221445 0.369850 0.0 0.067797 53 746 804 58
24 0.632591 0.010688 0.0 0.155975 0.388803 0.0 0.000000 54 805 806 1
25 0.681596 0.061116 0.0 0.182880 0.364497 0.0 0.237705 55 807 928 121
26 0.517686 0.013825 0.0 0.102918 0.383765 0.0 0.000000 54 929 931 2
27 0.565969 0.010929 0.0 0.117668 0.312407 0.0 0.153846 53 932 944 12
28 0.505904 0.050271 0.0 0.083352 0.280559 0.0 0.000000 54 945 952 7
29 0.405626 0.012881 0.0 0.035917 0.136089 0.0 0.200000 55 953 957 4
30 0.353328 0.010900 0.0 0.030772 0.177271 0.0 0.000000 56 958 964 6
31 0.351082 0.007010 0.0 0.033235 0.260542 0.0 0.666667 55 965 970 5
32 0.495271 0.128915 0.0 0.045689 0.271796 0.0 0.944444 54 971 988 17
33 0.718423 0.012859 0.0 0.214997 0.355494 0.0 0.090909 55 989 999 10
In [22]:
conditioning_dict = conditioning_df_to_dict(conditioning_df, length=1000)
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [23]:
plot_pred_acoustic_feature(sample['audio'].numpy()[0], midi_audio_changed.numpy()[0], get_synth_params(params_changed), mask_zero_f0=True)

All crescendo

In [24]:
synth_params_normalized, midi_features, conditioning_dict = model.gen_cond_dict_from_feature(sample, training=False)
conditioning_df = conditioining_dict_to_df(conditioning_dict, sample['onsets'], sample['offsets'], sample['midi'])
conditioning_df['amplitudes_max_pos'] = np.ones_like(conditioning_df['amplitudes_max_pos'].values)
conditioning_df['loudness_std'] = np.ones_like(conditioning_df['loudness_std'].values) * 0.15
conditioning_df
Out[24]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.311018 0.15 0.000000 0.078191 0.266380 0.000000 1.0 62 0 39 39
1 0.359263 0.15 0.000000 0.056880 0.174391 0.000000 1.0 60 40 53 13
2 0.645508 0.15 0.709127 0.117619 0.376115 6.256256 1.0 58 54 152 98
3 0.532684 0.15 0.000000 0.054896 0.213675 0.000000 1.0 57 153 154 1
4 0.386477 0.15 0.000000 0.050006 0.231448 0.000000 1.0 58 155 187 32
5 0.209911 0.15 0.000000 0.061585 0.206175 0.000000 1.0 59 188 191 3
6 0.000000 0.15 0.000000 0.000000 0.000000 0.000000 1.0 0 192 234 42
7 0.093225 0.15 0.000000 0.093652 0.161212 0.000000 1.0 58 235 263 28
8 0.093819 0.15 0.000000 0.084131 0.054407 0.000000 1.0 57 264 265 1
9 0.092428 0.15 0.000000 0.099066 0.082864 0.000000 1.0 58 266 281 15
10 0.660671 0.15 0.000000 0.124992 0.323896 0.000000 1.0 58 282 313 31
11 0.599143 0.15 0.000000 0.144421 0.465159 0.000000 1.0 59 314 315 1
12 0.666812 0.15 0.000000 0.114269 0.433716 0.000000 1.0 60 316 336 20
13 0.680449 0.15 0.000000 0.116891 0.403745 0.000000 1.0 61 337 366 29
14 0.659927 0.15 0.000000 0.109713 0.398930 0.000000 1.0 60 367 393 26
15 0.675361 0.15 0.000000 0.151527 0.384414 0.000000 1.0 58 394 463 69
16 0.621580 0.15 0.000000 0.158175 0.270096 0.000000 1.0 59 464 469 5
17 0.640321 0.15 0.413912 0.144639 0.272773 7.257257 1.0 57 470 621 151
18 0.457197 0.15 0.000000 0.039420 0.234420 0.000000 1.0 54 622 626 4
19 0.700031 0.15 0.000000 0.187541 0.283101 0.000000 1.0 55 627 687 60
20 0.000000 0.15 0.000000 0.000000 0.000000 0.000000 1.0 0 688 694 6
21 0.540478 0.15 0.000000 0.068476 0.280638 0.000000 1.0 50 695 731 36
22 0.473436 0.15 0.000000 0.044233 0.274693 0.000000 1.0 52 732 745 13
23 0.720140 0.15 0.000000 0.221445 0.369850 0.000000 1.0 53 746 804 58
24 0.632591 0.15 0.000000 0.155975 0.388803 0.000000 1.0 54 805 806 1
25 0.681596 0.15 0.105454 0.182880 0.364497 4.004004 1.0 55 807 928 121
26 0.517686 0.15 0.000000 0.102918 0.383765 0.000000 1.0 54 929 931 2
27 0.565969 0.15 0.000000 0.117668 0.312407 0.000000 1.0 53 932 944 12
28 0.505904 0.15 0.000000 0.083352 0.280559 0.000000 1.0 54 945 952 7
29 0.405626 0.15 0.000000 0.035917 0.136089 0.000000 1.0 55 953 957 4
30 0.353328 0.15 0.000000 0.030772 0.177271 0.000000 1.0 56 958 964 6
31 0.351082 0.15 0.000000 0.033235 0.260542 0.000000 1.0 55 965 970 5
32 0.495271 0.15 0.000000 0.045689 0.271796 0.000000 1.0 54 971 988 17
33 0.718423 0.15 0.000000 0.214997 0.355494 0.000000 1.0 55 989 999 10
In [25]:
conditioning_dict = conditioning_df_to_dict(conditioning_df, length=1000)
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [26]:
plot_pred_acoustic_feature(sample['audio'].numpy()[0], midi_audio_changed.numpy()[0], get_synth_params(params_changed), mask_zero_f0=True)

All decrescendo

In [27]:
synth_params_normalized, midi_features, conditioning_dict = model.gen_cond_dict_from_feature(sample, training=False)
conditioning_df = conditioining_dict_to_df(conditioning_dict, sample['onsets'], sample['offsets'], sample['midi'])
conditioning_df['amplitudes_max_pos'] = np.zeros_like(conditioning_df['amplitudes_max_pos'].values)
conditioning_df['loudness_std'] = np.ones_like(conditioning_df['loudness_std'].values) * 0.2
# conditioning_df['brightness'] = conditioning_df['brightness'].values * 0.8
conditioning_df
Out[27]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.311018 0.2 0.000000 0.078191 0.266380 0.000000 0.0 62 0 39 39
1 0.359263 0.2 0.000000 0.056880 0.174391 0.000000 0.0 60 40 53 13
2 0.645508 0.2 0.709127 0.117619 0.376115 6.256256 0.0 58 54 152 98
3 0.532684 0.2 0.000000 0.054896 0.213675 0.000000 0.0 57 153 154 1
4 0.386477 0.2 0.000000 0.050006 0.231448 0.000000 0.0 58 155 187 32
5 0.209911 0.2 0.000000 0.061585 0.206175 0.000000 0.0 59 188 191 3
6 0.000000 0.2 0.000000 0.000000 0.000000 0.000000 0.0 0 192 234 42
7 0.093225 0.2 0.000000 0.093652 0.161212 0.000000 0.0 58 235 263 28
8 0.093819 0.2 0.000000 0.084131 0.054407 0.000000 0.0 57 264 265 1
9 0.092428 0.2 0.000000 0.099066 0.082864 0.000000 0.0 58 266 281 15
10 0.660671 0.2 0.000000 0.124992 0.323896 0.000000 0.0 58 282 313 31
11 0.599143 0.2 0.000000 0.144421 0.465159 0.000000 0.0 59 314 315 1
12 0.666812 0.2 0.000000 0.114269 0.433716 0.000000 0.0 60 316 336 20
13 0.680449 0.2 0.000000 0.116891 0.403745 0.000000 0.0 61 337 366 29
14 0.659927 0.2 0.000000 0.109713 0.398930 0.000000 0.0 60 367 393 26
15 0.675361 0.2 0.000000 0.151527 0.384414 0.000000 0.0 58 394 463 69
16 0.621580 0.2 0.000000 0.158175 0.270096 0.000000 0.0 59 464 469 5
17 0.640321 0.2 0.413912 0.144639 0.272773 7.257257 0.0 57 470 621 151
18 0.457197 0.2 0.000000 0.039420 0.234420 0.000000 0.0 54 622 626 4
19 0.700031 0.2 0.000000 0.187541 0.283101 0.000000 0.0 55 627 687 60
20 0.000000 0.2 0.000000 0.000000 0.000000 0.000000 0.0 0 688 694 6
21 0.540478 0.2 0.000000 0.068476 0.280638 0.000000 0.0 50 695 731 36
22 0.473436 0.2 0.000000 0.044233 0.274693 0.000000 0.0 52 732 745 13
23 0.720140 0.2 0.000000 0.221445 0.369850 0.000000 0.0 53 746 804 58
24 0.632591 0.2 0.000000 0.155975 0.388803 0.000000 0.0 54 805 806 1
25 0.681596 0.2 0.105454 0.182880 0.364497 4.004004 0.0 55 807 928 121
26 0.517686 0.2 0.000000 0.102918 0.383765 0.000000 0.0 54 929 931 2
27 0.565969 0.2 0.000000 0.117668 0.312407 0.000000 0.0 53 932 944 12
28 0.505904 0.2 0.000000 0.083352 0.280559 0.000000 0.0 54 945 952 7
29 0.405626 0.2 0.000000 0.035917 0.136089 0.000000 0.0 55 953 957 4
30 0.353328 0.2 0.000000 0.030772 0.177271 0.000000 0.0 56 958 964 6
31 0.351082 0.2 0.000000 0.033235 0.260542 0.000000 0.0 55 965 970 5
32 0.495271 0.2 0.000000 0.045689 0.271796 0.000000 0.0 54 971 988 17
33 0.718423 0.2 0.000000 0.214997 0.355494 0.000000 0.0 55 989 999 10
In [28]:
conditioning_dict = conditioning_df_to_dict(conditioning_df, length=1000)
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [29]:
plot_pred_acoustic_feature(sample['audio'].numpy()[0], midi_audio_changed.numpy()[0], get_synth_params(params_changed), mask_zero_f0=True)

All staccato

In [30]:
synth_params_normalized, midi_features, conditioning_dict = model.gen_cond_dict_from_feature(sample, training=False)
conditioning_df = conditioining_dict_to_df(conditioning_dict, sample['onsets'], sample['offsets'], sample['midi'])
conditioning_df['amplitudes_max_pos'] = np.zeros_like(conditioning_df['amplitudes_max_pos'].values)
conditioning_df['loudness_std'] = np.ones_like(conditioning_df['loudness_std'].values) * 0.2
conditioning_df['brightness'] = conditioning_df['brightness'].values * 0.45
conditioning_df
Out[30]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.311018 0.2 0.000000 0.035186 0.266380 0.000000 0.0 62 0 39 39
1 0.359263 0.2 0.000000 0.025596 0.174391 0.000000 0.0 60 40 53 13
2 0.645508 0.2 0.709127 0.052928 0.376115 6.256256 0.0 58 54 152 98
3 0.532684 0.2 0.000000 0.024703 0.213675 0.000000 0.0 57 153 154 1
4 0.386477 0.2 0.000000 0.022503 0.231448 0.000000 0.0 58 155 187 32
5 0.209911 0.2 0.000000 0.027713 0.206175 0.000000 0.0 59 188 191 3
6 0.000000 0.2 0.000000 0.000000 0.000000 0.000000 0.0 0 192 234 42
7 0.093225 0.2 0.000000 0.042144 0.161212 0.000000 0.0 58 235 263 28
8 0.093819 0.2 0.000000 0.037859 0.054407 0.000000 0.0 57 264 265 1
9 0.092428 0.2 0.000000 0.044580 0.082864 0.000000 0.0 58 266 281 15
10 0.660671 0.2 0.000000 0.056247 0.323896 0.000000 0.0 58 282 313 31
11 0.599143 0.2 0.000000 0.064989 0.465159 0.000000 0.0 59 314 315 1
12 0.666812 0.2 0.000000 0.051421 0.433716 0.000000 0.0 60 316 336 20
13 0.680449 0.2 0.000000 0.052601 0.403745 0.000000 0.0 61 337 366 29
14 0.659927 0.2 0.000000 0.049371 0.398930 0.000000 0.0 60 367 393 26
15 0.675361 0.2 0.000000 0.068187 0.384414 0.000000 0.0 58 394 463 69
16 0.621580 0.2 0.000000 0.071179 0.270096 0.000000 0.0 59 464 469 5
17 0.640321 0.2 0.413912 0.065088 0.272773 7.257257 0.0 57 470 621 151
18 0.457197 0.2 0.000000 0.017739 0.234420 0.000000 0.0 54 622 626 4
19 0.700031 0.2 0.000000 0.084394 0.283101 0.000000 0.0 55 627 687 60
20 0.000000 0.2 0.000000 0.000000 0.000000 0.000000 0.0 0 688 694 6
21 0.540478 0.2 0.000000 0.030814 0.280638 0.000000 0.0 50 695 731 36
22 0.473436 0.2 0.000000 0.019905 0.274693 0.000000 0.0 52 732 745 13
23 0.720140 0.2 0.000000 0.099650 0.369850 0.000000 0.0 53 746 804 58
24 0.632591 0.2 0.000000 0.070189 0.388803 0.000000 0.0 54 805 806 1
25 0.681596 0.2 0.105454 0.082296 0.364497 4.004004 0.0 55 807 928 121
26 0.517686 0.2 0.000000 0.046313 0.383765 0.000000 0.0 54 929 931 2
27 0.565969 0.2 0.000000 0.052951 0.312407 0.000000 0.0 53 932 944 12
28 0.505904 0.2 0.000000 0.037508 0.280559 0.000000 0.0 54 945 952 7
29 0.405626 0.2 0.000000 0.016162 0.136089 0.000000 0.0 55 953 957 4
30 0.353328 0.2 0.000000 0.013847 0.177271 0.000000 0.0 56 958 964 6
31 0.351082 0.2 0.000000 0.014956 0.260542 0.000000 0.0 55 965 970 5
32 0.495271 0.2 0.000000 0.020560 0.271796 0.000000 0.0 54 971 988 17
33 0.718423 0.2 0.000000 0.096749 0.355494 0.000000 0.0 55 989 999 10
In [31]:
conditioning_dict = conditioning_df_to_dict(conditioning_df, length=1000)
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [32]:
plot_pred_acoustic_feature(sample['audio'].numpy()[0], midi_audio_changed.numpy()[0], get_synth_params(params_changed), mask_zero_f0=True)