In [1]:
# Extend the module search path with the directory three levels up and with the
# parent directory -- presumably so the project-local packages imported further
# down in this cell (data_handling, utils, midiae_interp_cond, ...) resolve;
# confirm against the repository layout.
import sys
sys.path.append('../../..')
sys.path.append('../.')
# Silence warnings for the whole session. With no category argument the
# "ignore" filter matches every Warning subclass, not only deprecation warnings.
import warnings
warnings.filterwarnings("ignore")

import copy
import os
import time
from tqdm import tqdm
import math

import ddsp
import ddsp.training

from data_handling.ddspdataset import DDSPDataset
from utils.training_utils import print_hparams, set_seed, save_results, str2bool
from hparams_midiae_interp_cond import hparams as hp
from midiae_interp_cond.get_model import get_model, get_fake_data

import librosa
import matplotlib.pyplot as plt
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
import pandas as pd
import qgrid

from notebook_utils import *

# Seed through the project's set_seed helper (imported from utils.training_utils).
set_seed(1234)

# Notebook-level sample rate constant (presumably in Hz -- TODO confirm).
# NOTE(review): not referenced by the other cells visible in this chunk, which
# read hp.sample_rate instead; check whether it is still needed.
sample_rate = 16000


# Completion marker so it is obvious that the setup cell ran to the end.
print('Done!')
Done!

Softmax postnet result

I tried using a softmax postnet to predict the difference (diff) in the harmonic distribution and noise magnitudes.

The training loss is pretty low (~4.9), but the generated sample is not perceptually good. It is "noisy", similar to what I have seen with mixture density models (mixture of Gaussians and mixture of logistics):

In [6]:
import utils.audio_io
In [8]:
# Example 0: show the reference clip next to the model's prediction.
# Both clips are loaded at hp.sample_rate, the same rate that is passed to
# plot_spec, so the loading rate and the plotting rate cannot drift apart
# (the load calls previously used a literal 16000).
# NOTE(review): confirm hp.sample_rate is 16000 for this experiment.
ref_wav = utils.audio_io.load_audio(r'/data/6.28/0_ref.wav', hp.sample_rate)
pred_wav = utils.audio_io.load_audio(r'/data/6.28/0_pred.wav', hp.sample_rate)

# One figure, two side-by-side panels: reference on the left, prediction on the right.
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plot_spec(ref_wav, hp.sample_rate, title='original')
plt.subplot(1, 2, 2)
plot_spec(pred_wav, hp.sample_rate, title='prediction')
# Render explicitly so the cell does not echo the repr of its last expression.
plt.show()
In [9]:
# Example 1: show the reference clip next to the model's prediction.
# Both clips are loaded at hp.sample_rate, the same rate that is passed to
# plot_spec, so the loading rate and the plotting rate cannot drift apart
# (the load calls previously used a literal 16000).
# NOTE(review): confirm hp.sample_rate is 16000 for this experiment.
ref_wav = utils.audio_io.load_audio(r'/data/6.28/1_ref.wav', hp.sample_rate)
pred_wav = utils.audio_io.load_audio(r'/data/6.28/1_pred.wav', hp.sample_rate)

# One figure, two side-by-side panels: reference on the left, prediction on the right.
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plot_spec(ref_wav, hp.sample_rate, title='original')
plt.subplot(1, 2, 2)
plot_spec(pred_wav, hp.sample_rate, title='prediction')
# Render explicitly so the cell does not echo the repr of its last expression.
plt.show()