Very untidy first working draft of the idea mentioned on the efficient computation page. Here, I fit a spectral mixture to some audio data to build a “generative model” for audio. I’ll implement efficient sampling later, and I’ll replace the arbitrary way this is trained with an LSTM-RNN to go straight from text/spectrograms to waveforms.

The idea, though, is that we fit a Gaussian process to some waveforms in the frequency domain rather than the time domain. This also showcases the power of SymPy. This is original work - I drew from Richard Turner’s thesis for the amplitude demodulation part, Andrew Gordon Wilson’s papers for the spectral kernel and the LJSpeech dataset for the speech data. This was a learning exercise - it’s not meant to be groundbreaking; however, a motivation is to create a simple model for speech that should give reasonable results on modest hardware.

Synthesized audio sample:


Python Code
# conda activate gpsynth; ipython

# import pystan
# import simpleaudio as sa
# from statsmodels.stats.correlation_tools import cov_nearest

import sympy as sy
import numpy as np
import pandas as pd
import pickle as pkl
import tensorflow as tf
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
from scipy.linalg import toeplitz
from scipy.io import wavfile as wav
from scipy.signal import stft, find_peaks
from scipy.interpolate import UnivariateSpline

sy.init_printing()
md = sy.functions.Abs
sess = tf.InteractiveSession()
plt.style.use("ggplot"); plt.ion()

def spectrum(x, u = 1, return_freq = True):
	"""Empirical power spectral density (periodogram) of a signal.

	Squares the FFT coefficient magnitudes, scales by u/n**2 and keeps
	only the positive-frequency half of the spectrum (DC excluded).
	When return_freq is True, the matching frequency grid is returned
	alongside the power values.
	"""
	n = len(x)
	coeffs = np.fft.fft(x)
	power = (coeffs * coeffs.conjugate()).real * u / n**2
	half = int(np.ceil(0.5 * (n + 1)))
	psd = power[1:half]
	if not return_freq:
		return psd
	m = len(psd)
	freq = np.arange(1, m + 1, dtype = float) * (0.5 * n / (u * m))
	return freq, psd

class SimulationData:
	"""Simulate zero-pinned sample paths from a stationary Gaussian process.

	Workflow: gen_grid() builds a regular time grid, gen_expcos_kern()
	(or a user kernel passed to gen_chol()) defines the covariance,
	gen_chol() conditions it on zero boundary values and factorises it,
	and simulate() draws sample paths whose first and last grid values
	are exactly zero.
	"""
	def __init__(self):
		# State is filled in lazily by the gen_* methods below.
		self.num_data = None      # number of grid points
		self.time = None          # the time grid (np.linspace)
		self.upper_lim = None     # right end of the grid; grid starts at 0
		self.kernel_func = None   # callable mapping lag r -> covariance k(r)
		self.cholesky_fac = None  # Cholesky factor of the conditioned covariance
	def gen_grid(self, num_data = 500, upper_lim = 5):
		"""Create a regular grid of num_data points on [0, upper_lim]."""
		self.num_data = num_data
		self.upper_lim = upper_lim
		self.time = np.linspace(0, upper_lim, num_data)
	def gen_chol(self, kernel_func = None):
		"""Build and factorise the covariance matrix on the current grid.

		The grid is regular, so the covariance is Toeplitz and only its
		first row (the kernel evaluated at the grid lags) is needed.
		"""
		if kernel_func is None:
			kernel_func = self.kernel_func
		C = toeplitz(kernel_func(self.time))
		C += np.identity(len(self.time))*1e-10  # jitter for numerical stability
		C = self.condition_at_zero(C)
		C += np.identity(len(self.time) - 2)*1e-10  # jitter again post-conditioning
		C = np.linalg.cholesky(C)
		self.cholesky_fac = C
	def simulate(self):
		"""Draw one sample path; both end points are pinned to zero."""
		C = self.cholesky_fac
		z = np.random.normal(size = self.num_data - 2)
		return np.hstack([0.0, (C @ z).reshape(-1), 0.0])
	def gen_expcos_kern(self, l = None, p = None, s = None, noise = 0.0, lambdify = True):
		"""Set up the exponentiated-quadratic-times-cosine (spectral mixture) kernel.

		k(r) = s^2 exp(-(r/l)^2) cos(2 pi p r), plus white noise of
		variance noise^2 at lag zero.  Stores self.kernel_func,
		self.spectrum (Fourier transform of the kernel) and self.white.
		Raises NotImplementedError unless numeric l, p and s are all
		supplied, so the purely symbolic path is currently unsupported.
		"""
		sm = dict(r = sy.symbols("r", real = True),
			w = sy.symbols("w", real = True, positive = True),
			l = sy.symbols("l", real = True, positive = True),
			p = sy.symbols("p", real = True, positive = True),
			s = sy.symbols("s", real = True, positive = True))
		if l is not None: sm['l'] = l
		if p is not None: sm['p'] = p
		if s is not None: sm['s'] = s
		if None in [l, p, s]: raise NotImplementedError
		self.symbols = sm
		K = sm['s']**2 * sy.exp(-(sm['r']/sm['l'])**2) * sy.cos(2*sm['p']*sy.pi*sm['r'])
		S = sy.fourier_transform(K, sm['r'], sm['w'])
		W = noise**2 / sy.sqrt(2)  # flat spectrum of the white-noise component
		if lambdify:
			S = sy.lambdify(sm['w'], S + W)
			K = sy.lambdify(sm['r'], K)
			# W is rebound here to a time-domain white-noise kernel:
			# noise^2 exactly at lag zero, 0 everywhere else.
			def W(x):
				k = np.zeros(len(x))
				k[x == 0] += noise**2
				return k
		# NOTE(review): with lambdify=False, W stays a sympy expression and
		# this lambda would fail when called; only lambdify=True is exercised.
		self.kernel_func = lambda x: K(x) + W(x)
		self.spectrum = S
		self.white = W
	@staticmethod
	def condition_at_zero(mat):
		"""Covariance of the interior points given zero boundary values.

		Permutes rows/columns so the first and last grid points come
		last, then returns the Schur complement
		K11 - K12 K22^{-1} K12^T, i.e. the conditional covariance of
		the remaining n - 2 points given the two boundary points.
		"""
		n = mat.shape[0]
		indices = np.hstack([range(1, n - 1), 0, n - 1])
		mat = mat[indices, :]
		mat = mat[:, indices]

		K11 = mat[:(n - 2), :(n - 2)]
		K12 = mat[:(n - 2), (n - 2):]
		K22 = mat[(n - 2):, (n - 2):]
		return K11 - (K12 @ np.linalg.inv(K22) @ K12.T)

# Block-wise synthesis of an autoregressive sequence of GP segments.
def condition_on_prv(kern, seg = None):
	"""Draw the next length-n GP segment conditioned on the previous one.

	kern is the kernel evaluated on a 2n-point lag grid; the resulting
	2n x 2n Toeplitz covariance couples two consecutive segments.  If
	seg is None, an initial segment is drawn from its marginal first.
	Returns (current segment, next segment).
	"""
	n = len(kern) // 2
	full = toeplitz(kern)
	cross = full[:n, n:].T          # covariance between the two segments
	auto = full[:n, :n]             # within-segment covariance
	jitter = np.identity(n) * 1e-7  # numerical stabiliser
	gain = cross @ np.linalg.inv(auto + jitter)
	chol_cond = np.linalg.cholesky(auto - gain @ cross.T + jitter)
	if seg is None:
		chol_marg = np.linalg.cholesky(auto + jitter)
		seg = chol_marg @ np.random.normal(size = n)
	nxt = (gain @ seg) + (chol_cond @ np.random.normal(size = n))
	return seg, nxt

def spectrum_check(l = 0.5, s = 2.0, p = 0.5, n = 10000):
	"""Visual check that empirical periodograms match the analytic spectrum.

	Averages n periodograms of simulated draws from the exp-cos kernel
	and overlays the sympy-derived Fourier transform of the kernel.
	Produces a matplotlib figure; returns nothing.
	"""
	r, w = sy.symbols("r, w", real = True)
	K = s**2 * sy.exp(-(r/l)**2) * sy.cos(2*p*sy.pi*r)
	S = sy.fourier_transform(K, r, w)
	S = sy.lambdify(w, S)
	K = sy.lambdify(r, K)

	data = SimulationData()
	data.gen_grid()
	data.gen_chol(K)

	# First draw keeps the frequency grid; later draws only add power.
	freq, psd = spectrum(data.simulate(), data.upper_lim)
	for i in range(n - 1):
		psd += spectrum(data.simulate(), data.upper_lim)[1]

	# The time grid doubles as the frequency grid for the theoretical curve.
	s_t = S(data.time)
	plt.plot(data.time, s_t, label = "Theoretical")
	plt.xlim(-0.5, data.upper_lim)
	plt.plot(freq, psd/n, label = "Empirical")  # averaged periodogram
	plt.xlabel("Frequency")
	plt.ylabel("Power")
	plt.legend()

def convergence_check():
	"""Sanity check: recover known kernel hyper-parameters from periodograms.

	Simulates data from an exp-cos kernel with known (l, s, p), fits the
	parameterised theoretical spectrum to empirical spectra by least
	squares with Adam (TF1 graph-mode API, uses the module-level `sess`),
	and prints fitted vs. true values.
	"""
	l, s, p, u = 0.5, 2.0, 0.5, 5.0
	r, w = sy.symbols("r, w", real = True)
	s_p, p_p, l_p = sy.symbols("s, p, l", real = True, positive = True)

	# True kernel (numeric constants) and its symbolic, parameterised twin.
	K = s**2 * sy.exp(-(r/l)**2) * sy.cos(2*p*sy.pi*r)
	K_t = s_p**2 * sy.exp(-(r/l_p)**2) * sy.cos(2*p_p*sy.pi*r)
	S = sy.fourier_transform(K_t, r, w)
	S = sy.lambdify([w, l_p, p_p, s_p], S, "tensorflow")
	K = sy.lambdify(r, K)
	W = s_p**2 / sy.sqrt(2) # sy.fourier_transform((s_p**2)*sy.exp(-(r/1e-20)**2)/sy.sqrt(2*sy.pi*(1e-20)**2), r, w)
	W = sy.lambdify(s_p, W)

	data = SimulationData()
	data.gen_grid(upper_lim = u)
	data.gen_chol(K)

	# Unconstrained variables, clipped into a positive range below.
	l_p_0 = tf.Variable(1.1)
	p_p_0 = tf.Variable(2.2)
	s_p_0 = tf.Variable(0.9)

	l_param = tf.clip_by_value(l_p_0, 1e-5, 1e5)
	p_param = tf.clip_by_value(p_p_0, 1e-5, 1e5)
	s_param = tf.clip_by_value(s_p_0, 1e-5, 1e5)

	freq_obs, spec_obs = spectrum(data.simulate(), u)
	freq_obs = np.array(freq_obs, dtype = "float32")
	spec_obs = np.array(spec_obs, dtype = "float32")

	spec_const = tf.constant(spec_obs)

	add_term = S(freq_obs, l_param, p_param, s_param) + 1e-10
	loss = tf.reduce_sum((spec_const - add_term)**2) # likelihood is too unstable
	step = tf.train.AdamOptimizer().minimize(loss = loss, var_list = [l_p_0, p_p_0, s_p_0])

	sess.run(tf.initialize_all_variables())

	for i in tqdm(range(2000)):
		if i % 50 == 0:
			# Refresh the observed spectrum from a new simulation so the fit
			# averages over sampling noise.  NOTE(review): the refreshed
			# freq_obs never reaches the graph - S was built with the
			# original numpy array and only spec_obs is fed - but the
			# frequency grid is identical for every simulation, so this
			# is benign.  Confirm if the grid ever becomes data-dependent.
			freq_obs, spec_obs = spectrum(data.simulate(), u)
			freq_obs = np.array(freq_obs, dtype = "float32")
			spec_obs = np.array(spec_obs, dtype = "float32")
			print(sess.run(loss))
		sess.run(step, feed_dict = {spec_const: spec_obs})
	print(np.round(sess.run([l_param, p_param, s_param]), 2))
	print(np.round([l, p, s], 2))

if __name__ == "__main__":

	# Load one LJSpeech utterance; keep channel 0 if stereo, then normalise
	# the waveform to unit standard deviation.
	life_sr, life = wav.read("/Users/adityaravuri/data/LJSpeech-1.1/wavs/LJ001-0001.wav")
	life = np.array(life) if len(life.shape) == 1 else np.array(life[:, 0], dtype = "float64")
	life_scale = life.std()
	life = life/life_scale
	n_t = len(life)
	t = n_t/life_sr  # clip duration in seconds

	# replace this with the sparse GP in the future, and have the LSTM-RNN generate the necessary pseudodata
	# Rolling-std amplitude envelope (Turner-style amplitude demodulation).
	# NOTE(review): center is normally an argument of rolling(), not std() -
	# confirm this behaves as intended on the pandas version in use.
	envelope = np.array(pd.Series(life).rolling(2500).std(center = True).fillna(0.004))
	envelope_div = envelope.copy()
	envelope_div[envelope_div <= 0.2] = 0.2  # floor to avoid dividing by ~0

	stn = life/envelope_div  # demodulated ("stationarised") signal

	# Chop the signal into segments of segm_resolution seconds; each segment
	# gets its own spectral mixture fit below.
	segm_resolution = 0.05
	segm_len = int(segm_resolution*(n_t - 2)/t); segm_n = n_t//segm_len
	segments = tuple([stn[(segm_len*i):(segm_len*(i + 1))] for i in range(segm_n)])

	r, w = sy.symbols("r, w", real = True)
	s, p, l = sy.symbols("s, p, l", real = True, positive = True)

	# Exp-cos kernel, its spectrum (lambdified against tensorflow so it can
	# sit in the training graph) and the white-noise spectrum s^2/sqrt(2).
	K = s**2 * sy.exp(-(r/l)**2) * sy.cos(2*p*sy.pi*r)
	S = sy.fourier_transform(K, r, w)
	S = sy.lambdify([w, l, p, s], S, "tensorflow")
	K = sy.lambdify([r, l, p, s], K)
	W = s**2 / sy.sqrt(2.0)
	W = sy.lambdify(s, W, "tensorflow")

	# humans can't resolve frequencies exactly, the bands matter more than the exact frequencies
	bark_scale = [20, 100, 200, 300, 400, 510, 630, 770, 920, 1080,
				  1270, 1480, 1720, 2000, 2320, 2700, 3150, 3700, 4400,
				  5300, 6400, 7700, 9500, 12000, 15500, np.infty]
	bark_scale = pd.DataFrame(dict(cuts_end = bark_scale))
	bark_scale['cuts_start'] = bark_scale.cuts_end.shift(1, fill_value = 0.0)
	bark_scale.reset_index(inplace = True)

	# Map a frequency to the index of the Bark band that contains it.
	identifier = lambda x: bark_scale[bark_scale.cuts_end >= x].iloc[0, 0]

	# Helper: a positive trainable tensor via exp of an unconstrained variable.
	v = lambda x: tf.exp(tf.Variable(x))

	# Per-segment periodograms, all sharing one frequency grid.
	freq_obs, _ = spectrum(segments[0], t * len(segments[0])/n_t)
	spec_obs = np.vstack([spectrum(seg, t * len(seg)/n_t, False) for seg in segments])
	freq_obs, spec_obs = [np.array(i, dtype = "float32") + 1e-30 for i in [freq_obs, spec_obs]]

	obs_spectrum = pd.DataFrame(dict(f = freq_obs))
	obs_spectrum['index'] = obs_spectrum.f.map(identifier)
	def maxer(spec):
		# Replace each spectral value with the max over its Bark band.
		obs_spectrum['s'] = spec
		obs_spectrum['s_prime'] = obs_spectrum.groupby('index')['s'].transform('max')
		return np.array(obs_spectrum.s_prime, dtype = "float32")
	spec_obs_max = np.vstack([maxer(spec_obs[i, :]) for i in range(segm_n)])

	# Noise scale in (0, 1), parameterised for unconstrained optimisation.
	sigma = 1/(1 + v(3.))

	# Mixture component frequencies: the per-Bark-band peak frequencies of
	# each segment (held fixed); lengthscales l_ps and amplitudes s_ps are
	# trainable, squashed into (0, 1).
	p_ps = np.array(np.tile(freq_obs, segm_n).reshape(segm_n, len(freq_obs))[spec_obs_max == spec_obs], dtype = "float32").reshape(segm_n, -1, order = 'C')
	n_sel = ps_non_trainable = p_ps.shape[1]
	l_ps = 1/(1 + v(np.array(np.random.uniform(1, 3, size = p_ps.shape), dtype = "float32")))
	s_ps = 1/(1 + v(np.array(np.random.uniform(-0.2, 0.2, size = p_ps.shape), dtype = "float32")))

	# Model spectrum per segment: sum of component spectra plus noise floor.
	theo_spec = tf.reduce_sum(S(freq_obs.reshape(1, 1, -1), tf.reshape(l_ps, p_ps.shape + (1, )),
		p_ps.reshape(p_ps.shape + (1, )), tf.reshape(s_ps, p_ps.shape + (1, ))), axis = 1) + W(sigma) + 1e-30

	# Log-spectral least squares, extra weight (x250) on band-peak bins, plus
	# a smoothness penalty tying consecutive segments' spectra together.
	loss = tf.reduce_sum((tf.log(spec_obs) - tf.log(theo_spec))**2) # likelihood is too unstable
	# loss += tf.reduce_sum((tf.log(spec_obs_max) - tf.log(theo_spec))**2)
	loss += tf.reduce_sum(((tf.log(spec_obs) - tf.log(theo_spec))[spec_obs == spec_obs_max])**2)*250
	loss += tf.reduce_sum((tf.log(theo_spec[1:, :]) - tf.log(theo_spec[:-1, :]))**2)*0.1
	step = tf.train.AdamOptimizer(0.005).minimize(loss = loss)

	sess.run(tf.initialize_all_variables())

	looper = trange(40001, desc = 'ML')
	for i in looper:
		sess.run(step)
		if i % 500 == 0: looper.set_description('Loss: %g' % sess.run(loss))

	# Pull fitted values out of the graph as numpy arrays.
	l_ps = sess.run(l_ps); s_ps = sess.run(s_ps); sigma = np.array(sess.run(sigma), dtype = 'float32')

	# Evaluate each segment's fitted kernel on a segment-length lag grid.
	kern_grid = np.linspace(0, segm_resolution, segm_len)

	theo_kern = np.sum(K(kern_grid.reshape(1, 1, -1), l_ps.reshape(p_ps.shape + (1, )),
		p_ps.reshape(p_ps.shape + (1, )), s_ps.reshape(p_ps.shape + (1, ))), axis = 1) + sigma**2 + 1e-30

	def sim(row):
		# Sample one zero-pinned segment from its fitted Toeplitz covariance
		# (same recipe as SimulationData.gen_chol + simulate).
		C = toeplitz(row)
		C += np.identity(segm_len)*1e-10
		C = SimulationData.condition_at_zero(C)
		C += np.identity(segm_len - 2)*1e-10
		C = np.linalg.cholesky(C)
		z = np.random.normal(size = segm_len - 2)
		return np.hstack([0.0, (C @ z).reshape(-1), 0.0])

	# Synthesize all segments, re-apply the amplitude envelope and original
	# scale, and write 16-bit audio with the mono track duplicated to stereo.
	audio = np.hstack([sim(theo_kern[i, :]) for i in tqdm(range(segm_n))])
	audio_to_write = np.array(life_scale * audio * envelope[:len(audio)], dtype = "int16")
	wav.write("life_synth_o.wav", life_sr, np.repeat(audio_to_write, 2).reshape(-1, 2))

	# Other stuff:
	
	# blockwise generation doesn't give much better results than conditioning at zero as done below
	# audio = []
	# for i in tqdm(range(segm_n)):
	# 	if i == 0:
	# 		seg, _ = condition_on_prv(theo_kern[i, :])
	# 	else:
	# 		_, seg = condition_on_prv(theo_kern[i, :], seg)
	# 	audio.append(seg)
	# audio = np.hstack(audio)

	# sparse gp code
	# model_code = """
	# data {
	# 	int n;     // len
	# 	real x[n]; // audio
	# }
	# parameters {
	# 	real<lower = 0, upper = 1> l;
	# 	real<lower = 0, upper = 10> s;
	# 	vector<lower = -10, upper = 2>[n] sigma;
	# }
	# model {
	# 	sigma[1] ~ normal(0, s);
	# 	sigma[2:n] ~ normal(l * sigma[1:(n - 1)], s*(1 - l^2)^0.5);
	# 	x ~ normal(0, exp(sigma));
	# }
	# """

	# n_s = n_t//40; seg = life[range(n_s)]
	# data_list = dict(n = n_s, x = seg)

	# model = pystan.StanModel(model_code = model_code)
	# with open('ar_amp_model.pkl', 'wb') as f:
	# 	pkl.dump(model, f, protocol = pkl.HIGHEST_PROTOCOL)

	# with open('ar_amp_model.pkl', 'rb') as f:
	# 	model = pkl.load(f)

	# fit = model.sampling(data = data_list, iter = 1000, chains = 1)
	# while fit['value'] < -2950: # 2011.08:
	# 	temp = model.optimizing(data = data_list, as_vector = False)
	# 	if temp['value'] > fit['value']:
	# 		fit = temp

	# plt.plot(np.exp(fit['par']['sigma']) *  2)
	# plt.plot(np.exp(fit['par']['sigma']) * -2)
	# plt.plot(seg)

	# b_a = int(n_t/4); b_b = int(n_t/2); b_c = int(n_t*0.825)
	# plt.vlines(b_a, -3, 3)
	# plt.vlines(b_b, -3, 3)
	# plt.vlines(b_c, -3, 3)

	# envelope = np.log(1 + np.exp(fit['par']['sigma_vec']))

	# expt to determine my own listening ability
	# if False: # expt - volume 8 headphones room
	# 	ss = list(np.tile([0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0], 16))
	# 	fs = list(np.repeat([20, 50, 100, 200, 500, 1000, 1500, 2000, 2500,
	# 		3500, 5000, 7000, 9000, 11000, 15000, 17500], 7))
	# 	audios = []
	# 	r = np.random.normal(size = 2203)
	#	t = np.linspace(0, 0.1, 2205); l = 1.0
	# 	for s, f in zip(ss, fs):
	# 		C = np.linalg.cholesky(SimulationData.condition_at_zero(toeplitz(s**2 * np.exp(-(t/l)**2) * np.cos(2*f*np.pi*t))) + np.identity(2203)*1e-7)
	# 		z = np.hstack([0.0, C @ r, 0.0])
	# 		# f, s = spectrum(z, 0.1)
	# 		audios.append(np.array(np.tile(z * 100, 10), dtype = "int16"))
	# 		# audio = np.array(np.tile(z * 100, 10), dtype = "int16")
	# 		# sa.play_buffer(audio, 1, 2, 22050)

	# 	data = pd.DataFrame(dict(s = ss, f = fs, a = audios))
	# 	data['audible'] = False

	# 	def set_audible(i):
	# 		data.loc[i, 'audible'] = True

	# 	data_rec = data.copy()
	# 	plt.plot(data.groupby('f')['audible'].mean())	

	# 	i -= 1
	# 	x = sa.play_buffer(data.loc[i, 'a'], 1, 2, 22050)
	# 	time.sleep(2)
	# 	assert not x.is_playing(); assert i >= 1
	# 	set_audible(i)

	# old training method
	# n_sel = 20
	# smth_signal = UnivariateSpline(freq_obs, np.log(spec_obs), s = len(freq_obs)/1.5)(freq_obs)
	# peaks = smth_signal[find_peaks(smth_signal)[0]]
	# ps_to_add = freq_obs[np.isin(smth_signal, np.sort(peaks)[-n_sel:])]
	# ps_non_trainable = len(ps_to_add)

	# n_sel = 20
	# smth_signal = UnivariateSpline(freq_obs, np.log(spec_obs), s = len(freq_obs)/0.75)(freq_obs)
	# peaks = smth_signal[find_peaks(smth_signal)[0]]
	# ps_to_add = freq_obs[np.isin(smth_signal, np.sort(peaks)[-2*n_sel:])]
	# ps_to_add = ps_to_add if len(ps_to_add) <= n_sel else np.random.choice(ps_to_add, n_sel, False)
	# ps_non_trainable += len(ps_to_add)
	# l_ps += [1/(1 + v(np.random.uniform(1, 3))) for i in range(len(ps_to_add))]
	# p_ps += [i for i in ps_to_add]
	# s_ps += [1/(1 + v(np.random.uniform(-0.2, 0.2))) for i in range(len(ps_to_add))]

	# ps_to_add = 20
	# ps_to_add = [v(np.random.uniform(0, 2)) for i in range(ps_to_add)]
	# l_ps += [1/(1 + v(np.random.uniform(1, 3))) for i in range(len(ps_to_add))]
	# p_ps += [i for i in ps_to_add]
	# s_ps += [1/(1 + v(np.random.uniform(-0.2, 0.2))) for i in range(len(ps_to_add))]

2020

Efficient Gaussian Process Computation

I’ll try to give examples of efficient Gaussian process computation here, like the vec trick (Kronecker product trick), efficient Toeplitz and circulant matrix computations, RTS smoothing and Kalman filtering using state-space representations, and so on.

3 min read

Gaussian Process Speech Synthesis (Draft)

Very untidy first working draft of the idea mentioned on the efficient computation page. Here, I fit a spectral mixture to some audio data to build a “generative model” for audio. I’ll implement efficient sampling later, and I’ll replace the arbitrary way this is trained with an LSTM-RNN to go straight from text/spectrograms to waveforms.

6 min read
Back to Top ↑

2019

Gaussian Process Middle C

First of my experiments on audio modelling using Gaussian processes. Here, I construct a GP that, when sampled, plays middle C the way a grand piano would.

1 min read
Back to Top ↑

2018

Back to Top ↑