blob: 6a32f0e57c2d141f382b47cfa1b1dedb5c9fa171 [file] [log] [blame]
import logging
import os
import os.path
import numpy as np
import soundfile
from numpy.lib.stride_tricks import as_strided
logger = logging.getLogger(__name__)
def calc_feat_dim(window, max_freq):
return int(0.001 * window * max_freq) + 1
def conv_output_length(input_length, filter_size, border_mode, stride,
dilation=1):
""" Compute the length of the output sequence after 1D convolution along
time. Note that this function is in line with the function used in
Convolution1D class from Keras.
Params:
input_length (int): Length of the input sequence.
filter_size (int): Width of the convolution kernel.
border_mode (str): Only support `same` or `valid`.
stride (int): Stride size used in 1D convolution.
dilation (int)
"""
if input_length is None:
return None
assert border_mode in {'same', 'valid'}
dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)
if border_mode == 'same':
output_length = input_length
elif border_mode == 'valid':
output_length = input_length - dilated_filter_size + 1
return (output_length + stride - 1) // stride
def spectrogram(samples, fft_length=256, sample_rate=2, hop_length=128):
"""
Compute the spectrogram for a real signal.
The parameters follow the naming convention of
matplotlib.mlab.specgram
Args:
samples (1D array): input audio signal
fft_length (int): number of elements in fft window
sample_rate (scalar): sample rate
hop_length (int): hop length (relative offset between neighboring
fft windows).
Returns:
x (2D array): spectrogram [frequency x time]
freq (1D array): frequency of each row in x
Note:
This is a truncating computation e.g. if fft_length=10,
hop_length=5 and the signal has 23 elements, then the
last 3 elements will be truncated.
"""
assert not np.iscomplexobj(samples), "Must not pass in complex numbers"
window = np.hanning(fft_length)[:, None]
window_norm = np.sum(window ** 2)
# The scaling below follows the convention of
# matplotlib.mlab.specgram which is the same as
# matlabs specgram.
scale = window_norm * sample_rate
trunc = (len(samples) - fft_length) % hop_length
x = samples[:len(samples) - trunc]
# "stride trick" reshape to include overlap
nshape = (fft_length, (len(x) - fft_length) // hop_length + 1)
nstrides = (x.strides[0], x.strides[0] * hop_length)
x = as_strided(x, shape=nshape, strides=nstrides)
# window stride sanity check
assert np.all(x[:, 1] == samples[hop_length:(hop_length + fft_length)])
# broadcast window, compute fft over columns and square mod
# This function computes the one-dimensional n-point discrete Fourier Transform (DFT) of a real-valued array by means of an efficient algorithm called the Fast Fourier Transform (FFT).
x = np.fft.rfft(x * window, axis=0)
x = np.absolute(x) ** 2
# scale, 2.0 for everything except dc and fft_length/2
x[1:-1, :] *= (2.0 / scale)
x[(0, -1), :] /= scale
freqs = float(sample_rate) / fft_length * np.arange(x.shape[0])
return x, freqs
def spectrogram_from_file(filename, step=10, window=20, max_freq=None,
eps=1e-14, overwrite=False):
""" Calculate the log of linear spectrogram from FFT energy
Params:
filename (str): Path to the audio file
step (int): Step size in milliseconds between windows
window (int): FFT window size in milliseconds
max_freq (int): Only FFT bins corresponding to frequencies between
[0, max_freq] are returned
eps (float): Small value to ensure numerical stability (for ln(x))
"""
csvfilename = filename.replace(".wav", ".csv")
if (os.path.isfile(csvfilename) is False) or overwrite:
with soundfile.SoundFile(filename) as sound_file:
audio = sound_file.read(dtype='float32')
sample_rate = sound_file.samplerate
if audio.ndim >= 2:
audio = np.mean(audio, 1)
if max_freq is None:
max_freq = sample_rate / 2
if max_freq > sample_rate / 2:
raise ValueError("max_freq must not be greater than half of "
" sample rate")
if step > window:
raise ValueError("step size must not be greater than window size")
hop_length = int(0.001 * step * sample_rate)
fft_length = int(0.001 * window * sample_rate)
pxx, freqs = spectrogram(
audio, fft_length=fft_length, sample_rate=sample_rate,
hop_length=hop_length)
ind = np.where(freqs <= max_freq)[0][-1] + 1
res = np.transpose(np.log(pxx[:ind, :] + eps))
np.savetxt(csvfilename, res)
return res
else:
return np.loadtxt(csvfilename)