| import logging |
| import os |
| import os.path |
| |
| import numpy as np |
| import soundfile |
| from numpy.lib.stride_tricks import as_strided |
| |
| |
| logger = logging.getLogger(__name__) |
| |
| |
def calc_feat_dim(window, max_freq):
    """ Compute the feature dimension (number of frequency bins kept).
    Params:
        window (int): Window size; scaled by 0.001 before use (presumably
            milliseconds converted to seconds -- confirm against callers).
        max_freq (int): Upper frequency bound multiplied by the scaled
            window.
    Returns:
        int: Truncated product of the scaled window and max_freq, plus one.
    """
    scaled_window = 0.001 * window
    bin_count = int(scaled_window * max_freq)
    return bin_count + 1
| |
| |
def conv_output_length(input_length, filter_size, border_mode, stride,
                       dilation=1):
    """ Return the number of time steps produced by a 1D convolution.
    The original notes that this mirrors the computation used by the
    Convolution1D class from Keras.
    Params:
        input_length (int): Length of the input sequence; None is passed
            through unchanged.
        filter_size (int): Width of the convolution kernel.
        border_mode (str): Only support `same` or `valid`.
        stride (int): Stride size used in 1D convolution.
        dilation (int): Dilation factor applied to the kernel.
    Returns:
        int or None: Output sequence length, or None when input_length is
        None.
    """
    if input_length is None:
        return None
    assert border_mode in {'same', 'valid'}

    # Width the kernel spans once the gaps introduced by dilation are counted.
    effective_kernel = filter_size + (filter_size - 1) * (dilation - 1)

    # 'same' keeps the input length; 'valid' drops positions where the
    # (dilated) kernel would hang over the edge.
    unstrided = (input_length if border_mode == 'same'
                 else input_length - effective_kernel + 1)

    # Ceiling division by the stride.
    return (unstrided + stride - 1) // stride
| |
| |
def spectrogram(samples, fft_length=256, sample_rate=2, hop_length=128):
    """
    Compute the spectrogram for a real signal.
    The parameters follow the naming convention of
    matplotlib.mlab.specgram
    Args:
        samples (1D array): input audio signal
        fft_length (int): number of elements in fft window
        sample_rate (scalar): sample rate
        hop_length (int): hop length (relative offset between neighboring
            fft windows).
    Returns:
        x (2D array): spectrogram [frequency x time]
        freq (1D array): frequency of each row in x
    Raises:
        ValueError: if fft_length or hop_length is not positive, or if the
            signal is shorter than a single fft window.
    Note:
        This is a truncating computation e.g. if fft_length=10,
        hop_length=5 and the signal has 23 elements, then the
        last 3 elements will be truncated.
    """
    assert not np.iscomplexobj(samples), "Must not pass in complex numbers"

    samples = np.asarray(samples)
    if fft_length < 1 or hop_length < 1:
        raise ValueError("fft_length and hop_length must be positive")
    if len(samples) < fft_length:
        raise ValueError("signal must contain at least fft_length samples")

    window = np.hanning(fft_length)[:, None]
    window_norm = np.sum(window ** 2)

    # The scaling below follows the convention of
    # matplotlib.mlab.specgram which is the same as
    # matlabs specgram.
    scale = window_norm * sample_rate

    # Drop the trailing samples that do not fill a complete window.
    trunc = (len(samples) - fft_length) % hop_length
    x = samples[:len(samples) - trunc]

    # "stride trick" reshape to include overlap: column t is a view of
    # x[t * hop_length : t * hop_length + fft_length].
    n_frames = (len(x) - fft_length) // hop_length + 1
    nshape = (fft_length, n_frames)
    nstrides = (x.strides[0], x.strides[0] * hop_length)
    x = as_strided(x, shape=nshape, strides=nstrides)

    # window stride sanity check; a second column only exists when the
    # signal yields more than one frame.
    if n_frames > 1:
        assert np.all(
            x[:, 1] == samples[hop_length:(hop_length + fft_length)])

    # broadcast window, compute the real-input FFT over columns and take
    # the squared magnitude. The multiplication allocates a new array, so
    # the strided view of the input is never written to.
    x = np.fft.rfft(x * window, axis=0)
    x = np.absolute(x) ** 2

    # One-sided scaling: every bin that has a mirrored negative-frequency
    # partner is doubled. DC never has one; the last bin is the Nyquist bin
    # (no partner) only when fft_length is even.
    if fft_length % 2 == 0:
        x[1:-1, :] *= (2.0 / scale)
        x[(0, -1), :] /= scale
    else:
        x[1:, :] *= (2.0 / scale)
        x[0, :] /= scale

    freqs = float(sample_rate) / fft_length * np.arange(x.shape[0])

    return x, freqs
| |
| |
def spectrogram_from_file(filename, step=10, window=20, max_freq=None,
                          eps=1e-14, overwrite=False):
    """ Calculate the log of linear spectrogram from FFT energy
    The result is cached next to the audio file as `<basename>.csv`; the
    cache is reused on later calls unless `overwrite` is set.
    Params:
        filename (str): Path to the audio file
        step (int): Step size in milliseconds between windows
        window (int): FFT window size in milliseconds
        max_freq (int): Only FFT bins corresponding to frequencies between
            [0, max_freq] are returned
        eps (float): Small value to ensure numerical stability (for ln(x))
        overwrite (bool): Recompute and rewrite the cache even if the csv
            file already exists
    Returns:
        2D array: log spectrogram [time x frequency]
    Raises:
        ValueError: if max_freq exceeds half the sample rate, if step is
            greater than window, or if step/window are shorter than one
            sample.
    """
    # Swap only the extension. A plain str.replace(".wav", ".csv") leaves
    # non-".wav" names untouched, which would make the cache path equal to
    # the audio path and overwrite the audio file on save.
    csvfilename = os.path.splitext(filename)[0] + ".csv"
    if csvfilename == filename:
        raise ValueError("filename must not be the .csv cache file itself")

    if os.path.isfile(csvfilename) and not overwrite:
        # ndmin=2 keeps single-row/column caches 2D, matching a fresh result.
        return np.loadtxt(csvfilename, ndmin=2)

    with soundfile.SoundFile(filename) as sound_file:
        audio = sound_file.read(dtype='float32')
        sample_rate = sound_file.samplerate

    # Down-mix multi-channel audio by averaging over the channel axis.
    if audio.ndim >= 2:
        audio = np.mean(audio, 1)
    if max_freq is None:
        max_freq = sample_rate / 2
    if max_freq > sample_rate / 2:
        raise ValueError("max_freq must not be greater than half of "
                         " sample rate")
    if step > window:
        raise ValueError("step size must not be greater than window size")

    # Convert the millisecond durations into sample counts.
    hop_length = int(0.001 * step * sample_rate)
    fft_length = int(0.001 * window * sample_rate)
    if hop_length < 1 or fft_length < 1:
        raise ValueError("step and window must span at least one sample")

    pxx, freqs = spectrogram(
        audio, fft_length=fft_length, sample_rate=sample_rate,
        hop_length=hop_length)

    # Keep the bins up to and including the last frequency <= max_freq.
    ind = np.where(freqs <= max_freq)[0][-1] + 1
    res = np.transpose(np.log(pxx[:ind, :] + eps))
    np.savetxt(csvfilename, res)
    return res
| |