example/speech_recognition/stt_utils.py - mxnet-test - Git at Google

 import logging
 import os
 import os.path

 import numpy as np
 import soundfile
 from numpy.lib.stride_tricks import as_strided


 logger = logging.getLogger(__name__)


 def calc_feat_dim(window, max_freq):
     """ Compute ``int(0.001 * window * max_freq) + 1``.
     Params:
         window (number): Window size; the 0.001 factor suggests
             milliseconds -- TODO confirm against callers.
         max_freq (number): Frequency cap; presumably in Hz -- TODO confirm.
     Returns:
         int: The product truncated toward zero, plus one.
     """
     # Same left-to-right evaluation order as before so float rounding is
     # unchanged.
     scaled = 0.001 * window * max_freq
     return 1 + int(scaled)


 def conv_output_length(input_length, filter_size, border_mode, stride,
                        dilation=1):
     """ Length of a sequence after a 1D convolution along time.
         The original author notes this mirrors the helper used by the
         Convolution1D class in Keras.
     Params:
         input_length (int): Length of the input sequence; ``None`` is
             passed straight through.
         filter_size (int): Width of the convolution kernel.
         border_mode (str): Either `same` or `valid`.
         stride (int): Stride size used in 1D convolution.
         dilation (int): Dilation rate of the kernel (1 = no dilation).
     Returns:
         int or None: Output length, or ``None`` when `input_length` is
         ``None``.
     """
     if input_length is None:
         return None
     assert border_mode in {'same', 'valid'}

     # Span covered by the kernel once dilation gaps are inserted.
     effective_size = filter_size + (filter_size - 1) * (dilation - 1)

     if border_mode == 'valid':
         # Only positions where the whole kernel fits are kept.
         unstrided = input_length - effective_size + 1
     elif border_mode == 'same':
         unstrided = input_length

     # Ceiling division by the stride.
     return (unstrided + stride - 1) // stride


 def spectrogram(samples, fft_length=256, sample_rate=2, hop_length=128):
     """
     Compute the spectrogram for a real signal.
     The parameters follow the naming convention of
     matplotlib.mlab.specgram
     Args:
         samples (1D array): input audio signal (any 1D array-like)
         fft_length (int): number of elements in fft window
         sample_rate (scalar): sample rate
         hop_length (int): hop length (relative offset between neighboring
             fft windows).
     Returns:
         x (2D array): spectrogram [frequency x time]
         freq (1D array): frequency of each row in x
     Raises:
         AssertionError: if `samples` is complex.
         ValueError: if `samples` is not 1D, if `fft_length` or
             `hop_length` is not positive, or if the signal is shorter
             than one fft window.
     Note:
         This is a truncating computation e.g. if fft_length=10,
         hop_length=5 and the signal has 23 elements, then the
         last 3 elements will be truncated.
     """
     samples = np.asarray(samples)
     assert not np.iscomplexobj(samples), "Must not pass in complex numbers"
     if samples.ndim != 1:
         raise ValueError("samples must be a 1D array")
     if fft_length < 1 or hop_length < 1:
         raise ValueError("fft_length and hop_length must be positive")
     if len(samples) < fft_length:
         # Previously this surfaced as an obscure IndexError/shape error.
         raise ValueError("signal is shorter than one fft window")

     window = np.hanning(fft_length)[:, None]
     window_norm = np.sum(window ** 2)

     # The scaling below follows the convention of
     # matplotlib.mlab.specgram which is the same as
     # matlabs specgram.
     scale = window_norm * sample_rate

     # Drop the tail that does not fill a whole hop.
     trunc = (len(samples) - fft_length) % hop_length
     x = samples[:len(samples) - trunc]

     # "stride trick" reshape to include overlap: column j is a view of
     # x[j * hop_length : j * hop_length + fft_length], no data is copied.
     num_windows = (len(x) - fft_length) // hop_length + 1
     nshape = (fft_length, num_windows)
     nstrides = (x.strides[0], x.strides[0] * hop_length)
     x = as_strided(x, shape=nshape, strides=nstrides)

     # window stride sanity check; only possible when a second window
     # exists (a single-window signal used to crash here with IndexError).
     if num_windows > 1:
         assert np.all(
             x[:, 1] == samples[hop_length:(hop_length + fft_length)])

     # broadcast window, compute the real-input FFT over columns and take
     # the squared magnitude (x * window allocates a fresh array, so the
     # strided view is never written to)
     x = np.fft.rfft(x * window, axis=0)
     x = np.absolute(x) ** 2

     # scale, 2.0 for everything except dc and fft_length/2
     # NOTE(review): for odd fft_length the last rfft bin is not the
     # Nyquist bin but is still left undoubled; kept as-is so existing
     # feature values do not change.
     x[1:-1, :] *= (2.0 / scale)
     x[(0, -1), :] /= scale

     freqs = float(sample_rate) / fft_length * np.arange(x.shape[0])

     return x, freqs


 def spectrogram_from_file(filename, step=10, window=20, max_freq=None,
                           eps=1e-14, overwrite=False):
     """ Calculate the log of linear spectrogram from FFT energy
     The result is cached as a CSV file next to the audio file (same path,
     extension replaced by ``.csv``). The cache is keyed on the file name
     only: a cached result is returned even if `step`, `window` or
     `max_freq` differ from the call that produced it, unless `overwrite`
     is set.
     Params:
         filename (str): Path to the audio file
         step (int): Step size in milliseconds between windows
         window (int): FFT window size in milliseconds
         max_freq (int): Only FFT bins corresponding to frequencies between
             [0, max_freq] are returned
         eps (float): Small value to ensure numerical stability (for ln(x))
         overwrite (bool): Recompute and rewrite the CSV cache even if it
             already exists
     Returns:
         2D array: log spectrogram [time x frequency]
     Raises:
         ValueError: if `step` exceeds `window`, or `max_freq` exceeds half
             of the file's sample rate.
     """
     # Swap only the final extension. str.replace(".wav", ".csv") also
     # rewrote ".wav" inside directory names and, for files without a
     # ".wav" suffix, left the path unchanged so the audio file itself was
     # overwritten by the CSV.
     csvfilename = os.path.splitext(filename)[0] + ".csv"

     if os.path.isfile(csvfilename) and not overwrite:
         # ndmin=2 keeps a single-frame cache 2D, like a fresh result.
         return np.loadtxt(csvfilename, ndmin=2)

     if step > window:
         raise ValueError("step size must not be greater than window size")

     with soundfile.SoundFile(filename) as sound_file:
         audio = sound_file.read(dtype='float32')
         sample_rate = sound_file.samplerate

     if audio.ndim >= 2:
         # Down-mix multi-channel audio by averaging the channels.
         audio = np.mean(audio, 1)
     if max_freq is None:
         max_freq = sample_rate / 2
     if max_freq > sample_rate / 2:
         raise ValueError("max_freq must not be greater than half of "
                          " sample rate")

     # Convert milliseconds to a number of samples.
     hop_length = int(0.001 * step * sample_rate)
     fft_length = int(0.001 * window * sample_rate)

     pxx, freqs = spectrogram(
         audio, fft_length=fft_length, sample_rate=sample_rate,
         hop_length=hop_length)

     # Keep the rows up to and including the last bin <= max_freq.
     ind = np.where(freqs <= max_freq)[0][-1] + 1
     res = np.transpose(np.log(pxx[:ind, :] + eps))
     np.savetxt(csvfilename, res)
     return res
	import logging
	import os
	import os.path

	import numpy as np
	import soundfile
	from numpy.lib.stride_tricks import as_strided


	logger = logging.getLogger(__name__)


	def calc_feat_dim(window, max_freq):
	    """ Compute ``int(0.001 * window * max_freq) + 1``.
	    Params:
	        window (number): Window size; the 0.001 factor suggests
	            milliseconds -- TODO confirm against callers.
	        max_freq (number): Frequency cap; presumably in Hz -- TODO confirm.
	    Returns:
	        int: The product truncated toward zero, plus one.
	    """
	    return int(0.001 * window * max_freq) + 1


	def conv_output_length(input_length, filter_size, border_mode, stride,
	                       dilation=1):
	    """ Compute the length of the output sequence after 1D convolution along
	        time. Note that this function is in line with the function used in
	        Convolution1D class from Keras.
	    Params:
	        input_length (int): Length of the input sequence; ``None`` is
	            returned unchanged.
	        filter_size (int): Width of the convolution kernel.
	        border_mode (str): Only support `same` or `valid`.
	        stride (int): Stride size used in 1D convolution.
	        dilation (int): Dilation rate of the kernel (1 = no dilation).
	    Returns:
	        int or None: Output length, or ``None`` when `input_length` is
	        ``None``.
	    """
	    if input_length is None:
	        return None
	    assert border_mode in {'same', 'valid'}
	    # Span covered by the kernel once dilation gaps are inserted.
	    dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)
	    if border_mode == 'same':
	        output_length = input_length
	    elif border_mode == 'valid':
	        output_length = input_length - dilated_filter_size + 1
	    # Ceiling division by the stride.
	    return (output_length + stride - 1) // stride


	def spectrogram(samples, fft_length=256, sample_rate=2, hop_length=128):
	    """
	    Compute the spectrogram for a real signal.
	    The parameters follow the naming convention of
	    matplotlib.mlab.specgram
	    Args:
	        samples (1D array): input audio signal (any 1D array-like)
	        fft_length (int): number of elements in fft window
	        sample_rate (scalar): sample rate
	        hop_length (int): hop length (relative offset between neighboring
	            fft windows).
	    Returns:
	        x (2D array): spectrogram [frequency x time]
	        freq (1D array): frequency of each row in x
	    Raises:
	        AssertionError: if `samples` is complex.
	        ValueError: if `samples` is not 1D, if `fft_length` or
	            `hop_length` is not positive, or if the signal is shorter
	            than one fft window.
	    Note:
	        This is a truncating computation e.g. if fft_length=10,
	        hop_length=5 and the signal has 23 elements, then the
	        last 3 elements will be truncated.
	    """
	    samples = np.asarray(samples)
	    assert not np.iscomplexobj(samples), "Must not pass in complex numbers"
	    if samples.ndim != 1:
	        raise ValueError("samples must be a 1D array")
	    if fft_length < 1 or hop_length < 1:
	        raise ValueError("fft_length and hop_length must be positive")
	    if len(samples) < fft_length:
	        # Previously this surfaced as an obscure IndexError/shape error.
	        raise ValueError("signal is shorter than one fft window")

	    window = np.hanning(fft_length)[:, None]
	    window_norm = np.sum(window ** 2)

	    # The scaling below follows the convention of
	    # matplotlib.mlab.specgram which is the same as
	    # matlabs specgram.
	    scale = window_norm * sample_rate

	    # Drop the tail that does not fill a whole hop.
	    trunc = (len(samples) - fft_length) % hop_length
	    x = samples[:len(samples) - trunc]

	    # "stride trick" reshape to include overlap: column j is a view of
	    # x[j * hop_length : j * hop_length + fft_length], no data is copied.
	    num_windows = (len(x) - fft_length) // hop_length + 1
	    nshape = (fft_length, num_windows)
	    nstrides = (x.strides[0], x.strides[0] * hop_length)
	    x = as_strided(x, shape=nshape, strides=nstrides)

	    # window stride sanity check; only possible when a second window
	    # exists (a single-window signal used to crash here with IndexError).
	    if num_windows > 1:
	        assert np.all(
	            x[:, 1] == samples[hop_length:(hop_length + fft_length)])

	    # broadcast window, compute the real-input FFT over columns and take
	    # the squared magnitude
	    x = np.fft.rfft(x * window, axis=0)
	    x = np.absolute(x) ** 2

	    # scale, 2.0 for everything except dc and fft_length/2
	    # NOTE(review): for odd fft_length the last rfft bin is not the
	    # Nyquist bin but is still left undoubled; kept as-is so existing
	    # feature values do not change.
	    x[1:-1, :] *= (2.0 / scale)
	    x[(0, -1), :] /= scale

	    freqs = float(sample_rate) / fft_length * np.arange(x.shape[0])

	    return x, freqs


	def spectrogram_from_file(filename, step=10, window=20, max_freq=None,
	                          eps=1e-14, overwrite=False):
	    """ Calculate the log of linear spectrogram from FFT energy
	    The result is cached as a CSV file next to the audio file (same path,
	    extension replaced by ``.csv``). The cache is keyed on the file name
	    only: a cached result is returned even if `step`, `window` or
	    `max_freq` differ from the call that produced it, unless `overwrite`
	    is set.
	    Params:
	        filename (str): Path to the audio file
	        step (int): Step size in milliseconds between windows
	        window (int): FFT window size in milliseconds
	        max_freq (int): Only FFT bins corresponding to frequencies between
	            [0, max_freq] are returned
	        eps (float): Small value to ensure numerical stability (for ln(x))
	        overwrite (bool): Recompute and rewrite the CSV cache even if it
	            already exists
	    Returns:
	        2D array: log spectrogram [time x frequency]
	    Raises:
	        ValueError: if `step` exceeds `window`, or `max_freq` exceeds half
	            of the file's sample rate.
	    """
	    # Swap only the final extension. str.replace(".wav", ".csv") also
	    # rewrote ".wav" inside directory names and, for files without a
	    # ".wav" suffix, left the path unchanged so the audio file itself was
	    # overwritten by the CSV.
	    csvfilename = os.path.splitext(filename)[0] + ".csv"

	    if os.path.isfile(csvfilename) and not overwrite:
	        # ndmin=2 keeps a single-frame cache 2D, like a fresh result.
	        return np.loadtxt(csvfilename, ndmin=2)

	    if step > window:
	        raise ValueError("step size must not be greater than window size")

	    with soundfile.SoundFile(filename) as sound_file:
	        audio = sound_file.read(dtype='float32')
	        sample_rate = sound_file.samplerate

	    if audio.ndim >= 2:
	        # Down-mix multi-channel audio by averaging the channels.
	        audio = np.mean(audio, 1)
	    if max_freq is None:
	        max_freq = sample_rate / 2
	    if max_freq > sample_rate / 2:
	        raise ValueError("max_freq must not be greater than half of "
	                         " sample rate")

	    # Convert milliseconds to a number of samples.
	    hop_length = int(0.001 * step * sample_rate)
	    fft_length = int(0.001 * window * sample_rate)

	    pxx, freqs = spectrogram(
	        audio, fft_length=fft_length, sample_rate=sample_rate,
	        hop_length=hop_length)

	    # Keep the rows up to and including the last bin <= max_freq.
	    ind = np.where(freqs <= max_freq)[0][-1] + 1
	    res = np.transpose(np.log(pxx[:ind, :] + eps))
	    np.savetxt(csvfilename, res)
	    return res