blob: 8b76d131cdb12784550abe813c58e6f41b9ca222 [file]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# coding: utf-8
# pylint: disable= arguments-differ
"""Audio transforms."""
import warnings
import numpy as np
try:
import librosa
except ImportError as e:
warnings.warn("librosa dependency could not be resolved or \
imported, could not provide some/all transform.")
from mxnet import ndarray as nd
from mxnet.gluon.block import Block
class MFCC(Block):
    """Extract Mel-frequency cepstral coefficients (MFCCs) from an audio signal.
    More details : https://librosa.github.io/librosa/generated/librosa.feature.mfcc.html

    Attributes
    ----------
    sampling_rate: int, default 22050
        sampling rate of the input audio signal
    num_mfcc: int, default 20
        number of mfccs to return

    Inputs:
        - **x**: input tensor (samples, ) shape.

    Outputs:
        - **out**: output array is a scaled NDArray with (samples, ) shape.

    """

    def __init__(self, sampling_rate=22050, num_mfcc=20):
        self._sampling_rate = sampling_rate
        self._num_mfcc = num_mfcc
        super(MFCC, self).__init__()

    def forward(self, x):
        # Accept either an MXNet NDArray or a numpy array; librosa works on numpy.
        if isinstance(x, nd.NDArray):
            samples = x.asnumpy()
        elif isinstance(x, np.ndarray):
            samples = x
        else:
            warnings.warn("MFCC - allowed datatypes mx.nd.NDArray and numpy.ndarray")
            return x

        coeffs = librosa.feature.mfcc(y=samples, sr=self._sampling_rate,
                                      n_mfcc=self._num_mfcc)
        # Average the coefficients over time frames to get one vector per clip.
        return nd.array(np.mean(coeffs.T, axis=0))
class Scale(Block):
    """Scale audio numpy.ndarray from a 16-bit integer to a floating point number between
    -1.0 and 1.0. The 16-bit integer is the sample resolution or bit depth.

    Attributes
    ----------
    scale_factor : float
        The factor to scale the input tensor by.

    Inputs:
        - **x**: input tensor (samples, ) shape.

    Outputs:
        - **out**: output array is a scaled NDArray with (samples, ) shape.

    Examples
    --------
    >>> scale = audio.transforms.Scale(scale_factor=2)
    >>> audio_samples = mx.nd.array([2,3,4])
    >>> scale(audio_samples)
    [1. 1.5 2. ]
    <NDArray 3 @cpu(0)>

    """

    def __init__(self, scale_factor=2**31):
        self.scale_factor = scale_factor
        super(Scale, self).__init__()

    def forward(self, x):
        # A zero factor would divide by zero; warn and hand the input back untouched.
        if self.scale_factor == 0:
            warnings.warn("Scale factor cannot be 0.")
            return x
        if isinstance(x, np.ndarray):
            # Divide in numpy first, then wrap the result as an NDArray.
            return nd.array(x / self.scale_factor)
        return x / self.scale_factor
class PadTrim(Block):
    """Pad/Trim a 1d-NDArray or numpy.ndarray (Signal or Labels) to a fixed length.

    Attributes
    ----------
    max_len : int
        Length to which the array will be padded or trimmed to.
    fill_value: int or float
        If there is a need of padding, what value to pad at the end of the input array.

    Inputs:
        - **x**: input tensor (samples, ) shape.

    Outputs:
        - **out**: output array is a scaled NDArray with (max_len, ) shape.

    Examples
    --------
    >>> padtrim = audio.transforms.PadTrim(max_len=9, fill_value=0)
    >>> audio_samples = mx.nd.array([1,2,3,4,5])
    >>> padtrim(audio_samples)
    [1. 2. 3. 4. 5. 0. 0. 0. 0.]
    <NDArray 9 @cpu(0)>

    """

    def __init__(self, max_len, fill_value=0):
        self._max_len = max_len
        self._fill_value = fill_value
        super(PadTrim, self).__init__()

    def forward(self, x):
        if isinstance(x, np.ndarray):
            x = nd.array(x)
        if self._max_len > x.size:
            # Build the pad with the same dtype as the input: nd.ones defaults to
            # float32, and nd.concat fails when the two operands' dtypes differ.
            pad = nd.ones((self._max_len - x.size,), dtype=x.dtype) * self._fill_value
            x = nd.concat(x, pad, dim=0)
        elif self._max_len < x.size:
            # Too long: keep only the first max_len samples.
            x = x[:self._max_len]
        # When x.size == max_len the input passes through unchanged.
        return x
class MEL(Block):
    """Create MEL Spectrograms from a raw audio signal. Relatively pretty slow.

    Attributes
    ----------
    sampling_rate: int, default 22050
        sampling rate of the input audio signal
    num_fft: int, default 2048
        length of the Fast Fourier transform window
    num_mels: int, default 20
        number of mel bands to generate
    hop_length: int, default 512
        total samples between successive frames

    Inputs:
        - **x**: input tensor (samples, ) shape.

    Outputs:
        - **out**: output array which consists of mel spectograms,
          shape = (num_mels, frames)

          Usage (see librosa.feature.melspectrogram docs):
              MEL(sr=16000, n_fft=1600, hop_length=800, n_mels=64)

    Examples
    --------
    >>> mel = audio.transforms.MEL()
    >>> audio_samples = mx.nd.array([1,2,3,4,5])
    >>> mel(audio_samples)
    [[3.81801406e+04]
    [9.86858240e-29]
    [1.87405472e-29]
    [2.38637225e-29]
    [3.94043010e-29]
    [3.67071565e-29]
    [7.29390295e-29]
    [8.84324438e-30]...
    <NDArray 128x1 @cpu(0)>

    NOTE(review): the example above shows 128 mel bands (librosa's own default),
    which does not match this class's num_mels default of 20 — verify against a
    live run.

    """

    def __init__(self, sampling_rate=22050, num_fft=2048, num_mels=20, hop_length=512):
        self._sampling_rate = sampling_rate
        self._num_fft = num_fft
        self._num_mels = num_mels
        self._hop_length = hop_length
        super(MEL, self).__init__()

    def forward(self, x):
        # librosa operates on numpy arrays; convert NDArray input first.
        if isinstance(x, nd.NDArray):
            x = x.asnumpy()
        # Pass the signal via the `y` keyword: librosa >= 0.10 made the audio
        # argument keyword-only, so the old positional call raises TypeError.
        specs = librosa.feature.melspectrogram(y=x, sr=self._sampling_rate,
                                               n_fft=self._num_fft,
                                               n_mels=self._num_mels,
                                               hop_length=self._hop_length)
        return nd.array(specs)