blob: 8b76d131cdb12784550abe813c58e6f41b9ca222 [file]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# coding: utf-8
# pylint: disable= arguments-differ
"""Audio transforms."""
import warnings
import numpy as np
try:
import librosa
except ImportError as e:
warnings.warn("librosa dependency could not be resolved or \
imported, could not provide some/all transform.")
from mxnet import ndarray as nd
from mxnet.gluon.block import Block
class MFCC(Block):
    """Extract Mel-frequency cepstral coefficients (MFCCs) from an audio signal.
    More details : https://librosa.github.io/librosa/generated/librosa.feature.mfcc.html

    Attributes
    ----------
    sampling_rate: int, default 22050
        sampling rate of the input audio signal
    num_mfcc: int, default 20
        number of mfccs to return

    Inputs:
        - **x**: input tensor (samples, ) shape.

    Outputs:
        - **out**: output array is a scaled NDArray with (samples, ) shape.

    """

    def __init__(self, sampling_rate=22050, num_mfcc=20):
        self._sampling_rate = sampling_rate
        self._num_mfcc = num_mfcc
        super(MFCC, self).__init__()

    def forward(self, x):
        # Accept either an MXNet NDArray or a numpy array; librosa works on numpy.
        if isinstance(x, nd.NDArray):
            samples = x.asnumpy()
        elif isinstance(x, np.ndarray):
            samples = x
        else:
            warnings.warn("MFCC - allowed datatypes mx.nd.NDArray and numpy.ndarray")
            return x

        coeffs = librosa.feature.mfcc(y=samples, sr=self._sampling_rate,
                                      n_mfcc=self._num_mfcc)
        # Average the coefficients over time frames to get one vector per clip.
        return nd.array(np.mean(coeffs.T, axis=0))
class Scale(Block):
    """Scale audio numpy.ndarray from a 16-bit integer to a floating point number between
    -1.0 and 1.0. The 16-bit integer is the sample resolution or bit depth.

    Attributes
    ----------
    scale_factor : float
        The factor to scale the input tensor by.

    Inputs:
        - **x**: input tensor (samples, ) shape.

    Outputs:
        - **out**: output array is a scaled NDArray with (samples, ) shape.

    Examples
    --------
    >>> scale = audio.transforms.Scale(scale_factor=2)
    >>> audio_samples = mx.nd.array([2,3,4])
    >>> scale(audio_samples)
    [1. 1.5 2. ]
    <NDArray 3 @cpu(0)>

    """

    def __init__(self, scale_factor=2**31):
        self.scale_factor = scale_factor
        super(Scale, self).__init__()

    def forward(self, x):
        # A zero factor would divide by zero; warn and hand the input back untouched.
        if self.scale_factor == 0:
            warnings.warn("Scale factor cannot be 0.")
            return x
        if isinstance(x, np.ndarray):
            # Divide in numpy first, then wrap the result as an NDArray.
            return nd.array(x / self.scale_factor)
        return x / self.scale_factor
class PadTrim(Block):
    """Pad/Trim a 1d-NDArray or numpy.ndarray (Signal or Labels) to a fixed length.

    Attributes
    ----------
    max_len : int
        Length to which the array will be padded or trimmed to.
    fill_value: int or float
        If there is a need of padding, what value to pad at the end of the input array.

    Inputs:
        - **x**: input tensor (samples, ) shape.

    Outputs:
        - **out**: output array is a scaled NDArray with (max_len, ) shape.

    Examples
    --------
    >>> padtrim = audio.transforms.PadTrim(max_len=9, fill_value=0)
    >>> audio_samples = mx.nd.array([1,2,3,4,5])
    >>> padtrim(audio_samples)
    [1. 2. 3. 4. 5. 0. 0. 0. 0.]
    <NDArray 9 @cpu(0)>

    """

    def __init__(self, max_len, fill_value=0):
        self._max_len = max_len
        self._fill_value = fill_value
        super(PadTrim, self).__init__()

    def forward(self, x):
        if isinstance(x, np.ndarray):
            x = nd.array(x)
        if self._max_len > x.size:
            # Build the pad with the same dtype as the input: nd.ones defaults to
            # float32, and nd.concat fails when the two operands' dtypes differ.
            pad = nd.ones((self._max_len - x.size,), dtype=x.dtype) * self._fill_value
            x = nd.concat(x, pad, dim=0)
        elif self._max_len < x.size:
            # Too long: keep only the first max_len samples.
            x = x[:self._max_len]
        # When x.size == max_len the input passes through unchanged.
        return x
class MEL(Block):
    """Create MEL Spectrograms from a raw audio signal. Relatively pretty slow.

    Attributes
    ----------
    sampling_rate: int, default 22050
        sampling rate of the input audio signal
    num_fft: int, default 2048
        length of the Fast Fourier transform window
    num_mels: int, default 20
        number of mel bands to generate
    hop_length: int, default 512
        total samples between successive frames

    Inputs:
        - **x**: input tensor (samples, ) shape.

    Outputs:
        - **out**: output array which consists of mel spectograms,
          shape = (num_mels, frames)

          Usage (see librosa.feature.melspectrogram docs):
              MEL(sr=16000, n_fft=1600, hop_length=800, n_mels=64)

    Examples
    --------
    >>> mel = audio.transforms.MEL()
    >>> audio_samples = mx.nd.array([1,2,3,4,5])
    >>> mel(audio_samples)
    [[3.81801406e+04]
    [9.86858240e-29]
    [1.87405472e-29]
    [2.38637225e-29]
    [3.94043010e-29]
    [3.67071565e-29]
    [7.29390295e-29]
    [8.84324438e-30]...
    <NDArray 128x1 @cpu(0)>

    NOTE(review): the example above shows 128 mel bands (librosa's own default),
    which does not match this class's num_mels default of 20 — verify against a
    live run.

    """

    def __init__(self, sampling_rate=22050, num_fft=2048, num_mels=20, hop_length=512):
        self._sampling_rate = sampling_rate
        self._num_fft = num_fft
        self._num_mels = num_mels
        self._hop_length = hop_length
        super(MEL, self).__init__()

    def forward(self, x):
        # librosa operates on numpy arrays; convert NDArray input first.
        if isinstance(x, nd.NDArray):
            x = x.asnumpy()
        # Pass the signal via the `y` keyword: librosa >= 0.10 made the audio
        # argument keyword-only, so the old positional call raises TypeError.
        specs = librosa.feature.melspectrogram(y=x, sr=self._sampling_rate,
                                               n_fft=self._num_fft,
                                               n_mels=self._num_mels,
                                               hop_length=self._hop_length)
        return nd.array(specs)