| from __future__ import absolute_import, division, print_function |
| |
| import json |
| import random |
| |
| import numpy as np |
| from stt_utils import calc_feat_dim, spectrogram_from_file |
| |
| from config_util import generate_file_path |
| from log_util import LogUtil |
| from label_util import LabelUtil |
| from stt_bi_graphemes_util import generate_bi_graphemes_label |
| |
class DataGenerator(object):
    """Load utterance metadata and build zero-padded, normalized minibatches.

    The methods depend on attributes set by one another, which implies this
    call order: load the metadata (load_train_data / load_validation_data),
    then call get_max_seq_length and get_max_label_length to fix the padded
    sizes, and only then call prepare_minibatch.
    """

    def __init__(self, save_dir, model_name, step=10, window=20, max_freq=8000, desc_file=None):
        """
        Params:
            save_dir: Forwarded to generate_file_path when sample_normalize
                saves the feature statistics
            model_name: Forwarded to generate_file_path together with save_dir
            step (int): Step size in milliseconds between windows
            window (int): FFT window size in milliseconds
            max_freq (int): Only FFT bins corresponding to frequencies between
                [0, max_freq] are returned
            desc_file (str, optional): Path to a JSON-line file that contains
                labels and paths to the audio files. If this is not None, its
                metadata is loaded right away as the 'train' partition
        """
        super(DataGenerator, self).__init__()
        # Store the configuration first so the instance is fully configured
        # before any metadata is loaded.
        self.step = step
        self.window = window
        self.max_freq = max_freq
        self.save_dir = save_dir
        self.model_name = model_name
        # Per the original author's note, calc_feat_dim returns
        # int(0.001 * window * max_freq) + 1, i.e. 161 with the defaults.
        self.feat_dim = calc_feat_dim(window, max_freq)
        # Identity normalization (mean 0, std 1) until real statistics are
        # provided by get_meta_from_file or sample_normalize.
        self.feats_mean = np.zeros((self.feat_dim,))
        self.feats_std = np.ones((self.feat_dim,))
        self.max_input_length = 0
        self.max_length_list_in_batch = []
        # Dedicated random number generator; sample_normalize draws from it.
        self.rng = random.Random()
        if desc_file is not None:
            self.load_metadata_from_desc_file(desc_file)

    def get_meta_from_file(self, feats_mean, feats_std):
        """ Install precomputed feature statistics used by normalize
        Params:
            feats_mean: Per-feature mean to subtract
            feats_std: Per-feature standard deviation to divide by
        """
        self.feats_mean = feats_mean
        self.feats_std = feats_std

    def featurize(self, audio_clip, overwrite=False):
        """ For a given audio clip, calculate the log of its Fourier Transform
        Params:
            audio_clip(str): Path to the audio clip
            overwrite (bool): Forwarded unchanged to spectrogram_from_file
        Returns:
            Whatever spectrogram_from_file returns; the rest of this class
            indexes it as a 2-D array of shape (timesteps, feature_dim)
        """
        return spectrogram_from_file(
            audio_clip, step=self.step, window=self.window,
            max_freq=self.max_freq, overwrite=overwrite)

    @staticmethod
    def _parse_desc_line(json_line):
        """ Parse one description-file line into (audio_path, duration, text)
        Params:
            json_line (str): One JSON object with 'key', 'duration' and 'text'
        Raises:
            KeyError, TypeError, ValueError: If the line is not valid JSON or
                lacks a usable 'key', 'duration' or 'text' entry
        """
        spec = json.loads(json_line)
        return spec['key'], float(spec['duration']), spec['text']

    def load_metadata_from_desc_file(self, desc_file, partition='train',
                                     max_duration=16.0,):
        """ Read metadata from the description file
            (possibly takes long, depending on the filesize)
        Params:
            desc_file (str):  Path to a JSON-line file that contains labels and
                paths to the audio files
            partition (str): One of 'train', 'validation' or 'test'
            max_duration (float): In seconds, the maximum duration of
                utterances to train or test on
        Raises:
            Exception: If partition is not one of the three accepted values
        """
        logger = LogUtil().getlogger()
        logger.info('Reading description file: {} for partition: {}'
                    .format(desc_file, partition))
        audio_paths, durations, texts = [], [], []
        with open(desc_file) as json_line_file:
            for line_num, json_line in enumerate(json_line_file):
                # Parse every field BEFORE appending anything: appending
                # field by field would leave the three parallel lists with
                # different lengths when a later field is missing.
                try:
                    audio_path, duration, text = self._parse_desc_line(json_line)
                except (KeyError, TypeError, ValueError) as e:
                    # Malformed lines are reported and skipped (best effort).
                    logger.warning('Error reading line #{}: {}'
                                   .format(line_num, json_line))
                    logger.warning(str(e))
                    continue
                if duration > max_duration:
                    continue
                audio_paths.append(audio_path)
                durations.append(duration)
                texts.append(text)

        if partition == 'train':
            self.count = len(audio_paths)
            self.train_audio_paths = audio_paths
            self.train_durations = durations
            self.train_texts = texts
        elif partition == 'validation':
            self.val_audio_paths = audio_paths
            self.val_durations = durations
            self.val_texts = texts
            self.val_count = len(audio_paths)
        elif partition == 'test':
            self.test_audio_paths = audio_paths
            self.test_durations = durations
            self.test_texts = texts
        else:
            raise Exception("Invalid partition to load metadata. "
                            "Must be train/validation/test")

    def load_train_data(self, desc_file, max_duration=16.0):
        """ Load desc_file as the 'train' partition
        Params:
            desc_file (str): Path to the JSON-line description file
            max_duration (float): Longest utterance, in seconds, to keep
        """
        self.load_metadata_from_desc_file(desc_file, 'train',
                                          max_duration=max_duration)

    def load_validation_data(self, desc_file, max_duration=16.0):
        """ Load desc_file as the 'validation' partition
        Params:
            desc_file (str): Path to the JSON-line description file
            max_duration (float): Longest utterance, in seconds, to keep
        """
        self.load_metadata_from_desc_file(desc_file, 'validation',
                                          max_duration=max_duration)

    @staticmethod
    def sort_by_duration(durations, audio_paths, texts):
        """ Sort the three parallel sequences together by ascending duration
        Ties on duration are broken by audio path, then by text, because the
        zipped tuples are compared element by element.
        Returns:
            The result of zip(*...): three tuples (durations, audio_paths,
            texts) in sorted order (a lazy zip object on Python 3)
        """
        return zip(*sorted(zip(durations, audio_paths, texts)))

    def normalize(self, feature, eps=1e-14):
        """ Center and scale a feature array with the stored statistics
        Params:
            feature: Array broadcastable against feats_mean / feats_std
            eps (float): Added to the std to avoid division by zero
        """
        return (feature - self.feats_mean) / (self.feats_std + eps)

    def get_max_label_length(self, partition, is_bi_graphemes=False):
        """ Compute, store and return the longest label length
        Params:
            partition (str): 'train' measures the train and validation texts
                together; 'test' measures the texts held in the train slot.
                NOTE(review): 'test' reading train_texts looks intentional
                (test data loaded through load_train_data) - confirm.
            is_bi_graphemes (bool): Measure bi-grapheme labels instead of
                raw characters
        Raises:
            Exception: For any other partition value (including 'validation')
        """
        if partition == 'train':
            texts = self.train_texts + self.val_texts
        elif partition == 'test':
            texts = self.train_texts
        else:
            raise Exception("Invalid partition to load metadata. "
                            "Must be train/validation/test")
        if is_bi_graphemes:
            self.max_label_length = max(
                len(generate_bi_graphemes_label(text)) for text in texts)
        else:
            self.max_label_length = max(len(text) for text in texts)
        return self.max_label_length

    def get_max_seq_length(self, partition):
        """ Compute, store and return the frame count of the longest utterance
        The utterance with the largest duration is featurized and its number
        of rows becomes max_seq_length.
        Params:
            partition (str): 'train' considers train and validation data
                together; 'test' considers the data held in the train slot
        Raises:
            Exception: For any other partition value (including 'validation')
        """
        if partition == 'train':
            audio_paths = self.train_audio_paths + self.val_audio_paths
            durations = self.train_durations + self.val_durations
        elif partition == 'test':
            audio_paths = self.train_audio_paths
            durations = self.train_durations
        else:
            raise Exception("Invalid partition to load metadata. "
                            "Must be train/validation/test")
        max_duration_indexes = durations.index(max(durations))
        max_seq_length = self.featurize(audio_paths[max_duration_indexes]).shape[0]
        self.max_seq_length = max_seq_length
        return max_seq_length

    def prepare_minibatch(self, audio_paths, texts, overwrite=False, is_bi_graphemes=False):
        """ Featurize a minibatch of audio, zero pad them and return a dictionary
        Requires max_seq_length and max_label_length to have been set by
        get_max_seq_length / get_max_label_length.
        Params:
            audio_paths (list(str)): List of paths to audio files
            texts (list(str)): List of texts corresponding to the audio files
            overwrite (bool): Forwarded to featurize
            is_bi_graphemes (bool): Encode labels as bi-graphemes
        Returns:
            dict: See below for contents
        """
        assert len(audio_paths) == len(texts),\
            "Inputs and outputs to the network must be of the same number"
        # Features is a list of (timesteps, feature_dim) arrays
        # Calculate the features for each audio clip, as the log of the
        # Fourier Transform of the audio
        features = [self.featurize(a, overwrite=overwrite) for a in audio_paths]
        input_lengths = [f.shape[0] for f in features]
        feature_dim = features[0].shape[1]
        mb_size = len(features)
        # Pad all the inputs so that they are all the same length
        x = np.zeros((mb_size, self.max_seq_length, feature_dim))
        y = np.zeros((mb_size, self.max_label_length))
        labelUtil = LabelUtil.getInstance()
        label_lengths = []
        for i, (feat, text) in enumerate(zip(features, texts)):
            feat = self.normalize(feat)  # Center using means and std
            x[i, :feat.shape[0], :] = feat
            if is_bi_graphemes:
                label = generate_bi_graphemes_label(text)
                label = labelUtil.convert_bi_graphemes_to_num(label)
            else:
                label = labelUtil.convert_word_to_num(text)
            # Slice by the encoded label's own length so the row always
            # agrees with the value recorded in label_lengths.
            y[i, :len(label)] = label
            label_lengths.append(len(label))
        return {
            'x': x,  # (0-padded features of shape(mb_size,timesteps,feat_dim)
            'y': y,  # 0-padded integer labels of shape(mb_size,max_label_length)
            'texts': texts,  # list(str) Original texts
            'input_lengths': input_lengths,  # list(int) Length of each input
            'label_lengths': label_lengths,  # list(int) Length of each label
        }

    # NOTE(review): `iterate` is not defined anywhere in the visible part of
    # this class; the two wrappers below fail with AttributeError unless it is
    # provided elsewhere (e.g. by a subclass) - confirm.
    def iterate_test(self, minibatch_size=16):
        """ Delegate to self.iterate over the test partition """
        return self.iterate(self.test_audio_paths, self.test_texts,
                            minibatch_size)

    def iterate_validation(self, minibatch_size=16):
        """ Delegate to self.iterate over the validation partition """
        return self.iterate(self.val_audio_paths, self.val_texts,
                            minibatch_size)

    def _compute_feature_stats(self, audio_paths, overwrite=False):
        """ Return (mean, std) of the features over all frames of audio_paths
        Both results have shape (1, feature_dim). Running sums are kept so
        only one clip's features are processed at a time.
        Raises:
            ValueError: If there are no frames to estimate from
        """
        feat_sum = 0.0
        feat_squared_sum = 0.0
        count = 0.0
        for audio_clip in audio_paths:
            feat = self.featurize(audio_clip=audio_clip, overwrite=overwrite)
            feat_sum = feat_sum + np.sum(feat, axis=0, keepdims=True)
            feat_squared_sum = feat_squared_sum + np.sum(
                np.square(feat), axis=0, keepdims=True)
            count += float(feat.shape[0])
        if count == 0:
            raise ValueError("Cannot estimate feature statistics without "
                             "any audio frames")
        mean = feat_sum / count
        # E[x^2] - E[x]^2 can dip slightly below zero through rounding;
        # clamp it so the square root never yields NaN.
        variance = np.maximum(feat_squared_sum / count - np.square(mean), 0.0)
        return mean, np.sqrt(variance)

    def sample_normalize(self, k_samples=1000, overwrite=False):
        """ Estimate the mean and std of the features from the training set
        The statistics are stored on the instance and written with np.savetxt
        to the paths generate_file_path returns for 'feats_mean' and
        'feats_std'.
        Params:
            k_samples (int): Use this number of samples for estimation; a
                negative value uses the whole training set
            overwrite (bool): Forwarded to featurize
        Raises:
            ValueError: If no training audio is available to sample from
        """
        if k_samples < 0:
            # Negative k_samples: go through the total training set
            samples = list(self.train_audio_paths)
        else:
            k_samples = min(k_samples, len(self.train_audio_paths))
            samples = self.rng.sample(self.train_audio_paths, k_samples)
        self.feats_mean, self.feats_std = self._compute_feature_stats(
            samples, overwrite=overwrite)
        np.savetxt(generate_file_path(self.save_dir, self.model_name, 'feats_mean'), self.feats_mean)
        np.savetxt(generate_file_path(self.save_dir, self.model_name, 'feats_std'), self.feats_std)