#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# -*- coding: utf-8 -*-
import bisect
import random
import numpy as np
from mxnet.io import DataIter, DataBatch, DataDesc
from mxnet import ndarray
from sklearn.utils import shuffle
class BucketNerIter(DataIter):
"""
This iterator can handle variable length feature/label arrays for MXNet RNN classifiers.
This iterator can ingest 2d list of sentences, 2d list of entities and 3d list of characters.
"""
def __init__(self, sentences, characters, label, max_token_chars, batch_size, buckets=None, data_pad=-1, label_pad=-1, data_names=['sentences', 'characters'],
label_name='seq_label', dtype='float32'):
super(BucketNerIter, self).__init__()
# Create a bucket for every seq length where there are more examples than the batch size
if not buckets:
seq_counts = np.bincount([len(s) for s in sentences])
buckets = [i for i, j in enumerate(seq_counts) if j >= batch_size]
buckets.sort()
print("\nBuckets created: ", buckets)
assert(len(buckets) > 0), "Not enough utterances to create any buckets."
###########
# Sentences
###########
nslice = 0
# Create empty nested lists for storing data that falls into each bucket
self.sentences = [[] for _ in buckets]
for i, sent in enumerate(sentences):
# Find the index of the smallest bucket that can hold the sentence (first bucket size >= sentence length)
buck_idx = bisect.bisect_left(buckets, len(sent))
if buck_idx == len(buckets): # If the sentence is larger than the largest bucket
buck_idx = buck_idx - 1
nslice += 1
sent = sent[:buckets[buck_idx]] #Slice sentence to largest bucket size
buff = np.full((buckets[buck_idx]), data_pad, dtype=dtype) # Create an array filled with 'data_pad'
buff[:len(sent)] = sent # Fill with actual values
self.sentences[buck_idx].append(buff) # Append array to index = bucket index
self.sentences = [np.asarray(i, dtype=dtype) for i in self.sentences]  # Convert to list of arrays
if nslice > 0:
    print("Warning: {0} sentences sliced to largest bucket size.".format(nslice))
############
# Characters
############
# Create empty nested lists for storing data that falls into each bucket
self.characters = [[] for _ in buckets]
for i, charsent in enumerate(characters):
# Find the index of the smallest bucket that can hold the sentence (first bucket size >= sentence length)
buck_idx = bisect.bisect_left(buckets, len(charsent))
if buck_idx == len(buckets): # If the sentence is larger than the largest bucket
buck_idx = buck_idx - 1
charsent = charsent[:buckets[buck_idx]] #Slice sentence to largest bucket size
charsent = [word[:max_token_chars] for word in charsent]  # Slice words to max_token_chars
charsent = [word + [data_pad] * (max_token_chars - len(word)) for word in charsent]  # Pad words to max_token_chars
charsent = np.array(charsent)
buff = np.full((buckets[buck_idx], max_token_chars), data_pad, dtype=dtype)
buff[:charsent.shape[0], :] = charsent # Fill with actual values
self.characters[buck_idx].append(buff) # Append array to index = bucket index
self.characters = [np.asarray(i, dtype=dtype) for i in self.characters]  # Convert to list of arrays
##########
# Entities
##########
# Create empty nested lists for storing data that falls into each bucket
self.label = [[] for _ in buckets]
self.indices = [[] for _ in buckets]
for i, entities in enumerate(label):
# Find the index of the smallest bucket that can hold the sentence (first bucket size >= sentence length)
buck_idx = bisect.bisect_left(buckets, len(entities))
if buck_idx == len(buckets): # If the sentence is larger than the largest bucket
buck_idx = buck_idx - 1
entities = entities[:buckets[buck_idx]] # Slice sentence to largest bucket size
buff = np.full((buckets[buck_idx]), label_pad, dtype=dtype) # Create an array filled with 'label_pad'
buff[:len(entities)] = entities # Fill with actual values
self.label[buck_idx].append(buff) # Append array to index = bucket index
self.indices[buck_idx].append(i)
self.label = [np.asarray(i, dtype=dtype) for i in self.label]  # Convert to list of arrays
self.indices = [np.asarray(i, dtype=dtype) for i in self.indices]  # Convert to list of arrays
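# Store iterator configuration and bucket metadata for use in reset() and next().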
self.data_names = data_names
self.label_name = label_name
self.batch_size = batch_size
self.max_token_chars = max_token_chars
self.buckets = buckets
self.dtype = dtype
self.data_pad = data_pad
self.label_pad = label_pad
self.default_bucket_key = max(buckets)
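# 'NT' layout: N is the batch axis, T the sequence (time) axis.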
self.layout = 'NT'
self.provide_data = [DataDesc(name=self.data_names[0], shape=(self.batch_size, self.default_bucket_key), layout=self.layout),
DataDesc(name=self.data_names[1], shape=(self.batch_size, self.default_bucket_key, self.max_token_chars), layout=self.layout)]
self.provide_label = [DataDesc(name=self.label_name, shape=(self.batch_size, self.default_bucket_key), layout=self.layout)]
#create empty list to store batch index values
self.idx = []
# For each bucket array
for i, buck in enumerate(self.sentences):
# Record a (bucket index, starting row) tuple for every full batch the bucket contains,
# e.g. with batch_size 5 and 20 training examples in each of two buckets:
# [(0, 0), (0, 5), (0, 10), (0, 15), (1, 0), (1, 5), (1, 10), (1, 15)]
self.idx.extend([(i, j) for j in range(0, len(buck) - batch_size + 1, batch_size)])
self.curr_idx = 0
self.reset()
def reset(self):
"""Resets the iterator to the beginning of the data."""
self.curr_idx = 0
# Shuffle the order in which batches will be returned
random.shuffle(self.idx)
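# Shuffle the examples within each bucket, keeping indices, sentences, characters and labels aligned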
for i, buck in enumerate(self.sentences):
self.indices[i], self.sentences[i], self.characters[i], self.label[i] = shuffle(self.indices[i],
self.sentences[i],
self.characters[i],
self.label[i])
self.ndindex = []
self.ndsent = []
self.ndchar = []
self.ndlabel = []
#for each bucket of data
for i, buck in enumerate(self.sentences):
# Convert each bucket's shuffled data to NDArrays for batching
self.ndindex.append(ndarray.array(self.indices[i], dtype=self.dtype))
self.ndsent.append(ndarray.array(self.sentences[i], dtype=self.dtype))
self.ndchar.append(ndarray.array(self.characters[i], dtype=self.dtype))
self.ndlabel.append(ndarray.array(self.label[i], dtype=self.dtype))
def next(self):
"""Returns the next batch of data."""
if self.curr_idx == len(self.idx):
raise StopIteration
# i = bucket index, j = starting record within that bucket
i, j = self.idx[self.curr_idx]
self.curr_idx += 1
indices = self.ndindex[i][j:j + self.batch_size]
sentences = self.ndsent[i][j:j + self.batch_size]
characters = self.ndchar[i][j:j + self.batch_size]
label = self.ndlabel[i][j:j + self.batch_size]
return DataBatch([sentences, characters], [label], pad=0, index = indices, bucket_key=self.buckets[i],
provide_data=[DataDesc(name=self.data_names[0], shape=sentences.shape, layout=self.layout),
DataDesc(name=self.data_names[1], shape=characters.shape, layout=self.layout)],
provide_label=[DataDesc(name=self.label_name, shape=label.shape, layout=self.layout)])
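

# Minimal usage sketch (illustrative only, not part of the original example): builds a toy
# corpus of token indices, per-token character indices and per-token entity labels, then
# iterates over the padded, bucketed batches. All index values below are made up.
if __name__ == '__main__':
    toy_sentences = [[1, 2, 3], [4, 5, 6, 7], [8, 9], [1, 3, 5]] * 4      # token ids per sentence
    toy_characters = [[[1, 2], [2, 3], [3, 4]],
                      [[4], [5, 6], [6], [7, 8]],
                      [[8, 9], [9]],
                      [[1], [3, 4], [5]]] * 4                             # char ids per token
    toy_labels = [[0, 1, 0], [1, 0, 0, 1], [0, 0], [1, 1, 0]] * 4         # entity label per token

    train_iter = BucketNerIter(toy_sentences, toy_characters, toy_labels,
                               max_token_chars=4, batch_size=4)
    for batch in train_iter:
        # Each batch reports its bucket key and the padded shapes of sentences, characters and labels
        print(batch.bucket_key, batch.data[0].shape, batch.data[1].shape, batch.label[0].shape)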