# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os

import cv2
import numpy as np

def get_image(roi_rec, short, max_size, mean, std):
    """
    read, resize, transform image, return im_tensor, im_info, gt_boxes
    roi_rec should have keys: ["image", "boxes", "gt_classes", "flipped"]
    0 --- x (width, second dim of im)
    |
    y (height, first dim of im)
    """
    im = imdecode(roi_rec['image'])
    if roi_rec['flipped']:
        im = im[:, ::-1, :]
    im, im_scale = resize(im, short, max_size)
    height, width = im.shape[:2]
    im_info = np.array([height, width, im_scale], dtype=np.float32)
    im_tensor = transform(im, mean, std)
    # gt boxes: (x1, y1, x2, y2, cls)
    if roi_rec['gt_classes'].size > 0:
        gt_inds = np.where(roi_rec['gt_classes'] != 0)[0]
        gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32)
        gt_boxes[:, 0:4] = roi_rec['boxes'][gt_inds, :]
        gt_boxes[:, 4] = roi_rec['gt_classes'][gt_inds]
        # scale gt_boxes
        gt_boxes[:, 0:4] *= im_scale
    else:
        gt_boxes = np.empty((0, 5), dtype=np.float32)
    return im_tensor, im_info, gt_boxes
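
# Illustrative sketch of how get_image is typically called; the path, box, class id
# and mean/std values below are hypothetical examples, not part of this module:
#   roi_rec = {'image': 'data/000001.jpg',
#              'boxes': np.array([[48., 240., 195., 371.]], dtype=np.float32),
#              'gt_classes': np.array([12]), 'flipped': False}
#   im_tensor, im_info, gt_boxes = get_image(roi_rec, short=600, max_size=1000,
#                                            mean=(123.68, 116.779, 103.939), std=(1., 1., 1.))
#   im_tensor: (3, height, width); im_info: [height, width, im_scale];
#   gt_boxes: (num_objects, 5) with columns (x1, y1, x2, y2, cls), already scaled by im_scale.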

def imdecode(image_path):
    """Return BGR image read by opencv"""
    assert os.path.exists(image_path), image_path + ' not found'
    im = cv2.imread(image_path)
    return im

def resize(im, short, max_size):
    """
    resize image so that its short side equals `short`, capping the long side at `max_size`
    :param im: BGR image input by opencv
    :param short: one dimensional size (the short side)
    :param max_size: one dimensional max size (the long side)
    :return: resized image (numpy array) and scale (float)
    """
    im_shape = im.shape
    im_size_min = np.min(im_shape[0:2])
    im_size_max = np.max(im_shape[0:2])
    im_scale = float(short) / float(im_size_min)
    # prevent bigger axis from being more than max_size:
    if np.round(im_scale * im_size_max) > max_size:
        im_scale = float(max_size) / float(im_size_max)
    im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR)
    return im, im_scale
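
# Worked example (illustrative numbers only): for a 480x640 image with short=600 and
# max_size=1000, im_scale = 600 / 480 = 1.25 and the long side becomes 640 * 1.25 = 800,
# which fits under max_size, so the output is 600x800. For a 480x1000 image the scaled
# long side would be 1250 > 1000, so im_scale is clamped to 1000 / 1000 = 1.0 instead.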

def transform(im, mean, std):
    """
    transform into a CHW tensor:
    subtract pixel mean, divide by pixel std and convert BGR to RGB channel order
    :param im: [height, width, channel] in BGR
    :param mean: [RGB pixel mean]
    :param std: [RGB pixel std var]
    :return: [channel, height, width]
    """
    im_tensor = np.zeros((3, im.shape[0], im.shape[1]))
    for i in range(3):
        # channel index 2 - i flips BGR (opencv) to RGB before normalization
        im_tensor[i, :, :] = (im[:, :, 2 - i] - mean[i]) / std[i]
    return im_tensor

def transform_inverse(im_tensor, mean, std):
    """
    transform from im_tensor back to an ordinary RGB image
    im_tensor is limited to one image
    :param im_tensor: [channel, height, width]
    :param mean: [RGB pixel mean]
    :param std: [RGB pixel std var]
    :return: im [height, width, channel(RGB)]
    """
    assert im_tensor.shape[0] == 3
    im = im_tensor.transpose((1, 2, 0))
    im = im * std + mean
    im = im.astype(np.uint8)
    return im
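
# Round-trip sketch (assuming RGB-ordered mean/std): transform converts a BGR uint8 image
# to a normalized (3, height, width) float tensor in RGB order, and
# transform_inverse(transform(im, mean, std), mean, std) recovers the same pixels as an
# RGB image, up to rounding from the uint8 cast.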

def tensor_vstack(tensor_list, pad=0):
    """
    vertically stack tensors by adding a new axis
    expand dims if only 1 tensor
    :param tensor_list: list of tensors to be stacked vertically
    :param pad: label to pad with
    :return: tensor padded to the max shape along each dimension
    """
    if len(tensor_list) == 1:
        return tensor_list[0][np.newaxis, :]
    ndim = len(tensor_list[0].shape)
    dimensions = [len(tensor_list)]  # first dim is batch size
    for dim in range(ndim):
        dimensions.append(max([tensor.shape[dim] for tensor in tensor_list]))
    dtype = tensor_list[0].dtype
    if pad == 0:
        all_tensor = np.zeros(tuple(dimensions), dtype=dtype)
    elif pad == 1:
        all_tensor = np.ones(tuple(dimensions), dtype=dtype)
    else:
        all_tensor = np.full(tuple(dimensions), pad, dtype=dtype)
    if ndim == 1:
        for ind, tensor in enumerate(tensor_list):
            all_tensor[ind, :tensor.shape[0]] = tensor
    elif ndim == 2:
        for ind, tensor in enumerate(tensor_list):
            all_tensor[ind, :tensor.shape[0], :tensor.shape[1]] = tensor
    elif ndim == 3:
        for ind, tensor in enumerate(tensor_list):
            all_tensor[ind, :tensor.shape[0], :tensor.shape[1], :tensor.shape[2]] = tensor
    else:
        raise NotImplementedError('tensor_vstack only supports 1, 2 and 3 dimensional tensors')
    return all_tensor
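

if __name__ == '__main__':
    # Minimal usage sketch, not part of the library API: build one padded mini-batch from
    # two records. The image paths, boxes and class ids below are hypothetical placeholders;
    # point 'image' at real files before running.
    example_recs = [
        {'image': 'images/a.jpg',
         'boxes': np.array([[10., 20., 110., 220.]], dtype=np.float32),
         'gt_classes': np.array([1]), 'flipped': False},
        {'image': 'images/b.jpg',
         'boxes': np.array([[5., 5., 60., 80.]], dtype=np.float32),
         'gt_classes': np.array([2]), 'flipped': True},
    ]
    rgb_mean = (123.68, 116.779, 103.939)  # commonly used RGB pixel means (assumed values)
    rgb_std = (1.0, 1.0, 1.0)
    tensors, infos, boxes = [], [], []
    for rec in example_recs:
        im_tensor, im_info, gt_boxes = get_image(rec, short=600, max_size=1000,
                                                 mean=rgb_mean, std=rgb_std)
        tensors.append(im_tensor)
        infos.append(im_info)
        boxes.append(gt_boxes)
    # Pad every per-image tensor to the largest shape and stack along a new batch axis.
    batch_images = tensor_vstack(tensors)            # (2, 3, max_height, max_width)
    batch_infos = tensor_vstack(infos)               # (2, 3)
    batch_gt_boxes = tensor_vstack(boxes, pad=-1)    # (2, max_num_boxes, 5), padded with -1
    print(batch_images.shape, batch_infos.shape, batch_gt_boxes.shape)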