blob: e468e4647b976da8dc20660c7620012f29e853ba [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import numpy as np
import cv2
import os
import random
from ..config import config
def get_image(roidb):
"""
preprocess image and return processed roidb
:param roidb: a list of roidb
:return: list of img as in mxnet format
roidb add new item['im_info']
0 --- x (width, second dim of im)
|
y (height, first dim of im)
"""
num_images = len(roidb)
processed_ims = []
processed_roidb = []
for i in range(num_images):
roi_rec = roidb[i]
assert os.path.exists(roi_rec['image']), '%s does not exist'.format(roi_rec['image'])
im = cv2.imread(roi_rec['image'])
if roidb[i]['flipped']:
im = im[:, ::-1, :]
new_rec = roi_rec.copy()
scale_ind = random.randrange(len(config.SCALES))
target_size = config.SCALES[scale_ind][0]
max_size = config.SCALES[scale_ind][1]
im, im_scale = resize(im, target_size, max_size, stride=config.IMAGE_STRIDE)
im_tensor = transform(im, config.PIXEL_MEANS)
processed_ims.append(im_tensor)
im_info = [im_tensor.shape[2], im_tensor.shape[3], im_scale]
new_rec['boxes'] = roi_rec['boxes'].copy() * im_scale
new_rec['im_info'] = im_info
processed_roidb.append(new_rec)
return processed_ims, processed_roidb
def resize(im, target_size, max_size, stride=0):
"""
only resize input image to target size and return scale
:param im: BGR image input by opencv
:param target_size: one dimensional size (the short side)
:param max_size: one dimensional max size (the long side)
:param stride: if given, pad the image to designated stride
:return:
"""
im_shape = im.shape
im_size_min = np.min(im_shape[0:2])
im_size_max = np.max(im_shape[0:2])
im_scale = float(target_size) / float(im_size_min)
# prevent bigger axis from being more than max_size:
if np.round(im_scale * im_size_max) > max_size:
im_scale = float(max_size) / float(im_size_max)
im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR)
if stride == 0:
return im, im_scale
else:
# pad to product of stride
im_height = int(np.ceil(im.shape[0] / float(stride)) * stride)
im_width = int(np.ceil(im.shape[1] / float(stride)) * stride)
im_channel = im.shape[2]
padded_im = np.zeros((im_height, im_width, im_channel))
padded_im[:im.shape[0], :im.shape[1], :] = im
return padded_im, im_scale
def transform(im, pixel_means):
"""
transform into mxnet tensor,
subtract pixel size and transform to correct format
:param im: [height, width, channel] in BGR
:param pixel_means: [B, G, R pixel means]
:return: [batch, channel, height, width]
"""
im_tensor = np.zeros((1, 3, im.shape[0], im.shape[1]))
for i in range(3):
im_tensor[0, i, :, :] = im[:, :, 2 - i] - pixel_means[2 - i]
return im_tensor
def transform_inverse(im_tensor, pixel_means):
"""
transform from mxnet im_tensor to ordinary RGB image
im_tensor is limited to one image
:param im_tensor: [batch, channel, height, width]
:param pixel_means: [B, G, R pixel means]
:return: im [height, width, channel(RGB)]
"""
assert im_tensor.shape[0] == 1
im_tensor = im_tensor.copy()
# put channel back
channel_swap = (0, 2, 3, 1)
im_tensor = im_tensor.transpose(channel_swap)
im = im_tensor[0]
assert im.shape[2] == 3
im += pixel_means[[2, 1, 0]]
im = im.astype(np.uint8)
return im
def tensor_vstack(tensor_list, pad=0):
"""
vertically stack tensors
:param tensor_list: list of tensor to be stacked vertically
:param pad: label to pad with
:return: tensor with max shape
"""
ndim = len(tensor_list[0].shape)
dtype = tensor_list[0].dtype
islice = tensor_list[0].shape[0]
dimensions = []
first_dim = sum([tensor.shape[0] for tensor in tensor_list])
dimensions.append(first_dim)
for dim in range(1, ndim):
dimensions.append(max([tensor.shape[dim] for tensor in tensor_list]))
if pad == 0:
all_tensor = np.zeros(tuple(dimensions), dtype=dtype)
elif pad == 1:
all_tensor = np.ones(tuple(dimensions), dtype=dtype)
else:
all_tensor = np.full(tuple(dimensions), pad, dtype=dtype)
if ndim == 1:
for ind, tensor in enumerate(tensor_list):
all_tensor[ind*islice:(ind+1)*islice] = tensor
elif ndim == 2:
for ind, tensor in enumerate(tensor_list):
all_tensor[ind*islice:(ind+1)*islice, :tensor.shape[1]] = tensor
elif ndim == 3:
for ind, tensor in enumerate(tensor_list):
all_tensor[ind*islice:(ind+1)*islice, :tensor.shape[1], :tensor.shape[2]] = tensor
elif ndim == 4:
for ind, tensor in enumerate(tensor_list):
all_tensor[ind*islice:(ind+1)*islice, :tensor.shape[1], :tensor.shape[2], :tensor.shape[3]] = tensor
else:
raise Exception('Sorry, unimplemented.')
return all_tensor