# blob: e36125f77b17fd6e51feab45f44f2c28c51989b9 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import print_function
import os
import time
import ctypes
from mxnet.base import _LIB
from mxnet.base import check_call
import mxnet as mx
import argparse
class IndexCreator(mx.recordio.MXRecordIO):
    """Reads `RecordIO` data format, and creates index file
    that enables random access.

    Each line of the index file holds a record key, a tab character and
    the read-head position at which that record starts.

    Example usage:
    ----------
    >>> creator = IndexCreator('data/test.rec','data/test.idx')
    >>> creator.create_index()
    >>> creator.close()
    >>> !ls data/
    test.rec  test.idx

    Parameters
    ----------
    uri : str
        Path to the record file.
    idx_path : str
        Path to the index file, that will be created/overwritten.
    key_type : type
        Data type for keys (optional, default = int).
    """

    def __init__(self, uri, idx_path, key_type=int):
        # These attributes are assigned before the base constructor runs
        # because `open()` (overridden below) reads `idx_path`.
        # NOTE(review): this relies on the base constructor calling
        # `open()` -- confirm against the MXRecordIO implementation.
        self.key_type = key_type
        self.fidx = None
        self.idx_path = idx_path
        super().__init__(uri, 'r')

    def open(self):
        """Opens the record file and (re)creates the index file for writing."""
        super().open()
        # Mode 'w' truncates any existing index file.
        self.fidx = open(self.idx_path, 'w')

    def close(self):
        """Closes the record and index files."""
        if not self.is_open:
            return
        try:
            super().close()
        finally:
            # Close the index file even if closing the record file raised.
            # `fidx` is still None when opening the index file failed.
            if self.fidx is not None:
                self.fidx.close()

    def tell(self):
        """Returns the current position of read head.

        The value is the one reported by `MXRecordIOReaderTell` for the
        open record handle.
        """
        pos = ctypes.c_size_t()
        check_call(_LIB.MXRecordIOReaderTell(self.handle, ctypes.byref(pos)))
        return pos.value

    def create_index(self):
        """Creates the index file from open record file.

        The record file is rewound and read sequentially. For every record
        one line `<key>TAB<position>` is written, where the key is
        `key_type(n)` for the n-th record (counting from 0) and the
        position is the value of `tell()` taken just before the read.
        Progress (elapsed seconds and record count) is printed every
        1000 records.
        """
        self.reset()
        counter = 0
        pre_time = time.time()
        while True:
            if counter % 1000 == 0:
                cur_time = time.time()
                print('time:', cur_time - pre_time, ' count:', counter)
            # The position must be sampled before reading: it is the
            # offset at which the record about to be read begins.
            pos = self.tell()
            cont = self.read()
            if cont is None:
                # End of the record file.
                break
            key = self.key_type(counter)
            self.fidx.write(f'{key!s}\t{pos}\n')
            counter += 1
        # Push buffered lines to disk so the index is complete as soon as
        # this method returns, not only after close().
        self.fidx.flush()
def parse_args(argv=None):
    """Parse the command-line arguments of the index creation script.

    Parameters
    ----------
    argv : list of str, optional
        Argument strings to parse. When None (the default), argparse
        reads them from `sys.argv[1:]`.

    Returns
    -------
    argparse.Namespace
        Namespace with `record` and `index` attributes, both converted
        to absolute paths.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Create an index file from .rec file')
    parser.add_argument('record', help='path to .rec file.')
    parser.add_argument('index', help='path to index file.')
    args = parser.parse_args(argv)
    # Resolve against the current working directory once, up front.
    args.record = os.path.abspath(args.record)
    args.index = os.path.abspath(args.index)
    return args
if __name__ == '__main__':
    # Script entry point: build the index for the record file named on
    # the command line.
    args = parse_args()
    creator = IndexCreator(args.record, args.index)
    try:
        creator.create_index()
    finally:
        # Release the record and index files even if indexing fails.
        creator.close()