blob: 5564c3830cb693464aea9258b9da54b1f7be5292 [file] [log] [blame]
/************************************************************
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*************************************************************/
/**
* Create the training store, the test store and the image-mean record for the
* CIFAR dataset. It is adapted from convert_cifar_data from Caffe.
*
* Usage: create_data.bin <input_folder> <output_folder>
*
* The storage backend is chosen from the output folder name: a folder whose
* name contains "hdfs" is written with the "hdfsfile" backend, anything else
* with the "kvfile" backend.
* To load to HDFS, specify e.g. "hdfs://namenode/examples" as the output folder.
*/
#include <glog/logging.h>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <string>
#include "singa/io/store.h"
#include "singa/proto/common.pb.h"
#include "singa/utils/common.h"
using std::string;
// Layout of the CIFAR binary batch files consumed by this tool.
constexpr int kCIFARSize = 32;           // image width and height, in pixels
constexpr int kCIFARImageNBytes = 3072;  // pixel bytes per image (3 * 32 * 32)
constexpr int kCIFARBatchSize = 10000;   // images read from each batch file
constexpr int kCIFARTrainBatches = 5;    // number of training batch files

/**
 * Read one record from an open binary batch file: a single label byte
 * followed by kCIFARImageNBytes pixel bytes.
 *
 * @param file   input stream positioned at the start of a record.
 * @param label  receives the label byte as a value in [0, 255].
 * @param buffer receives the pixel bytes; must have room for at least
 *               kCIFARImageNBytes bytes.
 *
 * The stream state is not inspected here. A caller that needs to detect a
 * short or failed read must test the stream after the call; in that case
 * *label is 0 if the label byte itself could not be read.
 */
void read_image(std::ifstream* file, int* label, char* buffer) {
  // Zero-initialised so that *label is well defined even if the read fails.
  char label_char = 0;
  file->read(&label_char, 1);
  // Convert through unsigned char: plain char may be signed, which would
  // turn label bytes >= 128 into negative ints.
  *label = static_cast<unsigned char>(label_char);
  file->read(buffer, kCIFARImageNBytes);
}
void create_data(const string& input_folder, const string& output_folder) {
int label;
char str_buffer[kCIFARImageNBytes];
string rec_buf;
singa::RecordProto image;
image.add_shape(3);
image.add_shape(kCIFARSize);
image.add_shape(kCIFARSize);
singa::RecordProto mean;
mean.CopyFrom(image);
for (int i = 0; i < kCIFARImageNBytes; i++)
mean.add_data(0.f);
string store_backend = output_folder.find("hdfs") !=-1 ?
"hdfsfile" : "kvfile";
auto store = singa::io::CreateStore(store_backend);
CHECK(store->Open(output_folder + "/train_data.bin", singa::io::kCreate));
LOG(INFO) << "Preparing training data";
int count = 0;
for (int fileid = 0; fileid < kCIFARTrainBatches; ++fileid) {
LOG(INFO) << "Training Batch " << fileid + 1;
snprintf(str_buffer, kCIFARImageNBytes, "/data_batch_%d.bin", fileid + 1);
std::ifstream data_file((input_folder + str_buffer).c_str(),
std::ios::in | std::ios::binary);
CHECK(data_file.is_open()) << "Unable to open train file #" << fileid + 1;
for (int itemid = 0; itemid < kCIFARBatchSize; ++itemid) {
read_image(&data_file, &label, str_buffer);
image.set_label(label);
image.set_pixel(str_buffer, kCIFARImageNBytes);
image.SerializeToString(&rec_buf);
int length = snprintf(str_buffer, kCIFARImageNBytes, "%05d", count);
CHECK(store->Write(string(str_buffer, length), rec_buf));
const string& pixels = image.pixel();
for (int i = 0; i < kCIFARImageNBytes; i++)
mean.set_data(i, mean.data(i) + static_cast<uint8_t>(pixels[i]));
count += 1;
}
}
store->Flush();
store->Close();
LOG(INFO) << "Create image mean";
store->Open(output_folder + "/image_mean.bin", singa::io::kCreate);
for (int i = 0; i < kCIFARImageNBytes; i++)
mean.set_data(i, mean.data(i) / count);
mean.SerializeToString(&rec_buf);
store->Write("mean", rec_buf);
store->Flush();
store->Close();
LOG(INFO) << "Create test data";
store->Open(output_folder + "/test_data.bin", singa::io::kCreate);
std::ifstream data_file((input_folder + "/test_batch.bin").c_str(),
std::ios::in | std::ios::binary);
CHECK(data_file.is_open()) << "Unable to open test file.";
for (int itemid = 0; itemid < kCIFARBatchSize; ++itemid) {
read_image(&data_file, &label, str_buffer);
image.set_label(label);
image.set_pixel(str_buffer, kCIFARImageNBytes);
image.SerializeToString(&rec_buf);
int length = snprintf(str_buffer, kCIFARImageNBytes, "%05d", itemid);
CHECK(store->Write(string(str_buffer, length), rec_buf));
}
store->Flush();
store->Close();
}
/**
 * Entry point. Expects exactly two arguments: the input folder holding the
 * binary batch files and the output folder for the generated stores.
 *
 * @return 0 after a conversion run, 1 when the argument count is wrong.
 */
int main(int argc, char** argv) {
  if (argc != 3) {
    // A wrong invocation is an error: report it on stderr and exit non-zero
    // so that calling scripts can detect it.
    std::cerr << "Create train and test DataShard for Cifar dataset.\n"
              << "Usage:\n"
              << " create_data.bin input_folder output_folder\n"
              << "Where the input folder should contain the binary batch files.\n";
    return 1;
  }
  google::InitGoogleLogging(argv[0]);
  create_data(string(argv[1]), string(argv[2]));
  return 0;
}