blob: 2da5d7097d96cd5d2cdb55e740de34512f80310f [file] [log] [blame]
from __future__ import print_function
import csv
import os
import sys
import random
import numpy as np
import argparse
parser = argparse.ArgumentParser(description='generate train/test image list files form input directory. If training it will also split into tr and va sets.')
parser.add_argument('--image-folder', type=str, default="data/train/",
help='the input data directory')
parser.add_argument('--out-folder', type=str, default="data/",
help='the output folder')
parser.add_argument('--out-file', type=str, default="train.lst",
help='the output lst file')
parser.add_argument('--train', action='store_true',
help='if we are generating training list and hence we have to loop over subdirectories')
## These options are only used if we are doing training lst
parser.add_argument('--percent-val', type=float, default=0.25,
help='the percentage of training list to use as validation')
parser.add_argument('--stratified', action='store_true',
help='if True it will split train lst into tr and va sets using stratified sampling')
args = parser.parse_args()
random.seed(888)
fo_name=os.path.join(args.out_folder+args.out_file)
fo = csv.writer(open(fo_name, "w"), delimiter='\t', lineterminator='\n')
if args.train:
tr_fo_name=os.path.join(args.out_folder+"tr.lst")
va_fo_name=os.path.join(args.out_folder+"va.lst")
tr_fo = csv.writer(open(tr_fo_name, "w"), delimiter='\t', lineterminator='\n')
va_fo = csv.writer(open(va_fo_name, "w"), delimiter='\t', lineterminator='\n')
#check sampleSubmission.csv from kaggle website to view submission format
head = "acantharia_protist_big_center,acantharia_protist_halo,acantharia_protist,amphipods,appendicularian_fritillaridae,appendicularian_s_shape,appendicularian_slight_curve,appendicularian_straight,artifacts_edge,artifacts,chaetognath_non_sagitta,chaetognath_other,chaetognath_sagitta,chordate_type1,copepod_calanoid_eggs,copepod_calanoid_eucalanus,copepod_calanoid_flatheads,copepod_calanoid_frillyAntennae,copepod_calanoid_large_side_antennatucked,copepod_calanoid_large,copepod_calanoid_octomoms,copepod_calanoid_small_longantennae,copepod_calanoid,copepod_cyclopoid_copilia,copepod_cyclopoid_oithona_eggs,copepod_cyclopoid_oithona,copepod_other,crustacean_other,ctenophore_cestid,ctenophore_cydippid_no_tentacles,ctenophore_cydippid_tentacles,ctenophore_lobate,decapods,detritus_blob,detritus_filamentous,detritus_other,diatom_chain_string,diatom_chain_tube,echinoderm_larva_pluteus_brittlestar,echinoderm_larva_pluteus_early,echinoderm_larva_pluteus_typeC,echinoderm_larva_pluteus_urchin,echinoderm_larva_seastar_bipinnaria,echinoderm_larva_seastar_brachiolaria,echinoderm_seacucumber_auricularia_larva,echinopluteus,ephyra,euphausiids_young,euphausiids,fecal_pellet,fish_larvae_deep_body,fish_larvae_leptocephali,fish_larvae_medium_body,fish_larvae_myctophids,fish_larvae_thin_body,fish_larvae_very_thin_body,heteropod,hydromedusae_aglaura,hydromedusae_bell_and_tentacles,hydromedusae_h15,hydromedusae_haliscera_small_sideview,hydromedusae_haliscera,hydromedusae_liriope,hydromedusae_narco_dark,hydromedusae_narco_young,hydromedusae_narcomedusae,hydromedusae_other,hydromedusae_partial_dark,hydromedusae_shapeA_sideview_small,hydromedusae_shapeA,hydromedusae_shapeB,hydromedusae_sideview_big,hydromedusae_solmaris,hydromedusae_solmundella,hydromedusae_typeD_bell_and_tentacles,hydromedusae_typeD,hydromedusae_typeE,hydromedusae_typeF,invertebrate_larvae_other_A,invertebrate_larvae_other_B,jellies_tentacles,polychaete,protist_dark_center,protist_fuzzy_olive,protist_noctiluca,protist_other,protist_star,pteropod_butterfly,pteropod_theco_dev_seq,pteropod_triangle,radiolarian_chain,radiolarian_colony,shrimp_caridean,shrimp_sergestidae,shrimp_zoea,shrimp-like_other,siphonophore_calycophoran_abylidae,siphonophore_calycophoran_rocketship_adult,siphonophore_calycophoran_rocketship_young,siphonophore_calycophoran_sphaeronectes_stem,siphonophore_calycophoran_sphaeronectes_young,siphonophore_calycophoran_sphaeronectes,siphonophore_other_parts,siphonophore_partial,siphonophore_physonect_young,siphonophore_physonect,stomatopod,tornaria_acorn_worm_larvae,trichodesmium_bowtie,trichodesmium_multiple,trichodesmium_puff,trichodesmium_tuft,trochophore_larvae,tunicate_doliolid_nurse,tunicate_doliolid,tunicate_partial,tunicate_salp_chains,tunicate_salp,unknown_blobs_and_smudges,unknown_sticks,unknown_unclassified".split(',')
# make image list
img_lst = []
cnt = 0
if args.train:
for i in xrange(len(head)):
path = args.image_folder + head[i]
lst = os.listdir(args.image_folder + head[i])
for img in lst:
img_lst.append((cnt, i, path + '/' + img))
cnt += 1
else:
lst = os.listdir(args.image_folder)
for img in lst:
img_lst.append((cnt, 0, args.image_folder + img))
cnt += 1
# shuffle
random.shuffle(img_lst)
#write
for item in img_lst:
fo.writerow(item)
## If training, split into train and validation lists (tr.lst and va.lst)
## Optional stratified sampling
if args.train:
img_lst=np.array(img_lst)
if args.stratified:
from sklearn.cross_validation import StratifiedShuffleSplit
## Stratified sampling to generate train and validation sets
labels_train=img_lst[:,1]
# unique_train, counts_train = np.unique(labels_train, return_counts=True) # To have a look at the frecuency distribution
sss = StratifiedShuffleSplit(labels_train, 1, test_size=args.percent_val, random_state=0)
for tr_idx, va_idx in sss:
print("Train subset has ", len(tr_idx), " cases. Validation subset has ", len(va_idx), "cases")
else:
(nRows, nCols) = img_lst.shape
splitat=int(round(nRows*(1-args.percent_val),0))
tr_idx=range(0,splitat)
va_idx=range(splitat,nRows)
print("Train subset has ", len(tr_idx), " cases. Validation subset has ", len(va_idx), "cases")
tr_lst=img_lst[tr_idx,:].tolist()
va_lst=img_lst[va_idx,:].tolist()
for item in tr_lst:
tr_fo.writerow(item)
for item in va_lst:
va_fo.writerow(item)