#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import glob

import numpy as np
import sklearn.model_selection
import sklearn.preprocessing  # required by uci_loader's StandardScaler
import torch
from scipy.io.arff import loadarff
from torch.utils.data import Dataset, DataLoader, TensorDataset
from tqdm import tqdm


def load_data(data_dir, namespace):
    print(f'# loading data from '
          f'{data_dir}/{namespace}_feat_id.pt, '
          f'{data_dir}/{namespace}_feat_value.pt, '
          f'{data_dir}/{namespace}_y.pt ......')
    feat_id = torch.load(f'{data_dir}/{namespace}_feat_id.pt')
    feat_value = torch.load(f'{data_dir}/{namespace}_feat_value.pt')
    y = torch.load(f'{data_dir}/{namespace}_y.pt')
    print(f'# {int(y.shape[0])} data samples loaded...')
    return feat_id, feat_value, y, int(y.shape[0])
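
# Usage sketch (illustrative; assumes the three decoded tensor files already
# exist under `data_dir`, e.g. produced by an offline preprocessing step):
#
#   feat_id, feat_value, y, n = load_data('./data/frappe', 'decoded_train')
#   # feat_id:    LongTensor  - per-sample feature ids
#   # feat_value: FloatTensor - per-sample feature values
#   # y:          FloatTensor - labels, length n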


class LibsvmDatasetReadOnce(Dataset):
    """ Dataset loader for Libsvm data format, reading pre-decoded tensors """

    def __init__(self, fname):
        parent_directory = os.path.dirname(fname)
        if "train" in fname:
            namespace = "decoded_train"
        elif "valid" in fname:
            namespace = "decoded_valid"
        else:
            raise ValueError(f"cannot infer train/valid namespace from file name '{fname}'")
        self.feat_id, self.feat_value, self.y, self.nsamples = load_data(parent_directory, namespace)

    def __len__(self):
        return self.nsamples

    def __getitem__(self, idx):
        return {'id': self.feat_id[idx],
                'value': self.feat_value[idx],
                'y': self.y[idx]}
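
# Usage sketch (illustrative; the file name must contain "train" or "valid" so
# the decoded-tensor namespace can be inferred):
#
#   train_set = LibsvmDatasetReadOnce('./data/frappe/train.libsvm')
#   sample = train_set[0]   # {'id': LongTensor, 'value': FloatTensor, 'y': scalar}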


class LibsvmDataset(Dataset):
    """ Dataset loader for Libsvm data format, decoding the text file line by line """

    def __init__(self, fname, nfields, max_load=-1):

        def decode_libsvm(line):
            columns = line.split(' ')
            map_func = lambda pair: (int(pair[0]), float(pair[1]))
            ids, values = zip(*map(lambda col: map_func(col.split(':')), columns[1:]))
            sample = {'id': torch.LongTensor(ids),
                      'value': torch.FloatTensor(values),
                      'y': float(columns[0])}
            return sample

        # count lines first so the tensors can be pre-allocated in one shot
        with open(fname) as f:
            sample_lines = sum(1 for line in f)
        self.feat_id = torch.LongTensor(sample_lines, nfields)
        self.feat_value = torch.FloatTensor(sample_lines, nfields)
        self.y = torch.FloatTensor(sample_lines)

        self.nsamples = 0
        with tqdm(total=sample_lines) as pbar:
            with open(fname) as fp:
                line = fp.readline()
                while line:
                    # stop once exactly max_load samples have been loaded
                    if 0 < max_load <= self.nsamples:
                        break
                    try:
                        sample = decode_libsvm(line)
                        self.feat_id[self.nsamples] = sample['id']
                        self.feat_value[self.nsamples] = sample['value']
                        self.y[self.nsamples] = sample['y']
                        self.nsamples += 1
                    except Exception:
                        print(f'incorrect data format line "{line}" !')
                    line = fp.readline()
                    pbar.update(1)
        print(f'# {self.nsamples} data samples loaded...')

    def __len__(self):
        return self.nsamples

    def __getitem__(self, idx):
        return {'id': self.feat_id[idx],
                'value': self.feat_value[idx],
                'y': self.y[idx]}
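
# Usage sketch (illustrative; `nfields` must match the fixed number of
# "id:value" pairs per libsvm line, e.g. "1 3:1.0 17:1.0 42:0.5"):
#
#   dataset = LibsvmDataset('./data/frappe/train.libsvm', nfields=10)
#   loader = DataLoader(dataset, batch_size=32, shuffle=True)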


def libsvm_dataloader(args, data_dir, nfield, batch_size):
    print("Loading data from ", data_dir)
    workers = args.workers
    train_file_name = f"{data_dir}/train.libsvm"
    valid_file_name = f"{data_dir}/valid.libsvm"
    test_file_name = f"{data_dir}/test.libsvm"
    print(f"using train={train_file_name}, valid={valid_file_name}")
    # read the pre-decoded tensors stored next to the libsvm files
    if args.device == "cpu":
        train_loader = DataLoader(LibsvmDatasetReadOnce(train_file_name),
                                  batch_size=batch_size,
                                  shuffle=True)
        val_loader = DataLoader(LibsvmDatasetReadOnce(valid_file_name),
                                batch_size=batch_size * 8,
                                shuffle=False)
    else:
        train_loader = DataLoader(LibsvmDatasetReadOnce(train_file_name),
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=workers,
                                  pin_memory=False)
        val_loader = DataLoader(LibsvmDatasetReadOnce(valid_file_name),
                                batch_size=batch_size * 8,
                                shuffle=False,
                                num_workers=workers,
                                pin_memory=False)
    # the validation loader doubles as the test loader here
    return train_loader, val_loader, val_loader
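
# Usage sketch (illustrative; assumes an `args` namespace with `workers` and
# `device` attributes, e.g. built with argparse):
#
#   train_loader, val_loader, test_loader = libsvm_dataloader(
#       args, data_dir='./data/frappe', nfield=10, batch_size=1024)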


def libsvm_dataloader_ori(args):
    data_dir = args.base_dir + args.dataset
    print(data_dir)
    train_file = glob.glob("%s/tr*libsvm" % data_dir)[0]
    val_file = glob.glob("%s/va*libsvm" % data_dir)[0]
    test_file = glob.glob("%s/te*libsvm" % data_dir)[0]

    train_loader = DataLoader(LibsvmDataset(train_file, args.nfield, args.max_load),
                              batch_size=args.batch_size, shuffle=True,
                              num_workers=args.workers, pin_memory=True)
    val_loader = DataLoader(LibsvmDataset(val_file, args.nfield, args.max_load),
                            batch_size=args.batch_size, shuffle=False,
                            num_workers=args.workers, pin_memory=True)
    # test_loader = DataLoader(LibsvmDataset(test_file, args.nfield),
    #                          batch_size=args.batch_size, shuffle=False,
    #                          num_workers=args.workers, pin_memory=True)
    return train_loader, val_loader, val_loader


class UCILibsvmDataset(Dataset):
    """ Dataset loader for loading UCI datasets in Libsvm format """

    def __init__(self, X, y):
        assert X.shape[0] == y.shape[0]
        self.nsamples, self.nfeat = X.shape
        self.feat_id = torch.LongTensor(self.nsamples, self.nfeat)
        self.feat_value = torch.FloatTensor(self.nsamples, self.nfeat)
        self.y = torch.FloatTensor(self.nsamples)

        with tqdm(total=self.nsamples) as pbar:
            # UCI features are dense, so every sample shares the same id vector
            ids = torch.LongTensor(range(self.nfeat))
            for idx in range(self.nsamples):
                self.feat_id[idx] = ids
                self.feat_value[idx] = torch.FloatTensor(X[idx])
                self.y[idx] = y[idx]
                pbar.update(1)
        print(f'Data loader: {self.nsamples} data samples')

    def __len__(self):
        return self.nsamples

    def __getitem__(self, idx):
        return {'id': self.feat_id[idx],
                'value': self.feat_value[idx],
                'y': self.y[idx]}
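
# Usage sketch (illustrative; X is a dense [n, d] float array, y a length-n
# label array, e.g. as produced by uci_loader's internal ARFF reader):
#
#   dataset = UCILibsvmDataset(np.random.rand(100, 8).astype(np.float32),
#                              np.random.randint(0, 2, 100))
#   print(dataset[0]['id'])   # tensor([0, 1, 2, 3, 4, 5, 6, 7])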


def uci_loader(data_dir, batch_size, valid_perc=0., libsvm=False, workers=4):
    '''
    :param data_dir: Path to the uci dataset folder
    :param batch_size: Batch size
    :param valid_perc: percentage of the train set split off for validation
                       (default 0, i.e. validate on the whole train set)
    :param libsvm: use the Libsvm-style loader yielding {'id', 'value', 'y'}
    :param workers: the number of subprocesses used to load data
    :return: train/valid/test loaders, with train_loader.nclass attached
    '''

    def uci_validation_set(X, y, split_perc=0.2):
        return sklearn.model_selection.train_test_split(
            X, y, test_size=split_perc, random_state=0)

    def make_loader(X, y, transformer=None, batch_size=64, shuffle=False):
        # fit a scaler on first use; reuse the caller's transformer otherwise
        if transformer is None:
            transformer = sklearn.preprocessing.StandardScaler()
            transformer.fit(X)
        X = transformer.transform(X)
        if libsvm:
            return DataLoader(UCILibsvmDataset(X, y),
                              batch_size=batch_size,
                              shuffle=shuffle,
                              num_workers=workers, pin_memory=True
                              ), transformer
        else:
            return DataLoader(
                dataset=TensorDataset(*[torch.from_numpy(e) for e in [X, y]]),
                batch_size=batch_size,
                shuffle=shuffle,
                num_workers=workers, pin_memory=True
            ), transformer

    def uci_folder_to_name(f):
        return os.path.basename(os.path.normpath(f))

    def line_to_idx(l):
        return np.array([int(e) for e in l.split()], dtype=np.int32)

    def load_uci_dataset(folder, train=True):
        full_file = f'{folder}/{uci_folder_to_name(folder)}.arff'
        if os.path.exists(full_file):
            # single ARFF file plus a conxuntos.dat file holding the
            # train/test index split (one line of indices each)
            data = loadarff(full_file)
            with open(f'{folder}/conxuntos.dat') as f:
                train_idx, test_idx = [line_to_idx(l) for l in f.readlines()]
            assert len(set(train_idx) & set(test_idx)) == 0
            all_idx = list(train_idx) + list(test_idx)
            assert len(all_idx) == np.max(all_idx) + 1
            assert np.min(all_idx) == 0
            if train:
                data = (data[0][train_idx], data[1])
            else:
                data = (data[0][test_idx], data[1])
        else:
            # otherwise the split is pre-materialized as *_train/*_test files
            typename = 'train' if train else 'test'
            filename = f'{folder}/{uci_folder_to_name(folder)}_{typename}.arff'
            data = loadarff(filename)
        # all attributes must be numeric except the final nominal label,
        # which this UCI collection names 'clase'
        assert data[1].types() == ['numeric'] * (len(data[1].types()) - 1) + ['nominal']
        X = np.array(data[0][data[1].names()[:-1]].tolist())
        y = np.array([int(e) for e in data[0][data[1].names()[-1]]])
        nclass = len(data[1]['clase'][1])
        return X.astype(np.float32), y, nclass
    Xtrain, ytrain, nclass = load_uci_dataset(data_dir)
    if valid_perc > 0:
        Xtrain, Xvalid, ytrain, yvalid = uci_validation_set(Xtrain, ytrain, split_perc=valid_perc)
        train_loader, scaler = make_loader(Xtrain, ytrain, batch_size=batch_size, shuffle=True)
        valid_loader, _ = make_loader(Xvalid, yvalid, transformer=scaler, batch_size=batch_size)
    else:
        train_loader, scaler = make_loader(Xtrain, ytrain, batch_size=batch_size, shuffle=True)
        valid_loader = train_loader
    print(f'{uci_folder_to_name(data_dir)}: {len(ytrain)} training samples loaded.')

    Xtest, ytest, _ = load_uci_dataset(data_dir, False)
    # reuse the scaler fitted on the training split for the test set
    test_loader, _ = make_loader(Xtest, ytest, transformer=scaler, batch_size=batch_size)
    print(f'{uci_folder_to_name(data_dir)}: {len(ytest)} testing samples loaded.')
    train_loader.nclass = nclass
    return train_loader, valid_loader, test_loader
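
# Usage sketch (illustrative; expects either '<name>/<name>.arff' plus a
# 'conxuntos.dat' train/test index split, or '<name>_train.arff' and
# '<name>_test.arff' files, with a nominal label attribute named 'clase'):
#
#   train_loader, valid_loader, test_loader = uci_loader(
#       './uci/abalone', batch_size=128, valid_perc=0.2)
#   print(train_loader.nclass)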