#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import glob

import numpy as np
import sklearn.model_selection
import sklearn.preprocessing  # required by uci_loader's StandardScaler
import torch
from scipy.io.arff import loadarff
from torch.utils.data import Dataset, DataLoader, TensorDataset
from tqdm import tqdm


def load_data(data_dir, namespace):
    print(f'# loading data from '
          f'{data_dir}/{namespace}_feat_id.pt, '
          f'{data_dir}/{namespace}_feat_value.pt, '
          f'{data_dir}/{namespace}_y.pt ......')
    feat_id = torch.load(f'{data_dir}/{namespace}_feat_id.pt')
    feat_value = torch.load(f'{data_dir}/{namespace}_feat_value.pt')
    y = torch.load(f'{data_dir}/{namespace}_y.pt')
    print(f'# {int(y.shape[0])} data samples loaded...')
    return feat_id, feat_value, y, int(y.shape[0])
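
# Usage sketch (illustrative; assumes the three decoded tensor files already
# exist under `data_dir`, e.g. produced by an offline preprocessing step):
#
#   feat_id, feat_value, y, n = load_data('./data/frappe', 'decoded_train')
#   # feat_id:    LongTensor  - per-sample feature ids
#   # feat_value: FloatTensor - per-sample feature values
#   # y:          FloatTensor - labels, length n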


class LibsvmDatasetReadOnce(Dataset):
    """ Dataset loader for Libsvm data format, reading pre-decoded tensors """

    def __init__(self, fname):
        parent_directory = os.path.dirname(fname)
        if "train" in fname:
            namespace = "decoded_train"
        elif "valid" in fname:
            namespace = "decoded_valid"
        else:
            raise ValueError(f"cannot infer train/valid namespace from file name '{fname}'")
        self.feat_id, self.feat_value, self.y, self.nsamples = load_data(parent_directory, namespace)

    def __len__(self):
        return self.nsamples

    def __getitem__(self, idx):
        return {'id': self.feat_id[idx],
                'value': self.feat_value[idx],
                'y': self.y[idx]}
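
# Usage sketch (illustrative; the file name must contain "train" or "valid" so
# the decoded-tensor namespace can be inferred):
#
#   train_set = LibsvmDatasetReadOnce('./data/frappe/train.libsvm')
#   sample = train_set[0]   # {'id': LongTensor, 'value': FloatTensor, 'y': scalar}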


class LibsvmDataset(Dataset):
    """ Dataset loader for Libsvm data format, decoding the text file line by line """

    def __init__(self, fname, nfields, max_load=-1):

        def decode_libsvm(line):
            columns = line.split(' ')
            map_func = lambda pair: (int(pair[0]), float(pair[1]))
            ids, values = zip(*map(lambda col: map_func(col.split(':')), columns[1:]))
            sample = {'id': torch.LongTensor(ids),
                      'value': torch.FloatTensor(values),
                      'y': float(columns[0])}
            return sample

        # count lines first so the tensors can be pre-allocated in one shot
        with open(fname) as f:
            sample_lines = sum(1 for line in f)
        self.feat_id = torch.LongTensor(sample_lines, nfields)
        self.feat_value = torch.FloatTensor(sample_lines, nfields)
        self.y = torch.FloatTensor(sample_lines)

        self.nsamples = 0
        with tqdm(total=sample_lines) as pbar:
            with open(fname) as fp:
                line = fp.readline()
                while line:
                    # stop once exactly max_load samples have been loaded
                    if 0 < max_load <= self.nsamples:
                        break
                    try:
                        sample = decode_libsvm(line)
                        self.feat_id[self.nsamples] = sample['id']
                        self.feat_value[self.nsamples] = sample['value']
                        self.y[self.nsamples] = sample['y']
                        self.nsamples += 1
                    except Exception:
                        print(f'incorrect data format line "{line}" !')
                    line = fp.readline()
                    pbar.update(1)
        print(f'# {self.nsamples} data samples loaded...')

    def __len__(self):
        return self.nsamples

    def __getitem__(self, idx):
        return {'id': self.feat_id[idx],
                'value': self.feat_value[idx],
                'y': self.y[idx]}
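
# Usage sketch (illustrative; `nfields` must match the fixed number of
# "id:value" pairs per libsvm line, e.g. "1 3:1.0 17:1.0 42:0.5"):
#
#   dataset = LibsvmDataset('./data/frappe/train.libsvm', nfields=10)
#   loader = DataLoader(dataset, batch_size=32, shuffle=True)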


def libsvm_dataloader(args, data_dir, nfield, batch_size):
    print("Loading data from ", data_dir)
    workers = args.workers
    train_file_name = f"{data_dir}/train.libsvm"
    valid_file_name = f"{data_dir}/valid.libsvm"
    test_file_name = f"{data_dir}/test.libsvm"
    print(f"using train={train_file_name}, valid={valid_file_name}")
    # read the pre-decoded tensors stored next to the libsvm files
    if args.device == "cpu":
        train_loader = DataLoader(LibsvmDatasetReadOnce(train_file_name),
                                  batch_size=batch_size,
                                  shuffle=True)
        val_loader = DataLoader(LibsvmDatasetReadOnce(valid_file_name),
                                batch_size=batch_size * 8,
                                shuffle=False)
    else:
        train_loader = DataLoader(LibsvmDatasetReadOnce(train_file_name),
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=workers,
                                  pin_memory=False)
        val_loader = DataLoader(LibsvmDatasetReadOnce(valid_file_name),
                                batch_size=batch_size * 8,
                                shuffle=False,
                                num_workers=workers,
                                pin_memory=False)
    # the validation loader doubles as the test loader here
    return train_loader, val_loader, val_loader
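
# Usage sketch (illustrative; assumes an `args` namespace with `workers` and
# `device` attributes, e.g. built with argparse):
#
#   train_loader, val_loader, test_loader = libsvm_dataloader(
#       args, data_dir='./data/frappe', nfield=10, batch_size=1024)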


def libsvm_dataloader_ori(args):
    data_dir = args.base_dir + args.dataset
    print(data_dir)
    train_file = glob.glob("%s/tr*libsvm" % data_dir)[0]
    val_file = glob.glob("%s/va*libsvm" % data_dir)[0]
    test_file = glob.glob("%s/te*libsvm" % data_dir)[0]

    train_loader = DataLoader(LibsvmDataset(train_file, args.nfield, args.max_load),
                              batch_size=args.batch_size, shuffle=True,
                              num_workers=args.workers, pin_memory=True)
    val_loader = DataLoader(LibsvmDataset(val_file, args.nfield, args.max_load),
                            batch_size=args.batch_size, shuffle=False,
                            num_workers=args.workers, pin_memory=True)
    # test_loader = DataLoader(LibsvmDataset(test_file, args.nfield),
    #                          batch_size=args.batch_size, shuffle=False,
    #                          num_workers=args.workers, pin_memory=True)
    return train_loader, val_loader, val_loader


class UCILibsvmDataset(Dataset):
    """ Dataset loader for loading UCI datasets in Libsvm format """

    def __init__(self, X, y):
        assert X.shape[0] == y.shape[0]
        self.nsamples, self.nfeat = X.shape
        self.feat_id = torch.LongTensor(self.nsamples, self.nfeat)
        self.feat_value = torch.FloatTensor(self.nsamples, self.nfeat)
        self.y = torch.FloatTensor(self.nsamples)

        with tqdm(total=self.nsamples) as pbar:
            # UCI features are dense, so every sample shares the same id vector
            ids = torch.LongTensor(range(self.nfeat))
            for idx in range(self.nsamples):
                self.feat_id[idx] = ids
                self.feat_value[idx] = torch.FloatTensor(X[idx])
                self.y[idx] = y[idx]
                pbar.update(1)
        print(f'Data loader: {self.nsamples} data samples')

    def __len__(self):
        return self.nsamples

    def __getitem__(self, idx):
        return {'id': self.feat_id[idx],
                'value': self.feat_value[idx],
                'y': self.y[idx]}
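
# Usage sketch (illustrative; X is a dense [n, d] float array, y a length-n
# label array, e.g. as produced by uci_loader's internal ARFF reader):
#
#   dataset = UCILibsvmDataset(np.random.rand(100, 8).astype(np.float32),
#                              np.random.randint(0, 2, 100))
#   print(dataset[0]['id'])   # tensor([0, 1, 2, 3, 4, 5, 6, 7])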


def uci_loader(data_dir, batch_size, valid_perc=0., libsvm=False, workers=4):
    '''
    :param data_dir: Path to the uci dataset folder
    :param batch_size: Batch size
    :param valid_perc: percentage of the train set split off for validation
                       (default 0, i.e. validate on the whole train set)
    :param libsvm: use the Libsvm-style loader yielding {'id', 'value', 'y'}
    :param workers: the number of subprocesses used to load data
    :return: train/valid/test loaders, with train_loader.nclass attached
    '''

    def uci_validation_set(X, y, split_perc=0.2):
        return sklearn.model_selection.train_test_split(
            X, y, test_size=split_perc, random_state=0)

    def make_loader(X, y, transformer=None, batch_size=64, shuffle=False):
        # fit a scaler on first use; reuse the caller's transformer otherwise
        if transformer is None:
            transformer = sklearn.preprocessing.StandardScaler()
            transformer.fit(X)
        X = transformer.transform(X)
        if libsvm:
            return DataLoader(UCILibsvmDataset(X, y),
                              batch_size=batch_size,
                              shuffle=shuffle,
                              num_workers=workers, pin_memory=True
                              ), transformer
        else:
            return DataLoader(
                dataset=TensorDataset(*[torch.from_numpy(e) for e in [X, y]]),
                batch_size=batch_size,
                shuffle=shuffle,
                num_workers=workers, pin_memory=True
            ), transformer

    def uci_folder_to_name(f):
        return os.path.basename(os.path.normpath(f))

    def line_to_idx(l):
        return np.array([int(e) for e in l.split()], dtype=np.int32)

    def load_uci_dataset(folder, train=True):
        full_file = f'{folder}/{uci_folder_to_name(folder)}.arff'
        if os.path.exists(full_file):
            # single ARFF file plus a conxuntos.dat file holding the
            # train/test index split (one line of indices each)
            data = loadarff(full_file)
            with open(f'{folder}/conxuntos.dat') as f:
                train_idx, test_idx = [line_to_idx(l) for l in f.readlines()]
            assert len(set(train_idx) & set(test_idx)) == 0
            all_idx = list(train_idx) + list(test_idx)
            assert len(all_idx) == np.max(all_idx) + 1
            assert np.min(all_idx) == 0
            if train:
                data = (data[0][train_idx], data[1])
            else:
                data = (data[0][test_idx], data[1])
        else:
            # otherwise the split is pre-materialized as *_train/*_test files
            typename = 'train' if train else 'test'
            filename = f'{folder}/{uci_folder_to_name(folder)}_{typename}.arff'
            data = loadarff(filename)
        # all attributes must be numeric except the final nominal label,
        # which this UCI collection names 'clase'
        assert data[1].types() == ['numeric'] * (len(data[1].types()) - 1) + ['nominal']
        X = np.array(data[0][data[1].names()[:-1]].tolist())
        y = np.array([int(e) for e in data[0][data[1].names()[-1]]])
        nclass = len(data[1]['clase'][1])
        return X.astype(np.float32), y, nclass
    Xtrain, ytrain, nclass = load_uci_dataset(data_dir)
    if valid_perc > 0:
        Xtrain, Xvalid, ytrain, yvalid = uci_validation_set(Xtrain, ytrain, split_perc=valid_perc)
        train_loader, scaler = make_loader(Xtrain, ytrain, batch_size=batch_size, shuffle=True)
        valid_loader, _ = make_loader(Xvalid, yvalid, transformer=scaler, batch_size=batch_size)
    else:
        train_loader, scaler = make_loader(Xtrain, ytrain, batch_size=batch_size, shuffle=True)
        valid_loader = train_loader
    print(f'{uci_folder_to_name(data_dir)}: {len(ytrain)} training samples loaded.')

    Xtest, ytest, _ = load_uci_dataset(data_dir, False)
    # reuse the scaler fitted on the training split for the test set
    test_loader, _ = make_loader(Xtest, ytest, transformer=scaler, batch_size=batch_size)
    print(f'{uci_folder_to_name(data_dir)}: {len(ytest)} testing samples loaded.')
    train_loader.nclass = nclass
    return train_loader, valid_loader, test_loader
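
# Usage sketch (illustrative; expects either '<name>/<name>.arff' plus a
# 'conxuntos.dat' train/test index split, or '<name>_train.arff' and
# '<name>_test.arff' files, with a nominal label attribute named 'clase'):
#
#   train_loader, valid_loader, test_loader = uci_loader(
#       './uci/abalone', batch_size=128, valid_perc=0.2)
#   print(train_loader.nclass)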