# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import mxnet as mx
from mxnet.test_utils import use_np
import numpy as np
import scipy
from scipy.stats import pearsonr
import json
import math
from common import xfail_when_nonstandard_decimal_separator
from copy import deepcopy


def check_metric(metric, *args, **kwargs):
    metric = mx.gluon.metric.create(metric, *args, **kwargs)
    str_metric = json.dumps(metric.get_config())
    metric2 = mx.gluon.metric.create(str_metric)
    assert metric.get_config() == metric2.get_config()


def test_metrics():
    check_metric('acc', axis=0)
    check_metric('f1')
    check_metric('mcc')
    check_metric('perplexity', axis=-1)
    check_metric('pearsonr')
    check_metric('pcc')
    check_metric('ce')
    check_metric('loss')
    composite = mx.gluon.metric.create(['acc', 'f1'])
    check_metric(composite)


def test_ce():
    metric = mx.gluon.metric.create('ce')
    pred = mx.np.array([[0.2, 0.3, 0.5], [0.6, 0.1, 0.3]])
    label = mx.np.array([2, 1])
    metric.update([label], [pred])
    _, loss = metric.get()
    expected_loss = -(np.log(pred[0][2].item()) + np.log(pred[1][1].item())) / 2
    np.testing.assert_almost_equal(loss, expected_loss)

    metric = mx.gluon.metric.create('ce', from_logits=True)
    pred = mx.np.log(pred)
    metric.update([label], [pred])
    _, loss = metric.get()
    np.testing.assert_almost_equal(loss, expected_loss)


def test_acc():
    pred = mx.np.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])
    label = mx.np.array([0, 1, 1])
    metric = mx.gluon.metric.create('acc')
    metric.update([label], [pred])
    _, acc = metric.get()
    expected_acc = (np.argmax(pred, axis=1) == label).sum().item() / label.size
    np.testing.assert_almost_equal(acc, expected_acc)


def test_acc_2d_label():
    # label may be provided as a 2d array by a custom data iterator
    pred = mx.np.array([[0.3, 0.7], [0, 1.], [0.4, 0.6], [0.8, 0.2], [0.3, 0.5], [0.6, 0.4]])
    label = mx.np.array([[0, 1, 1], [1, 0, 1]])
    metric = mx.gluon.metric.create('acc')
    metric.update([label], [pred])
    _, acc = metric.get()
    expected_acc = (np.argmax(pred, axis=1).asnumpy() == label.asnumpy().ravel()).sum() / \
        float(label.asnumpy().ravel().size)
    np.testing.assert_almost_equal(acc, expected_acc)


def test_loss_update():
    pred = mx.np.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])
    metric1 = mx.gluon.metric.create('loss')
    metric2 = mx.gluon.metric.create('loss')
    metric1.update(None, [pred])
    metric2.update(None, pred)
    _, acc1 = metric1.get()
    _, acc2 = metric2.get()
    assert acc1 == acc2


@xfail_when_nonstandard_decimal_separator
def test_binary_f1():
    microF1 = mx.gluon.metric.create("f1", average="micro")
    macroF1 = mx.gluon.metric.F1(average="macro")

    assert np.isnan(macroF1.get()[1])
    assert np.isnan(microF1.get()[1])

    # check divide by zero
    pred = mx.np.array([[0.9, 0.1],
                        [0.8, 0.2]])
    label = mx.np.array([0, 0])
    macroF1.update([label], [pred])
    microF1.update([label], [pred])
    assert macroF1.get()[1] == 0.0
    assert microF1.get()[1] == 0.0
    macroF1.reset()
    microF1.reset()

    pred11 = mx.np.array([[0.1, 0.9],
                          [0.5, 0.5]])
    label11 = mx.np.array([1, 0])
    pred12 = mx.np.array([[0.85, 0.15],
                          [1.0, 0.0]])
    label12 = mx.np.array([1, 0])
    pred21 = mx.np.array([[0.6, 0.4]])
    label21 = mx.np.array([0])
    pred22 = mx.np.array([[0.2, 0.8]])
    label22 = mx.np.array([1])

    microF1.update([label11, label12], [pred11, pred12])
    macroF1.update([label11, label12], [pred11, pred12])
    assert microF1.num_inst == 4
    assert macroF1.num_inst == 4
    # f1 = 2 * tp / (2 * tp + fp + fn)
    fscore1 = 2. * (1) / (2 * 1 + 1 + 0)
    np.testing.assert_almost_equal(microF1.get()[1], fscore1)
    np.testing.assert_almost_equal(macroF1.get()[1], fscore1)

    microF1.update([label21, label22], [pred21, pred22])
    macroF1.update([label21, label22], [pred21, pred22])
    assert microF1.num_inst == 6
    assert macroF1.num_inst == 6
    fscore2 = 2. * (1) / (2 * 1 + 0 + 0)
    fscore_total = 2. * (1 + 1) / (2 * (1 + 1) + (1 + 0) + (0 + 0))
    np.testing.assert_almost_equal(microF1.get()[1], fscore_total)
    np.testing.assert_almost_equal(macroF1.get()[1], fscore_total)


def test_multiclass_f1():
    microF1 = mx.gluon.metric.create("f1", class_type="multiclass", average="micro")
    macroF1 = mx.gluon.metric.F1(class_type="multiclass", average="macro")

    assert np.isnan(macroF1.get()[1])
    assert np.isnan(microF1.get()[1])

    # check one class is zero
    pred = mx.np.array([[0.9, 0.1],
                        [0.8, 0.2]])
    label = mx.np.array([0, 0])
    macroF1.update([label], [pred])
    microF1.update([label], [pred])
    assert macroF1.get()[1] == 0.5  # one class is 1.0, the other is 0. (divided by 0)
    assert microF1.get()[1] == 1.0  # globally f1 is 1.0
    macroF1.reset()
    microF1.reset()

    # test case from sklearn; here pred is given as probability distributions
    # instead of predicted labels
    pred11 = mx.np.array([[1, 0, 0], [0, 1, 0]])
    label11 = mx.np.array([0, 2])
    pred12 = mx.np.array([[0, 0, 1], [1, 0, 0], [0, 1, 0], [0, 0, 1]])
    label12 = mx.np.array([1, 0, 0, 1])
    microF1.update([label11, label12], [pred11, pred12])
    macroF1.update([label11, label12], [pred11, pred12])
    assert microF1.num_inst == 6
    assert macroF1.num_inst == 6

    # from sklearn.metrics import f1_score
    # overall_pred = [0, 1, 2, 0, 1, 2]
    # overall_label = [0, 2, 1, 0, 0, 1]
    fmacro = 0.26666666666666666  # f1_score(overall_label, overall_pred, average="macro")
    fmicro = 0.3333333333333333   # f1_score(overall_label, overall_pred, average="micro")
    np.testing.assert_almost_equal(microF1.get()[1], fmicro)
    np.testing.assert_almost_equal(macroF1.get()[1], fmacro)


@xfail_when_nonstandard_decimal_separator
def test_multilabel_f1():
    microF1 = mx.gluon.metric.create("f1", class_type="multilabel", average="micro")
    macroF1 = mx.gluon.metric.F1(class_type="multilabel", average="macro")

    assert np.isnan(macroF1.get()[1])
    assert np.isnan(microF1.get()[1])

    # check one class is zero
    pred = mx.np.array([[0.9, 0.1],
                        [0.8, 0.2]])
    label = mx.np.array([[1, 1], [1, 1]])
    macroF1.update([label], [pred])
    microF1.update([label], [pred])
    assert macroF1.get()[1] == 0.5  # one class is 1.0, the other is 0. (divided by 0)
    np.testing.assert_almost_equal(microF1.get()[1], 2.0 / 3)
    macroF1.reset()
    microF1.reset()

    pred11 = mx.np.array([[0.9, 0.4, 0.3], [0.2, 0.7, 0.8]])
    label11 = mx.np.array([[1, 0, 1], [0, 0, 1]])
    pred12 = mx.np.array([[0.6, 0.6, 0.7]])
    label12 = mx.np.array([[0, 1, 1]])
    microF1.update([label11, label12], [pred11, pred12])
    macroF1.update([label11, label12], [pred11, pred12])
    assert microF1.num_inst == 3
    assert macroF1.num_inst == 3

    # from sklearn.metrics import f1_score
    # overall_pred = [[1, 0, 0], [0, 1, 1], [1, 1, 1]]
    # overall_label = [[1, 0, 1], [0, 0, 1], [0, 1, 1]]
    fmacro = 0.7111111111111111  # f1_score(overall_label, overall_pred, average="macro")
    fmicro = 0.7272727272727272  # f1_score(overall_label, overall_pred, average="micro")
    np.testing.assert_almost_equal(microF1.get()[1], fmicro)
    np.testing.assert_almost_equal(macroF1.get()[1], fmacro)


@xfail_when_nonstandard_decimal_separator
def test_mcc():
    microMCC = mx.gluon.metric.create("mcc")

    assert np.isnan(microMCC.get()[1])

    # check divide by zero
    pred = mx.np.array([[0.9, 0.1],
                        [0.8, 0.2]])
    label = mx.np.array([0, 0])
    microMCC.update([label], [pred])
    assert microMCC.get()[1] == 0.0
    microMCC.reset()

    pred11 = mx.np.array([[0.1, 0.9],
                          [0.5, 0.5]])
    label11 = mx.np.array([1, 0])
    pred12 = mx.np.array([[0.85, 0.15],
                          [1.0, 0.0]])
    label12 = mx.np.array([1, 0])
    pred21 = mx.np.array([[0.6, 0.4]])
    label21 = mx.np.array([0])
    pred22 = mx.np.array([[0.2, 0.8]])
    label22 = mx.np.array([1])

    microMCC.update([label11, label12], [pred11, pred12])
    assert microMCC.num_inst == 4
    tp1 = 1; fp1 = 0; fn1 = 1; tn1 = 2
    mcc1 = (tp1 * tn1 - fp1 * fn1) / np.sqrt((tp1 + fp1) * (tp1 + fn1) * (tn1 + fp1) * (tn1 + fn1))
    np.testing.assert_almost_equal(microMCC.get()[1], mcc1)

    microMCC.update([label21, label22], [pred21, pred22])
    assert microMCC.num_inst == 6
    tp2 = 1; fp2 = 0; fn2 = 0; tn2 = 1
    mcc2 = (tp2 * tn2 - fp2 * fn2) / np.sqrt((tp2 + fp2) * (tp2 + fn2) * (tn2 + fp2) * (tn2 + fn2))
    tpT = tp1 + tp2; fpT = fp1 + fp2; fnT = fn1 + fn2; tnT = tn1 + tn2
    mccT = (tpT * tnT - fpT * fnT) / np.sqrt((tpT + fpT) * (tpT + fnT) * (tnT + fpT) * (tnT + fnT))
    np.testing.assert_almost_equal(microMCC.get()[1], mccT)


def test_perplexity():
    pred = mx.np.array([[0.8, 0.2], [0.2, 0.8], [0, 1.]])
    label = mx.np.array([0, 1, 1])
    p = pred.asnumpy()[np.arange(label.size), label.asnumpy().astype('int32')]
    perplexity_expected = np.exp(-np.log(p).sum() / label.size)
    metric = mx.gluon.metric.create('perplexity', axis=-1)
    metric.update([label], [pred])
    _, perplexity = metric.get()
    np.testing.assert_almost_equal(perplexity, perplexity_expected)


def test_pearsonr():
    pred1 = mx.np.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])
    label1 = mx.np.array([[1, 0], [0, 1], [0, 1]])
    pearsonr_expected_np = np.corrcoef(pred1.asnumpy().ravel(), label1.asnumpy().ravel())[0, 1]
    pearsonr_expected_scipy, _ = pearsonr(pred1.asnumpy().ravel(), label1.asnumpy().ravel())

    micro_pr = mx.gluon.metric.create('pearsonr')

    assert np.isnan(micro_pr.get()[1])

    micro_pr.update([label1], [pred1])
    np.testing.assert_almost_equal(micro_pr.get()[1], pearsonr_expected_np)
    np.testing.assert_almost_equal(micro_pr.get()[1], pearsonr_expected_scipy)

    pred2 = mx.np.array([[1, 2], [3, 2], [4, 6]])
    label2 = mx.np.array([[1, 0], [0, 1], [0, 1]])
    # note that pred12 is the concatenation of pred1 and pred2,
    # and label12 the concatenation of label1 and label2
    pred12 = mx.np.array([[0.3, 0.7], [0, 1.], [0.4, 0.6], [1, 2], [3, 2], [4, 6]])
    label12 = mx.np.array([[1, 0], [0, 1], [0, 1], [1, 0], [0, 1], [0, 1]])
    pearsonr_expected_np = np.corrcoef(pred12.asnumpy().ravel(), label12.asnumpy().ravel())[0, 1]
    pearsonr_expected_scipy, _ = pearsonr(pred12.asnumpy().ravel(), label12.asnumpy().ravel())

    micro_pr.update([label2], [pred2])
    np.testing.assert_almost_equal(micro_pr.get()[1], pearsonr_expected_np)
    np.testing.assert_almost_equal(micro_pr.get()[1], pearsonr_expected_scipy)


def cm_batch(cm):
    # generate a batch of integer labels and one-hot predictions that yields
    # the given confusion matrix
    n = len(cm)
    ident = np.identity(n)
    labels = []
    preds = []
    for i in range(n):
        for j in range(n):
            labels += [i] * cm[i][j]
            preds += [ident[j]] * cm[i][j]
    return ([mx.np.array(labels, dtype='int32')], [mx.np.array(preds)])


def test_pcc():
    labels, preds = cm_batch([
        [7, 3],
        [2, 5],
    ])
    met_pcc = mx.gluon.metric.create('pcc')
    met_pcc.update(labels, preds)
    _, pcc = met_pcc.get()

    # pcc should agree with mcc for binary classification
    met_mcc = mx.gluon.metric.create('mcc')
    met_mcc.update(labels, preds)
    _, mcc = met_mcc.get()
    np.testing.assert_almost_equal(pcc, mcc)

    # pcc should agree with Pearson for binary classification
    met_pear = mx.gluon.metric.create('pearsonr')
    met_pear.update(labels, [p.argmax(axis=1) for p in preds])
    _, pear = met_pear.get()
    np.testing.assert_almost_equal(pcc, pear)

    # pcc should also accept pred as a scalar rather than a softmax vector,
    # like acc does
    met_pcc.reset()
    met_pcc.update(labels, [p.argmax(axis=1) for p in preds])
    _, chk = met_pcc.get()
    np.testing.assert_almost_equal(pcc, chk)

    # check multiclass case against a reference implementation computed
    # directly from the confusion matrix
    CM = [
        [23, 13, 3],
        [7, 19, 11],
        [2, 5, 17],
    ]
    K = 3
    ref = sum(
        CM[k][k] * CM[l][m] - CM[k][l] * CM[m][k]
        for k in range(K)
        for l in range(K)
        for m in range(K)
    ) / (sum(
        sum(CM[k][l] for l in range(K)) * sum(
            sum(CM[f][g] for g in range(K))
            for f in range(K)
            if f != k
        )
        for k in range(K)
    ) * sum(
        sum(CM[l][k] for l in range(K)) * sum(
            sum(CM[f][g] for f in range(K))
            for g in range(K)
            if g != k
        )
        for k in range(K)
    )) ** 0.5
    labels, preds = cm_batch(CM)
    met_pcc.reset()
    met_pcc.update(labels, preds)
    _, pcc = met_pcc.get()
    np.testing.assert_almost_equal(pcc, ref)

    # things that should not change the metric score:
    # * order
    # * batch size
    # * update frequency
    labels = [[i.reshape(-1)] for i in labels[0]]
    labels.reverse()
    preds = [[i.reshape((1, -1))] for i in preds[0]]
    preds.reverse()

    met_pcc.reset()
    for l, p in zip(labels, preds):
        met_pcc.update(l, p)
    assert pcc == met_pcc.get()[1]


@xfail_when_nonstandard_decimal_separator
def test_single_array_input():
    pred = mx.np.array([[1, 2, 3, 4]])
    label = pred + 0.1

    mse = mx.gluon.metric.create('mse')
    mse.update(label, pred)
    _, mse_res = mse.get()
    np.testing.assert_almost_equal(mse_res, 0.01)

    mae = mx.gluon.metric.create('mae')
    mae.update(label, pred)
    _, mae_res = mae.get()
    np.testing.assert_almost_equal(mae_res, 0.1)

    rmse = mx.gluon.metric.create('rmse')
    rmse.update(label, pred)
    _, rmse_res = rmse.get()
    np.testing.assert_almost_equal(rmse_res, 0.1)