diff --git a/raster/r.kappa/calc_metrics.c b/raster/r.kappa/calc_metrics.c index 7b772a06629..63365624c9a 100644 --- a/raster/r.kappa/calc_metrics.c +++ b/raster/r.kappa/calc_metrics.c @@ -1,4 +1,5 @@ #include +#include #include #include #include "kappa.h" @@ -6,6 +7,7 @@ static int longcomp(const void *aa, const void *bb); static int collapse(long *l, int n); +static void update_sum(double *sum, double *c, double value); void calc_metrics(void) { @@ -15,9 +17,12 @@ void calc_metrics(void) int ncat1, ncat2; int cndx; double *pi, *pj, *pii; - double p0 = 0.0, pC = 0.0; - double inter1 = 0.0, inter2 = 0.0; + double p0 = 0.0, pC = 0.0, p0c = 0.0, pCc = 0.0; + double inter1 = 0.0, inter2 = 0.0, inter1c = 0.0, inter2c = 0.0; int a_i = 0, b_i = 0; + double spktk = 0.0, spktkc = 0.0; + double spk2 = 0.0, spk2c = 0.0, stk2 = 0.0, stk2c = 0.0; + double unrooted; metrics = (METRICS *)G_malloc(sizeof(METRICS)); if (nstats == 0) { @@ -28,6 +33,7 @@ void calc_metrics(void) metrics->overall_accuracy = 0.0; metrics->kappa = na_value; metrics->kappa_variance = na_value; + metrics->mcc = na_value; return; } @@ -142,10 +148,12 @@ void calc_metrics(void) else metrics->producers_accuracy[i] = 100 * (pii[i] / pj[i]); /* theta 1 */ - p0 += pii[i]; + update_sum(&p0, &p0c, pii[i]); /* theta 2 */ - pC += pi[i] * pj[i]; + update_sum(&pC, &pCc, pi[i] * pj[i]); } + p0 += p0c; + pC += pCc; if (pC != 1) metrics->kappa = (p0 - pC) / (1 - pC); else @@ -158,7 +166,8 @@ void calc_metrics(void) else metrics->conditional_kappa[i] = (pii[i] - pi[i] * pj[i]) / (pi[i] - pi[i] * pj[i]); - inter1 += pii[i] * pow(((1 - pC) - (1 - p0) * (pi[i] + pj[i])), 2.); + update_sum(&inter1, &inter1c, + pii[i] * pow(((1 - pC) - (1 - p0) * (pi[i] + pj[i])), 2.)); } /* kappa variance */ @@ -170,17 +179,37 @@ void calc_metrics(void) if (Gstats[l].cats[1] == rlst[i]) b_i = i; } - inter2 += Gstats[l].count * pow((pi[a_i] + pj[b_i]), 2.) / - metrics->observations; + update_sum(&inter2, &inter2c, + Gstats[l].count * pow((pi[a_i] + pj[b_i]), 2.) / + metrics->observations); } } + inter1 += inter1c; + inter2 += inter2c; metrics->kappa_variance = (inter1 + pow((1 - p0), 2.) * inter2 - pow((p0 * pC - 2 * pC + p0), 2.)) / pow((1 - pC), 4.) / metrics->observations; - G_free(pi); G_free(pj); G_free(pii); + + /* MCC */ + for (i = 0; i < ncat; i++) { + update_sum(&spktk, &spktkc, metrics->row_sum[i] * metrics->col_sum[i]); + update_sum(&spk2, &spk2c, metrics->row_sum[i] * metrics->row_sum[i]); + update_sum(&stk2, &stk2c, metrics->col_sum[i] * metrics->col_sum[i]); + } + spktk += spktkc; + spk2 += spk2c; + stk2 += stk2c; + unrooted = (metrics->observations * metrics->observations - spk2) * + (metrics->observations * metrics->observations - stk2); + if (unrooted <= 0.0) { + metrics->mcc = na_value; + return; + } + metrics->mcc = + (metrics->correct * metrics->observations - spktk) / sqrt(unrooted); }; /* remove repeated values */ @@ -213,3 +242,16 @@ static int longcomp(const void *aa, const void *bb) return (*a > *b); } + +/* Implements improved Kahan–Babuska algorithm by Neumaier, A. 1974 */ +void update_sum(double *sum, double *c, double value) +{ + double tmp = *sum + value; + + if (fabs(*sum) >= fabs(value)) + *c += (*sum - tmp) + value; + else + *c += (value - tmp) + *sum; + + *sum = tmp; +} diff --git a/raster/r.kappa/kappa.h b/raster/r.kappa/kappa.h index 7e20092f52b..d5f1cf38d41 100644 --- a/raster/r.kappa/kappa.h +++ b/raster/r.kappa/kappa.h @@ -28,6 +28,7 @@ struct _metrics_ double kappa; double kappa_variance; double *conditional_kappa; + double mcc; }; extern struct Cell_head window; diff --git a/raster/r.kappa/print_json.c b/raster/r.kappa/print_json.c index 45e7c4b1e86..c605cdd9842 100644 --- a/raster/r.kappa/print_json.c +++ b/raster/r.kappa/print_json.c @@ -117,7 +117,11 @@ void print_json() else fprintf(fd, "%.5f", metrics->conditional_kappa[i]); } - fprintf(fd, "]\n"); + fprintf(fd, "],\n"); + if (metrics->mcc == na_value) + fprintf(fd, " \"mcc\": null\n"); + else + fprintf(fd, " \"mcc\": %.5f\n", metrics->mcc); fprintf(fd, "}\n"); if (output != NULL) diff --git a/raster/r.kappa/print_kappa.c b/raster/r.kappa/print_kappa.c index d47f0b327cd..c5e8b8e5c48 100644 --- a/raster/r.kappa/print_kappa.c +++ b/raster/r.kappa/print_kappa.c @@ -34,16 +34,19 @@ void print_kappa(void) fprintf(fd, "%f\n", metrics->conditional_kappa[i]); } fprintf(fd, "\n"); - fprintf(fd, "Kappa\t\tKappa Variance\n"); + fprintf(fd, "Kappa\t\tKappa Variance\tMCC\n"); if (metrics->kappa == na_value) fprintf(fd, "NA"); else fprintf(fd, "%f", metrics->kappa); if (metrics->kappa_variance == na_value) + fprintf(fd, "\tNA"); + else + fprintf(fd, "\t%f", metrics->kappa_variance); + if (metrics->mcc == na_value) fprintf(fd, "\tNA\n"); else - fprintf(fd, "\t%f\n", metrics->kappa_variance); - + fprintf(fd, "\t%f\n", metrics->mcc); fprintf(fd, "\nObs Correct\tTotal Obs\t%% Observed Correct\n"); fprintf(fd, "%ld\t\t%ld\t\t%f\n", metrics->correct, metrics->observations, metrics->overall_accuracy); diff --git a/raster/r.kappa/r.kappa.html b/raster/r.kappa/r.kappa.html index 32569b12933..2aa18045ac1 100644 --- a/raster/r.kappa/r.kappa.html +++ b/raster/r.kappa/r.kappa.html @@ -69,6 +69,10 @@

OUTPUT VARIABLES

Variance of kappa index. Correctness needs to be validated.
Conditional kappa
Conditional user's kappa for specified class.
+
MCC
+
Matthews (Mattheus) Correlation Coefficient is implemented + according to Grandini, M., Bagli, E., Visani, G. 2020. + "Metrics for multi-class classification: An overview."

NOTES

@@ -106,9 +110,11 @@

NOTES

-Some of reported values (Choen's kappa, overall accuracy) can be +Some of reported values (overall accuracy, Choen's kappa, MCC) can be misleading if cell count among classes is not balanced. See e.g. -Powers, D.M.W., 2012. The Problem with Kappa. +Powers, D.M.W., 2012. "The Problem with Kappa"; Zhu, Q., 2020. +"On the performance of Matthews correlation coefficient (MCC) for +imbalanced dataset".

EXAMPLE

@@ -143,4 +149,4 @@

SEE ALSO

AUTHORS

Tao Wen, University of Illinois at Urbana-Champaign, Illinois
-Maris Nartiss, University of Latvia (JSON output) +Maris Nartiss, University of Latvia (JSON output, MCC) diff --git a/raster/r.kappa/testsuite/test_r_kappa.py b/raster/r.kappa/testsuite/test_r_kappa.py index d19a7a3b37d..56e356ca20b 100644 --- a/raster/r.kappa/testsuite/test_r_kappa.py +++ b/raster/r.kappa/testsuite/test_r_kappa.py @@ -149,6 +149,8 @@ def match(self, pat, ref): # 4 0 14 4 0 - 0.000 0.000 0.222 0.000 0.000 # 5 1 17 0 0 1.000 1.000 0.056 0.056 0.056 1.000 # 6 2 13 2 1 0.667 0.500 0.111 0.222 0.167 0.400 + # Correct MCC value was calculated manually and validated with + # mcc function of R package mltools. def test_standard_output(self): out = read_command( @@ -181,6 +183,7 @@ def test_standard_output(self): # Kappa value vals = rows[28].split() self.assertTrue(self.match(vals[0], 0.52091)) + self.assertTrue(self.match(vals[2], 0.55930)) # Overall characteristics vals = rows[31].split() @@ -283,6 +286,7 @@ def test_standard_output(self): # Kappa value vals = rows[28].split() self.assertTrue(self.match(vals[0], 0.0)) + self.assertTrue(self.match(vals[2], "NA")) # Overall characteristics vals = rows[31].split() @@ -341,6 +345,7 @@ def setUpClass(cls): "producers_accuracy": [57.1429, 0.0, 100.0, None, 100.0, 66.66666], "users_accuracy": [100.0, None, 80.0, 0.0, 100.0, 50.0], "conditional_kappa": [1.0, None, 0.742857, 0.0, 1.0, 0.400], + "mcc": 0.55930, } ) @@ -380,6 +385,7 @@ def setUpClass(cls): "producers_accuracy": [0.0, 0.0, 0.0, 0.0, 0.0, None], "users_accuracy": [None, None, None, None, None, 0.0], "conditional_kappa": [None, None, None, None, None, 0.0], + "mcc": None, } ) @@ -410,6 +416,7 @@ def setUpClass(cls): "producers_accuracy": [], "users_accuracy": [], "conditional_kappa": [], + "mcc": None, } ) @@ -440,6 +447,7 @@ def setUpClass(cls): "producers_accuracy": [], "users_accuracy": [], "conditional_kappa": [], + "mcc": None, } )