From 86c93d2a7ba406c52621890babfd9d4c81a1931f Mon Sep 17 00:00:00 2001 From: jayant chauhan <0001jayant@gmail.com> Date: Sun, 28 Dec 2025 12:57:30 +0530 Subject: [PATCH 01/10] implemented dummyvar --- inst/dummyvar.m | 114 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 inst/dummyvar.m diff --git a/inst/dummyvar.m b/inst/dummyvar.m new file mode 100644 index 00000000..3f659f38 --- /dev/null +++ b/inst/dummyvar.m @@ -0,0 +1,114 @@ +function D = dummyvar (g) + %#ok<*STRETCH> % for compatibility with different Octave linters + + % Table single-column extraction (non-fatal if 'table' not present) + try + if isa(g, "table") + if (size(g,2) ~= 1) + error ("dummyvar:InvalidInput", ... + "dummyvar on a table expects a single-column input or call dummyvar(T.Var)."); + end + g = g{:,1}; + end + catch + % If table class not available, skip extraction and let later checks handle it. + end + + % --- CATEGORICAL branch --- + if (exist ("categorical", "class") && isa (g, "categorical")) + % Match MATLAB error for empty categorical shape: MATLAB complains + % "Categorical grouping variable must have one column." + % In MATLAB that arises when categorical is 0x0 or not a single column. + s = size (g); + if (numel (s) > 2 || s(2) ~= 1) + error ("Categorical grouping variable must have one column."); + end + + % categories and double mapping must exist + if ~ (exist ("categories", "file") || exist ("categories", "builtin")) + error ("datatypes:Missing", "datatypes: 'categories' not found on path; load datatypes."); + end + + cats = categories (g); + K = numel (cats); + n = numel (g); + + % For empty (0x1) categorical, MATLAB errors out above; if not thrown, + % keep consistent handling. (We already enforced one column.) + if (n == 0) + % If we got here, treat as MATLAB does (above check should have errored), + % but to be safe produce an empty 0xK double. + D = zeros (0, K); + return; + end + + % Convert to numeric indices. Unknown/undefined map to NaN or 0 depending on API. + % In MATLAB, results in NaN rows in output. + try + idx = double (g); % maps categories to 1..K, undefined -> NaN + catch + % fallback: construct mapping manually (less efficient) + % Use grp2idx-like approach + [~, ~, idx] = unique (g); + idx = double (idx); + % Unique will not produce NaN for undefined; handle undefined explicitly + undef_mask = isundefined (g); + idx (undef_mask) = NaN; + end + + % Build matrix: rows with idx==NaN are rows of NaN(1,K) + rows = (1:n)'; + D = zeros (n, K); + + % Build using sparse (skip NaNs) + valid = ~isnan (idx); + if any (valid) + S = sparse (rows(valid), idx(valid), 1, n, K); + D (:) = full (S); + end + + % Replace rows where idx is NaN with NaN across all columns (MATLAB semantics) + nan_rows = find (isnan (idx)); + if ~ isempty (nan_rows) + D (nan_rows, :) = NaN; + end + + D = double (D); + return; + end + + % --- NUMERIC / LEGACY branch --- + % If the input is numeric (vector), create indicator columns for 1..max(g) + if (isnumeric (g) && isvector (g)) + g = g(:); % ensure column + if isempty (g) + D = zeros (0, 0); + return; + end + % If g has non-integer values, MATLAB implicitly coerces to integer indices + % by using unique? Historically dummyvar expects group indices (positive ints). + % We'll follow MATLAB's numeric behavior: use max(g) as number of columns. + % Construct column count K = max(g) + K = max (g); + if ~ isreal (K) || K < 0 + error ("dummyvar:InvalidInput", "Numeric grouping must produce a positive integer number of groups."); + end + K = double (K); + n = numel (g); + % Build sparse/dense + rows = (1:n)'; + idx = round (g); % keep integer mapping + valid = (idx >= 1) & (idx <= K) & ~isnan (idx); + if any (valid) + S = sparse (rows(valid), idx(valid), 1, n, K); + D = full (S); + else + D = zeros (n, K); + end + D = double (D); + return; + end + + % --- FALLBACK: unsupported input types --- + error ("dummyvar:UnsupportedType", "dummyvar requires a numeric vector or a categorical array."); +end From ee5ffdc19ba2f43298574049bfb615db06ca1e1b Mon Sep 17 00:00:00 2001 From: jayant chauhan <0001jayant@gmail.com> Date: Sun, 28 Dec 2025 13:02:02 +0530 Subject: [PATCH 02/10] dummyvar: added BISTs --- inst/dummyvar.m | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/inst/dummyvar.m b/inst/dummyvar.m index 3f659f38..39f9e283 100644 --- a/inst/dummyvar.m +++ b/inst/dummyvar.m @@ -112,3 +112,40 @@ % --- FALLBACK: unsupported input types --- error ("dummyvar:UnsupportedType", "dummyvar requires a numeric vector or a categorical array."); end + +## Test dummyvar behavior (MATLAB-compatible) + +%!test +%! % numeric grouping vector +%! g = [1;2;1;3;2]; +%! D = dummyvar(g); +%! assert(isequal(D, [1 0 0; 0 1 0; 1 0 0; 0 0 1; 0 1 0])); + +%!test +%! % categorical with universe -> columns for each category in same order +%! g = categorical({'a';'b';'a'}, {'a','b','c'}); +%! D = dummyvar(g); +%! assert(size(D,2) == numel(categories(g))); +%! assert(all(D(:,1) == [1;0;1])); +%! assert(all(D(:,2) == [0;1;0])); +%! assert(all(D(:,3) == [0;0;0])); + +%!test +%! % categorical with -> row of NaNs +%! g = categorical({'a'; ''; 'b'}, {'a','b','c'}); +%! D = dummyvar(g); +%! assert(all(isnan(D(2,:)))); +%! assert(all(D(1,:) == [1 0 0])); +%! assert(all(D(3,:) == [0 1 0])); + +%!test +%! % empty categorical -> MATLAB-style error +%! g = categorical({}, {'a','b'}); +%! assert (throws (@() dummyvar(g))); + +%!test +%! % table column input +%! G = categorical({'a'; 'b'; 'a'}, {'a','b','c'}); +%! T = table(G, [10;20;30], 'VariableNames', {'G','Val'}); +%! D = dummyvar(T.G); +%! assert(size(D,2) == numel(categories(G))); From 5b61e846431601560a23024d551b09d6fefbc63b Mon Sep 17 00:00:00 2001 From: jayant chauhan <0001jayant@gmail.com> Date: Sun, 28 Dec 2025 13:03:55 +0530 Subject: [PATCH 03/10] dummyvar: added copyright --- inst/dummyvar.m | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/inst/dummyvar.m b/inst/dummyvar.m index 39f9e283..6a7b7d4c 100644 --- a/inst/dummyvar.m +++ b/inst/dummyvar.m @@ -1,3 +1,20 @@ +## Copyright (C) 2025 Jayant Chauhan <0001jayant@gmail.com> +## +## This file is part of the statistics package for GNU Octave. +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + function D = dummyvar (g) %#ok<*STRETCH> % for compatibility with different Octave linters From 73099c33502e7a4ae18b7be4a08c806331316285 Mon Sep 17 00:00:00 2001 From: jayant chauhan <0001jayant@gmail.com> Date: Sun, 28 Dec 2025 13:05:05 +0530 Subject: [PATCH 04/10] dummyvar: added textinfo --- inst/dummyvar.m | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/inst/dummyvar.m b/inst/dummyvar.m index 6a7b7d4c..b6c5f405 100644 --- a/inst/dummyvar.m +++ b/inst/dummyvar.m @@ -14,6 +14,37 @@ ## ## You should have received a copy of the GNU General Public License along with ## this program; if not, see . +## +## -*- texinfo -*- +## @deftypefn {statistics} {@var{D} =} dummyvar (@var{g}) +## +## Create dummy variables (one-hot encoding) from a grouping variable. +## +## The input @var{g} must be a numeric vector of group indices or a +## @code{categorical} array. The output @var{D} is a numeric matrix whose +## columns correspond to the distinct groups and whose rows correspond to the +## elements of @var{g}. +## +## For numeric inputs, the number of columns in @var{D} is equal to +## @code{max (@var{g})}, and column @math{k} corresponds to group @math{k}. +## +## For @code{categorical} inputs, the number and order of columns in @var{D} +## correspond to the categories returned by @code{categories (@var{g})}. +## Categories that are defined but not present in @var{g} produce columns of +## zeros. +## +## Elements of @var{g} that are @code{} result in rows of +## @code{NaN} values in @var{D}, matching MATLAB behavior. +## +## If @var{g} is a single-column table, the grouping variable is taken from that +## column. For example: +## +## @example +## D = dummyvar (T.Group) +## @end example +## +## @seealso{tabulate, grpstats} +## @end deftypefn function D = dummyvar (g) %#ok<*STRETCH> % for compatibility with different Octave linters From fd755d165daa0c7adc8df91f0f5553e1e477faeb Mon Sep 17 00:00:00 2001 From: jayant chauhan <0001jayant@gmail.com> Date: Sun, 28 Dec 2025 13:28:24 +0530 Subject: [PATCH 05/10] dummyvar: added test input validation --- inst/dummyvar.m | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/inst/dummyvar.m b/inst/dummyvar.m index b6c5f405..17dd97f5 100644 --- a/inst/dummyvar.m +++ b/inst/dummyvar.m @@ -197,3 +197,20 @@ %! T = table(G, [10;20;30], 'VariableNames', {'G','Val'}); %! D = dummyvar(T.G); %! assert(size(D,2) == numel(categories(G))); + +## Test input validation + +%!error dummyvar +%!error dummyvar (1, 2) + +%!error ... +%! dummyvar (categorical ([], {'a','b'})) + +%!error ... +%! dummyvar (categorical ({'a','b'}, {'a','b'})) % row categorical + +%!error ... +%! dummyvar (table ([1;2], [3;4])) + +%!error ... +%! dummyvar (struct ("a", 1)) From 677c21bf4bf62e7b10e5b51729241e00ea2e6845 Mon Sep 17 00:00:00 2001 From: jayant chauhan <0001jayant@gmail.com> Date: Sun, 28 Dec 2025 18:07:30 +0530 Subject: [PATCH 06/10] dummyvar: clear error trows --- inst/dummyvar.m | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/inst/dummyvar.m b/inst/dummyvar.m index 17dd97f5..bfd1f21d 100644 --- a/inst/dummyvar.m +++ b/inst/dummyvar.m @@ -72,10 +72,7 @@ error ("Categorical grouping variable must have one column."); end - % categories and double mapping must exist - if ~ (exist ("categories", "file") || exist ("categories", "builtin")) - error ("datatypes:Missing", "datatypes: 'categories' not found on path; load datatypes."); - end + % categories is a categorical class method; no standalone function check needed cats = categories (g); K = numel (cats); @@ -84,9 +81,7 @@ % For empty (0x1) categorical, MATLAB errors out above; if not thrown, % keep consistent handling. (We already enforced one column.) if (n == 0) - % If we got here, treat as MATLAB does (above check should have errored), - % but to be safe produce an empty 0xK double. - D = zeros (0, K); + error ("Categorical grouping variable must have one column."); return; end From 03238a057b4bbe8c4c40da1aa067190762f66549 Mon Sep 17 00:00:00 2001 From: jayant chauhan <0001jayant@gmail.com> Date: Sun, 28 Dec 2025 18:23:27 +0530 Subject: [PATCH 07/10] dummyvar: applying octave coding style --- inst/dummyvar.m | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/inst/dummyvar.m b/inst/dummyvar.m index bfd1f21d..832f12a9 100644 --- a/inst/dummyvar.m +++ b/inst/dummyvar.m @@ -34,7 +34,7 @@ ## zeros. ## ## Elements of @var{g} that are @code{} result in rows of -## @code{NaN} values in @var{D}, matching MATLAB behavior. +## @code{NaN} values in @var{D}. ## ## If @var{g} is a single-column table, the grouping variable is taken from that ## column. For example: @@ -49,7 +49,7 @@ function D = dummyvar (g) %#ok<*STRETCH> % for compatibility with different Octave linters - % Table single-column extraction (non-fatal if 'table' not present) + ## Table single-column extraction (non-fatal if 'table' not present) try if isa(g, "table") if (size(g,2) ~= 1) @@ -64,9 +64,7 @@ % --- CATEGORICAL branch --- if (exist ("categorical", "class") && isa (g, "categorical")) - % Match MATLAB error for empty categorical shape: MATLAB complains % "Categorical grouping variable must have one column." - % In MATLAB that arises when categorical is 0x0 or not a single column. s = size (g); if (numel (s) > 2 || s(2) ~= 1) error ("Categorical grouping variable must have one column."); @@ -78,15 +76,15 @@ K = numel (cats); n = numel (g); - % For empty (0x1) categorical, MATLAB errors out above; if not thrown, - % keep consistent handling. (We already enforced one column.) if (n == 0) error ("Categorical grouping variable must have one column."); return; end - % Convert to numeric indices. Unknown/undefined map to NaN or 0 depending on API. - % In MATLAB, results in NaN rows in output. + ## Convert to numeric indices. + ## Unknown/undefined map to NaN or 0 depending on API. + ## results in NaN rows in output. + try idx = double (g); % maps categories to 1..K, undefined -> NaN catch @@ -110,7 +108,7 @@ D (:) = full (S); end - % Replace rows where idx is NaN with NaN across all columns (MATLAB semantics) + % Replace rows where idx is NaN with NaN across all columns nan_rows = find (isnan (idx)); if ~ isempty (nan_rows) D (nan_rows, :) = NaN; @@ -128,17 +126,17 @@ D = zeros (0, 0); return; end - % If g has non-integer values, MATLAB implicitly coerces to integer indices - % by using unique? Historically dummyvar expects group indices (positive ints). - % We'll follow MATLAB's numeric behavior: use max(g) as number of columns. - % Construct column count K = max(g) + ## If g has non-integer values, we implicitly coerces to integer indices + ## by using unique? Historically dummyvar expects group indices (positive ints). + ## We'll follow this behavior: use max(g) as number of columns. + ## Construct column count K = max(g) K = max (g); if ~ isreal (K) || K < 0 error ("dummyvar:InvalidInput", "Numeric grouping must produce a positive integer number of groups."); end K = double (K); n = numel (g); - % Build sparse/dense + ## Build sparse/dense rows = (1:n)'; idx = round (g); % keep integer mapping valid = (idx >= 1) & (idx <= K) & ~isnan (idx); @@ -152,11 +150,11 @@ return; end - % --- FALLBACK: unsupported input types --- + ## --- FALLBACK: unsupported input types --- error ("dummyvar:UnsupportedType", "dummyvar requires a numeric vector or a categorical array."); end -## Test dummyvar behavior (MATLAB-compatible) +## Test dummyvar behavior %!test %! % numeric grouping vector @@ -182,7 +180,7 @@ %! assert(all(D(3,:) == [0 1 0])); %!test -%! % empty categorical -> MATLAB-style error +%! % empty categorical -> %! g = categorical({}, {'a','b'}); %! assert (throws (@() dummyvar(g))); From 1725d42d2291a7f0e2dade7d6c80ed4412a5e3fd Mon Sep 17 00:00:00 2001 From: jayant chauhan <0001jayant@gmail.com> Date: Sun, 28 Dec 2025 18:28:35 +0530 Subject: [PATCH 08/10] dummyvar: applying octave coding style_2 --- inst/dummyvar.m | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/inst/dummyvar.m b/inst/dummyvar.m index 832f12a9..e142dc3c 100644 --- a/inst/dummyvar.m +++ b/inst/dummyvar.m @@ -47,7 +47,6 @@ ## @end deftypefn function D = dummyvar (g) - %#ok<*STRETCH> % for compatibility with different Octave linters ## Table single-column extraction (non-fatal if 'table' not present) try @@ -59,18 +58,18 @@ g = g{:,1}; end catch - % If table class not available, skip extraction and let later checks handle it. + ## If table class not available, skip extraction and let later checks handle it. end - % --- CATEGORICAL branch --- + ## --- CATEGORICAL branch --- if (exist ("categorical", "class") && isa (g, "categorical")) - % "Categorical grouping variable must have one column." + ## "Categorical grouping variable must have one column." s = size (g); if (numel (s) > 2 || s(2) ~= 1) error ("Categorical grouping variable must have one column."); end - % categories is a categorical class method; no standalone function check needed + ## categories is a categorical class method; no standalone function check needed cats = categories (g); K = numel (cats); @@ -86,29 +85,29 @@ ## results in NaN rows in output. try - idx = double (g); % maps categories to 1..K, undefined -> NaN + idx = double (g); ## maps categories to 1..K, undefined -> NaN catch - % fallback: construct mapping manually (less efficient) - % Use grp2idx-like approach + ## fallback: construct mapping manually (less efficient) + ## Use grp2idx-like approach [~, ~, idx] = unique (g); idx = double (idx); - % Unique will not produce NaN for undefined; handle undefined explicitly + ## Unique will not produce NaN for undefined; handle undefined explicitly undef_mask = isundefined (g); idx (undef_mask) = NaN; end - % Build matrix: rows with idx==NaN are rows of NaN(1,K) + ## Build matrix: rows with idx==NaN are rows of NaN(1,K) rows = (1:n)'; D = zeros (n, K); - % Build using sparse (skip NaNs) + ## Build using sparse (skip NaNs) valid = ~isnan (idx); if any (valid) S = sparse (rows(valid), idx(valid), 1, n, K); D (:) = full (S); end - % Replace rows where idx is NaN with NaN across all columns + ## Replace rows where idx is NaN with NaN across all columns nan_rows = find (isnan (idx)); if ~ isempty (nan_rows) D (nan_rows, :) = NaN; @@ -118,10 +117,11 @@ return; end - % --- NUMERIC / LEGACY branch --- - % If the input is numeric (vector), create indicator columns for 1..max(g) + ## --- NUMERIC / LEGACY branch --- + ## If the input is numeric (vector), create indicator columns for 1..max(g) if (isnumeric (g) && isvector (g)) - g = g(:); % ensure column + ## ensure column + g = g(:); if isempty (g) D = zeros (0, 0); return; @@ -138,7 +138,8 @@ n = numel (g); ## Build sparse/dense rows = (1:n)'; - idx = round (g); % keep integer mapping + ## keep integer mapping + idx = round (g); valid = (idx >= 1) & (idx <= K) & ~isnan (idx); if any (valid) S = sparse (rows(valid), idx(valid), 1, n, K); @@ -157,13 +158,13 @@ ## Test dummyvar behavior %!test -%! % numeric grouping vector +%! ## numeric grouping vector %! g = [1;2;1;3;2]; %! D = dummyvar(g); %! assert(isequal(D, [1 0 0; 0 1 0; 1 0 0; 0 0 1; 0 1 0])); %!test -%! % categorical with universe -> columns for each category in same order +%! ## categorical with universe -> columns for each category in same order %! g = categorical({'a';'b';'a'}, {'a','b','c'}); %! D = dummyvar(g); %! assert(size(D,2) == numel(categories(g))); @@ -172,7 +173,7 @@ %! assert(all(D(:,3) == [0;0;0])); %!test -%! % categorical with -> row of NaNs +%! ## categorical with -> row of NaNs %! g = categorical({'a'; ''; 'b'}, {'a','b','c'}); %! D = dummyvar(g); %! assert(all(isnan(D(2,:)))); @@ -180,12 +181,12 @@ %! assert(all(D(3,:) == [0 1 0])); %!test -%! % empty categorical -> +%! ## empty categorical -> %! g = categorical({}, {'a','b'}); %! assert (throws (@() dummyvar(g))); %!test -%! % table column input +%! ## table column input %! G = categorical({'a'; 'b'; 'a'}, {'a','b','c'}); %! T = table(G, [10;20;30], 'VariableNames', {'G','Val'}); %! D = dummyvar(T.G); @@ -200,7 +201,7 @@ %! dummyvar (categorical ([], {'a','b'})) %!error ... -%! dummyvar (categorical ({'a','b'}, {'a','b'})) % row categorical +%! dummyvar (categorical ({'a','b'}, {'a','b'})) ## row categorical %!error ... %! dummyvar (table ([1;2], [3;4])) From ba6ef733b5ef76f7f2b60ed4a60aec4c4b54b875 Mon Sep 17 00:00:00 2001 From: jayant chauhan <0001jayant@gmail.com> Date: Sun, 28 Dec 2025 20:02:10 +0530 Subject: [PATCH 09/10] dummyvar: prevent function shadowing --- inst/dummyvar.m | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/inst/dummyvar.m b/inst/dummyvar.m index e142dc3c..d6b3b54b 100644 --- a/inst/dummyvar.m +++ b/inst/dummyvar.m @@ -86,6 +86,7 @@ try idx = double (g); ## maps categories to 1..K, undefined -> NaN + idx = idx(:); ## FORCE column vector catch ## fallback: construct mapping manually (less efficient) ## Use grp2idx-like approach @@ -96,14 +97,15 @@ idx (undef_mask) = NaN; end - ## Build matrix: rows with idx==NaN are rows of NaN(1,K) - rows = (1:n)'; + ## Build matrix: rows_idx with idx==NaN are rows_idx of NaN(1,K) + rows_idx = (1:n)'; D = zeros (n, K); ## Build using sparse (skip NaNs) valid = ~isnan (idx); + valid = valid(:); ## FORCE column vector if any (valid) - S = sparse (rows(valid), idx(valid), 1, n, K); + S = sparse (rows_idx(valid), idx(valid), 1, n, K); D (:) = full (S); end @@ -137,12 +139,12 @@ K = double (K); n = numel (g); ## Build sparse/dense - rows = (1:n)'; + rows_idx = (1:n)'; ## keep integer mapping idx = round (g); valid = (idx >= 1) & (idx <= K) & ~isnan (idx); if any (valid) - S = sparse (rows(valid), idx(valid), 1, n, K); + S = sparse (rows_idx(valid), idx(valid), 1, n, K); D = full (S); else D = zeros (n, K); From 2d61da6bf9ea7e2976475093a614c12e13b9abcc Mon Sep 17 00:00:00 2001 From: jayant chauhan <0001jayant@gmail.com> Date: Sun, 28 Dec 2025 22:09:11 +0530 Subject: [PATCH 10/10] dummyvar: refactoring to integrate datatypes correctly --- inst/dummyvar.m | 152 +++++++++++++++++++++--------------------------- 1 file changed, 66 insertions(+), 86 deletions(-) diff --git a/inst/dummyvar.m b/inst/dummyvar.m index d6b3b54b..4782210f 100644 --- a/inst/dummyvar.m +++ b/inst/dummyvar.m @@ -48,113 +48,89 @@ function D = dummyvar (g) - ## Table single-column extraction (non-fatal if 'table' not present) - try - if isa(g, "table") - if (size(g,2) ~= 1) - error ("dummyvar:InvalidInput", ... - "dummyvar on a table expects a single-column input or call dummyvar(T.Var)."); - end + if (nargin ~= 1) + error ("Invalid call to dummyvar. Correct usage is:\n\n D = dummyvar (g)"); + end + + ## Table single-column extraction + if (isa (g, "table")) + if (size (g, 2) ~= 1) + error ("dummyvar on a table expects a single-column input"); + end + try g = g{:,1}; + catch + ## table class exists but indexing failed — let later checks handle it end - catch - ## If table class not available, skip extraction and let later checks handle it. end + ## --- CATEGORICAL branch --- if (exist ("categorical", "class") && isa (g, "categorical")) - ## "Categorical grouping variable must have one column." - s = size (g); - if (numel (s) > 2 || s(2) ~= 1) + + if (! isvector (g) || size (g,2) ~= 1) error ("Categorical grouping variable must have one column."); end - ## categories is a categorical class method; no standalone function check needed - - cats = categories (g); + cats = cellstr (categories (g)); K = numel (cats); - n = numel (g); + n = rows (g); if (n == 0) error ("Categorical grouping variable must have one column."); - return; - end - - ## Convert to numeric indices. - ## Unknown/undefined map to NaN or 0 depending on API. - ## results in NaN rows in output. - - try - idx = double (g); ## maps categories to 1..K, undefined -> NaN - idx = idx(:); ## FORCE column vector - catch - ## fallback: construct mapping manually (less efficient) - ## Use grp2idx-like approach - [~, ~, idx] = unique (g); - idx = double (idx); - ## Unique will not produce NaN for undefined; handle undefined explicitly - undef_mask = isundefined (g); - idx (undef_mask) = NaN; end - ## Build matrix: rows_idx with idx==NaN are rows_idx of NaN(1,K) - rows_idx = (1:n)'; + g_str = cellstr (g(:)); D = zeros (n, K); - ## Build using sparse (skip NaNs) - valid = ~isnan (idx); - valid = valid(:); ## FORCE column vector - if any (valid) - S = sparse (rows_idx(valid), idx(valid), 1, n, K); - D (:) = full (S); - end - - ## Replace rows where idx is NaN with NaN across all columns - nan_rows = find (isnan (idx)); - if ~ isempty (nan_rows) - D (nan_rows, :) = NaN; - end + for i = 1:n + if (isundefined (g(i))) + D(i,:) = NaN; + else + for k = 1:K + if (strcmp (g_str{i}, cats{k})) + D(i,k) = 1; + break; + end + endfor + end + endfor D = double (D); return; end - ## --- NUMERIC / LEGACY branch --- - ## If the input is numeric (vector), create indicator columns for 1..max(g) + ## --- NUMERIC branch --- if (isnumeric (g) && isvector (g)) - ## ensure column - g = g(:); - if isempty (g) + + g = g(:); + if (isempty (g)) D = zeros (0, 0); return; end - ## If g has non-integer values, we implicitly coerces to integer indices - ## by using unique? Historically dummyvar expects group indices (positive ints). - ## We'll follow this behavior: use max(g) as number of columns. - ## Construct column count K = max(g) + K = max (g); - if ~ isreal (K) || K < 0 - error ("dummyvar:InvalidInput", "Numeric grouping must produce a positive integer number of groups."); + if (! isreal (K) || K < 0) + error ("dummyvar:InvalidInput", ... + "Numeric grouping must produce a positive integer number of groups."); end - K = double (K); + n = numel (g); - ## Build sparse/dense - rows_idx = (1:n)'; - ## keep integer mapping - idx = round (g); - valid = (idx >= 1) & (idx <= K) & ~isnan (idx); - if any (valid) - S = sparse (rows_idx(valid), idx(valid), 1, n, K); - D = full (S); - else - D = zeros (n, K); - end + D = zeros (n, K); + idx = round (g); + + for i = 1:n + if (! isnan (idx(i)) && idx(i) >= 1 && idx(i) <= K) + D(i, idx(i)) = 1; + end + endfor + D = double (D); return; end - ## --- FALLBACK: unsupported input types --- - error ("dummyvar:UnsupportedType", "dummyvar requires a numeric vector or a categorical array."); + error ("dummyvar:UnsupportedType", ... + "dummyvar requires a numeric vector or a categorical array."); end ## Test dummyvar behavior @@ -166,26 +142,30 @@ %! assert(isequal(D, [1 0 0; 0 1 0; 1 0 0; 0 0 1; 0 1 0])); %!test -%! ## categorical with universe -> columns for each category in same order %! g = categorical({'a';'b';'a'}, {'a','b','c'}); %! D = dummyvar(g); -%! assert(size(D,2) == numel(categories(g))); -%! assert(all(D(:,1) == [1;0;1])); -%! assert(all(D(:,2) == [0;1;0])); -%! assert(all(D(:,3) == [0;0;0])); +%! cats = categories(g); +%! g_str = cellstr(g); +%! +%! for k = 1:numel(cats) +%! mask = strcmp(g_str, cats{k}); +%! assert(all(D(mask, k) == 1)); +%! assert(all(D(!mask, k) == 0)); +%! endfor + %!test -%! ## categorical with -> row of NaNs %! g = categorical({'a'; ''; 'b'}, {'a','b','c'}); %! D = dummyvar(g); %! assert(all(isnan(D(2,:)))); -%! assert(all(D(1,:) == [1 0 0])); -%! assert(all(D(3,:) == [0 1 0])); +%! assert(sum(D(1,:) == 1) == 1); +%! assert(sum(D(3,:) == 1) == 1); %!test -%! ## empty categorical -> -%! g = categorical({}, {'a','b'}); -%! assert (throws (@() dummyvar(g))); +%! G = categorical({'a'; 'b'; 'a'}, {'a','b','c'}); +%! T = table(G, [10;20;30], 'VariableNames', {'G','Val'}); +%! D = dummyvar(T.G); +%! assert(size(D,2) == numel(categories(G))); %!test %! ## table column input @@ -197,7 +177,7 @@ ## Test input validation %!error dummyvar -%!error dummyvar (1, 2) +%!error dummyvar (1, 2) %!error ... %! dummyvar (categorical ([], {'a','b'}))