diff --git a/DESCRIPTION b/DESCRIPTION index 4c891ed..fca7d4f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: stream -Version: 1.5-0.1 -Date: 2021-xx-xx +Version: 1.5-1.0 +Date: 2021-10-11 Encoding: UTF-8 Title: Infrastructure for Data Stream Mining Authors@R: c(person("Michael", "Hahsler", role = c("aut", "cre", "cph"), @@ -14,8 +14,9 @@ Authors@R: c(person("Michael", "Hahsler", role = c("aut", "cre", "cph"), Description: A framework for data stream modeling and associated data mining tasks such as clustering and classification. The development of this package was supported in part by NSF IIS-0948893 and NIH R21HG005912. Hahsler et al (2017) . Depends: R (>= 3.5.0), methods, registry, proxy (>= 0.4-7) Imports: clue, cluster, clusterGeneration, dbscan (>= 1.0-0), fpc, graphics, grDevices, MASS, mlbench, Rcpp (>= 0.11.4), stats, utils -Suggests: animation, DBI, rJava, RSQLite, testthat +Suggests: animation, DBI, rJava, RSQLite, testthat, ggplot2 URL: https://github.com/mhahsler/stream BugReports: https://github.com/mhahsler/stream/issues -LinkingTo: Rcpp, BH +LinkingTo: Rcpp, BH, RcppEigen +RcppModules: SHCModule, SigmaIndexModule License: GPL-3 diff --git a/NAMESPACE b/NAMESPACE index 588593d..95556c2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -148,7 +148,16 @@ export( description, - DSC_registry + DSC_registry, + + SHCAgglomerationType, + SHCDriftType, + DSC_SHC.behavioral, + DSC_SHC.man, + SHCEvalCallback, + stream.SHC, + stream.SHC.clone, + stream.SHC.man ) ### update @@ -247,3 +256,18 @@ S3method(add_keyframe, MGC_Linear) S3method(get_keyframes, MGC_Linear) S3method(remove_keyframe, MGC_Linear) +S3method(get_stats, DSC_SHC) +S3method(get_microclusters, DSC_SHC) +S3method(get_microweights, DSC_SHC) +S3method(get_macroclusters, DSC_SHC) +S3method(get_macroweights, DSC_SHC) +S3method(microToMacro, DSC_SHC) +S3method(get_assignment, DSC_SHC) +S3method(get_outlier_positions, DSC_SHC) +S3method(recheck_outlier, DSC_SHC) +S3method(clean_outliers, 
DSC_SHC) +S3method(plot, DSC_SHC) +S3method(getHistogram, DSC_SHC) +S3method(clearEigenMPSupport, DSC_SHC) + +S3method(evaluate_callback, SHCEvalCallback) diff --git a/NEWS.md b/NEWS.md index 6a79635..c6b0782 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# stream 1.5-1.0 (10/11/21) + +## New Features +* Added DSC_SHC. Code and Interface by Dalibor Krleža. + # stream 1.5-0.1 (xx/xx/21) ## Bug Fixes diff --git a/R/AAAregistry.R b/R/AAAregistry.R index 91e848e..c357ef1 100644 --- a/R/AAAregistry.R +++ b/R/AAAregistry.R @@ -29,3 +29,7 @@ DSC_registry$set_field("DSC_Micro", type = "logical", is_key = TRUE) DSC_registry$set_field("DSC_Macro", type = "logical", is_key = TRUE) +DSC_registry$set_field("DSC_Outlier", type = "logical", + is_key = TRUE) +DSC_registry$set_field("DSC_SinglePass", type = "logical", + is_key = TRUE) diff --git a/R/DSC_BICO.R b/R/DSC_BICO.R index e2b34de..173954f 100644 --- a/R/DSC_BICO.R +++ b/R/DSC_BICO.R @@ -87,6 +87,6 @@ BICO_R$methods( ) DSC_registry$set_entry(name = "DSC_BICO", - DSC_Micro = TRUE, DSC_Macro = FALSE, + DSC_Micro = TRUE, DSC_Macro = FALSE, DSC_Outlier = FALSE, DSC_SinglePass = FALSE, description = "BICO - Fast computation of k-means coresets") diff --git a/R/DSC_BIRCH.R b/R/DSC_BIRCH.R index f8a1396..38018b1 100644 --- a/R/DSC_BIRCH.R +++ b/R/DSC_BIRCH.R @@ -76,6 +76,6 @@ birch$methods( DSC_registry$set_entry(name = "DSC_BIRCH", - DSC_Micro = TRUE, DSC_Macro = FALSE, + DSC_Micro = TRUE, DSC_Macro = FALSE, DSC_Outlier = FALSE, DSC_SinglePass = FALSE, description = "BIRCH - Balanced Iterative Reducing Clustering using Hierarchies") diff --git a/R/DSC_DBSCAN.R b/R/DSC_DBSCAN.R index d84f553..e7474c8 100644 --- a/R/DSC_DBSCAN.R +++ b/R/DSC_DBSCAN.R @@ -112,6 +112,6 @@ DBSCAN$methods( ) DSC_registry$set_entry(name = "DSC_DBSCAN", - DSC_Micro = FALSE, DSC_Macro = TRUE, + DSC_Micro = FALSE, DSC_Macro = TRUE, DSC_Outlier = FALSE, DSC_SinglePass = FALSE, description = "DBSCAN Reclustering") diff --git a/R/DSC_DBSTREAM.R 
b/R/DSC_DBSTREAM.R index 4e16729..196f95e 100644 --- a/R/DSC_DBSTREAM.R +++ b/R/DSC_DBSTREAM.R @@ -445,5 +445,5 @@ get_cluster_assignments <- function(x) { DSC_registry$set_entry(name = "DSC_DBSTREAM", - DSC_Micro = TRUE, DSC_Macro = FALSE, + DSC_Micro = TRUE, DSC_Macro = FALSE, DSC_Outlier = FALSE, DSC_SinglePass = FALSE, description = "DBSTREAM - density-based stream clustering with shared-density-based reclustering.") diff --git a/R/DSC_DStream.R b/R/DSC_DStream.R index 8a93e55..0ddceea 100644 --- a/R/DSC_DStream.R +++ b/R/DSC_DStream.R @@ -482,6 +482,6 @@ get_assignment.DSC_DStream <- function(dsc, points, type=c("auto", "micro", "mac } DSC_registry$set_entry(name = "DSC_DStream", - DSC_Micro = TRUE, DSC_Macro = FALSE, + DSC_Micro = TRUE, DSC_Macro = FALSE, DSC_Outlier = FALSE, DSC_SinglePass = FALSE, description = "DStream") diff --git a/R/DSC_EA.R b/R/DSC_EA.R index bc4bda3..2217f11 100644 --- a/R/DSC_EA.R +++ b/R/DSC_EA.R @@ -144,6 +144,6 @@ EA_R$methods( DSC_registry$set_entry(name = "DSC_EA", - DSC_Micro = FALSE, DSC_Macro = TRUE, + DSC_Micro = FALSE, DSC_Macro = TRUE, DSC_Outlier = FALSE, DSC_SinglePass = FALSE, description = "EA - Reclustering using an evolutionary algorithm") diff --git a/R/DSC_Hierarchical.R b/R/DSC_Hierarchical.R index 2969c73..ee4d244 100644 --- a/R/DSC_Hierarchical.R +++ b/R/DSC_Hierarchical.R @@ -153,6 +153,6 @@ hierarchical$methods( ) DSC_registry$set_entry(name = "DSC_Hierarchical", - DSC_Micro = FALSE, DSC_Macro = TRUE, + DSC_Micro = FALSE, DSC_Macro = TRUE, DSC_Outlier = FALSE, DSC_SinglePass = FALSE, description = "Hierarchical reclustering") diff --git a/R/DSC_Kmeans.R b/R/DSC_Kmeans.R index bad87e5..936b1d6 100644 --- a/R/DSC_Kmeans.R +++ b/R/DSC_Kmeans.R @@ -1,6 +1,6 @@ ####################################################################### # stream - Infrastructure for Data Stream Mining -# Copyright (C) 2013 Michael Hahsler, Matthew Bolanos, John Forrest +# Copyright (C) 2013 Michael Hahsler, Matthew Bolanos, John Forrest 
# # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -16,17 +16,17 @@ # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -### creator +### creator DSC_Kmeans <- function(k, weighted = TRUE, iter.max = 10, nstart = 1, algorithm = c("Hartigan-Wong", "Lloyd", "Forgy", - "MacQueen"), + "MacQueen"), min_weight = NULL, description=NULL) { - + algorithm <- match.arg(algorithm) if(!is.null(description)) desc <- description else if(weighted) desc <- "k-Means (weighted)" else desc <-"k-Means" - + structure(list(description = desc, RObj = kmeans_refClass$new( k=k, weighted=weighted, iter.max = iter.max, nstart = nstart, @@ -35,7 +35,7 @@ DSC_Kmeans <- function(k, weighted = TRUE, iter.max = 10, nstart = 1, } -kmeans_refClass <- setRefClass("kmeans", +kmeans_refClass <- setRefClass("kmeans", fields = list( k = "numeric", weighted = "logical", @@ -49,62 +49,62 @@ kmeans_refClass <- setRefClass("kmeans", clusterWeights = "numeric", details = "ANY", min_weight = "numeric" - ), - + ), + methods = list( initialize = function( k = 3, weighted = TRUE, iter.max = 10, nstart = 1, - algorithm = c("Hartigan-Wong", "Lloyd", + algorithm = c("Hartigan-Wong", "Lloyd", "Forgy","MacQueen"), min_weight = NULL ) { - - k <<- k + + k <<- k weighted <<- weighted - iter.max <<- iter.max + iter.max <<- iter.max nstart <<- nstart algorithm <<- match.arg(algorithm) - assignment <<- numeric() - weights <<- numeric() - clusterWeights <<- numeric() + assignment <<- numeric() + weights <<- numeric() + clusterWeights <<- numeric() clusterCenters <<- data.frame() data <<- data.frame() - + if(is.null(min_weight)) min_weight <<- 0 else min_weight <<- min_weight - + .self } - + ), ) kmeans_refClass$methods( cluster = function(x, weight = rep(1,nrow(x)), ...) 
{ - - # if(nrow(x)==1) + + # if(nrow(x)==1) # warning("DSC_Kmeans does not support iterative updating! Old data is overwritten.") - + ### filter weak clusters if(min_weight>0) { x <- x[weight>min_weight,] weight <- weight[weight>min_weight] } - - + + weights <<- weight data <<- x - + if(nrow(data)>k) { - if(weighted) km <- kmeansW(x=data, weight=weights, centers=k, + if(weighted) km <- kmeansW(x=data, weight=weights, centers=k, iter.max = iter.max, nstart = nstart) - else km <- kmeans(x=data, centers=k, + else km <- kmeans(x=data, centers=k, iter.max = iter.max, nstart = nstart, algorithm = algorithm) - + assignment <<- km$cluster clusterCenters <<- data.frame(km$centers) details <<- km @@ -113,26 +113,26 @@ kmeans_refClass$methods( clusterCenters <<- x details <<- NULL } - + clusterWeights <<- sapply(1:k, FUN = function(i) sum(weights[assignment==i])) - + }, - + get_macroclusters = function(...) { clusterCenters }, get_macroweights = function(...) { clusterWeights }, - + get_microclusters = function(...) 
{ data }, get_microweights = function(x) { weights }, - - microToMacro = function(micro=NULL, ...){ + + microToMacro = function(micro=NULL, ...){ if(is.null(micro)) micro <- 1:nrow(data) structure(assignment[micro], names=micro) - } + } ) DSC_registry$set_entry(name = "DSC_Kmeans", - DSC_Micro = FALSE, DSC_Macro = TRUE, + DSC_Micro = FALSE, DSC_Macro = TRUE, DSC_Outlier = FALSE, DSC_SinglePass = FALSE, description = "K-means reclustering") diff --git a/R/DSC_Reachability.R b/R/DSC_Reachability.R index f80e0e7..3480156 100644 --- a/R/DSC_Reachability.R +++ b/R/DSC_Reachability.R @@ -1,6 +1,6 @@ ####################################################################### # stream - Infrastructure for Data Stream Mining -# Copyright (C) 2013 Michael Hahsler, Matthew Bolanos, John Forrest +# Copyright (C) 2013 Michael Hahsler, Matthew Bolanos, John Forrest # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -18,20 +18,20 @@ # Reachability and single-link hierarchical clustering are equivalent -### creator +### creator DSC_Reachability <- function(epsilon, min_weight=NULL, description=NULL) { - - hierarchical <- hierarchical$new( + + hierarchical <- hierarchical$new( h=epsilon, method="single", min_weight=min_weight) - + if(is.null(description)) description <- "Reachability" - + l <- list(description = description, RObj = hierarchical) class(l) <- c("DSC_Reachability", "DSC_Hierarchical", "DSC_Macro", "DSC_R", "DSC") l } DSC_registry$set_entry(name = "DSC_Reachability", - DSC_Micro = FALSE, DSC_Macro = TRUE, + DSC_Micro = FALSE, DSC_Macro = TRUE, DSC_Outlier = FALSE, DSC_SinglePass = FALSE, description = "Reachability reclustering") diff --git a/R/DSC_SHC.R b/R/DSC_SHC.R new file mode 100644 index 0000000..9bfda8c --- /dev/null +++ b/R/DSC_SHC.R @@ -0,0 +1,454 @@ +SHCAgglomerationType <- list(NormalAgglomeration=0,AggresiveAgglomeration=1,RelaxedAgglomeration=2) +SHCDriftType <- 
list(NormalDrift=0,FastDrift=1,SlowDrift=2,NoDrift=3,UltraFastDrift=4) +.reserved <- c("class","assigned","clazz","component","cids","isOutlier","stopHere","clazzAsOutlier") + +stream.SHC <- setRefClass("stream.SHC", + fields = list( + shc="ANY", + recStats="logical", + stat_val="data.frame", + stat_idx="numeric" + ), + methods = list( + initialize=function(dimensions,aggloType=SHCAgglomerationType$NormalAgglomeration,driftType=SHCDriftType$NormalDrift, + decaySpeed=10,sharedAgglomerationThreshold=1,recStats=FALSE,sigmaIndex=FALSE,sigmaIndexNeighborhood=3, + sigmaIndexPrecisionSwitch=TRUE) { + recStats <<- recStats + stat_val <<- data.frame() + stat_idx <<- 1 + shc <<- new(SHC_R,dimensions,aggloType,driftType,decaySpeed,sharedAgglomerationThreshold) + if(sigmaIndex) + shc$useSigmaIndex(sigmaIndexNeighborhood,sigmaIndexPrecisionSwitch) + } +)) + +stream.SHC$methods(list( + copy = function(...) { + n <- stream.SHC(shc) + return(n) + }, + + cache = function(...){ + }, + + uncache = function(...) { + }, + + process = function(newdata) { + if(!is.data.frame(newdata)) + stop("Submitted data parameter must be a data frame") + return(shc$process(newdata,F)) + }, + + cluster=function(newdata,type=c("none", "auto", "micro", "macro"),...) 
{ + if(!is.data.frame(newdata)) + stop("Submitted data parameter must be a data frame") + type <- match.arg(type) + ret <- shc$process(newdata,F) + if(recStats) { + s <- shc$stats() + stat_val <<- rbind(stat_val,data.frame(index=stat_idx,components=s$components,outliers=s$outliers)) + stat_idx <<- stat_idx + 1 + } + ret <- cbind(ret,data.frame(outlier_id=ret[,"component_id"])) + ret[!ret$outlier,"outlier_id"] <- NA + #ret[ret$outlier,"assigned_comp"] <- NA + #ret[ret$outlier,"assigned_cluster"] <- NA + if(type=="none" || type=="auto" || type=="micro") + predict <- ret[,"assigned_comp"] + else + predict <- ret[,"assigned_cluster"] + attr(predict,"outliers") <- ret[,"outlier"] + attr(predict,"outliers_corrid") <- ret[,"outlier_id"] + predict + }, + + getStats=function() { + return(stat_val) + }, + + getComponentAndOutlierStatistics=function() { + return(shc$stats()) + }, + + get_assignment=function(newdata,type=c("none", "auto", "micro", "macro"),...) { + if(!is.data.frame(newdata)) + stop("Submitted data parameter must be a data frame") + type <- match.arg(type) + ret <- shc$process(newdata,T) + ret <- cbind(ret,data.frame(outlier_id=ret[,"component_id"])) + ret[!ret$outlier,"outlier_id"] <- NA + #ret[ret$outlier,"assigned_comp"] <- NA + #ret[ret$outlier,"assigned_cluster"] <- NA + if(type=="none" || type=="auto" || type=="micro") + predict <- ret[,"assigned_comp"] + else + predict <- ret[,"assigned_cluster"] + attr(predict,"outliers") <- ret[,"outlier"] + attr(predict,"outliers_corrid") <- ret[,"outlier_id"] + predict + }, + + get_microclusters=function(...) { + df <- data.frame() + for(compId in shc$getAllComponents()) { + compDesc <- shc$getComponentDetails(compId) + df <- rbind(df, data.frame(matrix(compDesc$mean, ncol=length(compDesc@mean)))) + } + return(df) + }, + + get_microweights=function(...) 
{ + df <- data.frame() + for(compId in shc$getAllComponents()) { + df <- rbind(df, data.frame(W=shc$getComponentWeight(compId))) + } + return(df) + }, + + get_macroclusters=function(...) { + df <- data.frame() + for(clusId in shc$getClusters(T,T)) { + tw <- 0 + cen <- NULL + dim <- 0 + for(compId in shc$getComponents(clusId)) { + compDesc <- shc$getComponentDetails(compId) + if(is.null(cen)) cen <- rep(0,compDesc$dimensions) + dim <- length(compDesc$mean) + tw <- tw+compDesc$elements; + cen <- cen+(compDesc$elements*compDesc$mean) + } + df <- rbind(df, data.frame(matrix(cen/tw, ncol=dim))) + } + return(df) + }, + + get_macroweights=function(...) { + df <- data.frame() + for(clusId in shc$getClusters(T,T)) { + df <- rbind(df, data.frame(W=shc$getClusterWeigth(clusId))) + } + return(df) + }, + + microToMacro=function(micro, ...) { + if(is.null(micro) || all(is.na(micro)) || length(micro)==0) + return(c(0)) + return(shc$microToMacro(micro)) + }, + + microToMicro=function(micro, ...) { # this mapps obsolete, removed and agglomerated components to new ones + if(is.null(micro) || all(is.na(micro)) || length(micro)==0) + return(c(0)) + res <- c() + for(i in 1:length(micro)) { + assigned_compId <- micro[[i]] # this is not SHC component id, it is a mapped value needed for the stream package + for(nassigned_compId in shc$microToMicro(assigned_compId)) + if(nassigned_compId>0) res <- append(res, nassigned_compId) + } + if(length(res)==0) return(c(0)) # return UNASSIGNED if no new components has been found + return(res) + }, + + setPseudoOfflineCounter=function(counter, ...) { + shc$setPseudoOfflineCounter(counter) + }, + + getTimes=function(...) { + return(shc$getTimes()) + }, + + getNodeCounter=function(...) { + return(shc$getNodeCounter()) + }, + + getComputationCostReduction=function(...) { + return(shc$getComputationCostReduction()) + }, + + getHistogram=function(...) { + return(shc$getHistogram()) + }, + + recheckOutlier=function(id,...) 
{ + return(shc$recheckOutlier(id)) + }, + + getOutlierPositions=function(...) { + return(shc$getOutlierPositions()) + }, + + getTrace=function(id,...) { + return(shc$getTrace(id)) + }, + + clearEigenMPSupport=function() { + shc$clearEigenMPSupport(); + } +)) + +stream.SHC.clone <- setRefClass("stream.SHC.clone", + contains = "stream.SHC", + methods = list( + initialize=function(old_shc) { + stat_val <<- data.frame() + stat_idx <<- 1 + recStats <<- FALSE + shc <<- old_shc$shc$cloneSHC() + } + )) + +stream.SHC.man <- setRefClass("stream.SHC.man", + contains = "stream.SHC", + methods = list( + initialize=function(dimensions,theta,virtualVariance,parallelize=FALSE,performSharedAgglomeration=TRUE, + agglo_count=100,cbVarianceLimit=as.double(10.0),cbNLimit=as.integer(40), + driftRemoveCompSizeRatio=as.double(0.3),driftCheckingSizeRatio=as.double(1.3), + driftMovementMDThetaRatio=as.double(0.8),decaySpeed=as.integer(10), + sharedAgglomerationThreshold=as.integer(1),compFormingMinVVRatio=as.double(0.2), + compBlockingLimitVVRatio=as.double(0.0),recStats=FALSE,sigmaIndex=FALSE, + sigmaIndexNeighborhood=3,sigmaIndexPrecisionSwitch=TRUE) { + recStats <<- recStats + stat_val <<- data.frame() + stat_idx <<- 1 + vv <- rep(virtualVariance,dimensions) + params <- list(theta=theta,parallelize=parallelize,performSharedAgglomeration=performSharedAgglomeration, + virtualVariance=vv,agglo_count=agglo_count,cbVarianceLimit=cbVarianceLimit, + cbNLimit=cbNLimit,driftRemoveCompSizeRatio=driftRemoveCompSizeRatio, + driftCheckingSizeRatio=driftCheckingSizeRatio,driftMovementMDThetaRatio=driftMovementMDThetaRatio, + decayPace=decaySpeed,sharedAgglomerationThreshold=sharedAgglomerationThreshold, + componentFormingMinVVRatio=compFormingMinVVRatio, + componentBlockingLimitVVRatio=compBlockingLimitVVRatio) + shc <<- new(SHC_R,params) + if(sigmaIndex) + shc$useSigmaIndex(sigmaIndexNeighborhood,sigmaIndexPrecisionSwitch) + } + )) + +DSC_SHC.behavioral <- 
function(dimensions,aggloType=SHCAgglomerationType$NormalAgglomeration, + driftType=SHCDriftType$NormalDrift,decaySpeed=10,sharedAgglomerationThreshold=1, + recStats=FALSE,sigmaIndex=FALSE,sigmaIndexNeighborhood=3,sigmaIndexPrecisionSwitch=TRUE) { + x <- stream.SHC(dimensions,aggloType,driftType,decaySpeed,sharedAgglomerationThreshold,recStats, + sigmaIndex,sigmaIndexNeighborhood,sigmaIndexPrecisionSwitch) + macro <- new.env() + macro$theta <- x$shc$theta() + macro$virtualVariance <- x$shc$virtualVariance() + + structure( + list( + description = "Statistical Hierarchical Clustering", + RObj = x, + recheck_outliers = T, + macro = macro + ), class = c("DSC_SHC", "DSC_SinglePass", "DSC_Outlier", "DSC_Micro", "DSC_R", "DSC") + ) +} + +DSC_SHC.man <- function(dimensions,theta,virtualVariance,parallelize=FALSE,performSharedAgglomeration=TRUE, + compAssimilationCheckCounter=50,cbVarianceLimit=as.double(10.0),cbNLimit=as.integer(40), + driftRemoveCompSizeRatio=as.double(0.3),driftCheckingSizeRatio=as.double(1.3), + driftMovementMDThetaRatio=as.double(0.8),decaySpeed=as.integer(10), + sharedAgglomerationThreshold=as.integer(1),compFormingMinVVRatio=as.double(0.2), + compBlockingLimitVVRatio=as.double(0.0),recStats=FALSE,sigmaIndex=FALSE, + sigmaIndexNeighborhood=3,sigmaIndexPrecisionSwitch=TRUE) { + x <- stream.SHC.man(dimensions,theta,virtualVariance,parallelize,performSharedAgglomeration, + compAssimilationCheckCounter,cbVarianceLimit,cbNLimit,driftRemoveCompSizeRatio,driftCheckingSizeRatio, + driftMovementMDThetaRatio,decaySpeed,sharedAgglomerationThreshold, + compFormingMinVVRatio,compBlockingLimitVVRatio,recStats,sigmaIndex, + sigmaIndexNeighborhood,sigmaIndexPrecisionSwitch) + macro <- new.env() + macro$theta <- x$shc$theta() + macro$virtualVariance <- x$shc$virtualVariance() + + structure( + list( + description = "Statistical Hierarchical Clustering", + RObj = x, + recheck_outliers = T, + macro = macro + ), class = c("DSC_SHC", "DSC_SinglePass", "DSC_Outlier", 
"DSC_Micro", "DSC_R", "DSC") + ) +} + +get_stats <- function(x) UseMethod("get_stats") +get_stats.default <- function(x, ...) { + stop(gettextf("get_stats not implemented for class '%s'.", + paste(class(x), collapse=", "))) +} +get_stats.DSC_SHC <- function(x, ...) { + return(x$RObj$getStats()) +} + +get_microclusters.DSC_SHC <- function(x,...){ + return(x$RObj$get_microclusters(...)) +} + +get_microweights.DSC_SHC <- function(x, ...) { + return(x$RObj$get_microweights(...)) +} + +get_macroclusters.DSC_SHC <- function(x,...){ + return(x$RObj$get_macroclusters(...)) +} + +get_macroweights.DSC_SHC <- function(x, ...) { + return(x$RObj$get_macroweights(...)) +} + +microToMacro.DSC_SHC <- function(x, micro=NULL, ...) { + return(x$RObj$microToMacro(micro,...)) +} + +get_assignment.DSC_SHC <- function(dsc, points, type=c("auto", "micro", "macro"), method=c("auto", "model", "nn"), ...) { + return(dsc$RObj$cluster(points,type,...)) +} + +get_outlier_positions.DSC_SHC <- function(x, ...) { + return(x$RObj$getOutlierPositions()) +} + +recheck_outlier.DSC_SHC <- function(x, outlier_correlated_id, ...) { + return(x$RObj$recheckOutlier(outlier_correlated_id)) +} + +plot.DSC_SHC <- function(x, dsd = NULL, n = 500, type = c("auto", "micro", "macro", "both"), + displayDataPoints=TRUE, ...) 
{ + if(!requireNamespace("ggplot2")) + message("Plotting for the Statistical Hierarchical Clusterer requires ggplot2 package") + else { + #loadNamespace("ggplot2") + shc_plot <- ggplot2::ggplot() + ggplot2::theme_bw() + + ggplot2::theme(panel.grid.major = ggplot2::element_blank(), panel.grid.minor = ggplot2::element_blank(), + axis.ticks = ggplot2::element_blank(), axis.text = ggplot2::element_blank(), + axis.title = ggplot2::element_blank(), legend.position = "none") + if(displayDataPoints) { + d <- get_points(dsd, n, cluster = TRUE, outlier = TRUE) + assignment_c <- attr(d, "cluster") + assignment_o <- attr(d, "outlier") + d_clus <- d[which(!assignment_o),] + d_clus_col <- assignment_c[which(!assignment_o)] + colnames(d_clus) <- paste0("X", 1:ncol(d_clus)) + d_out <- d[which(assignment_o),] + d_out_col <- assignment_c[which(assignment_o)] + colnames(d_out) <- paste0("X", 1:ncol(d_out)) + shc_plot <- shc_plot + + ggplot2::geom_point(data = d_clus, ggplot2::aes_string(x="X1", y="X2", colour=factor(d_clus_col)), size=1, show.legend=F) + + ggplot2::geom_point(data = d_out, ggplot2::aes_string(x="X1", y="X2", colour=factor(d_out_col)), shape=8, size=2, show.legend=F) + } + + clusts <- x$RObj$shc$getClusters(T,F) + #print(paste0("Clusters [",paste(clusts,collapse=","),"]")) + outls <- x$RObj$shc$getClusters(F,T) + #print(paste0("Outliers [",paste(outls,collapse=","),"]")) + for(clus_id in c(clusts)) { + comps <- x$RObj$shc$getComponents(clus_id) + cd <- x$RObj$shc$getClusterContours(clus_id) + for(comp_id in comps) { + details <- as.data.frame(cd[comp_id]); + colnames(details)=c("X1","X2") + shc_plot <- shc_plot + ggplot2::geom_path(data=details, ggplot2::aes_string(x="X1", y="X2"), show.legend=FALSE, size=0.9, colour="red") + } + } + for(clus_id in c(outls)) { + comps <- x$RObj$shc$getComponents(clus_id) + cd <- x$RObj$shc$getClusterContours(clus_id) + for(comp_id in comps) { + details <- as.data.frame(cd[comp_id]); + colnames(details)=c("X1","X2") + shc_plot <- 
shc_plot + ggplot2::geom_path(data=details, ggplot2::aes_string(x="X1", y="X2"), show.legend=FALSE, size=0.7, colour="green", linetype="dotted") + } + } + plot(shc_plot) + } +} + +get_times <- function(x) UseMethod("get_times") +get_times.default <- function(x,...) { + stop(gettextf("get_times not implemented for class '%s'.", + paste(class(x), collapse=", "))) +} +get_times.DSC_SHC <- function(x,...) { + return(x$RObj$getTimes()) +} + +get_node_counter <- function(x) UseMethod("get_node_counter") +get_node_counter.default <- function(x,...) { + stop(gettextf("get_node_counter not implemented for class '%s'.", + paste(class(x), collapse=", "))) +} +get_node_counter.DSC_SHC <- function(x,...) { + return(x$RObj$getNodeCounter()) +} + +get_computation_cost_reduction <- function(x) UseMethod("get_computation_cost_reduction") +get_computation_cost_reduction.default <- function(x,...) { + stop(gettextf("get_computation_cost_reduction not implemented for class '%s'.", + paste(class(x), collapse=", "))) +} +get_computation_cost_reduction.DSC_SHC <- function(x,...) { + return(x$RObj$getComputationCostReduction()) +} + +clean_outliers.DSC_SHC <- function(x, ...) { + x$RObj$clean_outliers() +} + +setPseudoOfflineCounter <- function(x,counter) UseMethod("setPseudoOfflineCounter") +setPseudoOfflineCounter.default <- function(x,counter, ...) { + stop(gettextf("setPseudoOfflineCounter not implemented for class '%s'.", + paste(class(x), collapse=", "))) +} +setPseudoOfflineCounter.DSC_SHC <- function(x,counter, ...) { + x$RObj$setPseudoOfflineCounter(counter) +} + +getHistogram <- function(x) UseMethod("getHistogram") +getHistogram.default <- function(x, ...) { + stop(gettextf("getHistogram not implemented for class '%s'.", + paste(class(x), collapse=", "))) +} +getHistogram.DSC_SHC <- function(x, ...) { + x$RObj$getHistogram() +} + +clearEigenMPSupport <- function(x) UseMethod("clearEigenMPSupport") +clearEigenMPSupport.default <- function(x, ...) 
{ + stop(gettextf("clearEigenMPSupport not implemented for class '%s'.", + paste(class(x), collapse=", "))) +} +clearEigenMPSupport.DSC_SHC <- function(x, ...) { + x$RObj$clearEigenMPSupport() +} + +.shc_measures <- c("queryTime","updateTime","processTime","nodeCount","computationCostReduction") + +SHCEvalCallback <- function() { + env = environment() + all_measures <- .shc_measures + internal_measures <- .shc_measures + external_measures <- c() + outlier_measures <- c() + this <- list( + description = "SHC evaluation callback", + env = env + ) + class(this) <- c("SHCEvalCallback", "EvalCallback") + this +} +evaluate_callback.SHCEvalCallback <- function(cb_obj, dsc, measure, points, actual, predict, outliers, + predict_outliers, predict_outliers_corrid, + centers, noise, ...) { + r <- list() + times <- get_times(dsc) + if("queryTime" %in% measure) r$queryTime <- times$queryTime + if("updateTime" %in% measure) r$updateTime <- times$updateTime + if("processTime" %in% measure) r$processTime <- times$processTime + if("nodeCount" %in% measure) r$nodeCount <- get_node_counter(dsc) + if("computationCostReduction" %in% measure) r$computationCostReduction <- get_computation_cost_reduction(dsc)*100 + r +} + +DSC_registry$set_entry(name = "DSC_SHC", + DSC_Micro = TRUE, DSC_Macro = TRUE, DSC_Outlier = TRUE, DSC_SinglePass = TRUE, + description = "SHC - Statistical Hierarchical Clusterer") diff --git a/R/DSC_Sample.R b/R/DSC_Sample.R index 7969a43..81134a1 100644 --- a/R/DSC_Sample.R +++ b/R/DSC_Sample.R @@ -24,5 +24,5 @@ DSC_Sample <- function(k = 100, biased = FALSE) class = c("DSC_Sample","DSC_Micro","DSC_R","DSC")) DSC_registry$set_entry(name = "DSC_Sample", - DSC_Micro = TRUE, DSC_Macro = FALSE, + DSC_Micro = TRUE, DSC_Macro = FALSE, DSC_Outlier = FALSE, DSC_SinglePass = FALSE, description = "Reservoir sampling") diff --git a/R/DSC_Static.R b/R/DSC_Static.R index 58a69b2..5633d5f 100644 --- a/R/DSC_Static.R +++ b/R/DSC_Static.R @@ -1,6 +1,6 @@ 
####################################################################### # stream - Infrastructure for Data Stream Mining -# Copyright (C) 2013 Michael Hahsler, Matthew Bolanos, John Forrest +# Copyright (C) 2013 Michael Hahsler, Matthew Bolanos, John Forrest # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -17,18 +17,18 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -DSC_Static <- function(x, type=c("auto", "micro", "macro"), +DSC_Static <- function(x, type=c("auto", "micro", "macro"), k_largest=NULL, min_weight=NULL) { - + ### figure out type type <- get_type(x, type) if(type=="macro") macro <- TRUE else macro <- FALSE - + ### make sure it is a data.frame centers <- as.data.frame(get_centers(x, type)) weights <- get_weights(x, type) - + if(!is.null(k_largest)) { if(k_largest>nclusters(x)) { warning("Less clusters than k. Using all clusters.") @@ -38,47 +38,47 @@ DSC_Static <- function(x, type=c("auto", "micro", "macro"), weights <- weights[o] } } - + if(!is.null(min_weight)) { take <- weights>=min_weight centers <- centers[take,] weights <- weights[take] } - + static <- Static$new(centers, weights, macro=macro) - + l <- list(description = "Static clustering", RObj = static) - + if(macro) micromacro <- "DSC_Macro" else micromacro <- "DSC_Micro" - + class(l) <- c("DSC_Static", micromacro, "DSC_R","DSC") - + l } -Static <- setRefClass("Static", +Static <- setRefClass("Static", fields = list( centers = "data.frame", weights = "numeric", macro = "logical" - ), - + ), + methods = list( initialize = function( centers = data.frame(), weights = numeric(), macro = FALSE ) { - - centers <<- centers + + centers <<- centers weights <<- weights macro <<- macro - + .self } - + ), ) @@ -86,22 +86,22 @@ Static$methods( cluster = function(newdata, ...) { stop("DSC_Static: cluster not implemented!") }, - + get_macroweights = function(...) 
{ if(!macro) stop("This is a micro-clustering!") weights }, - + get_macroclusters = function(...) { if(!macro) stop("This is a micro-clustering!") centers }, - + get_microweights = function(...) { if(macro) stop("This is a macro-clustering!") weights }, - + get_microclusters = function(...) { if(macro) stop("This is a macro-clustering!") centers @@ -109,7 +109,7 @@ Static$methods( ) DSC_registry$set_entry(name = "DSC_Static", - DSC_Micro = TRUE, DSC_Macro = TRUE, + DSC_Micro = TRUE, DSC_Macro = TRUE, DSC_Outlier = FALSE, DSC_SinglePass = FALSE, description = "Static Copy of a Clustering") diff --git a/R/DSC_Window.R b/R/DSC_Window.R index 64cf580..8f5128a 100644 --- a/R/DSC_Window.R +++ b/R/DSC_Window.R @@ -1,6 +1,6 @@ ####################################################################### # stream - Infrastructure for Data Stream Mining -# Copyright (C) 2013 Michael Hahsler, Matthew Bolanos, John Forrest +# Copyright (C) 2013 Michael Hahsler, Matthew Bolanos, John Forrest # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -17,13 +17,13 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
### interface for DSO_Window (see DSO_Window.R) -DSC_Window <- function(horizon = 100, lambda=0) +DSC_Window <- function(horizon = 100, lambda=0) structure(list(description = if(lambda>0) "Damped sliding window" else "Sliding window", RObj = WindowDSC$new(horizon = as.integer(horizon), lambda=lambda)), class = c("DSC_Window","DSC_Micro","DSC_R","DSC")) DSC_registry$set_entry(name = "DSC_Window", - DSC_Micro = TRUE, DSC_Macro = FALSE, + DSC_Micro = TRUE, DSC_Macro = FALSE, DSC_Outlier = FALSE, DSC_SinglePass = FALSE, description = "Sliding window") - + diff --git a/R/DSC_evoStream.R b/R/DSC_evoStream.R index 897a626..26d327d 100644 --- a/R/DSC_evoStream.R +++ b/R/DSC_evoStream.R @@ -157,6 +157,6 @@ evoStream_R$methods( ) DSC_registry$set_entry(name = "DSC_evoStream", - DSC_Micro = TRUE, DSC_Macro = FALSE, + DSC_Micro = TRUE, DSC_Macro = FALSE, DSC_Outlier = FALSE, DSC_SinglePass = FALSE, description = "evoStream - Evolutionary Stream Clustering") diff --git a/R/RcppExports.R b/R/RcppExports.R index bb311fd..ec81566 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,6 +1,22 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +shc_MDSeparation <- function(mean1, covariance1, virtualVariance1, N1, isInversion1, th1, mean2, covariance2, virtualVariance2, N2, isInversion2, th2) { + .Call(`_stream_shc_MDSeparation`, mean1, covariance1, virtualVariance1, N1, isInversion1, th1, mean2, covariance2, virtualVariance2, N2, isInversion2, th2) +} + +shc_MutualMinMahalanobis <- function(mean1, covariance1, virtualVariance1, N1, isInversion1, mean2, covariance2, virtualVariance2, N2, isInversion2) { + .Call(`_stream_shc_MutualMinMahalanobis`, mean1, covariance1, virtualVariance1, N1, isInversion1, mean2, covariance2, virtualVariance2, N2, isInversion2) +} + +shc_CalculateNewMean <- function(mean, newElement, N) { + .Call(`_stream_shc_CalculateNewMean`, mean, newElement, N) +} + 
+shc_CalculateCovariance <- function(oldMean, newMean, oldCovariance, N, newElement, isInversion) { + .Call(`_stream_shc_CalculateCovariance`, oldMean, newMean, oldCovariance, N, newElement, isInversion) +} + kmnsw <- function(a_R, c_R, wh_R, k, iter) { .Call(`_stream_kmnsw`, a_R, c_R, wh_R, k, iter) } diff --git a/R/stream-package.R b/R/stream-package.R new file mode 100644 index 0000000..8b23f05 --- /dev/null +++ b/R/stream-package.R @@ -0,0 +1 @@ +loadModule("SHCModule",TRUE) diff --git a/man/DSC_SHC.behavioral.Rd b/man/DSC_SHC.behavioral.Rd new file mode 100644 index 0000000..9da2fcc --- /dev/null +++ b/man/DSC_SHC.behavioral.Rd @@ -0,0 +1,72 @@ +\docType{class} +\name{DSC_SHC.behavioral-class} +\alias{DSC_SHC.behavioral} +\alias{clearEigenMPSupport,DSC_SHC.behavioral-method} +\alias{clearEigenMPSupport} +\title{Statistical Hierarchical Clusterer - behavioral construction} +\usage{ +DSC_SHC.behavioral(dimensions,aggloType=SHCAgglomerationType$NormalAgglomeration, +driftType=SHCDriftType$NormalDrift,decaySpeed=10,sharedAgglomerationThreshold=1, +recStats=FALSE,sigmaIndex=FALSE,sigmaIndexNeighborhood=3,sigmaIndexPrecisionSwitch=TRUE) +} +\arguments{ +\item{dimensions}{(integer) - A number of space dimensions.} +\item{aggloType}{(list, SHCAgglomerationType) - Agglomeration type: \emph{NormalAgglomeration,AggresiveAgglomeration,RelaxedAgglomeration}.} +\item{driftType}{(list, SHCDriftType) - Drift type: \emph{NormalDrift,FastDrift,SlowDrift,NoDrift,UltraFastDrift}.} +\item{decaySpeed}{(integer) - Components decay speed. 
0 = no decay, >0 = higher number represents slower decay.} +\item{sharedAgglomerationThreshold}{(integer) - A number of data instances between components that cause their agglomeration under the same cluster.} +\item{recStats}{(logical) - A flag that indicates whether the SH clusterer should return statistics about the number of components and outliers generated during values processing.} +\item{sigmaIndex}{(logical) - A flag that indicates whether the SH clusterer should utilize the Sigma-index for speeding up the statistical space query.} +\item{sigmaIndexNeighborhood}{(integer) - A multiplier for the statistical neighborhood used to manage Sigma-index. This multiplier is used in conjunction with SH clusterer statistical threshold theta - hereby determined by the \emph{aggloType} parameter.} +\item{sigmaIndexPrecisionSwitch}{(logical) - A flag that indicates whether the Sigma-index should maintain precision over the speed.} +} +\section{Methods}{ +\describe{ +All methods here are detailed in the stream package. +\item{\code{get_stats(dsc,...)}}{ + Returns components and outliers statistics for the last data processing. Only when \emph{recStats=T}. +} +\item{\code{\link[stream]{get_microclusters}(dsc,...)}}{ + Returns a list of current component centers. +} +\item{\code{\link[stream]{get_microweights}(dsc,...)}}{ + Returns the list of current component weights, based on the component population divided by the total population. +} +\item{\code{\link[stream]{get_macroclusters}(dsc,...)}}{ + Returns a list of current cluster centers. +} +\item{\code{\link[stream]{get_macroweights}(dsc,...)}}{ + Returns a list of current cluster weights, based on the cluster population (sum of component populations) divided by the total population. +} +\item{\code{\link[stream]{microToMacro}(dsc,micro=NULL,...)}}{ + Returns a mapping between components and clusters. 
+} +\item{\code{\link[stream]{get_assignment}(dsc, points, type=c("auto", "micro", "macro"), method=c("auto", "model", "nn"), ...)}}{ + Returns assignments of points to component and clusters, depending on the \emph{type} parameter value. Returns a data frame with assignment values, and possibly additional attributes that indicate outliers and their SHC internal identifiers. +} +\item{\code{get_outlier_positions(dsc, ...)}}{ + Returns a list of outliers and their positions. +} +\item{\code{recheck_outlier_positions(dsc, outlier_correlated_id, ...)}}{ + Performs re-checking of the previously encountered outlier. An SHC outlier identifier must be supplied. Function returns TRUE when the outlier still stands (or decayed), or FALSE if it was assimilated by some component in the meantime. +} +\item{\code{clearEigenMPSupport(dsc, ...)}}{ + Clears the OpenMP usage by the Eigen linear algebra package. Introduced only for the reproducibility purposes. +}}} +\description{ +The Statistical Hierachical Clusterer class. +} +\details{ +Instantiates an SHC \link[stream]{DSC} object that represents an extension of abstract classes in the stream package, namely \link[stream]{DSC_SinglePass}, \link[stream]{DSC_Outlier}, \link[stream]{DSC_Macro}, and \link[stream]{DSC_R}. This object can be used in all stream package methods. +} +\examples{ +d <- DSD_Gaussians(k=2,d=2,outliers=2,separation_type="Mahalanobis",separation=4, + space_limit=c(0,15),outlier_options=list(outlier_horizon=10000)) +c <- DSC_SHC.behavioral(2,driftType = SHCDriftType$NoDrift,decaySpeed = 0) +update(c,d,n=10000) +reset_stream(d) +plot(c,d,n=10000) +} +\references{ +[1] Krleža D, Vrdoljak B, and Brčić M, Statistical hierarchical clustering algorithm for outlier detection in evolving data streams, \emph{Machine Learning}, Sep. 
2020 +} diff --git a/man/DSC_SHC.man.Rd b/man/DSC_SHC.man.Rd new file mode 100644 index 0000000..8b97112 --- /dev/null +++ b/man/DSC_SHC.man.Rd @@ -0,0 +1,87 @@ +\docType{class} +\name{DSC_SHC.man-class} +\alias{DSC_SHC.man} +\alias{clearEigenMPSupport,DSC_SHC.man-method} +\title{Statistical Hierarchical Clusterer - manual construction} +\usage{ +DSC_SHC.man(dimensions,theta,virtualVariance,parallelize=FALSE, +performSharedAgglomeration=TRUE,compAssimilationCheckCounter=50, +cbVarianceLimit=as.double(10.0),cbNLimit=as.integer(40), +driftRemoveCompSizeRatio=as.double(0.3),driftCheckingSizeRatio=as.double(1.3), +driftMovementMDThetaRatio=as.double(0.8),decaySpeed=as.integer(10), +sharedAgglomerationThreshold=as.integer(1),compFormingMinVVRatio=as.double(0.2), +compBlockingLimitVVRatio=as.double(0.0),recStats=FALSE,sigmaIndex=FALSE, +sigmaIndexNeighborhood=3,sigmaIndexPrecisionSwitch=TRUE) +} +\arguments{ +\item{dimensions}{(integer) - A number of space dimensions.} +\item{theta}{(real) - A statistical distance threshold for component bounds.} +\item{virtualVariance}{(real) - A variance used to construct virtual covariance matrix surrounding outliers.} +\item{parallelize}{(logical) - Not implemented yet. Leave at FALSE.} +\item{performSharedAgglomeration}{(logical) - Depreciated. Use \emph{sharedAgglomerationThreshold} instead.} +\item{compAssimilationCheckCounter}{(integer) - A number of data instances between two component assimilation checks. A smaller component can be assimilated by a bigger component when the centroid of the smaller component get within bounds of the bigger components. With this parameter we define periodicity of such check. The smaller number, the slower SHC response.} +\item{cbVarianceLimit}{(real) - Minimal variance for component baseline snapshot. This parameter is closely related to the component baseline, which is related to the SHC drifting concept.} +\item{cbNLimit}{(integer) - Minimal population for component baseline snapshot. 
This parameter is closely related to the component baseline, which is related to the SHC drifting concept. Having \emph{cbVarianceLimit=0} and \emph{cbNLimit=0} means that drifting mechanism is switched off.} +\item{driftRemoveCompSizeRatio}{(real) - Used in the SHC drifting mechanism [1]. After drifting was detected, new components that represent drifting front smaller than driftRemoveCompSizeRatio*population(original component) are removed. This parameter allows some cleaning of the space when having ultra-fast drifting populations.} +\item{driftCheckingSizeRatio}{(real) - Used in the SHC drifting mechanism [1]. When the biggest drifting child components reach driftCheckingSizeRatio*population(component baseline), we start calculating drifting index, to assess whether drift and split occurred or not.} +\item{driftMovementMDThetaRatio}{(real) - Used in the SHC drifting mechanism [1]. Drifting index is calculated from the sub-clustered drifting components, i.e., drifting front. We calculate mean weighted distance between sub-clustered components and the baseline components. If drifting index is bigger than driftMovementMDThetaRatio*theta, we consider that the drifting front is advancing away from the original component baseline position. Or that drifting front is splitting into several sub-populations.} +\item{decaySpeed}{(integer) - Components decay speed. 0 = no decay, >0 = higher number represents slower decay.} +\item{sharedAgglomerationThreshold}{(integer) - A number of data instances between components that cause their agglomeration under the same cluster.} +\item{compFormingMinVVRatio}{(real) - A minimal variance that needs to be attained in order SHC can promote an outlier into a newly formed component. 
If a component reaches such variance it gets blocked, and all surrouding observations start forming new components.} +\item{recStats}{(logical) - A flag that indicated whether the SH clusterer should return statistics about the number of components and outliers generated during values processing.} +\item{sigmaIndex}{(logical) - A flag that indicates whether the SH clusterer should utilize the Sigma-index for speeing up the statistical space query.} +\item{sigmaIndexNeighborhood}{(integer) - A multiplier for the statistical neighborhood used to manage Sigma-index. This multiplier is used in conjunction with SH clusterer statistical thershold theta - hereby determined by the \emph{aggloType} parameter.} +\item{sigmaIndexPrecisionSwitch}{(logical) - A flag that indicates whether the Sigma-index should maintain precision over the speed.} +} +\section{Methods}{ +\describe{ +All methods here are detailed in the stream package. +\item{\code{get_stats(dsc,...)}}{ + Returns components and outliers statistics for the last data processing. Only when \emph{recStats=T}. +} +\item{\code{\link[stream]{get_microclusters}(dsc,...)}}{ + Returns a list of current component centers. +} +\item{\code{\link[stream]{get_microweights}(dsc,...)}}{ + Returns the list of current component weights, based on the component population divided by the total population. +} +\item{\code{\link[stream]{get_macroclusters}(dsc,...)}}{ + Returns a list of current cluster centers. +} +\item{\code{\link[stream]{get_macroweights}(dsc,...)}}{ + Returns a list of current cluster weights, based on the cluster population (sum of component populations) divided by the total population.. +} +\item{\code{\link[stream]{microToMacro}(dsc,micro=NULL,...)}}{ + Returns a mapping between components and clusters. 
+} +\item{\code{\link[stream]{get_assignment}(dsc, points, type=c("auto", "micro", "macro"), method=c("auto", "model", "nn"), ...)}}{ + Returns assignments of points to component and clusters, depending on the \emph{type} parameter value. Returns a data frame with assignment values, and possibly additional attributes that indicate outliers and their SHC internal identifiers. +} +\item{\code{get_outlier_positions(dsc, ...)}}{ + Returns a list of outliers and their positions. +} +\item{\code{recheck_outlier_positions(dsc, outlier_correlated_id, ...)}}{ + Performs re-checking of the previously encountered outlier. An SHC outlier identifier must be supplied. Function returns TRUE when the outlier still stands (or decayed), or FALSE if it was assimilated by some component in the meantime. +} +\item{\code{clearEigenMPSupport(dsc, ...)}}{ + Clears the OpenMP usage by the Eigen linear algebra package. Introduced only for the reproducibility purposes. +}}} +\description{ +The Statistical Hierachical Clusterer class. +} +\details{ +Instantiates an SHC \link[stream]{DSC} object that represents an extension of abstract classes in the stream package, namely \link[stream]{DSC_SinglePass}, \link[stream]{DSC_Outlier}, \link[stream]{DSC_Macro}, and \link[stream]{DSC_R}. This object can be used in all stream package methods. +} +\examples{ +d <- DSD_Gaussians(k=2,d=2,outliers=2,separation_type="Mahalanobis", + separation=4,space_limit=c(0,20),variance_limit=2, + outlier_options=list(outlier_virtual_variance=2,outlier_horizon=10000)) +c <- DSC_SHC.man(2,theta=3.2,virtualVariance=0.2,cbVarianceLimit=0,cbNLimit=0,decaySpeed=0) +update(c,d,n=10000) +reset_stream(d) +plot(c,d,n=10000) +} +\references{ +[1] D. Krleža, B. Vrdoljak, and M. Brčić, Statistical hierarchical clustering algorithm for outlier detection in evolving data streams, \emph{Machine Learning}, Sep. 
2020 +} \ No newline at end of file diff --git a/man/SHCAgglomerationType.Rd b/man/SHCAgglomerationType.Rd new file mode 100644 index 0000000..1418aa2 --- /dev/null +++ b/man/SHCAgglomerationType.Rd @@ -0,0 +1,35 @@ +\docType{class} +\name{SHCAgglomerationType} +\alias{SHCAgglomerationType} +\title{List of SHC Agglomeration Templates} +\description{ +Described in [1]. The following templates are available: + \itemize{ + \item \strong{NormalAgglomeration} - \code{theta = 3.2}, \code{virtualVariance = 1.0} + \item \strong{AggresiveAgglomeration} - \code{theta = 3.5}, \code{virtualVariance = 1.2} + \item \strong{RelaxedAgglomeration} - \code{theta = 2.9}, \code{virtualVariance = 0.8} + } +} +\author{ +Dalibor Krleža +} +\examples{ +set.seed(0) +d <- DSD_Gaussians(k = 3, d = 2, outliers = 5, separation_type = "Mahalanobis", + separation = 6, space_limit = c(0, 50), variance_limit = 2, + outlier_options = list(outlier_horizon = 10000,outlier_virtual_variance = 2)) +c <- DSC_SHC.behavioral(2, aggloType = SHCAgglomerationType$AggresiveAgglomeration, decaySpeed = 0, + sigmaIndex = TRUE) +evaluate(c, d, n = 10000, type = "macro", measure = c("crand", "outlierjaccard")) + +set.seed(0) +d <- DSD_Gaussians(k = 3, d = 2, outliers = 5, separation_type = "Mahalanobis", + separation = 6, space_limit = c(0, 50), variance_limit = 2, + outlier_options = list(outlier_horizon = 10000,outlier_virtual_variance = 2)) +c <- DSC_SHC.behavioral(2, aggloType = SHCAgglomerationType$RelaxedAgglomeration, decaySpeed = 0, + sigmaIndex = TRUE) +evaluate(c, d, n = 10000, type = "macro", measure = c("crand", "outlierjaccard")) +} +\references{ +[1] Krleža D, Vrdoljak B, and Brčić M, Statistical hierarchical clustering algorithm for outlier detection in evolving data streams, \emph{Machine Learning}, Sep. 
2020 +} diff --git a/man/SHCDriftType.Rd b/man/SHCDriftType.Rd new file mode 100644 index 0000000..359e3a8 --- /dev/null +++ b/man/SHCDriftType.Rd @@ -0,0 +1,20 @@ +\docType{class} +\name{SHCDriftType} +\alias{SHCDriftType} +\title{List of SHC Drift Templates} +\description{ +Described in [1]. The following templates are available: + \itemize{ + \item \strong{FastDrift} - \code{cbNLimit = 40}, \code{cbVarianceLimit = 6}, \code{driftCheckingSizeRatio = 1.0}, \code{driftMovementMDThetaRatio = 0.6} + \item \strong{SlowDrift} - \code{cbNLimit = 120}, \code{cbVarianceLimit = 9}, \code{driftCheckingSizeRatio = 2.0}, \code{driftMovementMDThetaRatio = 1.0} + \item \strong{NoDrift} - \code{cbNLimit = 0}, \code{cbVarianceLimit = 0} + \item \strong{UltraFastDrift} - \code{cbNLimit = 20}, \code{cbVarianceLimit = 6}, \code{driftCheckingSizeRatio = 0.5}, \code{driftMovementMDThetaRatio = 0.5} + \item \strong{NormalDrift} - \code{cbNLimit = 80}, \code{cbVarianceLimit = 8}, \code{driftCheckingSizeRatio = 1.3}, \code{driftMovementMDThetaRatio = 0.8} + } +} +\author{ +Dalibor Krleža +} +\references{ +[1] Krleža D, Vrdoljak B, and Brčić M, Statistical hierarchical clustering algorithm for outlier detection in evolving data streams, \emph{Machine Learning}, Sep. 2020 +} diff --git a/man/SHCEvalCallback.Rd b/man/SHCEvalCallback.Rd new file mode 100644 index 0000000..17e5337 --- /dev/null +++ b/man/SHCEvalCallback.Rd @@ -0,0 +1,43 @@ +\docType{class} +\name{SHCEvalCallback-class} +\alias{SHCEvalCallback} +\title{Statistical Hierarchical Clusterer Evaluation Callbacks} +\section{Fields}{ +\describe{ +\item{all_measures}{ + The following list of measures are available: + \itemize{ + \item \emph{queryTime} - Total query time for the last processing. + \item \emph{updateTime} - Total sigma-index update time. + \item \emph{processTime} - Total time needed for the last processing... measured inside SHC algorithm. 
+ \item \emph{nodeCount} - Total mahalanobis distances calculated for the last processing. + \item \emph{computationCostReduction} - Computational cost reduction when using sigma-index comparing to the sequential scan. + } +} +\item{internal_measures}{ +Same as \code{all_measures}. +} +\item{external_measures}{ +Empty. +} +\item{outlier_measures}{ +Empty. +} +}} +\description{ +Allows SHC evaluation callback for the \pkg{stream} package. This call contains measures mostly related to use of the sigma-index. +} +\author{ +Dalibor Krleža +} +\examples{ +set.seed(0) +d <- DSD_Gaussians(k = 3, d = 2, outliers = 5, separation_type = "Mahalanobis", + separation = 4, space_limit = c(0, 25), variance_limit = 1, + outlier_options = list(outlier_horizon = 10000)) +c <- DSC_SHC.man(2, 3.2, 0.3, cbNLimit = 0, cbVarianceLimit = 0,decaySpeed = 0, + sigmaIndex = TRUE) +evaluate_with_callbacks(c, d, n = 10000, type = "macro", + measure = c("crand", "outlierjaccard", "computationCostReduction"), + callbacks = list(shc = SHCEvalCallback())) +} diff --git a/man/stream.SHC.behavioral.Rd b/man/stream.SHC.behavioral.Rd new file mode 100644 index 0000000..4f56b86 --- /dev/null +++ b/man/stream.SHC.behavioral.Rd @@ -0,0 +1,72 @@ +\docType{class} +\name{stream.SHC-class} +\alias{stream.SHC} +\alias{initialize,stream.SHC-method} +\title{Statistical Hierarchical Clusterer - reference class, behavioral construction} +\usage{ +\S4method{initialize}{stream.SHC}(dimensions, + aggloType=SHCAgglomerationType$NormalAgglomeration,driftType=SHCDriftType$NormalDrift, + decaySpeed=10,sharedAgglomerationThreshold=1,recStats=FALSE,sigmaIndex=FALSE, + sigmaIndexNeighborhood=3,sigmaIndexPrecisionSwitch=TRUE) +} +\arguments{ +\item{dimensions}{(integer) - A number of space dimensions.} +\item{aggloType}{(list, SHCAgglomerationType) - Agglomeration type: \emph{NormalAgglomeration,AggresiveAgglomeration,RelaxedAgglomeration}.} +\item{decaySpeed}{(integer) - Components decay speed. 
0 = no decay, >0 = higher number represents slower decay.} +\item{driftType}{(list, SHCDriftType) - Drift type: \emph{NormalDrift,FastDrift,SlowDrift,NoDrift,UltraFastDrift}.} +\item{sharedAgglomerationThreshold}{(integer) - A number of data instances between components that cause their agglomeration under the same cluster.} +\item{recStats}{(logical) - A flag that indicated whether the SH clusterer should return statistics about the number of components and outliers generated during values processing.} +\item{sigmaIndex}{(logical) - A flag that indicates whether the SH clusterer should utilize the Sigma-index for speeing up the statistical space query.} +\item{sigmaIndexNeighborhood}{(integer) - A multiplier for the statistical neighborhood used to manage Sigma-index. This multiplier is used in conjunction with SH clusterer statistical thershold theta - hereby determined by the \emph{aggloType} parameter.} +\item{sigmaIndexPrecisionSwitch}{(logical) - A flag that indicates whether the Sigma-index should maintain high precision over a speed.} +} +\section{Methods}{ +\describe{ +\item{process(data)}{ + Initiates clustering for a suppled data frame (dataset). Returned data frame comprises macro and micro-level details, as well as outlier flags. +} +\item{getComponentAndOutlierStatistics()}{ + Returns the number of current components and outliers. +} +\item{getTimes()}{ + Returns the list of processing times during last clustering. +} +\item{getNodeCounter()}{ + Returns the number of statistical distance calculations when querying for statistical classification. Meaningful when utilizing sigma-index, to compare statistical neighborhood density for the processed dataset. +} +\item{getComputationCostReduction()}{ +Can be used only with sigma-index. Returns the ratio of the number of statistical distance calculations and the maximal number of statistical distance calculations for the \emph{sequential scan} approach. 
The returned number tells how many statistical calculations were saved by utilizing sigma-index for the processed dataset. +} +\item{getHistogram()}{ +Returns computation cost reduction histogram when utilizing sigma-index. +} +\item{recheckOutlier(id, ...)}{ +Can be used to re-check the outlier status for the supplied id. If the supplied id still represents an outlier, this method will return \emph{true}. +} +\item{getTrace(id)}{ +Returns a list of current component identifiers that have trace back to the supplied component id. This method is used when we want to know new components which are successors of the predecessor component that once had the supplied identifier. This method establishes direct temporal connection between the supplied component id and that returned list of current components. +} +\item{clearEigenMPSupport()}{ + Clears the OpenMP usage by the Eigen linear algebra package. Introduced only for the reproducibility purposes. +} +}} +\description{ +The Statistical Hierarchical Clusterer reference class. +} +\details{ +Instantiates an SHC object that represents an instance of the stream.SHC reference class. +} +\examples{ +s <- stream.SHC(2,driftType=SHCDriftType$NoDrift,decaySpeed=0,sigmaIndex=TRUE) +res <- s$process(data.frame(X=c(1,2,3,34,5,3,2,2,3,34,150),Y=c(3,4,2,1,6,7,4,5,6,3,150))) +res +s$getComponentAndOutlierStatistics() +s$recheckOutlier(res[11,"component_id"]) +orig_id <- res[3,"component_id"] +trace_id <- s$getTrace(orig_id) +message(paste("Original id",orig_id,"traced to",trace_id)) +s$getHistogram() +} +\references{ +[1] Krleža D, Vrdoljak B, and Brčić M, Statistical hierarchical clustering algorithm for outlier detection in evolving data streams, \emph{Machine Learning}, Sep. 
2020 +} diff --git a/man/stream.SHC.clone.Rd b/man/stream.SHC.clone.Rd new file mode 100644 index 0000000..c6e55ec --- /dev/null +++ b/man/stream.SHC.clone.Rd @@ -0,0 +1,30 @@ +\docType{class} +\name{stream.SHC.clone-class} +\alias{stream.SHC.clone} +\alias{initialize,stream.SHC.clone-method} +\title{Statistical Hierarchical Clusterer - reference class, cloning construction} +\usage{ +\S4method{initialize}{stream.SHC.clone}(old_shc) +} +\arguments{ +\item{old_shc}{(SHC object) - An SHC object instantiated from the SHC reference classes (both \link{stream.SHC} and \link{stream.SHC.man}).} +} +\value{ +Returns the cloned SHC object. +} +\description{ +The Statistical Hierachical Clusterer reference class cloner. +} +\details{ +Performs cloning of the existing SHC object, instantiated from the SHC reference class. +} +\examples{ +s <- stream.SHC(2,driftType=SHCDriftType$NoDrift,decaySpeed=0,sigmaIndex=TRUE) +res <- s$process(data.frame(X=c(1,2,3,34,5,3,2,2,3,34,150),Y=c(3,4,2,1,6,7,4,5,6,3,150))) +s$getHistogram() +s_cloned <- stream.SHC.clone(s) +s_cloned$getHistogram() +} +\references{ +[1] Krleža D, Vrdoljak B, and Brčić M, Statistical hierarchical clustering algorithm for outlier detection in evolving data streams, \emph{Machine Learning}, Sep. 
2020 +} diff --git a/man/stream.SHC.man.Rd b/man/stream.SHC.man.Rd new file mode 100644 index 0000000..aedabd3 --- /dev/null +++ b/man/stream.SHC.man.Rd @@ -0,0 +1,86 @@ +\docType{class} +\name{stream.SHC.man-class} +\alias{stream.SHC.man} +\alias{initialize,stream.SHC.man-method} +\title{Statistical Hierarchical Clusterer - reference class, manual construction} +\usage{ +\S4method{initialize}{stream.SHC.man}(dimensions,theta,virtualVariance,parallelize=FALSE, +performSharedAgglomeration=TRUE,compAssimilationCheckCounter=50, +cbVarianceLimit=as.double(10.0),cbNLimit=as.integer(40), +driftRemoveCompSizeRatio=as.double(0.3),driftCheckingSizeRatio=as.double(1.3), +driftMovementMDThetaRatio=as.double(0.8),decaySpeed=as.integer(10), +sharedAgglomerationThreshold=as.integer(1),compFormingMinVVRatio=as.double(0.2), +compBlockingLimitVVRatio=as.double(0.0),recStats=FALSE,sigmaIndex=FALSE, +sigmaIndexNeighborhood=3,sigmaIndexPrecisionSwitch=TRUE) +} +\arguments{ +\item{dimensions}{(integer) - A number of space dimensions.} +\item{theta}{(real) - A statistical distance threshold for component bounds.} +\item{virtualVariance}{(real) - A variance used to construct virtual covariance matrix surrounding outliers.} +\item{parallelize}{(logical) - Not implemented yet. Leave at FALSE.} +\item{performSharedAgglomeration}{(logical) - Depreciated. Use \emph{sharedAgglomerationThreshold} instead.} +\item{compAssimilationCheckCounter}{(integer) - A number of data instances between two component assimilation checks. A smaller component can be assimilated by a bigger component when the centroid of the smaller component get within bounds of the bigger components. With this parameter we define periodicity of such check. The smaller number, the slower SHC response.} +\item{cbVarianceLimit}{(real) - Minimal variance for component baseline snapshot. 
This parameter is closely related to the component baseline, which is related to the SHC drifting concept.} +\item{cbNLimit}{(integer) - Minimal population for component baseline snapshot. This parameter is closely related to the component baseline, which is related to the SHC drifting concept. Having \emph{cbVarianceLimit=0} and \emph{cbNLimit=0} means that driting mechanism is switched off.} +\item{driftRemoveCompSizeRatio}{(real) - Used in the SHC drifting mechanism [1]. After drifting was detected, new components that represent drifting front smaller than driftRemoveCompSizeRatio*population(original component) are removed. This parameter allows some cleaning of the space when having ultra-fast drifting populations.} +\item{driftCheckingSizeRatio}{(real) - Used in the SHC drifting mechanism [1]. When the biggest drifting child components reach driftCheckingSizeRatio*population(component baseline), we start calculating drifting index, to assess whether drift and split occured or not.} +\item{driftMovementMDThetaRatio}{(real) - Used in the SHC drifting mechanism [1]. Drifting index is calculated from the sub-clustered drifting components, i.e., dritfing front. We calculate mean weighted distance between sub-clustered components and the baseline components. If drifting index is bigger than driftMovementMDThetaRatio*theta, we consider that the drifting front is advancing away from the original component baseline position. Or that drifting front is splitting into several sub-populations.} +\item{decaySpeed}{(integer) - Components decay speed. 0 = no decay, >0 = higher number represents slower decay.} +\item{sharedAgglomerationThreshold}{(integer) - A number of data instances between components that cause their agglomeration under the same cluster.} +\item{compFormingMinVVRatio}{(real) - A minimal variance that needs to be attained in order SHC can promote an outlier into a newly formed components. 
This is used to prevent multiple observations having same values from forming a small component.} +\item{compBlockingLimitVVRatio}{(real) - A maximal variance for components. If a component reaches such variance it gets blocked, and all surrouding observations start forming new components.} +\item{recStats}{(logical) - A flag that indicated whether the SH clusterer should return statistics about the number of components and outliers generated during values processing.} +\item{sigmaIndex}{(logical) - A flag that indicates whether the SH clusterer should utilize the Sigma-index for speeing up the statistical space query.} +\item{sigmaIndexNeighborhood}{(integer) - A multiplier for the statistical neighborhood used to manage Sigma-index. This multiplier is used in conjunction with SH clusterer statistical thershold theta - hereby determined by the \emph{theta} parameter.} +\item{sigmaIndexPrecisionSwitch}{(logical) - A flag that indicates whether the Sigma-index should maintain precision over the speed.} +} +\section{Methods}{ +\describe{ +\item{process(data)}{ + Initiates clustering for a suppled data frame (dataset). Returned data frame comprises macro and micro-level details, as well as outlier flags. +} +\item{getComponentAndOutlierStatistics()}{ + Returns the number of current components and outliers. +} +\item{getTimes()}{ + Returns the list of processing times during last clustering. +} +\item{getNodeCounter()}{ + Returns the number of statistical distance calculations when querying for statistical classification. Meaningful when utilizing sigma-index, to compare statistical neighborhood density for the processed dataset. +} +\item{getComputationCostReduction()}{ +Can be used only with sigma-index. Returns the ratio of the number of statistical distance calculations and the maximal number of statistical distance calculations for the \emph{sequential scan} approach. 
The returned number tells how much statistical calculations was saved by utilizing simga-index for the processed dataset. +} +\item{getHistogram()}{ +Returns computation cost reduction histogram when utilizing sigma-index. +} +\item{recheckOutlier(id, ...)}{ +Can be used to re-check the outlier status for the supplied id. If the supplied id still represents an outlier, this method will return \emph{true}. +} +\item{getTrace(id)}{ +Returns a list of current component identifiers that have trace back to the supplied component id. This method is used when we want to know new components to which are successors of the predecessor component that once had the supplied identifier. This method establishes direct temporal connection between the supplied component id and that returned list of current components. +} +\item{clearEigenMPSupport()}{ + Clears the OpenMP usage by the Eigen linear algebra package. Introduced only for the reproducibility purposes. +} +}} +\description{ +The Statistical Hierachical Clusterer reference class. +} +\details{ +Instantiates an SHC object that represents an instance of the stream.SHC.man reference class. +} +\examples{ +s <- stream.SHC.man(2,3.2,1.0,decaySpeed=0,cbNLimit=0,cbVarianceLimit=0,sigmaIndex=TRUE) +res <- s$process(data.frame(X=c(1,2,3,34,5,3,2,2,3,34,150),Y=c(3,4,2,1,6,7,4,5,6,3,150))) +res +s$getComponentAndOutlierStatistics() +s$recheckOutlier(res[11,"component_id"]) +orig_id <- res[3,"component_id"] +trace_id <- s$getTrace(orig_id) +message(paste("Original id",orig_id,"traced to",trace_id)) +s$getHistogram() +} +\references{ +[1] Krleža D, Vrdoljak B, and Brčić M, Statistical hierarchical clustering algorithm for outlier detection in evolving data streams, \emph{Machine Learning}, Sep. 
2020 +} diff --git a/src/Makevars b/src/Makevars index fe8152a..51912e4 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,6 +1,8 @@ CXX_STD = CXX11 - -OBJECTS = evoStream.o BIRCH.o BICO.o DBSTREAM.o DStream.o init.o kmeansw.o NumericVector.o RcppExports.o BICO/l2metric.o BICO/point.o BICO/pointcentroid.o BICO/realspaceprovider.o BICO/squaredl2metric.o BIRCH/CFLeafNode.o BIRCH/CFNode.o BIRCH/CFNonLeafNode.o BIRCH/CFTree.o BIRCH/ClusteringFeature.o BIRCH/Util.o +PKG_CXXFLAGS = -I./SHC +OBJECTS.SHC = SHC/SHC.o SHC/SHC_Component.o SHC/SHC_ComponentConnection.o SHC/SHC_Container.o SHC/SHC_Decay.o SHC/SHC_Utils.o SHC/SigmaIndex.o SHC/SigmaIndexProxy.o SHC/SHC_DeltaLogger.o +OBJECTS.SHC.root = SHC_R.o SHC_Module.o Utils.o +OBJECTS = evoStream.o BIRCH.o BICO.o DBSTREAM.o DStream.o init.o kmeansw.o NumericVector.o RcppExports.o BICO/l2metric.o BICO/point.o BICO/pointcentroid.o BICO/realspaceprovider.o BICO/squaredl2metric.o BIRCH/CFLeafNode.o BIRCH/CFNode.o BIRCH/CFNonLeafNode.o BIRCH/CFTree.o BIRCH/ClusteringFeature.o BIRCH/Util.o $(OBJECTS.SHC) $(OBJECTS.SHC.root) all: $(SHLIB) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index d74cf22..4b4166a 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -1,15 +1,82 @@ // Generated by using Rcpp::compileAttributes() -> do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +#include #include using namespace Rcpp; -#ifdef RCPP_USE_GLOBAL_ROSTREAM -Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); -Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); -#endif - +// shc_MDSeparation +double shc_MDSeparation(NumericVector mean1, NumericMatrix covariance1, NumericVector virtualVariance1, int N1, bool isInversion1, double th1, NumericVector mean2, NumericMatrix covariance2, NumericVector virtualVariance2, int N2, bool isInversion2, double th2); +RcppExport SEXP _stream_shc_MDSeparation(SEXP mean1SEXP, SEXP covariance1SEXP, SEXP virtualVariance1SEXP, SEXP N1SEXP, SEXP isInversion1SEXP, SEXP 
th1SEXP, SEXP mean2SEXP, SEXP covariance2SEXP, SEXP virtualVariance2SEXP, SEXP N2SEXP, SEXP isInversion2SEXP, SEXP th2SEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< NumericVector >::type mean1(mean1SEXP); + Rcpp::traits::input_parameter< NumericMatrix >::type covariance1(covariance1SEXP); + Rcpp::traits::input_parameter< NumericVector >::type virtualVariance1(virtualVariance1SEXP); + Rcpp::traits::input_parameter< int >::type N1(N1SEXP); + Rcpp::traits::input_parameter< bool >::type isInversion1(isInversion1SEXP); + Rcpp::traits::input_parameter< double >::type th1(th1SEXP); + Rcpp::traits::input_parameter< NumericVector >::type mean2(mean2SEXP); + Rcpp::traits::input_parameter< NumericMatrix >::type covariance2(covariance2SEXP); + Rcpp::traits::input_parameter< NumericVector >::type virtualVariance2(virtualVariance2SEXP); + Rcpp::traits::input_parameter< int >::type N2(N2SEXP); + Rcpp::traits::input_parameter< bool >::type isInversion2(isInversion2SEXP); + Rcpp::traits::input_parameter< double >::type th2(th2SEXP); + rcpp_result_gen = Rcpp::wrap(shc_MDSeparation(mean1, covariance1, virtualVariance1, N1, isInversion1, th1, mean2, covariance2, virtualVariance2, N2, isInversion2, th2)); + return rcpp_result_gen; +END_RCPP +} +// shc_MutualMinMahalanobis +double shc_MutualMinMahalanobis(NumericVector mean1, NumericMatrix covariance1, NumericVector virtualVariance1, int N1, bool isInversion1, NumericVector mean2, NumericMatrix covariance2, NumericVector virtualVariance2, int N2, bool isInversion2); +RcppExport SEXP _stream_shc_MutualMinMahalanobis(SEXP mean1SEXP, SEXP covariance1SEXP, SEXP virtualVariance1SEXP, SEXP N1SEXP, SEXP isInversion1SEXP, SEXP mean2SEXP, SEXP covariance2SEXP, SEXP virtualVariance2SEXP, SEXP N2SEXP, SEXP isInversion2SEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< NumericVector >::type 
mean1(mean1SEXP); + Rcpp::traits::input_parameter< NumericMatrix >::type covariance1(covariance1SEXP); + Rcpp::traits::input_parameter< NumericVector >::type virtualVariance1(virtualVariance1SEXP); + Rcpp::traits::input_parameter< int >::type N1(N1SEXP); + Rcpp::traits::input_parameter< bool >::type isInversion1(isInversion1SEXP); + Rcpp::traits::input_parameter< NumericVector >::type mean2(mean2SEXP); + Rcpp::traits::input_parameter< NumericMatrix >::type covariance2(covariance2SEXP); + Rcpp::traits::input_parameter< NumericVector >::type virtualVariance2(virtualVariance2SEXP); + Rcpp::traits::input_parameter< int >::type N2(N2SEXP); + Rcpp::traits::input_parameter< bool >::type isInversion2(isInversion2SEXP); + rcpp_result_gen = Rcpp::wrap(shc_MutualMinMahalanobis(mean1, covariance1, virtualVariance1, N1, isInversion1, mean2, covariance2, virtualVariance2, N2, isInversion2)); + return rcpp_result_gen; +END_RCPP +} +// shc_CalculateNewMean +NumericVector shc_CalculateNewMean(NumericVector mean, NumericVector newElement, int N); +RcppExport SEXP _stream_shc_CalculateNewMean(SEXP meanSEXP, SEXP newElementSEXP, SEXP NSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< NumericVector >::type mean(meanSEXP); + Rcpp::traits::input_parameter< NumericVector >::type newElement(newElementSEXP); + Rcpp::traits::input_parameter< int >::type N(NSEXP); + rcpp_result_gen = Rcpp::wrap(shc_CalculateNewMean(mean, newElement, N)); + return rcpp_result_gen; +END_RCPP +} +// shc_CalculateCovariance +NumericMatrix shc_CalculateCovariance(NumericVector oldMean, NumericVector newMean, NumericMatrix oldCovariance, int N, NumericVector newElement, bool isInversion); +RcppExport SEXP _stream_shc_CalculateCovariance(SEXP oldMeanSEXP, SEXP newMeanSEXP, SEXP oldCovarianceSEXP, SEXP NSEXP, SEXP newElementSEXP, SEXP isInversionSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + 
Rcpp::traits::input_parameter< NumericVector >::type oldMean(oldMeanSEXP); + Rcpp::traits::input_parameter< NumericVector >::type newMean(newMeanSEXP); + Rcpp::traits::input_parameter< NumericMatrix >::type oldCovariance(oldCovarianceSEXP); + Rcpp::traits::input_parameter< int >::type N(NSEXP); + Rcpp::traits::input_parameter< NumericVector >::type newElement(newElementSEXP); + Rcpp::traits::input_parameter< bool >::type isInversion(isInversionSEXP); + rcpp_result_gen = Rcpp::wrap(shc_CalculateCovariance(oldMean, newMean, oldCovariance, N, newElement, isInversion)); + return rcpp_result_gen; +END_RCPP +} // kmnsw Rcpp::List kmnsw(Rcpp::NumericMatrix a_R, Rcpp::NumericMatrix c_R, Rcpp::NumericVector wh_R, int k, int iter); RcppExport SEXP _stream_kmnsw(SEXP a_RSEXP, SEXP c_RSEXP, SEXP wh_RSEXP, SEXP kSEXP, SEXP iterSEXP) { @@ -30,14 +97,20 @@ RcppExport SEXP _rcpp_module_boot_MOD_BICO(); RcppExport SEXP _rcpp_module_boot_MOD_BIRCH(); RcppExport SEXP _rcpp_module_boot_MOD_DBSTREAM(); RcppExport SEXP _rcpp_module_boot_MOD_DStream(); +RcppExport SEXP _rcpp_module_boot_SHCModule(); RcppExport SEXP _rcpp_module_boot_MOD_evoStream(); static const R_CallMethodDef CallEntries[] = { + {"_stream_shc_MDSeparation", (DL_FUNC) &_stream_shc_MDSeparation, 12}, + {"_stream_shc_MutualMinMahalanobis", (DL_FUNC) &_stream_shc_MutualMinMahalanobis, 10}, + {"_stream_shc_CalculateNewMean", (DL_FUNC) &_stream_shc_CalculateNewMean, 3}, + {"_stream_shc_CalculateCovariance", (DL_FUNC) &_stream_shc_CalculateCovariance, 6}, {"_stream_kmnsw", (DL_FUNC) &_stream_kmnsw, 5}, {"_rcpp_module_boot_MOD_BICO", (DL_FUNC) &_rcpp_module_boot_MOD_BICO, 0}, {"_rcpp_module_boot_MOD_BIRCH", (DL_FUNC) &_rcpp_module_boot_MOD_BIRCH, 0}, {"_rcpp_module_boot_MOD_DBSTREAM", (DL_FUNC) &_rcpp_module_boot_MOD_DBSTREAM, 0}, {"_rcpp_module_boot_MOD_DStream", (DL_FUNC) &_rcpp_module_boot_MOD_DStream, 0}, + {"_rcpp_module_boot_SHCModule", (DL_FUNC) &_rcpp_module_boot_SHCModule, 0}, {"_rcpp_module_boot_MOD_evoStream", 
(DL_FUNC) &_rcpp_module_boot_MOD_evoStream, 0}, {NULL, NULL, 0} }; diff --git a/src/SHC/.DS_Store b/src/SHC/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/src/SHC/.DS_Store differ diff --git a/src/SHC/SHC b/src/SHC/SHC new file mode 100644 index 0000000..26f1e57 --- /dev/null +++ b/src/SHC/SHC @@ -0,0 +1 @@ +#include "SHC.hpp" diff --git a/src/SHC/SHC.cpp b/src/SHC/SHC.cpp new file mode 100644 index 0000000..e1274b3 --- /dev/null +++ b/src/SHC/SHC.cpp @@ -0,0 +1,1208 @@ +#include +#include +#include "SHC.hpp" +#include "SHC_Component.hpp" +#include "SHC_Container.hpp" +#include +#include +#include +#include +#include + +using namespace std; + +SHC_Exception::SHC_Exception(const char *msg) { + this->msg=msg; +} +const char *SHC_Exception::what() const throw() { + return this->msg; +} + +SHC::SHC(double theta, bool parallelize, bool performSharedAgglomeration, VectorXd *virtualVariance, int agglo_count, double cbVarianceLimit, + long cbNLimit, float driftRemoveCompSizeRatio, float driftCheckingSizeRatio, float driftMovementMDThetaRatio, int decayPace, + float componentFormingMinVVRatio, float componentBlockingLimitVVRatio, bool cache) { + this->parallelize=parallelize; + this->performSharedAgglomeration=performSharedAgglomeration; + this->decay_pace=decayPace; + this->theta=theta; + this->virtualVariance=new VectorXd(*virtualVariance); + this->agglo_count=agglo_count; + this->agglo_counter=agglo_count; + this->cbVarianceLimit=cbVarianceLimit; + this->cbNLimit=cbNLimit; + this->containers=new SHC_Containable_Set(); + this->driftRemoveCompSizeRatio=driftRemoveCompSizeRatio; + this->driftCheckingSizeRatio=driftCheckingSizeRatio; + this->driftMovementMDThetaRatio=driftMovementMDThetaRatio; + this->componentFormingMinVVRatio=componentFormingMinVVRatio; + this->componentBlockingLimitVVRatio=componentBlockingLimitVVRatio; + this->cache=cache; +} + +SHC::SHC(int dimensions,AgglomerationType aggloType,DriftType driftType,int decayPace,bool 
cache,bool parallelize) { + if(dimensions<1) throw SHC_Exception("Behavioral SHC must have supplied dimensions>0"); + this->parallelize=parallelize; + this->performSharedAgglomeration=true; + this->decay_pace=decayPace; + this->componentFormingMinVVRatio=0.2; + this->componentBlockingLimitVVRatio=0.0; + this->cache=cache; + switch (aggloType) { + case RelaxedAgglomeration: + this->theta=2.9; + this->virtualVariance=new VectorXd(0.8*VectorXd::Ones(dimensions)); + this->agglo_count=50; + this->driftRemoveCompSizeRatio=0.3; + break; + case AggresiveAgglomeration: + this->theta=3.5; + this->virtualVariance=new VectorXd(1.2*VectorXd::Ones(dimensions)); + this->agglo_count=1; + this->driftRemoveCompSizeRatio=0.05; + break; + default: + this->theta=3.2; + this->virtualVariance=new VectorXd(1.0*VectorXd::Ones(dimensions)); + this->agglo_count=25; + this->driftRemoveCompSizeRatio=0.1; + break; + } + switch (driftType) { + case FastDrift: + this->cbNLimit=40; + this->cbVarianceLimit=6; + this->driftCheckingSizeRatio=1.0; + this->driftMovementMDThetaRatio=0.6; + break; + case SlowDrift: + this->cbNLimit=120; + this->cbVarianceLimit=9; + this->driftCheckingSizeRatio=2.0; + this->driftMovementMDThetaRatio=1.0; + break; + case NoDrift: + this->cbNLimit=0; + this->cbVarianceLimit=0; + break; + case UltraFastDrift: + this->cbNLimit=20; + this->cbVarianceLimit=6; + this->driftCheckingSizeRatio=0.5; + this->driftMovementMDThetaRatio=0.5; + this->driftRemoveCompSizeRatio=0.05; + this->agglo_count=1; + break; + default: + this->cbNLimit=80; + this->cbVarianceLimit=8; + this->driftCheckingSizeRatio=1.3; + this->driftMovementMDThetaRatio=0.8; + break; + } + this->containers=new SHC_Containable_Set(); + this->agglo_counter=this->agglo_count; +} + +void SHC::t1(SHC *shc, SHC_Component *comp, VectorXd *newElement, vector> *classified_map, vector> *neighborhood_map, + vector> *obsolete_map, double theta) { + + std::chrono::time_point qst=std::chrono::system_clock::now(); + double 
md=comp->mahalanobisDistance(newElement); + std::chrono::time_point qet=std::chrono::system_clock::now(); + shc->qTime+=std::chrono::duration_cast(qet-qst).count(); + shc->nodeCounter++; + shc->m1.lock(); + if(comp->isObsolete() && md<=theta) obsolete_map->push_back(pair(comp,md)); + else if(!comp->isObsolete() && md<=theta) classified_map->push_back(pair(comp,md)); + else if(!comp->isObsolete() && md>theta && md<=(3*theta)) neighborhood_map->push_back(pair(comp,md)); + shc->m1.unlock(); +} + +double SHC::getTheta() { + return theta; +} + +VectorXd *SHC::getVirtualVariance() { + return virtualVariance; +} + +_int_cr SHC::classify_p1(bool classifyOnly,vector> *obsolete_map,vector> *classified_map, + vector> *neighborhood_map/*,vector *workers*/) { + /*if(parallelize && workers!=NULL) + for(thread *t:*workers) + t->join(); + for(thread *t:*workers) delete t; + delete workers;*/ + sort(classified_map->begin(),classified_map->end(),[=](pair& a,pair& b) { + return a.secondbegin(),neighborhood_map->end(),[=](pair& a,pair& b) { + return a.secondsize()>0) { + int offset=1; + while((classified_map->begin()+offset)!=classified_map->end()) { + pair removed=agglomerate((classified_map->begin())->first,(classified_map->begin()+offset)->first); + if(removed.second) { + classified_map->erase(classified_map->begin()+offset); + offset=1; + } else if(removed.first) { + classified_map->erase(classified_map->begin()); + offset=1; + } else offset++; + } + } +// string *agglo_comp=NULL,*min_comp=NULL; +/* if(!classifyOnly) { + if(res_map->size()>1) { + pair removed=agglomerate(res_map->at(0).first,res_map->at(1).first); + if(removed.second) { + res_map->erase(res_map->begin()+1); + } else if(removed.first) { + res_map->erase(res_map->begin()); + } else agglo_comp=new string(res_map->at(1).first); + } + if(res_map->size()>0) for(auto obs_comp_id:*obs_map) agglomerate(res_map->at(0).first, obs_comp_id.first); + }*/ + // we changed it slightly + for(auto c_comp_ip:*classified_map) + 
for(auto obs_comp_id:*obsolete_map) + agglomerate(obs_comp_id.first,c_comp_ip.first); + SHC_Component *min_comp=NULL; + double min_md=-1; + for(auto c_comp_ip:*classified_map) + if(min_comp==NULL && !c_comp_ip.first->isBlocked()) { + min_comp=c_comp_ip.first; + min_md=c_comp_ip.second; + break; + } + if(min_comp==NULL && classified_map->size()>0) { + min_comp=classified_map->at(0).first; + min_md=classified_map->at(0).second; + } + delete obsolete_map; + return _int_cr{min_comp,neighborhood_map,classified_map,min_md}; +} + +_int_cr SHC::classify(Eigen::VectorXd *newElement, bool classifyOnly, set *excludeComponents) { + vector> *classified_map=new vector>(), + *obsolete_map=new vector>(), + *neighborhood_map=new vector>(); + + for(unordered_map::iterator it=components.begin();it!=components.end();it++) { + if(excludeComponents==NULL || excludeComponents->find(it->second)==excludeComponents->end()) { + SHC::t1(this,it->second,newElement,classified_map,neighborhood_map,obsolete_map,theta); + } + } + + return classify_p1(classifyOnly,obsolete_map,classified_map,neighborhood_map/*,workers*/); +} + +_int_cr SHC::classifySigmaIndex(Eigen::VectorXd *newElement, bool classifyOnly, set *excludeComponents, SHC_Component *starting) { + vector> *classified_map1=new vector>(), + *obsolete_map=new vector>(), + *neighborhood_map=NULL; + if(sigma_index!=NULL) { + SigmaIndexQueryResults *sigres=sigma_index->query(newElement,excludeComponents,starting); + + neighborhood_map=sigres->neighborhood; + for(pair it:*sigres->classified) + if(it.first->isObsolete()) obsolete_map->push_back(it); + else classified_map1->push_back(it); + if(!classifyOnly && classified_map1->size()>0) { + int offset=1; + while((classified_map1->begin()+offset)!=classified_map1->end()) { + pair removed=agglomerate((classified_map1->begin())->first,(classified_map1->begin()+offset)->first); + if(removed.second) { + classified_map1->erase(classified_map1->begin()+offset); + offset=1; + } else if(removed.first) { + 
classified_map1->erase(classified_map1->begin()); + offset=1; + } else offset++; + } + } + for(auto c_comp_ip:*classified_map1) + for(auto obs_comp_id:*obsolete_map) + agglomerate(obs_comp_id.first,c_comp_ip.first); + SHC_Component *min_comp=NULL; + double min_md=-1; + for(auto c_comp_ip:*classified_map1) + if(min_comp==NULL && !c_comp_ip.first->isBlocked()) { + min_comp=c_comp_ip.first; + min_md=c_comp_ip.second; + break; + } + if(min_comp==NULL && classified_map1->size()>0) { + min_comp=classified_map1->at(0).first; + min_md=classified_map1->at(0).second; + } + delete obsolete_map;delete sigres->classified;delete sigres; + return _int_cr{min_comp,neighborhood_map,classified_map1,min_md}; + } else throw SHC_Exception("Sigma index is not constructed, yet SHC want to use it :-S"); +} + +_int_cr SHC::classify(Eigen::VectorXd *newElement, SHC_Component *parentComponent, bool classifyOnly) { + vector> *classified_map=new vector>(), + *obsolete_map=new vector>(), + *neighborhood_map=new vector>(); + + unordered_map ccomps=parentComponent->fetchChildComponents(); + for(unordered_map::iterator it=ccomps.begin();it!=ccomps.end();it++) { + if(typeid(it->second)==typeid(SHC_Component)) { + SHC_Component *comp=static_cast(it->second); + SHC::t1(this,comp,newElement,classified_map,neighborhood_map,obsolete_map,theta); + } + } + + return classify_p1(classifyOnly,obsolete_map,classified_map,neighborhood_map/*,workers*/); +} + +_int_cr SHC::classify(Eigen::VectorXd *newElement, vector *forComponents, bool classifyOnly) { + vector> *classified_map=new vector>(), + *obsolete_map=new vector>(), + *neighborhood_map=new vector>(); + + for(SHC_Component *comp:*forComponents) { + SHC::t1(this,comp,newElement,classified_map,neighborhood_map,obsolete_map,theta); + } + + return classify_p1(classifyOnly,obsolete_map,classified_map,neighborhood_map/*,workers*/); +} + +shared_ptr SHC::process(VectorXd *newElement, bool classifyOnly) { + // Decay procedure... 
we remove at the beginning to reduce query + vector decay_rem; + for(auto comp_it:components) { + if(!comp_it.second->isObsolete() && comp_it.second->decayPing()) + decay_rem.push_back(comp_it.second->getId()); + } + for(string rcompId:decay_rem) { + if(components.find(rcompId)!=components.end()) { + SHC_Component *rcomp=components[rcompId]; + if(rcomp->isOutlier()) { + decayedOutliers.insert(rcomp->getId()); + removeComponent(rcomp); + } else processObsoleteComponent(rcomp); + } + } + // Query + _int_cr class_res=sigma_index==NULL ? classify(newElement, classifyOnly) : classifySigmaIndex(newElement, classifyOnly); + + shared_ptr res=make_shared(); + SHC_Component *comp=NULL; + if(class_res.comp) { + comp=class_res.comp; + bool added=false; + if(!classifyOnly) { + added=comp->addNewElement(newElement,this,class_res.classified_map); + if(sigma_index) { + std::chrono::time_point ust=std::chrono::system_clock::now(); + sigma_index->update(comp->getFinalRedirectedComponent(),class_res.getCombinedNeighborhood()); + std::chrono::time_point uet=std::chrono::system_clock::now(); + uTime+=std::chrono::duration_cast(uet-ust).count(); + } + } + res->component_id=new string(comp->getId()); + res->cluster_id=comp->getParent()!=NULL ? 
new string(comp->getParent()->getId()) : NULL; + } else { + if(!classifyOnly) { + comp=new SHC_Component(containers,newElement,virtualVariance,cbVarianceLimit,cbNLimit,driftCheckingSizeRatio, + driftMovementMDThetaRatio,decay_pace,componentFormingMinVVRatio,componentBlockingLimitVVRatio); + comp->switchDecay(decay,decay_pace); + comp->setSourceNode(deltaLoggingSourceName); + if(eventCallback) comp->setEventCallback(eventCallback); + SHC_Containable *cluster=new SHC_Containable(containers, comp); + components[comp->getId()]=comp; + if(delta) delta->addComponentDeltaElements(comp,deltaLoggingSourceName); + if(sigma_index) { + std::chrono::time_point ust=std::chrono::system_clock::now(); + sigma_index->create(comp); + std::chrono::time_point uet=std::chrono::system_clock::now(); + uTime+=std::chrono::duration_cast(uet-ust).count(); + } + res->component_id=new string(comp->getId()); + res->cluster_id=new string(cluster->getId()); + res->outlier=true; + } else res->outlier=true; + } + if(!classifyOnly) { + for(pair it:*class_res.neighborhood_map) + comp->addNeighbor(it.first,this,true); + comp->agglomerateNeighborhood(this); + pseudoOffline(); + processObsoleteComponent(comp); + if(cache) { + _cache.push_back(*newElement); + for(pair it:*class_res.classified_map) { + if(_cache2.find(it.first->getId())==_cache2.end()) _cache2[it.first->getId()]=vector(); + _cache2[it.first->getId()].push_back(*newElement); + } + } + } + return res; +} + +pair>>,shared_ptr> SHC::process(MatrixXd *elements, bool initNewDeltaLogger, bool classifyOnly) { + if(parallelize) { + if(!delta || (delta && initNewDeltaLogger)) { + if(delta) delta.reset(); + delta=make_shared(); + } + } + qTime=0;uTime=0;compTime=0; + std::chrono::time_point pst=std::chrono::system_clock::now(); + if(sigma_index) sigma_index->resetStatistics(); + else this->nodeCounter=0; + shared_ptr>> res=make_shared>>(); + if(elements!=NULL) { + for(int i(0);irows();i++) { + if(eventCallback) eventCallback(new 
SHCEvent(SlicePositionChange,i)); + VectorXd row=elements->row(i); + shared_ptr cr=process(&row,classifyOnly); + res->push_back(cr); + } + } + std::chrono::time_point pet=std::chrono::system_clock::now(); + pTime=std::chrono::duration_cast(pet-pst).count(); + if(delta) + for(pair it:components) delta->finalizeComponent(it.second); + if(sigma_index) { + SigmaIndexStatistics *st=sigma_index->getStatistics(); + qTime=st->qTime; + delete st; + } + + return make_pair(res,delta); +} + +SHC_Component *SHC::getComponent(string *comp_id) { + if(components.find(*comp_id)!=components.end()) + return components[*comp_id]; + return NULL; +} + +void SHC::agglomerateComponents(SHC_Component *c1,SHC_Component *c2) { + agglomerate(c1,c2); +} + +pair SHC::agglomerate(SHC_Component *c1,SHC_Component *c2,bool add_point) { + bool r1=false,r2=false; + SHC_ComponentConnection *cc=NULL; + if(c1->getParent()!=c2->getParent()) { + if(c1->isOutlier() && !c1->isObsolete() && !c2->isObsolete()) { + if(c2->addNewElement(c1->getMean(),this)) { + c2->addTrace(c1); + c2->getParent()->addTrace(c1->getParent()); + r1=true; + removeComponent(c1); + } + } else if(c2->isOutlier() && !c1->isObsolete() && !c2->isObsolete()) { + if(c1->addNewElement(c2->getMean(),this)) { + c1->addTrace(c2); + c1->getParent()->addTrace(c2->getParent()); + r2=true; + removeComponent(c2); + } + } else if(!c1->isOutlier() && !c2->isOutlier()) { + cc=connections.connect(c1, c2, add_point); + if(cc->getPoints()>=(long)this->sha_agglo_theta && c1->getParent()!=c2->getParent()) + joinComponents(c1, c2); + } + } else cc=connections.connect(c1, c2, add_point); + if(delta && cc) delta->logComponentConnection(cc); + return make_pair(r1, r2); +} + +void SHC::joinComponents(SHC_Component *c1, SHC_Component *c2) { + unordered_map p1_children=c1->getParent()->fetchChildComponents(); + long p1_elms=0; + for(auto p1it:p1_children) + if(typeid(*p1it.second)==typeid(SHC_Component)) p1_elms+=(dynamic_cast(p1it.second))->getElements(); + 
unordered_map p2_children=c2->getParent()->fetchChildComponents(); + long p2_elms=0; + for(auto p2it:p2_children) + if(typeid(*p2it.second)==typeid(SHC_Component)) p2_elms+=(dynamic_cast(p2it.second))->getElements(); + if(p1_elms>p2_elms) { + c1->getParent()->addTrace(c2->getParent()); + for(auto p2it:p2_children) { + p2it.second->detachFromParent(); + p2it.second->attachToParent(c1->getParent()); + } + } else { + c2->getParent()->addTrace(c1->getParent()); + for(auto p1it:p1_children) { + p1it.second->detachFromParent(); + p1it.second->attachToParent(c2->getParent()); + } + } +} + +void SHC::pseudoOffline(bool force) { + if(force || (agglo_count>0 && --agglo_counter<=0)) { + agglo_counter=agglo_count; + //set *obs_remove=new set(); + pair*,set*> co=getOutliersAndComponents(); + set *exclude=new set(); + for(string testing_comp_id:*co.first) { + exclude->clear(); + SHC_Component *test_comp=components[testing_comp_id]; + exclude->insert(test_comp); + if(!test_comp->isObsolete()) { + // temp out for test + if(test_comp->isRedirected() && test_comp->getElements()<=0.2*test_comp->getRedirectedComponent()->getElements()) { + SHC_Component *super_comp=test_comp->getRedirectedComponent(); + super_comp->setElements(super_comp->getElements()+test_comp->getElements()); + incDeltaElementsInDeltaLog(super_comp,test_comp->getElements()); + super_comp->addTrace(test_comp); + removeComponent(test_comp); + } else if(!test_comp->isRedirected()) { + std::chrono::time_point qst=std::chrono::system_clock::now(); + _int_cr class_res=!sigma_index ? 
classify(test_comp->getMean(),true,exclude) : classifySigmaIndex(test_comp->getMean(),true,exclude,test_comp); + std::chrono::time_point qet=std::chrono::system_clock::now(); + qTime+=std::chrono::duration_cast(qet-qst).count(); + if(class_res.comp && !class_res.comp->isBlocked()) { + if(!class_res.comp->isOutlier() && test_comp->getElements()getElements()) { + test_comp->redirectComponent(class_res.comp); + agglomerate(test_comp, class_res.comp); + } + } + } + } /*else { + set *ccomps=connections.getConnectedComponents(test_comp); + long telms=0; + bool remove=(ccomps->size()>0 ? false : true),all_neigh_obs=true; + if(!remove) + for(SHC_Component *comp:*ccomps) { + if(!comp->isObsolete()) { + all_neigh_obs=false; + telms+=comp->getElements(); + if(comp->hasBaseline()) { + remove=true; + break; + } + } + } + if(!remove && all_neigh_obs) remove=true; + if(!remove && telms>driftRemoveCompSizeRatio*test_comp->getElements()) remove=true; + if(remove || obs_remove->find(test_comp)!=obs_remove->end()) { + obs_remove->insert(test_comp); + for(SHC_Component *comp:*ccomps) if(comp->isObsolete()) obs_remove->insert(comp); + } + delete ccomps; + }*/ + } + /*for(SHC_Component *oc:*obs_remove) { + string oc_id=oc->getId(); + removeComponent(oc); + if(eventCallback!=NULL) eventCallback(new SHCEvent(AfterObsoleteComponentDeleted,oc_id)); + }*/ + delete exclude;delete co.first;delete co.second;//delete obs_remove; + } +} + +void SHC::processObsoleteComponent(SHC_Component *test_comp) { + if(!test_comp->isOutlier() && test_comp->isObsolete()) { + set *ccomps=connections.getConnectedComponents(test_comp), + *obs_remove=new set(); + + long telms=0; + bool remove=(ccomps->size()>0 ? 
false : true),all_neigh_obs=true; + if(!remove) + for(SHC_Component *comp:*ccomps) { + if(!comp->isObsolete()) { + all_neigh_obs=false; + telms+=comp->getElements(); + if(comp->hasBaseline()) { + remove=true; + break; + } + } + } + if(!remove && all_neigh_obs) remove=true; + if(!remove && telms>driftRemoveCompSizeRatio*test_comp->getElements()) remove=true; + if(remove) { + obs_remove->insert(test_comp); + for(SHC_Component *comp:*ccomps) if(comp->isObsolete()) obs_remove->insert(comp); + } + delete ccomps; + for(SHC_Component *oc:*obs_remove) { + string oc_id=oc->getId(); + removeComponent(oc); + if(eventCallback!=NULL) eventCallback(new SHCEvent(AfterObsoleteComponentDeleted,oc_id)); + } + delete obs_remove; + } +} + + +pair*,set*> SHC::getOutliersAndComponents() { + set *res_comps=new set(),*res_outs=new set(); + for(auto comp:components) + if(comp.second->isOutlier()) res_outs->insert(comp.first); + else res_comps->insert(comp.first); + return pair*,set*>(res_comps, res_outs); +} + +void SHC::removeComponent(SHC_Component *c) { + if(c->isObsolete() && eventCallback!=NULL) eventCallback(new SHCEvent(BeforeObsoleteComponentDeleted,c->getId())); + if(!c->isOutlier()) { + set *cc=connections.getConnectedComponents(c); + for(SHC_Component *cretr:*cc) cretr->addTrace(c); + delete cc; + } + c->removeFromNeighborhood(this); + c->detachFromParent(); + vector *>> *partitions=connections.removeComponent(c); + sort(partitions->begin(), partitions->end(), [=](pair *>& a,pair *>& b) { + return a.first>b.first; + }); + if(partitions->size()>1) { + bool biggest=true; + for(pair *> p_part:*partitions) { + if(!biggest) { + SHC_Containable *cluster=NULL; + for(SHC_Component *comp:*p_part.second) { + SHC_Containable *prev_parent=comp->getParent(); + pair> *prev_pair=NULL; + if(!cluster) prev_pair=new pair>(prev_parent->getId(),set(*prev_parent->getTrace())); + comp->detachFromParent(); + if(!cluster) { + cluster=new SHC_Containable(containers, comp); + if(prev_pair!=NULL) { + 
cluster->addTrace(prev_pair); + delete prev_pair; + } + } else comp->attachToParent(cluster); + } + } else biggest=false; + } + } + for(pair *> p_part:*partitions) delete p_part.second; + delete partitions; + if(sigma_index) sigma_index->remove(c); + if(delta) delta->logComponentRemoval(c); + components.erase(c->getId()); + delete c; +} + +void SHC::print(ostream &o_str) { + o_str << "**** Statistical Hierarchical Clustering ****" << endl; + for(auto it=components.begin();it!=components.end();it++) + it->second->print(o_str); +} + +set *SHC::getTopContainers(bool clusters, bool outliers) { + set *tc=new set(); + for(auto comp_pair:components) { + if(outliers && comp_pair.second->isOutlier()) { + SHC_Containable *top=comp_pair.second->fetchTopContainer(); + tc->insert(top->getId()); + } + if(clusters && !comp_pair.second->isOutlier()) { + SHC_Containable *top=comp_pair.second->fetchTopContainer(); + tc->insert(top->getId()); + } + } + return tc; +} + +vector *SHC::getClassificationDetails(string *container_id,double theta,int dim1,int dim2,bool single) { + vector *res=new vector(); + SHC_Containable *cont=containers->fetchContainer(container_id); + if(cont!=NULL) { + unordered_map map=cont->fetchChildComponents(); + for(auto comp:map) { + if(typeid(*comp.second)==typeid(SHC_Component)) { + SHC_Component *c1=static_cast(comp.second); + res->push_back(c1->calculateDetails(theta,dim1,dim2,cbVarianceLimit,cbNLimit,single)); + } + } + } + return res; +} +vector *SHC::getClassificationDetails(string *container_id) { + return getClassificationDetails(container_id, theta, 0, 1, true); +} + +SHC::~SHC() { + delete virtualVariance; + delete containers; + if(sigma_index) delete sigma_index; +} + +set *SHC::getUnrelatedComponents(SHC_Component *comp) { + SHC_Containable *clus=comp->fetchTopContainer(); + set *clusters=getTopContainers(); + set *res_comps=new set(); + for(string clus_id:*clusters) { + if(clus_id!=clus->getId()) { + SHC_Containable 
*cluster=containers->fetchContainer(&clus_id); + unordered_map child_comps=cluster->fetchChildComponents(); + for(unordered_map::iterator it=child_comps.begin();it!=child_comps.end();it++) + res_comps->insert(static_cast(it->second)); + } + } + delete clusters; + return res_comps; +} + +double SHC::calculateFragmentationIndex() { + SHC_Component *bc=getBiggestComponent(); + if(bc==NULL) return (double)1; + return (double)(1-(double)getBiggestComponent()->getElements()/(double)getTotalElements()); +} + +SHC_Component *SHC::getBiggestComponent() { + if(components.size()<1) return NULL; + auto max_it=max_element(components.begin(), components.end(), [=](pair a,pair b) { + return a.second->getElements()getElements(); + }); + return max_it->second; +} + +long SHC::getTotalElements(bool components,bool outliers) { + long total=0; + long res=accumulate(this->components.begin(), this->components.end(), total, [=](long a,pair b) { + if(components && !b.second->isOutlier()) return a+b.second->getElements(); + else if(outliers && b.second->isOutlier()) return a+b.second->getElements(); + else return a; + }); + return res; +} + +unordered_map *SHC::getComponents() { + return &components; +} + +vector *SHC::getComponents(string *container_id) { + vector *res=new vector(); + SHC_Containable *clus=containers->fetchContainer(container_id); + if(clus!=NULL) { + for(pair it:clus->fetchChildComponents()) + res->push_back(dynamic_cast(it.second)); + } + return res; +} + +vector *SHC::getOutliers() { + vector *res=new vector(); + for(pair it:components) + if(it.second->isOutlier()) res->push_back(it.second); + return res; +} + +void SHC::merge(SHC *mergedClassifier, SHC_Component *obsoleteComponent, unordered_map*> *front_conn_map) { + if(mergedClassifier==NULL) return; + for(auto m_comp:mergedClassifier->components) { + SHC_Component *trans_comp=m_comp.second->clone(containers,true); + if(eventCallback) trans_comp->setEventCallback(eventCallback); + 
trans_comp->switchDecay(decay,decay_pace); + //trans_comp->resetBaseline(); + trans_comp->setBaselineLimits(cbVarianceLimit, cbNLimit); + trans_comp->setDriftOptions(driftCheckingSizeRatio, driftMovementMDThetaRatio); + components[trans_comp->getId()]=trans_comp; + if(obsoleteComponent==NULL) { + string *cluster_id=new string(m_comp.second->getParent()->getId()); + SHC_Containable *cluster=containers->fetchContainer(cluster_id); + delete cluster_id; + if(cluster==NULL) cluster=new SHC_Containable(containers,trans_comp); + else trans_comp->attachToParent(cluster); + } else { + SHC_Containable *cluster=obsoleteComponent->getParent(); + trans_comp->attachToParent(cluster); + connections.connect(trans_comp, obsoleteComponent); + } + } + set *mc_cc=mergedClassifier->connections.getConnections(); + for(SHC_ComponentConnection *ccn:*mc_cc) + connections.connect(components[ccn->getComponent1()->getId()], components[ccn->getComponent2()->getId()]); + delete mc_cc; + if(front_conn_map!=NULL) { + for(auto it:*front_conn_map) + for(string child_comp_id:*it.second) { + set *nccid_s=mergedClassifier->traceComponentId(&child_comp_id); + for(string nccid:*nccid_s) connections.connect(components[it.first], components[nccid]); + delete nccid_s; + } + } + for(auto m_comp:mergedClassifier->components) + if(m_comp.second->isRedirected()) + components[m_comp.second->getId()]->redirectComponent(components[m_comp.second->getRedirectedComponent()->getId()]); + if(sigma_index) { // update sigma index as well + for(auto m_comp:mergedClassifier->components) sigma_index->create(components[m_comp.first]); + for(auto m_comp:mergedClassifier->components) { + SHC_Component *c=components[m_comp.first]; + set *ngs=connections.getConnectedComponents(c); + unordered_map *n=new unordered_map(); + for(SHC_Component *nc:*ngs) (*n)[nc]=0; + delete ngs; + for(auto m_comp2:mergedClassifier->components) + if(m_comp.first!=m_comp2.first) (*n)[components[m_comp2.first]]=0; + sigma_index->update(c,n); + } + } 
+} + +SHC::SHC(SHC *cloneFrom):SHC(cloneFrom->theta,cloneFrom->parallelize,cloneFrom->performSharedAgglomeration,cloneFrom->virtualVariance, + cloneFrom->agglo_count,cloneFrom->cbVarianceLimit,cloneFrom->cbNLimit) { + if(cloneFrom->sigma_index) this->sigma_index=cloneFrom->sigma_index->clone(); + this->sha_agglo_theta=cloneFrom->sha_agglo_theta; + this->componentFormingMinVVRatio=cloneFrom->componentFormingMinVVRatio; + this->componentBlockingLimitVVRatio=cloneFrom->componentBlockingLimitVVRatio; + switchDecay(cloneFrom->decay); + cloned=true; + // clone containers that are not components + for(auto m_container:*cloneFrom->containers->getAllContainers()) { + if(typeid(*m_container.second)!=typeid(SHC_Component)) { + SHC_Containable *n_container=new SHC_Containable(this->containers); + n_container->setId(m_container.second->getId()); + } + } + // retain relationships among containers + for(auto m_container:*cloneFrom->containers->getAllContainers()) { + if(typeid(*m_container.second)!=typeid(SHC_Component)) { + SHC_Containable *p_container=dynamic_cast(m_container.second); + string *container_id=new string(p_container->getId()); + SHC_Containable *nc_container1=this->containers->fetchContainer(container_id); + delete container_id; + for(auto ch1:*p_container->getChildren()) { + if(typeid(*ch1.second)!=typeid(SHC_Component)) { + string *ch_container_id=new string(ch1.second->getId()); + SHC_Containable *nc_container2=this->containers->fetchContainer(ch_container_id); + delete ch_container_id; + nc_container1->addChild(nc_container2); + } + } + } + } + for(auto m_comp:cloneFrom->components) { + SHC_Component *cloned_comp=m_comp.second->clone(this->containers,true); + this->components[cloned_comp->getId()]=cloned_comp; + SHC_Containable *cluster=m_comp.second->getParent(); + string *cluster_id=new string(cluster->getId()); + SHC_Containable *cloned_cluster=this->containers->fetchContainer(cluster_id); + delete cluster_id; + cloned_comp->attachToParent(cloned_cluster); 
+ } + set *ccn_set=cloneFrom->connections.getConnections(); + for(SHC_ComponentConnection *ccn:*ccn_set) + this->connections.connect(this->components[ccn->getComponent1()->getId()], this->components[ccn->getComponent2()->getId()]); + delete ccn_set; + for(auto m_comp:cloneFrom->components) + if(m_comp.second->isRedirected()) + this->components[m_comp.second->getId()]->redirectComponent(this->components[m_comp.second->getRedirectedComponent()->getId()]); +} + +SHC *SHC::clone() { + SHC *nc=new SHC(theta, parallelize, performSharedAgglomeration, virtualVariance, agglo_count, cbVarianceLimit, cbNLimit); + if(this->sigma_index) nc->sigma_index=this->sigma_index->clone(); + nc->sha_agglo_theta=this->sha_agglo_theta; + nc->switchDecay(decay); + nc->cloned=true; + // clone containers that are not components + for(auto m_container:*containers->getAllContainers()) { + if(typeid(*m_container.second)!=typeid(SHC_Component)) { + SHC_Containable *n_container=new SHC_Containable(nc->containers); + n_container->setId(m_container.second->getId()); + } + } + // retain relationships among containers + for(auto m_container:*containers->getAllContainers()) { + if(typeid(*m_container.second)!=typeid(SHC_Component)) { + SHC_Containable *p_container=dynamic_cast(m_container.second); + string *container_id=new string(p_container->getId()); + SHC_Containable *nc_container1=nc->containers->fetchContainer(container_id); + delete container_id; + for(auto ch1:*p_container->getChildren()) { + if(typeid(*ch1.second)!=typeid(SHC_Component)) { + string *ch_container_id=new string(ch1.second->getId()); + SHC_Containable *nc_container2=nc->containers->fetchContainer(ch_container_id); + delete ch_container_id; + nc_container1->addChild(nc_container2); + } + } + } + } + for(auto m_comp:components) { + SHC_Component *cloned_comp=m_comp.second->clone(nc->containers,true); + nc->components[cloned_comp->getId()]=cloned_comp; + SHC_Containable *cluster=m_comp.second->getParent(); + string *cluster_id=new 
string(cluster->getId()); + SHC_Containable *cloned_cluster=nc->containers->fetchContainer(cluster_id); + delete cluster_id; + cloned_comp->attachToParent(cloned_cluster); + } + set *ccn_set=connections.getConnections(); + for(SHC_ComponentConnection *ccn:*ccn_set) + nc->connections.connect(nc->components[ccn->getComponent1()->getId()], nc->components[ccn->getComponent2()->getId()]); + delete ccn_set; + for(auto m_comp:components) + if(m_comp.second->isRedirected()) + nc->components[m_comp.second->getId()]->redirectComponent(nc->components[m_comp.second->getRedirectedComponent()->getId()]); + return nc; +} + +void SHC::setAgglomerationCount(int agglo_count) { + this->agglo_count=agglo_count; +} + +void SHC::setEventCallback(function eventCallback) { + this->eventCallback=eventCallback; +} + +set *SHC::traceComponentId(string *id) { + set *res=new set(); + if(components.find(*id)!=components.end()) res->insert(*id); + for(auto comp_it:components) if(comp_it.second->checkTrace(*id)) res->insert(comp_it.first); + return res; +} + +MatrixXd *SHC::getCache() { + MatrixXd *res=new MatrixXd(_cache.size(),virtualVariance->size()); + for(unsigned i=0;i<_cache.size();i++) + res->row(i)=_cache.at(i); + return res; +} +MatrixXd *SHC::getCache(string *id) { + if(_cache2.find(*id)==_cache2.end()) return NULL; + MatrixXd *res=new MatrixXd(_cache2[*id].size(),virtualVariance->size()); + for(unsigned i=0;i<_cache2[*id].size();i++) + res->row(i)=_cache2[*id].at(i); + return res; +} + +double SHC::meanWeightedComponentMDFrom(SHC_Component *fromComponent) { + double incsum=0,elsum=0; + for(auto comp:components) { + if(!comp.second->isOutlier()) { + incsum+=comp.second->getElements()*fromComponent->mahalanobisDistance(comp.second->getMean()); + //counter++; + elsum+=comp.second->getElements(); + } + } + return elsum==0 ? 
0 : incsum/elsum; +} + +void SHC::switchDecay(bool decay) { + this->decay=decay; +} + +void SHC::cleanOutliers() { + set *outliers=new set(); + for(pair it:components) + if(it.second->isOutlier()) outliers->insert(it.first); + for(string out_id:*outliers) removeComponent(components[out_id]); + delete outliers; +} + +bool SHC::recheckOutlier(string *id) { + vector *outs=getOutliers(); + for(SHC_Component *c:*outs) if(*id==c->getId()) { + delete outs; + return true; + } + delete outs; + if(decayedOutliers.find(*id)!=decayedOutliers.end()) return true; + return false; +} + +void SHC::setSharedAgglomerationThreshold(int sha_agglo_theta) { + this->sha_agglo_theta=sha_agglo_theta; +} + +pair SHC::getStatistic() { + vector *outs=getOutliers(); + long v1=outs->size(); + delete outs; + return make_pair(components.size()-v1,v1); +} + +void SHC::setComponentFormingVVRatio(float ratio) { + this->componentFormingMinVVRatio=ratio; +} + +void SHC::setComponentBlockingLimitVVRatio(float ratio) { + this->componentBlockingLimitVVRatio=ratio; +} + +void SHC::removeSmallComponents(int elementsThreshold) { + set rem; + for(pair it:components) + if(it.second->getElements()(theta,neighborhood_mutiplier*theta, precision_switch); +} + +void SHC::printSigmaIndex(ostream &o_str) { + if(sigma_index) sigma_index->print(ROOT, o_str); +} + +unordered_map *_int_cr::getCombinedNeighborhood() { + unordered_map *combo=new unordered_map(); + if(classified_map!=NULL) + for(pair it1:*classified_map) + if(combo->find(it1.first)==combo->end()) (*combo)[it1.first]=it1.second; + if(neighborhood_map!=NULL) + for(pair it2:*neighborhood_map) + if(combo->find(it2.first)==combo->end()) (*combo)[it2.first]=it2.second; + return combo; +} + +vector SHC::getTimes() { + vector times; + times.push_back(qTime/1000); + times.push_back(uTime/1000); + times.push_back(pTime/1000); + return times; +} + +long SHC::getNodeCounter() { + if(sigma_index) { + SigmaIndexStatistics *stat=sigma_index->getStatistics(); + long 
nc=stat->totalCount; + delete stat; + return nc; + } else return this->nodeCounter; +} + +SigmaIndex *SHC::getSigmaIndex() { + return sigma_index; +} + +void SHC::consumeDeltaLog(shared_ptr delta_log,string *delta_log_src,shared_ptr amending_log,bool dropCosumationActivity, + ostream *o_str) { + if(dropCosumationActivity && o_str) { + (*o_str) << "###################################################" << endl; + (*o_str) << "## Consuming log " << (delta_log_src ? *delta_log_src : "master") << endl; + (*o_str) << "###################################################" << endl; + } + if(amending_log) delta=amending_log; + else if(!delta_log) delta=make_shared(); + else delta=NULL; + int total_dropped_points=0; + for(string rid:delta_log->cr_delta) + if(components.find(rid)!=components.end()) removeComponent(components[rid]); + for(pair it:delta_log->c_delta) { + string local_id=it.second->id; + bool existing_component=components.find(local_id)!=components.end(); + set *ts=traceComponentId(&local_id); + if(!existing_component && ts->size()>0) { + if(delta) delta->cr_delta.insert(local_id); + delete ts; + continue; + } + delete ts; + if(components.find(local_id)==components.end()) { + if(it.second->outlier) { + // let us check if this is an outlier on the same place we already have one + bool existing_outlier=false; + for(pair it_out:components) { + if(it_out.second->isOutlier()) { + SHC_Component *out=it_out.second; + double md_out=out->mahalanobisDistance(it.second->end->mean); + if(md_out<(theta/16)) { + existing_outlier=true; + if(delta) delta->cr_delta.insert(it.second->id); + if(dropCosumationActivity && o_str) (*o_str) << "****** it seems that " << local_id << " exists somehow here: " << out->getId() << endl; + total_dropped_points++; + break; + } else if(md_outaddNewElement(it.second->end->mean, this); + existing_outlier=true; + if(delta) delta->cr_delta.insert(it.second->id); + if(dropCosumationActivity && o_str) (*o_str) << "****** we added " << local_id << " to: 
" << out->getId() << endl; + break; + } + } + } + if(existing_outlier) continue; + } + // if the new component is really a new component, let us add it + MatrixXd *cov=it.second->end->isInverted ? new MatrixXd(it.second->end->covariance->inverse()) : new MatrixXd(*it.second->end->covariance); + long de=0; + if(delta_log_src) { + if(it.second->delta_elements.find(*delta_log_src)!=it.second->delta_elements.end()) de=it.second->delta_elements[*delta_log_src]; + } else { + for(pair itx:it.second->delta_elements) + if(itx.first!=deltaLoggingSourceName) de+=itx.second; + } + SHC_Component *nc=new SHC_Component(containers,it.second->end->mean,cov,it.second->end->isInverted,de,virtualVariance); + nc->setId(it.second->id); + nc->addTrace(&it.second->trace); + if(delta_log_src) nc->setSourceNode(*delta_log_src); + if(SHC_Containable *parent=containers->fetchContainer(&it.second->parent_id)) nc->attachToParent(parent); + else { + SHC_Containable *cluster=new SHC_Containable(containers, nc); + cluster->setId(it.second->parent_id); + } + delete cov; + nc->switchDecay(decay,decay_pace); + if(eventCallback) nc->setEventCallback(eventCallback); + nc->setObsolete(it.second->obsolete); + nc->setBlocked(it.second->blocked); + nc->setBaselineLimits(cbVarianceLimit,cbNLimit); + components[nc->getId()]=nc; + if(delta) { + if(delta_log_src) { + delta->logComponent(nc,*delta_log_src,de); + } else { + for(pair itx:it.second->delta_elements) + if(itx.first!=deltaLoggingSourceName) + delta->logComponent(nc,itx.first,itx.second); + } + } + } else { + SHC_Component *c=components[local_id]; + if(c->getParent()->getId()!=it.second->parent_id) { + c->detachFromParent(); + if(SHC_Containable *parent=containers->fetchContainer(&it.second->parent_id)) c->attachToParent(parent); + else { + SHC_Containable *cluster=new SHC_Containable(containers, c); + cluster->setId(it.second->parent_id); + } + } + if(c->isOutlier() && it.second->outlier) continue; + long 
ce=c->getElements(),cd=it.second->end->elements; + if(delta_log_src) { + double ce_coeff=((double)ce/((double)ce+(double)cd)); + VectorXd mean_delta=((*it.second->end->mean)-(*c->getMean()))*ce_coeff; + mean_delta+=*c->getMean(); + c->setMean(new VectorXd(mean_delta)); + MatrixXd *cov=c->getCovariance(); + for(int i=0;irows();i++) { + for(int j=0;jcols();j++) { + if(abs((*cov)(i,j))end->covariance)(i,j))) + (*cov)(i,j)=(*it.second->end->covariance)(i,j); + } + } + c->setCovariance(cov,c->isCovarianceInverted()); + delete cov; + } else { + c->setMean(new VectorXd(*it.second->end->mean)); + c->setCovariance(it.second->end->covariance,it.second->end->isInverted); + } + + c->setObsolete(it.second->obsolete); + c->setBlocked(it.second->blocked); + c->setBaselineLimits(cbVarianceLimit,cbNLimit); + c->addTrace(&it.second->trace); + if(delta_log_src) { + long de=0; + if(it.second->delta_elements.find(*delta_log_src)!=it.second->delta_elements.end()) de=it.second->delta_elements[*delta_log_src]; + SHC_Component *temp=c; + while(temp->isRedirected()) temp=temp->getRedirectedComponent(); + temp->setElements(ce+de); + if(delta) delta->logComponent(temp,*delta_log_src,de); + } else { + c->setElements(it.second->end->elements); + } + } + } + + //for (SHC_Component *c:touched_components) { + for (pair it:components) { + SHC_Component *c=it.second; + if(delta_log->c_delta.find(c->getId())!=delta_log->c_delta.end()) { + DL_Component *dl_c=delta_log->c_delta[c->getId()]; + if(dl_c->redirectedTo) { + string r_to_id=*dl_c->redirectedTo; + if(components.find(r_to_id)!=components.end()) { + SHC_Component *r_c=components[r_to_id]; + if(!c->isRedirected() || c->getRedirectedComponent()!=r_c) { + c->redirectComponent(r_c); + if(c->getParent()!=r_c->getParent()) { + if(dropCosumationActivity && o_str) (*o_str) << "*** join components " << c->getId() << " " << r_c->getId() << endl; + if(delta) { + delta->logComponent(c,deltaLoggingSourceName); + 
delta->logComponent(r_c,deltaLoggingSourceName); + } + joinComponents(c, r_c); + } + } + } + } + for(string o_n:dl_c->neighborhood) { + if(components.find(o_n)!=components.end()) { + SHC_Component *n_o_c=components[o_n]; + c->addNeighbor(n_o_c); + } + } + } + + if(delta_log_src && !c->isOutlier()) { + set ex={c}; + std::chrono::time_point qst=std::chrono::system_clock::now(); + _int_cr class_res=!sigma_index ? classify(c->getMean(), true, &ex) : classifySigmaIndex(c->getMean(), true, &ex, c); + std::chrono::time_point qet=std::chrono::system_clock::now(); + qTime+=std::chrono::duration_cast(qet-qst).count(); + for(pair it:*class_res.neighborhood_map) + if(it.first->isOutlier()) c->addNeighbor(it.first); + for(pair it:*class_res.classified_map) { + if(!it.first->isOutlier()) { + if(c->getParent()!=it.first->getParent()) { + if(dropCosumationActivity && o_str) (*o_str) << "*** join components " << c->getId() << " " << it.first->getId() << endl; + if(delta) { + delta->logComponent(c,deltaLoggingSourceName); + delta->logComponent(it.first,deltaLoggingSourceName); + } + joinComponents(c, it.first); + } + } else { + if(dropCosumationActivity && o_str) (*o_str) << "*** agglomerate " << c->getId() << " " << it.first->getId() << endl; + agglomerate(c, it.first); + } + } + if(!c->isRedirected()) { + bool redirected=false; + for(pair it:*class_res.classified_map) { + if(!it.first->isOutlier()) { + if(c->getElements()<=it.first->getElements()) { + if(dropCosumationActivity && o_str) (*o_str) << "*** redirect " << c->getId() << " to " << it.first->getId() << endl; + c->redirectComponent(it.first); + redirected=true; + break; + } + } + } + } + c->agglomerateNeighborhood(this); + } + } + + + for(pair, DL_ComponentConnection*> it:delta_log->cc_delta) { + if(components.find(it.second->c1_id)!=components.end() && + components.find(it.second->c2_id)!=components.end()) { + SHC_Component *c1=components[it.second->c1_id],*c2=components[it.second->c2_id]; + SHC_ComponentConnection 
*cc=connections.connect(c1, c2, false); + cc->addPoints(it.second->points); + agglomerate(c1, c2, false); + } + } + pseudoOffline(true); + if(delta) { + for(pair it:components) delta->finalizeComponent(it.second); + delta=NULL; + } + if(dropCosumationActivity && o_str) (*o_str) << "$$$$ Total dropped points " << total_dropped_points << endl; +} + +void SHC::incDeltaElementsInDeltaLog(SHC_Component *comp,int number) { + if(delta) delta->addComponentDeltaElements(comp,deltaLoggingSourceName,number); +} + +void SHC::setDeltaLoggingSourceName(string s) { + this->deltaLoggingSourceName=s; +} + +string SHC::getDeltaLoggingSourceName() { + return this->deltaLoggingSourceName; +} + +void SHC::clearEigenMPSupport() { + Eigen::setNbThreads(1); +} diff --git a/src/SHC/SHC.hpp b/src/SHC/SHC.hpp new file mode 100644 index 0000000..e7a0a7f --- /dev/null +++ b/src/SHC/SHC.hpp @@ -0,0 +1,179 @@ +#ifndef SHC_hpp +#define SHC_hpp + +#include +#include +#include +#include +#include +#include +#include +#include +#include "SHC_Component.hpp" +#include "SHC_ComponentConnection.hpp" +#include "SHC_DeltaLogger.hpp" +#include "SigmaIndex.hpp" +#include +using namespace std; +using namespace Eigen; + +class SHC_Exception : public exception { +private: + const char *msg=NULL; +public: + const char *what() const throw(); + SHC_Exception(const char *msg); +}; + +struct ClassificationResult { + string *component_id=NULL,*cluster_id=NULL; + bool outlier=false; + ~ClassificationResult() { + if(component_id!=NULL) delete component_id; + if(cluster_id!=NULL) delete cluster_id; + } +}; +struct _int_cr { + SHC_Component *comp=NULL; + double min_md=-1; + vector> *neighborhood_map=NULL,*classified_map=NULL; + _int_cr(SHC_Component *comp,vector> *neighborhood_map, + vector> *classified_map,double min_md) { + this->comp=comp; + this->neighborhood_map=neighborhood_map; + this->classified_map=classified_map; + this->min_md=min_md; + } + ~_int_cr() { + if(neighborhood_map!=NULL) delete neighborhood_map; + 
if(classified_map!=NULL) delete classified_map; + } + unordered_map *getCombinedNeighborhood(); +}; + +enum AgglomerationType {RelaxedAgglomeration,NormalAgglomeration,AggresiveAgglomeration}; +enum DriftType {NormalDrift,FastDrift,SlowDrift,NoDrift,UltraFastDrift}; + +enum SHCEventType {BeforeObsoleteComponentDeleted,UnknownEvent,DriftFrontTrigger,AfterObsoleteComponentDeleted, + SlicePositionChange}; +struct SHCEvent { + string *component_id=NULL,*cluster_id=NULL; + int *slice_pos=NULL; + SHCEventType eventType=UnknownEvent; + SHC *shc; + SHCEvent(SHCEventType eventType,string id) { + this->eventType=eventType; + if(eventType==BeforeObsoleteComponentDeleted || eventType==DriftFrontTrigger || eventType==AfterObsoleteComponentDeleted) + this->component_id=new string(id); + } + SHCEvent(SHCEventType eventType,int pos) { + this->eventType=eventType; + if(eventType==SlicePositionChange) this->slice_pos=new int(pos); + } + SHCEvent(SHCEventType eventType,SHC *shc,string id) { + this->eventType=eventType; + if(eventType==DriftFrontTrigger) { + this->component_id=new string(id); + this->shc=shc; + } + } + ~SHCEvent() { + if(component_id!=NULL) delete component_id; + if(cluster_id!=NULL) delete cluster_id; + if(slice_pos!=NULL) delete slice_pos; + } +}; + +class SHC { +private: + SigmaIndex *sigma_index=NULL; + string deltaLoggingSourceName="master"; + bool parallelize=false,performSharedAgglomeration=true,cache=false,decay=true,cloned=false; + int agglo_count,agglo_counter,decay_pace=10,sha_agglo_theta=1,entry=0; + long cbNLimit=40,qTime=0,nodeCounter=0,uTime=0,pTime=0,compTime=0; + double theta=3.0,cbVarianceLimit=10.0; + float driftRemoveCompSizeRatio=0.1,driftCheckingSizeRatio=1.0,driftMovementMDThetaRatio=1.0,componentFormingMinVVRatio=0.1, + componentBlockingLimitVVRatio=0.0; + VectorXd *virtualVariance=NULL; + vector _cache; + unordered_map> _cache2; + unordered_map components; + SHC_ComponentConnection_Set connections; + SHC_Containable_Set *containers=NULL; + 
static void t1(SHC *shc, SHC_Component *comp, VectorXd *newElement, vector> *classified_map, vector> *neighborhood_map, + vector> *obsolete_map, double theta=3.0); + //static void t2(SHC *shc, SHC_Component *comp_from, SHC_Component *comp_to, vector> *res_map); + pair agglomerate(SHC_Component *c1,SHC_Component *c2,bool add_point=true); + void joinComponents(SHC_Component *c1,SHC_Component *c2); + void purgeEmptyContainers(); + pair*,set *> getOutliersAndComponents(); + set *getUnrelatedComponents(SHC_Component *comp); + _int_cr classify_p1(bool classifyOnly,vector> *obsolete_map,vector> *classified_map, + vector> *neighborhood_map/*,vector *workers*/); + _int_cr classify(VectorXd *newElement, bool classifyOnly=false, set *excludeComponents=NULL); + _int_cr classifySigmaIndex(VectorXd *newElement, bool classifyOnly=false, set *excludeComponents=NULL, SHC_Component *starting=NULL); + _int_cr classify(VectorXd *newElement, SHC_Component *parentComponent, bool classifyOnly=false); + _int_cr classify(VectorXd *newElement, vector *forComponents, bool classifyOnly=false); + function eventCallback; + set decayedOutliers; + shared_ptr delta=nullptr; +protected: + mutex m1; +public: + SHC(double theta,bool parallelize=false,bool performSharedAgglomeration=true,VectorXd *virtualVariance=NULL, + int agglo_count=100,double cbVarianceLimit=10.0,long cbNLimit=40,float driftRemoveCompSizeRatio=0.1, + float driftCheckingSizeRatio=1.0,float driftMovementMDThetaRatio=1.0,int decayPace=10, + float componentFormingMinVVRatio=0.1, float componentBlockingLimitVVRatio=0.0, bool cache=false); + SHC(int dimensions,AgglomerationType aggloType=NormalAgglomeration,DriftType driftType=NormalDrift, + int decayPace=10,bool cache=false,bool parallelize=false); + SHC(SHC *cloneFrom); + ~SHC(); + shared_ptr process(VectorXd *newElement, bool classifyOnly=false); + pair>>,shared_ptr> process(MatrixXd *elements, bool initNewDeltaLogger=false, bool classifyOnly=false); + SHC_Component 
*getComponent(string *comp_id); + set *getTopContainers(bool clusters=true, bool outliers=true); + vector *getClassificationDetails(string *container_id,double theta,int dim1,int dim2,bool single=false); + vector *getClassificationDetails(string *container_id); + void print(ostream &o_str); + void pseudoOffline(bool force=false); + void processObsoleteComponent(SHC_Component *comp); + double calculateFragmentationIndex(); + SHC_Component *getBiggestComponent(); + long getTotalElements(bool components=true,bool outliers=true); + unordered_map *getComponents(); + vector *getComponents(string *container_id); + vector *getOutliers(); + void merge(SHC *mergedClassifier, SHC_Component *obsoleteComponent=NULL, unordered_map*> *front_conn_map=NULL); + SHC *clone(); + void setAgglomerationCount(int agglo_count); + void setEventCallback(function eventCallback); + set *traceComponentId(string *id); + MatrixXd *getCache(); + MatrixXd *getCache(string *id); + double meanWeightedComponentMDFrom(SHC_Component *fromComponent); + void switchDecay(bool decay); + double getTheta(); + VectorXd *getVirtualVariance(); + void cleanOutliers(); + bool recheckOutlier(string *id); + void setSharedAgglomerationThreshold(int sha_agglo_theta); + void agglomerateComponents(SHC_Component *c1,SHC_Component *c2); + pair getStatistic(); + void removeComponent(SHC_Component *c); + void setComponentFormingVVRatio(float ratio); + void setComponentBlockingLimitVVRatio(float ratio); + void removeSmallComponents(int elementsThreshold); + void useSigmaIndex(int neighborhood_mutiplier=2, bool precision_switch=true); + void printSigmaIndex(ostream &o_str); + vector getTimes(); + long getNodeCounter(); + SigmaIndex *getSigmaIndex(); + void consumeDeltaLog(shared_ptr delta_log,string *delta_log_src=NULL,shared_ptr amending_log=nullptr, + bool dropCosumationActivity=false,ostream *o_str=NULL); + void incDeltaElementsInDeltaLog(SHC_Component *comp,int number=1); + void setDeltaLoggingSourceName(string s); + 
string getDeltaLoggingSourceName(); + void clearEigenMPSupport(); +}; + +#endif /* SHC_hpp */ diff --git a/src/SHC/SHC_Component b/src/SHC/SHC_Component new file mode 100644 index 0000000..07add2e --- /dev/null +++ b/src/SHC/SHC_Component @@ -0,0 +1 @@ +#include "SHC_Component.hpp" diff --git a/src/SHC/SHC_Component.cpp b/src/SHC/SHC_Component.cpp new file mode 100644 index 0000000..ef4678e --- /dev/null +++ b/src/SHC/SHC_Component.cpp @@ -0,0 +1,666 @@ +#include "SHC_Component.hpp" +#include "SHC.hpp" +#include +#include +#include +#include +#include +using namespace std; +using namespace Eigen; + +SHC_Component_Exception::SHC_Component_Exception(const char *msg) { + this->msg=msg; +} +const char *SHC_Component_Exception::what() const throw() { + return this->msg; +} + +SHC_Component::SHC_Component(SHC_Containable_Set *set,VectorXd *newElement,VectorXd *virtualVariance,double cbVarianceLimit,long cbNLimit, + float driftCheckingSizeRatio,float driftMovementMDThetaRatio,int decayPace,float componentFormingMinVVRatio, + float componentBlockingLimitVVRatio) : SHC_Containable(set) { + if(set!=NULL) set->addComponent(this); + if(newElement==NULL) throw SHC_Component_Exception("New element must be defined"); + this->dim=newElement->size(); + if(virtualVariance!=NULL && virtualVariance->size()!=dim) throw SHC_Component_Exception("Virtual variance vector does not have the same dimension as the new element"); + this->cov=new MatrixXd(dim,dim); + this->cov->setZero(); + this->mean=new VectorXd(*newElement); + MatrixXd tmp(dim,dim); + tmp.setZero(); + if(virtualVariance==NULL) + for(int i=0;ivv_value=(*virtualVariance)(0); + } + this->vv=new MatrixXd(tmp.inverse()); + this->elements=1; + this->cbVarianceLimit=cbVarianceLimit; + this->cbNLimit=cbNLimit; + this->driftCheckingSizeRatio=driftCheckingSizeRatio; + this->driftMovementMDThetaRatio=driftMovementMDThetaRatio; + this->componentFormingMinVVRatio=componentFormingMinVVRatio; + 
this->componentBlockingLimitVVRatio=componentBlockingLimitVVRatio; + this->decay_pace=decayPace; + if(decay_pace>0 && cbNLimit>0) this->decay_handler=new SHC_Count_Decay(20*decay_pace*cbNLimit); +} + +SHC_Component::~SHC_Component() { + if(isRedirected()) redirect_to->removeFromRedirectedComponent(this); + for(auto redir_from:redirected_from) { + if(isRedirected()) redir_from.second->redirectComponent(redirect_to); + else redir_from.second->redirectComponent(NULL); + } + delete mean; + delete vv; + delete cov; + if(baseline!=NULL) delete baseline; + if(front_connection_mapping!=NULL) { + for(auto it:*front_connection_mapping) delete it.second; + delete front_connection_mapping; + } + if(front_resolver!=NULL) delete front_resolver; + if(decay_handler!=NULL) delete decay_handler; +} + +bool SHC_Component::isCovInvertible() { + if(inverted) throw SHC_Component_Exception("Covariance matrix already inverted"); + FullPivLU lu(*cov); + return lu.isInvertible(); +} + +bool SHC_Component::addNewElement(VectorXd *newElement,SHC *parent_resolver,vector> *classified_map) { + if(newElement==NULL) throw SHC_Component_Exception("New element must be defined"); + if(newElement->size()!=dim) { + throw SHC_Component_Exception("New element does not have the same dimension as the component"); + } + if(redirect_to!=NULL) + return redirect_to->addNewElement(newElement,parent_resolver,classified_map); + if(obsolete) return false; + if(outlier && componentFormingMinVVRatio>0) { + VectorXd v1=*newElement-*mean; + MatrixXd B=v1*v1.transpose()/2; + auto Barr=abs(B.array()); + double lim1=componentFormingMinVVRatio*vv_value; + bool cont=false; + if((Barr>=lim1).any()) cont=true; + if(!cont) return false; + double lim2=componentBlockingLimitVVRatio*vv_value; + if(componentBlockingLimitVVRatio>0 && (Barr>lim2).any()) + return false; + } + if(agglo_out_blocker>0) agglo_out_blocker--; + if(decay_handler) decay_handler->reset(elements*decay_paceincDeltaElementsInDeltaLog(this); + VectorXd 
*old_mean=mean; + VectorXd t1=(*newElement-*old_mean)/(elements+1); + mean=new VectorXd(*old_mean+t1); + // calculate new covariance + MatrixXd *old_cov=cov; + if(!inverted) { + MatrixXd B=(((*newElement-*old_mean)*(*newElement-*mean).transpose())-*old_cov)/(elements+1); + cov=new MatrixXd(*old_cov+B); + if(isCovInvertible()) { + MatrixXd *tcov=cov; + cov=new MatrixXd(tcov->inverse()); + inverted=true; + delete tcov; + } + } else { + MatrixXd u=(*newElement-*old_mean)/sqrt(elements); + MatrixXd vt=((*newElement-*mean)/sqrt(elements)).transpose(); + MatrixXd v1=*old_cov*u*vt**old_cov; + MatrixXd v2=vt**old_cov*u; + MatrixXd res=*old_cov-v1/((double)1+v2(0,0)); + double X=(double)elements/((double)elements+(double)1); + cov=new MatrixXd(res/X); + } + delete old_mean;delete old_cov; + outlier=false; + elements++; + if(elements==2) removeFromNeighborhood(parent_resolver); + MatrixXd *tmp_cov=getCovariance(); + if(!blocked && inverted && componentBlockingLimitVVRatio>0 && elements>10) { // chameleon=10 + auto x=abs((*tmp_cov).array()); + double lim=componentBlockingLimitVVRatio*vv_value; + if((x>lim).any()) { + blocked=true; + cleanRedirection(); + neighborhood.erase(neighborhood.begin(),neighborhood.end()); + if(isRedirected()) { + redirect_to->removeFromRedirectedComponent(this); + redirect_to=NULL; + } + } + } + if((cbVarianceLimit>0 || cbNLimit>0) && inverted && !baseline && ((tmp_cov->array()>cbVarianceLimit).any() || elements>cbNLimit)) { + baseline=clone(); + front_resolver=new SHC(dim,AggresiveAgglomeration,NoDrift,true); + front_resolver->switchDecay(false); + front_connection_mapping=new unordered_map*>(); + } + delete tmp_cov; + if(parent_resolver->getTheta()>0 && !obsolete && baseline) { + if(!obsolete && front_resolver!=NULL) { + shared_ptr class_res=front_resolver->process(newElement); + if(classified_map!=NULL) { + if(front_connection_mapping->find(*class_res->component_id)==front_connection_mapping->end()) + 
(*front_connection_mapping)[*class_res->component_id]=new std::set(); + for(pair it:*classified_map) + (*front_connection_mapping)[*class_res->component_id]->insert(it.first->getId()); + } + bool reset=false; + long bcomp_els=front_resolver->getBiggestComponent()->getElements(); + long base_els=(baseline->getElements()<(2*cbNLimit) ? baseline->getElements() : (2*cbNLimit)); + if(bcomp_els>driftCheckingSizeRatio*base_els) reset=true; + if(reset) { + double mdw=front_resolver->meanWeightedComponentMDFrom(baseline); + if(mdw>driftMovementMDThetaRatio*parent_resolver->getTheta()) { + if(eventCallback) eventCallback(new SHCEvent(DriftFrontTrigger,front_resolver,getId())); + obsolete=true; + if(parent_resolver!=NULL) parent_resolver->merge(front_resolver,this,front_connection_mapping); + delete front_resolver; + for(auto it:*front_connection_mapping) delete it.second; + delete front_connection_mapping; + front_resolver=NULL; + front_connection_mapping=NULL; + } else { + delete front_resolver; + for(auto it:*front_connection_mapping) delete it.second; + delete front_connection_mapping; + front_resolver=new SHC(dim,AggresiveAgglomeration,NoDrift,true); + front_resolver->switchDecay(false); + front_connection_mapping=new unordered_map*>(); + } + } + } + } + return true; +} + +MatrixXd *SHC_Component::chooseUniOrMulti() { + if(inverted) return cov; + if(outlier) return vv; + MatrixXd temp(dim,dim); + temp.setZero(); + for(int i=0;icols();i++) + if((*cov)(i,i)!=(float)0.0) temp(i,i)=(*cov)(i,i); + return new MatrixXd(temp.inverse()); +} + +double SHC_Component::mahalanobisDistance(VectorXd *newElement) { + if(newElement==NULL) throw SHC_Component_Exception("New element must be defined"); + if(newElement->size()!=dim) { + throw SHC_Component_Exception("New element does not have the same dimension as the component"); + } + MatrixXd *icov=chooseUniOrMulti(); + VectorXd delta=*newElement-*mean; + double N0=delta.transpose()*(*icov*delta); + if(icov!=cov && icov!=vv) delete icov; 
+ return sqrtf(N0); +} + +void SHC_Component::print(ostream &o_str) { + o_str << "-=* Component " << getId() <<" *=-" << endl; + o_str << "Elements:" << elements << " Dimension:" << dim << " Inverted:" << (inverted ? "true":"false") << " Outlier:" << (outlier ? "true":"false") << endl; + o_str << "Mean:---" << endl << *mean << endl << "--------" << endl; + o_str << "Covariance:---" << endl << (inverted ? cov->inverse() : *cov) << endl << "--------------" << endl; +} + +VectorXd *SHC_Component::getMean() { + return mean; +} + +void SHC_Component::setMean(VectorXd *new_mean) { + delete mean; + mean=new_mean; +} + +MatrixXd *SHC_Component::getCovariance() { + if(!inverted) return new MatrixXd(*cov); + else return new MatrixXd(cov->inverse()); +} + +void SHC_Component::setCovariance(MatrixXd *new_cov,bool new_inverted) { + delete cov; + if(new_inverted) cov=new MatrixXd(new_cov->inverse()); + else cov=new MatrixXd(*new_cov); + inverted=new_inverted; +} + +bool SHC_Component::isCovarianceInverted() { + return inverted; +} + +void SHC_Component::setObsolete(bool obsolete) { + this->obsolete=obsolete; +} + +bool SHC_Component::isObsolete() { + return obsolete; +} + +bool SHC_Component::isBlocked() { + return blocked; +} + +void SHC_Component::setBlocked(bool blocked) { + this->blocked=blocked; +} + +long SHC_Component::getElements() { + return elements; +} + +void SHC_Component::setElements(long new_elements) { + this->elements=new_elements; +} + +double SHC_Component::connectionMeasure(SHC_Component *comp,double theta) { + double md1=mahalanobisDistance(comp->getMean()); + double p1=theta/md1; + double md2=comp->mahalanobisDistance(this->getMean()); + double p2=theta/md2; + if((p1+p2)==0) return (double)0.0; + return 1/(p1+p2); +} + +double SHC_Component::euclideanDistance(SHC_Component *comp) { + VectorXd diff=*(comp->getMean())-*mean; + double diff_sn=diff.squaredNorm(); + return diff_sn; +} + +unordered_map SHC_Component::fetchChildComponents() { + unordered_map res; 
+ res[getId()]=this; + for(auto child:children) { + if(typeid(*child.second)==typeid(SHC_Component)) { + SHC_Component *child_comp=static_cast(child.second); + unordered_map t1=child_comp->fetchChildComponents(); + res.insert(t1.begin(),t1.end()); + } else { + unordered_map t1=child.second->fetchChildComponents(); + res.insert(t1.begin(),t1.end()); + } + } + return res; +} + +SHC_Component::SHC_Component(SHC_Containable_Set *set,Eigen::MatrixXd *initDataSet, Eigen::VectorXd *virtualVariance,double cbVarianceLimit,long cbNLimit, + float driftCheckingSizeRatio,float driftMovementMDThetaRatio,int decayPace,float componentFormingMinVVRatio, + float componentBlockingLimitVVRatio) : SHC_Containable(set) { + if(set!=NULL) set->addComponent(this); + if(initDataSet==NULL) throw SHC_Component_Exception("Initial dataset must be defined"); + if(initDataSet->rows()<1) throw SHC_Component_Exception("Initial dataset must have at least one data element (row)"); + VectorXd firstElement=initDataSet->row(0); + this->dim=firstElement.size(); + if(virtualVariance!=NULL && virtualVariance->size()!=dim) throw SHC_Component_Exception("Virtual variance vector does not have the same dimension as the new element"); + this->cov=new MatrixXd(MatrixXd::Zero(dim,dim)); + this->mean=new VectorXd(firstElement); + MatrixXd tmp(dim,dim); + tmp.setZero(); + if(virtualVariance==NULL) + for(int i=0;ivv_value=(*virtualVariance)(0); + } + this->vv=new MatrixXd(tmp.inverse()); + this->elements=1; + this->cbVarianceLimit=cbVarianceLimit; + this->cbNLimit=cbNLimit; + this->driftCheckingSizeRatio=driftCheckingSizeRatio; + this->driftMovementMDThetaRatio=driftMovementMDThetaRatio; + this->componentFormingMinVVRatio=componentFormingMinVVRatio; + this->componentBlockingLimitVVRatio=componentBlockingLimitVVRatio; + for(int i=1;irows();i++) { + VectorXd el=initDataSet->row(i); + addNewElement(&el,0,NULL); + } + this->decay_pace=decayPace; + if(decay_pace>0 && cbNLimit>0) this->decay_handler=new 
SHC_Count_Decay(decay_pace*cbNLimit); +} + +SHC_Component::SHC_Component(SHC_Containable_Set *set,VectorXd *mean,MatrixXd *covariance,bool inverted,long elements, + VectorXd *virtualVariance) : SHC_Containable(set) { + if(set!=NULL) set->addComponent(this); + if(mean==NULL || covariance==NULL) throw SHC_Component_Exception("Initial component parameters (mean,covariance) must be defined"); + this->dim=mean->size(); + if(virtualVariance!=NULL && virtualVariance->size()!=dim) throw SHC_Component_Exception("Virtual variance vector does not have the same dimension as the new element"); + this->cov=new MatrixXd(*covariance); + if(inverted) this->inverted=true; + else if(isCovInvertible()) { + MatrixXd *tcov=this->cov; + this->cov=new MatrixXd(tcov->inverse()); + this->inverted=true; + delete tcov; + } + this->mean=new VectorXd(*mean); + MatrixXd tmp(dim,dim); + tmp.setZero(); + if(virtualVariance==NULL) + for(int i=0;ivv_value=(*virtualVariance)(0); + for(int i=0;ivv=new MatrixXd(tmp.inverse()); + this->elements=elements; + this->outlier=elements==1; +} + +SHC_Component::SHC_Component(SHC_Containable_Set *set,SHC_Component *cloneFrom) : SHC_Containable(set) { + if(set!=NULL) set->addComponent(this); + if(cloneFrom==NULL) throw SHC_Component_Exception("Cloning component from undefined component"); + this->source_node=cloneFrom->source_node; + this->dim=cloneFrom->dim; + this->cov=new MatrixXd(*cloneFrom->cov); + if(cloneFrom->inverted) this->inverted=true; + else if(isCovInvertible()) { + MatrixXd *tcov=this->cov; + this->cov=new MatrixXd(tcov->inverse()); + this->inverted=true; + delete tcov; + } + this->mean=new VectorXd(*cloneFrom->mean); + this->vv=new MatrixXd(*cloneFrom->vv); + this->vv_value=cloneFrom->vv_value; + this->elements=cloneFrom->elements; + this->outlier=elements==1; + if(cloneFrom->baseline!=NULL) this->baseline=cloneFrom->baseline->clone(NULL,true); + if(cloneFrom->front_resolver!=NULL) this->front_resolver=new SHC(cloneFrom->front_resolver); + 
this->cbVarianceLimit=cloneFrom->cbVarianceLimit; + this->cbNLimit=cloneFrom->cbNLimit; + this->obsolete=cloneFrom->obsolete; + this->driftCheckingSizeRatio=cloneFrom->driftCheckingSizeRatio; + this->driftMovementMDThetaRatio=cloneFrom->driftMovementMDThetaRatio; + this->componentFormingMinVVRatio=cloneFrom->componentFormingMinVVRatio; + this->blocked=cloneFrom->blocked; + this->decay_pace=cloneFrom->decay_pace; + if(cloneFrom->decay_handler!=NULL) this->decay_handler=cloneFrom->decay_handler->clone(); + this->trace.insert(cloneFrom->trace.begin(),cloneFrom->trace.end()); +} + +SHC_Component_Details *SHC_Component::calculateDetails(double theta) { + return calculateDetails(theta, 0, 1, cbVarianceLimit, cbNLimit, true); +} + +SHC_Component_Details *SHC_Component::calculateDetails(double theta,int dim1,int dim2,double cbVarianceLimit,double cbNLimit,bool single) { + if(this->dim!=2) throw SHC_Component_Exception("Cannot calculate component details, the number of dimensions is not exactly 2"); + double R=0; + int inc=5,vno=(!single ? 
(int)180/inc : (int)360/inc),rindex=0; + MatrixXd *res1=NULL,*res2=NULL; + if(!single) { + res1=new MatrixXd(vno+1, 2); + for(int angle(0);angle<=180;angle+=inc) { + pair *r1=getEllipticalThresholdRadius(theta, angle, R, dim1, dim2); + (*res1)(rindex,0)=(*r1->second)(dim1); + (*res1)(rindex++,1)=(*r1->second)(dim2); + R=r1->first; + delete r1->second;delete r1; + } + rindex=0; + R=0; + res2=new MatrixXd(vno+1, 2); + for(int angle(0);angle>=-180;angle-=inc) { + pair *r2=getEllipticalThresholdRadius(theta, angle, R, dim1, dim2); + (*res2)(rindex,0)=(*r2->second)(dim1); + (*res2)(rindex++,1)=(*r2->second)(dim2); + R=r2->first; + delete r2->second;delete r2; + } + } else { + res1=new MatrixXd(vno+1, 2); + for(int angle(0);angle<=360;angle+=inc) { + pair *r1=getEllipticalThresholdRadius(theta, angle, R, dim1, dim2); + (*res1)(rindex,0)=(*r1->second)(dim1); + (*res1)(rindex++,1)=(*r1->second)(dim2); + R=r1->first; + delete r1->second;delete r1; + } + } + MatrixXd *tmp_cov=getCovariance(); + bool covTh=(tmp_cov->array()>cbVarianceLimit).any(); + delete tmp_cov; + bool nTh=elements>cbNLimit; + SHC_Component_Details *res=NULL; + if(!single) res=new SHC_Component_Details(this, res1, res2, covTh, nTh, obsolete, new VectorXd(*mean), (baseline!=NULL ? new VectorXd(*baseline->getMean()) : NULL)); + else res=new SHC_Component_Details(this, res1, covTh, nTh, obsolete, new VectorXd(*mean), (baseline!=NULL ? 
new VectorXd(*baseline->getMean()) : NULL)); + /*SHC_Component *bcomp=NULL; + if(front_resolver!=NULL && (bcomp=front_resolver->getBiggestComponent())!=NULL) + res->childDetails.push_back(bcomp->calculateDetails(theta, dim1, dim2, cbVarianceLimit, cbNLimit, single));*/ + if(front_resolver!=NULL) + for(pair it:*front_resolver->getComponents()) + if(it.second->getElements()>1) res->childDetails.push_back(it.second->calculateDetails(theta, dim1, dim2, cbVarianceLimit, cbNLimit, single)); + if(baseline) res->baselineDetails=baseline->calculateDetails(theta, dim1, dim2, cbVarianceLimit, cbNLimit, single); + return res; +} + +VectorXd *SHC_Component::getEllipticalCoordinates(double angle,double R,int dim1,int dim2) { + double radi=(2*M_PI*angle)/360; + double x=(double)R*cos(radi),y=(double)R*sin(radi); + VectorXd *el=new VectorXd(2); + (*el)(0)=x+(*mean)(dim1); + (*el)(1)=y+(*mean)(dim2); + return el; +} + +pair *SHC_Component::getEllipticalThresholdRadius(double theta,double angle,double initR,int dim1,int dim2) { + double R=initR; + VectorXd *pos=getEllipticalCoordinates(angle, R, dim1, dim2); + double md=mahalanobisDistance(pos); + if(md>=theta) { + while(md>=theta) { + R-=0.05; + delete pos; + pos=getEllipticalCoordinates(angle, R, dim1, dim2); + md=mahalanobisDistance(pos); + } + return new pair(R, pos); + } else { + while(md(R-0.05, pos); + } +} + +bool SHC_Component::isOutlier() { + return outlier; +} + +void SHC_Component::redirectComponent(SHC_Component *to_c) { + // check if we don't have a redirection loop... 
if we do, do not allow redirection to be assigned + if(to_c==NULL) { + redirect_to=NULL; + return; + } + if(this->redirect_to && this->redirect_to==to_c) return; + SHC_Component *rto=to_c; + while(rto!=NULL) { + if(rto==this) return; + rto=rto->getRedirectedComponent(); + } + if(this->redirect_to) cleanRedirection(); + this->redirect_to=to_c; + to_c->addFromRedirectedComponent(this); +} + +void SHC_Component::addFromRedirectedComponent(SHC_Component *from_c) { + if(redirected_from.find(from_c->getId())==redirected_from.end()) redirected_from[from_c->getId()]=from_c; +} +void SHC_Component::removeFromRedirectedComponent(SHC_Component *from_c) { + if(redirected_from.find(from_c->getId())!=redirected_from.end()) redirected_from.erase(from_c->getId()); +} +void SHC_Component::cleanRedirection() { + for(pair it:redirected_from) + it.second->redirectComponent(NULL); + redirected_from.erase(redirected_from.begin(),redirected_from.end()); +} + +bool SHC_Component::isRedirected() { + return redirect_to!=NULL; +} + +SHC_Component *SHC_Component::getRedirectedComponent() { + return redirect_to; +} + +SHC_Component *SHC_Component::getFinalRedirectedComponent() { + if(isRedirected()) return redirect_to->getFinalRedirectedComponent(); + else return this; +} + +bool SHC_Component::operator==(const SHC_Component &c) const { + return parent->getId()==c.parent->getId(); +} + +bool SHC_Component::hasBaseline() { + return baseline!=NULL; +} + +SHC_Component *SHC_Component::getBaseline() { + return baseline; +} + +SHC_Component *SHC_Component::clone(SHC_Containable_Set *set_forClone, bool retainId) { + SHC_Component *cloned_c=new SHC_Component(set_forClone,this); + if(retainId) cloned_c->setId(getId()); + return cloned_c; +} + +void SHC_Component::resetBaseline() { + if(this->baseline!=NULL) delete this->baseline; + this->baseline=NULL; +} + +void SHC_Component::setBaselineLimits(double cbVarianceLimit, long cbNLimit) { + this->cbVarianceLimit=cbVarianceLimit; + 
this->cbNLimit=cbNLimit; + if(decay_handler) decay_handler->reset(decay_pace*elements eventCallback) { + this->eventCallback=eventCallback; +} + +void SHC_Component::setDriftOptions(float sizeRatio,float movementRatio) { + this->driftCheckingSizeRatio=sizeRatio; + this->driftMovementMDThetaRatio=movementRatio; +} + +bool SHC_Component::decayPing() { + if(decay_handler) { + bool r=decay_handler->ping(); + if(r) setObsolete(true); + return r; + } + return false; +} + +void SHC_Component::switchDecay(bool decay,int decayPace) { + if(!decay && decay_handler) { + delete decay_handler; + decay_handler=NULL; + } + if(decay) decay_pace=decayPace; + if(decay && !decay_handler && decay_pace>0) { + decay_handler=new SHC_Count_Decay((outlier ? 20 : 1)*(decay_pace*elementsgetId(); + if(set!=NULL) set->removeContainer(&old_id); + forcesetId(id); + if(set!=NULL) set->addComponent(this); +} + +long SHC_Component::getDimensions() { + return this->dim; +} + +void SHC_Component::addNeighbor(SHC_Component *neighbor,SHC *parent,bool measure) { + if(neighbor==this) return; + if(!neighbor->isOutlier()) { + removeNeighbor(neighbor); + return; + } + if(measure && parent!=NULL) { + double md=mahalanobisDistance(neighbor->getMean()); + //parent->incCounter(); + if(md>2*parent->getTheta()) { + removeNeighbor(neighbor); + return; + } + } + if(neighborhood.find(neighbor)==neighborhood.end()) neighborhood.insert(neighbor); +} + +void SHC_Component::removeNeighbor(SHC_Component *neighbor) { + std::set::iterator it=neighborhood.find(neighbor); + if(it!=neighborhood.end()) neighborhood.erase(it); +} + +void SHC_Component::removeFromNeighborhood(SHC *parent) { + for(pair it:*parent->getComponents()) + it.second->removeNeighbor(this); +} + +void SHC_Component::findNeighbors(SHC *parent) { + if(agglo_out_blocker>0) return; + double th=parent->getTheta(); + vector *outs=parent->getOutliers(); + for(SHC_Component *out:*outs) { + double md=mahalanobisDistance(out->getMean()); + //parent->incCounter(); + 
if(md<=th) parent->agglomerateComponents(this,out); + else if(md>th && md<=2*th) addNeighbor(out); + } + delete outs; + if(neighborhood.size()==0) agglo_out_blocker=400; +} + +void SHC_Component::agglomerateNeighborhood(SHC *parent) { + if(this->outlier || this->blocked) return; + if(neighborhood.size()==0) findNeighbors(parent); + if(neighborhood.size()==0) return; + vector *an=new vector(); + double th=parent->getTheta(); + for(SHC_Component *neighbor:neighborhood) { + double md=mahalanobisDistance(neighbor->getMean()); + //parent->incCounter(); + if(md<=th) an->push_back(neighbor); + } + for(SHC_Component *neighbor:*an) { + parent->agglomerateComponents(this,neighbor); + removeNeighbor(neighbor); + } + delete an; +} + +set *SHC_Component::getNeighborhood() { + return &neighborhood; +} + +void SHC_Component::setSourceNode(string new_source_node) { + this->source_node=new_source_node; +} + +string SHC_Component::getSourceNode() { + return this->source_node; +} diff --git a/src/SHC/SHC_Component.hpp b/src/SHC/SHC_Component.hpp new file mode 100644 index 0000000..29eeca9 --- /dev/null +++ b/src/SHC/SHC_Component.hpp @@ -0,0 +1,153 @@ +#ifndef SHC_Component_hpp +#define SHC_Component_hpp + +#include +#include +#include +#include +#include +#include +#include "SHC_Container.hpp" +#include +#include "SHC_Decay.hpp" +using namespace Eigen; +using namespace std; + +class SHC_Count_Decay; +class SHC_Component_Exception : public exception { +private: + const char *msg=NULL; +public: + const char *what() const throw(); + SHC_Component_Exception(const char *msg); +}; + +const double DEFAULT_VIRTUAL_VARIANCE=0.7; + +class SHC_Component; +struct SHC_Component_Details { + MatrixXd *upper=NULL,*lower=NULL,*single=NULL; + bool covThreshold=false,nThreshold=false,obsolete=false; + VectorXd *mean=NULL,*baseline=NULL; + vector childDetails; + SHC_Component_Details *baselineDetails=NULL; + SHC_Component *parent=NULL; + SHC_Component_Details(SHC_Component *parent, MatrixXd *upper, 
MatrixXd *lower, bool covThreshold, bool nThreshold, bool obsolete, + VectorXd *mean=NULL, VectorXd *baseline=NULL) { + this->parent=parent; + this->upper=upper; + this->lower=lower; + this->covThreshold=covThreshold; + this->nThreshold=nThreshold; + this->obsolete=obsolete; + this->mean=mean; + this->baseline=baseline; + } + SHC_Component_Details(SHC_Component *parent, MatrixXd *single, bool covThreshold, bool nThreshold, bool obsolete, VectorXd *mean=NULL, + VectorXd *baseline=NULL) { + this->parent=parent; + this->single=single; + this->covThreshold=covThreshold; + this->nThreshold=nThreshold; + this->obsolete=obsolete; + this->mean=mean; + this->baseline=baseline; + } + ~SHC_Component_Details() { + if(upper!=NULL) delete upper; + if(lower!=NULL) delete lower; + if(single!=NULL) delete single; + if(mean!=NULL) delete mean; + if(baseline!=NULL) delete baseline; + for(SHC_Component_Details *det:childDetails) delete det; + if(baselineDetails) delete baselineDetails; + } +}; + +class SHC; +struct SHCEvent; +class SHC_Component : public SHC_Containable { +private: + string source_node="master"; + SHC_Component *redirect_to=NULL,*baseline=NULL; + unordered_map redirected_from; + SHC *front_resolver=NULL; + unordered_map*> *front_connection_mapping=NULL; + std::set neighborhood; + VectorXd *mean=NULL; + int decay_pace=10,agglo_out_blocker=0; + long elements=0,dim=0,cbNLimit=40; + double cbVarianceLimit=10.0; + float driftCheckingSizeRatio=1.0,driftMovementMDThetaRatio=0.5,componentFormingMinVVRatio=0.1,componentBlockingLimitVVRatio=0.0; + bool inverted=false,outlier=true,obsolete=false,blocked=false; + MatrixXd *cov=NULL,*vv=NULL; + double vv_value=DEFAULT_VIRTUAL_VARIANCE; + bool isCovInvertible(); + MatrixXd *chooseUniOrMulti(); + VectorXd *getEllipticalCoordinates(double angle,double R,int dim1,int dim2); + pair *getEllipticalThresholdRadius(double theta,double angle,double initR,int dim1,int dim2); + function eventCallback; + SHC_Count_Decay *decay_handler=NULL; + 
void cleanRedirection(); +protected: +public: + SHC_Component(SHC_Containable_Set *set,VectorXd *newElement,VectorXd *virtualVariance=NULL,double cbVarianceLimit=10.0,long cbNLimit=40, + float driftCheckingSizeRatio=1.0,float driftMovementMDThetaRatio=0.5,int decayPace=10,float componentFormingMinVVRatio=0.1, + float componentBlockingLimitVVRatio=0.0); + SHC_Component(SHC_Containable_Set *set,MatrixXd *initDataSet,VectorXd *virtualVariance=NULL,double cbVarianceLimit=10.0,long cbNLimit=40, + float driftCheckingSizeRatio=1.0,float driftMovementMDThetaRatio=0.5,int decayPace=10,float componentFormingMinVVRatio=0.1, + float componentBlockingLimitVVRatio=0.0); + SHC_Component(SHC_Containable_Set *set,VectorXd *mean,MatrixXd *covariance,bool inverted,long elements, + VectorXd *virtualVariance=NULL); + SHC_Component(SHC_Containable_Set *set,SHC_Component *cloneFrom); + ~SHC_Component(); + bool addNewElement(VectorXd *newElement,SHC *parent_resolver,vector> *classified_map=NULL); + double mahalanobisDistance(VectorXd *newElement); + double euclideanDistance(SHC_Component *comp); + double connectionMeasure(SHC_Component *comp,double theta); + void print(ostream &o_str); + VectorXd *getMean(); + void setMean(VectorXd *new_mean); + MatrixXd *getCovariance(); + void setCovariance(MatrixXd *new_cov,bool new_inverted=false); + long getElements(); + void setElements(long new_elements); + bool isOutlier(); + bool isCovarianceInverted(); + void setObsolete(bool obsolete); + bool isObsolete(); + bool isBlocked(); + void setBlocked(bool blocked); + bool hasBaseline(); + SHC_Component *getBaseline(); + unordered_map fetchChildComponents(); + SHC_Component_Details *calculateDetails(double theta,int dim1,int dim2,double cbVarianceLimit,double cbNLimit,bool single=false); + SHC_Component_Details *calculateDetails(double theta); + void redirectComponent(SHC_Component *to_c); + bool isRedirected(); + SHC_Component *getRedirectedComponent(),*getFinalRedirectedComponent(); + void 
addFromRedirectedComponent(SHC_Component *from_c); + void removeFromRedirectedComponent(SHC_Component *from_c); + bool operator==(const SHC_Component &c) const; + SHC_Component *clone(SHC_Containable_Set *set_forClone=NULL, bool retainId=false); + void resetBaseline(); + void setBaselineLimits(double cbVarianceLimit,long cbNLimit); + SHC *getFrontResolver(); + void setEventCallback(function eventCallback); + void setDriftOptions(float sizeRatio=1.0,float movementRatio=0.5); + bool decayPing(); + void switchDecay(bool decay,int decayPace); + SHC_Count_Decay *getDecayHandler(); + void setId(string id); + long getDimensions(); + void addNeighbor(SHC_Component *neighbor,SHC *parent=NULL,bool measure=false); + void removeNeighbor(SHC_Component *neighbor); + void findNeighbors(SHC *parent); + void agglomerateNeighborhood(SHC *parent); + void removeFromNeighborhood(SHC *parent); + std::set *getNeighborhood(); + void setSourceNode(string source_node); + string getSourceNode(); +}; + +#endif /* SHC_Component_hpp */ diff --git a/src/SHC/SHC_ComponentConnection b/src/SHC/SHC_ComponentConnection new file mode 100644 index 0000000..c20ecf9 --- /dev/null +++ b/src/SHC/SHC_ComponentConnection @@ -0,0 +1 @@ +#include "SHC_ComponentConnection.hpp" diff --git a/src/SHC/SHC_ComponentConnection.cpp b/src/SHC/SHC_ComponentConnection.cpp new file mode 100644 index 0000000..49a5695 --- /dev/null +++ b/src/SHC/SHC_ComponentConnection.cpp @@ -0,0 +1,171 @@ +#include "SHC_ComponentConnection.hpp" +#include +#include +#include "SHC_Utils.hpp" +using namespace std; + +SHC_ComponentConnection::SHC_ComponentConnection(SHC_Component *c1, SHC_Component *c2) { + this->id=generate_hex(10); + this->c1=c1; + this->c2=c2; +} + +void SHC_ComponentConnection::addPoint() { + points++; +} + +bool SHC_ComponentConnection::operator==(const SHC_ComponentConnection &cc) const { + return id==cc.id; +} + +SHC_Component *SHC_ComponentConnection::getConnectedComponent(SHC_Component *c) { + if(*c==*c1) return c2; 
+ else return c1; +} + +SHC_ComponentConnection *SHC_ComponentConnection_Set::connect(SHC_Component *c1, SHC_Component *c2, bool add_point) { + SHC_ComponentConnection *cc=NULL; + if(!isConnected(c1, c2, true)) { + cc=new SHC_ComponentConnection(c1, c2); + if(!add_point) cc->setPoints(0); + _set[{c1->getId(), c2->getId()}]=cc; + } else { + cc=_set[{c1->getId(), c2->getId()}]; + if(add_point) cc->addPoint(); + } + if(_links.find(c1->getId())==_links.end()) _links[c1->getId()]=new set(); + _links[c1->getId()]->insert(c2); + if(_links.find(c2->getId())==_links.end()) _links[c2->getId()]=new set(); + _links[c2->getId()]->insert(c1); + return cc; +} + +bool SHC_ComponentConnection_Set::isConnected(SHC_Component *c1, SHC_Component *c2, bool directly) { + if(_set.find({c1->getId(), c2->getId()})!=_set.end()) return true; + if(!directly) { + set *_cs=new set(); + _getConnectedSet(c1, _cs); + bool res=_cs->find(c2)!=_cs->end(); + delete _cs; + return res; + } + return false; +} + +set *SHC_ComponentConnection_Set::getConnectedComponents(SHC_Component *c) { + set *res=new set(); + if(_links.find(c->getId())!=_links.end()) { + set *tmp=_links[c->getId()]; + res->insert(tmp->begin(), tmp->end()); + } + return res; +} + +SHC_Component *SHC_ComponentConnection_Set::getMaxConnectedComponents(SHC_Component *c) { + set *cc=getConnectedComponents(c); + long biggest=0; + SHC_Component *res=NULL; + for(SHC_Component *c:*cc) { + if(c->getElements()>biggest) { + biggest=c->getElements(); + res=c; + } + } + delete cc; + return res; +} + +vector *>> *SHC_ComponentConnection_Set::removeComponent(SHC_Component *c) { + set *_in_set=new set(); + _getConnectedSet(c, _in_set); + if(_in_set->find(c)!=_in_set->end()) _in_set->erase(c); + + set *ccomps=getConnectedComponents(c); + set *cc_set=_links[c->getId()]; + _links.erase(c->getId()); + delete cc_set; + for(SHC_Component *ccomp:*ccomps) { + if(_set.find({c->getId(), ccomp->getId()})!=_set.end()) { + SHC_ComponentConnection 
*cc=_set[{c->getId(), ccomp->getId()}]; + _set.erase({c->getId(), ccomp->getId()}); + delete cc; + } + if(_links.find(ccomp->getId())!=_links.end()) + _links[ccomp->getId()]->erase(c); + } + + vector *>> *res=calculatePartitions(_in_set); + delete _in_set;delete ccomps; + return res; +} + +void SHC_ComponentConnection_Set::_getConnectedSet(SHC_Component *_from_c, set *_in_set) { + set *_d_c=getConnectedComponents(_from_c); + for(auto cit:*_d_c) { + if(_in_set->find(cit)==_in_set->end()) { + _in_set->insert(cit); + _getConnectedSet(cit, _in_set); + } + } + delete _d_c; +} + +vector *>> *SHC_ComponentConnection_Set::calculatePartitions(set *_in_set) { + vector *>> *res=new vector *>>(); + while(_in_set->size()>0) { + set *_part=new set(); + auto elem=_in_set->begin(); + _part->insert(*elem); + _in_set->erase(*elem); + while(_partition2(_part, _in_set)); + long tel=0; + for(SHC_Component *pc:*_part) tel+=pc->getElements(); + res->push_back(pair *>(tel, _part)); + } + return res; +} + +bool SHC_ComponentConnection_Set::_partition2(set *_is1, set *_is2) { + for(SHC_Component *c1:*_is1) { + for(SHC_Component *c2:*_is2) { + if(isConnected(c1, c2)) { + _is1->insert(c2); + _is2->erase(c2); + return true; + } + } + } + return false; +} + +set *SHC_ComponentConnection_Set::getConnections() { + set *res=new set(); + for(auto it:_set) + res->insert(it.second); + return res; +} + +SHC_Component *SHC_ComponentConnection::getComponent1() { + return c1; +} + +SHC_Component *SHC_ComponentConnection::getComponent2() { + return c2; +} + +long SHC_ComponentConnection::getPoints() { + return this->points; +} + +void SHC_ComponentConnection::setPoints(long v) { + this->points=v; +} + +void SHC_ComponentConnection::addPoints(long v) { + this->points+=v; +} + +SHC_ComponentConnection_Set::~SHC_ComponentConnection_Set() { + for(auto cc_it:_set) delete cc_it.second; + for(auto lks_it:_links) delete lks_it.second; +} diff --git a/src/SHC/SHC_ComponentConnection.hpp 
b/src/SHC/SHC_ComponentConnection.hpp new file mode 100644 index 0000000..fc8118f --- /dev/null +++ b/src/SHC/SHC_ComponentConnection.hpp @@ -0,0 +1,64 @@ +#ifndef SHC_ComponentConnection_hpp +#define SHC_ComponentConnection_hpp + +#include "SHC_Component.hpp" +#include +#include +using namespace std; + +template struct SHC_ComponentConnectionId +{ + T1 x; + T2 y; + SHC_ComponentConnectionId(T1 x, T2 y) { + this->x = x; + this->y = y; + } + bool operator==(const SHC_ComponentConnectionId &cc) const { + return (x==cc.x && y==cc.y) || (x==cc.y && y==cc.x); + } +}; + +struct SHC_ComponentConnection_hash { + template + size_t operator() (const SHC_ComponentConnectionId &cc) const { + return hash()(cc.x)^hash()(cc.y); + } +}; + +class SHC_ComponentConnection { +private: + string id; + SHC_Component *c1=NULL,*c2=NULL; + long points=1; +public: + SHC_ComponentConnection(SHC_Component *c1, SHC_Component *c2); + void addPoint(); + bool operator==(const SHC_ComponentConnection &cc) const; + SHC_Component *getConnectedComponent(SHC_Component *c); + SHC_Component *getComponent1(); + SHC_Component *getComponent2(); + long getPoints(); + void setPoints(long v); + void addPoints(long v); +}; + +class SHC_ComponentConnection_Set { +private: + unordered_map, SHC_ComponentConnection *, SHC_ComponentConnection_hash> _set; + unordered_map *> _links; + void _getConnectedSet(SHC_Component *_from_c,set *_in_set); + vector *>> *calculatePartitions(set *_in_set); + bool _partition2(set *_is1, set *_is2); +public: + SHC_ComponentConnection_Set()=default; + ~SHC_ComponentConnection_Set(); + SHC_ComponentConnection *connect(SHC_Component *c1, SHC_Component *c2, bool add_point=true); + bool isConnected(SHC_Component *c1, SHC_Component *c2, bool directly=false); + set *getConnectedComponents(SHC_Component *c); + SHC_Component *getMaxConnectedComponents(SHC_Component *c); + vector *>> *removeComponent(SHC_Component *c); + set *getConnections(); +}; + +#endif /* SHC_ComponentConnection_hpp */ 
diff --git a/src/SHC/SHC_Container b/src/SHC/SHC_Container new file mode 100644 index 0000000..0007bb9 --- /dev/null +++ b/src/SHC/SHC_Container @@ -0,0 +1 @@ +#include "SHC_Container.hpp" diff --git a/src/SHC/SHC_Container.cpp b/src/SHC/SHC_Container.cpp new file mode 100644 index 0000000..b9f0dda --- /dev/null +++ b/src/SHC/SHC_Container.cpp @@ -0,0 +1,191 @@ +#include "SHC_Container.hpp" +#include "SHC_Utils.hpp" +#include "SHC_Component.hpp" +#include +#include +using namespace std; + +SHC_Containable::SHC_Containable(SHC_Containable_Set *set) { + id=generate_hex(10); + if(set!=NULL) { + this->set=set; + if(typeid(*this)==typeid(SHC_Containable)) this->set->addContainer(this); + } +} + +SHC_Containable::SHC_Containable(SHC_Containable_Set *set, SHC_Containable *initChild):SHC_Containable(set) { + addChild(initChild); +} + +SHC_Containable::~SHC_Containable() { + if(set!=NULL) set->removeContainer(&id); +} + +string SHC_Containable::getId() { + return id; +} + +SHC_Containable *SHC_Containable::getParent() { + return parent; +} + +void SHC_Containable::setParent(SHC_Containable *parent) { + this->parent=parent; +} + +SHC_Containable *SHC_Containable::findTop() { + SHC_Containable *x=this; + while(x->getParent()!=NULL) + x=x->getParent(); + return x; +} + +void SHC_Containable::addChild(SHC_Containable *child) { + if(children.find(child->getId())==children.end()) { + container_m.lock(); + children[child->getId()]=child; + if(child->getParent()!=NULL) + child->getParent()->removeChild(child); + child->setParent(this); + container_m.unlock(); + } +} + +void SHC_Containable::removeChild(SHC_Containable *child) { + if(children.find(child->getId())!=children.end()) { + container_m.lock(); + children.erase(child->getId()); + child->setParent(NULL); + container_m.unlock(); + } +} + +void SHC_Containable::detachFromParent() { + if(parent!=NULL) { + container_m.lock(); + SHC_Containable *temp=parent; + temp->removeChild(this); + if(temp->children.size()==0) { + 
temp->detachFromParent(); + delete temp; + } + container_m.unlock(); + } +} + +void SHC_Containable::attachToParent(SHC_Containable *parent) { + detachFromParent(); + container_m.lock(); + parent->addChild(this); + container_m.unlock(); +} + +unordered_map SHC_Containable::fetchChildComponents() { + unordered_map res; + for(auto child:children) { + if(typeid(*child.second)==typeid(SHC_Component)) { + SHC_Component *child_comp=static_cast(child.second); + unordered_map t1=child_comp->fetchChildComponents(); + res.insert(t1.begin(),t1.end()); + } else { + unordered_map t1=child.second->fetchChildComponents(); + res.insert(t1.begin(),t1.end()); + } + } + return res; +} + +double SHC_Containable::calculateSeparation(SHC_Containable *cont, double theta) { + unordered_map comps1=this->fetchChildComponents(), comps2=cont->fetchChildComponents(); + SHC_Component *minc1=NULL, *minc2=NULL; + double *min=NULL; + for(auto comp1:comps1) { + SHC_Component *c1=static_cast(comp1.second); + for(auto comp2:comps2) { + SHC_Component *c2=static_cast(comp2.second); + double emesaure=c1->euclideanDistance(c2); + if(min==NULL || emesaure<*min) { + min=new double(emesaure); + minc1=c1; + minc2=c2; + } + } + } + delete min; + return minc1->connectionMeasure(minc2, theta); +} + +SHC_Containable *SHC_Containable::fetchTopContainer() { + if(parent!=NULL) return parent->fetchTopContainer(); + else return this; +} + +bool SHC_Containable::hasChildren() { + return children.size()>0; +} + +void SHC_Containable_Set::addContainer(SHC_Containable *container) { + if(_set.find(container->getId())==_set.end()) _set[container->getId()]=container; +} +void SHC_Containable_Set::addComponent(SHC_Component *comp) { + if(_set.find(comp->getId())==_set.end()) _set[comp->getId()]=comp; +} + +void SHC_Containable_Set::removeContainer(string *id) { + _set.erase(*id); +} + +SHC_Containable_Set::~SHC_Containable_Set() { + set *cs=new set(); + for(pair cont_it:_set) cs->insert(cont_it.first); + for(string c_id:*cs) 
{ + SHC_Containable *cont=fetchContainer(&c_id); + if(cont!=NULL) delete cont; + } + delete cs; +} + +unordered_map *SHC_Containable_Set::getAllContainers() { + return &_set; +} + +SHC_Containable *SHC_Containable_Set::fetchContainer(string *id) { + if(_set.find(*id)!=_set.end()) return _set[*id]; + return NULL; +} + +void SHC_Containable::forcesetId(string id) { + this->id=id; +} + +void SHC_Containable::setId(string id) { + if(set!=NULL) set->removeContainer(&this->id); + forcesetId(id); + if(set!=NULL) set->addContainer(this); +} + +unordered_map *SHC_Containable::getChildren() { + return &children; +} + +void SHC_Containable::addTrace(SHC_Containable *previous) { + trace.insert(previous->trace.begin(),previous->trace.end()); + trace.insert(previous->getId()); +} + +void SHC_Containable::addTrace(pair> *prev_pair) { + trace.insert(prev_pair->second.begin(),prev_pair->second.end()); + trace.insert(prev_pair->first); +} + +void SHC_Containable::addTrace(std::set *prev_log) { + trace.insert(prev_log->begin(),prev_log->end()); +} + +bool SHC_Containable::checkTrace(string id) { + return trace.find(id)!=trace.end(); +} + +set *SHC_Containable::getTrace() { + return &trace; +} diff --git a/src/SHC/SHC_Container.hpp b/src/SHC/SHC_Container.hpp new file mode 100644 index 0000000..962e71c --- /dev/null +++ b/src/SHC/SHC_Container.hpp @@ -0,0 +1,61 @@ +#ifndef SHC_Container_hpp +#define SHC_Container_hpp + +#include +#include +#include +#include +using namespace std; + +class SHC_Containable; +class SHC_Component; +class SHC_Containable_Set { +private: + unordered_map _set; +public: + SHC_Containable_Set()=default; + ~SHC_Containable_Set(); + void addContainer(SHC_Containable *container); + void addComponent(SHC_Component *comp); + void removeContainer(string *id); + SHC_Containable *fetchContainer(string *id); + unordered_map *getAllContainers(); +}; + +class SHC_Containable { +private: + string id; +protected: + mutex container_m; + SHC_Containable *parent=NULL; + 
unordered_map children; + SHC_Containable_Set *set=NULL; + std::set trace; + void forcesetId(string id); +public: + SHC_Containable(); + SHC_Containable(SHC_Containable_Set *set); + SHC_Containable(SHC_Containable_Set *set, SHC_Containable *initChild); + virtual ~SHC_Containable(); + string getId(); + void setId(string id); + SHC_Containable* getParent(); + void setParent(SHC_Containable* parent); + SHC_Containable* findTop(); + void addChild(SHC_Containable* child); + void removeChild(SHC_Containable* child); + unordered_map *getChildren(); + void detachFromParent(); + void attachToParent(SHC_Containable *parent); + unordered_map fetchChildComponents(); + double calculateSeparation(SHC_Containable *cont, double theta); + SHC_Containable *fetchTopContainer(); + bool hasChildren(); + void addTrace(SHC_Containable *previous); + void addTrace(pair> *prev_pair); + void addTrace(std::set *prev_log); + bool checkTrace(string id); + std::set *getTrace(); +}; + +#endif /* SHC_Container_hpp */ diff --git a/src/SHC/SHC_Decay b/src/SHC/SHC_Decay new file mode 100644 index 0000000..1bd1d69 --- /dev/null +++ b/src/SHC/SHC_Decay @@ -0,0 +1 @@ +#include "SHC_Decay.hpp" diff --git a/src/SHC/SHC_Decay.cpp b/src/SHC/SHC_Decay.cpp new file mode 100644 index 0000000..dc66ba7 --- /dev/null +++ b/src/SHC/SHC_Decay.cpp @@ -0,0 +1,38 @@ +#include "SHC_Decay.hpp" + +SHC_Decay::SHC_Decay() { +} +void SHC_Decay::reset() {} + +SHC_Count_Decay::SHC_Count_Decay(long ceil_count):SHC_Decay() { + this->ceil_count=ceil_count; + this->counter=ceil_count; +} + +SHC_Count_Decay::~SHC_Count_Decay()=default; + +void SHC_Count_Decay::reset() { + counter=ceil_count; +} +void SHC_Count_Decay::reset(long ceil_count) { + this->ceil_count=ceil_count; + this->counter=ceil_count; +} +bool SHC_Count_Decay::ping() { + counter--; + return counter<0; +} + +SHC_Count_Decay *SHC_Count_Decay::clone() { + SHC_Count_Decay *cc=new SHC_Count_Decay(ceil_count); + cc->counter=this->counter; + return cc; +} + +long 
SHC_Count_Decay::getCeilingCount() { + return ceil_count; +} + +long SHC_Count_Decay::getCurrentPos() { + return counter; +} diff --git a/src/SHC/SHC_Decay.hpp b/src/SHC/SHC_Decay.hpp new file mode 100644 index 0000000..bc1acc7 --- /dev/null +++ b/src/SHC/SHC_Decay.hpp @@ -0,0 +1,34 @@ +#ifndef SHC_Decay_hpp +#define SHC_Decay_hpp + +#include +#include +#include +#include "SHC_Component.hpp" +using namespace Eigen; +using namespace std; + +class SHC_Component; +class SHC; +class SHC_Decay { +public: + SHC_Decay(); + virtual void reset(); + virtual ~SHC_Decay()=default; +}; + +class SHC_Count_Decay : public SHC_Decay { +private: + long ceil_count,counter; +public: + SHC_Count_Decay(long ceil_count); + ~SHC_Count_Decay(); + void reset(); + void reset(long ceil_count); + bool ping(); + SHC_Count_Decay *clone(); + long getCeilingCount(); + long getCurrentPos(); +}; + +#endif /* SHC_Decay_hpp */ diff --git a/src/SHC/SHC_DeltaLogger.cpp b/src/SHC/SHC_DeltaLogger.cpp new file mode 100644 index 0000000..7999900 --- /dev/null +++ b/src/SHC/SHC_DeltaLogger.cpp @@ -0,0 +1,127 @@ +#include "SHC_DeltaLogger.hpp" + +DL_Component *DeltaLogger::logComponent(SHC_Component *logged_comp,string source,int init_delta_elements) { + DL_Component *c=NULL; + string id=logged_comp->getId(); + if(c_delta.find(id)==c_delta.end()) { + c=new DL_Component(id,logged_comp->getParent()->getId()); + c->source=source; + c->delta_elements[source]=init_delta_elements; + if(!logged_comp->isOutlier()) c->start=new DL_Component_SimpleData(logged_comp); + else c->created=true; + c_delta[id]=c; + } else { + c=c_delta[id]; + if(c->delta_elements.find(source)==c->delta_elements.end()) + c->delta_elements[source]=init_delta_elements; + } + return c; +} + +void DeltaLogger::addComponentDeltaElements(SHC_Component *logged_comp,string source,int number) { + DL_Component *c=logComponent(logged_comp,source); + c->delta_elements[source]=c->delta_elements[source]+number; +} + +void 
DeltaLogger::finalizeComponent(SHC_Component *logged_comp) { + DL_Component *c=NULL; + if(c_delta.find(logged_comp->getId())!=c_delta.end()) { + c=c_delta[logged_comp->getId()]; + if(c->end) delete c->end; + c->end=new DL_Component_SimpleData(logged_comp); + if(logged_comp->isRedirected()) { + if(c->redirectedTo) delete c->redirectedTo; + c->redirectedTo=new string(logged_comp->getRedirectedComponent()->getId()); + } + if(logged_comp->hasBaseline()) c->baseline=new DL_Component_SimpleData(logged_comp->getBaseline()); + for(SHC_Component *o:*logged_comp->getNeighborhood()) c->neighborhood.insert(o->getId()); + for(string t_id:*logged_comp->getTrace()) c->trace.insert(t_id); + c->outlier=logged_comp->isOutlier(); + c->obsolete=logged_comp->isObsolete(); + c->blocked=logged_comp->isBlocked(); + c->parent_id=logged_comp->getParent()->getId(); + } +} + +void DeltaLogger::logComponentConnection(SHC_ComponentConnection *logged_cc) { + DL_ComponentConnection *cc=NULL; + string id1=logged_cc->getComponent1()->getId(),id2=logged_cc->getComponent2()->getId(); + if(cc_delta.find({id1,id2})!=cc_delta.end()) + cc=cc_delta[{id1,id2}]; + else { + cc=new DL_ComponentConnection(logged_cc); + cc_delta[{id1,id2}]=cc; + } + ++cc->points; +} + +void DeltaLogger::logComponentRemoval(SHC_Component *removed_comp) { + string rid=removed_comp->getId(); + cr_delta.insert(rid); + for(pair it:c_delta) { + if(it.second->redirectedTo && *it.second->redirectedTo==rid) { + delete it.second->redirectedTo; + it.second->redirectedTo=NULL; + } + it.second->neighborhood.erase(rid); + } + if(c_delta.find(rid)!=c_delta.end()) { + delete c_delta[rid]; + c_delta.erase(rid); + } + vector> removals; + for(pair,DL_ComponentConnection*> it:cc_delta) { + if(it.second->c1_id==rid || it.second->c2_id==rid) + removals.push_back(it.first); + } + for(SHC_ComponentConnectionId it:removals) { + delete cc_delta[it]; + cc_delta.erase(it); + } +} + +DeltaLogger::~DeltaLogger() { + for(pair it:c_delta) + delete it.second; 
+ for(pair,DL_ComponentConnection*> it:cc_delta) + delete it.second; +} + +void DeltaLogger::print(ostream &o_str,string source) { + o_str << "+++++++++++++++++++++++++++++++++++++++++++++++++++" << endl; + o_str << "++ Delta log " << source << endl; + o_str << "+++++++++++++++++++++++++++++++++++++++++++++++++++" << endl; + unordered_map parents,totals; + long total_end=0; + for(pair it1:c_delta) { + for(pair it2:it1.second->delta_elements) { + if(totals.find(it2.first)==totals.end()) totals[it2.first]=it2.second; + else totals[it2.first]=totals[it2.first]+it2.second; + } + if(parents.find(it1.second->parent_id)==parents.end()) parents[it1.second->parent_id]=it1.second->end->elements; + else parents[it1.second->parent_id]=parents[it1.second->parent_id]+it1.second->end->elements; + o_str << "Add/Update " << it1.first << " "; + o_str << "total:" << it1.second->end->elements << " "; + for(pair it3:it1.second->delta_elements) + o_str << it3.first << ":" << it3.second << " "; + o_str << endl; + o_str << "\t\t rto:" << (it1.second->redirectedTo ? 
*it1.second->redirectedTo : "none") << " pid:" << it1.second->parent_id << " src:" << it1.second->source << endl; + total_end+=it1.second->end->elements; + } + for(string it2:cr_delta) { + o_str << "Remove " << it2 << endl; + } + for(pair it:parents) { + o_str << "Parent " << it.first << " elements " << it.second << endl; + } + long total=0; + for(pair it:totals) { + o_str << "Source " << it.first << " points " << it.second << endl; + total+=it.second; + } + o_str << "Total delta points " << total << endl; + o_str << "Total final points " << total_end << endl; + o_str << "--- end delta log print" << endl; +} + + diff --git a/src/SHC/SHC_DeltaLogger.hpp b/src/SHC/SHC_DeltaLogger.hpp new file mode 100644 index 0000000..ef34d42 --- /dev/null +++ b/src/SHC/SHC_DeltaLogger.hpp @@ -0,0 +1,77 @@ +#ifndef SHC_DeltaLogger_hpp +#define SHC_DeltaLogger_hpp + +#include +#include +#include +#include +#include +#include "SHC_Component" +#include "SHC_ComponentConnection" +using namespace Eigen; +using namespace std; + +struct DL_Component_SimpleData { + VectorXd *mean=NULL; + bool isInverted=false; + MatrixXd *covariance=NULL; + long elements=1; + DL_Component_SimpleData(SHC_Component *c) { + mean=new VectorXd(*c->getMean()); + isInverted=c->isCovarianceInverted(); + covariance=c->getCovariance(); + //cout << "Covariance:" << *covariance << endl; + elements=c->getElements(); + } + ~DL_Component_SimpleData() { + if(mean) delete mean; + if(covariance) delete covariance; + } +}; + +struct DL_Component { + string id,parent_id,*redirectedTo=NULL; + DL_Component_SimpleData *start=NULL,*end=NULL; + unordered_map delta_elements; + DL_Component_SimpleData *baseline=NULL; + set neighborhood,trace; + bool created=false,outlier=true,obsolete=false,blocked=false; + string source="master"; + DL_Component(string id,string parent_id) { + this->id=id; + this->parent_id=parent_id; + } + ~DL_Component() { + if(redirectedTo) delete redirectedTo; + if(start) delete start; + if(end) delete end; + 
if(baseline) delete baseline; + } +}; + +struct DL_ComponentConnection { + string c1_id,c2_id; + long points=0; + DL_ComponentConnection(SHC_ComponentConnection *cc) { + c1_id=cc->getComponent1()->getId(); + c2_id=cc->getComponent2()->getId(); + } +}; + +class DeltaLogger { +public: + unordered_map c_delta; + unordered_map, DL_ComponentConnection*, SHC_ComponentConnection_hash> cc_delta; + set cr_delta; + + DeltaLogger()=default; + ~DeltaLogger(); + DL_Component *logComponent(SHC_Component* logged_comp,string source,int init_delta_elements=0); + void addComponentDeltaElements(SHC_Component* logged_comp,string source,int number=1); + void finalizeComponent(SHC_Component* logged_comp); + void logComponentConnection(SHC_ComponentConnection *logged_cc); + void logComponentRemoval(SHC_Component* removed_comp); + void print(ostream &o_str,string source); +}; + +#endif /* SHC_DeltaLogger_hpp */ diff --git a/src/SHC/SHC_Utils b/src/SHC/SHC_Utils new file mode 100644 index 0000000..8b34dce --- /dev/null +++ b/src/SHC/SHC_Utils @@ -0,0 +1 @@ +#include "SHC_Utils.hpp" diff --git a/src/SHC/SHC_Utils.cpp b/src/SHC/SHC_Utils.cpp new file mode 100644 index 0000000..d3df0d1 --- /dev/null +++ b/src/SHC/SHC_Utils.cpp @@ -0,0 +1,20 @@ +#include "SHC_Utils.hpp" + +string generate_hex(const unsigned int len) { + stringstream ss; + for(unsigned i = 0; i < len; i++) { + mt19937 eng((unsigned)chrono::high_resolution_clock::now().time_since_epoch().count()); + const unsigned int rc = eng() % 255; + stringstream hexstream; + hexstream << hex << rc; + auto hex = hexstream.str(); + ss << (hex.length() < 2 ? 
'0' + hex : hex); + } + return ss.str(); +} + +string formatDouble(double value,int precision) { + stringstream ss; + ss << fixed << setprecision(precision) << value; + return ss.str(); +} diff --git a/src/SHC/SHC_Utils.hpp b/src/SHC/SHC_Utils.hpp new file mode 100644 index 0000000..ced270c --- /dev/null +++ b/src/SHC/SHC_Utils.hpp @@ -0,0 +1,90 @@ +#ifndef SHC_Utils_h +#define SHC_Utils_h + +#include +#include +#include +#include +#include +#include +#include +using namespace std; +using namespace Eigen; + +string generate_hex(const unsigned int len); +string formatDouble(double value,int precision); + +namespace Eigen { +namespace internal { +template +struct scalar_normal_dist_op +{ + static mt19937 rng; + mutable normal_distribution norm; + + EIGEN_EMPTY_STRUCT_CTOR(scalar_normal_dist_op) + + template + inline const Scalar operator() (Index, Index = 0) const { return norm(rng); } + inline void seed(const uint64_t &s) { rng.seed((unsigned int)s); } +}; + +template +mt19937 scalar_normal_dist_op::rng; + +template struct functor_traits > +{ enum { Cost = 50 * NumTraits::MulCost, PacketAccess = false, IsRepeatable = false }; }; + +} +template class EigenMultivariateNormal +{ + MatrixXd _covar; + MatrixXd _transform; + VectorXd _mean; + internal::scalar_normal_dist_op randN; + bool _use_cholesky; + SelfAdjointEigenSolver> _eigenSolver; + +public: + EigenMultivariateNormal(const VectorXd& mean,const MatrixXd& covar, + const bool use_cholesky=false,const uint64_t &seed=mt19937::default_seed) + :_use_cholesky(use_cholesky) + { + randN.seed(seed); + setMean(mean); + setCovar(covar); + } + + void setMean(const Matrix& mean) { _mean = mean; } + void setCovar(const Matrix& covar) + { + _covar = covar; + + if (_use_cholesky) + { + Eigen::LLT > cholSolver(_covar); + + if (cholSolver.info()==Eigen::Success) + { + _transform = cholSolver.matrixL(); + } + else + { + throw std::runtime_error("Failed computing the Cholesky decomposition. 
Use solver instead"); + } + } + else + { + _eigenSolver = SelfAdjointEigenSolver >(_covar); + _transform = _eigenSolver.eigenvectors()*_eigenSolver.eigenvalues().cwiseMax(0).cwiseSqrt().asDiagonal(); + } + } + + MatrixXd samples(int nn) + { + return (_transform * Matrix::NullaryExpr(_covar.rows(),nn,randN)).colwise() + _mean; + } +}; +} + +#endif diff --git a/src/SHC/SigmaIndex b/src/SHC/SigmaIndex new file mode 100644 index 0000000..91451bb --- /dev/null +++ b/src/SHC/SigmaIndex @@ -0,0 +1 @@ +#include "SigmaIndex.hpp" diff --git a/src/SHC/SigmaIndex.cpp b/src/SHC/SigmaIndex.cpp new file mode 100644 index 0000000..4d5821f --- /dev/null +++ b/src/SHC/SigmaIndex.cpp @@ -0,0 +1,9 @@ +#include "SigmaIndex.hpp" + +SigmaIndex_Exception::SigmaIndex_Exception(const char *msg) { + this->msg=msg; +} +const char *SigmaIndex_Exception::what() const throw() { + return this->msg; +} + diff --git a/src/SHC/SigmaIndex.hpp b/src/SHC/SigmaIndex.hpp new file mode 100644 index 0000000..447cc14 --- /dev/null +++ b/src/SHC/SigmaIndex.hpp @@ -0,0 +1,162 @@ +#ifndef SigmaIndex_hpp +#define SigmaIndex_hpp + +#include +#include +#include +#include +#include +#include +#include "SHC_Component" +#include +#include +#include "SigmaIndexProxy" +using namespace std; +using namespace Eigen; + +const string ROOT="root"; + +class SHC_Component; + +struct _SigmaIndexQueryResults { + bool visited=false; + double md=-1; + void reset() { + visited=false; + md=-1; + } + int getClass(double theta,double ntheta) { + if(md>=0 && md<=theta) return 1; + if(md>theta && md<=ntheta) return 2; + return 0; + } + _SigmaIndexQueryResults clone() { + _SigmaIndexQueryResults res; + res.visited=this->visited; + res.md=this->md; + return res; + } +}; + +/** + Sigma-Index node class. Template T represents a population class used in the Sigma-Index node. + */ +template +class SigmaIndexNode { +private: + string bckp_id; // node identifier... 
usually this is taken from the population identifier +protected: + T pop_ptr=NULL; // population object pointer + unordered_map*> parents,children; // maps to parent and child nodes... the key of the map is node identifier +public: + SigmaIndexNode(); + SigmaIndexNode(T population); + ~SigmaIndexNode(); + string getId(); + T getPopulation(); + void addChild(SigmaIndexNode *child); + void removeChild(SigmaIndexNode *child); + bool isDirectChild(SigmaIndexNode *node); + bool isDirectParent(SigmaIndexNode *node); + bool underTheRoot(); + unordered_map*> *getChildren(), *getParents(); + vector*> *getParentsVector(); + vector*> *getChildrenVector(); + void removeRootParent(SigmaIndexNode *root_node); + void moveTo(SigmaIndexNode *pn); + vector*> *getSortedChildren(); + vector*> *getSortedParents(); + bool operator==(const SigmaIndexNode &c) const; + bool operator!=(const SigmaIndexNode &c) const; + _SigmaIndexQueryResults results; + SigmaIndexNode *clone(); +}; + +/** + Sigma-Index query results structure. Template V represents the population class. + */ +template +struct SigmaIndexQueryResults { + // classified = populations and mahalanobis distance under theta + // neighborhood = populations and related mahalanobis distances between theta and ntheta (neighborhood threshold) + vector> *classified=NULL, *neighborhood=NULL; + SigmaIndexQueryResults() { + classified=new vector>(); + neighborhood=new vector>(); + } +}; +/** + Sigma-Index statistics structure. 
+ */ +struct SigmaIndexStatistics { + // totalCount - total populations that needed Mahalanobis distance calculations + // tpCount - total populations within ntheta + // tsCound - total populations outside ntheta - those that were "missed" in the previous queries + // totalSequential - total number of populations for previous queries + long totalCount=0,tpCount=0,tsCount=0,totalSequential=0,qTime=0; + // R - computation cost reduction + double R=0.0; + SigmaIndexStatistics(long totalCount,long tpCount,long tsCount,long totalSequential,long qTime) { + this->totalCount=totalCount; + this->tpCount=tpCount; + this->tsCount=tsCount; + this->totalSequential=totalSequential; + this->R=totalSequential>0 && totalCount<=totalSequential ? ((double)totalSequential-(double)totalCount)/(double)totalSequential : 0; + this->qTime=qTime; + } +}; + +/** + Sigma-Index exception + */ +class SigmaIndex_Exception : public exception { +private: + const char *msg=NULL; +public: + const char *what() const throw(); + SigmaIndex_Exception(const char *msg); +}; + +/** + Sigma-Index class. Template U represents population class. 
+ */ +template +class SigmaIndex { +private: + int c1=0,c2=0; // used as a counter in queries + bool switch1=false; // precision switch 1 + long nodeCounter=0,tpCount=0,tsCount=0,totalSequential=0,qTime=0; // statistics counters + double theta=3.0,ntheta=6.0; // statistical thresholds + bool cached=false; // a flag that indicates that Sigma-Index has cached the last queried data point + VectorXd *cachedData=NULL; // cached data point + void connect(SigmaIndexNode *pn, SigmaIndexNode *cn); + void disconnect(SigmaIndexNode *pn, SigmaIndexNode *cn); + void move(SigmaIndexNode *pn1, SigmaIndexNode *cn, SigmaIndexNode *pn2); + void _query(VectorXd *data,SigmaIndexNode *node,SigmaIndexNode *results[]); + void make_connections(SigmaIndexNode *c,SigmaIndexNode *n,bool force=false); + void _resetCache(); + void _completeUpdate(SigmaIndexNode *class_node); +protected: + unordered_map *> nodes; // the main map which holds the Sigma-Index nodes +public: + SigmaIndex(double theta, double neighborhood_theta, bool precision_switch=true); + ~SigmaIndex(); + void update(U classified,unordered_map *neighborhood); + SigmaIndexQueryResults *query(VectorXd *data,set *exclude=NULL,U starting=NULL, + bool resetCache=true); + void create(U comp,bool resetCache=false); + void remove(U comp); + void print(string node_id,ostream &ostr,set *visited=NULL); + int getNeighborhoodMultiplier(); + SigmaIndex *clone(); + void resetStatistics(); + SigmaIndexStatistics *getStatistics(); + unordered_map *calculateHistogram(); + void completeUpdate(U classified); + SigmaIndexNode *getRoot(); +}; + +#include "SigmaIndex_Inline.cpp" + + +#endif /* SigmaIndex_hpp */ diff --git a/src/SHC/SigmaIndexProxy b/src/SHC/SigmaIndexProxy new file mode 100644 index 0000000..b0d961b --- /dev/null +++ b/src/SHC/SigmaIndexProxy @@ -0,0 +1 @@ +#include "SigmaIndexProxy.hpp" diff --git a/src/SHC/SigmaIndexProxy.cpp b/src/SHC/SigmaIndexProxy.cpp new file mode 100644 index 0000000..e6e2e5f --- /dev/null +++ 
b/src/SHC/SigmaIndexProxy.cpp @@ -0,0 +1,158 @@ +#include "SigmaIndexProxy.hpp" + +/* + Abstract population exception. + @param msg Exception message + */ +AbstractPopulation_Exception::AbstractPopulation_Exception(const char *msg) { + this->msg=msg; +} +const char *AbstractPopulation_Exception::what() const throw() { + return this->msg; +} + +/** + A method for the Mahalanobis distance calculation + @param newElement Data point. + @return Returns the Mahalanobis distance from the centroid to the input data point, using the population covariance matrix. + */ +double ManualPopulation::mahalanobisDistance(VectorXd *newElement) { + if(newElement==NULL) throw AbstractPopulation_Exception("New element must be defined"); + if(newElement->size()!=mean->size()) throw AbstractPopulation_Exception("New element does not have the same dimension as the component"); + VectorXd delta=*newElement-*mean; // calculate the Mahalanobis distance + double N0=delta.transpose()*(*icovariance*delta); + return sqrtf(N0); +} + +/** + Manual population constructor. + @param id A population identifier. + @param mean A population centroid. + @param covariance A non-inverted covariance matrix. + @param elements Population size. + @param updateCallback An update callback that is invoked when the population data changes. + */ +ManualPopulation::ManualPopulation(string id, VectorXd *mean, MatrixXd *covariance, long elements, function updateCallback) { + FullPivLU lu(*covariance); + if(lu.isInvertible()) { // check if the covariance matrix is invertible + this->icovariance=new MatrixXd(covariance->inverse()); // invert it - this way the Mahalanobis distance calculation is less expensive + } else throw AbstractPopulation_Exception("The given covariance matrix is not invertible"); + this->mean=new VectorXd(*mean); // set private properties + this->elements=elements; + this->id=id; + this->updateCallback=updateCallback; +} + +/** + Manual population destructor. 
+ */ +ManualPopulation::~ManualPopulation() { + delete mean; + delete icovariance; +} + +/** + Returns the size of the population. + @returns The size of the population. + */ +long ManualPopulation::getElements() { + return this->elements; +} + +/** + Update the covariance matrix for this population. The new covariance matrix must be also invertible. + @param covariance A new covariance matrix. + @param updateSI A flag whether to update Sigma-Index DAG. + @note From the Sigma-Index point of view, this change is a change in the population size or shape. + */ +void ManualPopulation::updateCovariance(MatrixXd *covariance, bool updateSI) { + FullPivLU lu(*covariance); // again check if the supplied cov. matrix is invertible + if(!lu.isInvertible()) throw AbstractPopulation_Exception("The given covariance matrix is not invertible"); + MatrixXd *old_icov=this->icovariance; // keep the previous inverted covariance so it can be released + this->icovariance=new MatrixXd(covariance->inverse()); // invert it + delete old_icov; // release the previous matrix to avoid a memory leak + if(updateCallback && updateSI) updateCallback(id); // notify the parent Sigma-Index that population has been changed +} + +/** + Update the centroid for this population. + @param mean A new centroid. + @param updateSI A flag whether to update Sigma-Index DAG. + @note From the Sigma-Index point of view, this change is a population drift. + */ +void ManualPopulation::updateMean(VectorXd *mean, bool updateSI) { + VectorXd *old_mean=this->mean; // keep the previous centroid so it can be released + this->mean=new VectorXd(*mean); + delete old_mean; // release the previous centroid to avoid a memory leak + if(updateCallback && updateSI) updateCallback(id); // notify the parent Sigma-Index that population has been changed +} + +/** + Update the population size for this population. + @param elements A new number of population elements. + @param updateSI A flag whether to update Sigma-Index DAG. + @note From the Sigma-Index point of view, this change might influence that structure of the DAG. If this population became bigger that any of the parents, or smaller that any of the children, the Sigma-Index DAG is not correct anymore and must be updated.
+ */ +void ManualPopulation::updateElements(long elements, bool updateSI) { + this->elements=elements; + if(updateCallback && updateSI) updateCallback(id); // notify the parent Sigma-Index that population has been changed +} + +/** + Returns the population centroid. + @return Population centroid. + */ +VectorXd *ManualPopulation::getMean() { + return this->mean; +} + +/** + Returns the population inverted covariance matrix. + @return Inverted covariance matrix. + */ +MatrixXd *ManualPopulation::getICovariance() { + return this->icovariance; +} + +/** + Can be used for cloning of this population + @return Cloned population object. + */ +ManualPopulation *ManualPopulation::clone() { + ManualPopulation *new_pop=new ManualPopulation(); + new_pop->id=id; + new_pop->mean=new VectorXd(*mean); + new_pop->icovariance=new MatrixXd(*icovariance); + new_pop->elements=elements; + new_pop->updateCallback=updateCallback; + return new_pop; +} + +/** + Returns the population identifier. + @return Population identifier. + */ +string ManualPopulation::getId() { + return this->id; +} + +/** + A method that updates the population with new data point. + @param data_point A new data point. + @param updateSI A flag whether to update Sigma-Index DAG. 
+ */ +void ManualPopulation::addNewElement(VectorXd *data_point, bool updateSI) { + // calculate new mean + VectorXd *old_mean=mean; + VectorXd t1=(*data_point-*old_mean)/(elements+1); + mean=new VectorXd(*old_mean+t1); + // calculate new covariance + // Sherman–Morrison–Woodbury incremental update is done here + MatrixXd *old_cov=icovariance; + MatrixXd u=(*data_point-*old_mean)/sqrt(elements); + MatrixXd vt=((*data_point-*mean)/sqrt(elements)).transpose(); + MatrixXd v1=*old_cov*u*vt**old_cov; + MatrixXd v2=vt**old_cov*u; + MatrixXd res=*old_cov-v1/((double)1+v2(0,0)); + double X=(double)elements/((double)elements+(double)1); + icovariance=new MatrixXd(res/X); + delete old_mean; // the previous centroid is no longer referenced - release it to avoid a memory leak + delete old_cov; // the previous inverted covariance is no longer referenced - release it to avoid a memory leak + elements++; + if(updateCallback && updateSI) updateCallback(id); +} + diff --git a/src/SHC/SigmaIndexProxy.hpp b/src/SHC/SigmaIndexProxy.hpp new file mode 100644 index 0000000..65027ba --- /dev/null +++ b/src/SHC/SigmaIndexProxy.hpp @@ -0,0 +1,79 @@ +#ifndef SigmaIndexProxy_hpp +#define SigmaIndexProxy_hpp + +#include +#include +#include +#include +#include +#include +#include +using namespace Eigen; +using namespace std; + +/* + Population main exception + */ +class AbstractPopulation_Exception : public exception { +private: + const char *msg=NULL; +public: + const char *what() const throw(); + AbstractPopulation_Exception(const char *msg); +}; + +/** +An abstract class for all populations. Contains virtual functions needed by the Sigma-Index code. +*/ +class AbstractPopulation { +public: + /** + An abstract method for calculating the mahalanobis distance for the population. Must be implemented in the specialization class. + @param newElement Data point whose distance is calculated + @return The distance value + */ + virtual double mahalanobisDistance(VectorXd *newElement) = 0; + /** + An abstract method for retrieving the population size. + @return The number of population elements + */ + virtual long getElements()=0; + /** + An abstract method for retrieving centroid. + @return The centroid.
+ */ + virtual VectorXd *getMean() {return NULL;} + /** + An abstract method for retrieving population identifier. + @return The population identifier. + */ + virtual string getId() {throw AbstractPopulation_Exception("Virtual function");} +}; + +/** + A manual population class. Can be used to add population details to the Sigma-Index + */ +class ManualPopulation : public AbstractPopulation { +private: + string id; // population identifier + VectorXd *mean=NULL; // centroid + MatrixXd *icovariance=NULL; // inverted covariance + long elements=0; // the number of population elements + function updateCallback=NULL; // update callback, which is invoked in case of the population update + ManualPopulation()=default; // default constructor +public: + ManualPopulation(string id, VectorXd *mean, MatrixXd *covariance, long elements, function updateCallback); + ~ManualPopulation(); + double mahalanobisDistance(VectorXd *newElement); + long getElements(); + VectorXd *getMean(); + MatrixXd *getICovariance(); + void updateCovariance(MatrixXd *covariance, bool updateSI=true); + void updateMean(VectorXd *mean, bool updateSI=true); + void updateElements(long elements, bool updateSI=true); + void addNewElement(VectorXd *data_point, bool updateSI=true); + ManualPopulation *clone(); + string getId(); +}; + +#endif /* SigmaIndexProxy_hpp */ diff --git a/src/SHC/SigmaIndex_Inline.cpp b/src/SHC/SigmaIndex_Inline.cpp new file mode 100644 index 0000000..7e21a5e --- /dev/null +++ b/src/SHC/SigmaIndex_Inline.cpp @@ -0,0 +1,699 @@ +#include "SigmaIndex.hpp" + +/** + Sigma-Index node constructor. + @param population A population object for this node. + */ +template +SigmaIndexNode::SigmaIndexNode(T population) { + this->pop_ptr=population; + this->bckp_id=population->getId(); +} +/** + A constructor for the ROOT node. + */ +template +SigmaIndexNode::SigmaIndexNode() { + this->bckp_id=ROOT; +} +/** + Sigma-Index node desctructor. 
We do not clean population objects here, as we do not know how these populations were instances. + For example: a population can be an SHC component, which, if deleted here, would distrupt the list of SHC concepts. + */ +template +SigmaIndexNode::~SigmaIndexNode() { +} +/** + Returns Sigma-Index node identifier. + @return Node identifier. + */ +template +string SigmaIndexNode::getId() { + return bckp_id; +} +/** + Returns Sigma-Index node population object. + @return Population object. + */ +template +T SigmaIndexNode::getPopulation() { + return this->pop_ptr; +} + +/** + Add child to this node. + @param child The child node to add. + */ +template +void SigmaIndexNode::addChild(SigmaIndexNode *child) { + if(!isDirectChild(child)) { + children[child->getId()]=child; + if(!child->isDirectParent(this)) + child->parents[getId()]=this; + } +} +/** + Removes a child from this node. + @param child The child node to remove. + */ +template +void SigmaIndexNode::removeChild(SigmaIndexNode *child) { + if(isDirectChild(child)) { + children.erase(child->getId()); + if(child->isDirectParent(this)) + child->parents.erase(this->getId()); + } +} +/** + Tests whether the supplied node is a direct child of this node. + @param node The tested node. + @return false - not a direct child, true - is a direct child + */ +template +bool SigmaIndexNode::isDirectChild(SigmaIndexNode *node) { + return children.find(node->getId())!=children.end(); +} +/** + Tests whether the supplied node is a direct parent of this node. + @param node The tested node. + @return false - not a direct parent, true - is a direct parent + */ +template +bool SigmaIndexNode::isDirectParent(SigmaIndexNode *node) { + return parents.find(node->getId())!=parents.end(); +} +/** + Tests whether this node is direct child of the ROOT node. 
+ @return false - not a direct child, true - is a direct child + */ +template +bool SigmaIndexNode::underTheRoot() { + return parents.find(ROOT)!=parents.end(); +} +/** + Returns child nodes. + @return A map containing all child nodes. + */ +template +unordered_map*> *SigmaIndexNode::getChildren() { + return &children; +} +/** + Returns parent nodes. + @return A map containing all parent nodes. + */ +template +unordered_map*> *SigmaIndexNode::getParents() { + return &parents; +} +/** + Returns a vector of parent nodes. + @return A vector of all parent nodes. This vector is a working instance containing only pointers to nodes. + */ +template +vector*> *SigmaIndexNode::getParentsVector() { + vector *res=new vector(); + for(pair it:parents) res->push_back(it.second); + return res; +} +/** + Returns a vector of child nodes. + @return A vector of all child nodes. This vector is a working instance containing only pointers to nodes. + */ +template +vector*> *SigmaIndexNode::getChildrenVector() { + vector *res=new vector(); + for(pair it:children) res->push_back(it.second); + return res; +} +/** + Returns a vector of child nodes, sorted descending by population size. + @return A vector of all child nodes, sorted descending by population size. This vector is a working instance containing only pointers to nodes. + */ +template +vector*> *SigmaIndexNode::getSortedChildren() { + vector*> *res=new vector*>(); + for(pair*> it:children) + res->push_back(it.second); + sort(res->begin(), res->end(), [](SigmaIndexNode *a, SigmaIndexNode *b) { + if(a==NULL || a->getPopulation()==NULL || b==NULL || b->getPopulation()==NULL) return false; + return a->getPopulation()->getElements()>b->getPopulation()->getElements(); + }); + return res; +} +/** + Returns a vector of parent nodes, sorted descending by population size. + @return A vector of all parent nodes, sorted descending by population size. This vector is a working instance containing only pointers to nodes. 
+*/ +template +vector*> *SigmaIndexNode::getSortedParents() { + vector*> *res=new vector*>(); + for(pair*> it:parents) + res->push_back(it.second); + sort(res->begin(), res->end(), [](SigmaIndexNode *a, SigmaIndexNode *b) { + if(a==NULL || a->getPopulation()==NULL || b==NULL || b->getPopulation()==NULL) return false; + return a->getPopulation()->getElements()>b->getPopulation()->getElements(); + }); + return res; +} +/** + Removes the ROOT node from the parents node. + @param root_node A pointer to the ROOT node. + */ +template +void SigmaIndexNode::removeRootParent(SigmaIndexNode *root_node) { + root_node->removeChild(this); +} +/** + Removes this node from the current parents and moves it under the specified node. + @param pn A node that must become an unique parent of this node. + */ +template +void SigmaIndexNode::moveTo(SigmaIndexNode *pn) { + if(parents.size()>0) + for(pair *> it:parents) it.second->removeChild(this); + pn->addChild(this); +} +/** + Operator that can be used to test whether two pointers point to the same node + @param c Pointer to the tested node. + @return Returns true if this node and the tested node are the same one. + */ +template +bool SigmaIndexNode::operator==(const SigmaIndexNode &c) const { + return this->pop_ptr==c.pop_ptr; +} +/** + Operator that can be used to test whether two pointers DO NOT point to the same node + @param c Pointer to the tested node. + @return Returns true if this node and the tested node are NOT the same one. + */ +template +bool SigmaIndexNode::operator!=(const SigmaIndexNode &c) const { + return this->pop_ptr!=c.pop_ptr; +} + +/** + Sigma-Index constructor. + @param theta Classification threshold. + @param neighborhood_theta Neighborhood threshold. 
+ @param precision_switch Keep the precision high (lowers the speed for some cases) + */ +template +SigmaIndex::SigmaIndex(double theta, double neighborhood_theta, bool precision_switch) { + this->theta=theta; + this->ntheta=neighborhood_theta; + this->switch1=precision_switch; + SigmaIndexNode *root_node=new SigmaIndexNode(); + nodes[ROOT]=root_node; // Immediatelly create the ROOT node +} +/** + Sigma-Index destructor. + */ +template +SigmaIndex::~SigmaIndex() { + for(pair *> it:nodes) + delete it.second; // clean nodes +} + +/** + Incremental update of the Sigma-Index. We can perform such update right after the query. + This kind of update prevents re-calculation of the neighborhood, and by this saving some Mahalanobis distance calculations. + Template U represents the population class. + @param classified The classified population. The input data point was found to belong to this population. + @param neighborhood A set of populations that was found to be a neighborhood of the input data point, and by this the neighborhood of the classified population. + */ +template +void SigmaIndex::update(U classified,unordered_map *neighborhood) { + if(classified==NULL) return; + SigmaIndexNode *class_node=nodes[classified->getId()]; // search for the classified node... 
node is just a wrapper for a population + if(class_node->getParents()->size()>0) { // if classified node has parents (usually has) + /* + This chunk of the code checks whether the child became bigger than the parent + If that is the case, we reverse the direction of the edge + */ + vector*> *v=class_node->getParentsVector(); // get the working vector of these parent + for(SigmaIndexNode *pn:*v) { // iterate through them + if(pn->getId()!=ROOT && pn->getPopulation()->getElements()getPopulation()->getElements() && + pn->isDirectChild(class_node)) { // if the parent is not ROOT and the population size of the parent became smaller + disconnect(pn, class_node); // reverse direction of the edge + connect(class_node, pn); + } + } + delete v; + } + if(neighborhood!=NULL || neighborhood->size()>0) { + /* + If there was some neighborhood detected in the previous query. This is THE incremental update. + Effectively, we update only this neighborhood given by the user... which was (ideally) calculated in the previous query. + */ + for(pair nbor:*neighborhood) { // iterate through the neighborhood + if(nbor.first!=classified) { // if the neighbor population is not the same as the classified population + SigmaIndexNode *neigh_node=nodes[nbor.first->getId()]; // find the corresponding node + make_connections(class_node, neigh_node); // check and create connection between classified and neighbor node + } + } + } + delete neighborhood; +} + + +/** + Complete update of the Sigma-Index. Details on _completeUpdate. + @param classified The classified population. 
+ */ +template +void SigmaIndex::completeUpdate(U classified) { + if(classified==NULL) return; + SigmaIndexNode *class_node=nodes[classified->getId()]; // find the classified Sigma-Index node containing the supplied population object + if(class_node->getParents()->size()>0) { // again, as in the update method, check relationships between parent and child population sizes + vector*> *v=class_node->getParentsVector(); + for(SigmaIndexNode *pn:*v) { + if(pn->getId()!=ROOT && pn->getPopulation()->getElements()getPopulation()->getElements() && + pn->isDirectChild(class_node)) { // if the population size flipped, we reverse the edge direction + disconnect(pn, class_node); + connect(class_node, pn); + } + } + delete v; + } + _completeUpdate(class_node); // do the complete update of the node +} +template +void SigmaIndex::_completeUpdate(SigmaIndexNode *class_node) { + set processed; + if(class_node->getParents()->size()>0) { + vector*> *v=class_node->getParentsVector(); + for(SigmaIndexNode *pn:*v) { + if(pn->getId()!=ROOT && pn->getPopulation()->mahalanobisDistance(class_node->getPopulation()->getMean())>ntheta) { + disconnect(pn, class_node); + processed.insert(pn->getId()); + } + } + delete v; + } + if(class_node->getChildren()->size()>0) { + vector*> *v=class_node->getChildrenVector(); + for(SigmaIndexNode *cn:*v) { + if(class_node->getPopulation()->mahalanobisDistance(cn->getPopulation()->getMean())>ntheta) { + disconnect(class_node, cn); + processed.insert(cn->getId()); + } + } + delete v; + } + // In the complete update, we iterate through all Sigma-Index nodes and calculate the neighborhood of the classified node + // This is quite costly, as we perform one sequential-scan classification for each complete update we do + // Even so, this is quicker than doing sequential-scan for each input data point + for(pair*> nit:nodes) { + if(nit.first!=ROOT && *nit.second!=*class_node && processed.find(nit.first)==processed.end() && + 
nit.second->getPopulation()->mahalanobisDistance(class_node->getPopulation()->getMean())<=ntheta) { + make_connections(nit.second, class_node, true); + } + } +} + +/** + Connects two Sigma-Index nodes with edge. + @param pn The parent node. + @param cn The child node. + */ +template +void SigmaIndex::connect(SigmaIndexNode *pn, SigmaIndexNode *cn) { + if(!pn->isDirectChild(cn) && !cn->isDirectChild(pn)) { + if(cn->underTheRoot()) cn->removeRootParent(nodes[ROOT]); // If the child node is directly under the ROOT, we remove this edge + pn->addChild(cn); + } +} +/** + Disconnects two Sigma-Index nodes if connected. + @param pn The parent node. + @param cn The child node. + */ +template +void SigmaIndex::disconnect(SigmaIndexNode *pn, SigmaIndexNode *cn) { + if(pn->isDirectChild(cn)) { // if connected pn->cn + pn->removeChild(cn); // remove + if(cn->getParents()->size()==0) cn->moveTo(nodes[ROOT]); // if cn does not have any parent nodes, connect it to the ROOT + } +} +/** + Moves a Sigma-Index node from one parent to other. + @param pn1 The originating parent node. + @param pn2 The destination parent node. + @param cn The child node. + */ +template +void SigmaIndex::move(SigmaIndexNode *pn1, SigmaIndexNode *cn, SigmaIndexNode *pn2) { + disconnect(pn1, cn); + connect(pn2, cn); +} + +/** + Internal query method. Algorithm 1 in the Sigma-Index article. + @param data The input data point that needs to be classified. + @param node The starting Sigma-Index node. Used for recursion purposes. + @param results An array that hold the current classification results. + */ +template +void SigmaIndex::_query(VectorXd *data,SigmaIndexNode *node,SigmaIndexNode *results[]) { + vector*> l_neigh1,l_neigh2,*l_neigh=NULL; + for(pair*> ch_it:*node->getChildren()) { // we iterate through the node children + if(!ch_it.second->results.visited) { // on each node we log whether it was already visited or not... 
if not the child node is not visited + std::chrono::time_point qst=std::chrono::system_clock::now(); + ch_it.second->results.md=ch_it.second->getPopulation()->mahalanobisDistance(data); // calculation for the child node + std::chrono::time_point qet=std::chrono::system_clock::now(); + qTime+=std::chrono::duration_cast(qet-qst).count(); + // notice that we store the distance on each node - this is what is considered for cache as well + nodeCounter++; // increment the calculation counter - statistics + if(!cached) { // if there is no cached data point, cache the one we are classifying now + cached=true; + if(cachedData==NULL) cachedData=new VectorXd(*data); + } + ch_it.second->results.visited=true; // switch on the visited flag on the child node + if(ch_it.second->results.md<=ntheta) { // if the data point is in the neighborhood of the child node + l_neigh1.push_back(ch_it.second); // add the child node to the l1 + results[c1++]=ch_it.second; // add it to the results + if(ch_it.second->results.md<=theta) ++c2; + tsCount++; // increment the success counter - statistics + } else { // if not in the neighborhood + l_neigh2.push_back(ch_it.second); // add the child node to the l2 + tpCount++; // increment the missed counter - statistics + } + } + } + for(pair*> pa_it:*node->getParents()) { // iterate through parent nodes + if(pa_it.first!=ROOT && !pa_it.second->results.visited) { // if the parent node is not ROOT and was not visited yet + std::chrono::time_point qst=std::chrono::system_clock::now(); + pa_it.second->results.md=pa_it.second->getPopulation()->mahalanobisDistance(data); // calculate the distance for the parent node + std::chrono::time_point qet=std::chrono::system_clock::now(); + qTime+=std::chrono::duration_cast(qet-qst).count(); + + nodeCounter++; // increment the calculation counter - statistics + if(!cached) { // if there is no cached data point, cache the one we are classifying now + cached=true; + if(cachedData==NULL) cachedData=new VectorXd(*data); + } 
+ if(!switch1) pa_it.second->results.visited=true; // switch on the visited flag on the parent node... we keep the speed here + if(pa_it.second->results.md<=ntheta) { // for parents we process only those that are in the neighborhood... we trace neighborhood back up + if(switch1) pa_it.second->results.visited=true; // switch on the visited flag on the parent node + l_neigh1.push_back(pa_it.second); // add the parent node to l1 + results[c1++]=pa_it.second; // add the parent node to results + if(pa_it.second->results.md<=theta) ++c2; + tsCount++; // increment the success counter - statistics + } + } + } + + if(l_neigh1.size()>0) l_neigh=&l_neigh1; // this means we have encountered the neighborhood... we pursuit to explore only these nodes + else { // if we are not in the neighborhood, then we explore all children + if(!switch1 && c1>0) return; // if we found the neighborhood somewhere in the upper DAG levels, just return... + if(switch1 && c2>0) return; // if we found the classified neighborhood somewhere in the upper DAG levels, just return... + l_neigh=&l_neigh2; + } + bool done=false; + while(!done) { // we iterate until the exit condition... this loop wil execute mostly twice! + sort(l_neigh->begin(),l_neigh->end(),[=](SigmaIndexNode *a,SigmaIndexNode *b){ + return a->results.mdresults.md; // we sort the array l by the distance ascending, as we want to explore the closest nodes first + }); + for(SigmaIndexNode *n_sub:*l_neigh) { // iterate through l + _query(data,n_sub,results); // the recursive call + if(!switch1 && (l_neigh1.size()==0 || node->getId()==ROOT) && c1>0) return; // this means we returned back to the ROOT, but we found the neighborhood... stopping here + if(switch1 && (l_neigh1.size()==0 || node->getId()==ROOT) && c2>0) return; // this means we returned back to the ROOT, but we found the classified neighborhood... 
stopping here + } + if(switch1) { // only if we want to keep precision high + if(c1>0 && c2==0 && l_neigh==&l_neigh1) l_neigh=&l_neigh2; // so we found the neighborhood, but not classified... and we went through the l1, do it for l2 as well + else done=true; // otherwise flip the exit flag + } else { + if(c1==0 && l_neigh==&l_neigh1) l_neigh=&l_neigh2; // so we did not found the neighborhood... and we went through the l1, do it for l2 as well + else done=true; // otherwise flip the exit flag + } + } +} + +/** + Query method. + @param data The data point to classify. + @param exclude The set of populations that needs to be excluded from the query. + @param starting The starting population. + @param resetCache Force resetting of the data point cache. + */ +template +SigmaIndexQueryResults *SigmaIndex::query(VectorXd *data,set *exclude,U starting, + bool resetCache) { + SigmaIndexNode *results[nodes.size()]; // clean up the results + c1=0;c2=0; // init the internal counter + if(resetCache) _resetCache(); // reset the cache + SigmaIndexNode *start_node=nodes[ROOT]; // find the ROOT + if(starting!=NULL && nodes.find(starting->getId())!=nodes.end()) start_node=nodes[starting->getId()]; // if there is a starting population supplied from outside + totalSequential+=(nodes.size()-1); // increase the total sequential count for the total number of nodes + _query(data, start_node, results); // do the query + SigmaIndexQueryResults *res=new SigmaIndexQueryResults(); + for(int i=0;i *node=results[i]; + U c=node->getPopulation(); // get the population that is in the neighborhood + if(exclude==NULL || exclude->find(c)==exclude->end()) { // test if we need to exclude this population + int cz=node->results.getClass(theta,ntheta); // classify whether the population is <=theta or between theta and ntheta + if(cz==1) res->classified->push_back(pair(c,node->results.md)); // add to the classified set + else if(cz==2) res->neighborhood->push_back(pair(c,node->results.md)); // add to the 
neighborhood set + } + } + sort(res->classified->begin(),res->classified->end(),[=](pair& a,pair& b) { + return a.secondneighborhood->begin(),res->neighborhood->end(),[=](pair& a,pair& b) { + return a.second +void SigmaIndex::remove(U comp) { + if(nodes.find(comp->getId())!=nodes.end()) { // if we can find the node that wraps the supplied population + SigmaIndexNode *node=nodes[comp->getId()],*root_node=nodes[ROOT]; + vector remove1,remove2; + for(pair*> it1:*node->getChildren()) // iterate through children + remove1.push_back(it1.first); + for(string id:remove1) { + SigmaIndexNode *child=nodes[id]; + node->removeChild(child); + if(child->getParents()->size()==0) root_node->addChild(child); // if one of the child node looses all parents, add it to the ROOT node + } + for(pair*> it2:*node->getParents()) // iterate through parents + remove2.push_back(it2.first); + for(string id:remove2) + nodes[id]->removeChild(node); + nodes.erase(node->getId()); // remove the node from the map + delete node; // delete the Sigma-Index node object + } +} + +/** + Prints the Sigma-Index. Basic orientative, debugging text printout. + @param node_id The node identifier. + @param ostr The output stream for printout, e.g., cout. + @param visited The set of already visited nodes. + */ +template +void SigmaIndex::print(string node_id,ostream &ostr,set *visited) { + if(!visited) visited=new set(); + if(visited->find(node_id)!=visited->end()) return; // mark visited nodes + visited->insert(node_id); + SigmaIndexNode *node=nodes[node_id]; // find the Sigma-Index node having the supplied identifier + ostr << (node_id==ROOT ? "++++++++++++++++++++++" : "----------------------") << endl; + ostr << "Node:" << node_id << " elements=" << (node_id!=ROOT ? 
node->getPopulation()->getElements() : 0) << endl; + if(node->getChildren()->size()>0) { // iterate through his children + ostr << "Children:"; + for(pair*> it:*node->getChildren()) + ostr << it.first << "(" << it.second->getPopulation()->getElements() << ") "; + ostr << endl; + for(pair*> it:*node->getChildren()) + print(it.first,ostr,visited); + } +} + +/** + Calculates and returns the neighborhood multiplier, i.e., ntheta/theta + @return The multiplier. + */ +template +int SigmaIndex::getNeighborhoodMultiplier() { + return (int)ntheta/theta; +} + +/** + Clones the Sigma-Index. + @return The cloned Sigma-Index. + */ +template +SigmaIndex *SigmaIndex::clone() { + SigmaIndex *nst=new SigmaIndex(theta,ntheta,switch1); // instantiate the cloned object + nst->cached=this->cached; // clone the cached flag + if(this->cachedData!=NULL) nst->cachedData=new VectorXd(*this->cachedData); // clone the cached data point + for(pair*> it:nodes) { // iterate through node map + if(it.first!=ROOT) { // if not the ROOT node + SigmaIndexNode *nn=new SigmaIndexNode(it.second->getPopulation()); // clone the Sigma-Index node, reroute the population pointer + nn->results=it.second->results.clone(); // clone the cached data - visited flags and distances + nst->nodes[it.second->getPopulation()->getId()]=nn; // update the node map ine cloned Sigma-Index + } + } + for(pair*> it:nodes) { // iterate through nodes in the local map + SigmaIndexNode *nn=nst->nodes[it.first]; // find the same node in the cloned Sigma-Index + for(pair*> itp:*it.second->getParents()) // clone edges to parents + (*nn->getParents())[itp.first]=nst->nodes[itp.first]; + for(pair*> itp:*it.second->getChildren()) // clone edges to children + (*nn->getChildren())[itp.first]=nst->nodes[itp.first]; + } + return nst; +} + +/** + Creates new Sigma-Index node. Usually for a new population. Or an outlier. + @param comp The new population. + @param resetCache Flag that indicated whether to force reset the previous cache data. 
+ */ +template +void SigmaIndex::create(U comp,bool resetCache) { + if(nodes.find(comp->getId())==nodes.end()) { // check the population identifier in the current nodes + SigmaIndexNode *nn=new SigmaIndexNode(comp); // create new Sigma-Index node + nodes[comp->getId()]=nn; // add it to the map + nodes[ROOT]->addChild(nn); // add it under the ROOT node... this is a default edge + if(!cached || resetCache) { // if we do not want to use the cache + if(cached) _resetCache(); // reset cached data + for(pair*> it:nodes) { // do the full update... this is quite costly + if(it.first!=comp->getId() && it.first!=ROOT) + make_connections(nn, it.second); + } + } else { // if we want to use the previously cached data... + // this usually happens when we performed a query priori to this create + // and we did not classify the input data point... but now we want to reuse neighborhood calculations + // to ease creation... this is similar to the incremental update + for(pair*> it:nodes) { // iterate through the nodes + if(it.second->results.visited && it.second->results.md<=ntheta) // if the cached distance indicates that this node is in the neighborhood + make_connections(nn, it.second, true); // connect it + } + vector*> unprocessed; + for(pair*> it:*nodes[ROOT]->getChildren()) { // iterate through top nodes, those directly under the ROOT + if(it.second->results.visited && it.second->getChildren()->size()>0 && + !it.second->getChildren()->begin()->second->results.visited) + // the top node is usually processed... 
but we check the second row of nodes + // those that are unvisited, we want to process them + unprocessed.push_back(it.second); // we store top-nodes that we want to process + } + sort(unprocessed.begin(),unprocessed.end(),[=](SigmaIndexNode *a,SigmaIndexNode *b) { + return a->results.mdresults.md; // sort the unprocessed nodes by distance ascending + }); + for(SigmaIndexNode *cn1:unprocessed) { // iterate through unprocessed nodes + SigmaIndexNode *results[nodes.size()]; + c1=0;c2=0; + _query(cachedData, cn1, results); // do the query only for one unprocessed top node - notice that we do not start from the ROOT here + if(!switch1 && c1==0) return; // exit early only if we do not want to keep the precision + // but we query only a sub-DAG + // we expect that we find any remaining neighborhood in the first few iterations + // until the query did not find any neighborhood nodes ... then we exit + for(int i=0;i *cn2=results[i]; + make_connections(nn, cn2, true); // connect to the new node + } + } + } + } +} + +/** + Resets the statistic counters. + */ +template +void SigmaIndex::resetStatistics() { + this->nodeCounter=0; + this->tpCount=0; + this->tsCount=0; + this->totalSequential=0; + this->qTime=0; +} + +/** + Returns the statistic structure. + @return The statistic structure. + */ +template +SigmaIndexStatistics *SigmaIndex::getStatistics() { + return new SigmaIndexStatistics(nodeCounter,tpCount,tsCount,totalSequential,qTime); +} + +/** + Resets the cached data. + */ +template +void SigmaIndex::_resetCache() { + cached=false; // switch off the cached flag + if(cachedData!=NULL) delete cachedData; // delete the cached data point + cachedData=NULL; + for(pair*> it:nodes) it.second->results.reset(); // iterate through nodes and reset the cached data +} + +/** + Calculates the complete histogram for the Sigma-Index. + @return The map 0-100 containing node counts for each computation cost reduction. 
+ */ +template +unordered_map *SigmaIndex::calculateHistogram() { + unordered_map *hist=new unordered_map(); + for(pair*> it:nodes) { // iterate through all nodes + if(it.first!=ROOT) { // if not the ROOT node + resetStatistics(); // reset the statistic counters for each node + SigmaIndexQueryResults *res=query(it.second->getPopulation()->getMean()); // do the query for the population centroid + SigmaIndexStatistics *stats=getStatistics(); // retrieve statistics + int bin=ceil((stats->R<0 ? 0 : stats->R)*100); // calculate CCR as percentage + if(hist->find(bin)!=hist->end()) (*hist)[bin]=(*hist)[bin]+it.second->getPopulation()->getElements(); // multiply this CCR with population size and add to the CCR bin + else (*hist)[bin]=it.second->getPopulation()->getElements(); + delete stats;delete res; + } + } + return hist; +} + +/** + Creates connection between Sigma-Index nodes + @param c The node 1. + @param n The node 2. + @param force The force flag. + */ +template +void SigmaIndex::make_connections(SigmaIndexNode *c,SigmaIndexNode *n,bool force) { + if(*c==*n) return; // cannot create loops on nodes + if(n->getPopulation()->getElements()getPopulation()->getElements()) { // if n is smaller then c, n should be the child + if(n->isDirectChild(c)) disconnect(n, c); // if we already have an edge but in the opposite direction, remove it + if(!c->isDirectChild(n) && (force || c->getPopulation()->mahalanobisDistance(n->getPopulation()->getMean())<=ntheta || + n->getPopulation()->mahalanobisDistance(c->getPopulation()->getMean())<=ntheta)) connect(c, n); + // check if there is no edge c->n and n centroid is in the neighborhood of c, make the connection c->n + } + if(n->getPopulation()->getElements()>c->getPopulation()->getElements()) { // if c is smaller then n, c should be the child + if(c->isDirectChild(n)) disconnect(c, n); // if we already have an edge but in the opposite direction, remove it + if(!n->isDirectChild(c) && (force || 
n->getPopulation()->mahalanobisDistance(c->getPopulation()->getMean())<=ntheta || + c->getPopulation()->mahalanobisDistance(n->getPopulation()->getMean())<=ntheta)) connect(n, c); + // check if there is no edge n->c and c centroid is in the neighborhood of n, make the connection n->c + } + if(n->getPopulation()->getElements()==c->getPopulation()->getElements() && !n->isDirectChild(c) && !c->isDirectChild(n)) { + // if the nodes are equal, we can make the connection but only: + // - if there is no connection between these nodes (at all) + // - c centroid is in the neighborhood of n, then we create n->c + if(force || n->getPopulation()->mahalanobisDistance(c->getPopulation()->getMean())<=ntheta || + c->getPopulation()->mahalanobisDistance(n->getPopulation()->getMean())<=ntheta) { + connect(n, c); + } + } +} + +/** + Returns the ROOT node + @return The ROOT node. + */ +template +SigmaIndexNode *SigmaIndex::getRoot() { + return nodes[ROOT]; +} diff --git a/src/SHC_Module.cpp b/src/SHC_Module.cpp new file mode 100644 index 0000000..7e8118c --- /dev/null +++ b/src/SHC_Module.cpp @@ -0,0 +1,40 @@ +#include "SHC_R.cpp" +#include +using namespace Rcpp; +using namespace std; + +RCPP_EXPOSED_AS(SHC_R) +RCPP_MODULE(SHCModule) { + Rcpp::class_("SHC_R") + .constructor() + .constructor() + .constructor() + .method("process",&SHC_R::processForR,"SHC processing entry") + .method("getClusters",&SHC_R::getClusters,"Get containers that contain components and outliers") + .method("getComponents",&SHC_R::getComponents,"Get components for the supplied cluster") + .method("getClusterContours",&SHC_R::getClusterContours,"Get cluster component(s) classification bounds") + .method("cleanOutliers",&SHC_R::cleanOutliers,"Remove outliers from SHC models") + .method("getComponentDetails",&SHC_R::getComponentDetails,"Get component details") + .method("theta",&SHC_R::getTheta,"Get the SHC classifier statistical threshold") + .method("virtualVariance",&SHC_R::getVirtualVariance,"Get the SHC 
classifier statistical virtual variance") + .method("isTraceable",&SHC_R::isTraceable,"Are two components traceable") + .method("getAllComponents",&SHC_R::getAllComponents,"Get components for the supplied cluster") + .method("getClusterWeight",&SHC_R::getClusterWeight,"Get cluster weight") + .method("getComponentWeight",&SHC_R::getComponentWeight,"Get component weight") + .method("getClusterMapping",&SHC_R::getClusterMapping,"Get cluster mapping") + .method("getComponentMapping",&SHC_R::getComponentMapping,"Get component mapping") + .method("microToMacro",&SHC_R::stream_MicroToMacro,"Micro to macro mapping for the stream package") + .method("microToMicro",&SHC_R::stream_MicroToMicro,"Micro to micro mapping for the stream package") + .method("stats",&SHC_R::getStats,"Processing stats") + .method("useSigmaIndex",&SHC_R::useSigmaIndex,"Use sigma-indexing tree") + .method("setPseudoOfflineCounter",&SHC_R::setPseudoOfflineCounter,"Set pseudo-offline counter") + .method("getTimes",&SHC_R::getTimes,"Get variety of execution timings") + .method("getNodeCounter",&SHC_R::getNodeCounter,"Get node counter") + .method("getComputationCostReduction",&SHC_R::getSigmaIndexR,"Get computation cost reduction") + .method("getHistogram",&SHC_R::getSigmaIndexHistogram,"Get histogram") + .method("recheckOutlier",&SHC_R::recheckOutlier,"Recheck outlier") + .method("getOutlierPositions",&SHC_R::getOutlierPositions,"Get outlier positions") + .method("getTrace",&SHC_R::getTrace,"Get component or outlier trace") + .method("cloneSHC",&SHC_R::cloneSHC,"SHC cloning interface") + .method("clearEigenMPSupport",&SHC_R::clearEigenMPSupport,"Clear Eigen MP support"); +} \ No newline at end of file diff --git a/src/SHC_R.cpp b/src/SHC_R.cpp new file mode 100644 index 0000000..f4b8ff9 --- /dev/null +++ b/src/SHC_R.cpp @@ -0,0 +1,345 @@ +#include +#include +#include +using namespace Rcpp; +using namespace Eigen; + +class SHC_R { +private: + SHC *shc_classifier=NULL; + unordered_map clus_map,comp_map; + 
unordered_map bkclus_map,bkcomp_map; + int clus_ind=1,comp_ind=1; + int _getClusterMapping(string s_clusId) { + if(clus_map.find(s_clusId)==clus_map.end()) { + bkclus_map[clus_ind]=s_clusId; + clus_map[s_clusId]=clus_ind++; + } + return clus_map[s_clusId]; + } + int _getComponentMapping(string s_compId) { + if(comp_map.find(s_compId)==comp_map.end()) { + bkcomp_map[comp_ind]=s_compId; + comp_map[s_compId]=comp_ind++; + } + return comp_map[s_compId]; + } + string *_getBkClusterMapping(int cluster) { + if(bkclus_map.find(cluster)!=bkclus_map.end()) return new string(bkclus_map[cluster]); + else return NULL; + } + string *_getBkComponentMapping(int component) { + if(bkcomp_map.find(component)!=bkcomp_map.end()) return new string(bkcomp_map[component]); + else return NULL; + } +public: + SHC_R(int dimensions,int aggloType=0,int driftType=0,int decayPace=10,int sha_agglo_theta=1) { + AgglomerationType at=NormalAgglomeration; + switch(aggloType) { + case 1: at=AggresiveAgglomeration;break; + case 2: at=RelaxedAgglomeration;break; + default: at=NormalAgglomeration;break; + } + DriftType dt=NormalDrift; + switch(driftType) { + case 1: dt=FastDrift;break; + case 2: dt=SlowDrift;break; + case 3: dt=NoDrift;break; + case 4: dt=UltraFastDrift;break; + default: dt=NormalDrift;break; + } + shc_classifier=new SHC(dimensions,at,dt,decayPace); + if(sha_agglo_theta>1) shc_classifier->setSharedAgglomerationThreshold(sha_agglo_theta); + } + SHC_R(List params) { + double theta=params["theta"]; + bool par=params["parallelize"]; + bool pSA=params["performSharedAgglomeration"]; + VectorXd vv=as(params["virtualVariance"]); + int acount=params["agglo_count"]; + double cbV=params["cbVarianceLimit"]; + int cbN=params["cbNLimit"]; + double drCSR=params["driftRemoveCompSizeRatio"]; + double dCSR=params["driftCheckingSizeRatio"]; + double dMDTR=params["driftMovementMDThetaRatio"]; + int dp=params["decayPace"]; + double compFormingMinVVRatio=params["componentFormingMinVVRatio"]; + double 
compBlockingLimitVVRatio=params["componentBlockingLimitVVRatio"]; + shc_classifier=new SHC(theta,par,pSA,&vv,acount,cbV,cbN,(float)drCSR,(float)dCSR, + (float)dMDTR,dp,(float)compFormingMinVVRatio, + (float)compBlockingLimitVVRatio); + int sha_at=params["sharedAgglomerationThreshold"]; + if(sha_at>1) shc_classifier->setSharedAgglomerationThreshold(sha_at); + } + SHC_R(SHC_R *ancestor_shc) { + shc_classifier=ancestor_shc->shc_classifier->clone(); + } + ~SHC_R() { + if(shc_classifier!=NULL) delete shc_classifier; + } + DataFrame processForR(DataFrame elements,bool classifyOnly) { + Nullable l=elements.attr("cluster"); + NumericMatrix nm=Rcpp::internal::convert_using_rfunction(elements, "as.matrix"); + MatrixXd m=as(nm); + pair>>,shared_ptr> res_tup=shc_classifier->process(&m,false,classifyOnly); + shared_ptr>> res=res_tup.first; + //if(!classifyOnly) shc_classifier->pseudoOffline(true); + StringVector clus,comps; + IntegerVector clus_m,comps_m; + LogicalVector outliers; + for(int i=0;isize();i++) { + shared_ptr cres=res->at(i); + if(cres->cluster_id!=NULL) { + clus.push_back(*cres->cluster_id); + clus_m.push_back(_getClusterMapping(*cres->cluster_id)); + } else { + clus.push_back(NA_STRING); + clus_m.push_back(0); + } + if(cres->component_id!=NULL) { + comps.push_back(*cres->component_id); + comps_m.push_back(_getComponentMapping(*cres->component_id)); + } else { + comps.push_back(NA_STRING); + comps_m.push_back(0); + } + outliers.push_back(cres->outlier); + } + if(l.isNull()) return DataFrame::create(Named("cluster_id")=clus,Named("component_id")=comps,Named("assigned_cluster")=clus_m,Named("assigned_comp")=comps_m,Named("outlier")=outliers,Named("stringsAsFactors")=false); + else return DataFrame::create(Named("cluster_id")=clus,Named("component_id")=comps,Named("assigned_cluster")=clus_m,Named("assigned_comp")=comps_m,Named("outlier")=outliers,Named(".clazz")=StringVector(l),Named("stringsAsFactors")=false); + } + StringVector getClusters(bool clusters,bool 
outliers) { + set *clusts=shc_classifier->getTopContainers(clusters,outliers); + StringVector res; + for(string it:*clusts) res.push_back(it); + delete clusts; + return res; + } + StringVector getComponents(String clusterId) { + string s=(string)clusterId; + vector *comps=shc_classifier->getComponents(&s); + StringVector res; + for(SHC_Component *c:*comps) res.push_back(c->getId()); + delete comps; + return res; + } + StringVector getAllComponents() { + unordered_map *comps=shc_classifier->getComponents(); + StringVector res; + for(pair it:*comps) res.push_back(it.first); + return res; + } + List getClusterContours(String clusterId) { + string s=(string)clusterId; + vector *clus_det=shc_classifier->getClassificationDetails(&s); + List res=List::create(); + for(SHC_Component_Details *comp_det:*clus_det) { + res[comp_det->parent->getId()]=wrap(*comp_det->single); + delete comp_det; + } + delete clus_det; + return res; + } + void cleanOutliers() { + shc_classifier->cleanOutliers(); + } + NumericMatrix getOutlierPositions() { + vector *outs=shc_classifier->getOutliers(); + NumericMatrix m(outs->size(),shc_classifier->getVirtualVariance()->size()); + int i=0; + for(SHC_Component *c:*outs) { + NumericVector mu(wrap(*c->getMean())); + m(i++,_)=mu; + } + delete outs; + return m; + } + List getComponentDetails(String componentId) { + string s=(string)componentId; + SHC_Component *comp=shc_classifier->getComponent(&s); + if(comp==NULL) return R_NilValue; + List res=List::create(); + res["component_id"]=componentId; + res["cluster_id"]=comp->getParent()->getId(); + res["dimensions"]=comp->getDimensions(); + res["is_outlier"]=comp->isOutlier(); + res["elements"]=comp->getElements(); + res["mean"]=wrap(*comp->getMean()); + MatrixXd *cov=comp->getCovariance(); + res["covariance"]=wrap(*cov); + delete cov; + res["is_cov_inverted"]=comp->isCovarianceInverted(); + res["has_baseline"]=comp->hasBaseline(); + res["is_obsolete"]=comp->isObsolete(); + if(comp->getDimensions()==2) { + 
SHC_Component_Details *cd1=comp->calculateDetails(shc_classifier->getTheta()); + res["component_contour"]=wrap(*cd1->single); + delete cd1; + if(comp->hasBaseline()) { + SHC_Component *bline=comp->getBaseline(); + SHC_Component_Details *cd2=bline->calculateDetails(shc_classifier->getTheta()); + res["component_baseline_contour"]=wrap(*cd2->single); + delete cd2; + } + } + return res; + } + double getTheta() { + return shc_classifier->getTheta(); + } + NumericVector getVirtualVariance() { + return wrap(*shc_classifier->getVirtualVariance()); + } + bool isTraceable(String fromComponentId,String toComponentId) { + string fcid=(string)fromComponentId; + set *tl=shc_classifier->traceComponentId(&fcid); + bool res=tl->find((string)toComponentId)!=tl->end(); + if(!res) { + Rcout << "Trace for " << fcid << ":"; + for(string str:*tl) Rcout << str << " "; + Rcout << endl; + } + delete tl; + return res; + } + StringVector getTrace(String fromComponentId) { + string fcid=(string)fromComponentId; + set *tl=shc_classifier->traceComponentId(&fcid); + StringVector res; + for(string str:*tl) res.push_back(str); + delete tl; + return res; + } + double getClusterWeight(String clusterId) { + long telms=shc_classifier->getTotalElements(true,true); + string clusId=(string)clusterId; + vector *comps=shc_classifier->getComponents(&clusId); + long tclus=0; + for(SHC_Component *comp:*comps) tclus+=comp->getElements(); + delete comps; + return telms>0 ? tclus/telms : (double)0; + } + double getComponentWeight(String componentId) { + long telms=shc_classifier->getTotalElements(true,true); + string compId=(string)componentId; + SHC_Component *comp=shc_classifier->getComponent(&compId); + long tcomp=comp->getElements(); + return telms>0 ? 
// NOTE(review): Continuation of the SHC_R Rcpp class from a mangled unified
// diff; C++ template argument lists were stripped by the extraction (e.g.
// "set *new_comps", "pair stats", "vector times", "unordered_map *hist",
// "SigmaIndex *st" all lost their <...> parameters). Restore exact types
// from the upstream repository; the usages below constrain them (e.g.
// "for(string it2:*new_comps)" implies set<string>; times is indexed
// [0..2] and assigned to numeric List entries, presumably vector<double>
// -- TODO confirm).
//
// Concrete findings in this span (verify upstream before patching):
//  * getComponentWeight() (head of this span): "tcomp" and "telms" are both
//    long, so "tcomp/telms" is integer division and the weight collapses to
//    0 for tcomp < telms -- same defect as getClusterWeight(); presumably
//    should be (double)tcomp/telms.
//  * getComponentWeight() dereferences comp via comp->getElements() without
//    a NULL check, while stream_MicroToMacro/stream_MicroToMicro below do
//    guard getComponent() returning NULL -- inconsistent; TODO confirm the
//    component is guaranteed to exist on this path.
//  * stream_MicroToMacro()/stream_MicroToMicro(): push 0 when no mapping is
//    found -- 0 appears to act as the "unassigned/outlier" sentinel here;
//    worth documenting upstream.
//  * getSigmaIndexHistogram(): builds a fixed 1x101 matrix for bins 0..100;
//    missing bins are filled with 0.
//  * calculateCovariance() (tail of this span, start of new file
//    src/Utils.cpp): the isInversion branch divides by sqrt(elements) with
//    no guard for elements == 0 (division by zero); TODO confirm callers
//    never pass 0. The "\ No newline at end of file" marker and the diff
//    header for src/Utils.cpp are embedded mid-line -- patch line structure
//    is destroyed, so this hunk is not mechanically applicable as-is.
tcomp/telms : (double)0; + } + int getClusterMapping(String clusId) { + return _getClusterMapping((string)clusId); + } + int getComponentMapping(String compId) { + return _getComponentMapping((string)compId); + } + IntegerVector stream_MicroToMacro(IntegerVector micro_vec) { + IntegerVector res; + for(int it:micro_vec) { + string *bk=_getBkComponentMapping(it); + if(bk!=NULL) { + SHC_Component *comp=shc_classifier->getComponent(bk); + if(comp!=NULL && comp->getParent()!=NULL) + res.push_back(_getClusterMapping(comp->getParent()->getId())); + else { + set *new_comps=shc_classifier->traceComponentId(bk); + if(new_comps->size()==0) res.push_back(0); + else { + bool done=false; + for(string it2:*new_comps) { + SHC_Component *nc=shc_classifier->getComponent(&it2); + if(nc!=NULL) { + int v2=_getClusterMapping(nc->getParent()->getId()); + if(v2>0 && !done) { + res.push_back(v2); + done=true; + } + } + } + if(!done) res.push_back(0); + } + delete new_comps; + } + delete bk; + } else res.push_back(0); + } + return res; + } + IntegerVector stream_MicroToMicro(int micro) { + IntegerVector res; + if(micro==0) return res; + string *bk=_getBkComponentMapping(micro); + if(bk!=NULL) { + SHC_Component *comp=shc_classifier->getComponent(bk); + if(comp==NULL) { + set *ncomps=shc_classifier->traceComponentId(bk); + for(string it2:*ncomps) { + int v2=_getComponentMapping(it2); + if(v2>0) res.push_back(v2); + } + delete ncomps; + } else { + int v2=_getComponentMapping(comp->getId()); + if(v2>0) res.push_back(v2); + } + delete bk; + } + return res; + } + List getStats() { + pair stats=shc_classifier->getStatistic(); + List res=List::create(); + res["components"]=stats.first; + res["outliers"]=stats.second; + return res; + } + void useSigmaIndex(int neigh_multiplier, bool precision_switch) { + shc_classifier->useSigmaIndex(neigh_multiplier, precision_switch); + } + void setPseudoOfflineCounter(int agglo_count) { + shc_classifier->setAgglomerationCount(agglo_count); + } + List getTimes() { 
+ vector times=shc_classifier->getTimes(); + List res=List::create(); + res["queryTime"]=times[0]; + res["updateTime"]=times[1]; + res["processTime"]=times[2]; + return res; + } + long getNodeCounter() { + return shc_classifier->getNodeCounter(); + } + double getSigmaIndexR() { + SigmaIndex *st=shc_classifier->getSigmaIndex(); + if(st==NULL) return (double)0; + SigmaIndexStatistics *st_stats=st->getStatistics(); + double R=st_stats->R; + delete st_stats; + return R; + } + NumericMatrix getSigmaIndexHistogram() { + SigmaIndex *st=shc_classifier->getSigmaIndex(); + if(st==NULL) return NumericMatrix(0,0); + unordered_map *hist=st->calculateHistogram(); + IntegerVector xv,yv; + for(int it=0;it<=100;it++) { + xv.push_back(it); + if(hist->find(it)!=hist->end()) yv.push_back((*hist)[it]); + else yv.push_back(0); + } + NumericMatrix res(1,101,yv.begin()); + colnames(res)=xv; + delete hist; + return res; + } + bool recheckOutlier(String id) { + string s_id=(string)id; + return shc_classifier->recheckOutlier(&s_id); + } + SHC_R *cloneSHC() { + return new SHC_R(this); + } + void clearEigenMPSupport() { + shc_classifier->clearEigenMPSupport(); + } +}; \ No newline at end of file diff --git a/src/Utils.cpp b/src/Utils.cpp new file mode 100644 index 0000000..4786646 --- /dev/null +++ b/src/Utils.cpp @@ -0,0 +1,118 @@ +#include "Utils.h" +using namespace Eigen; +using namespace Rcpp; + +VectorXd calculateNewMean(VectorXd mean,VectorXd newElement,int elements) { + VectorXd t1 = (newElement - mean) / (elements + 1); + VectorXd t2 = mean + t1; + return t2; +} + +MatrixXd calculateCovariance(VectorXd oldMean,VectorXd newMean,MatrixXd oldCovariance,int elements,VectorXd newElement,bool isInversion) { + if(!isInversion) { + MatrixXd B = (((newElement - oldMean) * (newElement - newMean).transpose()) - oldCovariance) / (elements + 1); + return oldCovariance + B; + } else { + MatrixXd u = (newElement-oldMean)/sqrt(elements); + MatrixXd vt = 
// NOTE(review): src/Utils.cpp body from a mangled unified diff. Besides the
// global stripping of C++ template argument lists ("as(mean1)" should be
// as<VectorXd>(mean1), "as(covariance1)" as<MatrixXd>(covariance1), etc.),
// this span has actual CONTENT LOSS: in "for(int i=0;i lu(covariance);"
// everything between "i<" and a later ">" was consumed as if it were a
// markup tag. The tail of constructVirtualCovariance's fill loop, the
// signature region of chooseUniOrMulti, and the head of
// isCovarianceInvertible are partially missing and cannot be reconstructed
// from this chunk alone ("lu(covariance)" with a later lu.isInvertible()
// call strongly suggests the lost text declared Eigen's
// FullPivLU<MatrixXd> lu -- TODO restore from upstream, do not guess).
//
// Concrete findings in the intact parts (verify upstream before patching):
//  * mahalanobisDistance(): returns sqrtf(N0) where N0 is a double --
//    sqrtf is the float overload, silently truncating to single precision;
//    std::sqrt would preserve double precision. Also N0 =
//    delta' * icovariance * delta can be negative for a non-PSD
//    icovariance, making sqrtf return NaN -- presumably callers guarantee
//    PSD input; TODO confirm.
//  * calculateMDSeparation(): divides th1/md1 and th2/md2 with no guard
//    against md1 or md2 being 0 (identical means -> distance 0 -> inf),
//    and the (p1+p2)==0 guard is an exact-equality compare on doubles.
//  * shc_MutualMinMahalanobis(): returns min(md1, md2) via
//    "md1<=md2 ? md1 : md2" -- std::min would be clearer.
//  * The diff header for the new file src/Utils.h is embedded mid-line at
//    the end of this span; its first two "#include" directives have lost
//    their <header> names to the same stripping.
((newElement-newMean)/sqrt(elements)).transpose(); + MatrixXd v1 = oldCovariance*u*vt*oldCovariance; + MatrixXd v2 = vt*oldCovariance*u; + MatrixXd res = oldCovariance-v1/((double)1+v2(0,0)); + double X = (double)elements/((double)elements+(double)1); + MatrixXd res2 = res/X; + return res2; + } +} + +MatrixXd constructVirtualCovariance(VectorXd virtualVariance) { + MatrixXd vcov = MatrixXd::Zero(virtualVariance.size(), virtualVariance.size()); + for(int i=0;i lu(covariance); + if(lu.isInvertible()) return covariance; + + MatrixXd vcov = constructVirtualCovariance(vvar); + for(int i=0;i lu(covariance); + return(lu.isInvertible()); +} + +MatrixXd invertCovariance(MatrixXd covariance) { + if(isCovarianceInvertible(covariance)) + return(covariance.inverse()); + else + return(MatrixXd::Zero(covariance.rows(),covariance.cols())); +} + +double mahalanobisDistance(VectorXd mean,VectorXd newElement,MatrixXd covariance,VectorXd virtualVariance, + int elements,bool isInversion) { + MatrixXd icovariance; + if(!isInversion) + icovariance = chooseUniOrMulti(covariance, virtualVariance, elements).inverse(); + else + icovariance = covariance; + VectorXd delta = newElement - mean; + double N0 = delta.transpose() * (icovariance * delta); + return sqrtf(N0); +} + +double calculateEuclidean(VectorXd x,VectorXd y) { + return (x - y).norm(); +} + +double calculateMDSeparation(VectorXd mean1,MatrixXd cov1,VectorXd vvar1,int elms1,bool isInv1, + VectorXd mean2,MatrixXd cov2,VectorXd vvar2,int elms2,bool isInv2,double th1,double th2) { + double md1 = mahalanobisDistance(mean1, mean2, cov1, vvar1, elms1, isInv1); + double p1 = th1 / md1; + double md2 = mahalanobisDistance(mean2, mean1, cov2, vvar2, elms2, isInv2); + double p2 = th2 / md2; + if((p1+p2)==0) return (double)0.0; + return 1/(p1+p2); +} + +// [[Rcpp::export]] +double shc_MDSeparation(NumericVector mean1, NumericMatrix covariance1, NumericVector virtualVariance1, int N1, bool isInversion1, double th1, + NumericVector mean2, 
NumericMatrix covariance2, NumericVector virtualVariance2, int N2, bool isInversion2, double th2) { + double mdsep = calculateMDSeparation(as(mean1),as(covariance1),as(virtualVariance1),N1,isInversion1, + as(mean2),as(covariance2),as(virtualVariance2),N2,isInversion2, + th1,th2); + return mdsep; +} + +// [[Rcpp::export]] +double shc_MutualMinMahalanobis(NumericVector mean1, NumericMatrix covariance1, NumericVector virtualVariance1, int N1, bool isInversion1, + NumericVector mean2, NumericMatrix covariance2, NumericVector virtualVariance2, int N2, bool isInversion2) { + double md1 = mahalanobisDistance(as(mean2),as(mean1),as(covariance2),as(virtualVariance2), + N2,isInversion2); + double md2 = mahalanobisDistance(as(mean1),as(mean2),as(covariance1),as(virtualVariance1), + N1,isInversion1); + return md1<=md2 ? md1 : md2; +} + +// [[Rcpp::export]] +NumericVector shc_CalculateNewMean(NumericVector mean, NumericVector newElement, int N) { + VectorXd res = calculateNewMean(as(mean),as(newElement),N); + NumericVector res_nm(wrap(res)); + return res_nm; +} + +// [[Rcpp::export]] +NumericMatrix shc_CalculateCovariance(NumericVector oldMean, NumericVector newMean, + NumericMatrix oldCovariance, int N, NumericVector newElement, + bool isInversion) { + MatrixXd mat_newCovariance = calculateCovariance(as(oldMean),as(newMean),as(oldCovariance), + N,as(newElement),isInversion); + NumericMatrix res(wrap(mat_newCovariance)); + return res; +} \ No newline at end of file diff --git a/src/Utils.h b/src/Utils.h new file mode 100644 index 0000000..a92c372 --- /dev/null +++ b/src/Utils.h @@ -0,0 +1,26 @@ +#ifndef SHC_H_Utils +#define SHC_H_Utils + +#include +#include +#include "RcppEigen.h" +using namespace Eigen; +using namespace Rcpp; + +VectorXd calculateNewMean(VectorXd mean,VectorXd newElement,int elements); +MatrixXd calculateCovariance(VectorXd oldMean,VectorXd newMean,MatrixXd oldCovariance,int elements,VectorXd newElement,bool isInversion); +MatrixXd 
// NOTE(review): Tail of the same mangled unified diff: the remaining
// src/Utils.h declarations, then a new R test file
// tests/testthat/test-DSC_SHC.R embedded mid-line. The header's first two
// "#include" directives (visible on the previous chunk line) lost their
// <...> header names to markup stripping; the visible declarations here are
// intact since they contain no template syntax. Utils.h declares
// calculateEuclidean and the Rcpp-exported shc_* entry points but (in the
// visible text) omits shc_MutualMinMahalanobis, which Utils.cpp defines and
// exports -- TODO confirm upstream whether that omission is deliberate
// (Rcpp::export generates its own declaration) or an oversight.
//
// Findings in the R test (reviewer notes; the test text below is kept
// verbatim because this is a patch, not directly editable source):
//  * "single_pass_update=T, use_outliers=T" uses T/F shorthand; R style
//    requires TRUE/FALSE since T is reassignable.
//  * The clusterer is bound to a variable named "c", shadowing base::c for
//    non-function lookup -- works here but is a known footgun.
//  * context() is deprecated in testthat 3rd edition; the file- based
//    context is preferred.
//  * "AggresiveAgglomeration" is misspelled (Aggressive) but presumably
//    matches the exported SHCAgglomerationType constant -- do not "fix" the
//    test without renaming the API symbol; TODO confirm.
//  * measure requests "outlierjaccard" (lower case) while the assertion
//    reads r$OutlierJaccard -- presumably the evaluator canonicalizes the
//    measure name; TODO confirm against evaluate_with_callbacks().
//  * expect_gt(r$cRand, 0.9) on a stochastic 20000-point stream guarded
//    only by set.seed(0) -- deterministic per RNG version, but brittle if
//    the default RNG changes; consider RNGversion() pinning upstream.
constructVirtualCovariance(VectorXd virtualVariance); +MatrixXd chooseUniOrMulti(MatrixXd covariance,VectorXd vvar,int elms); +double mahalanobisDistance(VectorXd mean,VectorXd newElement,MatrixXd covariance,VectorXd virtualVariance,int elements,bool isInversion); +double calculateEuclidean(VectorXd x,VectorXd y); +double calculateMDSeparation(VectorXd mean1,MatrixXd cov1,VectorXd vvar1,int elms1,bool isInv1,VectorXd mean2,MatrixXd cov2,VectorXd vvar2,int elms2,bool isInv2,double th1,double th2); +bool isCovarianceInvertible(MatrixXd covariance); +MatrixXd invertCovariance(MatrixXd covariance); + +double shc_MDSeparation(NumericVector mean1, NumericMatrix covariance1, NumericVector virtualVariance1, int N1, bool isInversion1, double th1, + NumericVector mean2, NumericMatrix covariance2, NumericVector virtualVariance2, int N2, bool isInversion2, double th2); +NumericVector shc_CalculateNewMean(NumericVector mean, NumericVector newElement, int N); +NumericMatrix shc_CalculateCovariance(NumericVector oldMean, NumericVector newMean, NumericMatrix oldCovariance, int N, + NumericVector newElement, bool isInversion); + +#endif /* SHC_H_Utils */ diff --git a/tests/testthat/test-DSC_SHC.R b/tests/testthat/test-DSC_SHC.R new file mode 100644 index 0000000..4c30fb6 --- /dev/null +++ b/tests/testthat/test-DSC_SHC.R @@ -0,0 +1,21 @@ +library("stream") +library("testthat") + +context("DSC_SHC") +set.seed(0) + +ds <- stream::DSD_Gaussians(k=10,outliers=10,separation_type="Mahalanobis", + separation=4,space_limit=c(0,150),variance_limit=8, + outlier_options=list(outlier_horizon=20000)) +c <- DSC_SHC.behavioral(2, SHCAgglomerationType$AggresiveAgglomeration, SHCDriftType$NoDrift, 0, sigmaIndex = TRUE, + sigmaIndexNeighborhood = 2) +c$RObj$setPseudoOfflineCounter(500) +r <- evaluate_with_callbacks(c, ds, n=20000, measure = c("cRand","queryTime","updateTime", + "processTime","nodeCount", + "computationCostReduction", "outlierjaccard"), + type="macro", 
callbacks=list(shc=SHCEvalCallback()), + single_pass_update=T, use_outliers=T) + + +expect_gt(r$cRand,0.9) +expect_gt(r$OutlierJaccard, 0.9)