-
Notifications
You must be signed in to change notification settings - Fork 37
binary segmentation incorrect for real data set #86
Copy link
Copy link
Open
Description
Related to #47 (incorrect results for small/trivial data).
This is an example of a real data set for which method="BinSeg" yields an incorrect result.
library(data.table)
data(neuroblastoma,package="neuroblastoma")
nb.profiles <- data.table(neuroblastoma[["profiles"]])
one.pid.chr <- nb.profiles[profile.id==2 & chromosome==2]
one.pid.chr[, data.i := .I]
data.vec <- one.pid.chr$logratio
N.data <- length(data.vec)
cum.data.vec <- cumsum(c(0,data.vec))
max.segs <- 5
max.changes <- max.segs-1
binseg.fit <- binsegRcpp::binseg_normal(data.vec, max.segs)
cpt.fit <- changepoint::cpt.mean(data.vec, method="BinSeg", Q=max.changes)
loss.dt.list <- list()
seg.dt.list <- list()
for(n.changes in 1:max.changes){
n.segs <- n.changes+1L
end.list <- list(
changepoint=c(N.data, cpt.fit@cpts.full[n.changes,]),
binsegRcpp=coef(binseg.fit, n.segs)$end
)
for(package in names(end.list)){
end <- sort(end.list[[package]])
seg.size <- diff(c(0,end))
seg.mean <- (cum.data.vec[end+1]-cum.data.vec[end-seg.size+1])/seg.size
data.mean <- rep(seg.mean, seg.size)
seg.dt.list[[paste(n.changes, package)]] <- data.table(
n.changes,
package,
start=end-seg.size+1L,
end,
mean=seg.mean)
loss.dt.list[[paste(n.changes, package)]] <- data.table(
n.changes, package, total.square.loss=sum((data.mean-data.vec)^2))
}
}
loss.dt <- do.call(rbind, loss.dt.list)
dcast(loss.dt, package ~ n.changes, value.var="total.square.loss")
seg.dt <- do.call(rbind, seg.dt.list)
loss.y <- c(
changepoint=5,
binsegRcpp=4)
library(ggplot2)
gg <- ggplot()+
theme_bw()+
scale_color_manual(values=c(binsegRcpp="red", changepoint="black"))+
scale_size_manual(values=c(binsegRcpp=1, changepoint=2))+
geom_text(aes(
40, loss.y[package], color=package,
label=sprintf("%s loss= %.2f", package, total.square.loss)),
data=loss.dt)+
scale_x_continuous(
"Position/index in data sequence",
breaks=seq(0,1000,by=2),
limits=c(0, nrow(one.pid.chr)+1))+
scale_y_continuous(
"DNA copy number logratio to segment")+
geom_segment(aes(
start-0.5, mean,
size=package,
color=package,
xend=end+0.5, yend=mean),
data=seg.dt)+
geom_vline(aes(
xintercept=start-0.5,
size=package,
color=package),
linetype="dashed",
data=seg.dt[1 < start])+
facet_grid(n.changes ~ ., labeller=label_both)+
geom_point(aes(
data.i, logratio),
color="grey50",
shape=21,
fill="white",
data=one.pid.chr)+
coord_cartesian(
expand=FALSE,
ylim=c(-1, 7),
xlim=c(15,75))
png("changepoint.bug.png", width=10, height=5, units="in", res=200)
print(gg)
dev.off()The result I got on my system is:

The binsegRcpp result is the same as other packages (wbs::sbs, fpop::multiBinSeg, ruptures).
The changepoint loss value is sometimes larger, sometimes smaller than binsegRcpp.
@rkillick any idea why this may be happening / how to fix?
Thanks!
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels