Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions R/data_preparation.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ auto_grouping <- function(data, input, target, n_groups, model="kmeans", seed=99

df_categ=categ_analysis(data, input , target)

d=select_(df_categ, "perc_target", "perc_rows")
d=dplyr::select(df_categ, dplyr::all_of(c("perc_target", "perc_rows")))

set.seed(seed)
if(model=="kmeans") {
Expand All @@ -44,12 +44,12 @@ auto_grouping <- function(data, input, target, n_groups, model="kmeans", seed=99
df_categ[, var_rec]=paste("group_", cluster_vec, sep = "")

## See new profiling based on new groups
data_rec=merge(select_(data, input, target), select_(df_categ, input, var_rec), by=input)
data_rec=merge(dplyr::select(data, dplyr::all_of(c(input, target))), dplyr::select(df_categ, dplyr::all_of(c(input, var_rec))), by=input)
recateg_results=categ_analysis(data_rec, var_rec, target)

l_res=list()
l_res$recateg_results=recateg_results
l_res$df_equivalence=arrange_(unique(select_(data_rec, input, var_rec)), var_rec)
l_res$df_equivalence=dplyr::arrange(unique(dplyr::select(data_rec, dplyr::all_of(c(input, var_rec)))), .data[[var_rec]])
l_res$fit_cluster=fit_cluster

return(l_res)
Expand Down
8 changes: 4 additions & 4 deletions R/discretize.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,10 @@ discretize_df <- function(data, data_bins, stringsAsFactors=T)

if(stringsAsFactors)
{
data_2b=data %>% mutate_at(vars(vars_num), conv_factor)
data_3=data_2b %>% mutate_at(vars(vars_num), funs(factor(replace(., is.na(.), "NA."))))
data_2b=data %>% dplyr::mutate(dplyr::across(dplyr::all_of(vars_num), conv_factor))
data_3=data_2b %>% dplyr::mutate(dplyr::across(dplyr::all_of(vars_num), ~factor(replace(., is.na(.), "NA."))))
} else {
data_3=data %>% mutate_at(vars(vars_num), funs(ifelse(is.na(.), "NA.", .)))
data_3=data %>% dplyr::mutate(dplyr::across(dplyr::all_of(vars_num), ~ifelse(is.na(.), "NA.", .)))
}

message(sprintf("Variables processed: %s", paste(vars_num, collapse = ", ")))
Expand Down Expand Up @@ -213,7 +213,7 @@ convert_df_to_categoric <- function(data, n_bins)
data_cat=discretize_df(data = data, data_bins = d_cuts, stringsAsFactors = F)

# Converting remaining variables
data_cat_2=data_cat %>% mutate_all(as.character)
data_cat_2=data_cat %>% dplyr::mutate(dplyr::across(dplyr::everything(), as.character))

return(data_cat_2)
}
Expand Down
2 changes: 1 addition & 1 deletion R/information_theory.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ entropy_2 <- function(input, target)

# get partial entropy
df_tbl=as.data.frame.matrix(tbl)
res_entropy=data.frame(t(df_tbl)) %>% mutate_all(funs(entropy(., unit = "log2"))) %>% head(.,1)
res_entropy=data.frame(t(df_tbl)) %>% dplyr::mutate(dplyr::across(dplyr::everything(), ~entropy(., unit = "log2"))) %>% head(.,1)

# computing total entropy
total_en=sum(probs_input*res_entropy)
Expand Down
6 changes: 3 additions & 3 deletions R/models_lib.R
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ desc_groups <- function(data, group_var, group_func=mean, add_all_data_row=T)
stat=status(data)
vars_to_keep=stat[stat$type %in% c("integer", "numeric") & stat$variable != group_var, "variable"]

grp_mean=data %>% group_by_(group_var) %>% summarise_each_(funs(group_func), vars_to_keep) %>% mutate_each_(funs(round(.,2)), vars_to_keep)
grp_mean=data %>% dplyr::group_by(dplyr::across(dplyr::all_of(group_var))) %>% dplyr::summarise(dplyr::across(dplyr::all_of(vars_to_keep), group_func), .groups = "drop") %>% dplyr::mutate(dplyr::across(dplyr::all_of(vars_to_keep), ~round(., 2)))
grp_mean=data.frame(grp_mean)

grp_mean[,group_var]=as.character(grp_mean[,group_var])
Expand All @@ -149,7 +149,7 @@ desc_groups <- function(data, group_var, group_func=mean, add_all_data_row=T)

# vars_to_keep holds all numeric variables (excluding group_var and factor/character columns). Compute the 'All_Data' value per column by applying group_func
data_num=select(data, one_of(vars_to_keep))
b=as.data.frame(data_num) %>% summarise_each(funs(group_func))
b=as.data.frame(data_num) %>% dplyr::summarise(dplyr::across(dplyr::everything(), group_func))

## putting it all together: the summarization per group plus the total per column
all_results=rbind(a, b)
Expand Down Expand Up @@ -194,7 +194,7 @@ desc_groups_rank <- function(data, group_var, group_func=mean)
vars_to_group=all_col[all_col!=group_var]

# rank (dense, descending) only the columns listed in vars_to_group; group_var is left untouched
d_group_rank=d_group %>% mutate_each_(funs(dense_rank(desc(.))), vars_to_group)
d_group_rank=d_group %>% dplyr::mutate(dplyr::across(dplyr::all_of(vars_to_group), ~dplyr::dense_rank(dplyr::desc(.))))

return(d_group_rank)
}
Expand Down
17 changes: 9 additions & 8 deletions R/target_profiling.R
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ get_target_plot <- function(data, input, target, plot_type)

histdens_target <- function(data, input, target)
{
cdf=group_by_(data, target) %>% summarise_(var.mean=interp(~mean(v, na.rm=T), v=as.name(input)))
cdf=data %>% dplyr::group_by(dplyr::across(dplyr::all_of(target))) %>% dplyr::summarise(var.mean = mean(.data[[input]], na.rm=TRUE), .groups = "drop")

cdf$var.mean=round(cdf$var.mean, 2)

Expand Down Expand Up @@ -167,13 +167,14 @@ categ_analysis_logic <- function(data, input, target)
tot_pos=sum(data[,target]==pred_class)

## profiling
grp=group_by_(data, input) %>% summarise_(
mean_target=interp(~round(mean(var==pred_class, na.rm = TRUE), 3), var = as.name(target)),
sum_target=interp(~sum(var==pred_class, na.rm = TRUE), var = as.name(target)),
perc_target=interp(~round(sum(var==pred_class, na.rm = TRUE)/tot_pos,3), var = as.name(target)),
q_rows=~n(),
perc_rows=~round(n()/nrow(data), 3)
) %>% arrange(-mean_target)
grp=data %>% dplyr::group_by(dplyr::across(dplyr::all_of(input))) %>% dplyr::summarise(
mean_target=round(mean(.data[[target]]==pred_class, na.rm = TRUE), 3),
sum_target=sum(.data[[target]]==pred_class, na.rm = TRUE),
perc_target=round(sum(.data[[target]]==pred_class, na.rm = TRUE)/tot_pos,3),
q_rows=dplyr::n(),
perc_rows=round(dplyr::n()/nrow(data), 3),
.groups = "drop"
) %>% dplyr::arrange(-mean_target)

#colnames(grp)[colnames(grp)=='sum_target']=paste("sum", target, sep="_")
#colnames(grp)[colnames(grp)=='perc_target']=paste("perc", target, sep="_")
Expand Down