|
| 1 | + |
| 2 | +#Auxiliary function to plot the relationship between clustering variables |
| 3 | +createCorrelationPlots <- function(filename, df, sampleSize = 500) { |
| 4 | + if(!is.null(filename)){ |
| 5 | + corrPlot <- ggpairs(sample_n(df, sampleSize), aes(col="red", alpha=0.4), diag=list(continuous="density"), axisLabels='show') |
| 6 | + png(filename, height=1500, width=1500) |
| 7 | + print(corrPlot) |
| 8 | + dev.off() |
| 9 | + } |
| 10 | +} |
| 11 | + |
| 12 | + |
| 13 | +#' Function to create clusters and a standard report of the mean of some variables |
| 14 | +#' The clustering is performed using kmeans algorith with the euclidean distance |
| 15 | +#' Character and factor variables will be dummified before entering the clustering |
| 16 | +#' As kmeans does not admit Infinite or NA values, they are na.roughfix'd (replaced by the median) |
| 17 | +#' |
| 18 | +#' |
| 19 | +#' @param dt Master table with the prediction |
| 20 | +#' @param clusteringVarnames Names of the variables that will be used for the clustering |
| 21 | +#' @param shownVarnames Variables that will be included in the report but not used for the clustering |
| 22 | +#' @param nClusters Number of clusters that will be created |
| 23 | +#' @param scoreColName Name of the column that contains the model score, used for clustering only the highest values |
| 24 | +#' @param nRowsToCluster Number of rows that will be considered for the clustering. Incompatible with "percRowsToCluster" |
| 25 | +#' @param percRowsToCluster Percentage of rows that will be considered for the clustering. Incompatible with "nRowsToCluster" |
| 26 | +#' @param scaleData Whether to scale data before clustering or not (subtract mean and divide by sd). |
| 27 | +#' Recommended to set as true as euclidean distance is sensitive to order of magnitude of data |
| 28 | +#' @param file Name of the file where the report will be saved (must be an xlsx file) |
| 29 | +#' @param template Excel template where the format and coloring of the output file is saved |
| 30 | +#' @param plotFilename Name of the file where a correlation plot of the different clustering variables will be saved |
| 31 | +#' The output will be saved as png. If NULL no plotting will be done |
| 32 | +#' @param plotSampleSize Sample size that will be used for the plot, as most likely all data won't be manageable |
| 33 | +#' @param verbose If TRUE, prints some information about its execution |
| 34 | +#' |
| 35 | +#' @return returns a list with two elements: |
| 36 | +#' clusteredData: The data that has been clustered with the clustering information |
| 37 | +#' report: The summary report of the clusters |
| 38 | +#' @export |
| 39 | +#' |
| 40 | +#' @examples |
| 41 | +kMeansClustering <- function(dt, |
| 42 | + clusteringVarnames, |
| 43 | + shownVarnames, |
| 44 | + nClusters, |
| 45 | + scoreColName, |
| 46 | + nRowsToCluster = NULL, |
| 47 | + percRowsToCluster = NULL, |
| 48 | + scaleData = TRUE, |
| 49 | + file = NULL, |
| 50 | + template = NULL, |
| 51 | + plotFilename = NULL, |
| 52 | + plotSampleSize = 500, |
| 53 | + verbose = TRUE){ |
| 54 | + |
| 55 | + originalColNames <- colnames(dt) |
| 56 | + setorderv(dt, scoreColName, order = -1) |
| 57 | + |
| 58 | + #Error control |
| 59 | + if(!is.null(nRowsToCluster) & !is.null(percRowsToCluster)){ |
| 60 | + stop("Only one of the following nRowsToCluster or percRowsToCluster should be not NULL") |
| 61 | + } |
| 62 | + if(!is.null(nRowsToCluster)){ |
| 63 | + rowsForClustering <- nRowsToCluster |
| 64 | + }else if(!is.null(percRowsToCluster)){ |
| 65 | + rowsForClustering <- ceiling(percRowsToCluster * nrow(dt)) |
| 66 | + }else{ |
| 67 | + stop("Only one of the following nRowsToCluster or percRowsToCluster should be not NULL") |
| 68 | + } |
| 69 | + #We cant add to the clustering more variables than the table has |
| 70 | + rowsForClustering <- min(nrow(dt), rowsForClustering) |
| 71 | + |
| 72 | + ####################################################################### |
| 73 | + ### Dummify factor and character variables |
| 74 | + ####################################################################### |
| 75 | + tic() |
| 76 | + cond_cat(verbose, "Dummifying factors and characters...\n") |
| 77 | + |
| 78 | + varsToTreat <- unique(c(clusteringVarnames, shownVarnames)) |
| 79 | + |
| 80 | + varsToDummify <- intersect(varsToTreat, colnames(dt)[sapply(dt,class) %in% c("factor","character")]) |
| 81 | + numericVars <- setdiff(varsToTreat, varsToDummify) |
| 82 | + |
| 83 | + #Store the new dummified variables names so that we add them to the clustering afterwards |
| 84 | + dummifiedVars <- c() |
| 85 | + dummifiedVarsForClustering <- c() |
| 86 | + for(varToDummify in varsToDummify){ |
| 87 | + vals <- unique(dt[[varToDummify]]) |
| 88 | + dt[, (varToDummify %+% "_" %+% vals) := lapply(vals, function(x){as.numeric(get(varToDummify) == x)})] |
| 89 | + dummifiedVars <- c(dummifiedVars, varToDummify %+% "_" %+% vals) |
| 90 | + if(varToDummify %in% clusteringVarnames){ |
| 91 | + dummifiedVarsForClustering <- c(dummifiedVarsForClustering, varToDummify %+% "_" %+% vals) |
| 92 | + } |
| 93 | + } |
| 94 | + |
| 95 | + #Add the dummy variables to the clustering. We put them in this patricular order so that all clustering variables are at the beginning |
| 96 | + varsToTreat <- c(dummifiedVarsForClustering, numericVars, setdiff(dummifiedVars, dummifiedVarsForClustering)) |
| 97 | + clusteringVarnames <- c(dummifiedVarsForClustering, |
| 98 | + intersect(numericVars, clusteringVarnames)) |
| 99 | + toc(quiet = !verbose) |
| 100 | + |
| 101 | + ####################################################################### |
| 102 | + ### Clustering |
| 103 | + ####################################################################### |
| 104 | + |
| 105 | + #Only clusterize the top N rows |
| 106 | + topClusterized <- dt[1:rowsForClustering] |
| 107 | + dataToKmeans <- topClusterized[, clusteringVarnames, with = FALSE] |
| 108 | + |
| 109 | + tic() |
| 110 | + cond_cat(verbose, "Removing infinite values...\n") |
| 111 | + dataToKmeans <- dataToKmeans[, lapply(.SD, function(x){ifelse(is.infinite(x), NA, x)})] |
| 112 | + toc(quiet = !verbose) |
| 113 | + |
| 114 | + tic() |
| 115 | + cond_cat(verbose, "Checking columns quality...\n") |
| 116 | + x <- sapply(clusteringVarnames, function(x){uniqueN(dataToKmeans[[x]], na.rm = T)}) |
| 117 | + if(any(x == 1)){ |
| 118 | + stop("Variables: " %+% paste(names(x)[x==1], collapse = ", ") %+% " have at most one non-NA value, which makes it unable to do kmeans on them, please run the clustering without these variables.") |
| 119 | + } |
| 120 | + toc(quiet = !verbose) |
| 121 | + |
| 122 | + if(scaleData){ |
| 123 | + tic() |
| 124 | + cond_cat(verbose, "Scaling data...\n") |
| 125 | + dataToKmeans <- scale(dataToKmeans) |
| 126 | + toc(quiet = !verbose) |
| 127 | + } |
| 128 | + |
| 129 | + if(!is.null(plotFilename)){ |
| 130 | + tic() |
| 131 | + cond_cat(verbose, "Plotting correlations...\n") |
| 132 | + createCorrelationPlots(plotFilename, topClusterized[, clusteringVarnames, with = FALSE], sampleSize = plotSampleSize) |
| 133 | + toc(quiet = !verbose) |
| 134 | + } |
| 135 | + |
| 136 | + tic() |
| 137 | + cond_cat(verbose, "Running kmeans...\n") |
| 138 | + set.seed(1804) |
| 139 | + auxKMeans <- kmeans(na.roughfix(dataToKmeans), centers= nClusters) |
| 140 | + |
| 141 | + #We will redefine the clusters numeration so that cluster 1 is the smallest and cluster N is the largest |
| 142 | + #This is done because the reports orders the clusters by size |
| 143 | + clustersNewOrder <- names(sort(table(auxKMeans$cluster))) #This sorts the clusters by size |
| 144 | + topClusterized[, cluster := "Cluster_" %+% str_pad(mapvalues(auxKMeans$cluster, clustersNewOrder, seq_len(nClusters)), width = 2, side = "left", pad = "0")] |
| 145 | + |
| 146 | + toc(quiet = !verbose) |
| 147 | + print(paste0("template file exists: ", file.exists(template))) |
| 148 | + clustersSummary <- summarizeClusters(clusteredTable = topClusterized, |
| 149 | + varsToShow = varsToTreat, |
| 150 | + file = file, |
| 151 | + template = template, |
| 152 | + allPopulationTable = dt, |
| 153 | + clusteringVarnames = clusteringVarnames, |
| 154 | + verbose = verbose) |
| 155 | + |
| 156 | + |
| 157 | + return(list(clusteredData = topClusterized, report = clustersSummary)) |
| 158 | +} |
| 159 | + |
| 160 | +#' Function that creates a summary report with the mean of some variables for each cluster |
| 161 | +#' |
| 162 | +#' @param clusteredTable Data that has been clustered |
| 163 | +#' @param varsToShow Names of the variables that will be shown in the report |
| 164 | +#' @param file Name of the file where the report will be saved (must be an xlsx file) |
| 165 | +#' @param template Excel template where the format and coloring of the output file is saved |
| 166 | +#' @param allPopulationTable Table with info about all the data. |
| 167 | +#' Useful when not all the data has been clusteredand want to compare the clustered data vs all the population |
| 168 | +#' @param clusterColName Name of the column that contains the cluster |
| 169 | +#' @param clusteredTableName Name that will be given in the report to the clustered Table |
| 170 | +#' @param clusteringVarnames Variables that were used for the clustering. If any, a column will be added to the report telling if the variable was used for the clustering or its just being displayed. |
| 171 | +#' @param verbose If TRUE, prints some information about its execution |
| 172 | +#' |
| 173 | +#' @return Returns the clusters summary report |
| 174 | +#' @export |
| 175 | +#' |
| 176 | +#' @examples |
| 177 | +summarizeClusters <- function(clusteredTable, |
| 178 | + varsToShow, |
| 179 | + file = NULL, |
| 180 | + template = NULL, |
| 181 | + allPopulationTable = NULL, |
| 182 | + clusterColName = "cluster", |
| 183 | + clusteredTableName = "Top Risky", |
| 184 | + clusteringVarnames = c(), |
| 185 | + verbose = FALSE |
| 186 | +){ |
| 187 | + |
| 188 | + tic() |
| 189 | + cond_cat(verbose, "Creating clusters report...\n") |
| 190 | + |
| 191 | + clusteredTable[, clusterSize := .N, by=.(cluster)] |
| 192 | + clusteredTable[, clusterPercentage := .N / nrow(clusteredTable), by = cluster] |
| 193 | + |
| 194 | + if(!is.null(allPopulationTable)){ |
| 195 | + allPopulationTable[, clusterSize := nrow(allPopulationTable)] |
| 196 | + allPopulationTable[, cluster := "Total Population"] |
| 197 | + allPopulationTable[, clusterPercentage := 1] |
| 198 | + } |
| 199 | + unclusteredTable <- copy(clusteredTable) |
| 200 | + unclusteredTable[, clusterSize := nrow(unclusteredTable)] |
| 201 | + unclusteredTable[, cluster := clusteredTableName] |
| 202 | + unclusteredTable[, clusterPercentage := 1] |
| 203 | + |
| 204 | + masterTable <- rbind(clusteredTable, unclusteredTable, allPopulationTable, fill=TRUE) |
| 205 | + |
| 206 | + clustersSummary <- masterTable[, lapply(.SD, mean, na.rm=TRUE), by=.(cluster), .SDcols = c("clusterSize", "clusterPercentage", varsToShow)] |
| 207 | + setorder(clustersSummary, clusterSize) |
| 208 | + #Transpose to make the data more readable |
| 209 | + varNames <- colnames(clustersSummary) |
| 210 | + clustersSummary <- transpose(clustersSummary) |
| 211 | + clustersSummary <- cbind(data.table(Cluster = varNames), clustersSummary) |
| 212 | + clustersSummary[, sapply(.SD, first)] |
| 213 | + setnames(clustersSummary, sapply(clustersSummary, first)) |
| 214 | + clustersSummary <- clustersSummary[-1] |
| 215 | + clustersSummary[, (colnames(clustersSummary)[-1]) := lapply(.SD, as.numeric), .SDcols = colnames(clustersSummary)[-1]] |
| 216 | + if(length(clusteringVarnames) != 0){ |
| 217 | + clustersSummary[, usedForClustering := ifelse(varNames[-1] %in% clusteringVarnames, "Yes", "No")] |
| 218 | + } |
| 219 | + |
| 220 | + saveClustersSummary(clustersSummary = clustersSummary, |
| 221 | + file = file, |
| 222 | + template = template) |
| 223 | + |
| 224 | + toc(quiet = !verbose) |
| 225 | + |
| 226 | + return(clustersSummary) |
| 227 | + |
| 228 | +} |
| 229 | + |
| 230 | +#' Saves the clusters summary report to an Excel file based on a template |
| 231 | +#' |
| 232 | +#' @param clustersSummary Clusters report (output from summarizeClusters) |
| 233 | +#' @param file Name of the file where the report will be saved (must be an xlsx file) |
| 234 | +#' @param template Excel template where the format and coloring of the output file is saved |
| 235 | +#' |
| 236 | +#' @return Nothing |
| 237 | +#' @export |
| 238 | +#' |
| 239 | +#' @examples |
| 240 | +saveClustersSummary <- function(clustersSummary, |
| 241 | + file = NULL, |
| 242 | + template = NULL){ |
| 243 | + |
| 244 | + |
| 245 | + if (!is.null(file)) { |
| 246 | + if(!is.null(template)){ |
| 247 | + if(!file.exists(template)){ |
| 248 | + warning("The provided template does not exist, running without template") |
| 249 | + wb <- XLConnect::loadWorkbook(file, create=TRUE) |
| 250 | + }else{ |
| 251 | + wb <- XLConnect::loadWorkbook(template, create=TRUE) |
| 252 | + } |
| 253 | + }else{ |
| 254 | + wb <- XLConnect::loadWorkbook(file, create=TRUE) |
| 255 | + } |
| 256 | + XLConnect::createSheet(wb,name="clustersSummary") |
| 257 | + |
| 258 | + setStyleAction(wb,XLC$STYLE_ACTION.NONE) |
| 259 | + |
| 260 | + XLConnect::writeWorksheet(wb,clustersSummary,sheet="clustersSummary", startRow=1, startCol=1) |
| 261 | + cs <- createCellStyle(wb) |
| 262 | + for(row in seq_len(nrow(clustersSummary))){ |
| 263 | + if(all(sapply(clustersSummary[row, -c("cluster", "usedForClustering"), with = FALSE], function(x){x %between% c(0,1)}))){ |
| 264 | + XLConnect::setCellStyle(wb, "clustersSummary!B" %+% (row+1) %+% ":ZZ" %+% (row+1), cellstyle = cs) |
| 265 | + } |
| 266 | + } |
| 267 | + |
| 268 | + XLConnect::setBorder(cs, side = "all", type = XLC$BORDER.THIN, color = XLC$COLOR.BLACK) |
| 269 | + XLConnect::setDataFormat(cs, "0%") |
| 270 | + |
| 271 | + XLConnect::saveWorkbook(wb, file=file) |
| 272 | + |
| 273 | + } |
| 274 | + |
| 275 | + return(invisible(NULL)) |
| 276 | +} |
| 277 | + |
| 278 | + |
| 279 | + |
| 280 | + |
| 281 | +#--------------------------------------------------------------------------------------- |
| 282 | + |
| 283 | +`%gn%` <- function(x, y) { |
| 284 | + grep(y, ignore.case = T, x = names(x)) |
| 285 | +} |
| 286 | + |
| 287 | +#' Wrapper for grepping values. Is NOT case sensitive |
| 288 | +#' |
| 289 | +#' @param x String vector |
| 290 | +#' @param y Pattern |
| 291 | +#' |
| 292 | +#' @return Elements of string that fit the pattern |
| 293 | +#' @export |
| 294 | +#' |
| 295 | +#' @examples |
| 296 | +#' |
| 297 | +#' c("hola", "adios", "cocacola") %gv% "ola" |
| 298 | +#' |
| 299 | +`%gv%` <- function(x, y) { |
| 300 | + grep(y, ignore.case = T, x = x, value = T) |
| 301 | +} |
| 302 | + |
| 303 | +`%g%` <- function(x, y) { |
| 304 | + grep(y, ignore.case = T, x = x) |
| 305 | +} |
| 306 | + |
| 307 | +#' Wrapper for paste0. Easy, simple and fast way to concatenate two strings. |
| 308 | +#' |
| 309 | +#' @param x |
| 310 | +#' @param y |
| 311 | +#' |
| 312 | +#' @return |
| 313 | +#' @export |
| 314 | +#' |
| 315 | +#' @examples |
| 316 | +`%+%` <- function(x, y) { |
| 317 | + paste0(x,y) |
| 318 | +} |
| 319 | + |
| 320 | +#' Conditional cat |
| 321 | +#' |
| 322 | +#' @param condFlag If TRUE, message in printed, otherwise not |
| 323 | +#' @param ... Message |
| 324 | +#' |
| 325 | +#' @return |
| 326 | +#' @export |
| 327 | +#' |
| 328 | +#' @examples |
| 329 | +cond_cat <- function(condFlag = TRUE, ...){ |
| 330 | + if(condFlag){ |
| 331 | + cat(...) |
| 332 | + } |
| 333 | +} |
0 commit comments