Skip to content

Commit 95b4c0c

Browse files
authored
Create utilities
1 parent b4d5cdd commit 95b4c0c

File tree

1 file changed

+333
-0
lines changed

1 file changed

+333
-0
lines changed

utilities

+333
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,333 @@
1+
2+
#Auxiliary function to plot the relationship between clustering variables
3+
createCorrelationPlots <- function(filename, df, sampleSize = 500) {
4+
if(!is.null(filename)){
5+
corrPlot <- ggpairs(sample_n(df, sampleSize), aes(col="red", alpha=0.4), diag=list(continuous="density"), axisLabels='show')
6+
png(filename, height=1500, width=1500)
7+
print(corrPlot)
8+
dev.off()
9+
}
10+
}
11+
12+
13+
#' Function to create clusters and a standard report of the mean of some variables
14+
#' The clustering is performed using kmeans algorith with the euclidean distance
15+
#' Character and factor variables will be dummified before entering the clustering
16+
#' As kmeans does not admit Infinite or NA values, they are na.roughfix'd (replaced by the median)
17+
#'
18+
#'
19+
#' @param dt Master table with the prediction
20+
#' @param clusteringVarnames Names of the variables that will be used for the clustering
21+
#' @param shownVarnames Variables that will be included in the report but not used for the clustering
22+
#' @param nClusters Number of clusters that will be created
23+
#' @param scoreColName Name of the column that contains the model score, used for clustering only the highest values
24+
#' @param nRowsToCluster Number of rows that will be considered for the clustering. Incompatible with "percRowsToCluster"
25+
#' @param percRowsToCluster Percentage of rows that will be considered for the clustering. Incompatible with "nRowsToCluster"
26+
#' @param scaleData Whether to scale data before clustering or not (subtract mean and divide by sd).
27+
#' Recommended to set as true as euclidean distance is sensitive to order of magnitude of data
28+
#' @param file Name of the file where the report will be saved (must be an xlsx file)
29+
#' @param template Excel template where the format and coloring of the output file is saved
30+
#' @param plotFilename Name of the file where a correlation plot of the different clustering variables will be saved
31+
#' The output will be saved as png. If NULL no plotting will be done
32+
#' @param plotSampleSize Sample size that will be used for the plot, as most likely all data won't be manageable
33+
#' @param verbose If TRUE, prints some information about its execution
34+
#'
35+
#' @return returns a list with two elements:
36+
#' clusteredData: The data that has been clustered with the clustering information
37+
#' report: The summary report of the clusters
38+
#' @export
39+
#'
40+
#' @examples
41+
kMeansClustering <- function(dt,
42+
clusteringVarnames,
43+
shownVarnames,
44+
nClusters,
45+
scoreColName,
46+
nRowsToCluster = NULL,
47+
percRowsToCluster = NULL,
48+
scaleData = TRUE,
49+
file = NULL,
50+
template = NULL,
51+
plotFilename = NULL,
52+
plotSampleSize = 500,
53+
verbose = TRUE){
54+
55+
originalColNames <- colnames(dt)
56+
setorderv(dt, scoreColName, order = -1)
57+
58+
#Error control
59+
if(!is.null(nRowsToCluster) & !is.null(percRowsToCluster)){
60+
stop("Only one of the following nRowsToCluster or percRowsToCluster should be not NULL")
61+
}
62+
if(!is.null(nRowsToCluster)){
63+
rowsForClustering <- nRowsToCluster
64+
}else if(!is.null(percRowsToCluster)){
65+
rowsForClustering <- ceiling(percRowsToCluster * nrow(dt))
66+
}else{
67+
stop("Only one of the following nRowsToCluster or percRowsToCluster should be not NULL")
68+
}
69+
#We cant add to the clustering more variables than the table has
70+
rowsForClustering <- min(nrow(dt), rowsForClustering)
71+
72+
#######################################################################
73+
### Dummify factor and character variables
74+
#######################################################################
75+
tic()
76+
cond_cat(verbose, "Dummifying factors and characters...\n")
77+
78+
varsToTreat <- unique(c(clusteringVarnames, shownVarnames))
79+
80+
varsToDummify <- intersect(varsToTreat, colnames(dt)[sapply(dt,class) %in% c("factor","character")])
81+
numericVars <- setdiff(varsToTreat, varsToDummify)
82+
83+
#Store the new dummified variables names so that we add them to the clustering afterwards
84+
dummifiedVars <- c()
85+
dummifiedVarsForClustering <- c()
86+
for(varToDummify in varsToDummify){
87+
vals <- unique(dt[[varToDummify]])
88+
dt[, (varToDummify %+% "_" %+% vals) := lapply(vals, function(x){as.numeric(get(varToDummify) == x)})]
89+
dummifiedVars <- c(dummifiedVars, varToDummify %+% "_" %+% vals)
90+
if(varToDummify %in% clusteringVarnames){
91+
dummifiedVarsForClustering <- c(dummifiedVarsForClustering, varToDummify %+% "_" %+% vals)
92+
}
93+
}
94+
95+
#Add the dummy variables to the clustering. We put them in this patricular order so that all clustering variables are at the beginning
96+
varsToTreat <- c(dummifiedVarsForClustering, numericVars, setdiff(dummifiedVars, dummifiedVarsForClustering))
97+
clusteringVarnames <- c(dummifiedVarsForClustering,
98+
intersect(numericVars, clusteringVarnames))
99+
toc(quiet = !verbose)
100+
101+
#######################################################################
102+
### Clustering
103+
#######################################################################
104+
105+
#Only clusterize the top N rows
106+
topClusterized <- dt[1:rowsForClustering]
107+
dataToKmeans <- topClusterized[, clusteringVarnames, with = FALSE]
108+
109+
tic()
110+
cond_cat(verbose, "Removing infinite values...\n")
111+
dataToKmeans <- dataToKmeans[, lapply(.SD, function(x){ifelse(is.infinite(x), NA, x)})]
112+
toc(quiet = !verbose)
113+
114+
tic()
115+
cond_cat(verbose, "Checking columns quality...\n")
116+
x <- sapply(clusteringVarnames, function(x){uniqueN(dataToKmeans[[x]], na.rm = T)})
117+
if(any(x == 1)){
118+
stop("Variables: " %+% paste(names(x)[x==1], collapse = ", ") %+% " have at most one non-NA value, which makes it unable to do kmeans on them, please run the clustering without these variables.")
119+
}
120+
toc(quiet = !verbose)
121+
122+
if(scaleData){
123+
tic()
124+
cond_cat(verbose, "Scaling data...\n")
125+
dataToKmeans <- scale(dataToKmeans)
126+
toc(quiet = !verbose)
127+
}
128+
129+
if(!is.null(plotFilename)){
130+
tic()
131+
cond_cat(verbose, "Plotting correlations...\n")
132+
createCorrelationPlots(plotFilename, topClusterized[, clusteringVarnames, with = FALSE], sampleSize = plotSampleSize)
133+
toc(quiet = !verbose)
134+
}
135+
136+
tic()
137+
cond_cat(verbose, "Running kmeans...\n")
138+
set.seed(1804)
139+
auxKMeans <- kmeans(na.roughfix(dataToKmeans), centers= nClusters)
140+
141+
#We will redefine the clusters numeration so that cluster 1 is the smallest and cluster N is the largest
142+
#This is done because the reports orders the clusters by size
143+
clustersNewOrder <- names(sort(table(auxKMeans$cluster))) #This sorts the clusters by size
144+
topClusterized[, cluster := "Cluster_" %+% str_pad(mapvalues(auxKMeans$cluster, clustersNewOrder, seq_len(nClusters)), width = 2, side = "left", pad = "0")]
145+
146+
toc(quiet = !verbose)
147+
print(paste0("template file exists: ", file.exists(template)))
148+
clustersSummary <- summarizeClusters(clusteredTable = topClusterized,
149+
varsToShow = varsToTreat,
150+
file = file,
151+
template = template,
152+
allPopulationTable = dt,
153+
clusteringVarnames = clusteringVarnames,
154+
verbose = verbose)
155+
156+
157+
return(list(clusteredData = topClusterized, report = clustersSummary))
158+
}
159+
160+
#' Function that creates a summary report with the mean of some variables for each cluster
161+
#'
162+
#' @param clusteredTable Data that has been clustered
163+
#' @param varsToShow Names of the variables that will be shown in the report
164+
#' @param file Name of the file where the report will be saved (must be an xlsx file)
165+
#' @param template Excel template where the format and coloring of the output file is saved
166+
#' @param allPopulationTable Table with info about all the data.
167+
#' Useful when not all the data has been clusteredand want to compare the clustered data vs all the population
168+
#' @param clusterColName Name of the column that contains the cluster
169+
#' @param clusteredTableName Name that will be given in the report to the clustered Table
170+
#' @param clusteringVarnames Variables that were used for the clustering. If any, a column will be added to the report telling if the variable was used for the clustering or its just being displayed.
171+
#' @param verbose If TRUE, prints some information about its execution
172+
#'
173+
#' @return Returns the clusters summary report
174+
#' @export
175+
#'
176+
#' @examples
177+
summarizeClusters <- function(clusteredTable,
178+
varsToShow,
179+
file = NULL,
180+
template = NULL,
181+
allPopulationTable = NULL,
182+
clusterColName = "cluster",
183+
clusteredTableName = "Top Risky",
184+
clusteringVarnames = c(),
185+
verbose = FALSE
186+
){
187+
188+
tic()
189+
cond_cat(verbose, "Creating clusters report...\n")
190+
191+
clusteredTable[, clusterSize := .N, by=.(cluster)]
192+
clusteredTable[, clusterPercentage := .N / nrow(clusteredTable), by = cluster]
193+
194+
if(!is.null(allPopulationTable)){
195+
allPopulationTable[, clusterSize := nrow(allPopulationTable)]
196+
allPopulationTable[, cluster := "Total Population"]
197+
allPopulationTable[, clusterPercentage := 1]
198+
}
199+
unclusteredTable <- copy(clusteredTable)
200+
unclusteredTable[, clusterSize := nrow(unclusteredTable)]
201+
unclusteredTable[, cluster := clusteredTableName]
202+
unclusteredTable[, clusterPercentage := 1]
203+
204+
masterTable <- rbind(clusteredTable, unclusteredTable, allPopulationTable, fill=TRUE)
205+
206+
clustersSummary <- masterTable[, lapply(.SD, mean, na.rm=TRUE), by=.(cluster), .SDcols = c("clusterSize", "clusterPercentage", varsToShow)]
207+
setorder(clustersSummary, clusterSize)
208+
#Transpose to make the data more readable
209+
varNames <- colnames(clustersSummary)
210+
clustersSummary <- transpose(clustersSummary)
211+
clustersSummary <- cbind(data.table(Cluster = varNames), clustersSummary)
212+
clustersSummary[, sapply(.SD, first)]
213+
setnames(clustersSummary, sapply(clustersSummary, first))
214+
clustersSummary <- clustersSummary[-1]
215+
clustersSummary[, (colnames(clustersSummary)[-1]) := lapply(.SD, as.numeric), .SDcols = colnames(clustersSummary)[-1]]
216+
if(length(clusteringVarnames) != 0){
217+
clustersSummary[, usedForClustering := ifelse(varNames[-1] %in% clusteringVarnames, "Yes", "No")]
218+
}
219+
220+
saveClustersSummary(clustersSummary = clustersSummary,
221+
file = file,
222+
template = template)
223+
224+
toc(quiet = !verbose)
225+
226+
return(clustersSummary)
227+
228+
}
229+
230+
#' Saves the clusters summary report to an Excel file based on a template
231+
#'
232+
#' @param clustersSummary Clusters report (output from summarizeClusters)
233+
#' @param file Name of the file where the report will be saved (must be an xlsx file)
234+
#' @param template Excel template where the format and coloring of the output file is saved
235+
#'
236+
#' @return Nothing
237+
#' @export
238+
#'
239+
#' @examples
240+
saveClustersSummary <- function(clustersSummary,
241+
file = NULL,
242+
template = NULL){
243+
244+
245+
if (!is.null(file)) {
246+
if(!is.null(template)){
247+
if(!file.exists(template)){
248+
warning("The provided template does not exist, running without template")
249+
wb <- XLConnect::loadWorkbook(file, create=TRUE)
250+
}else{
251+
wb <- XLConnect::loadWorkbook(template, create=TRUE)
252+
}
253+
}else{
254+
wb <- XLConnect::loadWorkbook(file, create=TRUE)
255+
}
256+
XLConnect::createSheet(wb,name="clustersSummary")
257+
258+
setStyleAction(wb,XLC$STYLE_ACTION.NONE)
259+
260+
XLConnect::writeWorksheet(wb,clustersSummary,sheet="clustersSummary", startRow=1, startCol=1)
261+
cs <- createCellStyle(wb)
262+
for(row in seq_len(nrow(clustersSummary))){
263+
if(all(sapply(clustersSummary[row, -c("cluster", "usedForClustering"), with = FALSE], function(x){x %between% c(0,1)}))){
264+
XLConnect::setCellStyle(wb, "clustersSummary!B" %+% (row+1) %+% ":ZZ" %+% (row+1), cellstyle = cs)
265+
}
266+
}
267+
268+
XLConnect::setBorder(cs, side = "all", type = XLC$BORDER.THIN, color = XLC$COLOR.BLACK)
269+
XLConnect::setDataFormat(cs, "0%")
270+
271+
XLConnect::saveWorkbook(wb, file=file)
272+
273+
}
274+
275+
return(invisible(NULL))
276+
}
277+
278+
279+
280+
281+
#---------------------------------------------------------------------------------------
282+
283+
`%gn%` <- function(x, y) {
284+
grep(y, ignore.case = T, x = names(x))
285+
}
286+
287+
#' Wrapper for grepping values. Is NOT case sensitive
288+
#'
289+
#' @param x String vector
290+
#' @param y Pattern
291+
#'
292+
#' @return Elements of string that fit the pattern
293+
#' @export
294+
#'
295+
#' @examples
296+
#'
297+
#' c("hola", "adios", "cocacola") %gv% "ola"
298+
#'
299+
`%gv%` <- function(x, y) {
300+
grep(y, ignore.case = T, x = x, value = T)
301+
}
302+
303+
`%g%` <- function(x, y) {
304+
grep(y, ignore.case = T, x = x)
305+
}
306+
307+
#' Wrapper for paste0. Easy, simple and fast way to concatenate two strings.
308+
#'
309+
#' @param x
310+
#' @param y
311+
#'
312+
#' @return
313+
#' @export
314+
#'
315+
#' @examples
316+
`%+%` <- function(x, y) {
317+
paste0(x,y)
318+
}
319+
320+
#' Conditional cat
321+
#'
322+
#' @param condFlag If TRUE, message in printed, otherwise not
323+
#' @param ... Message
324+
#'
325+
#' @return
326+
#' @export
327+
#'
328+
#' @examples
329+
cond_cat <- function(condFlag = TRUE, ...){
330+
if(condFlag){
331+
cat(...)
332+
}
333+
}

0 commit comments

Comments
 (0)