Skip to content

Commit

Permalink
Merge pull request #5103 from solgenomics/topic/solgs/interactive-clu…
Browse files Browse the repository at this point in the history
…ster-plot

make cluster plots interactive
  • Loading branch information
lukasmueller authored Sep 19, 2024
2 parents 06e5334 + 1969188 commit 675f817
Show file tree
Hide file tree
Showing 7 changed files with 445 additions and 269 deletions.
58 changes: 29 additions & 29 deletions R/solGS/kclustering.r
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ kmeansPlotFile <- grep("k-means_plot", outputFiles, value = TRUE)
# Resolve each output file path by matching a name fragment against the
# `outputFiles` vector (defined earlier, outside this excerpt).
# With `value = TRUE`, grep() returns the matching elements themselves, or
# character(0) when nothing matches.
# NOTE(review): "result", "report" and "error" are short, unanchored patterns;
# presumably each matches exactly one entry of `outputFiles` -- confirm
# against the caller that builds the output file list.
kResultFile <- grep("result", outputFiles, value = TRUE)
elbowPlotFile <- grep("elbow_plot", outputFiles, value = TRUE)
clusterMeansFile <- grep("k-means_means", outputFiles, value = TRUE)
# File that receives the cluster-annotated PC scores.
clusterPcScoresFile <- grep("k-means_pc_scores", outputFiles, value = TRUE)
variancesFile <- grep("k-means_variances", outputFiles, value = TRUE)
reportFile <- grep("report", outputFiles, value = TRUE)
errorFile <- grep("error", outputFiles, value = TRUE)
Expand Down Expand Up @@ -115,36 +116,7 @@ clusterDataNotScaled <- c()

if (grepl("genotype", dataType, ignore.case = TRUE)) {
clusterData <- extractGenotype(inputFiles)

pca <- prcomp(clusterData, retx = TRUE)
pca <- summary(pca)

variances <- data.frame(pca$importance)

varProp <- variances[3, ]
varProp <- data.frame(t(varProp))
names(varProp) <- c("cumVar")

selectPcs <- varProp %>%
filter(cumVar <= 0.9)
pcsCnt <- nrow(selectPcs)

reportNotes <- paste0("Before clustering this dataset, principal component analysis (PCA) was perforemd on it to reduce the number of variables (dimensions). ")
reportNotes <- paste0(reportNotes, "Based on the PCA, ", pcsCnt, " PCs were used to do the clustering. ")
reportNotes <- paste0(reportNotes, "\n\nThe ", pcsCnt, " PCs explain 90% of the variance in the original dataset.",
"\n")

scores <- data.frame(pca$x)
scores <- scores[, 1:pcsCnt]
scores <- round(scores, 3)

variances <- variances[2, 1:pcsCnt]
variances <- round(variances, 4) * 100
variances <- data.frame(t(variances))

clusterData <- scores
} else {

if (grepl("gebv", dataType, ignore.case = TRUE)) {
gebvsFile <- grep("combined_gebvs", inputFiles, value = TRUE)
gebvsData <- data.frame(fread(gebvsFile, header = TRUE))
Expand Down Expand Up @@ -254,6 +226,29 @@ if (!grepl('genotype', kResultFile)) {

}

# Project the clustered data onto its leading principal components so the
# clusters can be drawn on a 2-D scatter plot.
# Variables are scaled to unit variance unless a selection index is in use
# (`selectedIndexGenotypes` is non-NULL), in which case the data is passed to
# prcomp() unscaled.
if (is.null(selectedIndexGenotypes)) {
  pca <- prcomp(clusterData, scale. = TRUE, retx = TRUE)
} else {
  pca <- prcomp(clusterData, retx = TRUE)
}

pca <- summary(pca)
scores <- data.frame(pca$x)

# Keep at most the first two PCs. A dataset with a single variable yields only
# PC1, and indexing `1:2` there would fail with "undefined columns selected".
# `drop = FALSE` keeps a data frame (and its row names) in the one-PC case.
pcsToKeep <- seq_len(min(2L, ncol(scores)))
scores <- scores[, pcsToKeep, drop = FALSE]
scores <- round(scores, 3)

# Stays NULL when no PC-scores output file was requested; the code that writes
# the file checks for that.
clusterPcScoresGroups <- NULL
if (length(clusterPcScoresFile) > 0) {
  message('adding cluster groups to pc scores...')

  # Move the row names into a `germplasmName` column so the scores can be
  # joined to the cluster assignments in `kClusters`.
  scores <- rownames_to_column(scores, var = "germplasmName")

  clusterPcScoresGroups <- inner_join(kClusters, scores, by = "germplasmName") %>%
    arrange(Cluster)
}

cat(reportNotes, file = reportFile, sep = "\n", append = TRUE)

variances <- paste0("Variances output: ")
Expand Down Expand Up @@ -286,6 +281,11 @@ if (length(clusterMeansFile) && !is.null(clusterMeans)) {
quote = FALSE, )
}

# Write the cluster-annotated PC scores as a tab-separated file, but only when
# an output path was supplied and the scores table was actually built.
if (length(clusterPcScoresFile) > 0 && !is.null(clusterPcScoresGroups)) {
  fwrite(
    clusterPcScoresGroups,
    file = clusterPcScoresFile,
    sep = "\t",
    row.names = FALSE,
    quote = FALSE
  )
}

message("Done clustering.")


Expand Down
Loading

0 comments on commit 675f817

Please sign in to comment.