Merge pull request #5103 from solgenomics/topic/solgs/interactive-cluster-plot

make cluster plots interactive
solgenomics · Sep 19, 2024 · 675f817 · 675f817
2 parents 06e5334 + 1969188
commit 675f817
Show file tree

Hide file tree

Showing 7 changed files with 445 additions and 269 deletions.
diff --git a/R/solGS/kclustering.r b/R/solGS/kclustering.r
@@ -37,6 +37,7 @@ kmeansPlotFile <- grep("k-means_plot", outputFiles, value = TRUE)
 kResultFile <- grep("result", outputFiles, value = TRUE)
 elbowPlotFile <- grep("elbow_plot", outputFiles, value = TRUE)
 clusterMeansFile <- grep("k-means_means", outputFiles, value = TRUE)
+clusterPcScoresFile <- grep("k-means_pc_scores", outputFiles, value = TRUE)
 variancesFile <- grep("k-means_variances", outputFiles, value = TRUE)
 reportFile <- grep("report", outputFiles, value = TRUE)
 errorFile <- grep("error", outputFiles, value = TRUE)
@@ -115,36 +116,7 @@ clusterDataNotScaled <- c()
 
 if (grepl("genotype", dataType, ignore.case = TRUE)) {
     clusterData <- extractGenotype(inputFiles)
-
-    pca <- prcomp(clusterData, retx = TRUE)
-    pca <- summary(pca)
-
-    variances <- data.frame(pca$importance)
-
-    varProp <- variances[3, ]
-    varProp <- data.frame(t(varProp))
-    names(varProp) <- c("cumVar")
-
-    selectPcs <- varProp %>%
-        filter(cumVar <= 0.9)
-    pcsCnt <- nrow(selectPcs)
-
-    reportNotes <- paste0("Before clustering this dataset, principal component analysis (PCA) was perforemd on it to reduce the number of variables (dimensions). ")
-    reportNotes <- paste0(reportNotes, "Based on the PCA, ", pcsCnt, " PCs were used to do the clustering. ")
-    reportNotes <- paste0(reportNotes, "\n\nThe ", pcsCnt, " PCs explain 90% of the variance in the original dataset.",
-        "\n")
-
-    scores <- data.frame(pca$x)
-    scores <- scores[, 1:pcsCnt]
-    scores <- round(scores, 3)
-
-    variances <- variances[2, 1:pcsCnt]
-    variances <- round(variances, 4) * 100
-    variances <- data.frame(t(variances))
-
-    clusterData <- scores
 } else {
-
     if (grepl("gebv", dataType, ignore.case = TRUE)) {
         gebvsFile <- grep("combined_gebvs", inputFiles, value = TRUE)
         gebvsData <- data.frame(fread(gebvsFile, header = TRUE))
@@ -254,6 +226,29 @@ if (!grepl('genotype', kResultFile)) {
 
 }
 
+pca <- c()
+if (is.null(selectedIndexGenotypes)) {
+    pca    <- prcomp(clusterData, scale=TRUE, retx=TRUE)
+} else {
+    pca    <- prcomp(clusterData, retx=TRUE)
+}
+
+pca    <- summary(pca)
+scores   <- data.frame(pca$x)
+scores   <- scores[, 1:2]
+scores   <- round(scores, 3)
+
+clusterPcScoresGroups <- c()
+if (length(clusterPcScoresFile)) {
+    message('adding cluster groups to pc scores...')
+    scores <- rownames_to_column(scores)
+    names(scores)[1] <- c("germplasmName")
+
+    clusterPcScoresGroups <- inner_join(kClusters, scores, by = "germplasmName")
+    clusterPcScoresGroups <- clusterPcScoresGroups %>% 
+        arrange(Cluster)
+}
+
 cat(reportNotes, file = reportFile, sep = "\n", append = TRUE)
 
 variances <- paste0("Variances output: ")
@@ -286,6 +281,11 @@ if (length(clusterMeansFile) && !is.null(clusterMeans)) {
         quote = FALSE, )
 }
 
+if (length(clusterPcScoresFile) && !is.null(clusterPcScoresGroups)) {
+    fwrite(clusterPcScoresGroups, file = clusterPcScoresFile, sep = "\t", row.names = FALSE,
+        quote = FALSE, )
+}
+
 message("Done clustering.")