# Regression Analysis

## Multiple Regression Analysis 2024
## Max Gebhart

library(dplyr)
library(tidyr)
library(ggpubr)
library(rstatix)
library(lme4)
library(lmerTest)
library(MuMIn)


# Example from https://bioinformatics-core-shared-training.github.io/linear-models-r/multiple_regression.html#Section_1:_Multiple_Regression
# Fit and inspect a simple linear model on the built-in `trees` data set.
# Everything sits inside one brace group, where only the value of the last
# expression is auto-printed, so the model summary is printed explicitly.
{
  plot(trees) # pairwise scatterplots of the columns of `trees`
  m1 <- lm(Volume ~ Girth, data = trees)
  print(summary(m1))
  plot(m1) # diagnostic plots for the fitted model
}


## My data will be looking at biomass variables compared to air temp, water depth, and year
## I'll integrate water temperature for the sites that have completed water temp datasets
# First load data
# Point R at the project folder that holds the CSV exports. The read.csv()
# calls below (and the write.csv() calls later in this script) use bare file
# names, so they all resolve relative to this directory.
# NOTE(review): this is a machine-specific path -- adjust before running elsewhere.
setwd("~/Work/Writing/Manuscripts/2023/FloweringRush")
# presumably Al holds all sites combined and MM / TON / UI one site each -- TODO confirm
Al <- read.csv("All Frush Data.csv")
MM <- read.csv("MM FRush Data.csv")
TON <- read.csv("TON FRush Data.csv")
UI <- read.csv("UI FRush data.csv")


## The data have to be coerced to numeric because decimal points (".") were used as placeholders for blank cells
## Double-check afterwards that every column is still numeric
# Convert every column of a data frame to numeric while keeping its shape and
# column names. Values that are not valid numbers (for example non-numeric
# blank placeholders) become NA, and R emits its usual "NAs introduced by
# coercion" warning.
#
# `df[] <- lapply(...)` is used rather than data.frame(sapply(...)) because
# sapply() simplifies a one-row input to a plain vector, which data.frame()
# would then turn into a single column.
#
# NOTE(review): any column holding text labels is turned entirely into NA by
# this step -- confirm that this is intended for every column.
to_numeric_columns <- function(df) {
  df[] <- lapply(df, function(column) as.numeric(as.character(column)))
  df
}

Al <- to_numeric_columns(Al)
MM <- to_numeric_columns(MM)
TON <- to_numeric_columns(TON)
UI <- to_numeric_columns(UI)

# This enclosed section is for the regression analyses
## these analyses were done comparing plant metrics to the environmental parameters
{


# This is a test regression for organizing how to grab all the necessary values
###### 
# Trial fit on the MM data: A as a function of WD.
test <- lm(A ~ WD, data = MM)
# Printed explicitly: this statement sits inside the enclosing brace group,
# where intermediate values are not auto-printed.
print(summary(test))
#####

### Ok so we have the basics of what we need for this so now we want to make a function that can slam through all the regressions
### This loop will essentially create a data frame that will harbor all of the end results from the analyses
### I suggest after you have all the analyses done that you randomly select a few and double check the values

# Attach Al so that its columns can be referred to directly by name in the
# model calls that follow.
# NOTE(review): attach() puts a copy of the columns on the search path; a
# global variable sharing a column's name takes precedence over that copy, and
# later changes to Al are not reflected in it.

attach(Al)

## Linear Regressions
{
  # Simple linear regressions: every plant metric (one output row each) is
  # regressed separately on AT, WT and WD. For each fit the table stores the
  # p-value from the first row of the ANOVA table, the fitted line, and
  # R-squared, in the column triplets ATP/ATeq/ATR, WTP/WTeq/WTR, WDP/WDeq/WDR.
  lr_responses <- c(
    "A", "BG", "FL", "RB", "Total", "BD", "Ram", "AS", "BS", "FS", "RBS", "PH"
  )
  lr_predictors <- c("AT", "WT", "WD")
  stopifnot(all(c(lr_responses, lr_predictors) %in% colnames(Al)))

  # Format a fitted line as "<slope>x + <intercept>" (or "- <|intercept|>"),
  # with both numbers rounded to 4 significant digits.
  lr_equation <- function(slope, intercept) {
    slope <- signif(slope, 4)
    intercept <- signif(intercept, 4)
    operator <- if (!is.na(intercept) && intercept < 0) "-" else "+"
    paste0(slope, "x ", operator, " ", abs(intercept))
  }

  LRoutput <- data.frame(matrix(
    nrow = length(lr_responses), ncol = 9,
    dimnames = list(
      lr_responses,
      c("ATP", "ATeq", "ATR", "WTP", "WTeq", "WTR", "WDP", "WDeq", "WDR")
    )
  ))

  for (i in seq_along(lr_responses)) {
    for (j in seq_along(lr_predictors)) {
      predictor <- lr_predictors[j]
      # Columns are taken from Al explicitly, so the fit does not depend on
      # attach() or on same-named objects in the workspace.
      lr_formula <- reformulate(predictor, response = lr_responses[i])
      fit <- lm(lr_formula, data = Al)
      estimates <- coef(fit)
      first_col <- 3 * (j - 1) + 1 # start of this predictor's column triplet

      LRoutput[i, first_col] <- signif(anova(fit)[["Pr(>F)"]][1], 4)
      LRoutput[i, first_col + 1] <- lr_equation(
        estimates[[predictor]], estimates[["(Intercept)"]]
      )
      LRoutput[i, first_col + 2] <- signif(summary(fit)$r.squared, 4)
    }
  }

  print(LRoutput)
  write.csv(LRoutput, "Linear Regression Results.csv")
}

# Multiple Linear Regression
{
  # Linear mixed models fitted to Al with a random intercept for Site.
  #   1. Full models (response ~ AT + WT + WD + (1|Site)): the fixed-effect
  #      coefficient tables are collected in `outlist` and `outmat`.
  #   2. Single-predictor models (response ~ predictor + (1|Site)): p-value,
  #      fitted line and R-squared are collected in `MLRoutput`.
  # The coefficient tables are expected to carry a "Pr(>|t|)" column, i.e.
  # lmer() must be the lmerTest version.
  mlr_responses <- c(
    "A", "BG", "FL", "RB", "Total", "BD", "Ram", "AS", "BS", "FS", "RBS", "PH"
  )
  mlr_predictors <- c("AT", "WT", "WD")
  stopifnot(all(c(mlr_responses, mlr_predictors, "Site") %in% colnames(Al)))

  # Format a fitted line as "<slope>x + <intercept>" (or "- <|intercept|>"),
  # with both numbers rounded to 4 significant digits.
  mlr_equation <- function(slope, intercept) {
    slope <- signif(slope, 4)
    intercept <- signif(intercept, 4)
    operator <- if (!is.na(intercept) && intercept < 0) "-" else "+"
    paste0(slope, "x ", operator, " ", abs(intercept))
  }

  ## Full models: extract the fixed-effect coefficient tables for reporting
  {
    outlist <- setNames(vector("list", length(mlr_responses)), mlr_responses)
    for (response in mlr_responses) {
      full_formula <- as.formula(paste(response, "~ AT + WT + WD + (1|Site)"))
      full_fit <- lmer(full_formula, data = Al)
      outlist[[response]] <- summary(full_fit)$coefficients
    }
    # The list is named by response, so each block of columns in outmat is
    # prefixed with its response (e.g. "A.Estimate") and can be traced back
    # to its model.
    outmat <- as.data.frame(outlist)
  }

  ## Single-predictor models
  {
    MLRoutput <- data.frame(matrix(
      nrow = length(mlr_responses), ncol = 9,
      dimnames = list(
        mlr_responses,
        c("ATP", "ATeq", "ATR", "WTP", "WTeq", "WTR", "WDP", "WDeq", "WDR")
      )
    ))

    for (i in seq_along(mlr_responses)) {
      for (j in seq_along(mlr_predictors)) {
        predictor <- mlr_predictors[j]
        single_formula <- as.formula(
          paste(mlr_responses[i], "~", predictor, "+ (1|Site)")
        )
        fit <- lmer(single_formula, data = Al)
        fixed <- summary(fit)$coefficients
        first_col <- 3 * (j - 1) + 1 # start of this predictor's column triplet

        # Rows and columns are addressed by name rather than by position.
        MLRoutput[i, first_col] <- signif(fixed[predictor, "Pr(>|t|)"], 3)
        MLRoutput[i, first_col + 1] <- mlr_equation(
          fixed[predictor, "Estimate"], fixed["(Intercept)", "Estimate"]
        )
        # The first column returned by r.squaredGLMM() is stored
        # (the marginal R-squared, per the MuMIn documentation).
        MLRoutput[i, first_col + 2] <- signif(r.squaredGLMM(fit)[, 1], 3)
      }
    }

    write.csv(MLRoutput, "MLR Output.csv")
    # Written under a distinct name: a file name differing from the one above
    # only by letter case would overwrite it on a case-insensitive filesystem.
    write.csv(outmat, "MLR Coefficients.csv")
  }
}


## Polynomial Regressions
{
  # Quadratic regressions: every plant metric (one output row each) is fitted
  # as response ~ x + x^2 separately for x = AT, WT and WD. For each fit the
  # table stores the p-value of the overall F-test, the fitted curve, and
  # R-squared, in the column triplets ATP/ATeq/ATR, WTP/WTeq/WTR, WDP/WDeq/WDR.
  pr_responses <- c(
    "A", "BG", "FL", "RB", "Total", "BD", "Ram", "AS", "BS", "FS", "RBS", "PH"
  )
  pr_predictors <- c("AT", "WT", "WD")
  stopifnot(all(c(pr_responses, pr_predictors) %in% colnames(Al)))

  # Format a fitted curve as "<a>x^2 + <b>x + <c>", using "-" and the absolute
  # value for negative terms; numbers are rounded to 4 significant digits.
  pr_equation <- function(quadratic, linear, intercept) {
    signed_term <- function(value, suffix) {
      value <- signif(value, 4)
      operator <- if (!is.na(value) && value < 0) "-" else "+"
      paste0(operator, " ", abs(value), suffix)
    }
    paste(
      paste0(signif(quadratic, 4), "x^2"),
      signed_term(linear, "x"),
      signed_term(intercept, "")
    )
  }

  PRoutput <- data.frame(matrix(
    nrow = length(pr_responses), ncol = 9,
    dimnames = list(
      pr_responses,
      c("ATP", "ATeq", "ATR", "WTP", "WTeq", "WTR", "WDP", "WDeq", "WDR")
    )
  ))

  for (i in seq_along(pr_responses)) {
    for (j in seq_along(pr_predictors)) {
      predictor <- pr_predictors[j]
      squared_term <- paste0("I(", predictor, "^2)")
      # Columns are taken from Al explicitly, so the fit does not depend on
      # attach() or on same-named objects in the workspace.
      pr_formula <- as.formula(
        paste(pr_responses[i], "~", predictor, "+", squared_term)
      )
      fit <- lm(pr_formula, data = Al)
      fit_summary <- summary(fit)
      estimates <- coef(fit)
      # fstatistic holds the F value, numerator df and denominator df
      f_stat <- unname(fit_summary$fstatistic)
      first_col <- 3 * (j - 1) + 1 # start of this predictor's column triplet

      PRoutput[i, first_col] <- signif(
        pf(f_stat[1], f_stat[2], f_stat[3], lower.tail = FALSE), 4
      )
      PRoutput[i, first_col + 1] <- pr_equation(
        estimates[[squared_term]],
        estimates[[predictor]],
        estimates[["(Intercept)"]]
      )
      PRoutput[i, first_col + 2] <- signif(fit_summary$r.squared, 4)
    }
  }

  print(PRoutput)
  write.csv(PRoutput, "Polynomial Regression Results.csv")
}
}

# This enclosed section includes the Kruskal-Wallis and Dunn's pairwise comparison tests
## these tests were done on the rhizome bud and ramet densities between sites
{
  # this separates the plant metrics with their site, month, and year labels 
  ## each will be separate objects to make running and saving the statistics a bit easier
  {
    # Label columns shared by all three metric tables; the later formulas use
    # Site, Month and Year, so these are taken to be the first three columns
    # of Al -- TODO confirm against the CSV layout.
    descriptors <- Al[, 1:3]

    # Pair each metric (picked by column position) with the labels. The metric
    # column carries the same name as its table so that formulas such as
    # `rbd ~ Site` can be evaluated against it.
    rbd <- cbind(descriptors, rbd = Al[, 11])
    lrd <- cbind(descriptors, lrd = Al[, 12])
    ph <- cbind(descriptors, ph = Al[, 17])
  }
  
  ## Here is a simple boxplot procedure if you want an initial idea of your data
  {
    # Quick look at plant height by site. print() is used because this group
    # is nested inside a larger brace group, where a plot object is not drawn
    # automatically.
    print(ggboxplot(
      ph,
      x = "Site", y = "ph",
      color = "Site", palette = c("#00AFBB", "#E7B800", "#FC4E07"),
      order = c("MM", "TON", "UI"),
      ylab = "Plant Height", xlab = "Site"
    ))
  }
  
  #Rhizome Bud Density
  {
    # Kruskal-Wallis tests of rhizome bud density against each grouping factor
    site1 <- kruskal_test(data = rbd, formula = rbd ~ Site)
    month1 <- kruskal_test(data = rbd, formula = rbd ~ Month)
    year1 <- kruskal_test(data = rbd, formula = rbd ~ Year)

    # Dunn's pairwise comparisons (Bonferroni-adjusted) for the same factors
    sitepairs1 <- rstatix::dunn_test(
      data = rbd, formula = rbd ~ Site, p.adjust.method = "bonferroni"
    )
    monthpairs1 <- rstatix::dunn_test(
      data = rbd, formula = rbd ~ Month, p.adjust.method = "bonferroni"
    )
    yearpairs1 <- rstatix::dunn_test(
      data = rbd, formula = rbd ~ Year, p.adjust.method = "bonferroni"
    )

    # Stack the three Kruskal-Wallis results and tag each row with its factor
    grouping <- c("Site", "Month", "Year")
    rbdkrus <- cbind(rbind(site1, month1, year1), grouping)

    # Stack the pairwise comparisons in the same Site / Month / Year order
    rbddunns <- rbind(sitepairs1, monthpairs1, yearpairs1)
  }
  
  #Leaf Ramet Density
  {
    # Kruskal-Wallis tests of leaf ramet density against each grouping factor
    site2 <- kruskal_test(data = lrd, formula = lrd ~ Site)
    month2 <- kruskal_test(data = lrd, formula = lrd ~ Month)
    year2 <- kruskal_test(data = lrd, formula = lrd ~ Year)

    # Dunn's pairwise comparisons (Bonferroni-adjusted) for the same factors
    sitepairs2 <- rstatix::dunn_test(
      data = lrd, formula = lrd ~ Site, p.adjust.method = "bonferroni"
    )
    monthpairs2 <- rstatix::dunn_test(
      data = lrd, formula = lrd ~ Month, p.adjust.method = "bonferroni"
    )
    yearpairs2 <- rstatix::dunn_test(
      data = lrd, formula = lrd ~ Year, p.adjust.method = "bonferroni"
    )

    # Stack the three Kruskal-Wallis results and tag each row with its factor
    grouping <- c("Site", "Month", "Year")
    lrdkrus <- cbind(rbind(site2, month2, year2), grouping)

    # Stack the pairwise comparisons in the same Site / Month / Year order
    lrddunns <- rbind(sitepairs2, monthpairs2, yearpairs2)
  }
  
  #Plant Height
  {
    # Kruskal-Wallis tests of plant height against each grouping factor
    site3 <- kruskal_test(data = ph, formula = ph ~ Site)
    month3 <- kruskal_test(data = ph, formula = ph ~ Month)
    year3 <- kruskal_test(data = ph, formula = ph ~ Year)

    # Dunn's pairwise comparisons (Bonferroni-adjusted) for the same factors
    sitepairs3 <- rstatix::dunn_test(
      data = ph, formula = ph ~ Site, p.adjust.method = "bonferroni"
    )
    monthpairs3 <- rstatix::dunn_test(
      data = ph, formula = ph ~ Month, p.adjust.method = "bonferroni"
    )
    yearpairs3 <- rstatix::dunn_test(
      data = ph, formula = ph ~ Year, p.adjust.method = "bonferroni"
    )

    # Stack the three Kruskal-Wallis results and tag each row with its factor
    grouping <- c("Site", "Month", "Year")
    phkrus <- cbind(rbind(site3, month3, year3), grouping)

    # Stack the pairwise comparisons in the same Site / Month / Year order
    phdunns <- rbind(sitepairs3, monthpairs3, yearpairs3)
  }
  
}


# Box plot of plant height by site, annotated with the significant pairwise
# site comparisons (non-significant ones hidden); the Kruskal-Wallis result
# goes in the subtitle and the pairwise-test description in the caption.
pwc <- add_xy_position(sitepairs3, x = "Site")
ggboxplot(data = ph, x = "Site", y = "ph") +
  stat_pvalue_manual(data = pwc, hide.ns = TRUE) +
  labs(
    subtitle = get_test_label(site3, detailed = TRUE),
    caption = get_pwc_label(pwc)
  )