Release 2.6.1

Release of version 2.6.1
FabbriniMarco · Jul 24, 2023 · e834686 · e834686
1 parent 6ac72d7
commit e834686
Show file tree

Hide file tree

Showing 5 changed files with 269 additions and 2 deletions.
diff --git a/ESEMPIO PLATE.xlsx b/ESEMPIO PLATE.xlsx
diff --git a/Generate-MiSeq-Manifest.R b/Generate-MiSeq-Manifest.R
@@ -0,0 +1,153 @@
+############### script for generating input manifest for MiSeq sequencing machine ##################
+# Marco Fabbrini
+# VERSION:2.6
+# RELEASE:03/Mar/2023
+
+# Positional command-line arguments (see README):
+#   1. index database (TSV with columns Index_Name and Sequence)
+#   2. header section template (TSV)
+#   3. project name
+#   4. project date
+#   5. number of plates (sheets) to read from the XLSX file
+#   6. XLSX file with one plate per sheet
+args <- commandArgs(TRUE)
+
+# Fail early with a usage line instead of a cryptic read error on a missing argument
+if (length(args) < 6) {
+  stop("Expected 6 arguments. Usage: Rscript Generate-MiSeq-Manifest.R index_database.tsv header.tsv Project_name YYYY-MM-DD n_plates plates.xlsx", call. = FALSE)
+}
+
+
+#----------LOAD FILES AND PARAMETERS------
+if (!requireNamespace("readxl", quietly = TRUE))
+  install.packages("readxl", repos = "https://cran.rstudio.com/")
+library(readxl)
+
+# Debugging purposes:
+# pDataFile1 <- "index_database.tsv"
+pDataFile1 <- args[1]
+index_database <- read.delim(pDataFile1, header = TRUE, sep = "\t")
+
+# Debugging purposes:
+# pDataFile2 <-  "header.tsv"
+pDataFile2 <- args[2]
+# Read the header template, turn every missing value into an empty string
+# and keep the first 7 columns (the width of the [Data] section)
+header_table <- read.delim(pDataFile2, header = FALSE, sep = "\t")[, 1:8]
+header_table$V2[header_table$V2 == ""] <- NA
+header_table[is.na(header_table)] <- ""
+header_table <- header_table[, 1:7]
+
+# Debugging purposes:
+# project_name = "test"
+# project_date = Sys.Date()+1
+project_name <- args[3]
+project_date <- args[4]
+
+# Rows 2 and 3 of the header template hold the experiment name and the date
+header_table$V2[2] <- project_name
+header_table$V2[3] <- project_date
+
+# commandArgs() returns strings: convert the plate count explicitly and
+# reject anything that is not a positive integer
+n_plates <- suppressWarnings(as.integer(args[5]))
+if (is.na(n_plates) || n_plates < 1) {
+  stop("The 5th argument (number of plates) must be a positive integer, got '", args[5], "'", call. = FALSE)
+}
+
+pDataFile3 <- args[6]
+
+
+# SCRIPT ------------------------------------------------------------------
+
+# Read every plate sheet (first 8 rows x 13 columns) into plate1, plate2, ...
+# and collect the name of the first column of each sheet as its index set.
+index_set <- c()
+for ( plate_number in 1:n_plates ) {
+  # suppressMessages() silences the messages emitted while reading the sheet
+  sheet <- suppressMessages(as.data.frame(read_xlsx(path = pDataFile3, sheet = plate_number)[1:8, 1:13]))
+  assign(paste0("plate", plate_number), sheet)
+  index_set <- c(index_set, colnames(sheet)[1])
+}
+
+# Plate number -> index set lookup table
+index_correspondence <- data.frame(plate = 1:n_plates, index_set = index_set)
+
+# Column titles of the data section; also written as a row of the final sheet
+data_section_header <- c("Sample_ID", "Description", "I7_Index_ID", "index", "I5_Index_ID", "index2", "Sample_Project")
+
+# Data section starts as a single all-NA row with the 7 expected columns
+data_section <- data.frame(matrix(ncol = 7))
+colnames(data_section) <- data_section_header
+
+
+# Check for unknown headers -----------------------------------------------
+# Every column header (columns 2 to 13) must be a known index name or contain
+# "empty"; every row label (column 1) must be a known index name or be exactly
+# "empty". The script stops at the first plate that violates either rule.
+for ( n in 1:n_plates ) {
+  plate <- get(paste0("plate", n))
+
+  column_indexes <- colnames(plate)[2:13]
+  bad_columns <- column_indexes[!(column_indexes %in% index_database$Index_Name | grepl("empty", column_indexes))]
+  if ( length(bad_columns) > 0 ) {
+    print(paste("No sample sheet has been produced. Check your EXCEL plate table first. Aligator."))
+    # Collapse all offending names into one message so several invalid
+    # headers are reported as a single readable line
+    problem <- paste0("Invalid Index name found in columns of plate N.", n, " --- Index '",
+                      paste(bad_columns, collapse = "', '"),
+                      "' not found in index_database.tsv")
+    print(problem)
+    stop(problem, call. = FALSE)
+  }
+
+  row_indexes <- plate[, 1]
+  bad_rows <- row_indexes[!(row_indexes %in% c(index_database$Index_Name, "empty"))]
+  if ( length(bad_rows) > 0 ) {
+    print(paste("No sample sheet has been produced. Check your EXCEL plate table first. Aligator."))
+    problem <- paste0("Invalid Index name found in rows of plate N. ", n, " --- Index '",
+                      paste(bad_rows, collapse = "', '"),
+                      "' not found")
+    print(problem)
+    stop(problem, call. = FALSE)
+  }
+}
+
+
+# Generate the manifest ---------------------------------------------------
+# Walk every plate cell by cell and append one data_section row per sample.
+# Rows whose index is "empty", columns whose header contains "empty" and
+# cells whose value is "empty" are skipped.
+# NOTE(review): the I7_Index_ID / I5_Index_ID columns receive the index
+# sequence while index / index2 receive the "<set>-<index name>" label;
+# Illumina sample sheets usually carry the name in *_Index_ID and the bases
+# in index/index2 - confirm this layout is intended before changing it.
+insert <- 1
+for ( n in 1:n_plates ) {
+  plate <- get(paste0("plate", n))
+  plate_set <- index_correspondence[index_correspondence$plate == n, "index_set"]
+  index_column <- colnames(plate)[2:ncol(plate)]
+  index_row <- plate[, 1]
+  for ( row_i in seq_along(index_row) ) {
+    row_index <- index_row[row_i]
+    if ( row_index == "empty" ) next # If the row Index is "empty" skip all the row
+    for ( col_i in 2:(length(index_column) + 1) ) {
+      column_index <- colnames(plate)[col_i]
+      if ( grepl("empty", column_index) ) next # If the column Index is "empty" skip all the column
+      cell <- plate[row_i, col_i]
+      if ( is.na(cell) ) {
+        # A blank cell would otherwise abort with "missing value where
+        # TRUE/FALSE needed"; report where it is instead
+        print(paste("No sample sheet has been produced. Check your EXCEL plate table first. Aligator."))
+        stop("Blank cell found in plate N.", n, ", EXCEL coordinates ", LETTERS[col_i], row_i + 1,
+             ". Unused cells must be filled with the term 'empty'", call. = FALSE)
+      }
+      if ( cell == "empty" ) next # If the cell value is "empty" skip it
+
+      sample_id <- gsub(" ", "", cell)
+      data_section[insert, "Sample_ID"] <- sample_id
+      data_section[insert, "Description"] <- sample_id
+      data_section[insert, "index"] <- paste(plate_set, column_index, sep = "-")
+      data_section[insert, "I7_Index_ID"] <- index_database[index_database$Index_Name == column_index, "Sequence"]
+      data_section[insert, "index2"] <- paste(plate_set, row_index, sep = "-")
+      data_section[insert, "I5_Index_ID"] <- index_database[index_database$Index_Name == row_index, "Sequence"]
+      data_section[insert, "Sample_Project"] <- project_name
+      insert <- insert + 1
+    }
+  }
+}
+
+
+# Check for weird sample names --------------------------------------------
+# Advisory only: never blocks the generation of the sample sheet.
+# Spaces have already been stripped from Sample_ID, so the names are checked
+# against the rules printed below: no leading digit and nothing other than
+# letters, digits, underscore or hyphen.
+checked_ids <- data_section$Sample_ID[!is.na(data_section$Sample_ID)]
+suspicious_ids <- unique(checked_ids[grepl("^[0-9]", checked_ids) | grepl("[^A-Za-z0-9_-]", checked_ids)])
+if ( length(suspicious_ids) > 0 ) {
+  print(paste("I strongly suggest you to check your samples name"))
+  print(paste("Golden rule for assigning good sample's names:"))
+  print(paste("• Don't start any name with a number"))
+  print(paste("• Avoid using spaces, points or special characters such as * ' / + ° #, etc..."))
+  print(paste("• It's better to use names indicative of sample's characteristics (e.g., with timepoint and/or group, treatment or whatever)"))
+  print(paste0("Sample names to check: ", paste(suspicious_ids, collapse = ", ")))
+}
+
+
+# Check for duplicated samples --------------------------------------------
+# If any Sample_ID occurs more than once, report the Excel coordinates of
+# every occurrence and do not write a sample sheet. Otherwise stack the
+# header section, the column titles and the data section, and save the CSV.
+if ( any(duplicated(data_section$Sample_ID)) ) {
+  # unique(): a name present three or more times is reported only once per cell
+  duplicated_ids <- unique(data_section$Sample_ID[duplicated(data_section$Sample_ID)])
+  for ( n in 1:n_plates ) {
+    plate <- get(paste0("plate", n))
+    plate_set <- index_correspondence$index_set[index_correspondence$plate == n]
+    skipped_rows <- plate[, 1] == "empty"
+    for ( dup_id in duplicated_ids ) {
+      for ( col_i in 2:ncol(plate) ) {
+        # Columns marked as empty never contribute samples to the manifest
+        if ( grepl("empty", colnames(plate)[col_i]) ) next
+        # Compare exactly (not as a regex) against the space-stripped cell
+        # text, i.e. the same transformation used to build Sample_ID
+        cells <- gsub(" ", "", plate[, col_i])
+        for ( row_i in which(cells == dup_id & !skipped_rows) ) {
+          # +1 because the first Excel row holds the column headers
+          print(paste0("Duplicate ", dup_id, " found in plate N.", n, ", index set ", plate_set, ", EXCEL coordinates ", LETTERS[col_i], row_i + 1))
+        }
+      }
+    }
+  }
+
+  print(paste("No sample sheet has been produced. Check your EXCEL plate table first. Aligator."))
+} else {
+  # rbind() needs matching column names across the three pieces
+  colnames(data_section) <- colnames(header_table)
+  final_format <- rbind(header_table, data_section_header, data_section)
+  # If everything is fine, save the manifest to a CSV file, ready for the sequencer
+  write.table(final_format, paste0(project_name, "_SampleSheet_", project_date, ".csv"), sep = ",", col.names = FALSE, row.names = FALSE, quote = FALSE)
+  print(paste("Manifest generated succesfully"))
+}
+
+
+######################################-
diff --git a/README.md b/README.md
@@ -1,2 +1,61 @@
-# Generate-MiSeq-Manifest
-Generate a .CSV manifest file for importing multiple samples in the Illumina MiSeq machine
+###############------ Help section: script for generating input manifest for MiSeq sequencing machine------##################
+# Marco Fabbrini
+# VERSION:2.6.1
+# RELEASE:24/Jul/2023
+
+# Requirements:
+# # The readxl package is required ( install.packages("readxl") in a R session)
+# # The Rscript command-line executable is required (on macOS or Linux platforms); on Windows, use the Rscript.exe executable
+
+###### ---------
+###### Tutorial for Linux and MacOS:
+# Open your Terminal app and head to the folder containing this script and the plates for the run, using the "cd" command, for example:
+cd /Users/myusername/Downloads/Generate_MiSeq_CSV-2.6.1
+# If you don't know your PATH you can also type "cd " and drag-and-drop the folder from the Finder/FileExplorer inside the terminal.
+
+# Paste this in your Terminal, changing the parameters accordingly
+Rscript Generate-MiSeq-Manifest.R index_database.tsv header.tsv Project_name 2023-05-18 4 ESEMPIO_PLATE.xlsx
+
+## You need to change "Project_name" to the name you want to give to the run
+## You need to change the date "2023-05-18" to the date on which the run WILL BE conducted
+## You need to change the number "4" - the one between the date and the excel file - according to the number of plates in the excel file
+## You need to replace the excel file with your own plate file, filled in according to the samples and plates you have prepared, sticking to the template
+
+
+###### ---------
+###### Tutorial for Windows 10 and Windows 11:
+# Open your Terminal app or Windows PowerShell by searching for "Terminal" or "Powershell" in your Start menu
+# If you have problems finding your terminal, you can press the keys Win+R and a "Run" window will pop up in the bottom-left corner of the screen. Type "powershell" and hit Enter.
+# Now head to the folder containing the script and the plates for the run, using the "cd" command. Remember to use single quotes for the path. For example:
+cd 'C:\Users\myusername\Downloads\Generate_MiSeq_CSV-2.6.1\'
+
+# If you don't know your PATH you can also type "cd " and drag-and-drop the folder from the File Explorer inside the terminal.
+
+# Then, we need to locate your Rscript.exe executable. You can open your File Explorer and head to "This PC", then select "OS C:", then "Program Files" (EN) or "Programmi" (IT) and open the "R" folder. Check which one is the latest version (the highest number) and edit the line of code below (R-4.3.1) in order to execute the appropriate version of R.
+# Remember to start the command below with the ampersand (&)! Note the use of double quotes for the path to the Rscript.exe executable.
+# Note to the IT users: even if your folder is named "Programmi", you still need to type "Program Files" in the path below. Just adjust the R-X.X.X version in the path
+
+& "C:\Program Files\R\R-4.3.1\bin\Rscript.exe" Generate-MiSeq-Manifest.R index_database.tsv header.tsv Project_name 2023-05-18 4 ESEMPIO_PLATE.xlsx
+
+## You need to change "Project_name" to the name you want to give to the run
+## You need to change the date "2023-05-18" to the date on which the run WILL BE conducted
+## You need to change the number "4" - the one between the date and the excel file - according to the number of plates in the excel file
+## You need to replace the excel file with your own plate file, filled in according to the samples and plates you have prepared, sticking to the template
+
+
+
+# Additional details:
+
+# 1st argument contains the index database supplied
+# 2nd argument contains the header section supplied
+# 3rd argument specifies the project name. MUST avoid using spaces or special characters (e.g., wildcards . * )
+# 4th argument specifies the project's date. MUST use the date of the sequencing run in format YYYY-MM-DD
+# 5th argument contains the number of plates present in the XLSX file
+# 6th argument points to the XLSX file for the plates built as such:
+# # Each separate plate is in a separate sheet
+# # The plate must start from the first cell in the upper left corner of the Excel sheet
+# # The upper left cell must contain a single letter pointing to the adapter set (A, B, C, D). No spaces, no merged cells.
+# # Column names and row names MUST contain index codes (N- in the columns, S- in the rows) OR 'empty'
+# # The inner part of the table must contain the sample names. Avoid names starting with numbers or containing special characters
+# # In case of an empty cell you MUST fill the cell with the term: 'empty'
+# # An example of a plate file is supplied
diff --git a/header.tsv b/header.tsv
@@ -0,0 +1,14 @@
+[Header]												
+Experiment Name	Intert_Name						
+Date	Insert_Date						
+Module	GenerateFASTQ - 3.0.1						
+Workflow	GenerateFASTQ						
+Library Prep Kit	Nextera XT						
+Index Kit	Nextera XT v2 Index Kit Sets A B C D						
+Chemistry	Amplicon												
+[Reads]							
+251							
+251													
+[Settings]							
+adapter	CTGTCTCTTATACACATCT												
+[Data]							
diff --git a/index_database.tsv b/index_database.tsv
@@ -0,0 +1,41 @@
+Index_Name	Sequence
+S502	CTCTCTAT
+S503	TATCCTCT
+S505	GTAAGGAG
+S506	ACTGCATA
+S507	AAGGAGTA
+S508	CTAAGCCT
+S510	CGTCTAAT
+S511	TCTCTCCG
+S513	TCGACTAG
+S515	TTCTAGCT
+S516	CCTAGAGT
+S517	GCGTAAGA
+S518	CTATTAAG
+S520	AAGGCTAT
+S521	GAGCCTTA
+S522	TTATGCGA
+N701	TAAGGCGA
+N702	CGTACTAG
+N703	AGGCAGAA
+N704	TCCTGAGC
+N705	GGACTCCT
+N706	TAGGCATG
+N707	CTCTCTAC
+N710	CGAGGCTG
+N711	AAGAGGCA
+N712	GTAGAGGA
+N714	GCTCATGA
+N715	ATCTCAGG
+N716	ACTCGCTA
+N718	GGAGCTAC
+N719	GCGTAGTA
+N720	CGGAGCCT
+N721	TACGCTGC
+N722	ATGCGCAG
+N723	TAGCGCTC
+N724	ACTGAGCG
+N726	CCTAAGAC
+N727	CGATCAGT
+N728	TGCAGCTA
+N729	TCGACGTC