Skip to content

Commit

Permalink
Release 2.6.1
Browse files Browse the repository at this point in the history
Release of version 2.6.1
  • Loading branch information
FabbriniMarco committed Jul 24, 2023
1 parent 6ac72d7 commit e834686
Show file tree
Hide file tree
Showing 5 changed files with 269 additions and 2 deletions.
Binary file added ESEMPIO PLATE.xlsx
Binary file not shown.
153 changes: 153 additions & 0 deletions Generate-MiSeq-Manifest.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
############### script for generating input manifest for MiSeq sequencing machine ##################
# Marco Fabbrini
# VERSION:2.6
# RELEASE:03/Mar/2023

args<-commandArgs(TRUE)


#----------LOAD FILES AND PARAMETERS------
if (!requireNamespace("readxl", quietly = TRUE))
install.packages("readxl", repos = "https://cran.rstudio.com/")
library(readxl)

# Expected positional command-line arguments:
#   1. index database (TSV)   2. header section (TSV)   3. project name
#   4. run date               5. number of plates       6. plates workbook (XLSX)
# Fail early with a readable message instead of a cryptic downstream error.
if (length(args) < 6) {
  stop("Expected 6 arguments (index_database.tsv, header.tsv, project name, ",
       "date, number of plates, plates XLSX file) but got ", length(args),
       call. = FALSE)
}

# Debugging purposes:
# pDataFile1 <- "index_database.tsv"
pDataFile1 <- args[1]
index_database <- read.delim(pDataFile1, header = TRUE, sep = "\t")

# Debugging purposes:
# pDataFile2 <- "header.tsv"
pDataFile2 <- args[2]
# Read the header template, blank out NA cells so they are emitted as empty
# fields, and keep the first 7 columns.
header_table <- read.delim(pDataFile2, header = FALSE, sep = "\t")[, c(1:8)]
header_table$V2[header_table$V2 == ""] <- NA
header_table[is.na(header_table)] <- ""
header_table <- header_table[, c(1:7)]

# Debugging purposes:
# project_name = "test"
# project_date = Sys.Date()+1
project_name <- args[3]
project_date <- args[4]

# Rows 2 and 3 of the header template receive the project name and the date
header_table$V2[2] <- project_name
header_table$V2[3] <- project_date

# The plate count arrives as text; convert it once so every later
# `1:n_plates` works on a real number and bad input is reported clearly.
n_plates <- suppressWarnings(as.integer(args[5]))
if (is.na(n_plates) || n_plates < 1L) {
  stop("The number of plates (5th argument) must be a positive integer, got '",
       args[5], "'", call. = FALSE)
}

pDataFile3 <- args[6]


# SCRIPT ------------------------------------------------------------------

# Load rows 1-8 / columns 1-13 of each sheet into the globals plate1,
# plate2, ... and collect the first column header of every sheet, which is
# used below as that plate's index set label.
index_set <- c()
for (sheet_no in 1:n_plates) {
  plate_var <- paste("plate", sheet_no, sep = "")
  sheet_tbl <- suppressMessages(read_xlsx(path = pDataFile3, sheet = sheet_no))
  assign(plate_var, as.data.frame(sheet_tbl[c(1:8), c(1:13)]))
  index_set <- c(index_set, colnames(get(plate_var))[1])
}

# Plate number -> index set label lookup table
index_correspondence <- data.frame(plate = 1:n_plates, index_set = index_set)

# Column titles of the data section, and a one-row, all-NA data frame with
# those columns that the manifest loop fills in row by row.
data_section_header <- c("Sample_ID", "Description", "I7_Index_ID", "index",
                         "I5_Index_ID", "index2", "Sample_Project")
data_section <- data.frame(matrix(ncol = 7))
colnames(data_section) <- data_section_header


# Check for unknown headers -----------------------------------------------
# Every column header (columns 2-13) and every row header (first column) of
# each plate must be an Index_Name from the index database, or "empty".
# The script stops at the first plate that violates this.
for (n in 1:n_plates) {
  plate_df <- get(paste("plate", n, sep = ""))

  # Column headers: grepl() accepts any header that contains "empty"
  col_headers <- colnames(plate_df)[2:13]
  bad_cols <- col_headers[!(col_headers %in% index_database$Index_Name |
                              grepl("empty", col_headers))]
  if (length(bad_cols) > 0) {
    print(paste("No sample sheet has been produced. Check your EXCEL plate table first. Aligator."))
    # Collapse all offending names into ONE message; several invalid names
    # previously produced several messages glued together by stop().
    error_text <- paste("Invalid Index name found in columns of plate N.", n, " --- Index \'",
                        paste(bad_cols, collapse = "', '"),
                        "\' not found in index_database.tsv", sep = "")
    print(error_text)
    stop(error_text, call. = FALSE)
  }

  # Row headers: must match exactly a database name or the word "empty"
  row_headers <- plate_df[, 1]
  bad_rows <- row_headers[!row_headers %in% c(index_database$Index_Name, "empty")]
  if (length(bad_rows) > 0) {
    print(paste("No sample sheet has been produced. Check your EXCEL plate table first. Aligator."))
    error_text <- paste("Invalid Index name found in rows of plate N. ", n, " --- Index \'",
                        paste(bad_rows, collapse = "', '"),
                        "\' not found", sep = "")
    print(error_text)
    stop(error_text, call. = FALSE)
  }
}


# Generate the manifest ---------------------------------------------------
# Walk every plate cell by cell and append one row to data_section for each
# cell that holds a sample. Rows whose row index is "empty", columns whose
# header contains "empty", and cells equal to "empty" are skipped.
# `insert` is the next free row of data_section.
insert <- 1
for (n in 1:n_plates) {
  plate_df <- get(paste("plate", n, sep = ""))
  plate_set <- index_correspondence[index_correspondence$plate == n, "index_set"]
  index_row <- plate_df[, 1]
  col_headers <- colnames(plate_df)

  for (row in seq_along(index_row)) {
    if (index_row[row] == "empty") next  # If the row Index is "empty" skip all the row

    for (col in 2:ncol(plate_df)) {
      if (grepl("empty", col_headers[col])) next  # If the column Index is "empty" skip all the column

      cell_value <- plate_df[row, col]
      # A blank Excel cell is read as NA and would otherwise abort with
      # "missing value where TRUE/FALSE needed"; report where it is instead.
      if (is.na(cell_value)) {
        print(paste("No sample sheet has been produced. Check your EXCEL plate table first. Aligator."))
        stop("Blank cell found in plate N.", n, ", EXCEL coordinates ",
             LETTERS[col], row + 1,
             ". Fill unused cells with the term 'empty'.", call. = FALSE)
      }
      if (cell_value == "empty") next  # If the cell value is "empty" skip it

      sample_name <- gsub(" ", "", cell_value)
      data_section[insert, "Sample_ID"] <- sample_name
      data_section[insert, "Description"] <- sample_name

      # NOTE(review): the "<set>-<index name>" label goes into index/index2
      # and the base sequence into I7_Index_ID/I5_Index_ID. Illumina sample
      # sheets normally carry the bases in index/index2 -- confirm that this
      # mapping is intended before changing it.
      data_section[insert, "index"] <- paste(plate_set, col_headers[col], sep = "-")
      data_section[insert, "I7_Index_ID"] <-
        index_database[index_database$Index_Name == col_headers[col], "Sequence"]
      data_section[insert, "index2"] <- paste(plate_set, index_row[row], sep = "-")
      data_section[insert, "I5_Index_ID"] <-
        index_database[index_database$Index_Name == index_row[row], "Sequence"]
      data_section[insert, "Sample_Project"] <- project_name
      insert <- insert + 1
    }
  }
}


# Check for weird sample names --------------------------------------------
# Print naming advice when any Sample_ID contains a space or a dot.
# NOTE(review): Sample_ID values have their spaces removed when they are
# assigned, so the space test probably never fires -- confirm.
has_space <- any(grepl(" ", data_section$Sample_ID))
has_dot <- any(grepl("\\.", data_section$Sample_ID))
if (has_space || has_dot) {
  naming_advice <- c(
    "I strongly suggest you to check your samples name",
    "Golden rule for assigning good sample's names:",
    "• Don't start any name with a number",
    "• Avoid using spaces, points or special characters such as * ' / + ° #, etc...",
    "• It's better to use names indicative of sample's characteristics (e.g., with timepoint and/or group, treatment or whatever)"
  )
  for (advice_line in naming_advice) {
    print(advice_line)
  }
}


# Check for duplicated samples --------------------------------------------
# If any Sample_ID occurs more than once, report the Excel coordinates of
# every occurrence and do NOT write a sample sheet. Otherwise stack the
# header section, the data column titles and the data rows and write the CSV.
if (any(duplicated(data_section$Sample_ID))) {
  # unique(): a name present three or more times is still listed only once
  qualeduplicato <- unique(data_section$Sample_ID[duplicated(data_section$Sample_ID)])
  for (n in 1:n_plates) {
    plate_df <- get(paste("plate", n, sep = ""))
    plate_set <- index_correspondence$index_set[index_correspondence$plate == n]
    # Only rows/columns that feed the manifest can hold a real duplicate
    used_rows <- plate_df[, 1] != "empty"
    for (dupiter in qualeduplicato) {
      for (col in 2:ncol(plate_df)) {
        if (grepl("empty", colnames(plate_df)[col])) next
        # Exact comparison on the same space-stripped form used for
        # Sample_ID. The previous regex search also matched substrings
        # (e.g. "S1" inside "S10") and then printed coordinates without a row.
        cell_names <- gsub(" ", "", plate_df[, col])
        colonna_temp <- LETTERS[col]
        for (hit in which(used_rows & cell_names == dupiter)) {
          riga_temp <- 1 + hit  # +1: the first Excel row holds the column headers
          print(paste("Duplicate ", dupiter, " found in plate N.", n, ", index set ", plate_set, ", EXCEL coordinates ", colonna_temp, riga_temp, sep = ""))
        }
      }
    }
  }

  print(paste("No sample sheet has been produced. Check your EXCEL plate table first. Aligator."))
} else {
  # rbind() needs matching column names, so borrow those of the header table
  colnames(data_section) <- colnames(header_table)
  final_format <- rbind(header_table, data_section_header, data_section)
  # If everything is fine, save the manifest to a CSV file, ready for the sequencer
  write.table(final_format, paste(project_name, "_SampleSheet_", project_date, ".csv", sep = ""), sep = ",", col.names = FALSE, row.names = FALSE, quote = FALSE)
  print(paste("Manifest generated succesfully"))
}


######################################-
63 changes: 61 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,61 @@
# Generate-MiSeq-Manifest
Generate a .CSV manifest file for importing multiple samples in the Illumina MiSeq machine
###############------ Help section: script for generating input manifest for MiSeq sequencing machine------##################
# Marco Fabbrini
# VERSION:2.6.1
# RELEASE:24/Jul/2023

# Requirements:
# # The readxl package is required ( install.packages("readxl") in a R session)
# # The Rscript command-line executable is required (on macOS or Linux platforms); on Windows, use the Rscript.exe executable

###### ---------
###### Tutorial for Linux and MacOS:
# Open your Terminal app and head to the folder containing this script and the plates for the run, using the "cd" command, for example:
cd /Users/myusername/Downloads/Generate_MiSeq_CSV-2.6.1
# If you don't know your PATH you can also type "cd " and drag-and-drop the folder from the Finder/FileExplorer inside the terminal.

# Paste this in your Terminal, changing the parameters accordingly
Rscript Generate-MiSeq-Manifest.R index_database.tsv header.tsv Project_name 2023-05-18 4 ESEMPIO_PLATE.xlsx

## You need to change the "Project_name" according to the name you want to give to the run
## You need to change the date "2023-05-18", setting it to the date on which the run WILL BE conducted
## You need to change the number "4" - the one between the date and the excel file - according to the number of plates in the excel file
## You need to change the excel file with the plates according to the samples and plates you have prepared, sticking to the template


###### ---------
###### Tutorial for Windows 10 and Windows 11:
# Open your Terminal app or Windows PowerShell by searching for "Terminal" or "Powershell" in your Start menu
# If you have problems finding your terminal, you can press the keys Win+R and a "Run" window will pop up in the bottom-left of the screen. Type "powershell" and hit Enter.
# Now head to the folder containing the script and the plates for the run, using the "cd" command. Remember to use single quotes for the path. For example:
cd 'C:\Users\myusername\Downloads\Generate_MiSeq_CSV-2.6.1\'

# If you don't know your PATH you can also type "cd " and drag-and-drop the folder from the File Explorer inside the terminal.

# Then, we need to locate your Rscript.exe executable. You can open your File Explorer and head to "This PC", then select "OS C:", then "Program Files" (EN) or "Programmi" (IT) and open the "R" folder. Check which one is the latest version (the highest number) and edit the line of code below (R-4.3.1) in order to execute the appropriate version of R.
# Remember to execute the command below with the ampersand (&) first! Note the use of double quotes for the path to the Rscript.exe executable.
# Note to the IT users: even if your folder is named "Programmi", you still need to type "Program Files" in the path below. Just adjust the R-X.X.X version in the path

& "C:\Program Files\R\R-4.3.1\bin\Rscript.exe" Generate-MiSeq-Manifest.R index_database.tsv header.tsv Project_name 2023-05-18 4 ESEMPIO_PLATE.xlsx

## You need to change the "Project_name" according to the name you want to give to the run
## You need to change the date "2023-05-18", setting it to the date on which the run WILL BE conducted
## You need to change the number "4" - the one between the date and the excel file - according to the number of plates in the excel file
## You need to change the excel file with the plates according to the samples and plates you have prepared, sticking to the template



# Additional details:

# 1st argument contains the index database supplied
# 2nd argument contains the header section supplied
# 3rd argument specifies the project name. MUST avoid using spaces or special characters (e.g., wildcards . * )
# 4th argument specifies the project's date. MUST use the date of the sequencing run in format YYYY-MM-DD
# 5th argument contains the number of plates present in the XLSX file
# 6th argument points to the XLSX file for the plates built as such:
# # Each separate plate is in a separate sheet
# # The plate must start from the first cell in the upper left corner of the Excel sheet
# # The upper left cell must contain a single letter pointing to the adapter set (A, B, C, D). No spaces, no merged cells.
# # Column names and row names MUST contain index codes (N- in the columns, S- in the rows) OR 'empty'
# # The inner part of the table must contain sample's name. Avoid name starting with numbers or containing special characters
# # In case of an empty cell you MUST fill the cell with the term: 'empty'
# # An example of a plate file is supplied
14 changes: 14 additions & 0 deletions header.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[Header]
Experiment Name Intert_Name
Date Insert_Date
Module GenerateFASTQ - 3.0.1
Workflow GenerateFASTQ
Library Prep Kit Nextera XT
Index Kit Nextera XT v2 Index Kit Sets A B C D
Chemistry Amplicon
[Reads]
251
251
[Settings]
adapter CTGTCTCTTATACACATCT
[Data]
41 changes: 41 additions & 0 deletions index_database.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
Index_Name Sequence
S502 CTCTCTAT
S503 TATCCTCT
S505 GTAAGGAG
S506 ACTGCATA
S507 AAGGAGTA
S508 CTAAGCCT
S510 CGTCTAAT
S511 TCTCTCCG
S513 TCGACTAG
S515 TTCTAGCT
S516 CCTAGAGT
S517 GCGTAAGA
S518 CTATTAAG
S520 AAGGCTAT
S521 GAGCCTTA
S522 TTATGCGA
N701 TAAGGCGA
N702 CGTACTAG
N703 AGGCAGAA
N704 TCCTGAGC
N705 GGACTCCT
N706 TAGGCATG
N707 CTCTCTAC
N710 CGAGGCTG
N711 AAGAGGCA
N712 GTAGAGGA
N714 GCTCATGA
N715 ATCTCAGG
N716 ACTCGCTA
N718 GGAGCTAC
N719 GCGTAGTA
N720 CGGAGCCT
N721 TACGCTGC
N722 ATGCGCAG
N723 TAGCGCTC
N724 ACTGAGCG
N726 CCTAAGAC
N727 CGATCAGT
N728 TGCAGCTA
N729 TCGACGTC

0 comments on commit e834686

Please sign in to comment.