-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing
5 changed files
with
269 additions
and
2 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
############### script for generating input manifest for MiSeq sequencing machine ################## | ||
# Marco Fabbrini | ||
# VERSION:2.6 | ||
# RELEASE:03/Mar/2023 | ||
|
||
args<-commandArgs(TRUE) | ||
|
||
|
||
#----------LOAD FILES AND PARAMETERS------ | ||
if (!requireNamespace("readxl", quietly = TRUE)) | ||
install.packages("readxl", repos = "https://cran.rstudio.com/") | ||
library(readxl) | ||
|
||
# Debugging purposes: | ||
# pDataFile1 <- "index_database.tsv" | ||
pDataFile1 <- args[1] | ||
index_database<- read.delim(pDataFile1, header=T, sep="\t") | ||
|
||
# Debugging purposes: | ||
# pDataFile2 <- "header.tsv" | ||
pDataFile2 <- args[2] | ||
header_table <- read.delim(pDataFile2, header=F, sep="\t")[ , c(1:8)] | ||
header_table$V2[header_table$V2 == ""]=NA | ||
header_table[is.na(header_table)] = "" | ||
header_table = header_table[,c(1:7)] | ||
|
||
# Debugging purposes: | ||
# project_name = "test" | ||
# project_date = Sys.Date()+1 | ||
project_name = args[3] | ||
project_date = args[4] | ||
|
||
header_table$V2[2] = project_name | ||
header_table$V2[3] = project_date | ||
|
||
# Number of plates, i.e. how many sheets are read from the XLSX file (5th argument). | ||
# Command-line arguments arrive as character strings, so convert explicitly and fail | ||
# early with a clear message instead of a cryptic error later in `1:n_plates`. | ||
n_plates <- suppressWarnings(as.integer(args[5])) | ||
if ( is.na(n_plates) || n_plates < 1 ) | ||
{ | ||
  stop("The 5th argument (number of plates) must be a positive integer, got: ", args[5], call. = FALSE) | ||
} | ||
|
||
pDataFile3 <- args[6] | ||
|
||
|
||
# SCRIPT ------------------------------------------------------------------ | ||
|
||
# Read one plate per sheet of the XLSX file and remember the index set of each plate. | ||
# Every sheet n is stored in a variable named "plate<n>" (plate1, plate2, ...), which the | ||
# following sections look up again by name through get(). | ||
index_set <- c() | ||
for ( n in 1:n_plates) | ||
{ | ||
# Keep only the first 8 data rows and the first 13 columns of the sheet: | ||
# column 1 holds the row index names, columns 2-13 are the sample columns. | ||
suppressMessages(assign(paste("plate", n, sep=""), as.data.frame(read_xlsx(path = pDataFile3, sheet = n )[c(1:8),c(1:13)]) )) | ||
# The name of the first column (upper-left cell of the sheet) is taken as the index set of the plate | ||
index_set <- c(index_set, colnames(get(paste("plate", n, sep="")))[1] ) | ||
} | ||
|
||
# Lookup table: plate number -> index set | ||
index_correspondence = data.frame(plate=1:n_plates, index_set=index_set) | ||
# Empty 7-column table that will receive one row per sample (rows are written by position later) | ||
data_section = data.frame(matrix(ncol=7)) | ||
colnames(data_section) = c("Sample_ID", "Description" ,"I7_Index_ID", "index", "I5_Index_ID", "index2", "Sample_Project") | ||
# The same column titles kept as a plain vector: it is written as a row of the output file | ||
data_section_header = c("Sample_ID", "Description" ,"I7_Index_ID", "index", "I5_Index_ID", "index2", "Sample_Project") | ||
|
||
|
||
# Check for unknown headers ----------------------------------------------- | ||
# For every plate, verify that all index names used as column headers and as row names | ||
# exist in index_database$Index_Name (or mark an empty position). On the first violation | ||
# the offending names are printed and the script is aborted with stop(). | ||
for ( n in 1:n_plates) | ||
{ | ||
# Column headers 2-13: valid when listed in the index database OR when the header contains "empty". | ||
# NOTE(review): the substring match (grepl) presumably tolerates repeated "empty" headers | ||
# that get a suffix when the sheet is read - confirm. | ||
if ( ! all ( colnames(get(paste("plate", n, sep="")))[2:13] %in% index_database$Index_Name | grepl("empty", colnames(get(paste("plate", n, sep="")))[2:13]) ) ) | ||
{ | ||
print(paste("No sample sheet has been produced. Check your EXCEL plate table first. Aligator.")) | ||
stop( print( paste("Invalid Index name found in columns of plate N.", n, " --- Index \'", | ||
colnames(get(paste("plate", n, sep="")))[2:13][!(colnames(get(paste("plate", n, sep="")))[2:13] %in% index_database$Index_Name | grepl("empty", colnames(get(paste("plate", n, sep="")))[2:13]))], | ||
"\' not found in index_database.tsv", sep="") ) ) | ||
} | ||
# Row names (first column): valid when listed in the index database OR exactly equal to "empty" | ||
if ( !all( get(paste("plate", n, sep=""))[,1] %in% c(index_database$Index_Name, "empty") ) ) | ||
{ | ||
print(paste("No sample sheet has been produced. Check your EXCEL plate table first. Aligator.")) | ||
stop( print( paste("Invalid Index name found in rows of plate N. ", n, " --- Index \'", | ||
get(paste("plate", n, sep=""))[,1][!get(paste("plate", n, sep=""))[,1] %in% c(index_database$Index_Name, "empty")], | ||
"\' not found", sep="") ) ) | ||
} | ||
} | ||
|
||
|
||
# Generate the manifest --------------------------------------------------- | ||
# Walk every plate cell by cell and append one row to data_section for each cell that | ||
# holds a sample. `insert` is the position of the next row to write. | ||
# NOTE(review): the comparisons with "empty" below fail on a truly blank (NA) cell, | ||
# so unused cells have to contain the text "empty". | ||
insert = 1 | ||
for ( n in 1:n_plates) | ||
{ | ||
# Index names of the sample columns (headers 2..last) and of the rows (first column) | ||
index_column = colnames(get(paste("plate", n, sep="")))[2:ncol(get(paste("plate", n, sep="")))] | ||
index_row = get(paste("plate", n, sep=""))[,1] | ||
for ( row in 1:length(index_row) ) | ||
{ | ||
if ( get(paste("plate", n, sep=""))[row,1] != "empty" ) # If the row Index is "empty" skip all the row | ||
{ | ||
for ( col in 2:(length(index_column)+1) ) | ||
{ | ||
if ( !grepl("empty", colnames(get(paste("plate", n, sep="")))[col]) ) # If the column Index is "empty" skip all the column | ||
{ | ||
if ( get(paste("plate", n, sep=""))[row,col] != "empty" ) # If the cell value is "empty" skip it | ||
{ | ||
# Sample name with every space removed, used both as ID and as description | ||
data_section[insert,"Sample_ID"] = gsub(" ", "", get(paste("plate", n, sep=""))[row,col]) | ||
data_section[insert,"Description"] = gsub(" ", "", get(paste("plate", n, sep=""))[row,col]) | ||
# Column index: "index" gets "<index set>-<column index name>", "I7_Index_ID" gets its sequence. | ||
# NOTE(review): the column titles suggest the opposite assignment (ID = name, index = sequence); | ||
# confirm which layout the sequencer software expects before changing anything. | ||
data_section[insert,"index"] = paste( index_correspondence[index_correspondence$plate == n , "index_set"] , | ||
colnames( get(paste("plate", n, sep="")) )[col] , | ||
sep="-") | ||
data_section[insert,"I7_Index_ID"] = index_database[ index_database$Index_Name == colnames( get(paste("plate", n, sep="")) )[col] , "Sequence"] | ||
# Row index: "index2" gets "<index set>-<row index name>", "I5_Index_ID" gets its sequence | ||
data_section[insert,"index2"] = paste( index_correspondence[index_correspondence$plate == n , "index_set"] , | ||
get(paste("plate", n, sep=""))[,1][row] , | ||
sep="-") | ||
data_section[insert,"I5_Index_ID"] = index_database[ index_database$Index_Name == get(paste("plate", n, sep=""))[,1][row] , "Sequence"] | ||
data_section[insert,"Sample_Project"] = project_name | ||
insert = insert + 1 | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
|
||
# Check for weird sample names -------------------------------------------- | ||
# Print naming advice (the run is NOT stopped) when any Sample_ID contains a space or a dot. | ||
# NOTE(review): spaces were already removed from Sample_ID with gsub() when the manifest | ||
# was generated, so in practice only the dot test can trigger this warning. | ||
if ( ( length( grep(" ", data_section$Sample_ID) ) + length( grep("\\.", data_section$Sample_ID) ) ) != 0 ) { | ||
print(paste("I strongly suggest you to check your samples name")) | ||
print(paste("Golden rule for assigning good sample's names:")) | ||
print(paste("• Don't start any name with a number")) | ||
print(paste("• Avoid using spaces, points or special characters such as * ' / + ° #, etc...")) | ||
print(paste("• It's better to use names indicative of sample's characteristics (e.g., with timepoint and/or group, treatment or whatever)")) | ||
} | ||
|
||
|
||
# Check for duplicated samples -------------------------------------------- | ||
# Sample_IDs must be unique in the manifest. If any ID occurs more than once, print the | ||
# EXCEL coordinates of every plate cell holding it and do NOT write the sample sheet; | ||
# otherwise write the final CSV. | ||
if ( any(duplicated(data_section$Sample_ID)) ) | ||
{ | ||
  # unique(): an ID present three or more times must still be reported only once per cell | ||
  dup_ids = unique(data_section$Sample_ID[duplicated(data_section$Sample_ID)]) | ||
  for ( n in 1:n_plates ) | ||
  { | ||
    plate_n = get(paste("plate", n, sep="")) | ||
    for ( dupiter in dup_ids ) | ||
    { | ||
      # Column 1 holds the row index names, so sample cells start at column 2 | ||
      for ( col_idx in 2:ncol(plate_n) ) | ||
      { | ||
        # Exact comparison (not a regular expression) on the cell with spaces stripped the | ||
        # same way Sample_ID was built: "S1" does not match "S10", and "S 1" is still found | ||
        dup_rows = which( gsub(" ", "", plate_n[,col_idx]) == dupiter ) | ||
        for ( row_idx in dup_rows ) | ||
        { | ||
          # +1 because the first EXCEL row holds the column headers | ||
          print( paste("Duplicate ", dupiter , " found in plate N.", n, ", index set ", index_correspondence$index_set[index_correspondence$plate == n], ", EXCEL coordinates ", LETTERS[col_idx], row_idx + 1, sep="")) | ||
        } | ||
      } | ||
    } | ||
  } | ||
|
||
  print(paste("No sample sheet has been produced. Check your EXCEL plate table first. Aligator.")) | ||
} else { | ||
  # Give both tables the same column names so that they can be stacked with rbind(): | ||
  # header section, then the data column titles, then one row per sample | ||
  colnames(data_section) = colnames(header_table) | ||
  final_format <- rbind(header_table, data_section_header, data_section) | ||
  # If everything is fine, save the manifest to a CSV file, ready for the sequencer | ||
  write.table( final_format, paste(project_name, "_SampleSheet_", project_date, ".csv", sep=""), sep=",", col.names=F, row.names=F, quote=F) | ||
  print(paste("Manifest generated succesfully")) | ||
} | ||
|
||
|
||
######################################- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,61 @@ | ||
# Generate-MiSeq-Manifest | ||
Generate a .CSV manifest file for importing multiple samples in the Illumina MiSeq machine | ||
###############------ Help section: script for generating input manifest for MiSeq sequencing machine------################## | ||
# Marco Fabbrini | ||
# VERSION:2.6.1 | ||
# RELEASE:24/Lug/2023 | ||
|
||
# Requirements: | ||
# # The readxl package is required ( install.packages("readxl") in a R session) | ||
# # Rscript bash bin is required (on MacOS or Linux platforms), or either use the Rscript.exe windows executable | ||
|
||
###### --------- | ||
###### Tutorial for Linux and MacOS: | ||
# Open your Terminal app and head to the folder containing this script and the plates for the run, using the "cd" command, for example: | ||
cd /Users/myusername/Downloads/Generate_MiSeq_CSV-2.6.1 | ||
# If you don't know your PATH you can also type "cd " and drag-and-drop the folder from the Finder/FileExplorer inside the terminal. | ||
|
||
# Paste this in your Terminal, changing the parameters accordingly | ||
Rscript Generate-MiSeq-Manifest.R index_database.tsv header.tsv Project_name 2023-05-18 4 ESEMPIO_PLATE.xlsx | ||
|
||
## You need to change the "Project_name" accordingly to the name you want to give to the run | ||
## You need to change the date "2023-05-18" setting the date in which the run WILL BE conducted | ||
## You need to change the number "4" - the one between the date and the excel file - according to the number of plates in the excel file | ||
## You need to change the excel file with the plates accordingly to the samples and plates you have prepared, sticking to the template | ||
|
||
|
||
###### --------- | ||
###### Tutorial for Windows 10 and Windows 11: | ||
# Open your Terminal app or Windows PowerShell by searching for "Terminal" or "Powershell" in your Start menu | ||
# If you have problems finding your terminal, you can press the keys Win+R and a "Run" window will pop up in the bottom-left corner of the screen. Type "powershell" and hit Enter. | ||
# Now head to the folder containing the script and the plates for the run, using the "cd" command. Remember to use single quotes for the path. For example: | ||
cd 'C:\Users\myusername\Downloads\Generate_MiSeq_CSV-2.6.1\' | ||
|
||
# If you don't know your PATH you can also type "cd " and drag-and-drop the folder from the File Explorer inside the terminal. | ||
|
||
# Then, we need to locate your Rscript.exe executable. You can open your File Explorer and head to "This PC", then select "OS C:", then "Program Files" (EN) or "Programmi" (IT) and open the "R" folder. Check which one is the latest version (the highest number) and edit the line of code below (R-4.3.1) in order to execute the appropriate version of R. | ||
# Remember to execute the command below with the commercial & first! Note the use of double quotes for the path to the Rscript.exe executable. | ||
# Note to the IT users: even if your folder is named "Programmi", you still need to type "Program Files" in the path below. Just adjust the R-X.X.X version in the path | ||
|
||
& "C:\Program Files\R\R-4.3.1\bin\Rscript.exe" Generate-MiSeq-Manifest.R index_database.tsv header.tsv Project_name 2023-05-18 4 ESEMPIO_PLATE.xlsx | ||
|
||
## You need to change the "Project_name" accordingly to the name you want to give to the run | ||
## You need to change the date "2023-05-18" setting the date in which the run WILL BE conducted | ||
## You need to change the number "4" - the one between the date and the excel file - according to the number of plates in the excel file | ||
## You need to change the excel file with the plates accordingly to the samples and plates you have prepared, sticking to the template | ||
|
||
|
||
|
||
# Additional details: | ||
|
||
# 1st argument contains the index database supplied | ||
# 2nd argument contains the header section supplied | ||
# 3rd argument specifies the project name. MUST avoid using spaces or special characters (e.g., wildcards . * ) | ||
# 4th argument specifies the project's date. MUST use the date of the sequencing run in format YYYY-MM-DD | ||
# 5th argument contains the number of plates present in the XLSX file | ||
# 6th argument points to the XLSX file for the plates built as such: | ||
# # Each separate plate is in a separate sheet | ||
# # The plate must start from the first cell in the upper left corner of the Excel sheet | ||
# # The upper left cell must contain a single letter pointing to the adapter set (A, B, C, D). No spaces, no merged cells. | ||
# # Column names and row names MUST contain index codes (N- in the columns, S- in the rows) OR 'empty' | ||
# # The inner part of the table must contain sample's name. Avoid name starting with numbers or containing special characters | ||
# # In case of an empty cell you MUST fill the cell with the term: 'empty' | ||
# # An example of a plate file is supplied |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
[Header] | ||
Experiment Name Intert_Name | ||
Date Insert_Date | ||
Module GenerateFASTQ - 3.0.1 | ||
Workflow GenerateFASTQ | ||
Library Prep Kit Nextera XT | ||
Index Kit Nextera XT v2 Index Kit Sets A B C D | ||
Chemistry Amplicon | ||
[Reads] | ||
251 | ||
251 | ||
[Settings] | ||
adapter CTGTCTCTTATACACATCT | ||
[Data] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
Index_Name Sequence | ||
S502 CTCTCTAT | ||
S503 TATCCTCT | ||
S505 GTAAGGAG | ||
S506 ACTGCATA | ||
S507 AAGGAGTA | ||
S508 CTAAGCCT | ||
S510 CGTCTAAT | ||
S511 TCTCTCCG | ||
S513 TCGACTAG | ||
S515 TTCTAGCT | ||
S516 CCTAGAGT | ||
S517 GCGTAAGA | ||
S518 CTATTAAG | ||
S520 AAGGCTAT | ||
S521 GAGCCTTA | ||
S522 TTATGCGA | ||
N701 TAAGGCGA | ||
N702 CGTACTAG | ||
N703 AGGCAGAA | ||
N704 TCCTGAGC | ||
N705 GGACTCCT | ||
N706 TAGGCATG | ||
N707 CTCTCTAC | ||
N710 CGAGGCTG | ||
N711 AAGAGGCA | ||
N712 GTAGAGGA | ||
N714 GCTCATGA | ||
N715 ATCTCAGG | ||
N716 ACTCGCTA | ||
N718 GGAGCTAC | ||
N719 GCGTAGTA | ||
N720 CGGAGCCT | ||
N721 TACGCTGC | ||
N722 ATGCGCAG | ||
N723 TAGCGCTC | ||
N724 ACTGAGCG | ||
N726 CCTAAGAC | ||
N727 CGATCAGT | ||
N728 TGCAGCTA | ||
N729 TCGACGTC |