|
| 1 | +# Rd |
| 2 | +# description >> required |
| 3 | +# argument |
| 4 | +# item >> CpG_table >> with c("TargetID", "MAPINFO", "CHR") and "FORWARD_SEQUENCE" if add_seq_info = TRUE |
| 5 | +# item >> file_CpG_context >> Path of the file containing CpG context (HMD, PMD, LMR, UMR) file in .txt, .bed or Rdata with CpG feature to add to the table, with column c("chr", "start", "end") and the column to add |
| 6 | +# item >> name_col_CpG_context >> Names of columns in file_CpG_context that should be used to annotate CpG_table |
| 7 | +# item >> file_chrom_state >> Path of the file containing chromatin states file in .txt, .bed or Rdata with CpG feature to add to the table, with column c("chr", "start", "end") and the column to add |
| 8 | +# item >> name_col_chrom_state >> Names of columns in file_chrom_state that should be used to annotate CpG_table |
| 9 | +# item >> file_CGI >> Path of the file containing. file in .txt, .bed or Rdata with CpG feature to add to the table, with column c("chr", "start", "end") and the column to add |
| 10 | +# item >> name_col_CGI >> Names of columns in file_CGI that should be used to annotate CpG_table |
| 11 | +# item >> file_genes >> Path of the file containing. file in .txt, .bed or Rdata with CpG feature to add to the table, with column c("chr", "start", "end") and the column to add |
| 12 | +# item >> name_col_genes >> Names of columns in file_genes that should be used to annotate CpG_table |
| 13 | +# item >> file_replication >> Path of the file containing. file in .txt, .bed or Rdata with CpG feature to add to the table, with column c("chr", "start", "end") and the column to add |
| 14 | +# item >> name_col_replication >> Names of columns in file_replication that should be used to annotate CpG_table |
| 15 | +# item >> add_seq_info >> If = TRUE, add the number of adjacent CpG and the nucleotide context to the CpG_table |
| 16 | +# value >> return CpG_table annoted with Chromatine feature |
| 17 | +# author >> Léa Meunier |
| 18 | +# keyword >> CpG annotation |
| 19 | +#` @import stringr |
| 20 | +# end |
| 21 | + |
| 22 | + |
| 23 | +chromatin.feature <- function(CpG_table = CpG_table, file_CpG_context = NULL, name_col_CpG_context = NULL, file_chrom_state = NULL, name_col_chrom_state = NULL, file_CGI = NULL, name_col_CGI = NULL, file_genes = NULL, name_col_genes = NULL, file_replication = NULL, name_col_replication = NULL, add_seq_info = TRUE, save = FALSE, output.directory){ |
| 24 | + |
| 25 | + CpG_table = data.frame(CpG_table) |
| 26 | + |
| 27 | + ## Error if CpG_table does'nt contain minimal column c("TargetID", "MAPINFO", "CHR") |
| 28 | + missing_col = setdiff(c("TargetID", "MAPINFO", "CHR"), colnames(CpG_table)) |
| 29 | + if(length(missing_col)>= 1) |
| 30 | + stop(patse0("column", missing_col,"not found in CpG_table")) |
| 31 | + |
| 32 | + # Add several CpG feature |
| 33 | + # Chromatin state |
| 34 | + if(!is.null(file_chrom_state)){ |
| 35 | + |
| 36 | + file_name = stringr::str_split(file_chrom_state,"\\.", n = Inf, simplify = TRUE) |
| 37 | + extention = tolower(file_name[ncol(file_name)]) |
| 38 | + |
| 39 | + if(extention == "txt" | extention == "bed"){ |
| 40 | + chrom_state = read.table(file_chrom_state , sep = "\t", header = T) |
| 41 | + }else if(extention == "rdata"){ |
| 42 | + chrom_state = load.RData(file_chrom_state) |
| 43 | + } |
| 44 | + CpG_table = table.PosXSegm(table_Pos = CpG_table, table_Pos.chrom.col = "CHR", table_Pos.pos.col = "MAPINFO", |
| 45 | + table_Segm = chrom_state, table_Segm.chrom.col = "chr", table_Segm.start.col = "start", |
| 46 | + table_Segm.end.col = "end", cols_to_add = name_col_chrom_state, names_cols_to_add = c("state", "domain")) |
| 47 | + } |
| 48 | + |
| 49 | + # CpG context |
| 50 | + if(!is.null(file_CpG_context)){ |
| 51 | + |
| 52 | + file_name = stringr::str_split(file_CpG_context,"\\.", n = Inf, simplify = TRUE) |
| 53 | + extention = tolower(file_name[ncol(file_name)]) |
| 54 | + |
| 55 | + if(extention == "txt" | extention == "bed"){ |
| 56 | + CpG_context = read.table(file_CpG_context , sep = "\t", header = T) |
| 57 | + CpG_context$chr = paste("chr", CpG_context$chr, sep = "") |
| 58 | + }else if(extention == "rdata"){ |
| 59 | + CpG_context = load.RData(file_CpG_context) |
| 60 | + } |
| 61 | + |
| 62 | + CpG_table = table.PosXSegm(table_Pos = CpG_table, table_Pos.chrom.col = "CHR", table_Pos.pos.col = "MAPINFO", |
| 63 | + table_Segm = CpG_context, table_Segm.chrom.col = "chr", table_Segm.start.col = "start", |
| 64 | + table_Segm.end.col = "end", cols_to_add = name_col_CpG_context, names_cols_to_add = "CpG_context") |
| 65 | + } |
| 66 | + |
| 67 | + # CGI feature |
| 68 | + if(!is.null(file_CGI)){ |
| 69 | + |
| 70 | + file_name = stringr::str_split(file_CGI,"\\.", n = Inf, simplify = TRUE) |
| 71 | + extention = tolower(file_name[ncol(file_name)]) |
| 72 | + |
| 73 | + if(extention == "txt" | extention == "bed"){ |
| 74 | + CGI = read.table(file_CGI , sep = "\t", header = T) |
| 75 | + }else if(extention == "rdata"){ |
| 76 | + CGI = load.RData(file_CGI) |
| 77 | + } |
| 78 | + |
| 79 | + CpG_table = table.PosXSegm(table_Pos = CpG_table, table_Pos.chrom.col = "CHR", table_Pos.pos.col = "MAPINFO", |
| 80 | + table_Segm = CGI, table_Segm.chrom.col = "chr", table_Segm.start.col = "start", |
| 81 | + table_Segm.end.col = "end", cols_to_add = name_col_CGI, names_cols_to_add = "cgi_feature") |
| 82 | + } |
| 83 | + |
| 84 | + # Genes feature |
| 85 | + if(!is.null(file_genes)){ |
| 86 | + |
| 87 | + file_name = stringr::str_split(file_genes,"\\.", n = Inf, simplify = TRUE) |
| 88 | + extention = tolower(file_name[ncol(file_name)]) |
| 89 | + |
| 90 | + if(extention == "txt" | extention == "bed"){ |
| 91 | + Genes = read.table(file_genes , sep = "\t", header = T) |
| 92 | + }else if(extention == "rdata"){ |
| 93 | + Genes = load.RData(file_genes) |
| 94 | + } |
| 95 | + |
| 96 | + CpG_table = table.PosXSegm(table_Pos = CpG_table, table_Pos.chrom.col = "CHR", table_Pos.pos.col = "MAPINFO", |
| 97 | + table_Segm = Genes, table_Segm.chrom.col = "chr", table_Segm.start.col = "start", |
| 98 | + table_Segm.end.col = "end", cols_to_add = name_col_genes, names_cols_to_add = c("gene_name", "gene_feature")) |
| 99 | + } |
| 100 | + |
| 101 | + # Replication timing |
| 102 | + if(!is.null(file_CGI)){ |
| 103 | + |
| 104 | + file_name = stringr::str_split(file_replication,"\\.", n = Inf, simplify = TRUE) |
| 105 | + extention = tolower(file_name[ncol(file_name)]) |
| 106 | + |
| 107 | + if(extention == "txt" | extention == "bed"){ |
| 108 | + replicatio = read.table(file_replication , sep = "\t", header = T) |
| 109 | + }else if(extention == "rdata"){ |
| 110 | + replicatio = load.RData(file_replication) |
| 111 | + } |
| 112 | + |
| 113 | + CpG_table = table.PosXSegm(table_Pos = CpG_table, table_Pos.chrom.col = "CHR", table_Pos.pos.col = "MAPINFO", |
| 114 | + table_Segm = replicatio, table_Segm.chrom.col = "chr", table_Segm.start.col = "start", |
| 115 | + table_Segm.end.col = "end", cols_to_add = name_col_replication, names_cols_to_add = "decile") |
| 116 | + } |
| 117 | + |
| 118 | + # Add nucleotide context and number of CpG in the adjacent sequence |
| 119 | + if(add_seq_info == TRUE){ |
| 120 | + tmp_table_CpG = data.frame(CpG_table$TargetID , CpG_table$FORWARD_SEQUENCE, str_split(CpG_table$FORWARD_SEQUENCE, "\\[CG\\]", simplify = TRUE)) |
| 121 | + colnames(tmp_table_CpG) = c("TargetID", "FORWARD_SEQUENCE", "FORWARD_SEQUENCE_pre", "FORWARD_SEQUENCE_post") |
| 122 | + |
| 123 | + tmp_table_CpG$pre_context = stringr::str_sub(tmp_table_CpG$FORWARD_SEQUENCE_pre, -1, -1) |
| 124 | + tmp_table_CpG$post_context = stringr::str_sub(tmp_table_CpG$FORWARD_SEQUENCE_post, 1, 1) |
| 125 | + |
| 126 | + CpG_table$context = apply(tmp_table_CpG, 1, function(x){ |
| 127 | + if((x[5] == "C" | x[5] == "G")&(x[6] == "C" | x[6] == "G")){ |
| 128 | + return("SCGS") |
| 129 | + }else if((x[5] == "C" | x[5] == "G")&(x[6] == "A" | x[6] == "T")){ |
| 130 | + return("SCGW") |
| 131 | + }else if((x[5] == "A" | x[5] == "T")&(x[6] == "C" | x[6] == "G")){ |
| 132 | + return("SCGW") |
| 133 | + }else{ |
| 134 | + return("WCGW") |
| 135 | + } |
| 136 | + }) |
| 137 | + |
| 138 | + tmp_table_CpG$FORWARD_SEQUENCE = as.character(tmp_table_CpG$FORWARD_SEQUENCE) |
| 139 | + tmp_table_CpG$FORWARD_SEQUENCE_red = stringr::str_sub(tmp_table_CpG$FORWARD_SEQUENCE, 25, -25) |
| 140 | + tmp_table_CpG$FORWARD_SEQUENCE_red_post = stringr::str_sub(tmp_table_CpG$FORWARD_SEQUENCE_post, 2, -25) |
| 141 | + |
| 142 | + tmp_table_CpG = data.frame(tmp_table_CpG) |
| 143 | + CpG_table$nb_flanking_CpG = sapply(tmp_table_CpG$FORWARD_SEQUENCE_red, function(x){return(length(gregexpr("CG", x)[[1]])-1)}) |
| 144 | + |
| 145 | + } |
| 146 | + |
| 147 | + if(save == TRUE){ |
| 148 | + if(!file.exists(output.directory)){ |
| 149 | + dir.create(output.directory) |
| 150 | + } |
| 151 | + CpG_feature = CpG_table |
| 152 | + save(CpG_feature, file = paste0(output.directory, "CpG_feature.Rdata")) |
| 153 | + write.table(CpG_feature, file = paste0(output.directory, "CpG_feature.txt")) |
| 154 | + } |
| 155 | + |
| 156 | + return(CpG_table) |
| 157 | +} |
0 commit comments