-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEDA_Chang_Noordermeer_2021.Rmd
103 lines (81 loc) · 2.8 KB
/
EDA_Chang_Noordermeer_2021.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
---
title: "Analysis"
author: "Mikhail Dozmorov"
date: "`r Sys.Date()`"
output:
pdf_document:
toc: no
html_document:
theme: cerulean
toc: yes
---
```{r setup, echo=FALSE, message=FALSE, warning=FALSE}
# Set up the environment
library(knitr)
opts_chunk$set(cache.path='cache/', fig.path='img/', cache=F, tidy=T, fig.keep='high', echo=F, dpi=100, warnings=F, message=F, comment=NA, warning=F, results='as.is', fig.width = 10, fig.height = 6) #out.width=700,
library(pander)
panderOptions('table.split.table', Inf)
set.seed(1)
```
# Libraries
```{r libraries}
library(tidyverse)
library(readxl)
library(writexl)
library(GenomicRanges)
```
# Settings
```{r settings}
# Project folder path
dir_data <- "/Users/mdozmorov/Documents/Work/GitHub/CTCF.dev"
# Input files
fileNameIn1 <- file.path(dir_data, "data/Chang_Noordermeer_2021.xlsx")
# download.file("https://www.biorxiv.org/content/biorxiv/early/2021/04/15/2021.04.15.440007/DC1/embed/media-1.xlsx", fileNameIn1)
# Output files
fileNameOut1 <- "results/CTCF_mm10_Chang_Noordermeer_2021.RData"
```
# Load data
"ID" - numerical ID of a region with multiple (overlapping) CTCF ChIP-seq peaks. "length" - length of the peak. "value" - strength of the peak. "peak.log10pval", "peak.log10qval" - significance of the peak. "sequence" - sequence of the CTCF motif. "CTCF.log10pval", "CTCF.log10qval" - significance of the motif. "repeats" - whether the motif overlaps a repeat.
```{r data}
# Read in data
mtx <- read_xlsx(fileNameIn1, skip = 1)
```
## EDA
```{r}
summary(mtx$`-log10 p-value...7`)
summary(mtx$`-log10 q-value...8`)
summary(mtx$`-log10 p-value...13`)
summary(mtx$`-log10 q-value...14`)
table(mtx$`overlap with repeat(s)`)
```
# GRanges
```{r}
gr <- GRanges(seqnames = mtx$coordinate, IRanges(start = mtx$...3, end = mtx$...4), strand = mtx$strand)
gr$ID <- mtx$number
gr$length <- mtx$length
gr$value <- mtx$value
gr$peak.log10pval <- mtx$`-log10 p-value...7`
gr$peak.log10qval <- mtx$`-log10 q-value...8`
gr$sequence <-mtx$sequence
gr$CTCF.log10pval <- mtx$`-log10 p-value...13`
gr$CTCF.log10qval <- mtx$`-log10 q-value...14`
gr$repeats <- mtx$`overlap with repeat(s)`
genome_id <- "mm10"
# Get chromosome info and match it to the chromosome order in ctcfBED
chrom_data <- GenomeInfoDb::getChromInfoFromUCSC(genome = genome_id)
chrom_data <- chrom_data[chrom_data$chrom %in% seqlevels(gr), ]
chrom_data <- chrom_data[match(seqlevels(gr), chrom_data$chrom), ]
# Check if chromosome order is the same
if (!all.equal(seqlevels(gr), chrom_data$chrom)) {
print(paste("Chromosome order does not match for", genome_id, "genome."))
break
}
# Assign seqinfo data
seqlengths(gr) <- chrom_data$size
isCircular(gr) <- chrom_data$circular
genome(gr) <- genome_id
gr
# Save as RData object.
save(list = gr, file = fileNameOut1)
# load(file = paste0(subdir, ".RData"))
```