-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path02_EDA_SCREEN.Rmd
125 lines (106 loc) · 3.66 KB
/
02_EDA_SCREEN.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
---
title: "Analysis"
author: "Mikhail Dozmorov"
date: "`r Sys.Date()`"
output:
pdf_document:
toc: no
html_document:
theme: cerulean
toc: yes
---
```{r setup, echo=FALSE, message=FALSE, warning=FALSE}
# Set up the environment
library(knitr)
opts_chunk$set(cache.path='cache/', fig.path='img/', cache=F, tidy=T, fig.keep='high', echo=F, dpi=100, warnings=F, message=F, comment=NA, warning=F, results='as.is', fig.width = 10, fig.height = 6) #out.width=700,
library(pander)
panderOptions('table.split.table', Inf)
set.seed(1)
```
# Libraries
```{r libraries}
library(tidyverse)
library(readxl)
library(writexl)
library(GenomicRanges)
library(rtracklayer)
```
# Settings
```{r settings}
# Project folder path
dir_project <- "/Users/mdozmorov/Documents/Work/GitHub/CTCF.dev"
# Input files
# Human CTCF-bound cCRE (hg38)
fileNameIn1 <- file.path(dir_project, "data/GRCh38-CTCF.bed")
if (!file.exists(fileNameIn1)) {
download.file("https://api.wenglab.org/screen_v13/fdownloads/cCREs/GRCh38-CTCF.bed", fileNameIn1)
}
# Mouse CTCF-bound cCRE (mm10)
fileNameIn2 <- file.path(dir_project, "data/mm10-CTCF.bed")
if (!file.exists(fileNameIn2)) {
download.file("https://api.wenglab.org/screen_v13/fdownloads/cCREs/mm10-CTCF.bed", fileNameIn2)
}
# Output files
fileNameOut1 <- file.path(dir_project, "RData", "hg38.SCREEN.GRCh38_CTCF.RData")
fileNameOut1.1 <- file.path(dir_project, "RData", "hg38.SCREEN.GRCh38_CTCF.bed")
fileNameOut2 <- file.path(dir_project, "RData", "mm10.SCREEN.mm10_CTCF.RData")
fileNameOut2.1 <- file.path(dir_project, "RData", "mm10.SCREEN.mm10_CTCF.bed")
```
PLS/pELS/dELS - Promoter-Like Signatures, proximal/distal Enhancer-Like Signatures
# Load hg38 data
```{r data}
# Read in data
mtx_human <- read_tsv(fileNameIn1, col_names = FALSE)
mtx_mouse <- read_tsv(fileNameIn2, col_names = FALSE)
```
## EDA
```{r}
table(mtx_human$X6) / length(mtx_human$X6)
table(mtx_mouse$X6) / length(mtx_mouse$X6)
```
## GRanges
```{r}
# Function to convert SCREEN data to GRanges
mtx_to_gr <- function(mtx = mtx_human, genome_id = "hg38") {
gr <- GRanges(seqnames = mtx$X1,
IRanges(start = mtx$X2,
end = mtx$X3))
gr$ID1 <- mtx$X4
gr$ID2 <- mtx$X5
gr$Type <- mtx$X6
# Get chromosome info and match it to the chromosome order in ctcfBED
chrom_data <- GenomeInfoDb::getChromInfoFromUCSC(genome = genome_id)
# Subset to common chromosomes
common_chromosomes <- intersect(chrom_data$chrom, seqlevels(gr))
chrom_data <- chrom_data[chrom_data$chrom %in% common_chromosomes, ]
gr <- keepSeqlevels(gr, common_chromosomes, pruning.mode = "tidy")
# Match order
chrom_data <- chrom_data[match(seqlevels(gr), chrom_data$chrom), ]
# Check if chromosome order is the same
if (!all.equal(seqlevels(gr), chrom_data$chrom)) {
print(paste("Chromosome order does not match for", genome_id, "genome."))
break
}
# Assign seqinfo data
seqlengths(gr) <- chrom_data$size
isCircular(gr) <- chrom_data$circular
genome(gr) <- genome_id
return(gr)
}
```
```{r}
# Convert SCREEN data to GRanges
hg38.SCREEN.GRCh38_CTCF <- mtx_to_gr(mtx = mtx_human, genome_id = "hg38")
length(hg38.SCREEN.GRCh38_CTCF)
summary(width(hg38.SCREEN.GRCh38_CTCF))
mm10.SCREEN.mm10_CTCF <- mtx_to_gr(mtx = mtx_mouse, genome_id = "mm10")
length(mm10.SCREEN.mm10_CTCF)
summary(width(mm10.SCREEN.mm10_CTCF))
# Save as RData object. subdir is the character name of the CTCF GRanges variable
save(list = "hg38.SCREEN.GRCh38_CTCF", file = fileNameOut1)
save(list = "mm10.SCREEN.mm10_CTCF", file = fileNameOut2)
# load(file = paste0(subdir, ".RData"))
# Save BED files
export.bed(hg38.SCREEN.GRCh38_CTCF, fileNameOut1.1)
export.bed(mm10.SCREEN.mm10_CTCF, fileNameOut2.1)
```