Skip to content

Commit

Permalink
Improve doc and uniformize scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
johanzi committed Dec 17, 2018
1 parent bd5da29 commit b789808
Show file tree
Hide file tree
Showing 10 changed files with 127 additions and 89 deletions.
2 changes: 1 addition & 1 deletion extended_figure_9b.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ ggplot(df, aes(line, rosette)) +
scale_y_continuous(breaks=seq(0,60,10)) +
theme_light() +
ylab("") +
xlab("")+
xlab("")#+
theme(axis.text.x = element_blank(),legend.text=element_text(size=8), legend.title=element_text(size=8))


Expand Down
3 changes: 2 additions & 1 deletion figure_1e.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#libraries to load

########################################################
library(ggplot2)#to make plots with colored factors
library(multcomp)#Do multicomparison
library(agricolae)
########################################################

#FT expression for Block C IR construct

Expand Down
3 changes: 3 additions & 0 deletions figure_2e.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@

#Data analysis H3K9me2

#libraries to load
########################################################
library(ggplot2)#to make plots with colored factors
########################################################

#Load data in a dataframe
data = read.table("figure_2e_data.txt", header=TRUE, dec=",", sep="\t")
Expand Down
4 changes: 2 additions & 2 deletions figure_3c.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#libraries to load

########################################################
library(ggplot2)#to make plots with colored factors
library(multcomp)#Do multicomparison

########################################################

#FT expression for Block C IR construct

Expand Down
3 changes: 0 additions & 3 deletions figure_3d.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@


#flowering time new RNAi line culture 34615


#libraries to load
########################################################
library(ggplot2)#to make plots with colored factors
Expand Down
5 changes: 1 addition & 4 deletions figure_3e.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
#Flowering time new RNAi line culture 40889


#libraries to load
########################################################
library(multcomp)#Do multicomparison
library(agricolae)#Load TukeyHSD package
library(car)#Leven test
########################################################

df <- read.table("figure_3e_data.txt", stringsAsFactors=TRUE,sep="\t", na.strings = c("NA",""), header=TRUE)

Expand Down Expand Up @@ -39,8 +38,6 @@ qqline(df$rosette)
with(df, shapiro.test(rosette))
#If p<5%, we reject hypothesis that residues follow normal distribution

#Alternatively (need library "car")
leveneTest(rosette~line, data=df)

#ANOVA + Dunnett's test (assuming the 2 previous hypotheses of homoscedasticity and normality of residuals)
fit = aov(rosette~line, data=df)
Expand Down
27 changes: 11 additions & 16 deletions figure_3f.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
#libraries to load


# Samples from cultures 44052, 44082, 44127

########################################################
library(ggplot2)#to make plots with colored factors
library(multcomp)#Do multicomparison
library(agricolae)
########################################################

# Samples from cultures 44052, 44082, 44127

#FT expression for Block C IR construct

Expand All @@ -17,13 +17,8 @@ df = read.table("figure_3f_data.txt",

df$ratio <- as.numeric(df$ratio)

bartlett.test(df, ratio~line)
qqnorm(df$ratio)
qqline(df$ratio)
with(df, shapiro.test(ratio))


# Plot
# Plot ----------------

# Reorder factors to display in proper order
df$line <- factor(df$line, levels = c("0T", "ft-10", "15-2-2","16-4","18-4","27-1","29-11"))
Expand All @@ -36,14 +31,14 @@ ggplot(df, aes(line, ratio)) +
theme_light() +
ylab("") +
xlab("")#+
theme(axis.text.x = element_blank(),legend.text=element_text(size=8), legend.title=element_text(size=8))


theme(axis.text.x = element_blank(),legend.text=element_text(size=8), legend.title=element_text(size=8))

###################################################
###################################################
# Statistics --------------

# Parametric test
bartlett.test(df, ratio~line)
qqnorm(df$ratio)
qqline(df$ratio)
with(df, shapiro.test(ratio))

#ANOVA
attach(df)
Expand Down
6 changes: 4 additions & 2 deletions figure_3g.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@

#libraries to load
########################################################
library(ggplot2)
library(agricolae)
library(multcomp)

########################################################

# Analysis 48031 ----------------------------------------------------------

Expand All @@ -12,6 +13,7 @@ df$total <- df$rosette + df$cauline


# Plot data ---------------------------------------------------------------

df$cross <- factor(df$cross, levels = c("0T", "ft-10", "Block_C_27","Block_C_15","Block_E_18","Block_E_16","Col-0xBlock_C_27","Col-0xBlock_C_15","Block_C_27xCol-0","Block_C_15xCol-0","Block_E_16xCol-0","Col-0xBlock_E_18","Block_E_18xCol-0","Block_E_16xBlock_E_27","Block_E_16xBlock_E_15","Block_E_18xBlock_E_27","Block_E_18xBlock_E_15","Block_C_27xBlock_E_18","Block_C_27xBlock_E_16","Block_C_15xBlock_E_18","Block_C_15xBlock_E_16"))

give.n <- function(x){
Expand Down
4 changes: 4 additions & 0 deletions figure_4b.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@

#libraries to load
########################################################
library(ggplot2)
########################################################

df <- read.table("figure_4b_data.txt", header=TRUE)

Expand Down
159 changes: 99 additions & 60 deletions smRNA-seq_analysis.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,46 @@ smRNA-seq analysis

# Required softwares

* bwai (v0.7.15)
* bwa (v0.7.15)
* bedtools (v2.25.0)
* R (v3.5.0)
* fastq-dump (v2.9.2)

# Data

Data available as fastq files in NCBI BioProject [PRJNA427142](https://www.ncbi.nlm.nih.gov/bioproject/PRJNA427142)

All libraries contain single end reads.

Download fastq files
```
fastq-dump --split-spot SRR6456282
fastq-dump --split-spot SRR6456283
fastq-dump --split-spot SRR6456284
fastq-dump --split-spot SRR6456285
fastq-dump --split-spot SRR8069222
fastq-dump --split-spot SRR8069223
fastq-dump --split-spot SRR8069224
```

Rename fastq files

```
mv SRR6456282.fastq 1794_C.fastq
mv SRR6456283.fastq 1794_D.fastq
mv SRR6456284.fastq 1794_A.fastq
mv SRR6456285.fastq 1794_B.fastq
mv SRR8069222.fastq 3634_C.fastq
mv SRR8069223.fastq 3634_B.fastq
mv SRR8069224.fastq 3634_A.fastq
```

Library codes:

* 1794_A: Block C IR #15-2
* 1794_A: Block C IR #15-3
* 1794_A: Block C IR #27-3
* 1794_A: Block C IR #27-4
* 1794_B: Block C IR #15-3
* 1794_C: Block C IR #27-3
* 1794_D: Block C IR #27-4
* 3634_A: Col-0
* 3634_B: Block E IR #16-5
* 3634_C: Block E IR #18-5
Expand All @@ -41,6 +67,7 @@ done

## Extract reads mapping at IR

Convert all bam files into bed files
```
for i in *bam; do
bedtools bamtobed -i $i > $i.bed
Expand All @@ -63,9 +90,18 @@ echo -e "chr1\t24334909\t24335722\ttarget_BlockE_plus_minus_100bp" target_BlockE

Use bedtools to retrieve all reads overlapping the coordinates of the IR target +/- 100 bp


For Block C IR lines
```
for i in 1794*bam.bed; do
bedtools intersect -wa -a $i -b target_BlockC_plus_minus_100bp.bed > ${i}.target_region_plus_minus_100bp.bed
done
```

For Block E IR lines and WT
```
for i in *bam.bed; do
bedtools intersect -wa -a $i -b target_BlockX_plus_minus_100bp.bed > ${i}.target_Block*_region_plus_minus_100bp.bed
for i in 3634*bam.bed; do
bedtools intersect -wa -a $i -b target_BlockE_plus_minus_100bp.bed > ${i}.target_region_plus_minus_100bp.bed
done
```

Expand Down Expand Up @@ -98,7 +134,7 @@ All libraries have about 10M reads, use the values of read count for library A a

Bed files ready to be analyzed in R (see script below)

## Generate density plot in R
## Generate density plot in R (figure 2a and extended figure 6b)

### For libraries project 1794 (Block C IR)

Expand Down Expand Up @@ -127,27 +163,7 @@ covA <- coverage(readsA)
covD <- coverage(readsD)
```

### For libraries project 3634 (Block E IR + WT)

Load bed files for the WT Col-0 (A) and the 2 libraries which map smRNAs at Block E (B, C)

```{r}
readsA = import.bed(con="3634_A.fastq.gz.bam.bed.target_region_BlockE_plus_minus_100bp.bed")
readsB = import.bed(con="3634_B.fastq.gz.bam.bed.target_region_BlockE_plus_minus_100bp.bed")
readsC = import.bed(con="3634_C.fastq.gz.bam.bed.target_region_BlockE_plus_minus_100bp.bed")
covA <- coverage(readsA)
covB <- coverage(readsB)
covC <- coverage(readsC)
```

Calculate coverage

```{r}
covA <- coverage(readsA)
covD <- coverage(readsD)
```

### Plot read density for Block C
#### Plot read density for Block C (figure 2a)

Plot read density at the target region. Get first and last position for each bed file looking at 5' coordinate of first read and 3' coordinate of last read (avoid error while plotting)
To normalize the amount of reads, divide coverage by the total number of reads of each library
Expand Down Expand Up @@ -189,7 +205,27 @@ abline(v=496, col="blue")
```


## Plot read density for Block E
### For libraries project 3634 (Block E IR + WT)

Load bed files for the WT Col-0 (A) and the 2 libraries which map smRNAs at Block E (B, C)

```{r}
readsA <- import.bed(con="3634_A.fastq.gz.bam.bed.target_region_BlockE_plus_minus_100bp.bed")
readsB <- import.bed(con="3634_B.fastq.gz.bam.bed.target_region_BlockE_plus_minus_100bp.bed")
readsC <- import.bed(con="3634_C.fastq.gz.bam.bed.target_region_BlockE_plus_minus_100bp.bed")
covA <- coverage(readsA)
covB <- coverage(readsB)
covC <- coverage(readsC)
```

Calculate coverage

```{r}
covA <- coverage(readsA)
covD <- coverage(readsD)
```

#### Plot read density for Block E (extended figure 6b)

Plot read density at the target region. Get first and last position for each bed file looking at 5' coordinate of first read and 3' coordinate of last read (avoid error while plotting)
To normalize the amount of reads, divide coverage by the total number of reads of each library
Expand Down Expand Up @@ -217,9 +253,7 @@ abline(v=614, col="blue")

# Generate plot frequency distribution of read size (Fig. 2b, Extended Fig. 6c)

## Get read size

### For Block C IR
## Get read size for Block C IR

Create a report with the number of read mapping at Block C. Each line in the bam file matches with one mapped read

Expand Down Expand Up @@ -248,31 +282,17 @@ Get rid of unremoved part of adapter sequences on the reads (unproper trimming o
*ready files can be analysed in R (see script below)*


### For Block E IR


```
for i in *bam; do
samtools index -b $i
samtools view $i "chr1:24335009-24335622" > ${i}_reads_BlockE.sam # to concatenate a variable, put the variable name between curly brackets
done
```

## Plot read size

### For Block C IR
### Plot read size distribution (figure 2b)

For normalized read length distribution, just give percent of smRNA-seq in each category as we look only into the reads into BlockC. To do, use prop.table function of contigency table generated with table()

```{r}
lengthA = scan("1794_A.fastq.bam_reads_target_region.ready")
lengthB = scan("1794_B.fastq.bam_reads_target_region.ready")
lengthC = scan("1794_C.fastq.bam_reads_target_region.ready")
lengthD = scan("1794_D.fastq.bam_reads_target_region.ready")
lengthA <- scan("1794_A.fastq.bam_reads_target_region.ready")
lengthB <- scan("1794_B.fastq.bam_reads_target_region.ready")
lengthC <- scan("1794_C.fastq.bam_reads_target_region.ready")
lengthD <- scan("1794_D.fastq.bam_reads_target_region.ready")
dataList = list("#15-2"=lengthA, "#15-3"=lengthB, "#27-3"=lengthC, "#27-4"=lengthD)
boxplot(dataList, main="Read distribution at target region", cex.main=1, ylab="read length")
dataList <- list("#15-2"=lengthA, "#15-3"=lengthB, "#27-3"=lengthC, "#27-4"=lengthD)
par(mfrow=c(1,2),oma = c(0, 0, 2, 0))
Expand All @@ -282,19 +302,38 @@ mtext("Distribution of read sizes at target region", outer = TRUE, cex = 1)
```

### For Block E IR

## Get read size for Block E IR

```{r}
Create a report with the number of read mapping at Block E. Each line in the bam file matches with one mapped read

lengthA = scan("3634_A.fastq.gz.bam_reads_BlockE.sam.ready")
lengthB = scan("3634_B.fastq.gz.bam_reads_BlockE.sam.ready")
lengthC = scan("3634_C.fastq.gz.bam_reads_BlockE.sam.ready")
```
for i in *bam; do
samtools index -b $i
samtools view $i "chr1:24335009-24335622" > ${i}_reads_BlockE.sam # to concatenate a variable, put the variable name between curly brackets
done
```

No problem of unremoved adapter sequences for the libraries of Block E IR and Col-0 but still do this step to get only the part of the reads that match the reference sequence.

```
for i in *sam; do
cut -f19 $i | awk -F":" '{print $3}' - | sed -e 's/\^//g' -e 's/^[0-9][A-Z][0-9][A-Z]//g' -e 's/^[0-9][A-Z]//g' -e 's/[A-Z][0-9][0-9][A-Z][0-9][0-9]$//g' -e 's/[A-Z][0-9][0-9][A-Z][0-9]$//g' -e 's/[A-Z][0-9][0-9]$//g' -e 's/[A-Z][0-9][A-Z][0-9]$//g' -e 's/[A-Z][0-9]$//g' - | grep -w '[0-9][0-9]' - > ${i}.ready
done
```

*ready files can be analysed in R (see script below)*

### Plot read size distribution (extended figure 6c)

For normalized read length distribution, just give percent of smRNA-seq in each category as we look only into the reads into BlockC. To do, use prop.table function of contigency table generated with table()

```{r}
dataList = list("Col-0"=lengthA, "#16-5"=lengthB, "#18-5"=lengthC)
#Display boxplot
lengthB <- scan("3634_B.fastq.gz.bam_reads_BlockE.sam.ready")
lengthC <- scan("3634_C.fastq.gz.bam_reads_BlockE.sam.ready")
boxplot(dataList, main="Read distribution at target region", cex.main=1, ylab="read length")
dataList <- list("#16-5"=lengthB, "#18-5"=lengthC)
par(mfrow=c(1,2),oma = c(0, 0, 2, 0))
Expand Down

0 comments on commit b789808

Please sign in to comment.