sample_viz.Rmd

---
title: 'Isoform Project: Zdhhc8'
output:
  prettydoc::html_pretty:
    theme: cayman
    toc: true
    number_sections: true
---

# Set Up

## Load Packages
```{r message=FALSE, warning=FALSE}
library(Gviz)
library(rtracklayer)
library(Rsamtools)
library(GenomicFeatures)
library(GenomicRanges)
library(rtracklayer)
library(Rsamtools)
library(GenomicAlignments)
library(VariantAnnotation)
library(tidyverse)
library(Rsubread)
```

## Randomly Sample from Cell Subclasses
The function below will take a directory of downloaded FASTQ files (e.g. scRNAseq samples from VISp region) and create a CSV containing a random sampling of 100 file names from a certain subclass (e.g. Pvalb)

* The CSV file created should be used to specify which FASTQ files to align in STAR script (which will create BAM files)

**Parameters:**

* **filedir:** the directory where FASTQs are found (assumed to contain only files for the specified region of interest)
* **outputcsv:** the directory and desired name for random sample, e.g. "../sample.csv" (*must end with .csv*)
* **metadatacsv:** path to CSV file containing metadata for each FASTQ (should have been downloaded with FASTQ files)
* **subclass:** as a character, subclass name (defined in metadatacsv)
* **region:** as a character, region name (defined in metadatacsv)

### Examine Metadata
```{r}
## Load the metadata table that accompanies the downloaded FASTQ files
metadatacsv <- "/external/rprshnas01/netdata_kcni/stlab/Public/AIBS_scRNAseq_2019/mouse/2020_10_24/metadata.csv"
metadata <- read_csv(metadatacsv)

## Inspect the available columns and the first few rows
colnames(metadata)
head(metadata)

# Subclasses per region that have at least 150 cells (enough to sample from);
# the HIP region is left out
metadata %>%
  filter(region_label != "HIP") %>%
  group_by(region_label, class_label, subclass_label) %>%
  tally() %>%
  filter(n >= 150)

# Every region present in the metadata
unique(metadata$region_label)
```

### Sample One (1) Subclass
```{r}
# Randomly sample downloaded cells from one subclass in one region and write
# their `exp_component_name` values to a CSV (one per line, no header).
#
# Args:
#   metadatacsv: path to the metadata CSV (must contain the columns
#                subclass_label, region_label and exp_component_name).
#   filedir:     directory holding the downloaded "*.fastq.tar" files.
#   subclass:    subclass_label value to keep.
#   region:      region_label value to keep.
#   outputcsv:   path of the CSV to create.
#   n:           number of cells to sample (default 100).
#
# Returns (invisibly) the sampled one-column table, as write_csv() does.
# Stops with an informative error when fewer than `n` matching cells exist.
makerandomsample <- function(metadatacsv, filedir, subclass, region, outputcsv, n = 100) {
  # Get metadata for all files
  metadata <- read_csv(metadatacsv)

  # list.files() and str_remove() both take regular expressions, not globs:
  # escape the dots and anchor at the end so only the real extension matches
  downloaded_files <- list.files(filedir, pattern = "\\.fastq\\.tar$") %>%
    str_remove("\\.fastq\\.tar$") # Removing the extension to match metadata

  # Cells of the requested subclass/region that were actually downloaded.
  # .env$ makes sure the function arguments are used even if the metadata
  # ever gains columns called `subclass` or `region`.
  candidates <- metadata %>%
    dplyr::filter(subclass_label == .env$subclass,
                  region_label == .env$region,
                  exp_component_name %in% downloaded_files) %>%
    dplyr::select(exp_component_name)

  if (nrow(candidates) < n) {
    stop("Only ", nrow(candidates), " downloaded cells match subclass '", subclass,
         "' in region '", region, "', but ", n, " were requested.",
         call. = FALSE)
  }

  set.seed(12345) # For reproducibility
  metadata_sample <- sample_n(candidates, n)

  # Save the random sample to a CSV
  write_csv(metadata_sample, outputcsv, col_names = FALSE)
}
```

### Sample Multiple Subclasses w/in Region

**Modified Parameter:**

* **outputpath:** directory where .csv files will be created

```{r eval = FALSE}
# Randomly sample from each sufficiently large subclass in one region.
#
# For every subclass of `region` with at least 150 cells in the metadata,
# calls makerandomsample() and writes "<outputpath>/<name>_sample.csv", where
# <name> is the subclass label lower-cased with "CTX-1", "CTX", "/" and
# spaces removed (e.g. "L2/3 IT CTX-1" -> "l23it").
#
# Args:
#   metadatacsv: path to the metadata CSV.
#   filedir:     directory holding the downloaded FASTQ archives.
#   region:      region_label value to process.
#   outputpath:  directory in which the CSV files are created.
#
# Returns (invisibly) the vector of CSV paths that were written.
makerandomsamples <- function(metadatacsv, filedir, region, outputpath) {
  # Read the metadata from the supplied path instead of relying on a global
  # `metadata` object happening to exist in the calling environment
  metadata <- read_csv(metadatacsv)

  ## Get subclass list
  subclasses <- metadata %>%
    dplyr::filter(region_label == .env$region) %>%
    dplyr::count(subclass_label) %>%
    dplyr::filter(n >= 150) %>%
    dplyr::pull(subclass_label)

  ## Format subclasses for file names
  subclasses_formatted <- subclasses %>%
    str_replace_all("CTX-1", "") %>%
    str_replace_all("CTX", "") %>%
    str_replace_all("/", "") %>%
    str_replace_all(" ", "") %>%
    str_to_lower()

  outputcsvs <- paste0(outputpath, "/", subclasses_formatted, "_sample.csv")

  for (i in seq_along(subclasses)) {
    makerandomsample(metadatacsv = metadatacsv,
                     filedir = filedir,
                     subclass = subclasses[i],
                     region = region,
                     outputcsv = outputcsvs[i])
  }

  invisible(outputcsvs)
}
```

### Example Usage
```{r}
random100dir <- "/external/rprshnas01/netdata_kcni/stlab/Intralab_collab_scc_projects/Isoform_project/Random100"
fastqfiles <- "/external/rprshnas01/netdata_kcni/stlab/Public/AIBS_scRNAseq_2019/mouse/Raw"

# Create sample for L2/3 in VISp
makerandomsample(metadatacsv = metadatacsv,
                 filedir = paste0(fastqfiles, "/VISp"),
                 subclass = "L2/3 IT CTX-1", region = "VISp",
                 outputcsv = paste0(random100dir, "/VISp/samples/l23_sample.csv"))


# Create samples for all subclasses in VISp.
# makerandomsamples() works out the subclass list itself and has no
# `subclasses` parameter, so none is passed here (passing one is an
# "unused argument" error).
# NOTE(review): the chunk that defines makerandomsamples() is marked
# eval = FALSE, so it has to be run by hand before this call will work.
makerandomsamples(metadatacsv = metadatacsv,
                  filedir = paste0(fastqfiles, "/VISp"),
                  region = "VISp",
                  outputpath = paste0(random100dir, "/VISp/samples"))
```

## Align Sampled Cells
**TODO: UPDATE SO THAT ONLY FILES WHICH ARE SUCCESSFULLY UNTARRED GO INTO SCRIPTS**

### Create Alignment Scripts (Example)
```{bash}
# Run STAR_align.sh once per ACA subclass, giving it the raw FASTQ directory,
# the subclass output directory, the STAR genome directory, the annotation GFF
# and the subclass sample CSV (in that order).
project=/external/rprshnas01/netdata_kcni/stlab/Intralab_collab_scc_projects/Isoform_project/Random100
raw=/external/rprshnas01/netdata_kcni/stlab/Public/AIBS_scRNAseq_2019/mouse/Raw
refseq=/external/rprshnas01/netdata_kcni/stlab/Genomic_references/Refseq/Mouse/Refseq_GRCm39

subclasses=('l5it' 'l5np' 'l6b' 'l6ct' 'l6it' 'l23' 'l45' 'vip')

# Stop immediately if the region directory is unreachable; otherwise the
# output directories would be created wherever the shell happens to be
cd "$project/ACA" || exit 1

for subclass in "${subclasses[@]}"; do
  echo "Subclass: $subclass"
  # -p: do not fail when the directory already exists (e.g. on a re-run)
  mkdir -p "${subclass}_outputs"
  "$project/STAR_align.sh" \
    "$raw/ACA" \
    "$project/ACA/${subclass}_outputs/" \
    "$refseq/USE_THIS_genomeDir_gff" \
    "$refseq/Raw/GCF_000001635.27_GRCm39_genomic.gff" \
    "$project/ACA/samples/${subclass}_sample.csv"
done
```

### Run Alignment Scripts (Example)
```{bash}
# Submit the STARParaCom.txt job list of each VISp subclass with qbatch.
visp=/external/rprshnas01/netdata_kcni/stlab/Intralab_collab_scc_projects/Isoform_project/Random100/VISp

subclasses=('l5it' 'l5np' 'l6ct' 'l6b' 'astro')
for subclass in "${subclasses[@]}"; do
  echo "Subclass: $subclass"
  # Run in a subshell and only submit when the cd succeeded: with an unchecked
  # cd, a missing directory would re-submit the job list of whichever
  # directory the shell was still in (the previous subclass)
  (
    cd "$visp/${subclass}_outputs" || exit 1
    qbatch -w 08:00:00 --ppj 12 -c 10 --mem 60G STARParaCom.txt
  ) || echo "Submission failed for subclass: $subclass" >&2
done
```

## Create Subclass-Level BAMs
The function below will create and index a merged BAM for a specific subclass

**Parameters:**

* **inputdir:** the directory where BAMs are found (i.e., the coord_bams directory produced from STAR script)
* **outputname:** the directory and desired name for new merged BAM, e.g. "../merged.bam" (*must end with .bam*)
* **chrID:** as a character, representing chromosome for selected reference, e.g. "NC_000082.7" for RefSeq OR "16" for Ensembl
* **coords:** as a vector, where the 1st value is the start coordinate and the 2nd is the end, e.g. c(0, 100)—note that unit is in bases, so 18,050kb becomes 18050000
* **index:** TRUE by default, change to FALSE if each BAM to be merged already has associated .bai (index file)

### For One (1) Subclass in One (1) Region
```{r}
# Merge a random subset of per-cell BAMs into one indexed, region-restricted
# BAM for a subclass.
#
# Args:
#   inputdir:   directory containing the per-cell BAM files.
#   outputname: path of the merged BAM to create (overwritten if present).
#   chrID:      sequence name of the region to keep, e.g. "NC_000082.7".
#   coords:     c(start, end) of the region to keep, in bases.
#   index:      if TRUE, index every sampled BAM before merging.
#   nbams:      number of BAM files to sample (default 98, the value that
#               was previously hard-coded).
#
# Returns the value of mergeBam(). Stops with an informative error when
# `inputdir` holds fewer than `nbams` BAM files.
makesubclassBAM <- function(inputdir, outputname, chrID, coords, index=TRUE, nbams=98) {

  # Keep only files ending in ".bam": this skips existing ".bai" indexes and
  # any other stray file that indexBam()/mergeBam() could not read
  bamfiles <- list.files(inputdir, pattern = "\\.bam$")
  bampaths <- paste0(inputdir, "/", bamfiles)

  if (length(bampaths) < nbams) {
    stop("Found ", length(bampaths), " BAM files in '", inputdir,
         "', but ", nbams, " are needed.", call. = FALSE)
  }

  set.seed(12345) # For reproducibility
  bampaths <- sample(bampaths, size = nbams)

  # If index=TRUE, index each BAM file
  if (index) {
    for (i in seq_along(bampaths)) {
      indexBam(bampaths[i])
      message("Indexed ", i, " of ", length(bampaths))
    }
  }

  # Merge all BAMs into one which only contains gene of interest
  mergeBam(files = bampaths, destination = outputname,
           overwrite = TRUE,
           region = GRanges(chrID, IRanges(coords[1], coords[2])),
           indexDestination = TRUE)
}
```

### Example Usage
```{r}
# Project locations
random100dir <- "/external/rprshnas01/netdata_kcni/stlab/Intralab_collab_scc_projects/Isoform_project/Random100"
fastqfiles <- "/external/rprshnas01/netdata_kcni/stlab/Public/AIBS_scRNAseq_2019/mouse/Raw"

# Build the merged BAM for one subclass (ACA, L4/5)
makesubclassBAM(
  inputdir = paste0(random100dir, "/ACA/l45_outputs/STAR_results/coord_bams"),
  outputname = paste0(random100dir, "/ACA/merged_bams/RefSeq_random100_merged_l45.bam"),
  chrID = "NC_000082.7",
  coords = c(18038612, 18056471),
  index = TRUE
)

# Subclasses for which a merged BAM is expected in each region
vispsubclasses <- c("l23", "pvalb", "lamp5", "sst", "vip", "l45", "l5pt", "l6it",
                    "l5it", "l5np", "l6ct", "l6b", "astro")
acasubclasses <- c("l23", "vip", "l5it", "l5np", "l6b", "l6ct", "l6it", "l45")

## For VISp: one variable per subclass (e.g. `pvalb`) holding its BAM path
visp_bam_paths <- paste0(random100dir, "/VISp/merged_bams/RefSeq_random100_merged_", vispsubclasses, ".bam")
for (k in seq_along(vispsubclasses)) {
  assign(vispsubclasses[k], visp_bam_paths[k])
}

## For ACA: same idea, with an "ACA_" prefix on the variable name (e.g. `ACA_vip`)
aca_bam_paths <- paste0(random100dir, "/ACA/merged_bams/RefSeq_random100_merged_", acasubclasses, ".bam")
for (k in seq_along(acasubclasses)) {
  assign(paste0("ACA_", acasubclasses[k]), aca_bam_paths[k])
}
```

## Create Class-Level BAMs
```{r}
# Region kept in every class-level merge
merged_region <- GRanges("NC_000082.7", IRanges(18038612, 18056471))

# Each path variable is defined first and then used as the merge destination,
# so the variable can never point somewhere other than where the file was
# actually written.

# Excitatory BAMs (VISp, ACA)
exc <- "/external/rprshnas01/netdata_kcni/stlab/Intralab_collab_scc_projects/Isoform_project/Random100/VISp/merged_bams/RefSeq_random100_merged_exc.bam"

mergeBam(files = c(l23, l45, l5pt, l6it, l5it, l5np, l6ct, l6b),
         destination = exc, overwrite = TRUE,
         region = merged_region, indexDestination = TRUE)

ACA_exc <- "/external/rprshnas01/netdata_kcni/stlab/Intralab_collab_scc_projects/Isoform_project/Random100/ACA/merged_bams/RefSeq_random100_merged_exc.bam"

mergeBam(files = c(ACA_l23, ACA_l5it, ACA_l5np, ACA_l6b, ACA_l6ct, ACA_l6it, ACA_l45),
         destination = ACA_exc, overwrite = TRUE,
         region = merged_region, indexDestination = TRUE)

# Inhibitory BAM (VISp)
# The merge used to be written to ".../Isoform_project/RefSeq_random100_merged_inh.bam"
# while `inh` pointed into Random100/VISp/merged_bams; both now use the latter.
inh <- "/external/rprshnas01/netdata_kcni/stlab/Intralab_collab_scc_projects/Isoform_project/Random100/VISp/merged_bams/RefSeq_random100_merged_inh.bam"

mergeBam(files = c(pvalb, lamp5, sst, vip),
         destination = inh, overwrite = TRUE,
         region = merged_region, indexDestination = TRUE)
```

## Create Reference Genome Variables
```{r}
# Annotation files for each supported reference (paths only; nothing is read here)

# RefSeq, GRCm38.p3
refseq_ref_GRCm38 <- "/external/rprshnas01/netdata_kcni/stlab/Genomic_references/AIBS/Mouse/Refseq_GRCm38.p3/Raw/GCF_000001635.23_GRCm38.p3_genomic.gff"

# RefSeq, GRCm39
refseq_ref_GRCm39 <- "/external/rprshnas01/netdata_kcni/stlab/Genomic_references/Refseq/Mouse/Refseq_GRCm39/Raw/GCF_000001635.27_GRCm39_genomic.gff"

# Ensembl release 104, GRCm39
ensembl_ref_GRCm39 <- "/external/rprshnas01/netdata_kcni/stlab/Genomic_references/Ensembl/Mouse/Release_104/Raw/Mus_musculus.GRCm39.104.gtf"
```

### Create TxDb Object
* These take a while, so you can just run the one you need

```{r results=FALSE, message=FALSE, warning=FALSE}
# Build the TxDb for the RefSeq GRCm39 annotation (the one used below)
refseqTxDb_GRCm39 <- makeTxDbFromGFF(file = refseq_ref_GRCm39)

# Alternatives, left disabled because each build is slow:
#refseqTxDb_GRCm38 <- makeTxDbFromGFF(refseq_ref_GRCm38)

#ensemblTxDb <- makeTxDbFromGFF(ensembl_ref_GRCm39)
```

## Set Up Figure
**TODO: adjust so that y-axis shows normalized read depth**

**TODO: create additional figure where y-axis=normalized counts (CPM), x-axis=cell types**

The function below will create a coverage + sashimi plot for two to four BAMs aligned to gene isoform models for the chosen reference, additionally showing coordinate positions on the x-axis

**Parameters:**

* **txdb:** a TxDb object, can be *refseqTxDb_GRCm39*, *refseqTxDb_GRCm38*, OR *ensemblTxDb*
* **bam1/bam2/...:** as a path to the BAM file (should be defined above, e.g. pvalb)
* **chrID:** as a character, representing chromosome for selected reference, e.g. "NC_000082.6" for RefSeq OR "16" for Ensembl
* **coords:** as a vector, where the 1st value is the start coordinate and the 2nd is the end, e.g. c(0, 100)—note that unit is in bases, so 18,050kb becomes 18050000
* **bam1title, etc.:** optional, a character specifying y-axis title for each coverage/junction plot

```{r}
# Plot coverage + sashimi tracks for two to four BAMs above the gene models.
#
# Args:
#   txdb:        TxDb object providing transcripts and exons.
#   bam1..bam4:  paths to BAM files; bam3 and bam4 are optional (NULL = omit).
#   chrID:       sequence name to plot, e.g. "NC_000082.7".
#   coords:      c(start, end) of the plotted window, in bases.
#   bam1title..bam4title: track titles; default to the expression passed as
#                the corresponding bam argument.
#   junction_end, junction_starts: end coordinate and start coordinates of
#                the junctions that must stay visible in the sashimi plot
#                (defaults are the values previously hard-coded here).
#   min_sashimi_score: lower bound for the sashimi score threshold.
#
# Only transcripts lying entirely inside `coords` whose name starts with "NM"
# are drawn as gene models.
#
# Returns the value of Gviz::plotTracks().
makefigure <- function(txdb, bam1, bam2, bam3=NULL, bam4=NULL, chrID, coords, 
                       bam1title=deparse(substitute(bam1)), bam2title=deparse(substitute(bam2)), 
                       bam3title=deparse(substitute(bam3)), bam4title=deparse(substitute(bam4)),
                       junction_end=18042081, junction_starts=c(18039569, 18041242),
                       min_sashimi_score=5) {

  # Sequence names such as "NC_000082.7" are not UCSC-style, so the Gviz name
  # check is switched off for the duration of this call and restored on exit
  old_opts <- options(ucscChromosomeNames = FALSE)
  on.exit(options(old_opts), add = TRUE)

  # Pair every BAM with its title BEFORE dropping the omitted ones. c() drops
  # NULLs, so combining the BAMs with c() while keeping all four titles gives
  # the wrong title to bam4 whenever bam3 is NULL.
  bamtitles <- vapply(list(bam1title, bam2title, bam3title, bam4title),
                      function(title) paste(title, collapse = " "),
                      character(1))
  bam_list <- list(bam1, bam2, bam3, bam4)
  supplied <- which(!vapply(bam_list, is.null, logical(1)))

  # Extract list of transcripts
  gene_tx <- data.frame(transcripts(txdb)) %>%
    dplyr::filter(seqnames == chrID) %>%
    dplyr::filter((start >= coords[1]) & (end <= coords[2])) %>%
    dplyr::filter(str_starts(tx_name, "NM")) %>% # just confirmed transcripts
    dplyr::pull(tx_name)

  # Create genome axis track
  genomeAxis <- GenomeAxisTrack(name = "MyAxis", col = "lightsteelblue4",
                                fontcolor = "lightsteelblue4", add35 = TRUE)

  # Prepare one coverage + sashimi track per supplied BAM
  bamplots <- lapply(supplied, function(k) {
    bam <- bam_list[[k]]

    # Junction scores observed in this BAM
    galign <- readGAlignmentPairs(file = bam, index = bam, strandMode = 1)
    junctions <- summarizeJunctions(galign)

    # Junctions of interest: shared end, one of the requested starts
    wanted <- end(junctions) == junction_end & start(junctions) %in% junction_starts

    # Threshold = weakest junction of interest, so that all of them are drawn;
    # fall back to the floor when any is missing or the minimum is below it
    if (all(junction_starts %in% start(junctions)[wanted])) {
      junction_min <- min(junctions$score[wanted])
    } else {
      junction_min <- NA
    }
    if (is.na(junction_min) || junction_min < min_sashimi_score) {
      junction_min <- min_sashimi_score
    }

    # Create plot for coverage and sashimi
    AlignmentsTrack(bam, name = bamtitles[k],
                    cex = 2, background.title = "white",
                    sashimiScore = junction_min,
                    col.axis = "lightsteelblue4", col.title = "lightsteelblue4")
  })

  # Create gene models
  gr <- unlist(exonsBy(txdb, by = "tx", use.names = TRUE)[gene_tx])
  elementMetadata(gr)$transcript <- names(gr)
  gene_models <- Gviz::GeneRegionTrack(gr, showId = TRUE,
                                       transcriptAnnotation = "transcript", name = "Gene Model",
                                       background.title = "white", col.axis = "lightsteelblue4",
                                       col.title = "lightsteelblue4", fill = "darkgrey",
                                       fontcolor.group = "lightsteelblue4", col.line = "lightsteelblue4")

  # Axis on top, then the BAM tracks, then the gene models;
  # relative heights are 1 : 3 (per BAM) : 2
  tracks <- c(list(genomeAxis), bamplots, list(gene_models))
  tracksizes <- c(1, rep(3, length(bamplots)), 2)

  # Put the figure together
  fig <- plotTracks(trackList = tracks,
                    showId = TRUE,
                    transcriptAnnotation = "transcript",
                    chromosome = chrID,
                    sizes = tracksizes,
                    from = coords[1],
                    to = coords[2],
                    extend.left = 3500,
                    fill = "lightsteelblue", col.sashimi = "lightsteelblue4",
                    type = c('coverage', 'sashimi'))

  fig
}
```

# Visualization: gviz

## Create Figure
* May optionally save to .tiff

### ACA vs. VISp (Excitatory)
```{r}
# Excitatory class: ACA (top track) against VISp (bottom track)
makefigure(
  txdb = refseqTxDb_GRCm39,
  bam1 = ACA_exc,
  bam2 = exc,
  chrID = "NC_000082.7",
  coords = c(18038612, 18056471),
  bam1title = "ACA",
  bam2title = "VISp"
)
```

### ACA: Excitatory Subclasses
```{r}
# ACA intratelencephalic (IT) subclasses
makefigure(
  txdb = refseqTxDb_GRCm39,
  bam1 = ACA_l23, bam2 = ACA_l5it, bam3 = ACA_l45, bam4 = ACA_l6it,
  chrID = "NC_000082.7",
  coords = c(18038612, 18056471),
  bam1title = "L2/3 IT", bam2title = "L5 IT",
  bam3title = "L4/5 IT", bam4title = "L6 IT"
)
# Remaining ACA excitatory subclasses
makefigure(
  txdb = refseqTxDb_GRCm39,
  bam1 = ACA_l6b, bam2 = ACA_l5np, bam3 = ACA_l6ct,
  chrID = "NC_000082.7",
  coords = c(18038612, 18056471),
  bam1title = "L6b", bam2title = "L5 NP",
  bam3title = "L6 CT"
)
```
### ACA vs. VISp (Inhibitory)
```{r}
# ACA Vip cells against the merged VISp inhibitory class
makefigure(
  txdb = refseqTxDb_GRCm39,
  bam1 = ACA_vip,
  bam2 = inh,
  chrID = "NC_000082.7",
  coords = c(18038612, 18056471),
  bam1title = "Vip",
  bam2title = "VISp: Inhibitory"
)
```

### VISp: Excitatory Subclasses
```{r message=FALSE, warning=FALSE, results=FALSE}
# VISp excitatory subclasses, first panel
makefigure(
  txdb = refseqTxDb_GRCm39,
  bam1 = l6b, bam2 = l23, bam3 = l45, bam4 = l6it,
  chrID = "NC_000082.7",
  coords = c(18038612, 18056471),
  bam1title = "L6b", bam2title = "L2/3 IT",
  bam3title = "L4/5 IT", bam4title = "L6 IT"
)

# VISp excitatory subclasses, second panel
makefigure(
  txdb = refseqTxDb_GRCm39,
  bam1 = l5pt, bam2 = l5it, bam3 = l6ct, bam4 = l5np,
  chrID = "NC_000082.7",
  coords = c(18038612, 18056471),
  bam1title = "L5 PT", bam2title = "L5 IT",
  bam3title = "L6 CT", bam4title = "L5 NP"
)
```

### VISp: Inhibitory Subclasses
```{r message=FALSE, warning=FALSE, results=FALSE}
# VISp inhibitory subclasses
makefigure(
  txdb = refseqTxDb_GRCm39,
  bam1 = pvalb, bam2 = sst, bam3 = lamp5, bam4 = vip,
  chrID = "NC_000082.7",
  coords = c(18038612, 18056471),
  bam1title = "Pvalb", bam2title = "Sst", bam3title = "Lamp5", bam4title = "Vip"
)
```

### VISp: Excitatory vs. Inhibitory vs. Non-Neuronal Classes
```{r message=FALSE, warning=FALSE, results=FALSE}
# Uncomment the tiff()/dev.off() pair to write the figure to a .tiff file
#tiff(filename="pvalbvsl23.tiff", units="in", width=8, height=6, res=500)
makefigure(
  txdb = refseqTxDb_GRCm39,
  bam1 = exc, bam2 = inh, bam3 = astro,
  chrID = "NC_000082.7",
  coords = c(18038612, 18056471),
  bam1title = "Excitatory", bam2title = "Inhibitory", bam3title = "Astrocytes"
)
#dev.off()
```

# Quantification

## See exon boundary coordinates for every transcript
* Parameters same as defined above

```{r}
# List the exon boundaries of every transcript that lies entirely inside
# `coords` on sequence `chrID`.
#
# Returns a data frame with the columns transcript, exon_num, start and end;
# transcript and exon_num are parsed out of exon_name (the text between the
# first and last "-", and the text after the last "-", respectively).
getexoncoords <- function(txdb, chrID, coords) {
  # Transcripts fully contained in the window
  tx_in_window <- data.frame(transcripts(txdb)) %>%
    filter(seqnames == chrID) %>%
    filter((start >= coords[1]) & (end <= coords[2]))
  gene_tx <- tx_in_window$tx_name

  # Exons of those transcripts, flattened into one table
  exons_by_tx <- exonsBy(txdb, by = "tx", use.names = TRUE)
  exon_table <- data.frame(unlist(exons_by_tx[gene_tx]))

  exon_table %>%
    mutate(
      transcript = str_match(exon_name, "-(.*)-")[, 2],
      exon_num = str_match(exon_name, ".*-(.*)")[, 2]
    ) %>%
    dplyr::select(transcript, exon_num, start, end)
}
```

### Example Usage
```{r}
# Exon table for the gene window used throughout this document
exoncoords <- getexoncoords(
  txdb = refseqTxDb_GRCm39,
  chrID = "NC_000082.7",
  coords = c(18038612, 18056471)
)
exoncoords
```

## Define transcript-specific ranges
Select exon coordinates unique to one transcript (the exon chosen in tx2 should be a subset of the exon chosen in tx1)
```{r}
# Return the part of exon `exon1` of transcript `tx1` that is not covered by
# exon `exon2` of transcript `tx2`, as a GRanges on sequence `chrID`.
# `exoncoords` is a table with the columns transcript, exon_num, start and
# end (as produced by getexoncoords()).
selectrange <- function(chrID, exoncoords, tx1, tx2, exon1, exon2) {
  # Look up one exon and turn it into a GRanges
  exon_as_granges <- function(wanted_tx, wanted_exon) {
    hit <- exoncoords %>% filter(transcript == wanted_tx & exon_num == wanted_exon)
    GRanges(chrID, IRanges(hit$start, hit$end))
  }

  GenomicRanges::setdiff(
    exon_as_granges(tx1, exon1),
    exon_as_granges(tx2, exon2)
  )
}
```

### Example Usage
```{r}
# Portion of exon 11 of NM_172151.4 not shared with exon 11 of NM_001379019.1
rng <- selectrange(
  chrID = "NC_000082.7",
  exoncoords = exoncoords,
  tx1 = "NM_172151.4",
  tx2 = "NM_001379019.1",
  exon1 = "11",
  exon2 = "11"
)
```

## summarizeOverlaps: calculate normalized reads

### Example 1: As a percentage of reads for whole gene
```{r}
# Pvalb cells
bf_pvalb <- BamFile(pvalb, asMates=TRUE)

overlaps_pvalb <- summarizeOverlaps(rng, bf_pvalb, singleEnd=FALSE, fragments=TRUE)
counts_pvalb <- data.frame(union=assays(overlaps_pvalb)$counts)
colSums(counts_pvalb) # raw count in the transcript-specific region
records_pvalb <- countBam(bf_pvalb)$records # all records in the merged BAM
p_pvalb <- 100 * rowSums(counts_pvalb) / records_pvalb # region count as % of all records

# L2/3
bf_l23 <- BamFile(l23, asMates=TRUE)

overlaps_l23 <- summarizeOverlaps(rng, bf_l23, singleEnd=FALSE, fragments=TRUE)
counts_l23 <- data.frame(union=assays(overlaps_l23)$counts)
colSums(counts_l23)
records_l23 <- countBam(bf_l23)$records
p_l23 <- 100 * rowSums(counts_l23) / records_l23

# Show both percentages
colSums(data.frame(p_pvalb))
colSums(data.frame(p_l23))
```

### Example 2: Region ratios (short vs. long-minus-short)
```{r}
# Count what overlaps `range` in `bf`, in the same one-column data frame
# layout used by the other examples
count_fragments <- function(range, bf) {
  data.frame(union=assays(summarizeOverlaps(range, bf, singleEnd=FALSE, fragments=TRUE))$counts)
}

# Define transcript-specific ranges
long <- rng # defined using selectrange()
short <- GRanges("NC_000082.7", IRanges(18038612, 18039568)) # determined using getexoncoords()

# Counts per range, Pvalb
long_pvalb <- count_fragments(long, bf_pvalb)
short_pvalb <- count_fragments(short, bf_pvalb)

# Counts per range, L2/3
long_l23 <- count_fragments(long, bf_l23)
short_l23 <- count_fragments(short, bf_l23)

# Ratio of the long-specific count to the short count
ratio_pvalb <- rowSums(long_pvalb) / rowSums(short_pvalb)
ratio_l23 <- rowSums(long_l23) / rowSums(short_l23)

# See ratios in a table
readratios <- data.frame(Pvalb=ratio_pvalb, "L2/3 IT"=ratio_l23, row.names="Read Ratio")
```

### Example 3: As a percentage of reads in long final exon region
```{r}
# Count what overlaps `range` in `bf`, in the same one-column data frame
# layout used by the other examples
count_fragments <- function(range, bf) {
  data.frame(union=assays(summarizeOverlaps(range, bf, singleEnd=FALSE, fragments=TRUE))$counts)
}

# Final exon region (full) and the portion of that exon specific to one
# transcript (full_minus_short)
# NOTE(review): full_minus_short starts at 18039568, which is also the last
# base of `short` in Example 2 -- confirm whether 18039569 was intended.
full <- GRanges("NC_000082.7", IRanges(18038617, 18041241))
full_minus_short <- GRanges("NC_000082.7", IRanges(18039568, 18041241))

# Counts per range, Pvalb
full_pvalb <- count_fragments(full, bf_pvalb)
full_minus_short_pvalb <- count_fragments(full_minus_short, bf_pvalb)

# Counts per range, L2/3
full_l23 <- count_fragments(full, bf_l23)
full_minus_short_l23 <- count_fragments(full_minus_short, bf_l23)

# Transcript-specific count as a percentage of the whole final exon
p_pvalb <- 100 * rowSums(full_minus_short_pvalb) / rowSums(full_pvalb)
p_l23 <- 100 * rowSums(full_minus_short_l23) / rowSums(full_l23)

# See percentages in a table
readpercentages <- data.frame(Pvalb=p_pvalb, "L2/3 IT"=p_l23, row.names="Read %")
```

### Function: getreads()
This function will calculate read percentages as in *Example 3* above for a list of BAM files, returning the results in a dataframe. Percentages reflect the proportion of reads in the final exon region that correspond to each transcript.

```{r}
# For each BAM, report which share of the counts in the final exon region
# falls in the OV-specific part (OV) and which share does not (EV = 100 - OV).
#
# Args:
#   bams:   character vector of BAM paths.
#   region: no longer needed; kept so existing calls such as
#           getreads(visp, "VISp") keep working. The subclass name is now
#           derived from the file name alone.
#   chrID:  sequence name of the ranges below.
#   full_coords:        c(start, end) of the whole final exon region.
#   ov_specific_coords: c(start, end) of the OV-specific part of it.
#     (Defaults are the values previously hard-coded in this function.)
#     NOTE(review): the default OV-specific range starts at 18039568, the same
#     base at which the (formerly computed, never used) shared range ended --
#     confirm whether 18039569 was intended.
#
# Returns a data frame with one row per BAM and the columns subclass, EV, OV.
# OV is NaN for a BAM with no counts in the full region.
getreads <- function(bams, region=NULL, chrID="NC_000082.7",
                     full_coords=c(18038617, 18041241),
                     ov_specific_coords=c(18039568, 18041241)) {

  if (length(bams) == 0) {
    return(data.frame(subclass=character(0), EV=numeric(0), OV=numeric(0)))
  }

  # Set read ranges
  full_range <- GRanges(chrID, IRanges(full_coords[1], full_coords[2]))
  ov_specific_range <- GRanges(chrID, IRanges(ov_specific_coords[1], ov_specific_coords[2]))

  # Total count overlapping `range` in one BAM
  count_in <- function(range, bf) {
    sum(assays(summarizeOverlaps(range, bf, singleEnd=FALSE, fragments=TRUE))$counts)
  }

  # One row per BAM, bound together once at the end
  rows <- lapply(bams, function(bam) {
    # Create BamFile object
    bf <- BamFile(bam, asMates=TRUE)

    # Compute reads as percentage of whole final exon
    read_percent_ov <- 100 * count_in(ov_specific_range, bf) / count_in(full_range, bf)
    read_percent_ev <- 100 - read_percent_ov

    # "<dir>/RefSeq_random100_merged_<subclass>.bam" -> "<subclass>"
    rowname <- basename(bam) %>%
      str_remove("^RefSeq_random100_merged_") %>%
      str_remove("\\.bam$")

    data.frame(subclass=rowname, EV=read_percent_ev, OV=read_percent_ov)
  })

  do.call(rbind, rows)
}
```

## summarizeJunctions

### Function: getjunctions()
Like getreads(), takes in a list of BAM files and computes the percentage of all transcript-specific reads that correspond to each transcript

**TODO: fix so that easily works for other genes**
```{r}
# For each BAM, report the share of junction-spanning reads supporting the OV
# junction (OV) and the EV junction (EV = 100 - OV).
#
# Args:
#   bams:   character vector of BAM paths.
#   region: no longer needed; kept so existing calls that pass it keep
#           working. The subclass name is now derived from the file name.
#   junction_end: end coordinate shared by both junctions.
#   ev_start, ov_start: start coordinates of the EV and OV junctions.
#     (Defaults are the values previously hard-coded in this function.)
#
# Scores are looked up by junction start coordinate. Reading them by row
# position attributes the score to EV whenever only one of the two junctions
# is present in a BAM, even if that junction is the OV one.
#
# Returns a data frame with one row per BAM and the columns subclass, EV, OV.
# NOTE(review): as before, a BAM with no reads on either junction is reported
# as EV = 100, OV = 0 -- confirm that this is the intended convention.
getjunctions <- function(bams, region=NULL, junction_end=18042081,
                         ev_start=18039569, ov_start=18041242) {

  if (length(bams) == 0) {
    return(data.frame(subclass=character(0), EV=numeric(0), OV=numeric(0)))
  }

  # Score of the junction starting at `junction_start`; 0 when it is absent
  # (summed in case the same coordinates are reported more than once)
  junction_score <- function(junctions, junction_start) {
    hit <- start(junctions) == junction_start & end(junctions) == junction_end
    sum(junctions$score[hit])
  }

  # One row per BAM, bound together once at the end
  rows <- lapply(bams, function(bam) {
    # Create GAlignments object
    galign <- readGAlignmentPairs(file=bam, index=bam, strandMode=1)

    # Get junction info
    junctions <- summarizeJunctions(galign)

    ev <- junction_score(junctions, ev_start)
    ov <- junction_score(junctions, ov_start)

    if (ev + ov == 0) {
      junction_percent_ov <- 0
    } else {
      junction_percent_ov <- 100 * ov / (ev + ov)
    }
    junction_percent_ev <- 100 - junction_percent_ov

    # "<dir>/RefSeq_random100_merged_<subclass>.bam" -> "<subclass>"
    rowname <- basename(bam) %>%
      str_remove("^RefSeq_random100_merged_") %>%
      str_remove("\\.bam$")

    data.frame(subclass=rowname, EV=junction_percent_ev, OV=junction_percent_ov)
  })

  do.call(rbind, rows)
}
```

### Example Usage
```{r}
# VISp
visp <- c(astro, pvalb, lamp5, sst, vip, l5np, l5it, l6it, l6ct, l23, l45, l5pt, l6b)

# `region` is passed on every call: both functions use it to strip the path
# prefix from the BAM names
junctions <- getjunctions(visp, "VISp")
junctions

reads <- getreads(visp, "VISp")
reads

# write_csv(reads, "/external/rprshnas01/netdata_kcni/stlab/Intralab_collab_scc_projects/Isoform_project/Hodge_aligned/V1/mouse_comparison/mouse_reads.csv")

# ACA
aca <- c(ACA_vip, ACA_l23, ACA_l5it, ACA_l5np, ACA_l6b, ACA_l6ct, ACA_l6it, ACA_l45)

aca_junctions <- getjunctions(aca, "ACA")
# Manual correction for the first BAM (ACA_vip): EV = 0, OV = 100
aca_junctions[1,2] <- 0 # fixing an issue that occurs when no reads are mapped to a specific junction
aca_junctions[1,3] <- 100 # same as above

aca_reads <- getreads(aca, "ACA")
aca_reads
```


#### Function: totalreads()
This calculates the total number of reads per subclass which are mapped to the gene of interest

**TODO: fix so that easily works for other genes**
```{r}
# Count, for each BAM, what overlaps the gene of interest.
#
# Args:
#   bams:   character vector of BAM paths.
#   chrID:  sequence name of the gene range.
#   coords: c(start, end) of the gene range, in bases.
#     (Defaults are the values previously hard-coded in this function, so
#     totalreads(bams) behaves as before; pass other values for other genes.)
#
# Returns a data frame with one row per BAM and the columns subclass and
# Read_Count.
totalreads <- function(bams, chrID="NC_000082.7", coords=c(18038612, 18056471)) {

  if (length(bams) == 0) {
    return(data.frame(subclass=character(0), Read_Count=numeric(0)))
  }

  # Set read range
  gene_range <- GRanges(chrID, IRanges(coords[1], coords[2]))

  # One row per BAM, bound together once at the end
  rows <- lapply(bams, function(bam) {
    # Create BamFile object
    bf <- BamFile(bam, asMates=TRUE)

    # Get reads per range
    overlaps <- summarizeOverlaps(gene_range, bf, singleEnd=FALSE, fragments=TRUE)
    full <- sum(assays(overlaps)$counts)

    # "<dir>/RefSeq_random100_merged_<subclass>.bam" -> "<subclass>",
    # whichever region directory the file lives in
    rowname <- basename(bam) %>%
      str_remove("^RefSeq_random100_merged_") %>%
      str_remove("\\.bam$")

    data.frame(subclass=rowname, Read_Count=full)
  })

  do.call(rbind, rows)
}
```