Verifying denoising/understanding normalization #48

oshiomah1 · 2024-12-02T17:45:53Z

Hello, I have run dsb on my citeseq dataset (code shown below). One of the reasons for this was I assumed my antibody data was very noisy due to ubiquitous expression of antibodies. I am visualizing this expression using seurat's dotplot feature.The image below is regular seurat processing with CLR normalization:
For some context CD19 and CD20 should only be expressed by B cells, B cells do have the highest expression of these antibodies but it seems like all the cell types express it.

Now after running dsb the expression across antibodies seem very similar as show here

The cell type annotations are slightly different across the runs since the annotation tool i used is sensitive to how the data was processed. so you can ignore that.

My expectation was that after running dsb my dotplot wiuld look something like this(my gen expression data where there isn't a lot of non-specific expression)

here is my dsb/seurat end to end workflow

# read raw data using the Seurat function "Read10X" 
raw = Seurat::Read10X("/share/hennlab/projects/NCR_scRNAseq/results/cellranger/batch1/TBSS01/multi/count/raw_feature_bc_matrix")
cells = Seurat::Read10X("/share/hennlab/projects/NCR_scRNAseq/results/cellranger/batch1/TBSS01/per_sample_outs/TBSS01/count/sample_filtered_feature_bc_matrix")

# define cell-containing barcodes and separate cells and empty drops
stained_cells = colnames(cells$`Gene Expression`)
background = setdiff(colnames(raw$`Gene Expression`), stained_cells)

# split the data into separate matrices for RNA and ADT
prot = raw$`Antibody Capture`
rna = raw$`Gene Expression`

 # create metadata of droplet QC stats used in standard scRNAseq processing
mtgene = grep(pattern = "^MT-", rownames(rna), value = TRUE) # used below

md = data.frame(
  rna.size = log10(Matrix::colSums(rna)), 
  prot.size = log10(Matrix::colSums(prot)), 
  n.gene = Matrix::colSums(rna > 0), 
  mt.prop = Matrix::colSums(rna[mtgene, ]) / Matrix::colSums(rna)
)
# add indicator for barcodes Cell Ranger called as cells
md$drop.class = ifelse(rownames(md) %in% stained_cells, 'cell', 'background')

# remove barcodes with no evidence of capture in the experiment
md = md[md$rna.size > 0 & md$prot.size > 0, ]


background_drops = rownames(
  md[ md$prot.size > 1.5 & 
      md$prot.size < 3 & 
      md$rna.size < 2.5, ]
  ) 
background.adt.mtx = as.matrix(prot[ , background_drops])

# calculate statistical thresholds for droplet filtering.
cellmd = md[md$drop.class == 'cell', ]
head(cellmd)
# filter drops with + / - 3 median absolute deviations from the median library size
rna.mult = (3*mad(cellmd$rna.size))
prot.mult = (3*mad(cellmd$prot.size))
rna.lower = median(cellmd$rna.size) - rna.mult
rna.upper = median(cellmd$rna.size) + rna.mult
prot.lower = median(cellmd$prot.size) - prot.mult
prot.upper = median(cellmd$prot.size) + prot.mult

# filter rows based on droplet qualty control metrics
qc_cells = rownames(
  cellmd[cellmd$prot.size > prot.lower & 
         cellmd$prot.size < prot.upper & 
         cellmd$rna.size > rna.lower & 
         cellmd$rna.size < rna.upper & 
         cellmd$mt.prop < 0.15, ]
  )

  #Now subset the metadata ADT and RNA matrices.

cell.adt.raw = as.matrix(prot[ , qc_cells])
cell.rna.raw = rna[ ,qc_cells]
cellmd = cellmd[qc_cells, ]

# define isotype controls 
isotype.controls = c('Isotype_HTK888','Isotype_MOPC.173','Isotype_MOPC.21','Isotype_MPC.11','Isotype_RTK2071','Isotype_RTK2758',
                     'Isotype_RTK4530')

# normalize and denoise with dsb with 
cells.dsb.norm = DSBNormalizeProtein(
  cell_protein_matrix = cell.adt.raw, 
  empty_drop_matrix = background.adt.mtx, 
  denoise.counts = TRUE, 
  use.isotype.control = TRUE, 
  isotype.control.name.vec = isotype.controls
  )

s = Seurat::CreateSeuratObject(counts = cell.rna.raw, 
                               meta.data = cellmd,
                               assay = "RNA", 
                               min.cells = 20)

# add dsb normalized matrix "cell.adt.dsb" to the "CITE" data (not counts!) slot
s[["ADT"]] = Seurat::CreateAssayObject(data = cells.dsb.norm)


# integrating with Seurat
stopifnot(isTRUE(all.equal(rownames(cellmd), colnames(cell.adt.raw))))
stopifnot(isTRUE(all.equal(rownames(cellmd), colnames(cell.rna.raw))))

# create Seurat object note: min.cells is a gene filter, not a cell filter
s = Seurat::CreateSeuratObject(counts = cell.rna.raw, 
                               meta.data = cellmd,
                               assay = "RNA", 
                               min.cells = 20)

# add dsb normalized matrix "cell.adt.dsb" to the "CITE" data (not counts!) slot
s[["ADT"]] = Seurat::CreateAssayObject(data = cells.dsb.norm)

# filter seurat object further
s <- subset(s, subset = nFeature_RNA > 200 & nFeature_RNA < upper_bound_rna_nfeature & percent.mt < 5 )

# use pearson residuals as normalized values for pca 
DefaultAssay(s) = "RNA" #change del to s
s= NormalizeData(s, verbose = FALSE) %>% 
  FindVariableFeatures(selection.method = 'vst', nfeatures = 2000,verbose = FALSE) %>% 
  ScaleData(verbose = FALSE) %>%
  RunPCA(verbose = FALSE) %>% FindNeighbors(assay = "RNA", dims = 1:20)  %>%
  FindClusters(assay = "RNA")  %>%
  RunUMAP(reduction = 'pca', dims = 1:20, assay = 'RNA', 
              reduction.name = 'rna.umap', reduction.key = 'rnaUMAP_')
print("processed RNA data")



# define proteins to use in clustering (non-isotype controls)
prots = rownames(s@assays$ADT@data)[1:130]

# set up dsb values to use in WNN analysis (do not normalize with CLR, use dsb normalized values)
DefaultAssay(s) = "ADT"
VariableFeatures(s) = prots


s <-  s %>% 
  ScaleData() %>% RunPCA( assay = "ADT", features = prots, reduction.name = "apca", reduction.key = "APCA_") %>%
FindNeighbors(assay = "ADT", dims = 1:20,graph.name = "adt_snn")  %>%
FindClusters(assay = "ADT",graph.name = "adt_snn")  %>%
RunUMAP(reduction = 'apca', dims = 1:20, reduction.name = 'ADT.umap', reduction.key = 'adtUMAP_')


s <- FindMultiModalNeighbors(s, reduction.list = list("pca", "apca"), dims.list = list(1:20, 1:20),modality.weight.name = c("RNA.weight", "ADT.weight"))

 
s <- RunUMAP(s, nn.name = "weighted.nn", reduction.name = "wnn.umap", reduction.key = "wnnUMAP_")
s <- FindClusters(s, graph.name = "wsnn", algorithm = 3, resolution = 2, verbose = FALSE)


 DotPlot(s, features = adt_markers, group.by = "sctype_classification", assay ="ADT",cols = my_colors) + RotatedAxis()

The text was updated successfully, but these errors were encountered:

MattPM · 2024-12-08T00:28:27Z

Hi this has to do with the function you are using to visualize the data and not the normalized values. You're using a Seurat function DotPlot which is doing all sorts of processing to the data in order to scale it and squish the color range (can't tell if it is scaled across rows or columns from looking at this plot either). You can look at all the stuff Seurat is doing to manipulate the visualization yourself with getAnywhere('DotPlot').

You can tell it's scaled because the values are squished between -2 and 2 arbitrarily - that range of valuse is not optimal for the dsb values which are signal to noise ratios. For this reason, you also don't need to scale the dsb values. Try to plot your data using the functions provided in the vignette, or use ggplot to make the dotplot and see what you get with and without dsb for normalization.

If you want, with dsb you can also plot only the values above a threshold like 2 (or 3.5 which we use in throughout the paper) because the values are signal to noise and the scale is in standard deviations above expected background.

Check out code here from the vignette - i use the dsb values and dont scale them.
https://cran.r-project.org/web/packages/dsb/vignettes/end_to_end_workflow.html#interpretation

If you want to make a dotplot where you control the color scale better you can try using code similar to below which i used for a different paper


# summarize by percent expression 
index1 = prot_sub[1]; index2 = prot_sub[length(prot_sub)]

# calculate percent of cells above the 3.5 sd dsb threshold 
pgt = function(x){FSA::perc(x = x,dir = "gt",val = 3.5)}
dsummary = 
  d %>% 
  group_by(celltype) %>% 
  summarize_at(.vars = prot_sub, .funs = pgt) %>% 
  gather(prot, pct_express, index1:index2)

# calculate average dsb expression per cluster 
d_av = 
  d %>% 
  group_by(celltype) %>% 
  summarize_at(.vars = prot_sub, .funs = mean) %>% 
  gather(prot, gene_avg, index1:index2)

# merge summarized data 
d2 = full_join(dsummary, d_av)
d2$celltype = factor(d2$celltype, levels = rev(celltype_order))
d2$prot = factor(d2$prot, levels = rev(prot_sub))

# draw plot 
p = ggplot(d2 %>% filter(pct_express > 3.5), aes(x = celltype, y = prot, fill = gene_avg, size = pct_express)) +
  geom_point(shape = 21) + 
  scale_fill_gradientn(colours = BuenColors::jdb_palettes$solar_rojos,
                       limits = c(0,11), 
                       oob = scales::squish,
                       name = 'mean dsb\nnormalized \nvalue') + 
  scale_size_area(name = '% of cells \nabove noise \nthreshold') + 
  theme_bw() + 
  theme(axis.line  = element_blank()) +
  scale_x_discrete(position = 'top') + 
  theme(axis.text.x = element_text(angle = 58, vjust = 1, hjust=0, color = 'black', size = 11)) +
  theme(axis.text.y = element_text(color = 'black', size = 11)) +
  ylab('') + xlab('') + 
  theme(legend.position = 'bottom') + 
  theme(axis.ticks = element_blank())  +
  theme(plot.margin = unit(c(0.1, 1.5, 0.2,0.2), 'cm'))

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Verifying denoising/understanding normalization #48

Verifying denoising/understanding normalization #48

oshiomah1 commented Dec 2, 2024

MattPM commented Dec 8, 2024 •

edited

Loading

Verifying denoising/understanding normalization #48

Verifying denoising/understanding normalization #48

Comments

oshiomah1 commented Dec 2, 2024

MattPM commented Dec 8, 2024 • edited Loading

MattPM commented Dec 8, 2024 •

edited

Loading