生信技能树单细胞数据挖掘笔记 (1):数据读入

生信技能树单细胞数据挖掘笔记 (2):创建 Seurat 对象并进行质控、筛选高变基因并可视化

生信技能树单细胞数据挖掘笔记 (3):降维与聚类

生信技能树单细胞数据挖掘笔记 (4):其他分析(周期判断、double 诊断、细胞类型注释)

生信技能树单细胞数据挖掘笔记 (5):轨迹分析

生信技能树单细胞数据挖掘笔记 (6):差异分析 (1)

生信技能树单细胞数据挖掘笔记 (7):差异分析 (2)

# 创建 Seurat 对象并质控

创建 Seurat 对象,然后过滤掉表达量过低的基因、表达基因过少的细胞以及线粒体基因过多的细胞

### 2、构建 seurat 对象,质控绘图 ----
# 2.1 构建 seurat 对象,质控
#In total, 2,343 cells from tumor cores were included in this analysis.
#quality controlstandards:
#1) genes detected in < 3 cells were excluded; 筛选基因
#2) cells with < 50 total detected genes were excluded; 筛选细胞
#3) cells with ≥ 5% of mitochondria-expressed genes were excluded. 筛选细胞
library ("Seurat")
sce.meta <- data.frame (Patient_ID=group$Patient_ID,
                       row.names = group$sample)
head (sce.meta)
table (sce.meta$Patient_ID)
# 这个函数 CreateSeuratObject 有多种多样的执行方式
scRNA = CreateSeuratObject (counts=a.filt,
                           meta.data = sce.meta,
                           min.cells = 3,
                           min.features = 50)
#counts:a matrix-like object with unnormalized data with cells as columns and features as rows
#meta.data:Additional cell-level metadata to add to the Seurat object
#min.cells: features detected in at least this many cells.
#min.features:cells where at least this many features are detected.
head (scRNA@meta.data)
#nCount_RNA:the number of cell total counts
#nFeature_RNA:the number of cell's detected gene
summary (scRNA@meta.data)
scRNA@assays$RNA@counts [1:4,1:4]
# 可以看到,之前的 counts 矩阵存储格式发生了变化:4 x 4 sparse Matrix of class "dgCMatrix"
dim (scRNA)
#  20047  2342 仅过滤掉一个细胞
# 接下来根据线粒体基因表达筛选低质量细胞
#Calculate the proportion of transcripts mapping to mitochondrial genes
table (grepl ("^MT-",rownames (scRNA)))
#FALSE
#20050 没有染色体基因
scRNA [["percent.mt"]] <- PercentageFeatureSet (scRNA, pattern = "^MT-")
head (scRNA@meta.data)
summary (scRNA@meta.data)
# 结果显示没有线粒体基因,因此这里过滤也就没有意义,但是代码留在这里
# 万一大家的数据里面有线粒体基因,就可以如此这般进行过滤啦。
pctMT=5 #≥ 5% of mitochondria-expressed genes
scRNA <- subset (scRNA, subset = percent.mt < pctMT)
dim (scRNA)
table (grepl ("^ERCC-",rownames (scRNA)))
#FALSE  TRUE
#19961    86  发现是有 ERCC 基因
#External RNA Control Consortium,是常见的已知浓度的外源 RNA 分子 spike-in 的一种
# 指标含义类似线粒体含量,ERCC 含量大,则说明 total sum 变小
scRNA [["percent.ERCC"]] <- PercentageFeatureSet (scRNA, pattern = "^ERCC-")
head (scRNA@meta.data)
summary (scRNA@meta.data)
rownames (scRNA)[grep ("^ERCC-",rownames (scRNA))]
[1] "ERCC-00002" "ERCC-00003" "ERCC-00004" "ERCC-00009" "ERCC-00012" "ERCC-00013" "ERCC-00014" "ERCC-00017" "ERCC-00019" "ERCC-00022" "ERCC-00024" "ERCC-00025"
[13] "ERCC-00028" "ERCC-00031" "ERCC-00033" "ERCC-00034" "ERCC-00035" "ERCC-00039" "ERCC-00040" "ERCC-00041" "ERCC-00042" "ERCC-00043" "ERCC-00044" "ERCC-00046"
[25] "ERCC-00051" "ERCC-00053" "ERCC-00054" "ERCC-00058" "ERCC-00059" "ERCC-00060" "ERCC-00062" "ERCC-00067" "ERCC-00069" "ERCC-00071" "ERCC-00073" "ERCC-00074"
[37] "ERCC-00076" "ERCC-00077" "ERCC-00078" "ERCC-00079" "ERCC-00081" "ERCC-00083" "ERCC-00084" "ERCC-00085" "ERCC-00086" "ERCC-00092" "ERCC-00095" "ERCC-00096"
[49] "ERCC-00097" "ERCC-00098" "ERCC-00099" "ERCC-00104" "ERCC-00108" "ERCC-00109" "ERCC-00111" "ERCC-00112" "ERCC-00113" "ERCC-00116" "ERCC-00120" "ERCC-00123"
[61] "ERCC-00126" "ERCC-00130" "ERCC-00131" "ERCC-00134" "ERCC-00136" "ERCC-00137" "ERCC-00138" "ERCC-00142" "ERCC-00143" "ERCC-00144" "ERCC-00145" "ERCC-00147"
[73] "ERCC-00148" "ERCC-00150" "ERCC-00154" "ERCC-00156" "ERCC-00157" "ERCC-00158" "ERCC-00160" "ERCC-00162" "ERCC-00163" "ERCC-00164" "ERCC-00165" "ERCC-00168"
[85] "ERCC-00170" "ERCC-00171"
# 可以看到有不少 ERCC 基因
sum (scRNA$percent.ERCC< 40)
# 较接近原文过滤数量 2149,但感觉条件有点宽松了,先做下去看看
# 网上看了相关教程,一般 ERCC 占比不高于 10%
sum (scRNA$percent.ERCC< 10)   #就只剩下 460 个 cell,明显低于文献中的数量
pctERCC=40
scRNA <- subset (scRNA, subset = percent.ERCC < pctERCC)
dim (scRNA)
# 20047  2142   原文为 19752  2149
dim (a.filt)
#23460  2343 未过滤前

# 质控绘图

# 2.2 可视化
# 图 A:观察不同组 cell 的 counts、feature 分布
col.num <- length (unique (scRNA@meta.data$Patient_ID))
library (ggplot2)
p1_1.1 <- VlnPlot (scRNA,
                features = c ("nFeature_RNA"),
                group.by = "Patient_ID",
                cols =rainbow (col.num)) +
  theme (legend.position = "none") +
  labs (tag = "A")
p1_1.1
p1_1.2 <- VlnPlot (scRNA,
                features = c ("nCount_RNA"),
                group.by = "Patient_ID",
                cols =rainbow (col.num)) +
  theme (legend.position = "none")
p1_1.2
p1_1 <- p1_1.1 | p1_1.2
p1_1
VlnPlot (scRNA,
        features = c ("nFeature_RNA","nCount_RNA","percent.ERCC"))
# 图 B:nCount_RNA 与对应的 nFeature_RNA 关系
p1_2 <- FeatureScatter (scRNA, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
                       group.by = "Patient_ID",pt.size = 1.3) +
  labs (tag = "B")
p1_2
FeatureScatter (scRNA, feature1 = "nCount_RNA", feature2 = "percent.ERCC")

image-20201107130024109

<center>
p1_1
</center>
image-20201107134732184

<center>
p1_2
</center>

目前已经完成了文献中的 fig1A、B

image-20201107134914247

# 挑选高变基因

### 3、挑选 hvg 基因,可视化 ----
#highly Variable gene: 简单理解 sd 大的
scRNA <- FindVariableFeatures (scRNA, selection.method = "vst", nfeatures = 1500)
# 根据文献原图,挑选变化最大的 1500 个 hvg
top10 <- head (VariableFeatures (scRNA), 10)
top10
plot1 <- VariableFeaturePlot (scRNA)
# 标记 top10 hvg
p1_3 <- LabelPoints (plot = plot1, points = top10, repel = TRUE, size=2.5) +
  theme (legend.position = c (0.1,0.8)) +
  labs (tag = "C")
p1_3
save (scRNA, file="2.3.Rdata")

image-20201107135433890

<center>
p1_3
</center>

目前已复现文献中的图 1A 到 C。

更新于 阅读次数

请我喝[茶]~( ̄▽ ̄)~*

amane 微信支付

微信支付

amane 支付宝

支付宝