生信技能树单细胞数据挖掘笔记 (2) - 单细胞 - 生信学习 | Zhou Xiaozhao = 小钊の笔记 = 前天是小兔子，昨天是小鹿，今天是你

生信技能树单细胞数据挖掘笔记 (1)：数据读入

生信技能树单细胞数据挖掘笔记 (2)：创建 Seurat 对象并进行质控、筛选高变基因并可视化

生信技能树单细胞数据挖掘笔记 (3)：降维与聚类

生信技能树单细胞数据挖掘笔记 (4)：其他分析（周期判断、double 诊断、细胞类型注释）

生信技能树单细胞数据挖掘笔记 (5)：轨迹分析

生信技能树单细胞数据挖掘笔记 (6)：差异分析 (1)

生信技能树单细胞数据挖掘笔记 (7)：差异分析 (2)

# 创建 Seurat 对象并质控

创建 Seurat 对象，然后过滤掉表达量过低的基因、表达基因过少的细胞以及线粒体基因过多的细胞

	### 2、构建 seurat 对象，质控绘图 ----
	# 2.1 构建 seurat 对象，质控
	#In total, 2,343 cells from tumor cores were included in this analysis.
	#quality controlstandards:
	#1) genes detected in < 3 cells were excluded; 筛选基因
	#2) cells with < 50 total detected genes were excluded; 筛选细胞
	#3) cells with ≥ 5% of mitochondria-expressed genes were excluded. 筛选细胞
	library ("Seurat")
	sce.meta <- data.frame (Patient_ID=group$Patient_ID,
	row.names = group$sample)
	head (sce.meta)
	table (sce.meta$Patient_ID)
	# 这个函数 CreateSeuratObject 有多种多样的执行方式
	scRNA = CreateSeuratObject (counts=a.filt,
	meta.data = sce.meta,
	min.cells = 3,
	min.features = 50)
	#counts:a matrix-like object with unnormalized data with cells as columns and features as rows
	#meta.data:Additional cell-level metadata to add to the Seurat object
	#min.cells: features detected in at least this many cells.
	#min.features:cells where at least this many features are detected.
	head (scRNA@meta.data)
	#nCount_RNA：the number of cell total counts
	#nFeature_RNA：the number of cell's detected gene
	summary (scRNA@meta.data)
	scRNA@assays$RNA@counts [1:4,1:4]
	# 可以看到，之前的 counts 矩阵存储格式发生了变化：4 x 4 sparse Matrix of class "dgCMatrix"

	dim (scRNA)
	# 20047 2342 仅过滤掉一个细胞
	# 接下来根据线粒体基因表达筛选低质量细胞
	#Calculate the proportion of transcripts mapping to mitochondrial genes
	table (grepl ("^MT-",rownames (scRNA)))
	#FALSE


	#20050 没有染色体基因
	scRNA [["percent.mt"]] <- PercentageFeatureSet (scRNA, pattern = "^MT-")
	head (scRNA@meta.data)
	summary (scRNA@meta.data)
	# 结果显示没有线粒体基因，因此这里过滤也就没有意义，但是代码留在这里
	# 万一大家的数据里面有线粒体基因，就可以如此这般进行过滤啦。
	pctMT=5 #≥ 5% of mitochondria-expressed genes
	scRNA <- subset (scRNA, subset = percent.mt < pctMT)
	dim (scRNA)


	table (grepl ("^ERCC-",rownames (scRNA)))
	#FALSE TRUE
	#19961 86 发现是有 ERCC 基因
	#External RNA Control Consortium，是常见的已知浓度的外源 RNA 分子 spike-in 的一种
	# 指标含义类似线粒体含量，ERCC 含量大，则说明 total sum 变小
	scRNA [["percent.ERCC"]] <- PercentageFeatureSet (scRNA, pattern = "^ERCC-")
	head (scRNA@meta.data)
	summary (scRNA@meta.data)
	rownames (scRNA)[grep ("^ERCC-",rownames (scRNA))]
	[1] "ERCC-00002" "ERCC-00003" "ERCC-00004" "ERCC-00009" "ERCC-00012" "ERCC-00013" "ERCC-00014" "ERCC-00017" "ERCC-00019" "ERCC-00022" "ERCC-00024" "ERCC-00025"
	[13] "ERCC-00028" "ERCC-00031" "ERCC-00033" "ERCC-00034" "ERCC-00035" "ERCC-00039" "ERCC-00040" "ERCC-00041" "ERCC-00042" "ERCC-00043" "ERCC-00044" "ERCC-00046"
	[25] "ERCC-00051" "ERCC-00053" "ERCC-00054" "ERCC-00058" "ERCC-00059" "ERCC-00060" "ERCC-00062" "ERCC-00067" "ERCC-00069" "ERCC-00071" "ERCC-00073" "ERCC-00074"
	[37] "ERCC-00076" "ERCC-00077" "ERCC-00078" "ERCC-00079" "ERCC-00081" "ERCC-00083" "ERCC-00084" "ERCC-00085" "ERCC-00086" "ERCC-00092" "ERCC-00095" "ERCC-00096"
	[49] "ERCC-00097" "ERCC-00098" "ERCC-00099" "ERCC-00104" "ERCC-00108" "ERCC-00109" "ERCC-00111" "ERCC-00112" "ERCC-00113" "ERCC-00116" "ERCC-00120" "ERCC-00123"
	[61] "ERCC-00126" "ERCC-00130" "ERCC-00131" "ERCC-00134" "ERCC-00136" "ERCC-00137" "ERCC-00138" "ERCC-00142" "ERCC-00143" "ERCC-00144" "ERCC-00145" "ERCC-00147"
	[73] "ERCC-00148" "ERCC-00150" "ERCC-00154" "ERCC-00156" "ERCC-00157" "ERCC-00158" "ERCC-00160" "ERCC-00162" "ERCC-00163" "ERCC-00164" "ERCC-00165" "ERCC-00168"
	[85] "ERCC-00170" "ERCC-00171"
	# 可以看到有不少 ERCC 基因

	sum (scRNA$percent.ERCC< 40)
	# 较接近原文过滤数量 2149，但感觉条件有点宽松了，先做下去看看
	# 网上看了相关教程，一般 ERCC 占比不高于 10%
	sum (scRNA$percent.ERCC< 10) #就只剩下 460 个 cell，明显低于文献中的数量
	pctERCC=40
	scRNA <- subset (scRNA, subset = percent.ERCC < pctERCC)
	dim (scRNA)
	# 20047 2142 原文为 19752 2149
	dim (a.filt)
	#23460 2343 未过滤前

# 质控绘图

	# 2.2 可视化
	# 图 A：观察不同组 cell 的 counts、feature 分布
	col.num <- length (unique (scRNA@meta.data$Patient_ID))
	library (ggplot2)

	p1_1.1 <- VlnPlot (scRNA,
	features = c ("nFeature_RNA"),
	group.by = "Patient_ID",
	cols =rainbow (col.num)) +
	theme (legend.position = "none") +
	labs (tag = "A")
	p1_1.1
	p1_1.2 <- VlnPlot (scRNA,
	features = c ("nCount_RNA"),
	group.by = "Patient_ID",
	cols =rainbow (col.num)) +
	theme (legend.position = "none")
	p1_1.2
	p1_1 <- p1_1.1 \| p1_1.2
	p1_1
	VlnPlot (scRNA,
	features = c ("nFeature_RNA","nCount_RNA","percent.ERCC"))
	# 图 B：nCount_RNA 与对应的 nFeature_RNA 关系
	p1_2 <- FeatureScatter (scRNA, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
	group.by = "Patient_ID",pt.size = 1.3) +
	labs (tag = "B")
	p1_2
	FeatureScatter (scRNA, feature1 = "nCount_RNA", feature2 = "percent.ERCC")

<center>
p1_1
</center>

<center>
p1_2
</center>

目前已经完成了文献中的 fig1A、B

# 挑选高变基因

	### 3、挑选 hvg 基因，可视化 ----
	#highly Variable gene: 简单理解 sd 大的
	scRNA <- FindVariableFeatures (scRNA, selection.method = "vst", nfeatures = 1500)
	# 根据文献原图，挑选变化最大的 1500 个 hvg
	top10 <- head (VariableFeatures (scRNA), 10)
	top10
	plot1 <- VariableFeaturePlot (scRNA)
	# 标记 top10 hvg
	p1_3 <- LabelPoints (plot = plot1, points = top10, repel = TRUE, size=2.5) +
	theme (legend.position = c (0.1,0.8)) +
	labs (tag = "C")
	p1_3
	save (scRNA, file="2.3.Rdata")

<center>
p1_3
</center>

目前已复现文献中的图 1A 到 C。

bioinformatics ScRNAseq