生信技能树单细胞数据挖掘笔记 (1) - 单细胞 - 生信学习 | Zhou Xiaozhao = 小钊の笔记 = 前天是小兔子，昨天是小鹿，今天是你

# 前言

最近有幸加入生信技能树的单细胞数据挖掘尝鲜群，借此机会给大家分享一下

生信技能树单细胞数据挖掘笔记 (1)：数据读入

生信技能树单细胞数据挖掘笔记 (2)：创建 Seurat 对象并进行质控、筛选高变基因并可视化

生信技能树单细胞数据挖掘笔记 (3)：降维与聚类

生信技能树单细胞数据挖掘笔记 (4)：其他分析（周期判断、double 诊断、细胞类型注释）

生信技能树单细胞数据挖掘笔记 (5)：轨迹分析

生信技能树单细胞数据挖掘笔记 (6)：差异分析 (1)

生信技能树单细胞数据挖掘笔记 (7)：差异分析 (2)

# 课程大纲

# 入门基础知识

所需 R 包

	getOption ("BioC_mirror")
	getOption ("CRAN")
	#CRAN 基础包
	options (CRAN="https://mirrors.ustc.edu.cn/CRAN/")## 设置镜像
	cran_packages <- c ('tidyverse',
	'ggplot2'
	)
	for (pkg in cran_packages){
	if (! require (pkg,character.only=T) ) {
	install.packages (pkg,ask = F,update = F)
	require (pkg,character.only=T)
	}
	}
	if (!require ("BiocManager")) install.packages ("BiocManager",update = F,ask = F)

	#Bio 分析包
	Biocductor_packages <- c ("Seurat",
	"scran",
	"scater",
	"monocle",
	"DropletUtils",
	"SingleR"
	)
	options (BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
	#options (BioC_mirror="https://mirrors.tuna.tsinghua.edu.cn/bioconductor")
	# use BiocManager to install
	for (pkg in Biocductor_packages){
	if (! require (pkg,character.only=T) ) {
	BiocManager::install (pkg,ask = F,update = F)
	require (pkg,character.only=T)
	}
	}

	# 最后检查下成功与否
	for (pkg in c (Biocductor_packages,cran_packages)){
	require (pkg,character.only=T)
	}

	### GEO
	#
	# GEO Platform (GPL)
	# GEO Sample (GSM)
	# GEO Series (GSE)
	# GEO Dataset (GDS)

	# 一篇文章可以有一个或者多个 GSE 数据集，一个 GSE 里面可以有一个或者多个 GSM 样本。
	# 多个研究的 GSM 样本可以根据研究目的整合为一个 GDS，不过 GDS 本身用的很少。
	# 而每个数据集都有着自己对应的芯片平台，就是 GPL。(芯片名与基因名 ID 转换)
	#https://blog.csdn.net/weixin_43569478/article/details/108079337
	# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=

	BiocManager::install ("GEOquery")
	library (GEOquery)
	gse1009 <- getGEO ('GSE1009', destdir=".")
	class (gse1009)
	length (gse1009)
	a <- gse1009 [[1]]
	class (gse1009 [1])
	a
	b <- exprs (a)
	c <- pData (a)
	a$platform_id

# 下载、探索、数据整理

本次数据采用《Glioblastoma cell differentiation trajectory predicts the immunotherapy response and overall survival of patients》中的单细胞数据进行分析（GSE 号：84465）

	### 1、下载、探索、整理数据 ----
	## 1.1 下载、探索数据
	#https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE84465 ## 数据来源
	sessionInfo ()

# 读取数据

	a <- read.table ("../rawdata/GSE84465_GBM_All_data.csv.gz")
	a [1:4,1:4]
	# 行名为 symbol ID
	# 列名为 sample，看上去像是两个元素的组合。
	summary (a [,1:4])
	boxplot (a [,1:4])
	head (rownames (a))
	tail (rownames (a),10)
	# 可以看到原文的 counts 矩阵来源于 htseq 这个计数软件，所以有一些不是基因的行需要剔除：
	# "no_feature" "ambiguous" "too_low_aQual" "not_aligned" "alignment_not_unique"
	tail (a [,1:4],10)

	a=a [1:(nrow (a)-5),]

	# 原始 counts 数据

	#3,589 cells of 4 human primary GBM samples, accession number GSE84465
	#2,343 cells from tumor cores and 1,246 cells from peripheral regions
	b <- read.table ("../rawdata/SraRunTable.txt",
	sep = ",", header = T)
	b [1:4,1:4]
	table (b$Patient_ID) # 4 human primary GBM samples
	table (b$TISSUE) # tumor cores and peripheral regions
	table (b$TISSUE,b$Patient_ID)

# 整理数据

可以发现两个数据不对应，a 矩阵行名（sample）并非为 GSM 编号，而主要是由相应的 plate_id 与 Well 组合而成

	## 1.2 整理数据
	# tumor and peripheral 分组信息
	head (colnames (a))
	[1] "X1001000173.G8" "X1001000173.D4" "X1001000173.B4" "X1001000173.A2" "X1001000173.E2" "X1001000173.F6"
	head (b$plate_id)
	[1] 1001000173 1001000173 1001000173 1001000173 1001000173 1001000173
	head (b$Well)
	#a 矩阵行名（sample）并非为 GSM 编号，而主要是由相应的 plate_id 与 Well 组合而成

	b.group <- b [,c ("plate_id","Well","TISSUE","Patient_ID")]
	b.group$sample <- paste0 ("X",b.group$plate_id,".",b.group$Well)
	head (b.group)
	plate_id Well TISSUE Patient_ID sample
	1 1001000173 G8 Tumor BT_S2 X1001000173.G8
	2 1001000173 D4 Tumor BT_S2 X1001000173.D4
	3 1001000173 B4 Tumor BT_S2 X1001000173.B4
	4 1001000173 A2 Tumor BT_S2 X1001000173.A2
	5 1001000173 E2 Tumor BT_S2 X1001000173.E2
	6 1001000173 F6 Tumor BT_S2 X1001000173.F6
	identical (colnames (a),b.group$sample)

	# 筛选 tumor cell
	index <- which (b.group$TISSUE=="Tumor")
	length (index)
	group <- b.group [index,] #筛选的是行
	head (group)

	a.filt <- a [,index] #筛选的是列
	dim (a.filt)
	identical (colnames (a.filt),group$sample)

经过筛选之后得到所有的肿瘤细胞的表达矩阵