生信技能树单细胞数据挖掘笔记 (2):创建 Seurat 对象并进行质控、筛选高变基因并可视化
生信技能树单细胞数据挖掘笔记 (4):其他分析(周期判断、double 诊断、细胞类型注释)
前言:使用 GSE81861 提供的数据,比较 CRC 肿瘤上皮细胞与正常上皮细胞的差异。GEO 提供了 count 与 fpkm 两种数据。笔记内容先用官方的 fpkm 数据做差异分析,再利用 counts 数据手动计算 fpkm 矩阵,完成差异分析。最后比较两种方法的结果是否存在差异。
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE81861
注:因为有不少重复的步骤,故设置较多的函数。
# 1、概述
# 1.1 单细胞差异分析 pipeline
简单来说分为三步:首先导入、制备规范的表达矩阵以及分组信息;然后利用 Seurat 包构建 seurat 对象,归一化;最后进行差异分析,以及结果的可视化。
# 1.2 count 标准化
主要受测序文库 (样本总 read 数) 与基因长度的影响,测序的 counts 数据不能直接进行差异分析,需要进行标准化处理。常见的几种标准化方法简单介绍如下 --
- rpkm:counts 先对测序文库标准化,再对基因长度标准化;
- fpkm:FPKM 同 RPKM 是一样的,只是 RPKM 用于单末端测序,而 FPKM 用于双末端测序;
- tpm:counts 先对基因长度标准化,再对测序文库标准化;
- cpm:counts 只对测序文库标准化。
测序文库相对容易计算,直接使用 colSums()
函数即可;而基因长度则比较难求,首先要了解基因长度有不同的定义标准,其次要知道哪些 R 包提供相关生物数据。目前有以下三种方法,以及根据与官方 fpkm 验证,最终选择第三种方法用于后续的分析。
# 计算基因长度
# (1) TxDb.Hsapiens.UCSC.hg19.knownGene 包
if (!require("TxDb.Hsapiens.UCSC.hg19.knownGene", quietly = TRUE)) | |
BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene") | |
library(TxDb.Hsapiens.UCSC.hg19.knownGene) | |
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene | |
exon_txdb=exons(txdb) | |
genes_txdb=genes(txdb) |
# g_l.1-- 根据非冗余外显子之和定义
g_l_1 <- function (){ | |
o <- findOverlaps (exon_txdb,genes_txdb) | |
t1=exon_txdb [queryHits (o)] | |
t2=genes_txdb [subjectHits (o)] | |
t1=as.data.frame (t1) | |
t1$geneid=mcols (t2)[,1] | |
# 得到 exon_id 与 geneid 的对应关系 | |
g_l.1 <- lapply (split (t1,t1$geneid),function (x){ | |
#按 gene id 拆分表格 | |
head (x) | |
tmp=apply (x,1,function (y){ | |
y [2]:y [3] | |
}) #根据每一个 gene 所有 exon 的区间,生成区间内的整数,返回的为 list。 | |
length (unique (unlist (tmp))) | |
#计算共有多少种整数,即为最终的非冗余 exon 长度之和 | |
}) | |
head (g_l.1) #为一个 list | |
g_l.1=data.frame (gene_id=names (g_l.1),length=as.numeric (g_l.1)) | |
dim (g_l.1) | |
head (g_l.1) | |
#为基因 ID 增添 ENSEMBLE ID | |
if (!require ("org.Hs.eg.db", quietly = TRUE)) | |
BiocManager::install ("org.Hs.eg.db") | |
library (org.Hs.eg.db) | |
s2g=toTable (org.Hs.egENSEMBL) | |
head (s2g) | |
g_l.1=merge (g_l.1,s2g,by='gene_id') | |
#把 g_l,s2g 两个数据框以 'gene_id' 为连接进行拼接 | |
head (g_l.1) | |
return (g_l.1) | |
} | |
g_l.1 <- g_l_1 () | |
head (g_l.1) |
# g_l.2---- 根据最长转录本定义
g_l_2 <- function (){ | |
t_l=transcriptLengths (txdb) | |
head (t_l) | |
t_l=na.omit (t_l) | |
#先按基因 ID,再按转录本长度从大到小排序 | |
t_l=t_l [order (t_l$gene_id,t_l$tx_len,decreasing = T),] | |
head (t_l);dim (t_l) | |
#根据 gene_id 去重,选择第一个,也就是最长的那个 | |
t_l=t_l [!duplicated (t_l$gene_id),] | |
head (t_l);dim (t_l) | |
g_l.2=t_l [,c (3,5)] | |
library (org.Hs.eg.db) | |
s2g=toTable (org.Hs.egENSEMBL) | |
g_l.2=merge (g_l.2,s2g,by='gene_id') | |
head (g_l.2) | |
return (g_l.2) | |
} | |
g_l.2 <- g_l_2 () | |
head (g_l.2) |
# (2) biomaRt 包
# g_l.3-- 根据最长转录本定义
g_l_3 <- function (){ | |
if (!require ("biomaRt", quietly = TRUE)) | |
BiocManager::install ("biomaRt") | |
library (biomaRt) | |
ensembl <- useMart ("ensembl") #connect to a specified BioMart database | |
ensembl = useDataset ("hsapiens_gene_ensembl",mart=ensembl) | |
#use the hsapiens (人类) dataset. 或者直接如下设置 | |
#ensembl = useMart ("ensembl",dataset="hsapiens_gene_ensembl") | |
#BiocManager::install ('grimbough/biomaRt') | |
#library (biomaRt) | |
#packageVersion ('biomaRt') | |
test <- getBM (attributes=c ('ensembl_gene_id', 'start_position', | |
'end_position','ensembl_transcript_id', | |
'transcript_length'), | |
mart = ensembl) | |
test <- test [order (test$ensembl_gene_id,test$transcript_length, | |
decreasing = T),] | |
g_l.3 <- test [!duplicated (test$ensembl_gene_id),] | |
g_l.3 <- g_l.3 [,c (1,5)] | |
head (g_l.3) | |
return (g_l.3) | |
} | |
g_l.3 <- g_l_3 () |
# 比较三种结果的差异
dim (g_l.1);dim (g_l.2);dim (g_l.3) | |
g_l.1 <- g_l.1 [,-1] | |
colnames (g_l.1) <- c ("g_l.1","ensembl_id") | |
g_l.2 <- g_l.2 [,-1] | |
colnames (g_l.2) <- c ("g_l.2","ensembl_id") | |
colnames (g_l.3) <- c ("ensembl_id","g_l.3") | |
g_l_all <- merge (g_l.1, g_l.2, by="ensembl_id") | |
g_l_all <- merge (g_l_all, g_l.3, by="ensembl_id") | |
head (g_l_all,10) | |
summary (g_l_all) | |
# 最终选择第三种结果,即 biomaR 包 | |
g_l <- g_l.3 |
# 2. 官方 fpkm 数据差异分析
# 2.1 表达矩阵与分组信息
#表达矩阵 | |
nm_FPKM <- read.csv ("GSE81861_CRC_NM_epithelial_cells_FPKM.csv/GSE81861_CRC_NM_epithelial_cells_FPKM.csv") | |
data_input <- function (data){ | |
#data <- nm_FPKM | |
row.names (data) <- data [,1] | |
data <- data [,-1] | |
data <- as.matrix (data) | |
rownames (data) <- sapply (strsplit (rownames (data),"_"),"[",3) | |
rownames (data) <- sapply (strsplit (rownames (data),"[.]"),"[",1) | |
colnames (data) <- sapply (strsplit (colnames (data),"_"),"[",1) | |
data <- data [grep ("ENSG",rownames (data)),] | |
return (data) | |
} | |
nm_FPKM <- data_input (nm_FPKM) | |
dim (nm_FPKM) #56380 个基因 #160 个样本 | |
nm_FPKM [1:4,1:4] | |
tumor_FPKM <- read.csv ("GSE81861_CRC_tumor_epithelial_cells_FPKM.csv/GSE81861_CRC_tumor_epithelial_cells_FPKM.csv") | |
tumor_FPKM <- data_input (tumor_FPKM) | |
tumor_FPKM [1:4,1:4] | |
dim (tumor_FPKM)#56380 个基因 #272 个样本 | |
exp_FPKM <- cbind (nm_FPKM,tumor_FPKM) | |
dim (exp_FPKM) # 56380 个基因 432 个样本 | |
# 分组信息 | |
group_dat <- data.frame (group=c (rep ('normal',160), | |
rep ('tumor',272)), | |
row.names = colnames (exp_FPKM)) | |
[1] 56380 160 | |
RHC4104 RHC6087 RHL2880 RHC5949 | |
ENSG00000000003 0.000 185.315 323.203 22.311700 | |
ENSG00000000005 0.000 0.000 0.000 0.000000 | |
ENSG00000000419 0.000 231.756 0.000 0.851514 | |
ENSG00000000457 100.135 0.000 0.000 0.000000 | |
RHC4075 RHC5563 RHC5552 RHC4874 | |
ENSG00000000003 0 0.000 0 0 | |
ENSG00000000005 0 0.000 0 0 | |
ENSG00000000419 0 0.000 0 0 | |
ENSG00000000457 0 193.167 0 0 | |
[1] 56380 272 | |
[1] 56380 432 |
# 2.2 ID 转换
因为 seurat 质控需要过滤线粒体基因,所以需要把 ensembl ID 转换为 symbol ID
exp_FPKM [1:4,1:4] | |
id_change <- function (data){ | |
print (dim (data)) | |
library (org.Hs.eg.db) | |
ids1 <- data.frame (ID=c (1:nrow (data)), | |
ensembl_id=rownames (data)) | |
ids2 <- merge (toTable (org.Hs.egENSEMBL), | |
toTable (org.Hs.egSYMBOL),by="gene_id") | |
ids <- merge (ids1, ids2, by="ensembl_id") | |
data <- data [ids$ID,] | |
rownames (data) <- ids$symbol | |
print (dim (data)) | |
return (data) | |
} | |
exp_FPKM <- id_change (exp_FPKM) #有 2w + 个 ensembl ID 没配对到 symbol | |
exp_FPKM [1:4,1:4] |
# 2.3 创建 seurat,质控,差异分析一键操作
scRNA_deg <- function (exp,group){ | |
library (Seurat) | |
print ("创建 seurat 对象...") | |
scRNA <- CreateSeuratObject (counts=exp, | |
meta.data=group) | |
print ("质控...") | |
scRNA [["percent.mt"]] <- PercentageFeatureSet (scRNA, pattern = "^MT-") | |
minGene=500;maxGene=4000;pctMT=15 | |
scRNA <- subset (scRNA, subset = nFeature_RNA > minGene & nFeature_RNA < maxGene & percent.mt < pctMT) | |
print ("归一化...") | |
scRNA <- NormalizeData (scRNA, normalization.method = "LogNormalize", scale.factor = 10000) | |
print ("差异分析...") | |
diff_dat <- FindMarkers (scRNA,ident.1="normal",ident.2="tumor", | |
group.by='group') | |
} | |
FPKM_diff <- scRNA_deg (exp=exp_FPKM, group=group_dat) | |
head (FPKM_diff) | |
dim (FPKM_diff) | |
FPKM_diff <- FPKM_diff [FPKM_diff$p_val<0.01 & abs (FPKM_diff$avg_logFC)>0.8,] | |
dim (FPKM_diff) | |
exp_FPKM_diff <- exp_FPKM [match (rownames (FPKM_diff),rownames (exp_FPKM)),] |
[1] "创建 seurat 对象..." | |
[1] "质控..." | |
[1] "归一化..." | |
Performing log-normalization | |
0% 10 20 30 40 50 60 70 80 90 100% | |
[----|----|----|----|----|----|----|----|----|----| | |
**************************************************| | |
[1] "差异分析..." | |
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=04s | |
[1] 1455 5 | |
[1] 119 5 |
# 2.4 差异结果可视化
热图
my_heatmap <- function(exp_dat){ | |
n <- t(scale(t(exp_dat))) | |
n[n>2]=2;n[n< -2]= -2 | |
library(pheatmap) | |
pheatmap(n, show_rownames = F, | |
show_colnames = F, | |
annotation_col = group_dat) | |
} | |
p.exp_FPKM_diff <- my_heatmap(exp_FPKM_diff) | |
p.exp_FPKM_diff |
箱图