# 前言
最近有幸加入生信技能树的单细胞数据挖掘尝鲜群,借此机会给大家分享一下
生信技能树单细胞数据挖掘笔记 (2):创建 Seurat 对象并进行质控、筛选高变基因并可视化
生信技能树单细胞数据挖掘笔记 (4):其他分析(周期判断、double 诊断、细胞类型注释)
# 课程大纲

# 入门基础知识

所需 R 包
| getOption ("BioC_mirror") | |
| getOption ("CRAN") | |
| #CRAN 基础包 | |
| options (CRAN="https://mirrors.ustc.edu.cn/CRAN/")## 设置镜像 | |
| cran_packages <- c ('tidyverse', | |
| 'ggplot2' | |
| ) | |
| for (pkg in cran_packages){ | |
|   if (! require (pkg,character.only=T) ) { | |
| install.packages (pkg,ask = F,update = F) | |
| require (pkg,character.only=T) | |
| } | |
| } | |
| if (!require ("BiocManager")) install.packages ("BiocManager",update = F,ask = F) | |
| #Bio 分析包 | |
| Biocductor_packages <- c ("Seurat", | |
| "scran", | |
| "scater", | |
| "monocle", | |
| "DropletUtils", | |
| "SingleR" | |
| ) | |
| options (BioC_mirror="https://mirrors.ustc.edu.cn/bioc/") | |
| #options (BioC_mirror="https://mirrors.tuna.tsinghua.edu.cn/bioconductor") | |
| # use BiocManager to install | |
| for (pkg in Biocductor_packages){ | |
|   if (! require (pkg,character.only=T) ) { | |
| BiocManager::install (pkg,ask = F,update = F) | |
| require (pkg,character.only=T) | |
| } | |
| } | |
| # 最后检查下成功与否 | |
| for (pkg in c (Biocductor_packages,cran_packages)){ | |
| require (pkg,character.only=T) | |
| } | |
| ### GEO | |
| # | |
| # GEO Platform (GPL) | |
| # GEO Sample (GSM) | |
| # GEO Series (GSE) | |
| # GEO Dataset (GDS) | |
| # 一篇文章可以有一个或者多个 GSE 数据集,一个 GSE 里面可以有一个或者多个 GSM 样本。 | |
| # 多个研究的 GSM 样本可以根据研究目的整合为一个 GDS,不过 GDS 本身用的很少。 | |
| # 而每个数据集都有着自己对应的芯片平台,就是 GPL。(芯片名与基因名 ID 转换) | |
| #https://blog.csdn.net/weixin_43569478/article/details/108079337 | |
| # https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc= | |
| BiocManager::install ("GEOquery") | |
| library (GEOquery) | |
| gse1009 <- getGEO ('GSE1009', destdir=".") | |
| class (gse1009) | |
| length (gse1009) | |
| a <- gse1009 [[1]] | |
| class (gse1009 [1]) | |
| a | |
| b <- exprs (a) | |
| c <- pData (a) | |
| a$platform_id | 
# 下载、探索、数据整理
本次数据采用《Glioblastoma cell differentiation trajectory predicts the immunotherapy response and overall survival of patients》中的单细胞数据进行分析(GSE 号:84465)


| ### 1、下载、探索、整理数据 ---- | |
| ## 1.1 下载、探索数据 | |
| #https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE84465 ## 数据来源 | |
| sessionInfo () | 

# 读取数据
| a <- read.table ("../rawdata/GSE84465_GBM_All_data.csv.gz") | |
| a [1:4,1:4] | |
| # 行名为 symbol ID | |
| # 列名为 sample,看上去像是两个元素的组合。 | |
| summary (a [,1:4]) | |
| boxplot (a [,1:4]) | |
| head (rownames (a)) | |
| tail (rownames (a),10) | |
| # 可以看到原文的 counts 矩阵来源于 htseq 这个计数软件,所以有一些不是基因的行需要剔除: | |
| # "no_feature" "ambiguous" "too_low_aQual" "not_aligned" "alignment_not_unique" | |
| tail (a [,1:4],10) | |
| a=a [1:(nrow (a)-5),] | |
| # 原始 counts 数据 | |
| #3,589 cells of 4 human primary GBM samples, accession number GSE84465 | |
| #2,343 cells from tumor cores and 1,246 cells from peripheral regions | |
| b <- read.table ("../rawdata/SraRunTable.txt", | |
| sep = ",", header = T) | |
| b [1:4,1:4] | |
| table (b$Patient_ID) # 4 human primary GBM samples | |
| table (b$TISSUE) # tumor cores and peripheral regions | |
| table (b$TISSUE,b$Patient_ID) | 
# 整理数据
可以发现两个数据不对应,a 矩阵行名(sample)并非为 GSM 编号,而主要是由相应的 plate_id 与 Well 组合而成
| ## 1.2 整理数据 | |
| # tumor and peripheral 分组信息 | |
| head (colnames (a)) | |
| [1] "X1001000173.G8" "X1001000173.D4" "X1001000173.B4" "X1001000173.A2" "X1001000173.E2" "X1001000173.F6" | |
| head (b$plate_id) | |
| [1] 1001000173 1001000173 1001000173 1001000173 1001000173 1001000173 | |
| head (b$Well) | |
| #a 矩阵行名(sample)并非为 GSM 编号,而主要是由相应的 plate_id 与 Well 组合而成 | |
| b.group <- b [,c ("plate_id","Well","TISSUE","Patient_ID")] | |
| b.group$sample <- paste0 ("X",b.group$plate_id,".",b.group$Well) | |
| head (b.group) | |
| plate_id Well TISSUE Patient_ID sample | |
| 1 1001000173 G8 Tumor BT_S2 X1001000173.G8 | |
| 2 1001000173 D4 Tumor BT_S2 X1001000173.D4 | |
| 3 1001000173 B4 Tumor BT_S2 X1001000173.B4 | |
| 4 1001000173 A2 Tumor BT_S2 X1001000173.A2 | |
| 5 1001000173 E2 Tumor BT_S2 X1001000173.E2 | |
| 6 1001000173 F6 Tumor BT_S2 X1001000173.F6 | |
| identical (colnames (a),b.group$sample) | |
| # 筛选 tumor cell | |
| index <- which (b.group$TISSUE=="Tumor") | |
| length (index) | |
| group <- b.group [index,] #筛选的是行 | |
| head (group) | |
| a.filt <- a [,index] #筛选的是列 | |
| dim (a.filt) | |
| identical (colnames (a.filt),group$sample) | 
经过筛选之后得到所有的肿瘤细胞的表达矩阵
