# 目录
1.Module 1 - Introduction to RNA sequencing
2.Module 2 - RNA-seq Alignment and Visualization
3.Module 3 - Expression and Differential Expression
4.Module 4 - Isoform Discovery and Alternative Expression
- Reference Guided Transcript Assembly
- de novo Transcript Assembly
- Transcript Assembly Merge
- Differential Splicing
- Splicing Visualization
5.Module 5 - De novo transcript reconstruction
6.Module 6 - Functional Annotation of Transcripts
# 3.2 Differential Expression
# Ballgown
用 Ballgown 比较肿瘤和正常情况。详情请参考手册:
https://www.bioconductor.org/packages/release/bioc/html/ballgown.html `
使用所有复制,对已知 (仅参考模式) 转录本进行 UHR 与 HBR 比较:
首先创建一个文件,列出我们的 6 个表达式文件,然后查看该文件,检查这些结果:
printf "\"ids\",\"type\",\"path\"\n\"UHR_Rep1\",\"UHR\",\"UHR_Rep1\"\n\"UHR_Rep2\",\"UHR\",\"UHR_Rep2\"\n\"UHR_Rep3\",\"UHR\",\"UHR_Rep3\"\n\"HBR_Rep1\",\"HBR\",\"HBR_Rep1\"\n\"HBR_Rep2\",\"HBR\",\"HBR_Rep2\"\n\"HBR_Rep3\",\"HBR\",\"HBR_Rep3\"\n" > UHR_vs_HBR.csv | |
cat UHR_vs_HBR.csv | |
R |
#Jason Walker, jason.walker[AT]wustl.edu | |
#Malachi Griffith, mgriffit[AT]wustl.edu | |
#Obi Griffith, obigriffith[AT]wustl.edu | |
#The Genome McDonnell Institute, Washington University School of Medicine | |
#R tutorial for Informatics for RNA-sequence Analysis workshops | |
library(ballgown) | |
library(genefilter) | |
library(dplyr) | |
library(devtools) | |
# Load phenotype data from a file we saved in the current working directory | |
pheno_data = read.csv("UHR_vs_HBR.csv") | |
# Load ballgown data structure and save it to a variable "bg" | |
bg = ballgown(samples=as.vector(pheno_data$path), pData=pheno_data) | |
# Display a description of this object | |
bg | |
# Load all attributes including gene name | |
bg_table = texpr(bg, 'all') | |
bg_gene_names = unique(bg_table[, 9:10]) | |
# Save the ballgown object to a file for later use | |
save(bg, file='bg.rda') | |
# Perform differential expression (DE) analysis with no filtering | |
results_transcripts = stattest(bg, feature="transcript", covariate="type", getFC=TRUE, meas="FPKM") | |
results_genes = stattest(bg, feature="gene", covariate="type", getFC=TRUE, meas="FPKM") | |
results_genes = merge(results_genes, bg_gene_names, by.x=c("id"), by.y=c("gene_id")) | |
# Save a tab delimited file for both the transcript and gene results | |
write.table(results_transcripts, "UHR_vs_HBR_transcript_results.tsv", sep="\t", quote=FALSE, row.names = FALSE) | |
write.table(results_genes, "UHR_vs_HBR_gene_results.tsv", sep="\t", quote=FALSE, row.names = FALSE) | |
# Filter low-abundance genes. Here we remove all transcripts with a variance across the samples of less than one | |
bg_filt = subset (bg,"rowVars(texpr(bg)) > 1", genomesubset=TRUE) | |
# Load all attributes including gene name | |
bg_filt_table = texpr(bg_filt , 'all') | |
bg_filt_gene_names = unique(bg_filt_table[, 9:10]) | |
# Perform DE analysis now using the filtered data | |
results_transcripts = stattest(bg_filt, feature="transcript", covariate="type", getFC=TRUE, meas="FPKM") | |
results_genes = stattest(bg_filt, feature="gene", covariate="type", getFC=TRUE, meas="FPKM") | |
results_genes = merge(results_genes, bg_filt_gene_names, by.x=c("id"), by.y=c("gene_id")) | |
# Output the filtered list of genes and transcripts and save to tab delimited files | |
write.table(results_transcripts, "UHR_vs_HBR_transcript_results_filtered.tsv", sep="\t", quote=FALSE, row.names = FALSE) | |
write.table(results_genes, "UHR_vs_HBR_gene_results_filtered.tsv", sep="\t", quote=FALSE, row.names = FALSE) | |
# Identify the significant genes with p-value < 0.05 | |
sig_transcripts = subset(results_transcripts, results_transcripts$pval<0.05) | |
sig_genes = subset(results_genes, results_genes$pval<0.05) | |
# Output the signifant gene results to a pair of tab delimited files | |
write.table(sig_transcripts, "UHR_vs_HBR_transcript_results_sig.tsv", sep="\t", quote=FALSE, row.names = FALSE) | |
write.table(sig_genes, "UHR_vs_HBR_gene_results_sig.tsv", sep="\t", quote=FALSE, row.names = FALSE) | |
# Exit the R session | |
quit(save="no") |
运行结束后查看结果
head UHR_vs_HBR_gene_results.tsv |
这条染色体上有多少个基因?
grep -v feature UHR_vs_HBR_gene_results.tsv | wc -l |
在 UHR 或 HBR 中有多少通过过滤?
grep -v feature UHR_vs_HBR_gene_results_filtered.tsv | wc -l |
在这条染色体上发现了多少差异表达基因 (p 值 < 0.05)?
grep -v feature UHR_vs_HBR_gene_results_sig.tsv | wc -l |
展示前 20 个 DE 基因。看看 IGV 中的一些基因 —— 它们有意义吗?
grep -v feature UHR_vs_HBR_gene_results_sig.tsv | sort -rnk 3 | head -n 20 #Higher abundance in UHR | |
grep -v feature UHR_vs_HBR_gene_results_sig.tsv | sort -nk 3 | head -n 20 #Higher abundance in HBR |
将 P<0.05 的所有基因保存到一个新文件中。
grep -v feature UHR_vs_HBR_gene_results_sig.tsv | cut -f 6 | sed 's/\"//g' > DE_genes.txt | |
head DE_genes.txt |
# edgeR
mkdir -p de/htseq_counts | |
cd de/htseq_counts | |
perl -ne 'if ($_ =~ /gene_id\s\"(ENSG\S+)\"\;/) { $id = $1; $name = undef; if ($_ =~ /gene_name\s\"(\S+)"\;/) { $name = $1; }; }; if ($id && $name) {print "$id\t$name\n";} if ($_=~/gene_id\s\"(ERCC\S+)\"/){print "$1\t$1\n";}' $RNA_REF_GTF | sort | uniq > ENSG_ID2Name.txt | |
head ENSG_ID2Name.txt |
cut -f 1 ENSG_ID2Name.txt | sort | uniq | wc | |
cut -f 2 ENSG_ID2Name.txt | sort | uniq | wc | |
cut -f 2 ENSG_ID2Name.txt | sort | uniq -c | sort -r | head |
#Malachi Griffith, mgriffit[AT]wustl.edu | |
#Obi Griffith, obigriffith[AT]wustl.edu | |
#The McDonnell Genome Institute, Washington University School of Medicine | |
#R tutorial for Informatics for RNA-sequence Analysis workshops | |
####################### | |
# Loading Data into R # | |
####################### | |
#Set working directory where output will go | |
working_dir = "path_to/de/htseq_counts" | |
setwd(working_dir) | |
#Read in gene mapping | |
mapping=read.table("~/workspace/rnaseq/de/htseq_counts/ENSG_ID2Name.txt", header=FALSE, stringsAsFactors=FALSE, row.names=1) | |
# Read in count matrix | |
rawdata=read.table("~/workspace/rnaseq/expression/htseq_counts/gene_read_counts_table_all_final.tsv", header=TRUE, stringsAsFactors=FALSE, row.names=1) | |
# Check dimensions | |
dim(rawdata) | |
# Require at least 25% of samples to have count > 25 | |
quant <- apply(rawdata,1,quantile,0.75) | |
keep <- which((quant >= 25) == 1) | |
rawdata <- rawdata[keep,] | |
dim(rawdata) | |
################# | |
# Running edgeR # | |
################# | |
# load edgeR | |
library('edgeR') | |
# make class labels | |
class <- factor( c( rep("UHR",3), rep("HBR",3) )) | |
# Get common gene names | |
genes=rownames(rawdata) | |
gene_names=mapping[genes,1] | |
# Make DGEList object | |
y <- DGEList(counts=rawdata, genes=genes, group=class) | |
nrow(y) | |
# TMM Normalization | |
y <- calcNormFactors(y) | |
# Estimate dispersion | |
y <- estimateCommonDisp(y, verbose=TRUE) | |
y <- estimateTagwiseDisp(y) | |
# Differential expression test | |
et <- exactTest(y) | |
# Print top genes | |
topTags(et) | |
# Print number of up/down significant genes at FDR = 0.05 significance level | |
summary(de <- decideTestsDGE(et, p=.05)) | |
detags <- rownames(y)[as.logical(de)] | |
# Output DE genes | |
# Matrix of significantly DE genes | |
mat <- cbind( | |
genes,gene_names, | |
sprintf('%0.3f',log10(et$table$PValue)), | |
sprintf('%0.3f',et$table$logFC) | |
)[as.logical(de),] | |
colnames(mat) <- c("Gene", "Gene_Name", "Log10_Pvalue", "Log_fold_change") | |
# Order by log fold change | |
o <- order(et$table$logFC[as.logical(de)],decreasing=TRUE) | |
mat <- mat[o,] | |
# Save table | |
write.table(mat, file="DE_genes.txt", quote=FALSE, row.names=FALSE, sep="\t") | |
#To exit R type the following | |
quit(save="no") |
与其他结果比较
cat DE_genes.txt | |
cat /de/htseq_counts/DE_genes.txt |
cut -f 1 DE_genes.txt | sort > ballgown_DE_gene_symbols.txt | |
cut -f 2 de/htseq_counts/DE_genes.txt | sort > htseq_counts_edgeR_DE_gene_symbols.txt |