題目出自【生信技能樹】公眾號文章:2011年的表達芯片分析和2019年的區(qū)別
要求:走GEO標準分析流程分析GSE27447
代碼主要復制于Jimmy大佬的github:https://github.com/jmzeng1314/GEO/tree/master/GSE42872_main
主要步驟:
step1 - 讀入GSE27447數(shù)據(jù)
step2 - 檢查表達矩陣
step3 - 差異分析
step4 - GO/KEGG數(shù)據(jù)庫注釋
step5 - GSEA基因集富集分析(施工中)
step6 - GSVA基因集變異分析(施工中)
step1 - 讀入GSE27447數(shù)據(jù)
rm(list=ls())
options(stringsAsFactors = F)
library(AnnoProbe) # 生信技能樹出品的GEO數(shù)據(jù)下載利器
library(Biobase)
gset <- geoChina("GSE27447")
gse27447 <- gset[[1]]
exprSet <- exprs(gse27447)
boxplot(exprSet,las=2)
根據(jù)boxplot可以得出,原始數(shù)據(jù)需要log2處理。
exprSet <- log2(exprSet+1)
boxplot(exprSet,las=2)
exprSet[1:4,1:4]
# GSM678364 GSM678365 GSM678366 GSM678367
#7892501 3.221012 4.502222 1.271969 1.849167
#7892502 6.909593 7.305615 6.956150 7.078983
#7892503 5.125911 8.659878 5.395697 5.660467
#7892504 11.045603 10.060453 10.845937 11.145225
#行名是探針,列名是樣本,中間的數(shù)據(jù)是某樣本中某探針的表達量。
gse27447@annotation
#[1] "GPL6244"
checkGPL(gse27447@annotation)
#[1] TRUE
# checkGPL()
結果是TRUE說明AnnoProbe包中存在"GPL6244" 平臺數(shù)據(jù),于是可以使用另外兩個超級厲害的函數(shù)idmap()
和filterEM()
,得到探針對應的基因名,然后把表達矩陣的探針名轉換為基因名。
ids <- idmap(gse27447@annotation)
dat <- filterEM(exprSet,ids)
dim(dat)
#[1] 18837 19
dat <- dat[order(rownames(dat)),]
獲取臨床信息,從中進一步獲取分組信息:
pd <- pData(gse27447)
library(stringr)
group_list=str_split(pd$title,' ',simplify = T)[,1]
table(group_list)
#group_list
#non-triple triple
# 14 5
所以是5個三陰乳腺癌樣本,14個非三陰乳腺癌樣本。
把表達矩陣和分組信息保存到Rdata文件。
save(dat,group_list,file = 'step1-output.Rdata')
step2 - 檢查表達矩陣
rm(list = ls()) ##清空Enviroment
options(stringsAsFactors = F)
load('step1-output.Rdata') ##讀入第一步得到的數(shù)據(jù),然后檢查數(shù)據(jù)
table(group_list)
#group_list
#non-triple triple
# 14 5
dat[1:4,1:4]
# GSM678364 GSM678365 GSM678366 GSM678367
#A1CF 6.654092 6.074542 6.258994 6.609746
#A2M 10.266259 11.034813 10.494956 11.450283
#A2ML1 6.375017 6.118954 5.683851 4.278468
#A3GALT2 4.424096 4.144055 4.906982 5.065266
2.1 樣本相關性分析
dim(dat)
#[1] 18837 19
ac=data.frame(groups=group_list)
rownames(ac)=colnames(dat) #把ac的行名給到n的列名,即對每一個探針標記上分組信息
pheatmap(cor(dat),annotation_col = ac)
2.2 主成分分析(PCA)
# 我們是對樣本做主成分分析,所以要求行名是樣本名,列名是探針(基因)名,所以需要對表達矩陣進行轉置
dat=t(dat)
dat=as.data.frame(dat)
dat=cbind(dat,group_list) ##給表達矩陣加上分組信息
library("FactoMineR")
library("factoextra")
dat.pca <- PCA(dat[,-ncol(dat)], graph = FALSE)#dat最后一列是group_list。pca分析需要一個純數(shù)值矩陣,所以將dat最后一列去掉以后賦值給dat.pca
(fviz_pca_ind(dat.pca,geom.ind = "point",
col.ind = dat$group_list,
palette = c("#00AFBB", "#E7B800"),
addEllipses = TRUE,
legend.title = "Groups"
))
ggsave('all_samples_PCA.png')
可以看到,兩組樣本并沒有分開╮(╯▽╰)╭
不過大家都是腫瘤樣本,相似性較高也可以理解。
2.3 取sd最大的1000個基因畫熱圖
rm(list = ls()) ## 魔幻操作,一鍵清空~
load(file = 'step1-output.Rdata')
dat[1:4,1:4]
cg=names(tail(sort(apply(dat,1,sd)),1000))
##使用 apply()
函數(shù)計算獲取表達矩陣dat每行(即每個基因表達量)的方差,從小到大排序后,取最大的1000個方差,獲取其對應的基因名,賦值給變量cg
##然后就可以對這1000個基因畫熱圖
library(pheatmap)
pheatmap(dat[cg,],show_colnames =F,show_rownames = F)
優(yōu)化一下,比如對dat[cg,]
歸一化:
n=t(scale(t(dat[cg,])))
n[n>2]=2
n[n< -2]= -2
n[1:4,1:4]
pheatmap(n,show_colnames =F,show_rownames = F)
給樣本添加分組信息:
ac=data.frame(g=group_list)
rownames(ac)=colnames(n) #把ac的行名給到n的列名,即對每一個樣本標記上分組信息
(heatmap <- pheatmap(n,show_colnames =F,show_rownames = F,
annotation_col=ac))
library(ggplot2)
ggsave(filename = 'heatmap_top1000_sd.png',heatmap)
step3 - 差異分析
3.1 使用limma包做差異分析
參考:【生信菜鳥團】用limma對芯片數(shù)據(jù)做差異分析
rm(list = ls()) ## 魔幻操作,一鍵清空~
options(stringsAsFactors = F)
load(file = 'step1-output.Rdata') ##載入數(shù)據(jù)
dat[1:4,1:4]
# GSM678364 GSM678365 GSM678366 GSM678367
#A1CF 6.654092 6.074542 6.258994 6.609746
#A2M 10.266259 11.034813 10.494956 11.450283
#A2ML1 6.375017 6.118954 5.683851 4.278468
#A3GALT2 4.424096 4.144055 4.906982 5.065266
group_list[group_list=='non-triple'] <- 'non_triple'
table(group_list)
#group_list
#non_triple triple
# 14 5
使用箱線圖boxplot
肉眼觀察一下前幾個基因的數(shù)據(jù)分布。
boxplot(unlist(dat[1,])~group_list)
自定義一個函數(shù),用
ggpubr包
畫漂亮點的boxplot
library(ggpubr)
bp=function(g){ #定義一個函數(shù)g,函數(shù)為{}里的內容
df=data.frame(gene=g,stage=group_list)
p <- ggboxplot(df, x = "stage", y = "gene",
color = "stage", palette = "jco",
add = "jitter")
p + stat_compare_means()
}
p1 <- bp(as.numeric(dat[1,]))
p2 <- bp(as.numeric(dat[2,]))
p3 <- bp(as.numeric(dat[3,]))
p4 <- bp(as.numeric(dat[4,]))
library(patchwork)
(bp_4 <- p1|p2|p3|p4)
以上也只屬于check data,下面開始真正的差異分析
library(limma)
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
exprSet=dat
rownames(design)=colnames(exprSet)
contrast.matrix<-makeContrasts("triple-non_triple",levels = design)
contrast.matrix ##這個矩陣聲明,我們要把 triple 組跟 non_triple 進行差異分析比較
# Contrasts
#Levels triple-non_triple
# non_triple -1
# triple 1
自定義函數(shù)DEG,功能是做差異分析。
DEG <- function(exprSet,design,contrast.matrix){
fit <- lmFit(exprSet,design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
tempOutput = topTable(fit2, coef=1, n=Inf)
nrDEG = na.omit(tempOutput)
return(nrDEG)
}
把數(shù)據(jù)喂給DEG函數(shù),獲取差異分析結果,并保存到Rdata
deg <- DEG(exprSet,design,contrast.matrix)
head(deg)
# logFC AveExpr t P.Value adj.P.Val B
#ARFGEF3 -1.933996 9.224776 -5.893853 8.360630e-06 0.06003476 2.930433
#NEK11 -1.644951 6.944406 -5.882179 8.582134e-06 0.06003476 2.911194
#HOXB3 -1.459656 7.038370 -5.696430 1.303732e-05 0.06003476 2.601988
#EGR2 2.140287 6.380807 5.680934 1.350247e-05 0.06003476 2.575932
#C3orf14 -2.110725 6.862692 -5.555574 1.794713e-05 0.06003476 2.363696
# DNAH5 -3.742125 5.909818 -5.478036 2.141867e-05 0.06003476 2.231157
save(deg,file = 'deg.Rdata')
手動檢查deg前幾個基因表達情況
bp(as.numeric(dat[rownames(dat)=='ARFGEF3',]))
可以看到,與non-triple組比較,triple組表達ARFGEF3顯著下調,與deg結果一致。
3.2 差異分析結果的可視化
3.2.1 火山圖
設置P.Value
和logFC
的閾值,將基因分為up,down,stable三類
df <- deg
colnames(deg)
df$v <- -log10(df$P.Value)
p_thred <- 0.05
logFC_thred <- 1.5
df$groups = ifelse(df$P.Value > p_thred, "stable",
ifelse(df$logFC > logFC_thred, "up",
ifelse(df$logFC < -logFC_thred, "down",
"stable")))
table(df$groups)
# down stable up
# 164 18554 119
library(ggplot2)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_thred,3),
'\nThe number of up gene is ',nrow(df[df$groups =='up',]) ,
'\nThe number of down gene is ',nrow(df[df$groups =='down',])
)
p <- ggplot(data = df, aes(x = logFC, y = v)) +
geom_point(alpha=0.4, size=1.75,
aes(color=groups)) +
scale_color_manual(values=c("blue", "grey","red")) +
geom_vline(xintercept=c(-logFC_thred,logFC_thred),lty=4,col="black",lwd=0.8) +
geom_hline(yintercept = -log10(p_thred),lty=4,col="black",lwd=0.8) +
labs(x="logFC",y="-log10(P.value)")+
ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
theme_bw()
print(p)
3.2.2 熱圖
load(file = 'step1-output.Rdata')
dat[1:4,1:4]
# GSM678364 GSM678365 GSM678366 GSM678367
#A1CF 6.654092 6.074542 6.258994 6.609746
#A2M 10.266259 11.034813 10.494956 11.450283
#A2ML1 6.375017 6.118954 5.683851 4.278468
#A3GALT2 4.424096 4.144055 4.906982 5.065266
table(group_list)
#group_list
#non-triple triple
# 14 5
x=deg$logFC #deg取logFC這列并將其重新賦值給x
names(x)=rownames(deg) #deg取probe_id這列,并將其作為名字給x
cg=c(names(head(sort(x),100)),#對x進行從小到大排列,取前100及后100,并取其對應的探針名,作為向量賦值給cg
names(tail(sort(x),100)))
n=t(scale(t(dat[cg,])))
n[n>2]=2
n[n< -2]= -2
ac=data.frame(groups=group_list)
rownames(ac)=colnames(n)
pheatmap(n,show_colnames =F,show_rownames = F,
annotation_col=ac)
step4 - GO/KEGG數(shù)據(jù)庫注釋
參考:【生信技能樹】公眾號文章:為R包寫一本書(像Y叔致敬)
4.0 整理數(shù)據(jù)
設置P.Value和logFC的閾值,將基因分為UP
,DOWN
,stable
3組, deg矩陣新建一列g
儲存基因分組信息。
不同的閾值,篩選到的差異基因數(shù)量就不一樣,后面的超幾何分布檢驗結果就大相徑庭。
rm(list = ls()) ## 魔幻操作,一鍵清空~
load(file = 'deg.Rdata')
head(deg)
# logFC AveExpr t P.Value adj.P.Val B
#ARFGEF3 -1.933996 9.224776 -5.893853 8.360630e-06 0.06003476 2.930433
#NEK11 -1.644951 6.944406 -5.882179 8.582134e-06 0.06003476 2.911194
#HOXB3 -1.459656 7.038370 -5.696430 1.303732e-05 0.06003476 2.601988
#EGR2 2.140287 6.380807 5.680934 1.350247e-05 0.06003476 2.575932
#C3orf14 -2.110725 6.862692 -5.555574 1.794713e-05 0.06003476 2.363696
#DNAH5 -3.742125 5.909818 -5.478036 2.141867e-05 0.06003476 2.231157
p_thred <- 0.05
logFC_thred <- 1.5
deg$g=ifelse(deg$P.Value>p_thred,'stable',
ifelse( deg$logFC > logFC_thred,'UP',
ifelse( deg$logFC < -logFC_thred,'DOWN','stable') )
)
table(deg$g)
# DOWN stable UP
# 164 18554 119
head(deg)
# logFC AveExpr t P.Value adj.P.Val B g
#ARFGEF3 -1.933996 9.224776 -5.893853 8.360630e-06 0.06003476 2.930433 DOWN
#NEK11 -1.644951 6.944406 -5.882179 8.582134e-06 0.06003476 2.911194 DOWN
#HOXB3 -1.459656 7.038370 -5.696430 1.303732e-05 0.06003476 2.601988 stable
#EGR2 2.140287 6.380807 5.680934 1.350247e-05 0.06003476 2.575932 UP
#C3orf14 -2.110725 6.862692 -5.555574 1.794713e-05 0.06003476 2.363696 DOWN
#DNAH5 -3.742125 5.909818 -5.478036 2.141867e-05 0.06003476 2.231157 DOWN
富集分析需要基因的ENTREZID
,使用Y叔神作clusterProfiler包
里的bitr()
函數(shù)獲取對應關系,deg矩陣轉存為DEG,并加上ENTREZID
列
deg$SYMBOL=rownames(deg)
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
df <- bitr(unique(deg$SYMBOL), fromType = "SYMBOL",
toType = c( "ENTREZID"),
OrgDb = org.Hs.eg.db)
head(df)
# SYMBOL ENTREZID
#1 ARFGEF3 57221
#2 NEK11 79858
#3 HOXB3 3213
#4 EGR2 1959
#5 C3orf14 57415
#6 DNAH5 1767
DEG=deg
head(DEG)
# logFC AveExpr t P.Value adj.P.Val B g SYMBOL
#ARFGEF3 -1.933996 9.224776 -5.893853 8.360630e-06 0.06003476 2.930433 DOWN ARFGEF3
#NEK11 -1.644951 6.944406 -5.882179 8.582134e-06 0.06003476 2.911194 DOWN NEK11
#HOXB3 -1.459656 7.038370 -5.696430 1.303732e-05 0.06003476 2.601988 stable HOXB3
#EGR2 2.140287 6.380807 5.680934 1.350247e-05 0.06003476 2.575932 UP EGR2
#C3orf14 -2.110725 6.862692 -5.555574 1.794713e-05 0.06003476 2.363696 DOWN C3orf14
#DNAH5 -3.742125 5.909818 -5.478036 2.141867e-05 0.06003476 2.231157 DOWN DNAH5
DEG=merge(DEG,df,by='SYMBOL')
head(DEG)
# SYMBOL logFC AveExpr t P.Value adj.P.Val B g ENTREZID
#1 A1CF -0.24279450 6.481373 -0.78084875 0.44383564 0.8444129 -5.353833 stable 29974
#2 A2M 0.74751848 11.069348 1.89303707 0.07258966 0.5827869 -4.132066 stable 2
#3 A2ML1 0.99851595 4.933066 0.92880174 0.36382423 0.8085547 -5.242500 stable 144568
#4 A3GALT2 0.03320466 4.269750 0.08468268 0.93333708 0.9894294 -5.625240 stable 127550
#5 A4GALT -0.28893317 6.883433 -0.71439103 0.48305962 0.8637983 -5.397984 stable 53947
#6 A4GNT 0.43189200 6.202285 1.60275087 0.12432137 0.6497896 -4.528644 stable 51146
save(DEG,file = 'anno_DEG.Rdata')
分別取出上調基因和下調基因,合并為差異基因
gene_up <- DEG[DEG$g == 'UP','ENTREZID']
gene_down <- DEG[DEG$g == 'DOWN','ENTREZID']
gene_diff <- c(gene_up,gene_down)
gene_all <- as.character(DEG[ ,'ENTREZID'] )
geneList <- DEG$logFC
names(geneList) <- DEG$ENTREZID
geneList <- sort(geneList,decreasing = T)
4.1 KEGG
4.1.1 上調基因集
library(ggplot2)
library(clusterProfiler)
kk.up <- enrichKEGG(gene = gene_up,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff = 0.9)
head(kk.up)[,1:6]
# ID Description GeneRatio BgRatio pvalue p.adjust
#hsa04640 hsa04640 Hematopoietic cell lineage 7/53 88/7130 3.334829e-06 0.000506894
#hsa04662 hsa04662 B cell receptor signaling pathway 6/53 78/7130 2.152723e-05 0.001636069
#hsa04062 hsa04062 Chemokine signaling pathway 6/53 175/7130 1.769682e-03 0.081549397
#hsa05340 hsa05340 Primary immunodeficiency 3/53 35/7130 2.146037e-03 0.081549397
#hsa04921 hsa04921 Oxytocin signaling pathway 5/53 147/7130 4.506704e-03 0.133247928
#hsa04064 hsa04064 NF-kappa B signaling pathway 4/53 95/7130 5.259787e-03 0.133247928
KEGG分析上調基因集結果可視化:
1)上調基因所屬信號通路(氣泡圖)
dotplot(kk.up)
2)查看第一個結果hsa04640的信號通路示意圖:
browseKEGG(kk.up, 'hsa04640')
3)上調基因所屬信號通路(條帶圖)
(gg_barplot <- barplot(kk.up,showCategory=20))
4)通路與基因之間的關系可視化:
cnetplot(kk.up, categorySize="pvalue", foldChange=geneList,colorEdge = TRUE)
5)通路與通路之間的關系圖:
emapplot(kk.up)
6)熱圖展現(xiàn)通路與基因之間的關系:
heatplot(kk.up)
4.1.2 下調基因集
同樣的方法計算下調基因集KEGG
kk.down <- enrichKEGG(gene = gene_down,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff = 0.9)
head(kk.down)[,1:6]
# ID Description GeneRatio BgRatio pvalue p.adjust
#hsa04915 hsa04915 Estrogen signaling pathway 6/62 130/7130 0.0008719172 0.1464821
#hsa00910 hsa00910 Nitrogen metabolism 2/62 16/7130 0.0082547476 0.4221930
#hsa04921 hsa04921 Oxytocin signaling pathway 5/62 147/7130 0.0087691281 0.4221930
#hsa04934 hsa04934 Cushing syndrome 5/62 152/7130 0.0100522154 0.4221930
#hsa04360 hsa04360 Axon guidance 5/62 177/7130 0.0184322243 0.4366278
#hsa04927 hsa04927 Cortisol synthesis and secretion 3/62 65/7130 0.0186643448 0.4366278
dotplot(kk.down );ggsave('kk.down.dotplot.png')
4.1.2 差異基因集
kk.diff <- enrichKEGG(gene = gene_diff,
organism = 'hsa',
pvalueCutoff = 0.05)
head(kk.diff)[,1:6]
dotplot(kk.diff );ggsave('kk.diff.dotplot.png')
合并上下調基因后,幾乎分析不出信號通路。╮(╯▽╰)╭
4.1.3 Pathway Enrichment
kegg_diff_dt <- as.data.frame(kk.diff)
kegg_down_dt <- as.data.frame(kk.down)
kegg_up_dt <- as.data.frame(kk.up)
down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.05,];down_kegg$group=-1
up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.05,];up_kegg$group=1
kegg_plot <- function(up_kegg,down_kegg){
dat=rbind(up_kegg,down_kegg)
colnames(dat)
dat$pvalue = -log10(dat$pvalue)
dat$pvalue=dat$pvalue*dat$group
dat=dat[order(dat$pvalue,decreasing = F),]
g_kegg<- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group)) +
geom_bar(stat="identity") +
scale_fill_gradient(low="blue",high="red",guide = FALSE) +
scale_x_discrete(name ="Pathway names") +
scale_y_continuous(name ="log10P-value") +
coord_flip() + theme_bw()+theme(plot.title = element_text(hjust = 0.5))+
ggtitle("Pathway Enrichment")
}
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
ggsave(g_kegg,filename = 'kegg_up_down.png')
這張圖其實就是上調基因和下調基因pathway條帶圖合并,可以看到
Oxytocin signaling pathway
同時存在于上調基因和下調基因pathway,剛好跟差異基因集的分析結果一樣。
4.1.4 GSEA
kk_gse <- gseKEGG(geneList = geneList,
organism = 'hsa',
nPerm = 1000,
minGSSize = 120,
pvalueCutoff = 0.9,
verbose = FALSE)
head(kk_gse)[,1:6]
gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,];down_kegg$group=-1
up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,];up_kegg$group=1
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
ggsave(g_kegg,filename = 'kegg_up_down_gsea.png')
4.2 GO
g_list=list(gene_up=gene_up,
gene_down=gene_down,
gene_diff=gene_diff)
go_enrich_results <- lapply( g_list , function(gene) {
lapply( c('BP','MF','CC') , function(ont) {
cat(paste('Now process ',ont ))
ego <- enrichGO(gene = gene,
universe = gene_all,
OrgDb = org.Hs.eg.db,
ont = ont ,
pAdjustMethod = "BH",
pvalueCutoff = 0.99,
qvalueCutoff = 0.99,
readable = TRUE)
print( head(ego) )
return(ego)
})
})
save(go_enrich_results,file = 'go_enrich_results.Rdata')
load(file = 'go_enrich_results.Rdata')
n1= c('gene_up','gene_down','gene_diff')
n2= c('BP','MF','CC')
for (i in 1:3){
for (j in 1:3){
fn=paste0('dotplot_',n1[i],'_',n2[j],'.png')
cat(paste0(fn,'\n'))
png(fn,res=150,width = 1080)
dotplot(go_enrich_results[[i]][[j]],title=paste0('dotplot_',n1[i],'_',n2[j])) %>% print()
dev.off()
}
}
未完待續(xù)……
step5 - GSEA基因集富集分析
step6 - GSVA基因集變異分析