從GEO數(shù)據(jù)下載到limma差異分析,再到topGO富集分析

一個(gè)失敗案例,不要點(diǎn)進(jìn)來(lái)


GSE2034 topGO

1. DEG

1.1 下載GEO數(shù)據(jù)

library(GEOquery)
f <- 'GSE2034.Rdata'
if(!file.exists(f)){
  gset <- getGEO('GSE2034',destdir = '.',
                 AnnotGPL = F,
                 getGPL = F)
  save(gset,file=f)
}
load('GSE2034.Rdata')

1.2 表達(dá)矩陣

eset <- gset[[1]]
expmat <- exprs(eset)

## 檢查數(shù)據(jù)是否已經(jīng)經(jīng)過(guò)log轉(zhuǎn)換,并進(jìn)行l(wèi)og2轉(zhuǎn)換
ex <- expmat
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))

LogC <- (qx[5] > 100) ||
  (qx[6]-qx[1] > 50 && qx[2] > 0) ||
  (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)

if (LogC) { ex[which(ex <= 0)] <- NaN
expmat <- log2(ex)
print("log2 transform finished")}else{print("log2 transform not needed")}
# [1] "log2 transform finished"

fit <- lmFit(expmat, design)
fit1 <- contrasts.fit(fit, contrast.matrix) 
fit2 <- eBayes(fit, 0.01)

DEGstb <- topTable(fit2, coef=1, n=Inf)
DEGstb <- DEGstb[order(DEGstb[,4], decreasing = F),]

before:

after:

1.3 設(shè)計(jì)矩陣和對(duì)比矩陣

pdata <- pData(eset)
group_list <- pdata$`bone relapses (1=yes, 0=no):ch1`
library(limma)
design <- model.matrix(~0+factor(group_list))
colnames(design) <- c("free", "relapses")
rownames(design) <- colnames(expmat)
contrast.matrix <- makeContrasts('free-relapses',levels = design) 

1.4 差異分析

fit <- lmFit(expmat, design)
fit1 <- contrasts.fit(fit, contrast.matrix) 
fit2 <- eBayes(fit1)
DEGstb <- topTable(fit2, coef=1, n=Inf)
DEGmtx <- na.omit(DEGstb)
head(DEGmtx)
#                  logFC   AveExpr         t      P.Value    adj.P.Val
# 209835_x_at  0.7010262 10.616209  6.251930 1.460149e-09 3.253649e-05
# 212014_x_at  0.7264678 10.372145  5.927511 8.785625e-09 9.788505e-05
# 204490_s_at  0.4928252  9.788234  5.558565 6.206262e-08 4.609804e-04
# 209380_s_at -0.5373120 10.062837 -5.421342 1.253664e-07 6.983850e-04
# 209831_x_at  0.2666780 10.594035  5.228288 3.295703e-07 1.338433e-03
# 204489_s_at  0.5619588 10.302607  5.181790 4.142994e-07 1.338433e-03
#                     B result
# 209835_x_at 11.154503     UP
# 212014_x_at  9.526820     UP
# 204490_s_at  7.757191     UP
# 209380_s_at  7.121927   DOWN
# 209831_x_at  6.249843    NOT
# 204489_s_at  6.043635     UP

1.5 火山圖

logFC_cutoff <- with(DEGmtx, mean(abs(logFC)) + 
                       2*sd(abs(logFC)))

DEGdf$result <- ifelse(DEGdf$P.Value < 0.05 &
                                    abs(DEGdf$logFC) > logFC_cutoff,
                                  ifelse(DEGdf$logFC >logFC_cutoff,
                                         "UP", "DOWN"), "NOT")

title <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3), 
                '\nThe number of up gene is ',nrow(DEGmtx[DEGmtx$result =='UP',]) ,
                '\nThe number of down gene is ',nrow(DEGmtx[DEGmtx$result =='DOWN',]))

library(ggplot2)
ggplot(data=DEGmtx, aes(x=logFC, y=-log10(P.Value), color=result)) +
  geom_point(alpha=0.4, size=1.75) +
  theme_set(theme_set(theme_bw(base_size=20))) +
  xlab("log2 fold change") + ylab("-log10 p-value") +
  ggtitle( title ) + theme(plot.title = element_text(size=15,hjust = 0.5)) +
  scale_colour_manual(values = c('blue','black','red'))

1.6 PCA圖

pca_data <- prcomp(t(expmat),scale=TRUE)
pcx <- data.frame(pca_data$x)
pcr <- cbind(samples=rownames(pcx),group_list, pcx) 
ggplot(pcr, aes(PC1, PC2)) + geom_point(size=5, aes(color = group_list)) +
  geom_text(aes(label="samples"),hjust=-0.1, vjust=-0.3)

這個(gè)數(shù)據(jù)中的分組:bone relapses: 1=yes, 0=no

分別是加樣本名和不加樣本名的亞子:

1.7 熱圖

選擇&調(diào)整矩陣,設(shè)置各種參數(shù):

chosedgene <- rownames(DEGmtx)[1:50]
chosedmatx <- expmat[chosedgene,]      

re <- subset(pdata,pdata[,28]==1)
re <- row.names(re)
fr <- row.names(subset(pdata,pdata[,28]==0))
sampleorder <- as.vector(c(fr,re))

do_mtx <- t(scale(t(chosedmatx)))
do_mtx[do_mtx > 2] = 2
do_mtx[do_mtx < -2] = -2

do_mtx <- do_mtx[,sampleorder]

chosedprob <- row.names(do_mtx)
chosedsymbol <- AnnotationDbi::select(hgu133a.db, chosedprob,
                                      "SYMBOL",
                                      "PROBEID")[,2]
row.names(do_mtx) <- chosedsymbol

anno_col <- data.frame(sample=factor(rep(c("free", "relapses"), c(217, 69))))
row.names(anno_col) <- colnames(do_mtx)
ann_color <- list(sample=c(free = "#3a5fcd", relapses = "#cd0000"))

畫(huà)圖:

png(filename = "GSE2034_hm_nmlmatrix_nocluscol.png", height = 5000, width = 6000,
   res = 500, units = "px")

pheatmap::pheatmap(do_mtx, scale = "none", color = color,
                   annotation_col = anno_col,
                   annotation_colors = ann_color,
                   fontsize = 8, fontsize_row = 8, 
                   show_colnames = F,
                   cluster_cols = F,
                   main = "GSE2034: free vs relapses HEATMAP (delected outliers)",)
dev.off()

2. topGO

2.1 topGO: Predefined list of interesting genes

過(guò)表征分析 (over representation analysis, ORA)

Tests based on gene counts. This is the most popular family of tests, given that it only requires the presence of a list of interesting genes and nothing more. Tests like Fisher's exact test, Hypegeometric test and binomial test belong to this family. Draghici et al. (2006) .

2.1.1 構(gòu)建 topGOdata 對(duì)象

構(gòu)建 geneID2GO list

library(topGO)
library(hgu133a.db)
library(dplyr)

probe_id <- row.names(expmat)
entrez_id <- AnnotationDbi::select(hgu133a.db,
                                   probe_id,
                                   "ENTREZID",
                                   "PROBEID")[,2]

geneID2GOtb <- AnnotationDbi::select(hgu133a.db, entrez_id,
                                     "GO", "ENTREZID")
geneID2GOsb <- geneID2GOtb %>%
  group_by(ENTREZID) %>%
  summarise(GO=list(GO))
geneID2GO <- geneID2GOsb$GO
names(geneID2GO) <- rownames(geneID2GOsb)

構(gòu)建 geneList

myInterestingDEG <- subset(DEGdf, abs(DEGdf$logFC) > 1)
myInterestingDEG <- subset(myInterestingDEG, myInterestingDEG$adj.P.Val < 0.05)
logFC AveExpr t P.Value adj.P.Val B result
214777_at 1.621019 7.578387 4.723681 3.63E-06 0.005771 4.093824 UP
216365_x_at 1.340734 5.631291 4.599642 6.35E-06 0.007075 3.592027 UP
217157_x_at 1.02288 8.543867 4.564419 7.43E-06 0.007348 3.45161 UP
216984_x_at 1.482014 7.826338 4.56344 7.46E-06 0.007348 3.447722 UP
217378_x_at 1.740957 8.055472 4.500623 9.85E-06 0.008778 3.199637 UP
217258_x_at 1.217204 7.180579 4.485677 1.05E-05 0.009013 3.14105 UP
217148_x_at 1.236926 9.662857 4.468183 1.14E-05 0.009236 3.072682 UP
211798_x_at 1.129989 7.765948 4.411513 1.45E-05 0.009236 2.852795 UP
214669_x_at 1.146124 11.22506 4.382578 1.65E-05 0.009236 2.741463 UP
211644_x_at 1.51252 8.895729 4.353739 1.86E-05 0.00958 2.631127 UP
215176_x_at 1.554616 9.781743 4.350516 1.89E-05 0.00958 2.618838 UP
211637_x_at 1.319264 6.128292 4.291975 2.42E-05 0.01036 2.396959 UP
215121_x_at 1.279395 12.98246 4.247872 2.92E-05 0.011018 2.231533 UP
216576_x_at 1.355247 8.468197 4.198257 3.59E-05 0.011951 2.047218 UP
204540_at -1.06758 8.610066 -4.19044 3.71E-05 0.011982 2.018367 DOWN
216401_x_at 1.475812 7.711042 4.180749 3.86E-05 0.011983 1.982631 UP
207430_s_at -1.3399 6.097727 -4.15096 4.37E-05 0.012368 1.873294 DOWN
213201_s_at -1.27114 6.641464 -4.13499 4.66E-05 0.01267 1.814941 DOWN
215214_at 1.239997 7.376844 4.125793 4.84E-05 0.012763 1.781441 UP
204933_s_at 1.073296 4.540879 4.031533 7.10E-05 0.015467 1.441822 UP
217281_x_at 1.355866 5.417214 4.015399 7.58E-05 0.015784 1.384391 UP
214858_at -1.08985 6.455748 -4.00976 7.75E-05 0.015861 1.36435 DOWN
216510_x_at 1.132241 6.304655 3.99392 8.26E-05 0.016195 1.308245 UP
215379_x_at 1.123251 11.77926 3.943786 0.000101 0.016689 1.131931 UP
211634_x_at 1.014891 6.92074 3.93688 0.000104 0.016689 1.107799 UP
210297_s_at -1.201 6.705364 -3.93263 0.000105 0.016689 1.092959 DOWN
209138_x_at 1.136452 13.28617 3.9291 0.000107 0.016689 1.080658 UP
211645_x_at 1.244501 9.308405 3.889339 0.000125 0.017262 0.942694 UP
216491_x_at 1.292163 5.893802 3.845647 0.000148 0.018642 0.79254 UP
216560_x_at 1.278329 5.810675 3.778756 0.000192 0.020615 0.565602 UP
221286_s_at 1.10116 7.542805 3.777608 0.000192 0.020615 0.561739 UP
206391_at 1.082072 6.946425 3.505906 0.000528 0.031189 -0.32275 UP
205509_at -1.20626 8.869439 -3.48369 0.000572 0.032322 -0.39241 DOWN
209374_s_at 1.034312 10.40725 3.443591 0.000659 0.034236 -0.51714 UP
204623_at -1.04503 11.21622 -3.25463 0.001271 0.045188 -1.08699 DOWN
geneNames <- names(geneID2GO) 
myInterestingGenestb <- AnnotationDbi::select(hgu133a.db, row.names(myInterestingDEG),
                                            "ENTREZID", "PROBEID")

myInterestingGenes <- geneNames[match(myInterestingGenestb[,2], 
                                      geneNames)]
myInterestingGenes <- na.omit(myInterestingGenes)

ORAgeneList <- factor(as.integer(geneNames %in% myInterestingGenes)) 
names(ORAgeneList) <- geneNames 
str(ORAgeneList) 
#  Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#  - attr(*, "names")= chr [1:13755] "10" "100" "1000" "10000" ...

構(gòu)建 topGOdata 對(duì)象:

ORAGOdata <- new("topGOdata",
              ontology = "BP",
              allGenes = ORAgeneList,
              annot = annFUN.gene2GO,
              gene2GO = geneID2GO)
# 
# Building most specific GOs .....
#   ( 11302 GO terms found. )
# 
# Build GO DAG topology ..........
#   ( 15326 GO terms and 36394 relations. )
# 
# Annotating nodes ...............
#   ( 11973 genes annotated to the GO terms. )
ORAGOdata
# 
# ------------------------- topGOdata object -------------------------
# 
#  Description:
#    -   
# 
#  Ontology:
#    -  BP 
# 
#  13755 available genes (all genes from the array):
#    - symbol:  10 100 1000 10000 100008586  ...
#    - 29  significant genes. 
# 
#  11973 feasible genes (genes that can be used in the analysis):
#    - symbol:  10 100 1000 10000 10001  ...
#    - 21  significant genes. 
# 
#  GO graph (nodes with at least  1  genes):
#    - a graph with directed edges
#    - number of nodes = 15326 
#    - number of edges = 36394 
# 
# ------------------------- topGOdata object -------------------------
# 

2.1.2 富集分析

we perform a classical enrichment analysis by testing the over-representation of GO terms within the group of di erentially expressed genes.

ORAresultFisher <- runTest(ORAGOdata, algorithm = "classic", statistic = "fisher") 
# 
#            -- Classic Algorithm -- 
# 
#        the algorithm is scoring 449 nontrivial nodes
#        parameters: 
#            test statistic: fisher
# 
ORAresultFisher
# 
# Description:  
# Ontology: BP 
# 'classic' algorithm with the 'fisher' test
# 15326 GO terms scored: 86 terms with p < 0.01
# Annotation data:
#     Annotated genes: 11973 
#     Significant genes: 21 
#     Min. no. of genes annotated to a GO: 1 
#     Nontrivial nodes: 449 

使用 topGO 包時(shí),所構(gòu)建的 topGOdata 包含了 gene universe (ORAGOdata) 和 p-value (GSEAGOdata), ORA 和 GSEA 對(duì)應(yīng)的統(tǒng)計(jì)方法可以同時(shí)應(yīng)用于同一個(gè) topGOdata 對(duì)象。所以區(qū)別只是基于不同的已知信息,構(gòu)建 topGOdata 的方法不同。(maybe?

基于 classicelim 算法,對(duì) ORAdata 進(jìn)行 Kolmogorov-Smirnov test:

ORAresultKS <- runTest(ORAGOdata, algorithm = "classic", statistic = "ks") 
# 
#            -- Classic Algorithm -- 
# 
#        the algorithm is scoring 15326 nontrivial nodes
#        parameters: 
#            test statistic: ks
#            score order: increasing
ORAresultKS
# 
# Description:  
# Ontology: BP 
# 'classic' algorithm with the 'ks' test
# 15326 GO terms scored: 903 terms with p < 0.01
# Annotation data:
#     Annotated genes: 11973 
#     Significant genes: 21 
#     Min. no. of genes annotated to a GO: 1 
#     Nontrivial nodes: 15326 
ORAresult.elim <- runTest(ORAGOdata, algorithm = "elim", statistic = "ks") 
# 
#            -- Elim Algorithm -- 
# 
#        the algorithm is scoring 15326 nontrivial nodes
#        parameters: 
#            test statistic: ks
#            cutOff: 0.01
#            score order: increasing
# ...
ORAresult.elim
# 
# Description:  
# Ontology: BP 
# 'elim' algorithm with the 'ks : 0.01' test
# 15326 GO terms scored: 291 terms with p < 0.01
# Annotation data:
#     Annotated genes: 11973 
#     Significant genes: 21 
#     Min. no. of genes annotated to a GO: 1 
#     Nontrivial nodes: 15326 

2.1.3 Summary

ORAallRes <- GenTable(ORAGOdata,classicFisher = ORAresultFisher,
                   classicKS = ORAresultKS,
                   elimKS = ORAresult.elim,
                   orderBy = "elimKS",
                   ranksOf = "classicFisher",
                   topNodes = 20)
GO.ID Term Annotated Significant Expected Rank in classicFisher classicFisher classicKS elimKS
1 GO:0030198 extracellular matrix organization 303 1 0.53 233 0.42 2.20E-11 4.10E-08
2 GO:0019886 antigen processing and presentation of e... 88 0 0.15 449 1 5.50E-08 5.50E-08
3 GO:0098664 G protein-coupled serotonin receptor sig... 23 0 0.04 450 1 7.60E-08 7.60E-08
4 GO:0006120 mitochondrial electron transport, NADH t... 42 0 0.07 451 1 2.10E-07 2.10E-07
5 GO:0007190 activation of adenylate cyclase activity 29 0 0.05 452 1 5.10E-07 5.10E-07
6 GO:0060337 type I interferon signaling pathway 87 0 0.15 453 1 3.70E-06 3.70E-06
7 GO:0007193 adenylate cyclase-inhibiting G protein-c... 63 0 0.11 454 1 1.70E-08 4.80E-06
8 GO:0006069 ethanol oxidation 9 0 0.02 455 1 4.90E-06 4.90E-06
9 GO:0060078 regulation of postsynaptic membrane pote... 107 0 0.19 456 1 5.60E-07 5.70E-06
10 GO:0007155 cell adhesion 1134 0 1.99 457 1 1.30E-06 6.20E-06
11 GO:0042531 positive regulation of tyrosine phosphor... 54 0 0.09 458 1 6.60E-06 6.60E-06
12 GO:0000184 nuclear-transcribed mRNA catabolic proce... 114 0 0.2 459 1 9.90E-06 9.90E-06
13 GO:0048013 ephrin receptor signaling pathway 81 0 0.14 460 1 1.10E-05 1.10E-05
14 GO:0006749 glutathione metabolic process 43 0 0.08 461 1 1.40E-05 1.40E-05
15 GO:0006123 mitochondrial electron transport, cytoch... 13 0 0.02 462 1 1.40E-05 1.40E-05
16 GO:0009636 response to toxic substance 446 0 0.78 463 1 1.00E-08 1.70E-05
17 GO:1901687 glutathione derivative biosynthetic proc... 19 0 0.03 464 1 2.70E-05 2.70E-05
18 GO:1901701 cellular response to oxygen-containing c... 924 1 1.62 335 0.82 3.30E-06 3.20E-05
19 GO:1904322 cellular response to forskolin 10 0 0.02 465 1 3.20E-05 3.20E-05
20 GO:0097267 omega-hydroxylase P450 pathway 9 0 0.02 466 1 3.50E-05 3.50E-05

2.2 topGO: Using the genes score

基因富集分析 (gene set enrichment analysis, GSEA)

Tests based on gene scores or gene ranks. It includes Kolmogorov-Smirnov like tests (also known as GSEA), Gentleman's Category, t-test, etc. Ackermann and Strimmer (2009).

2.2.1 構(gòu)建 topGOdata 對(duì)象

構(gòu)建 geneList

GSEAgeneList <- subset(DEGdf, select = "adj.P.Val")
GSEAgeneList <- as.numeric(GSEAgeneList[,1])
names(GSEAgeneList) <- row.names(DEGdf)
str(GSEAgeneList)
#  Named num [1:22283] 3.25e-05 9.79e-05 4.61e-04 6.98e-04 1.34e-03 ...
#  - attr(*, "names")= chr [1:22283] "209835_x_at" "212014_x_at" "204490_s_at" "209380_s_at" ...

通過(guò)函數(shù) topDiffGenes() 篩選q值小于0.01的基因。

topDiffGenes <- function(allScore) {
  return(allScore < 0.01)
}
length(GSEAgeneList)
# [1] 22283
sum(topDiffGenes(GSEAgeneList))
# [1] 50

構(gòu)建 topGOdata 對(duì)象:

GSEAGOdata <- new("topGOdata",
                  description = "GO analysis of GSE2034 data; free vs relapses",
                  ontology = "BP",
                  allGenes = GSEAgeneList,
                  geneSel = topDiffGenes,
                  nodeSize = 5,
                  annot = annFUN.db,
                  affyLib = "hgu133a.db")
# 
# Building most specific GOs .....
#   ( 11302 GO terms found. )
# 
# Build GO DAG topology ..........
#   ( 15326 GO terms and 36394 relations. )
# 
# Annotating nodes ...............
#   ( 19375 genes annotated to the GO terms. )

GSEAGOdata
# ------------------------- topGOdata object -------------------------
# 
#  Description:
#    -  GO analysis of GSE2034 data; free vs relapses 
# 
#  Ontology:
#    -  BP 
# 
#  22283 available genes (all genes from the array):
#    - symbol:  209835_x_at 212014_x_at 204490_s_at 209380_s_at 209831_x_at  ...
#    - score :  3.25364921e-05 9.78850462e-05 0.000460980422 0.000698385002 0.0013384333  ...
#    - 50  significant genes. 
# 
#  19375 feasible genes (genes that can be used in the analysis):
#    - symbol:  209835_x_at 212014_x_at 204490_s_at 209380_s_at 209831_x_at  ...
#    - score :  3.25364921e-05 9.78850462e-05 0.000460980422 0.000698385002 0.0013384333  ...
#    - 47  significant genes. 
# 
#  GO graph (nodes with at least  5  genes):
#    - a graph with directed edges
#    - number of nodes = 10532 
#    - number of edges = 24696 
# 
# ------------------------- topGOdata object -------------------------
# 

2.2.2 富集分析

2.2.3 Summary

GenTable 導(dǎo)出表格

未完不更

References


最后,向大家隆重推薦生信技能樹(shù)的一系列干貨!

  1. 生信技能樹(shù)全球公益巡講:https://mp.weixin.qq.com/s/E9ykuIbc-2Ja9HOY0bn_6g
  2. B站公益74小時(shí)生信工程師教學(xué)視頻合輯:https://mp.weixin.qq.com/s/IyFK7l_WBAiUgqQi8O7Hxw
  3. 招學(xué)徒:https://mp.weixin.qq.com/s/KgbilzXnFjbKKunuw7NVfw
最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
平臺(tái)聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡(jiǎn)書(shū)系信息發(fā)布平臺(tái),僅提供信息存儲(chǔ)服務(wù)。

推薦閱讀更多精彩內(nèi)容