一個(gè)失敗案例,不要點(diǎn)進(jìn)來(lái)
GSE2034 topGO
1. DEG
1.1 下載GEO數(shù)據(jù)
library(GEOquery)
f <- 'GSE2034.Rdata'
if(!file.exists(f)){
gset <- getGEO('GSE2034',destdir = '.',
AnnotGPL = F,
getGPL = F)
save(gset,file=f)
}
load('GSE2034.Rdata')
1.2 表達(dá)矩陣
eset <- gset[[1]]
expmat <- exprs(eset)
## 檢查數(shù)據(jù)是否已經(jīng)經(jīng)過(guò)log轉(zhuǎn)換,并進(jìn)行l(wèi)og2轉(zhuǎn)換
ex <- expmat
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
(qx[6]-qx[1] > 50 && qx[2] > 0) ||
(qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) { ex[which(ex <= 0)] <- NaN
expmat <- log2(ex)
print("log2 transform finished")}else{print("log2 transform not needed")}
# [1] "log2 transform finished"
fit <- lmFit(expmat, design)
fit1 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit, 0.01)
DEGstb <- topTable(fit2, coef=1, n=Inf)
DEGstb <- DEGstb[order(DEGstb[,4], decreasing = F),]
before:
after:
1.3 設(shè)計(jì)矩陣和對(duì)比矩陣
pdata <- pData(eset)
group_list <- pdata$`bone relapses (1=yes, 0=no):ch1`
library(limma)
design <- model.matrix(~0+factor(group_list))
colnames(design) <- c("free", "relapses")
rownames(design) <- colnames(expmat)
contrast.matrix <- makeContrasts('free-relapses',levels = design)
1.4 差異分析
fit <- lmFit(expmat, design)
fit1 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit1)
DEGstb <- topTable(fit2, coef=1, n=Inf)
DEGmtx <- na.omit(DEGstb)
head(DEGmtx)
# logFC AveExpr t P.Value adj.P.Val
# 209835_x_at 0.7010262 10.616209 6.251930 1.460149e-09 3.253649e-05
# 212014_x_at 0.7264678 10.372145 5.927511 8.785625e-09 9.788505e-05
# 204490_s_at 0.4928252 9.788234 5.558565 6.206262e-08 4.609804e-04
# 209380_s_at -0.5373120 10.062837 -5.421342 1.253664e-07 6.983850e-04
# 209831_x_at 0.2666780 10.594035 5.228288 3.295703e-07 1.338433e-03
# 204489_s_at 0.5619588 10.302607 5.181790 4.142994e-07 1.338433e-03
# B result
# 209835_x_at 11.154503 UP
# 212014_x_at 9.526820 UP
# 204490_s_at 7.757191 UP
# 209380_s_at 7.121927 DOWN
# 209831_x_at 6.249843 NOT
# 204489_s_at 6.043635 UP
1.5 火山圖
logFC_cutoff <- with(DEGmtx, mean(abs(logFC)) +
2*sd(abs(logFC)))
DEGdf$result <- ifelse(DEGdf$P.Value < 0.05 &
abs(DEGdf$logFC) > logFC_cutoff,
ifelse(DEGdf$logFC >logFC_cutoff,
"UP", "DOWN"), "NOT")
title <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(DEGmtx[DEGmtx$result =='UP',]) ,
'\nThe number of down gene is ',nrow(DEGmtx[DEGmtx$result =='DOWN',]))
library(ggplot2)
ggplot(data=DEGmtx, aes(x=logFC, y=-log10(P.Value), color=result)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20))) +
xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle( title ) + theme(plot.title = element_text(size=15,hjust = 0.5)) +
scale_colour_manual(values = c('blue','black','red'))
1.6 PCA圖
pca_data <- prcomp(t(expmat),scale=TRUE)
pcx <- data.frame(pca_data$x)
pcr <- cbind(samples=rownames(pcx),group_list, pcx)
ggplot(pcr, aes(PC1, PC2)) + geom_point(size=5, aes(color = group_list)) +
geom_text(aes(label="samples"),hjust=-0.1, vjust=-0.3)
這個(gè)數(shù)據(jù)中的分組:bone relapses: 1=yes, 0=no
分別是加樣本名和不加樣本名的亞子:
1.7 熱圖
選擇&調(diào)整矩陣,設(shè)置各種參數(shù):
chosedgene <- rownames(DEGmtx)[1:50]
chosedmatx <- expmat[chosedgene,]
re <- subset(pdata,pdata[,28]==1)
re <- row.names(re)
fr <- row.names(subset(pdata,pdata[,28]==0))
sampleorder <- as.vector(c(fr,re))
do_mtx <- t(scale(t(chosedmatx)))
do_mtx[do_mtx > 2] = 2
do_mtx[do_mtx < -2] = -2
do_mtx <- do_mtx[,sampleorder]
chosedprob <- row.names(do_mtx)
chosedsymbol <- AnnotationDbi::select(hgu133a.db, chosedprob,
"SYMBOL",
"PROBEID")[,2]
row.names(do_mtx) <- chosedsymbol
anno_col <- data.frame(sample=factor(rep(c("free", "relapses"), c(217, 69))))
row.names(anno_col) <- colnames(do_mtx)
ann_color <- list(sample=c(free = "#3a5fcd", relapses = "#cd0000"))
畫(huà)圖:
png(filename = "GSE2034_hm_nmlmatrix_nocluscol.png", height = 5000, width = 6000,
res = 500, units = "px")
pheatmap::pheatmap(do_mtx, scale = "none", color = color,
annotation_col = anno_col,
annotation_colors = ann_color,
fontsize = 8, fontsize_row = 8,
show_colnames = F,
cluster_cols = F,
main = "GSE2034: free vs relapses HEATMAP (delected outliers)",)
dev.off()
2. topGO
2.1 topGO: Predefined list of interesting genes
過(guò)表征分析 (over representation analysis, ORA)
Tests based on gene counts. This is the most popular family of tests, given that it only requires the presence of a list of interesting genes and nothing more. Tests like Fisher's exact test, Hypegeometric test and binomial test belong to this family. Draghici et al. (2006) .
2.1.1 構(gòu)建 topGOdata
對(duì)象
構(gòu)建 geneID2GO
list
library(topGO)
library(hgu133a.db)
library(dplyr)
probe_id <- row.names(expmat)
entrez_id <- AnnotationDbi::select(hgu133a.db,
probe_id,
"ENTREZID",
"PROBEID")[,2]
geneID2GOtb <- AnnotationDbi::select(hgu133a.db, entrez_id,
"GO", "ENTREZID")
geneID2GOsb <- geneID2GOtb %>%
group_by(ENTREZID) %>%
summarise(GO=list(GO))
geneID2GO <- geneID2GOsb$GO
names(geneID2GO) <- rownames(geneID2GOsb)
構(gòu)建 geneList
myInterestingDEG <- subset(DEGdf, abs(DEGdf$logFC) > 1)
myInterestingDEG <- subset(myInterestingDEG, myInterestingDEG$adj.P.Val < 0.05)
logFC | AveExpr | t | P.Value | adj.P.Val | B | result | |
---|---|---|---|---|---|---|---|
214777_at | 1.621019 | 7.578387 | 4.723681 | 3.63E-06 | 0.005771 | 4.093824 | UP |
216365_x_at | 1.340734 | 5.631291 | 4.599642 | 6.35E-06 | 0.007075 | 3.592027 | UP |
217157_x_at | 1.02288 | 8.543867 | 4.564419 | 7.43E-06 | 0.007348 | 3.45161 | UP |
216984_x_at | 1.482014 | 7.826338 | 4.56344 | 7.46E-06 | 0.007348 | 3.447722 | UP |
217378_x_at | 1.740957 | 8.055472 | 4.500623 | 9.85E-06 | 0.008778 | 3.199637 | UP |
217258_x_at | 1.217204 | 7.180579 | 4.485677 | 1.05E-05 | 0.009013 | 3.14105 | UP |
217148_x_at | 1.236926 | 9.662857 | 4.468183 | 1.14E-05 | 0.009236 | 3.072682 | UP |
211798_x_at | 1.129989 | 7.765948 | 4.411513 | 1.45E-05 | 0.009236 | 2.852795 | UP |
214669_x_at | 1.146124 | 11.22506 | 4.382578 | 1.65E-05 | 0.009236 | 2.741463 | UP |
211644_x_at | 1.51252 | 8.895729 | 4.353739 | 1.86E-05 | 0.00958 | 2.631127 | UP |
215176_x_at | 1.554616 | 9.781743 | 4.350516 | 1.89E-05 | 0.00958 | 2.618838 | UP |
211637_x_at | 1.319264 | 6.128292 | 4.291975 | 2.42E-05 | 0.01036 | 2.396959 | UP |
215121_x_at | 1.279395 | 12.98246 | 4.247872 | 2.92E-05 | 0.011018 | 2.231533 | UP |
216576_x_at | 1.355247 | 8.468197 | 4.198257 | 3.59E-05 | 0.011951 | 2.047218 | UP |
204540_at | -1.06758 | 8.610066 | -4.19044 | 3.71E-05 | 0.011982 | 2.018367 | DOWN |
216401_x_at | 1.475812 | 7.711042 | 4.180749 | 3.86E-05 | 0.011983 | 1.982631 | UP |
207430_s_at | -1.3399 | 6.097727 | -4.15096 | 4.37E-05 | 0.012368 | 1.873294 | DOWN |
213201_s_at | -1.27114 | 6.641464 | -4.13499 | 4.66E-05 | 0.01267 | 1.814941 | DOWN |
215214_at | 1.239997 | 7.376844 | 4.125793 | 4.84E-05 | 0.012763 | 1.781441 | UP |
204933_s_at | 1.073296 | 4.540879 | 4.031533 | 7.10E-05 | 0.015467 | 1.441822 | UP |
217281_x_at | 1.355866 | 5.417214 | 4.015399 | 7.58E-05 | 0.015784 | 1.384391 | UP |
214858_at | -1.08985 | 6.455748 | -4.00976 | 7.75E-05 | 0.015861 | 1.36435 | DOWN |
216510_x_at | 1.132241 | 6.304655 | 3.99392 | 8.26E-05 | 0.016195 | 1.308245 | UP |
215379_x_at | 1.123251 | 11.77926 | 3.943786 | 0.000101 | 0.016689 | 1.131931 | UP |
211634_x_at | 1.014891 | 6.92074 | 3.93688 | 0.000104 | 0.016689 | 1.107799 | UP |
210297_s_at | -1.201 | 6.705364 | -3.93263 | 0.000105 | 0.016689 | 1.092959 | DOWN |
209138_x_at | 1.136452 | 13.28617 | 3.9291 | 0.000107 | 0.016689 | 1.080658 | UP |
211645_x_at | 1.244501 | 9.308405 | 3.889339 | 0.000125 | 0.017262 | 0.942694 | UP |
216491_x_at | 1.292163 | 5.893802 | 3.845647 | 0.000148 | 0.018642 | 0.79254 | UP |
216560_x_at | 1.278329 | 5.810675 | 3.778756 | 0.000192 | 0.020615 | 0.565602 | UP |
221286_s_at | 1.10116 | 7.542805 | 3.777608 | 0.000192 | 0.020615 | 0.561739 | UP |
206391_at | 1.082072 | 6.946425 | 3.505906 | 0.000528 | 0.031189 | -0.32275 | UP |
205509_at | -1.20626 | 8.869439 | -3.48369 | 0.000572 | 0.032322 | -0.39241 | DOWN |
209374_s_at | 1.034312 | 10.40725 | 3.443591 | 0.000659 | 0.034236 | -0.51714 | UP |
204623_at | -1.04503 | 11.21622 | -3.25463 | 0.001271 | 0.045188 | -1.08699 | DOWN |
geneNames <- names(geneID2GO)
myInterestingGenestb <- AnnotationDbi::select(hgu133a.db, row.names(myInterestingDEG),
"ENTREZID", "PROBEID")
myInterestingGenes <- geneNames[match(myInterestingGenestb[,2],
geneNames)]
myInterestingGenes <- na.omit(myInterestingGenes)
ORAgeneList <- factor(as.integer(geneNames %in% myInterestingGenes))
names(ORAgeneList) <- geneNames
str(ORAgeneList)
# Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
# - attr(*, "names")= chr [1:13755] "10" "100" "1000" "10000" ...
構(gòu)建 topGOdata
對(duì)象:
ORAGOdata <- new("topGOdata",
ontology = "BP",
allGenes = ORAgeneList,
annot = annFUN.gene2GO,
gene2GO = geneID2GO)
#
# Building most specific GOs .....
# ( 11302 GO terms found. )
#
# Build GO DAG topology ..........
# ( 15326 GO terms and 36394 relations. )
#
# Annotating nodes ...............
# ( 11973 genes annotated to the GO terms. )
ORAGOdata
#
# ------------------------- topGOdata object -------------------------
#
# Description:
# -
#
# Ontology:
# - BP
#
# 13755 available genes (all genes from the array):
# - symbol: 10 100 1000 10000 100008586 ...
# - 29 significant genes.
#
# 11973 feasible genes (genes that can be used in the analysis):
# - symbol: 10 100 1000 10000 10001 ...
# - 21 significant genes.
#
# GO graph (nodes with at least 1 genes):
# - a graph with directed edges
# - number of nodes = 15326
# - number of edges = 36394
#
# ------------------------- topGOdata object -------------------------
#
2.1.2 富集分析
we perform a classical enrichment analysis by testing the over-representation of GO terms within the group of di erentially expressed genes.
ORAresultFisher <- runTest(ORAGOdata, algorithm = "classic", statistic = "fisher")
#
# -- Classic Algorithm --
#
# the algorithm is scoring 449 nontrivial nodes
# parameters:
# test statistic: fisher
#
ORAresultFisher
#
# Description:
# Ontology: BP
# 'classic' algorithm with the 'fisher' test
# 15326 GO terms scored: 86 terms with p < 0.01
# Annotation data:
# Annotated genes: 11973
# Significant genes: 21
# Min. no. of genes annotated to a GO: 1
# Nontrivial nodes: 449
使用 topGO
包時(shí),所構(gòu)建的 topGOdata
包含了 gene universe (ORAGOdata) 和 p-value (GSEAGOdata), ORA 和 GSEA 對(duì)應(yīng)的統(tǒng)計(jì)方法可以同時(shí)應(yīng)用于同一個(gè) topGOdata
對(duì)象。所以區(qū)別只是基于不同的已知信息,構(gòu)建 topGOdata
的方法不同。(maybe?
基于 classic 和 elim 算法,對(duì) ORAdata
進(jìn)行 Kolmogorov-Smirnov test:
ORAresultKS <- runTest(ORAGOdata, algorithm = "classic", statistic = "ks")
#
# -- Classic Algorithm --
#
# the algorithm is scoring 15326 nontrivial nodes
# parameters:
# test statistic: ks
# score order: increasing
ORAresultKS
#
# Description:
# Ontology: BP
# 'classic' algorithm with the 'ks' test
# 15326 GO terms scored: 903 terms with p < 0.01
# Annotation data:
# Annotated genes: 11973
# Significant genes: 21
# Min. no. of genes annotated to a GO: 1
# Nontrivial nodes: 15326
ORAresult.elim <- runTest(ORAGOdata, algorithm = "elim", statistic = "ks")
#
# -- Elim Algorithm --
#
# the algorithm is scoring 15326 nontrivial nodes
# parameters:
# test statistic: ks
# cutOff: 0.01
# score order: increasing
# ...
ORAresult.elim
#
# Description:
# Ontology: BP
# 'elim' algorithm with the 'ks : 0.01' test
# 15326 GO terms scored: 291 terms with p < 0.01
# Annotation data:
# Annotated genes: 11973
# Significant genes: 21
# Min. no. of genes annotated to a GO: 1
# Nontrivial nodes: 15326
2.1.3 Summary
ORAallRes <- GenTable(ORAGOdata,classicFisher = ORAresultFisher,
classicKS = ORAresultKS,
elimKS = ORAresult.elim,
orderBy = "elimKS",
ranksOf = "classicFisher",
topNodes = 20)
GO.ID | Term | Annotated | Significant | Expected | Rank in classicFisher | classicFisher | classicKS | elimKS | |
---|---|---|---|---|---|---|---|---|---|
1 | GO:0030198 | extracellular matrix organization | 303 | 1 | 0.53 | 233 | 0.42 | 2.20E-11 | 4.10E-08 |
2 | GO:0019886 | antigen processing and presentation of e... | 88 | 0 | 0.15 | 449 | 1 | 5.50E-08 | 5.50E-08 |
3 | GO:0098664 | G protein-coupled serotonin receptor sig... | 23 | 0 | 0.04 | 450 | 1 | 7.60E-08 | 7.60E-08 |
4 | GO:0006120 | mitochondrial electron transport, NADH t... | 42 | 0 | 0.07 | 451 | 1 | 2.10E-07 | 2.10E-07 |
5 | GO:0007190 | activation of adenylate cyclase activity | 29 | 0 | 0.05 | 452 | 1 | 5.10E-07 | 5.10E-07 |
6 | GO:0060337 | type I interferon signaling pathway | 87 | 0 | 0.15 | 453 | 1 | 3.70E-06 | 3.70E-06 |
7 | GO:0007193 | adenylate cyclase-inhibiting G protein-c... | 63 | 0 | 0.11 | 454 | 1 | 1.70E-08 | 4.80E-06 |
8 | GO:0006069 | ethanol oxidation | 9 | 0 | 0.02 | 455 | 1 | 4.90E-06 | 4.90E-06 |
9 | GO:0060078 | regulation of postsynaptic membrane pote... | 107 | 0 | 0.19 | 456 | 1 | 5.60E-07 | 5.70E-06 |
10 | GO:0007155 | cell adhesion | 1134 | 0 | 1.99 | 457 | 1 | 1.30E-06 | 6.20E-06 |
11 | GO:0042531 | positive regulation of tyrosine phosphor... | 54 | 0 | 0.09 | 458 | 1 | 6.60E-06 | 6.60E-06 |
12 | GO:0000184 | nuclear-transcribed mRNA catabolic proce... | 114 | 0 | 0.2 | 459 | 1 | 9.90E-06 | 9.90E-06 |
13 | GO:0048013 | ephrin receptor signaling pathway | 81 | 0 | 0.14 | 460 | 1 | 1.10E-05 | 1.10E-05 |
14 | GO:0006749 | glutathione metabolic process | 43 | 0 | 0.08 | 461 | 1 | 1.40E-05 | 1.40E-05 |
15 | GO:0006123 | mitochondrial electron transport, cytoch... | 13 | 0 | 0.02 | 462 | 1 | 1.40E-05 | 1.40E-05 |
16 | GO:0009636 | response to toxic substance | 446 | 0 | 0.78 | 463 | 1 | 1.00E-08 | 1.70E-05 |
17 | GO:1901687 | glutathione derivative biosynthetic proc... | 19 | 0 | 0.03 | 464 | 1 | 2.70E-05 | 2.70E-05 |
18 | GO:1901701 | cellular response to oxygen-containing c... | 924 | 1 | 1.62 | 335 | 0.82 | 3.30E-06 | 3.20E-05 |
19 | GO:1904322 | cellular response to forskolin | 10 | 0 | 0.02 | 465 | 1 | 3.20E-05 | 3.20E-05 |
20 | GO:0097267 | omega-hydroxylase P450 pathway | 9 | 0 | 0.02 | 466 | 1 | 3.50E-05 | 3.50E-05 |
2.2 topGO: Using the genes score
基因富集分析 (gene set enrichment analysis, GSEA)
Tests based on gene scores or gene ranks. It includes Kolmogorov-Smirnov like tests (also known as GSEA), Gentleman's Category, t-test, etc. Ackermann and Strimmer (2009).
2.2.1 構(gòu)建 topGOdata
對(duì)象
構(gòu)建 geneList
GSEAgeneList <- subset(DEGdf, select = "adj.P.Val")
GSEAgeneList <- as.numeric(GSEAgeneList[,1])
names(GSEAgeneList) <- row.names(DEGdf)
str(GSEAgeneList)
# Named num [1:22283] 3.25e-05 9.79e-05 4.61e-04 6.98e-04 1.34e-03 ...
# - attr(*, "names")= chr [1:22283] "209835_x_at" "212014_x_at" "204490_s_at" "209380_s_at" ...
通過(guò)函數(shù) topDiffGenes()
篩選q值小于0.01的基因。
topDiffGenes <- function(allScore) {
return(allScore < 0.01)
}
length(GSEAgeneList)
# [1] 22283
sum(topDiffGenes(GSEAgeneList))
# [1] 50
構(gòu)建 topGOdata
對(duì)象:
GSEAGOdata <- new("topGOdata",
description = "GO analysis of GSE2034 data; free vs relapses",
ontology = "BP",
allGenes = GSEAgeneList,
geneSel = topDiffGenes,
nodeSize = 5,
annot = annFUN.db,
affyLib = "hgu133a.db")
#
# Building most specific GOs .....
# ( 11302 GO terms found. )
#
# Build GO DAG topology ..........
# ( 15326 GO terms and 36394 relations. )
#
# Annotating nodes ...............
# ( 19375 genes annotated to the GO terms. )
GSEAGOdata
# ------------------------- topGOdata object -------------------------
#
# Description:
# - GO analysis of GSE2034 data; free vs relapses
#
# Ontology:
# - BP
#
# 22283 available genes (all genes from the array):
# - symbol: 209835_x_at 212014_x_at 204490_s_at 209380_s_at 209831_x_at ...
# - score : 3.25364921e-05 9.78850462e-05 0.000460980422 0.000698385002 0.0013384333 ...
# - 50 significant genes.
#
# 19375 feasible genes (genes that can be used in the analysis):
# - symbol: 209835_x_at 212014_x_at 204490_s_at 209380_s_at 209831_x_at ...
# - score : 3.25364921e-05 9.78850462e-05 0.000460980422 0.000698385002 0.0013384333 ...
# - 47 significant genes.
#
# GO graph (nodes with at least 5 genes):
# - a graph with directed edges
# - number of nodes = 10532
# - number of edges = 24696
#
# ------------------------- topGOdata object -------------------------
#
2.2.2 富集分析
2.2.3 Summary
GenTable
導(dǎo)出表格
未完不更
References
最后,向大家隆重推薦生信技能樹(shù)的一系列干貨!
- 生信技能樹(shù)全球公益巡講:https://mp.weixin.qq.com/s/E9ykuIbc-2Ja9HOY0bn_6g
- B站公益74小時(shí)生信工程師教學(xué)視頻合輯:https://mp.weixin.qq.com/s/IyFK7l_WBAiUgqQi8O7Hxw
- 招學(xué)徒:https://mp.weixin.qq.com/s/KgbilzXnFjbKKunuw7NVfw