一個(gè)失敗案例，不要點(diǎn)進(jìn)來(lái)

GSE2034 topGO

1. DEG

1.1 下載GEO數(shù)據(jù)

library(GEOquery)
f <- 'GSE2034.Rdata'
if(!file.exists(f)){
  gset <- getGEO('GSE2034',destdir = '.',
                 AnnotGPL = F,
                 getGPL = F)
  save(gset,file=f)
}
load('GSE2034.Rdata')

1.2 表達(dá)矩陣

eset <- gset[[1]]
expmat <- exprs(eset)

## 檢查數(shù)據(jù)是否已經(jīng)經(jīng)過(guò)log轉(zhuǎn)換，并進(jìn)行l(wèi)og2轉(zhuǎn)換
ex <- expmat
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))

LogC <- (qx[5] > 100) ||
  (qx[6]-qx[1] > 50 && qx[2] > 0) ||
  (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)

if (LogC) { ex[which(ex <= 0)] <- NaN
expmat <- log2(ex)
print("log2 transform finished")}else{print("log2 transform not needed")}
# [1] "log2 transform finished"

fit <- lmFit(expmat, design)
fit1 <- contrasts.fit(fit, contrast.matrix) 
fit2 <- eBayes(fit, 0.01)

DEGstb <- topTable(fit2, coef=1, n=Inf)
DEGstb <- DEGstb[order(DEGstb[,4], decreasing = F),]

before:

after:

1.3 設(shè)計(jì)矩陣和對(duì)比矩陣

pdata <- pData(eset)
group_list <- pdata$`bone relapses (1=yes, 0=no):ch1`
library(limma)
design <- model.matrix(~0+factor(group_list))
colnames(design) <- c("free", "relapses")
rownames(design) <- colnames(expmat)
contrast.matrix <- makeContrasts('free-relapses',levels = design)

1.4 差異分析

fit <- lmFit(expmat, design)
fit1 <- contrasts.fit(fit, contrast.matrix) 
fit2 <- eBayes(fit1)
DEGstb <- topTable(fit2, coef=1, n=Inf)
DEGmtx <- na.omit(DEGstb)
head(DEGmtx)
#                  logFC   AveExpr         t      P.Value    adj.P.Val
# 209835_x_at  0.7010262 10.616209  6.251930 1.460149e-09 3.253649e-05
# 212014_x_at  0.7264678 10.372145  5.927511 8.785625e-09 9.788505e-05
# 204490_s_at  0.4928252  9.788234  5.558565 6.206262e-08 4.609804e-04
# 209380_s_at -0.5373120 10.062837 -5.421342 1.253664e-07 6.983850e-04
# 209831_x_at  0.2666780 10.594035  5.228288 3.295703e-07 1.338433e-03
# 204489_s_at  0.5619588 10.302607  5.181790 4.142994e-07 1.338433e-03
#                     B result
# 209835_x_at 11.154503     UP
# 212014_x_at  9.526820     UP
# 204490_s_at  7.757191     UP
# 209380_s_at  7.121927   DOWN
# 209831_x_at  6.249843    NOT
# 204489_s_at  6.043635     UP

1.5 火山圖

logFC_cutoff <- with(DEGmtx, mean(abs(logFC)) + 
                       2*sd(abs(logFC)))

DEGdf$result <- ifelse(DEGdf$P.Value < 0.05 &
                                    abs(DEGdf$logFC) > logFC_cutoff,
                                  ifelse(DEGdf$logFC >logFC_cutoff,
                                         "UP", "DOWN"), "NOT")

title <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3), 
                '\nThe number of up gene is ',nrow(DEGmtx[DEGmtx$result =='UP',]) ,
                '\nThe number of down gene is ',nrow(DEGmtx[DEGmtx$result =='DOWN',]))

library(ggplot2)
ggplot(data=DEGmtx, aes(x=logFC, y=-log10(P.Value), color=result)) +
  geom_point(alpha=0.4, size=1.75) +
  theme_set(theme_set(theme_bw(base_size=20))) +
  xlab("log2 fold change") + ylab("-log10 p-value") +
  ggtitle( title ) + theme(plot.title = element_text(size=15,hjust = 0.5)) +
  scale_colour_manual(values = c('blue','black','red'))

1.6 PCA圖

pca_data <- prcomp(t(expmat),scale=TRUE)
pcx <- data.frame(pca_data$x)
pcr <- cbind(samples=rownames(pcx),group_list, pcx) 
ggplot(pcr, aes(PC1, PC2)) + geom_point(size=5, aes(color = group_list)) +
  geom_text(aes(label="samples"),hjust=-0.1, vjust=-0.3)

這個(gè)數(shù)據(jù)中的分組：bone relapses: 1=yes, 0=no

分別是加樣本名和不加樣本名的亞子：

1.7 熱圖

選擇&調(diào)整矩陣，設(shè)置各種參數(shù)：

chosedgene <- rownames(DEGmtx)[1:50]
chosedmatx <- expmat[chosedgene,]      

re <- subset(pdata,pdata[,28]==1)
re <- row.names(re)
fr <- row.names(subset(pdata,pdata[,28]==0))
sampleorder <- as.vector(c(fr,re))

do_mtx <- t(scale(t(chosedmatx)))
do_mtx[do_mtx > 2] = 2
do_mtx[do_mtx < -2] = -2

do_mtx <- do_mtx[,sampleorder]

chosedprob <- row.names(do_mtx)
chosedsymbol <- AnnotationDbi::select(hgu133a.db, chosedprob,
                                      "SYMBOL",
                                      "PROBEID")[,2]
row.names(do_mtx) <- chosedsymbol

anno_col <- data.frame(sample=factor(rep(c("free", "relapses"), c(217, 69))))
row.names(anno_col) <- colnames(do_mtx)
ann_color <- list(sample=c(free = "#3a5fcd", relapses = "#cd0000"))

畫(huà)圖：

png(filename = "GSE2034_hm_nmlmatrix_nocluscol.png", height = 5000, width = 6000,
   res = 500, units = "px")

pheatmap::pheatmap(do_mtx, scale = "none", color = color,
                   annotation_col = anno_col,
                   annotation_colors = ann_color,
                   fontsize = 8, fontsize_row = 8, 
                   show_colnames = F,
                   cluster_cols = F,
                   main = "GSE2034: free vs relapses HEATMAP (delected outliers)",)
dev.off()

2. topGO

2.1 topGO: Predefined list of interesting genes

過(guò)表征分析 (over representation analysis, ORA)

Tests based on gene counts. This is the most popular family of tests, given that it only requires the presence of a list of interesting genes and nothing more. Tests like Fisher's exact test, Hypegeometric test and binomial test belong to this family. Draghici et al. (2006) .

2.1.1 構(gòu)建 `topGOdata` 對(duì)象

構(gòu)建 geneID2GO list

library(topGO)
library(hgu133a.db)
library(dplyr)

probe_id <- row.names(expmat)
entrez_id <- AnnotationDbi::select(hgu133a.db,
                                   probe_id,
                                   "ENTREZID",
                                   "PROBEID")[,2]

geneID2GOtb <- AnnotationDbi::select(hgu133a.db, entrez_id,
                                     "GO", "ENTREZID")
geneID2GOsb <- geneID2GOtb %>%
  group_by(ENTREZID) %>%
  summarise(GO=list(GO))
geneID2GO <- geneID2GOsb$GO
names(geneID2GO) <- rownames(geneID2GOsb)

構(gòu)建 geneList

myInterestingDEG <- subset(DEGdf, abs(DEGdf$logFC) > 1)
myInterestingDEG <- subset(myInterestingDEG, myInterestingDEG$adj.P.Val < 0.05)

	logFC	AveExpr	t	P.Value	adj.P.Val	B	result
214777_at	1.621019	7.578387	4.723681	3.63E-06	0.005771	4.093824	UP
216365_x_at	1.340734	5.631291	4.599642	6.35E-06	0.007075	3.592027	UP
217157_x_at	1.02288	8.543867	4.564419	7.43E-06	0.007348	3.45161	UP
216984_x_at	1.482014	7.826338	4.56344	7.46E-06	0.007348	3.447722	UP
217378_x_at	1.740957	8.055472	4.500623	9.85E-06	0.008778	3.199637	UP
217258_x_at	1.217204	7.180579	4.485677	1.05E-05	0.009013	3.14105	UP
217148_x_at	1.236926	9.662857	4.468183	1.14E-05	0.009236	3.072682	UP
211798_x_at	1.129989	7.765948	4.411513	1.45E-05	0.009236	2.852795	UP
214669_x_at	1.146124	11.22506	4.382578	1.65E-05	0.009236	2.741463	UP
211644_x_at	1.51252	8.895729	4.353739	1.86E-05	0.00958	2.631127	UP
215176_x_at	1.554616	9.781743	4.350516	1.89E-05	0.00958	2.618838	UP
211637_x_at	1.319264	6.128292	4.291975	2.42E-05	0.01036	2.396959	UP
215121_x_at	1.279395	12.98246	4.247872	2.92E-05	0.011018	2.231533	UP
216576_x_at	1.355247	8.468197	4.198257	3.59E-05	0.011951	2.047218	UP
204540_at	-1.06758	8.610066	-4.19044	3.71E-05	0.011982	2.018367	DOWN
216401_x_at	1.475812	7.711042	4.180749	3.86E-05	0.011983	1.982631	UP
207430_s_at	-1.3399	6.097727	-4.15096	4.37E-05	0.012368	1.873294	DOWN
213201_s_at	-1.27114	6.641464	-4.13499	4.66E-05	0.01267	1.814941	DOWN
215214_at	1.239997	7.376844	4.125793	4.84E-05	0.012763	1.781441	UP
204933_s_at	1.073296	4.540879	4.031533	7.10E-05	0.015467	1.441822	UP
217281_x_at	1.355866	5.417214	4.015399	7.58E-05	0.015784	1.384391	UP
214858_at	-1.08985	6.455748	-4.00976	7.75E-05	0.015861	1.36435	DOWN
216510_x_at	1.132241	6.304655	3.99392	8.26E-05	0.016195	1.308245	UP
215379_x_at	1.123251	11.77926	3.943786	0.000101	0.016689	1.131931	UP
211634_x_at	1.014891	6.92074	3.93688	0.000104	0.016689	1.107799	UP
210297_s_at	-1.201	6.705364	-3.93263	0.000105	0.016689	1.092959	DOWN
209138_x_at	1.136452	13.28617	3.9291	0.000107	0.016689	1.080658	UP
211645_x_at	1.244501	9.308405	3.889339	0.000125	0.017262	0.942694	UP
216491_x_at	1.292163	5.893802	3.845647	0.000148	0.018642	0.79254	UP
216560_x_at	1.278329	5.810675	3.778756	0.000192	0.020615	0.565602	UP
221286_s_at	1.10116	7.542805	3.777608	0.000192	0.020615	0.561739	UP
206391_at	1.082072	6.946425	3.505906	0.000528	0.031189	-0.32275	UP
205509_at	-1.20626	8.869439	-3.48369	0.000572	0.032322	-0.39241	DOWN
209374_s_at	1.034312	10.40725	3.443591	0.000659	0.034236	-0.51714	UP
204623_at	-1.04503	11.21622	-3.25463	0.001271	0.045188	-1.08699	DOWN

geneNames <- names(geneID2GO) 
myInterestingGenestb <- AnnotationDbi::select(hgu133a.db, row.names(myInterestingDEG),
                                            "ENTREZID", "PROBEID")

myInterestingGenes <- geneNames[match(myInterestingGenestb[,2], 
                                      geneNames)]
myInterestingGenes <- na.omit(myInterestingGenes)

ORAgeneList <- factor(as.integer(geneNames %in% myInterestingGenes)) 
names(ORAgeneList) <- geneNames 
str(ORAgeneList) 
#  Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#  - attr(*, "names")= chr [1:13755] "10" "100" "1000" "10000" ...

構(gòu)建 topGOdata 對(duì)象：

ORAGOdata <- new("topGOdata",
              ontology = "BP",
              allGenes = ORAgeneList,
              annot = annFUN.gene2GO,
              gene2GO = geneID2GO)
# 
# Building most specific GOs .....
#   ( 11302 GO terms found. )
# 
# Build GO DAG topology ..........
#   ( 15326 GO terms and 36394 relations. )
# 
# Annotating nodes ...............
#   ( 11973 genes annotated to the GO terms. )
ORAGOdata
# 
# ------------------------- topGOdata object -------------------------
# 
#  Description:
#    -   
# 
#  Ontology:
#    -  BP 
# 
#  13755 available genes (all genes from the array):
#    - symbol:  10 100 1000 10000 100008586  ...
#    - 29  significant genes. 
# 
#  11973 feasible genes (genes that can be used in the analysis):
#    - symbol:  10 100 1000 10000 10001  ...
#    - 21  significant genes. 
# 
#  GO graph (nodes with at least  1  genes):
#    - a graph with directed edges
#    - number of nodes = 15326 
#    - number of edges = 36394 
# 
# ------------------------- topGOdata object -------------------------
#

2.1.2 富集分析

we perform a classical enrichment analysis by testing the over-representation of GO terms within the group of di erentially expressed genes.

ORAresultFisher <- runTest(ORAGOdata, algorithm = "classic", statistic = "fisher") 
# 
#            -- Classic Algorithm -- 
# 
#        the algorithm is scoring 449 nontrivial nodes
#        parameters: 
#            test statistic: fisher
# 
ORAresultFisher
# 
# Description:  
# Ontology: BP 
# 'classic' algorithm with the 'fisher' test
# 15326 GO terms scored: 86 terms with p < 0.01
# Annotation data:
#     Annotated genes: 11973 
#     Significant genes: 21 
#     Min. no. of genes annotated to a GO: 1 
#     Nontrivial nodes: 449

使用 topGO 包時(shí)，所構(gòu)建的 topGOdata 包含了 gene universe (ORAGOdata) 和 p-value (GSEAGOdata), ORA 和 GSEA 對(duì)應(yīng)的統(tǒng)計(jì)方法可以同時(shí)應(yīng)用于同一個(gè) topGOdata 對(duì)象。所以區(qū)別只是基于不同的已知信息，構(gòu)建 topGOdata 的方法不同。（maybe?

基于 classic 和 elim 算法，對(duì) ORAdata 進(jìn)行 Kolmogorov-Smirnov test:

ORAresultKS <- runTest(ORAGOdata, algorithm = "classic", statistic = "ks") 
# 
#            -- Classic Algorithm -- 
# 
#        the algorithm is scoring 15326 nontrivial nodes
#        parameters: 
#            test statistic: ks
#            score order: increasing
ORAresultKS
# 
# Description:  
# Ontology: BP 
# 'classic' algorithm with the 'ks' test
# 15326 GO terms scored: 903 terms with p < 0.01
# Annotation data:
#     Annotated genes: 11973 
#     Significant genes: 21 
#     Min. no. of genes annotated to a GO: 1 
#     Nontrivial nodes: 15326

ORAresult.elim <- runTest(ORAGOdata, algorithm = "elim", statistic = "ks") 
# 
#            -- Elim Algorithm -- 
# 
#        the algorithm is scoring 15326 nontrivial nodes
#        parameters: 
#            test statistic: ks
#            cutOff: 0.01
#            score order: increasing
# ...
ORAresult.elim
# 
# Description:  
# Ontology: BP 
# 'elim' algorithm with the 'ks : 0.01' test
# 15326 GO terms scored: 291 terms with p < 0.01
# Annotation data:
#     Annotated genes: 11973 
#     Significant genes: 21 
#     Min. no. of genes annotated to a GO: 1 
#     Nontrivial nodes: 15326

2.1.3 Summary

ORAallRes <- GenTable(ORAGOdata,classicFisher = ORAresultFisher,
                   classicKS = ORAresultKS,
                   elimKS = ORAresult.elim,
                   orderBy = "elimKS",
                   ranksOf = "classicFisher",
                   topNodes = 20)

	GO.ID	Term	Annotated	Significant	Expected	Rank in classicFisher	classicFisher	classicKS	elimKS
1	GO:0030198	extracellular matrix organization	303	1	0.53	233	0.42	2.20E-11	4.10E-08
2	GO:0019886	antigen processing and presentation of e...	88	0	0.15	449	1	5.50E-08	5.50E-08
3	GO:0098664	G protein-coupled serotonin receptor sig...	23	0	0.04	450	1	7.60E-08	7.60E-08
4	GO:0006120	mitochondrial electron transport, NADH t...	42	0	0.07	451	1	2.10E-07	2.10E-07
5	GO:0007190	activation of adenylate cyclase activity	29	0	0.05	452	1	5.10E-07	5.10E-07
6	GO:0060337	type I interferon signaling pathway	87	0	0.15	453	1	3.70E-06	3.70E-06
7	GO:0007193	adenylate cyclase-inhibiting G protein-c...	63	0	0.11	454	1	1.70E-08	4.80E-06
8	GO:0006069	ethanol oxidation	9	0	0.02	455	1	4.90E-06	4.90E-06
9	GO:0060078	regulation of postsynaptic membrane pote...	107	0	0.19	456	1	5.60E-07	5.70E-06
10	GO:0007155	cell adhesion	1134	0	1.99	457	1	1.30E-06	6.20E-06
11	GO:0042531	positive regulation of tyrosine phosphor...	54	0	0.09	458	1	6.60E-06	6.60E-06
12	GO:0000184	nuclear-transcribed mRNA catabolic proce...	114	0	0.2	459	1	9.90E-06	9.90E-06
13	GO:0048013	ephrin receptor signaling pathway	81	0	0.14	460	1	1.10E-05	1.10E-05
14	GO:0006749	glutathione metabolic process	43	0	0.08	461	1	1.40E-05	1.40E-05
15	GO:0006123	mitochondrial electron transport, cytoch...	13	0	0.02	462	1	1.40E-05	1.40E-05
16	GO:0009636	response to toxic substance	446	0	0.78	463	1	1.00E-08	1.70E-05
17	GO:1901687	glutathione derivative biosynthetic proc...	19	0	0.03	464	1	2.70E-05	2.70E-05
18	GO:1901701	cellular response to oxygen-containing c...	924	1	1.62	335	0.82	3.30E-06	3.20E-05
19	GO:1904322	cellular response to forskolin	10	0	0.02	465	1	3.20E-05	3.20E-05
20	GO:0097267	omega-hydroxylase P450 pathway	9	0	0.02	466	1	3.50E-05	3.50E-05

2.2 topGO: Using the genes score

基因富集分析 (gene set enrichment analysis, GSEA)

Tests based on gene scores or gene ranks. It includes Kolmogorov-Smirnov like tests (also known as GSEA), Gentleman's Category, t-test, etc. Ackermann and Strimmer (2009).

2.2.1 構(gòu)建 `topGOdata` 對(duì)象

構(gòu)建 geneList

GSEAgeneList <- subset(DEGdf, select = "adj.P.Val")
GSEAgeneList <- as.numeric(GSEAgeneList[,1])
names(GSEAgeneList) <- row.names(DEGdf)
str(GSEAgeneList)
#  Named num [1:22283] 3.25e-05 9.79e-05 4.61e-04 6.98e-04 1.34e-03 ...
#  - attr(*, "names")= chr [1:22283] "209835_x_at" "212014_x_at" "204490_s_at" "209380_s_at" ...

通過(guò)函數(shù) topDiffGenes() 篩選q值小于0.01的基因。

topDiffGenes <- function(allScore) {
  return(allScore < 0.01)
}
length(GSEAgeneList)
# [1] 22283
sum(topDiffGenes(GSEAgeneList))
# [1] 50

構(gòu)建 topGOdata 對(duì)象：

GSEAGOdata <- new("topGOdata",
                  description = "GO analysis of GSE2034 data; free vs relapses",
                  ontology = "BP",
                  allGenes = GSEAgeneList,
                  geneSel = topDiffGenes,
                  nodeSize = 5,
                  annot = annFUN.db,
                  affyLib = "hgu133a.db")
# 
# Building most specific GOs .....
#   ( 11302 GO terms found. )
# 
# Build GO DAG topology ..........
#   ( 15326 GO terms and 36394 relations. )
# 
# Annotating nodes ...............
#   ( 19375 genes annotated to the GO terms. )

GSEAGOdata
# ------------------------- topGOdata object -------------------------
# 
#  Description:
#    -  GO analysis of GSE2034 data; free vs relapses 
# 
#  Ontology:
#    -  BP 
# 
#  22283 available genes (all genes from the array):
#    - symbol:  209835_x_at 212014_x_at 204490_s_at 209380_s_at 209831_x_at  ...
#    - score :  3.25364921e-05 9.78850462e-05 0.000460980422 0.000698385002 0.0013384333  ...
#    - 50  significant genes. 
# 
#  19375 feasible genes (genes that can be used in the analysis):
#    - symbol:  209835_x_at 212014_x_at 204490_s_at 209380_s_at 209831_x_at  ...
#    - score :  3.25364921e-05 9.78850462e-05 0.000460980422 0.000698385002 0.0013384333  ...
#    - 47  significant genes. 
# 
#  GO graph (nodes with at least  5  genes):
#    - a graph with directed edges
#    - number of nodes = 10532 
#    - number of edges = 24696 
# 
# ------------------------- topGOdata object -------------------------
#

2.2.2 富集分析

2.2.3 Summary

GenTable 導(dǎo)出表格

未完不更

References

https://davetang.org/muse/2018/05/15/making-a-heatmap-in-r-with-the-pheatmap-package/

最后，向大家隆重推薦生信技能樹(shù)的一系列干貨！

生信技能樹(shù)全球公益巡講：https://mp.weixin.qq.com/s/E9ykuIbc-2Ja9HOY0bn_6g
B站公益74小時(shí)生信工程師教學(xué)視頻合輯：https://mp.weixin.qq.com/s/IyFK7l_WBAiUgqQi8O7Hxw
招學(xué)徒：https://mp.weixin.qq.com/s/KgbilzXnFjbKKunuw7NVfw

三个男躁一个女,国精产品一区一手机的秘密,麦子交换系列最经典十句话,欧美国产综合欧美视频

從GEO數(shù)據(jù)下載到limma差異分析，再到topGO富集分析

從GEO數(shù)據(jù)下載到limma差異分析，再到topGO富集分析

GSE2034 topGO

1. DEG

1.1 下載GEO數(shù)據(jù)

1.2 表達(dá)矩陣

1.3 設(shè)計(jì)矩陣和對(duì)比矩陣

1.4 差異分析

1.5 火山圖

1.6 PCA圖

1.7 熱圖

2. topGO

2.1 topGO: Predefined list of interesting genes

2.1.1 構(gòu)建 `topGOdata` 對(duì)象

2.1.2 富集分析

2.1.3 Summary

2.2 topGO: Using the genes score

2.2.1 構(gòu)建 `topGOdata` 對(duì)象

2.2.2 富集分析

2.2.3 Summary

References

最后，向大家隆重推薦生信技能樹(shù)的一系列干貨！

推薦閱讀更多精彩內(nèi)容

三个男躁一个女,国精产品一区一手机的秘密,麦子交换系列最经典十句话,欧美 国产 综合 欧美 视频

從GEO數(shù)據(jù)下載到limma差異分析，再到topGO富集分析

GSE2034 topGO

1. DEG

1.1 下載GEO數(shù)據(jù)

1.2 表達(dá)矩陣

1.3 設(shè)計(jì)矩陣和對(duì)比矩陣

1.4 差異分析

1.5 火山圖

1.6 PCA圖

1.7 熱圖

2. topGO

2.1 topGO: Predefined list of interesting genes

2.1.1 構(gòu)建 topGOdata 對(duì)象

2.1.2 富集分析

2.1.3 Summary

2.2 topGO: Using the genes score

2.2.1 構(gòu)建 topGOdata 對(duì)象

2.2.2 富集分析

2.2.3 Summary

References

最后，向大家隆重推薦生信技能樹(shù)的一系列干貨！

推薦閱讀更多精彩內(nèi)容

三个男躁一个女,国精产品一区一手机的秘密,麦子交换系列最经典十句话,欧美国产综合欧美视频

2.1.1 構(gòu)建 `topGOdata` 對(duì)象

2.2.1 構(gòu)建 `topGOdata` 對(duì)象