R基礎概念和困惑點
R基礎概念和困惑點
公眾號鏈接: http://mp.weixin.qq.com/s/FU5lw29HCVe0dcKcz0ZLgA
插播一點R的基礎概念和疑難解釋。
R的交互式運行
在bash命令行下輸入大寫字母R即可啟動交互式界面
ct@ehbio:~$ R
R version 3.3.1 (2016-06-21) -- "Bug in Your Hair"
Copyright (C) 2016 The R Foundation for Statistical Computing
Platform: x86_64-redhat-linux-gnu (64-bit)
用'help.start()'通過HTML瀏覽器來看幫助文件。
用'q()'退出R.
> 1:3
[1] 1 2 3
> a <- 1:3
> a
[1] 1 2 3
# 使用source在交互式界面運行寫好的R腳本
> source('script.r')
> quit()
Save workspace image? [y/n/c]: n
# ctrl+d也可退出R
R基本語法
獲取幫助文檔,查看命令或函數的使用方法、事例或適用范圍
> ?command
> ??command #深度搜索或模糊搜索此命令
> example(command) #得到命令的例子
R中的變量
> # 數字變量
> a <- 10
> a
[1] 10
>
> # 字符串變量
> a <- "abc"
> a
[1] "abc"
>
> # 邏輯變量
> a <- TRUE
>
> a
[1] TRUE
>
> b <- T
>
> b
[1] TRUE
>
> d <- FALSE
>
> d
[1] FALSE
> # 向量
>
> a <- vector(mode="logical", length=5)
> a
[1] FALSE FALSE FALSE FALSE FALSE
>
> a <- c(1,2,3,4)
# 判斷一個變量是不是vector
> is.vector(a)
[1] TRUE
>
> # 矩陣
>
> a <- matrix(1:20,nrow=5,ncol=4,byrow=T)
> a
[,1] [,2] [,3] [,4]
[1,] 1 2 3 4
[2,] 5 6 7 8
[3,] 9 10 11 12
[4,] 13 14 15 16
[5,] 17 18 19 20
>
> is.matrix(a)
[1] TRUE
>
> dim(a) #查看或設置數組的維度向量
[1] 5 4
>
> # 錯誤的用法
> dim(a) <- c(4,4)
Error in dim(a) <- c(4, 4) : dims [product 16]與對象長度[20]不匹配
>
> # 正確的用法
> a <- 1:20
> dim(a) <- c(5,4) #轉換向量為矩陣
> a
[,1] [,2] [,3] [,4]
[1,] 1 6 11 16
[2,] 2 7 12 17
[3,] 3 8 13 18
[4,] 4 9 14 19
[5,] 5 10 15 20
>
> print(paste("矩陣a的行數", nrow(a)))
[1] "矩陣a的行數 5"
> print(paste("矩陣a的列數", ncol(a)))
[1] "矩陣a的列數 4"
>
> #查看或設置行列名
> rownames(a)
NULL
> rownames(a) <- c('a','b','c','d','e')
> a
[,1] [,2] [,3] [,4]
a 1 6 11 16
b 2 7 12 17
c 3 8 13 18
d 4 9 14 19
e 5 10 15 20
# R中獲取一系列的字母
> letters[1:4]
[1] "a" "b" "c" "d"
> colnames(a) <- letters[1:4]
> a
a b c d
a 1 6 11 16
b 2 7 12 17
c 3 8 13 18
d 4 9 14 19
e 5 10 15 20
>
# is系列和as系列函數用來判斷變量的屬性和轉換變量的屬性
# 矩陣轉換為data.frame
> is.character(a)
[1] FALSE
> is.numeric(a)
[1] TRUE
> is.matrix(a)
[1] TRUE
> is.data.frame(a)
[1] FALSE
> is.data.frame(as.data.frame(a))
[1] TRUE
R中矩陣運算
# 數據產生
# rnorm(n, mean = 0, sd = 1) 正態分布的隨機數
# runif(n, min = 0, max = 1) 均勻分布的隨機數
# rep(1,5) 把1重復5次
# scale(1:5) 標準化數據
> a <- c(rnorm(5), rnorm(5,1), runif(5), runif(5,-1,1), 1:5, rep(0,5), c(2,10,11,13,4), scale(1:5)[1:5])
> a
[1] -0.41253556 0.12192929 -0.47635888 -0.97171653 1.09162243 1.87789657
[7] -0.11717937 2.92953522 1.33836620 -0.03269026 0.87540920 0.13005744
[13] 0.11900686 0.76663940 0.28407356 -0.91251181 0.17997973 0.50452258
[19] 0.25961316 -0.58052230 1.00000000 2.00000000 3.00000000 4.00000000
[25] 5.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
[31] 2.00000000 10.00000000 11.00000000 13.00000000 4.00000000 -1.26491106
[37] -0.63245553 0.00000000 0.63245553 1.26491106
> a <- matrix(a, ncol=5, byrow=T)
> a
[,1] [,2] [,3] [,4] [,5]
[1,] -0.4125356 0.1219293 -0.4763589 -0.9717165 1.09162243
[2,] 1.8778966 -0.1171794 2.9295352 1.3383662 -0.03269026
[3,] 0.8754092 0.1300574 0.1190069 0.7666394 0.28407356
[4,] -0.9125118 0.1799797 0.5045226 0.2596132 -0.58052230
[5,] 1.0000000 2.0000000 3.0000000 4.0000000 5.00000000
[6,] 0.0000000 0.0000000 0.0000000 0.0000000 0.00000000
[7,] 2.0000000 10.0000000 11.0000000 13.0000000 4.00000000
[8,] -1.2649111 -0.6324555 0.0000000 0.6324555 1.26491106
# 求行的加和
> rowSums(a)
[1] -0.6470593 5.9959284 2.1751865 -0.5489186 15.0000000 0.0000000 40.0000000
[8] 0.0000000
## 注意檢查括號的配對
> a <- a[rowSums(abs(a)!=0,]
錯誤: 意外的']' in "a <- a[rowSums(abs(a)!=0,]"
# 去除全部為0的行
> a <- a[rowSums(abs(a))!=0,]
# 另外一種方式去除全部為0的行
> #a[rowSums(a==0)<ncol(a),]
> a
[,1] [,2] [,3] [,4] [,5]
[1,] -0.4125356 0.1219293 -0.4763589 -0.9717165 1.09162243
[2,] 1.8778966 -0.1171794 2.9295352 1.3383662 -0.03269026
[3,] 0.8754092 0.1300574 0.1190069 0.7666394 0.28407356
[4,] -0.9125118 0.1799797 0.5045226 0.2596132 -0.58052230
[5,] 1.0000000 2.0000000 3.0000000 4.0000000 5.00000000
[6,] 2.0000000 10.0000000 11.0000000 13.0000000 4.00000000
[7,] -1.2649111 -0.6324555 0.0000000 0.6324555 1.26491106
# 矩陣運算,R默認針對整個數據進行常見運算
# 所有值都乘以2
> a * 2
[,1] [,2] [,3] [,4] [,5]
[1,] -0.8250711 0.2438586 -0.9527178 -1.9434331 2.18324487
[2,] 3.7557931 -0.2343587 5.8590704 2.6767324 -0.06538051
[3,] 1.7508184 0.2601149 0.2380137 1.5332788 0.56814712
[4,] -1.8250236 0.3599595 1.0090452 0.5192263 -1.16104460
[5,] 2.0000000 4.0000000 6.0000000 8.0000000 10.00000000
[6,] 4.0000000 20.0000000 22.0000000 26.0000000 8.00000000
[7,] -2.5298221 -1.2649111 0.0000000 1.2649111 2.52982213
# 所有值取絕對值,再取對數 (取對數前一般加一個數避免對0或負值取對數)
> log2(abs(a)+1)
[,1] [,2] [,3] [,4] [,5]
[1,] 0.4982872 0.1659818 0.5620435 0.9794522 1.0646224
[2,] 1.5250147 0.1598608 1.9743587 1.2255009 0.0464076
[3,] 0.9072054 0.1763961 0.1622189 0.8210076 0.3607278
[4,] 0.9354687 0.2387621 0.5893058 0.3329807 0.6604014
[5,] 1.0000000 1.5849625 2.0000000 2.3219281 2.5849625
[6,] 1.5849625 3.4594316 3.5849625 3.8073549 2.3219281
[7,] 1.1794544 0.7070437 0.0000000 0.7070437 1.1794544
# 取出最大值、最小值、行數、列數
> max(a)
[1] 13
> min(a)
[1] -1.264911
> nrow(a)
[1] 7
> ncol(a)
[1] 5
# 增加一列或一行
# cbind: column bind
> cbind(a, 1:7)
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] -0.4125356 0.1219293 -0.4763589 -0.9717165 1.09162243 1
[2,] 1.8778966 -0.1171794 2.9295352 1.3383662 -0.03269026 2
[3,] 0.8754092 0.1300574 0.1190069 0.7666394 0.28407356 3
[4,] -0.9125118 0.1799797 0.5045226 0.2596132 -0.58052230 4
[5,] 1.0000000 2.0000000 3.0000000 4.0000000 5.00000000 5
[6,] 2.0000000 10.0000000 11.0000000 13.0000000 4.00000000 6
[7,] -1.2649111 -0.6324555 0.0000000 0.6324555 1.26491106 7
> cbind(a, seven=1:7)
seven
[1,] -0.4125356 0.1219293 -0.4763589 -0.9717165 1.09162243 1
[2,] 1.8778966 -0.1171794 2.9295352 1.3383662 -0.03269026 2
[3,] 0.8754092 0.1300574 0.1190069 0.7666394 0.28407356 3
[4,] -0.9125118 0.1799797 0.5045226 0.2596132 -0.58052230 4
[5,] 1.0000000 2.0000000 3.0000000 4.0000000 5.00000000 5
[6,] 2.0000000 10.0000000 11.0000000 13.0000000 4.00000000 6
[7,] -1.2649111 -0.6324555 0.0000000 0.6324555 1.26491106 7
# rbind: row bind
> rbind(a,1:5)
[,1] [,2] [,3] [,4] [,5]
[1,] -0.4125356 0.1219293 -0.4763589 -0.9717165 1.09162243
[2,] 1.8778966 -0.1171794 2.9295352 1.3383662 -0.03269026
[3,] 0.8754092 0.1300574 0.1190069 0.7666394 0.28407356
[4,] -0.9125118 0.1799797 0.5045226 0.2596132 -0.58052230
[5,] 1.0000000 2.0000000 3.0000000 4.0000000 5.00000000
[6,] 2.0000000 10.0000000 11.0000000 13.0000000 4.00000000
[7,] -1.2649111 -0.6324555 0.0000000 0.6324555 1.26491106
[8,] 1.0000000 2.0000000 3.0000000 4.0000000 5.00000000
# 計算每一行的mad (中值絕對偏差,一般認為比方差的魯棒性更強,更少受異常值的影響,更能反映數據間的差異)
> apply(a,1,mad)
[1] 0.7923976 2.0327283 0.2447279 0.4811672 1.4826000 4.4478000 0.9376786
# 計算每一行的var (方差)
# apply表示對數據(第一個參數)的每一行 (第二個參數賦值為1) 或每一列 (2)操作
# 最后返回一個向量 (若每次運算返回多個等長的值則為矩陣,長度不一致時才返回列表)
> apply(a,1,var)
[1] 0.6160264 1.6811161 0.1298913 0.3659391 2.5000000 22.5000000 1.0000000
# 計算每一列的平均值
> apply(a,2,mean)
[1] 0.4519068 1.6689045 2.4395294 2.7179083 1.5753421
# 取出中值絕對偏差大于0.5的行
> b = a[apply(a,1,mad)>0.5,]
> b
[,1] [,2] [,3] [,4] [,5]
[1,] -0.4125356 0.1219293 -0.4763589 -0.9717165 1.09162243
[2,] 1.8778966 -0.1171794 2.9295352 1.3383662 -0.03269026
[3,] 1.0000000 2.0000000 3.0000000 4.0000000 5.00000000
[4,] 2.0000000 10.0000000 11.0000000 13.0000000 4.00000000
[5,] -1.2649111 -0.6324555 0.0000000 0.6324555 1.26491106
# 矩陣按照mad的大小降序排列
> c = b[order(apply(b,1,mad), decreasing=T),]
> c
[,1] [,2] [,3] [,4] [,5]
[1,] 2.0000000 10.0000000 11.0000000 13.0000000 4.00000000
[2,] 1.8778966 -0.1171794 2.9295352 1.3383662 -0.03269026
[3,] 1.0000000 2.0000000 3.0000000 4.0000000 5.00000000
[4,] -1.2649111 -0.6324555 0.0000000 0.6324555 1.26491106
[5,] -0.4125356 0.1219293 -0.4763589 -0.9717165 1.09162243
> rownames(c) <- paste('Gene', letters[1:5], sep="_")
> colnames(c) <- toupper(letters[1:5])
> c
A B C D E
Gene_a 2.0000000 10.0000000 11.0000000 13.0000000 4.00000000
Gene_b 1.8778966 -0.1171794 2.9295352 1.3383662 -0.03269026
Gene_c 1.0000000 2.0000000 3.0000000 4.0000000 5.00000000
Gene_d -1.2649111 -0.6324555 0.0000000 0.6324555 1.26491106
Gene_e -0.4125356 0.1219293 -0.4763589 -0.9717165 1.09162243
# 矩陣轉置
> expr = t(c)
> expr
Gene_a Gene_b Gene_c Gene_d Gene_e
A 2 1.87789657 1 -1.2649111 -0.4125356
B 10 -0.11717937 2 -0.6324555 0.1219293
C 11 2.92953522 3 0.0000000 -0.4763589
D 13 1.33836620 4 0.6324555 -0.9717165
E 4 -0.03269026 5 1.2649111 1.0916224
# 矩陣值的替換
> expr2 = expr
> expr2[expr2<0] = 0
> expr2
Gene_a Gene_b Gene_c Gene_d Gene_e
A 2 1.877897 1 0.0000000 0.0000000
B 10 0.000000 2 0.0000000 0.1219293
C 11 2.929535 3 0.0000000 0.0000000
D 13 1.338366 4 0.6324555 0.0000000
E 4 0.000000 5 1.2649111 1.0916224
# 矩陣中只針對某一列替換
# expr2是個矩陣不是數據框,不能使用$加列名的方式索引 (矩陣需用 expr2[,"Gene_b"] 的形式)
> expr2[expr2$Gene_b<1, "Gene_b"] <- 1
Error in expr2$Gene_b : $ operator is invalid for atomic vectors
# str是一個最為常用、好用的查看變量信息的工具,尤其是對特別復雜的變量,
# 可以看清其層級結構,便于提取數據
> str(expr2)
num [1:5, 1:5] 2 10 11 13 4 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:5] "A" "B" "C" "D" ...
..$ : chr [1:5] "Gene_a" "Gene_b" "Gene_c" "Gene_d" ...
# 轉換為數據框,再進行相應的操作
> expr2 <- as.data.frame(expr2)
> str(expr2)
'data.frame': 5 obs. of 5 variables:
$ Gene_a: num 2 10 11 13 4
$ Gene_b: num 1.88 0 2.93 1.34 0
$ Gene_c: num 1 2 3 4 5
$ Gene_d: num 0 0 0 0.632 1.265
$ Gene_e: num 0 0.122 0 0 1.092
> expr2[expr2$Gene_b<1, "Gene_b"] <- 1
> expr2
> expr2
Gene_a Gene_b Gene_c Gene_d Gene_e
A 2 1.877897 1 0.0000000 0.0000000
B 10 1.000000 2 0.0000000 0.1219293
C 11 2.929535 3 0.0000000 0.0000000
D 13 1.338366 4 0.6324555 0.0000000
E 4 1.000000 5 1.2649111 1.0916224
R中矩陣篩選合并
# 讀入樣品信息
> sampleInfo = "Samp;Group;Genotype
+ A;Control;WT
+ B;Control;WT
+ D;Treatment;Mutant
+ C;Treatment;Mutant
+ E;Treatment;WT
+ F;Treatment;WT"
> phenoData = read.table(text=sampleInfo,sep=";", header=T, row.names=1, quote="")
> phenoData
Group Genotype
A Control WT
B Control WT
D Treatment Mutant
C Treatment Mutant
E Treatment WT
F Treatment WT
# 把樣品信息按照基因表達矩陣中的樣品信息排序,并只保留有基因表達信息的樣品
# match() returns a vector of the positions of (first) matches of
# its first argument in its second.
> phenoData[match(rownames(expr), rownames(phenoData)),]
Group Genotype
A Control WT
B Control WT
C Treatment Mutant
D Treatment Mutant
E Treatment WT
# ‘%in%’ is a more intuitive interface as a binary operator, which
# returns a logical vector indicating if there is a match or not for
# its left operand.
# 注意順序,%in%比match更好理解一些;但%in%保留phenoData原有的行順序 (D在C之前),不會按expr的行名重新排序
> phenoData = phenoData[rownames(phenoData) %in% rownames(expr),]
> phenoData
Group Genotype
A Control WT
B Control WT
D Treatment Mutant
C Treatment Mutant
E Treatment WT
# 合并矩陣
# by=0 表示按照行的名字匹配合并 (結果默認按該鍵排序)
# by=columnname 表示按照共有的某一列匹配合并
# 合并后多出了新的一列Row.names
> merge_data = merge(expr, phenoData, by=0, all.x=T)
> merge_data
Row.names Gene_a Gene_b Gene_c Gene_d Gene_e Group Genotype
1 A 2 1.87789657 1 -1.2649111 -0.4125356 Control WT
2 B 10 -0.11717937 2 -0.6324555 0.1219293 Control WT
3 C 11 2.92953522 3 0.0000000 -0.4763589 Treatment Mutant
4 D 13 1.33836620 4 0.6324555 -0.9717165 Treatment Mutant
5 E 4 -0.03269026 5 1.2649111 1.0916224 Treatment WT
> rownames(merge_data) <- merge_data$Row.names
> merge_data
Row.names Gene_a Gene_b Gene_c Gene_d Gene_e Group Genotype
A A 2 1.87789657 1 -1.2649111 -0.4125356 Control WT
B B 10 -0.11717937 2 -0.6324555 0.1219293 Control WT
C C 11 2.92953522 3 0.0000000 -0.4763589 Treatment Mutant
D D 13 1.33836620 4 0.6324555 -0.9717165 Treatment Mutant
E E 4 -0.03269026 5 1.2649111 1.0916224 Treatment WT
# 去除一列;-1表示去除第一列
> merge_data = merge_data[,-1]
> merge_data
Gene_a Gene_b Gene_c Gene_d Gene_e Group Genotype
A 2 1.87789657 1 -1.2649111 -0.4125356 Control WT
B 10 -0.11717937 2 -0.6324555 0.1219293 Control WT
C 11 2.92953522 3 0.0000000 -0.4763589 Treatment Mutant
D 13 1.33836620 4 0.6324555 -0.9717165 Treatment Mutant
E 4 -0.03269026 5 1.2649111 1.0916224 Treatment WT
# 提取出所有的數值列
> merge_data[sapply(merge_data, is.numeric)]
Gene_a Gene_b Gene_c Gene_d Gene_e
A 2 1.87789657 1 -1.2649111 -0.4125356
B 10 -0.11717937 2 -0.6324555 0.1219293
C 11 2.92953522 3 0.0000000 -0.4763589
D 13 1.33836620 4 0.6324555 -0.9717165
E 4 -0.03269026 5 1.2649111 1.0916224
str的應用
str: Compactly display the internal structure of an R object, a
diagnostic function and an alternative to 'summary' (and to some
extent, 'dput'). Ideally, only one line for each 'basic'
structure is displayed. It is especially well suited to compactly
display the (abbreviated) contents of (possibly nested) lists.
The idea is to give reasonable output for any R object. It
calls ‘args’ for (non-primitive) function objects.
str用來告訴結果的構成方式,對于不少Bioconductor的包,或者復雜的R函數的輸出,都是一堆列表的嵌套,str(complex_result)會輸出每個列表的名字,方便提取對應的信息。
# str的一個應用例子
> str(list(a = "A", L = as.list(1:100)), list.len = 9)
List of 2
$ a: chr "A"
$ L:List of 100
..$ : int 1
..$ : int 2
..$ : int 3
..$ : int 4
..$ : int 5
..$ : int 6
..$ : int 7
..$ : int 8
..$ : int 9
.. [list output truncated]
# 利用str查看pca的結果,具體的PCA應用查看http://mp.weixin.qq.com/s/sRElBMkyR9rGa4TQp9KjNQ
> pca_result <- prcomp(expr)
> pca_result
Standard deviations:
[1] 4.769900e+00 1.790861e+00 1.072560e+00 1.578391e-01 2.752128e-16
Rotation:
PC1 PC2 PC3 PC4 PC5
Gene_a 0.99422750 -0.02965529 0.078809521 0.01444655 0.06490461
Gene_b 0.04824368 -0.44384942 -0.885305329 0.03127940 0.12619948
Gene_c 0.08258192 0.81118590 -0.451360828 0.05440417 -0.35842886
Gene_d -0.01936958 0.30237826 -0.079325524 -0.66399283 0.67897952
Gene_e -0.04460135 0.22948437 -0.002097256 0.74496081 0.62480128
> str(pca_result)
List of 5
$ sdev : num [1:5] 4.77 1.79 1.07 1.58e-01 2.75e-16
$ rotation: num [1:5, 1:5] 0.9942 0.0482 0.0826 -0.0194 -0.0446 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:5] "Gene_a" "Gene_b" "Gene_c" "Gene_d" ...
.. ..$ : chr [1:5] "PC1" "PC2" "PC3" "PC4" ...
$ center : Named num [1:5] 8 1.229 3 0.379 0.243
..- attr(*, "names")= chr [1:5] "Gene_a" "Gene_b" "Gene_c" "Gene_d" ...
$ scale : logi FALSE
$ x : num [1:5, 1:5] -6.08 1.86 3.08 5.06 -3.93 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:5] "A" "B" "C" "D" ...
.. ..$ : chr [1:5] "PC1" "PC2" "PC3" "PC4" ...
- attr(*, "class")= chr "prcomp"
# 取出每個主成分的標準差 (其平方為該主成分解釋的方差)
> pca_result$sdev
[1] 4.769900e+00 1.790861e+00 1.072560e+00 1.578391e-01 2.752128e-16
R的包管理
# 什么時候需要安裝包
> library('unExistedPackage')
Error in library("unExistedPackage") :
不存在叫‘unExistedPackage’這個名字的程輯包
# 安裝包
> install.packages("package_name")
# 指定安裝來源
> install.packages("package_name", repos="http://cran.us.r-project.org")
# 安裝Bioconductor的包
> source('https://bioconductor.org/biocLite.R')
> biocLite('BiocInstaller')
> biocLite(c("RUVSeq","pcaMethods"))
# 安裝Github的R包
> install.packages("devtools")
> devtools::install_github("JustinaZ/pcaReduce")
# 手動安裝, 首先下載包的源文件(壓縮版就可),然后在終端運行下面的命令。
ct@ehbio:~$ R CMD INSTALL package.tar.gz
# 移除包
>remove.packages("package_name")
# 查看所有安裝的包
>library()
# 查看特定安裝包的版本
> installed.packages()[c("DESeq2"), c("Package", "Version")]
Package Version
"DESeq2" "1.14.1"
>
# 查看默認安裝包的位置
>.libPaths()
# 調用安裝的包
>library(package_name)
#devtools::install_github("hms-dbmi/scde", build_vignettes = FALSE)
#install.packages(c("mvoutlier","ROCR"))
#biocLite(c("RUVSeq","pcaMethods","SC3","TSCAN","monocle","MultiAssayExperiment","SummarizedExperiment"))
#devtools::install_github("satijalab/seurat")