3. 散點圖
目錄
3. 散點圖
3.1 繪制基本散點圖
3.2 使用點形和顏色屬性進行分組
3.3 使用不同于默認設置的點形
3.4 將連續型變量映射到點的顏色或大小屬性上
3.5 處理圖形重疊
3.6 添加回歸模型擬合線
3.7 根據已有模型向散點圖添加擬合線
3.8 添加來自多個模型的擬合線
3.9 向散點圖添加模型系數
3.10 向散點圖添加邊際地毯
3.11 向散點圖添加標簽
3.12 繪制氣泡圖
3.13 繪制散點圖矩陣往期文章參考書籍
散點圖通常用于刻畫兩個連續型變量之間的關系。繪制散點圖時,數據集中的每一個觀測值都由每個點表示。
3.1 繪制基本散點圖
library(gcookbook)
library(ggplot2)
# 列出我們用到的列
head(heightweight[, c("ageYear", "heightIn")])
> head(heightweight[, c("ageYear", "heightIn")])
ageYear heightIn
1 11.92 56.3
2 12.92 62.3
3 12.75 63.3
4 13.42 59.0
5 15.92 62.5
6 14.25 62.5
ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point()
# shape參數設置點型 size設置點的大小
ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
geom_point(shape=21)
ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
geom_point(size=1.5)
3.2 使用點形和顏色屬性進行分組
head(heightweight[, c("sex", "ageYear", "heightIn")])
> head(heightweight[, c("sex", "ageYear", "heightIn")])
sex ageYear heightIn
1 f 11.92 56.3
2 f 12.92 62.3
3 f 12.75 63.3
4 f 13.42 59.0
5 f 15.92 62.5
6 f 14.25 62.5
ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) +
geom_point()
ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex)) +
geom_point()
# scale_shape_manual()使用其它點形狀
#scale_colour_brewer()使用其它顏色
ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex, colour=sex)) +
geom_point() +
scale_shape_manual(values=c(1,2)) +
scale_colour_brewer(palette="Set1")
3.3 使用不同于默認設置的點形
# 使用點形和填充色屬性分別表示不同變量
hw <- heightweight
# 分組 Categorize into <100 and >=100 groups
hw$weightGroup <- cut(hw$weightLb, breaks=c(-Inf, 100, Inf),
labels=c("< 100", ">= 100"))
# 使用具有顏色和填充色的點形及對應于空值(NA)和填充色的顏色
ggplot(hw, aes(x=ageYear, y=heightIn, shape=sex, fill=weightGroup)) +
geom_point(size=2.5) +
scale_shape_manual(values=c(21, 24)) +
scale_fill_manual(values=c(NA, "black"),
guide=guide_legend(override.aes=list(shape=21)))
3.4 將連續型變量映射到點的顏色或大小屬性上
ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=weightLb)) +
geom_point()
ggplot(heightweight, aes(x=ageYear, y=heightIn, size=weightLb)) +
geom_point()
# 默認點的大小范圍為1-6mm
# scale_size_continuous(range=c(2, 5))修改點的大小范圍
# 將色階設定為由黑至白
ggplot(heightweight, aes(x=weightLb, y=heightIn, fill=ageYear)) +
geom_point(shape=21, size=2.5) +
scale_fill_gradient(low="black", high="white")
# 使用 guide_legend() 函數以離散的圖例代替色階
ggplot(heightweight, aes(x=weightLb, y=heightIn, fill=ageYear)) +
geom_point(shape=21, size=2.5) +
scale_fill_gradient(low="black", high="white", breaks=12:17,
guide=guide_legend())
# 調用scale_size_area()函數使數據點的面積正比于變量值。
ggplot(heightweight, aes(x=ageYear, y=heightIn, size=weightLb, colour=sex)) +
geom_point(alpha=.5) +
scale_size_area() +
scale_colour_brewer(palette="Set1")
3.5 處理圖形重疊
方法:
- 使用半透明的點
- 將數據分箱(bin),并用矩形表示
- 將數據分箱(bin),并用六邊形表示
- 使用箱線圖
sp <- ggplot(diamonds, aes(x=carat, y=price))
sp + geom_point()
# 透明度
sp + geom_point(alpha=.1)
sp + geom_point(alpha=.01)
# stat_bin2d()函數默認分別在x軸和y軸方向上將數據分割為30各組
sp + stat_bin2d()
# bin=50設置箱數,limits參數設定圖例范圍
sp + stat_bin2d(bins=50) +
scale_fill_gradient(low="lightblue", high="red", limits=c(0, 6000))
# stat_binhex()函數使用六邊形分箱
library(hexbin)
sp + stat_binhex() +
scale_fill_gradient(low="lightblue", high="red",
limits=c(0, 8000))
sp + stat_binhex() +
scale_fill_gradient(low="lightblue", high="red",
breaks=c(0, 250, 500, 1000, 2000, 4000, 6000),
limits=c(0, 6000))
sp1 <- ggplot(ChickWeight, aes(x=Time, y=weight))
sp1 + geom_point()
# 調用position_jitter()函數給數據點增加隨機擾動,通過width,height參數調節
sp1 + geom_point(position="jitter")
# 也可以調用 geom_jitter()
sp1 + geom_point(position=position_jitter(width=.5, height=0))
# 箱線圖
sp1 + geom_boxplot(aes(group=Time))
3.6 添加回歸模型擬合線
# 運行stat_smooth()函數并設定 method=lm 即可向散點圖中添加線性回歸擬合線
# 默認情況下 stat_smooth() 函數會為回歸擬合線自動添加95% 的置信域,可以設置 level 參數對置信水平進行調整。設置 se = FALSE, 則不添加置信域
library(gcookbook) # For the data set
sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn))
sp + geom_point() + stat_smooth(method=lm)
# 99% 置信域
sp + geom_point() + stat_smooth(method=lm, level=0.99)
# 沒有置信域
sp + geom_point() + stat_smooth(method=lm, se=FALSE)
# 設置擬合線的顏色
sp + geom_point(colour="grey60") +
stat_smooth(method=lm, se=FALSE, colour="black")
# stat_smooth()函數默認的模型為 loess 曲線
sp + geom_point(colour="grey60") + stat_smooth()
sp + geom_point(colour="grey60") + stat_smooth(method=loess)
# 分組繪制模型擬合線
sps <- ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) +
geom_point() +
scale_colour_brewer(palette="Set1")
sps + geom_smooth()
sps + geom_smooth(method=lm, se=FALSE, fullrange=TRUE)
值得注意的是:loess()
函數只能根據數據對應的x軸的范圍進行預測。如果想基于數據集對擬合線進行外推,必須使用支持外推的函數,比如lm()
,并將fullrange=TRUE
參數傳遞給 stat_smooth()
函數。
3.7 根據已有模型向散點圖添加擬合線
使用 lm()
函數建立一個以 ageYear
為預測變量對 heightIn
進行預測的模型。然后,調用 predict()
函數對 heightIn
進行預測。
model <- lm(heightIn ~ ageYear + I(ageYear^2), heightweight)
model
> model
Call:
lm(formula = heightIn ~ ageYear + I(ageYear^2), data = heightweight)
Coefficients:
(Intercept) ageYear I(ageYear^2)
-10.3136 8.6673 -0.2478
# 創建一個 ageYear 列,并對其進行插值。
xmin <- min(heightweight$ageYear)
xmax <- max(heightweight$ageYear)
predicted <- data.frame(ageYear=seq(xmin, xmax, length.out=100))
# 計算 heightIn 的預測值
predicted$heightIn <- predict(model, predicted)
head(predicted)
> head(predicted)
ageYear heightIn
1 11.58000 56.82624
2 11.63980 57.00047
3 11.69960 57.17294
4 11.75939 57.34363
5 11.81919 57.51255
6 11.87899 57.67969
# 將預測曲線繪制的數據點散點圖上
sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
geom_point(colour="grey40")
sp + geom_line(data=predicted, size=1)
# 應用定義的 predictvals() 函數可以簡化向散點圖添加模型擬合線的過程
predictvals <- function(model, xvar, yvar, xrange=NULL, samples=100, ...) {
if (is.null(xrange)) {
if (any(class(model) %in% c("lm", "glm")))
xrange <- range(model$model[[xvar]])
else if (any(class(model) %in% "loess"))
xrange <- range(model$x)
}
newdata <- data.frame(x = seq(xrange[1], xrange[2], length.out = samples))
names(newdata) <- xvar
newdata[[yvar]] <- predict(model, newdata = newdata, ...)
newdata
}
# 調用lm() 函數和 loess() 函數對數據集建立線性和LOESS模型
modlinear <- lm(heightIn ~ ageYear, heightweight)
modloess <- loess(heightIn ~ ageYear, heightweight)
lm_predicted <- predictvals(modlinear, "ageYear", "heightIn")
loess_predicted <- predictvals(modloess, "ageYear", "heightIn")
ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
geom_point(colour="grey40") +
geom_line(data=lm_predicted, colour="red", size=.8) +
geom_line(data=loess_predicted, colour="blue", size=.8)
3.8 添加來自多個模型的擬合線
根據變量 sex 的水平對 heightweight 數據集進行分組,調用 lm() 函數對每組數據分別建立線性模型,并將模型結果放在一個列表內。隨后,通過下面定義的 make_model() 函數建立模型。
make_model <- function(data) {
lm(heightIn ~ ageYear, data)
}
# 將heighweight 數據集分別切分為男性和女性組并建立模型
ibrary(gcookbook)
library(plyr)
models <- dlply(heightweight, "sex", .fun = make_model)
# 查看兩個lm對象f和m組成的列表
models
> models
$f
Call:
lm(formula = heightIn ~ ageYear, data = data)
Coefficients:
(Intercept) ageYear
43.963 1.209
$m
Call:
lm(formula = heightIn ~ ageYear, data = data)
Coefficients:
(Intercept) ageYear
30.658 2.301
attr(,"split_type")
[1] "data.frame"
attr(,"split_labels")
sex
1 f
2 m
predvals <- ldply(models, .fun=predictvals, xvar="ageYear", yvar="heightIn")
head(predvals)
ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) +
geom_point() + geom_line(data=predvals)
# 設置 xrange 參數使兩組預測線對應的xz軸范圍與整個數據集對應的x軸范圍詳談
predvals <- ldply(models, .fun=predictvals, xvar="ageYear", yvar="heightIn",
xrange=range(heightweight$ageYear))
ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) +
geom_point() + geom_line(data=predvals)
3.9 向散點圖添加模型系數
調用 annotate()
函數在圖形中添加文本。
model <- lm(heightIn ~ ageYear, heightweight)
# 查看模型參數
summary(model)
> summary(model)
Call:
lm(formula = heightIn ~ ageYear, data = heightweight)
Residuals:
Min 1Q Median 3Q Max
-8.3517 -1.9006 0.1378 1.9071 8.3371
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 37.4356 1.8281 20.48 <2e-16 ***
ageYear 1.7483 0.1329 13.15 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.989 on 234 degrees of freedom
Multiple R-squared: 0.4249, Adjusted R-squared: 0.4225
F-statistic: 172.9 on 1 and 234 DF, p-value: < 2.2e-16
pred <- predictvals(model, "ageYear", "heightIn")
sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point() +
geom_line(data=pred)
# x,y參數設置標簽位置
sp + annotate("text", label="r^2=0.42", x=16.5, y=52)
# parse = TRUE 調用R的數學表達式語法
sp + annotate("text", label="r^2 == 0.42", parse = TRUE, x=16.5, y=52)
# 自動生成公式
eqn <- as.character(as.expression(
substitute(italic(y) == a + b * italic(x) * "," ~~ italic(r)^2 ~ "=" ~ r2,
list(a = format(coef(model)[1], digits=3),
b = format(coef(model)[2], digits=3),
r2 = format(summary(model)$r.squared, digits=2)
))))
eqn
parse(text=eqn) # Parsing turns it into an expression
sp + annotate("text", label=eqn, parse=TRUE, x=Inf, y=-Inf, hjust=1.1, vjust=-.5)
3.10 向散點圖添加邊際地毯
# 使用 geom_rug() 函數添加邊際地毯
ggplot(faithful, aes(x=eruptions, y=waiting)) +
geom_point() +
geom_rug()
# 通過向邊際地毯線的位置坐標添加擾動并設定size減小線寬可以減輕邊際地毯線的重疊程度
ggplot(faithful, aes(x=eruptions, y=waiting)) +
geom_point() +
geom_rug(position="jitter", size=.2)
3.11 向散點圖添加標簽
library(gcookbook)
# 以countries數據集為例,對各國醫療保健支出與嬰兒死亡率之間的關系進行可視化
# 選取人均支出大于2000美元的國家的數據子集進行分析
subset(countries, Year==2009 & healthexp>2000)
sp <- ggplot(subset(countries, Year==2009 & healthexp>2000),
aes(x=healthexp, y=infmortality)) +
geom_point()
# annotate()函數指定標簽坐標和標簽文本
sp + annotate("text", x=4350, y=5.4, label="Canada") +
annotate("text", x=7400, y=6.8, label="USA")
# geom_text()函數自動添加數據標簽
sp + geom_text(aes(label=Name), size=4)
調整標簽位置,大家自行嘗試。
# 對標簽的位置進行調整
sp + geom_text(aes(label=Name), size=4, vjust=0)
sp + geom_text(aes(y=infmortality+.1, label=Name), size=4, vjust=0)
sp + geom_text(aes(label=Name), size=4, hjust=0)
sp + geom_text(aes(x=healthexp+100, label=Name), size=4, hjust=0)
如何只對自己想要的數據點添加標簽。
注:有很多人在后臺問我如何在火山圖里給自己想要的基因添加注釋。這里提供了一個思路。
# 新建一個數據
cdat <- subset(countries, Year==2009 & healthexp>2000)
cdat$Name1 <- cdat$Name
# 用%in%運算符找出繪圖時希望抱怨的標簽
idx <- cdat$Name1 %in% c("Canada", "Ireland", "United Kingdom", "United States",
"New Zealand", "Iceland", "Japan", "Luxembourg",
"Netherlands", "Switzerland")
idx
# 根據上面的邏輯向量用 NA 重寫變量 Name1 中的其它取值
cdat$Name1[!idx] <- NA
cdat
ggplot(cdat, aes(x=healthexp, y=infmortality)) +
geom_point() +
geom_text(aes(x=healthexp+100, label=Name1), size=4, hjust=0) +
xlim(2000, 10000)
3.12 繪制氣泡圖
調用 geom_point()
和 scale_size_area()
函數即可繪制氣泡圖。
# 示例數據
library(gcookbook) # For the data set
cdat <- subset(countries, Year==2009 &
Name %in% c("Canada", "Ireland", "United Kingdom", "United States",
"New Zealand", "Iceland", "Japan", "Luxembourg",
"Netherlands", "Switzerland"))
cdat
> cdat
Name Code Year GDP laborrate healthexp infmortality
1733 Canada CAN 2009 39599.04 67.8 4379.761 5.2
4436 Iceland ISL 2009 37972.24 77.5 3130.391 1.7
4691 Ireland IRL 2009 49737.93 63.6 4951.845 3.4
4946 Japan JPN 2009 39456.44 59.5 3321.466 2.4
5864 Luxembourg LUX 2009 106252.24 55.5 8182.855 2.2
7088 Netherlands NLD 2009 48068.35 66.1 5163.740 3.8
7190 New Zealand NZL 2009 29352.45 68.6 2633.625 4.9
9587 Switzerland CHE 2009 63524.65 66.9 7140.729 4.1
10454 United Kingdom GBR 2009 35163.41 62.2 3285.050 4.7
10505 United States USA 2009 45744.56 65.0 7410.163 6.6
p <- ggplot(cdat, aes(x=healthexp, y=infmortality, size=GDP)) +
geom_point(shape=21, colour="black", fill="cornsilk")
# 將GDP 映射給半徑 (scale_size_continuous)
p
# 將GDP 映射給面積
p + scale_size_area(max_size=15)
如果x軸,y軸皆是分類變量,氣泡圖可以用來表示網格上的變量值。
# 對男性組和女性組求和
hec <- HairEyeColor[,,"Male"] + HairEyeColor[,,"Female"]
# 轉化為長格式(long format)
library(reshape2)
hec <- melt(hec, value.name="count")
ggplot(hec, aes(x=Eye, y=Hair)) +
geom_point(aes(size=count), shape=21, colour="black", fill="cornsilk") +
scale_size_area(max_size=20, guide=FALSE) +
geom_text(aes(y=as.numeric(Hair)-sqrt(count)/22, label=count), vjust=1,
colour="grey60", size=4)
3.13 繪制散點圖矩陣
散點圖矩陣是一種對多個變量兩兩之間關系進行可視化的有效方法。pairs()函數可以繪制散點圖矩陣。
注:現在散點圖矩陣有現成的R包(如
GGally_ggpairs
)。以下內容僅供了解。
# 示例數據
library(gcookbook) # For the data set
c2009 <- subset(countries, Year==2009,
select=c(Name, GDP, laborrate, healthexp, infmortality))
head(c2009
> head(c2009)
Name GDP laborrate healthexp infmortality
50 Afghanistan NA 59.8 50.88597 103.2
101 Albania 3772.605 59.5 264.60406 17.2
152 Algeria 4022.199 58.5 267.94653 32.0
203 American Samoa NA NA NA NA
254 Andorra NA NA 3089.63589 3.1
305 Angola 4068.576 81.3 203.80787 99.9
pairs(c2009[,2:5])
# 定義一個panel.cor函數來展示變量兩兩之間的相關系數以代替默認的散點圖
panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y, use="complete.obs"))
txt <- format(c(r, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * (1 + r) / 2)
}
# 定義 panel.hist 函數展示各個變量的直方圖
panel.hist <- function(x, ...) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks
nB <- length(breaks)
y <- h$counts
y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col="white", ...)
}
pairs(c2009[,2:5], upper.panel = panel.cor,
diag.panel = panel.hist,
lower.panel = panel.smooth)
# 線性模型替代lowess 模型
panel.lm <- function (x, y, col = par("col"), bg = NA, pch = par("pch"),
cex = 1, col.smooth = "black", ...) {
points(x, y, pch = pch, col = col, bg = bg, cex = cex)
abline(stats::lm(y ~ x), col = col.smooth, ...)
}
pairs(c2009[,2:5], pch=".",
upper.panel = panel.cor,
diag.panel = panel.hist,
lower.panel = panel.lm)
往期文章
參考書籍
- R Graphics Cookbook, 2nd edition.