- 基本用法
- 正則表達式
- 使用正則表達式的stringr函數
1. 基本用法
- 字符串長度
str_length()
> str_length(c("a", "R for data science", NA))
[1] 1 18 NA
- 字符串組合
str_c()
# 將多個字符串組合為一個字符串
> str_c("x", "y")
[1] "xy"
# 缺失值可傳染
> x <- c("abc", NA)
> str_c("|-", x, "-|")
[1] "|-abc-|" NA
# 使用 str_replace_na() 函數顯示缺失值
> str_c("|-", str_replace_na(x), "-|")
[1] "|-abc-|" "|-NA-|"
# 可自動循環短向量
> str_c("prefix-", c("a", "b", "c"), "-suffix")
[1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"
# 使用 collapse 參數將字符向量合并為字符串
> str_c(c("x", "y", "z"), collapse = ",")
[1] "x,y,z"
- 字符串取子集
str_sub()
# start 和 end 參數給出了提取字符串的位置
> x <- c("apple", "banana", "pear")
> str_sub(x, 1, 3)
[1] "app" "ban" "pea"
# 負數表示從后往前數
> str_sub(x, -3, -1)
[1] "ple" "ana" "ear"
# 通過賦值形式修改字符串
> str_sub(x, 1, 1) <- str_to_upper(str_sub(x, 1, 1))
> x
[1] "Apple" "Banana" "Pear"
- 字符串排序
str_sort()
或str_order()
> x <- c("apple", "eggplant", "banana")
> str_sort(x, locale = "en") # 英語
[1] "apple" "banana" "eggplant"
2. 正則表達式
2.1 特殊匹配
\
在正則表達式中作為轉義字符,并且在字符串中也需要用\
進行轉義。所以正則表達式字符串中需使用\\
匹配轉義字符。
\\\\
對應 \
\\.
對應 .
\\d
對應 \d
2.2 字符類與字符選項
特殊模式可以匹配多個字符
.
匹配除換行符外所有字符
\d
匹配任意數字
\s
匹配任意 空白 字符(如空格、制表符、換行符)
[abc]
匹配a或b或c
[^abc]
匹配除a、b、c外的任何字符
[A-Za-z]
匹配所有大小寫英文字符
- 需要在字符串中對
\
進行轉義,如輸入\\d
創建包含\d
的正則表達式
2.3 錨點
^
從字符串開頭進行匹配
$
從字符串末尾進行匹配
\b
匹配單詞的邊界
2.4 重復
?
0次或1次
+
1次或多次
*
0次或多次
{n}
匹配n次
{n,m}
匹配n次到m次
- 默認將匹配盡量長的字符串, 即貪婪的。在表達式后添加
?
將匹配盡量短的字符串
2.5 分組與回溯引用
\\1
回溯引用,匹配與第一個()
分組所匹配到的相同文本
\\2
回溯引用,匹配與第二個()
分組所匹配到的相同文本
3. 使用正則表達式的stringr函數
3.1 匹配檢測
-
str_detect()
檢測字符向量是否匹配某種模式,返回長度相同的邏輯向量
> x <- c("apple", "banana", "pear")
> str_detect(x, "e")
[1] TRUE FALSE TRUE
邏輯向量中FALSE值為0,TRUE值為1,使得sum() 和
mean()
函數可用作統計
> mean(str_detect(words, pattern = "[aeiou]$"))
[1] 0.2765306
當匹配的邏輯條件復雜時,可通過單個正則表達式或邏輯運算符將多個str_detect()
調用組合
# 如匹配不包含元音字母的單詞
> no_vowels_1 <- !str_detect(words, "[aeiou]")
> no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
> identical(no_vowels_1, no_vowels_2)
[1] TRUE
-
str_subset()
選取出所匹配的子集
> str_subset(words, "x$")
[1] "box" "sex" "six" "tax"
結合filter()
操作,選出某一列中符合匹配模式的字符串
> df <- tibble(
+ word = words,
+ i = seq_along(words)
+ )
> df %>%
+ filter(str_detect(words, "x$"))
# A tibble: 4 x 2
word i
<chr> <int>
1 box 108
2 sex 747
3 six 772
4 tax 841
-
str_count()
返回字符串中匹配的數量
> str_count(x, "a")
[1] 1 3 1
str_count()
與 mutate()
配合使用
# 計算每個單詞中所包含的元音和輔音字母
> df %>%
+ mutate(
+ vowels = str_count(words, "[aeiou]"),
+ consonants = str_count(words, "[^aeiou]")
+ )
# A tibble: 980 x 4
word i vowels consonants
<chr> <int> <int> <int>
1 a 1 1 0
2 able 2 2 2
3 about 3 3 2
4 absolute 4 4 4
5 accept 5 2 4
6 account 6 3 4
7 achieve 7 4 3
8 across 8 2 4
9 act 9 1 2
10 active 10 3 3
# … with 970 more rows
- exercise
# words表格中哪個單詞包含最大比例的元音字母
> df %>%
+ mutate(
+ vowels = str_count(word, "[aeiou]"),
+ length = str_length(word),
+ rate = vowels / length
+ ) %>%
+ arrange(desc(rate))
# A tibble: 980 x 5
word i vowels length rate
<chr> <int> <int> <int> <dbl>
1 a 1 1 1 1
2 area 49 3 4 0.75
3 idea 412 3 4 0.75
4 age 22 2 3 0.667
5 ago 24 2 3 0.667
6 air 26 2 3 0.667
7 die 228 2 3 0.667
8 due 250 2 3 0.667
9 eat 256 2 3 0.667
10 europe 278 4 6 0.667
# … with 970 more rows
3.2 提取匹配內容
str_extract()
提取匹配的實際文本
# 以 stringr::sentences 為例
> head(sentences)
[1] "The birch canoe slid on the smooth planks."
[2] "Glue the sheet to the dark blue background."
[3] "It's easy to tell the depth of a well."
[4] "These days a chicken leg is a rare dish."
[5] "Rice is often served in round bowls."
[6] "The juice of lemons makes fine punch."
# 構建正則表達式 color_match
> colors <- c("red", "orange", "yellow", "green", "blue", "purple")
> color_match <- str_c(colors, collapse = "|")
> color_match
[1] "red|orange|yellow|green|blue|purple"
# 挑選出匹配的句子
> has_color <- str_subset(sentences, color_match)
> length(has_color)
# 提取匹配句子中包含的顏色
> matches <- str_extract(has_color, color_match)
> matches
[1] "blue" "blue" "red" "red" "red" "blue"
[7] "yellow" "red" "red" "green" "red" "red"
[13] "blue" "red" "red" "red" "red" "blue"
[19] "red" "blue" "red" "green" "red" "red"
[25] "red" "red" "red" "red" "green" "red"
[31] "green" "red" "purple" "green" "red" "red"
[37] "red" "red" "red" "blue" "red" "blue"
[43] "red" "red" "red" "red" "green" "green"
[49] "green" "red" "red" "yellow" "red" "orange"
[55] "red" "red" "red"
str_extract()
只提取第一個匹配,可使用str_extract_all()
得到所有匹配
- exercise
# 提取以ing結尾的所有單詞
pattern1 <- "\\b[A-Za-z]+ing\\b"
str_extract(sentences, pattern1)
# 比較:
pattern2 <- "ing\\b"
str_extract(sentences, pattern2)
pattern1
與 pattern2
在 str_subset()
函數中都可以提取出包含ing結尾單詞的句子, 但只有pattern1
可在 str_extract()
中提取出結尾為ing的單詞
3.3 分組匹配
str_match()
可在提取出完整匹配后給出每個獨立分組,結果返回矩陣
e.g. 提取跟在 a 或 the 后面的單詞
# 構建正則表達式,將“單詞”定義為至少有1個非空格字符的字符序列(注意兩個分組之間的空格)
> noun <- "(a|the) ([^ ]+)"
# 提取匹配到的字符串
> has_noun <- sentences %>%
+ str_subset(noun) %>%
+ head(10)
# 提取所匹配的具體內容
> has_noun %>% str_extract(noun)
[1] "the smooth" "the sheet" "the depth" "a chicken"
[5] "the parked" "the sun" "the huge" "the ball"
[9] "the woman" "a helps"
# 提取完整內容并顯示分組的匹配結果
> has_noun %>% str_match(noun)
[,1] [,2] [,3]
[1,] "the smooth" "the" "smooth"
[2,] "the sheet" "the" "sheet"
[3,] "the depth" "the" "depth"
[4,] "a chicken" "a" "chicken"
[5,] "the parked" "the" "parked"
[6,] "the sun" "the" "sun"
[7,] "the huge" "the" "huge"
[8,] "the ball" "the" "ball"
[9,] "the woman" "the" "woman"
[10,] "a helps" "a" "helps"
-
str_extract_all()
提取所有匹配
3.4 替換匹配的內容
str_replace(string, pattern, replacement)
使用固定字符串替換匹配內容
> x <- c("1 apple", "2 pear", "3 banana")
> str_replace(x, pattern = "[aeiou]", replacement = "-")
[1] "1 -pple" "2 p-ar" "3 b-nana"
> str_replace_all(x, "[aeiou]", "-")
[1] "1 -ppl-" "2 p--r" "3 b-n-n-"
# 可同時執行多個替換
> str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
[1] "one apple" "two pear" "three banana"
可利用回溯引用來插入匹配中的分組
> head(sentences)[1:5]
[1] "The birch canoe slid on the smooth planks."
[2] "Glue the sheet to the dark blue background."
[3] "It's easy to tell the depth of a well."
[4] "These days a chicken leg is a rare dish."
[5] "Rice is often served in round bowls."
> sentences %>%
+ str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>%
+ head(5)
[1] "The canoe birch slid on the smooth planks."
[2] "Glue sheet the to the dark blue background."
[3] "It's to easy tell the depth of a well."
[4] "These a days chicken leg is a rare dish."
[5] "Rice often is served in round bowls."
注意正則表達式中的空格!
3.5 拆分
str_split(string, pattern, n = Inf, simplify = FALSE)
默認結果返回列表形式
> sentences %>%
+ head(5) %>%
+ str_split(" ")
[[1]]
[1] "The" "birch" "canoe" "slid" "on" "the"
[7] "smooth" "planks."
[[2]]
[1] "Glue" "the" "sheet" "to"
[5] "the" "dark" "blue" "background."
[[3]]
[1] "It's" "easy" "to" "tell" "the" "depth" "of"
[8] "a" "well."
[[4]]
[1] "These" "days" "a" "chicken" "leg" "is"
[7] "a" "rare" "dish."
[[5]]
[1] "Rice" "is" "often" "served" "in" "round"
[7] "bowls."
# 設置 simplify = TRUE 返回矩陣
> sentences %>%
+ head(5) %>%
+ str_split(" ", simplify = TRUE)
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,] "The" "birch" "canoe" "slid" "on" "the" "smooth"
[2,] "Glue" "the" "sheet" "to" "the" "dark" "blue"
[3,] "It's" "easy" "to" "tell" "the" "depth" "of"
[4,] "These" "days" "a" "chicken" "leg" "is" "a"
[5,] "Rice" "is" "often" "served" "in" "round" "bowls."
[,8] [,9]
[1,] "planks." ""
[2,] "background." ""
[3,] "a" "well."
[4,] "rare" "dish."
[5,] "" ""
可通過 boundary()
函數指定按字符、行、句子或單詞邊界進行拆分
> x <- "This is a sentence. This is another sentence"
> str_split(x, boundary("word"))[[1]]
[1] "This" "is" "a" "sentence" "This"
[6] "is" "another" "sentence"
3.6 定位匹配內容
str_locate(string, pattern)
可以給出每個匹配位置的開始和結束, 結合 str_sub()
函數提取或修改匹配的內容。