數據量級:去重前 415816 條,去重後 221436 條
第1種 distinct
-- Approach 1: count the unique uid values directly with COUNT(DISTINCT ...),
-- restricted to rows where dt = '20191205' and event = 'start'.
SELECT
    COUNT(DISTINCT uid)
FROM tableA
WHERE event = 'start'
    AND dt = '20191205';
--29.379s
第2種 group by
-- Approach 2: collapse the filtered rows to one row per uid with GROUP BY,
-- then count the resulting groups.
-- collect_set(col)[0] takes element 0 of the set of values collected for each
-- uid; these two columns are carried along but not read by the outer COUNT(*).
SELECT
    COUNT(*)
FROM (
    SELECT
        uid,
        collect_set(model)[0],
        collect_set(os)[0]
    FROM tableA
    WHERE event = 'start'
        AND dt = '20191205'
    GROUP BY
        uid
) per_uid;
--25.239s
第3種 row_number()
-- Approach 3: number the filtered rows within each uid using row_number(),
-- keep only the row numbered 1 per uid, and count what remains.
-- The window orders by uid, which is constant inside a uid partition, so the
-- choice of which row receives number 1 is arbitrary; the count is unaffected.
SELECT
    COUNT(*)
FROM (
    SELECT
        uid,
        row_number() OVER (PARTITION BY uid ORDER BY uid) AS row_num
    FROM tableA
    WHERE event = 'start'
        AND dt = '20191205'
) numbered
WHERE row_num = 1;
-- 25.162s
-- 25.154s
加不加 DESC:除了結果順序不同,對查詢效率是否有影響?
-- Variant of the row_number() approach with a descending window order
-- (ORDER BY uid DESC): number the filtered rows within each uid, keep only
-- the row numbered 1 per uid, and count what remains.
SELECT
    COUNT(*)
FROM (
    SELECT
        uid,
        row_number() OVER (PARTITION BY uid ORDER BY uid DESC) AS row_num
    FROM tableA
    WHERE event = 'start'
        AND dt = '20191205'
) numbered
WHERE row_num = 1;
-- 24.436s
-- 25.435s
-- 24.114s
總結:
- distinct使用起來雖然簡單,但是效率不及 group by 和 row_number()。
- 當數據量非常大時,尤其達到百萬級及以上,應優先使用後兩種去重方式。
參考: