Daily Top 3 Hot Search Keywords Statistics Demo
1. Data format:
date  user  keyword  city  platform  version
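For example, a single log line might look like this (tab-separated; the values are made-up sample data):
2015-10-01	leo	barbecue	beijing	android	1.0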
2. Requirements:
1. Filter the data, keeping only records that match the query conditions (city, platform, version)
2. Compute each day's top 3 search keywords, ranked by search UV
3. Sort the days by the total UV of their top 3 keywords, in descending order
4. Save the results to a Hive table
3. Implementation approach:
- 1. From the raw data (an HDFS file), create the input RDD.
- 2. Use the filter operator on the input RDD to keep only the records that match the query conditions.
- 2.1 The naive approach: reference the external query-condition Map directly inside the filter function. The problem is that a copy of the Map is then shipped to every task, which hurts performance.
- 2.2 The optimized approach: wrap the query conditions in a Broadcast variable and read that variable inside the filter operator, so each node only holds one copy.
- 3. Map the data into the format "(date_keyword, user)", group by key, then map again: deduplicate the users who searched each keyword on each day and count them. That count is the keyword's daily UV, yielding "(date_keyword, uv)".
- 4. Map the per-day, per-keyword UV RDD into an RDD of Row elements, then convert that RDD into a DataFrame.
- 5. Register the DataFrame as a temporary table and use a Spark SQL window function to find, for each day, the top 3 keywords by UV along with their UV counts; the result is again a DataFrame.
- 6. Convert the DataFrame back to an RDD, group it by date, and map each group to compute the total UV of that day's top 3 keywords; use the total UV as the key, and concatenate the date plus the top 3 keywords and their counts into a single string as the value.
- 7. Sort by each day's total top 3 UV, in descending order.
- 8. Map the sorted data back into the "(date, keyword, uv)" row format.
- 9. Convert to a DataFrame once more and save the data into Hive.
package cn.spark.study.sql;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
/**
 * Daily top 3 hot search keywords statistics example
 */
public class DailyTop3Keyword {
@SuppressWarnings("deprecation")
public static void main(String[] args) {
SparkConf conf = new SparkConf()
.setAppName("DailyTop3Keyword");
JavaSparkContext sc = new JavaSparkContext(conf);
HiveContext sqlContext = new HiveContext(sc.sc());
// Fake a set of query conditions here
// Note: in a real job, these query conditions would be inserted into a MySQL table by a J2EE platform,
// and this job would fetch them from that table via Spring and an ORM framework (MyBatis)
Map<String, List<String>> queryParamMap = new HashMap<String, List<String>>();
queryParamMap.put("city", Arrays.asList("beijing"));
queryParamMap.put("platform", Arrays.asList("android"));
queryParamMap.put("version", Arrays.asList("1.0", "1.2", "1.5", "2.0"));
// As analyzed in the implementation approach, the best option here
// is to wrap the query-parameter Map in a Broadcast variable
// as an optimization: each node then only needs to hold one copy of the data
final Broadcast<Map<String, List<String>>> queryParamMapBroadcast =
sc.broadcast(queryParamMap);
// 1. Create the input RDD from the log file on HDFS
JavaRDD<String> rawRDD = sc.textFile("hdfs://spark1:9000/spark-study/keyword.txt");
// 2. Filter with the broadcast query-parameter Map
JavaRDD<String> filterRDD = rawRDD.filter(new Function<String, Boolean>() {
private static final long serialVersionUID = 1L;
@Override
public Boolean call(String log) throws Exception {
// Split the raw log line and extract city, platform, and version
String[] logSplited = log.split("\t");
String city = logSplited[3];
String platform = logSplited[4];
String version = logSplited[5];
// Compare against the query conditions: for each condition, if it is set
// and the value in the log does not satisfy it, return false to drop the line;
// otherwise, if the log satisfies every condition that is set, return true to keep it
Map<String, List<String>> queryParamMap = queryParamMapBroadcast.value();
List<String> cities = queryParamMap.get("city");
if(cities.size() > 0 && !cities.contains(city)) {
return false;
}
List<String> platforms = queryParamMap.get("platform");
if(platforms.size() > 0 && !platforms.contains(platform)) {
return false;
}
List<String> versions = queryParamMap.get("version");
if(versions.size() > 0 && !versions.contains(version)) {
return false;
}
return true;
}
});
// 3. Map the filtered logs into the format (date_keyword, user)
JavaPairRDD<String, String> dateKeywordUserRDD = filterRDD.mapToPair(
new PairFunction<String, String, String>() {
private static final long serialVersionUID = 1L;
@Override
public Tuple2<String, String> call(String log) throws Exception {
String[] logSplited = log.split("\t");
String date = logSplited[0];
String user = logSplited[1];
String keyword = logSplited[2];
return new Tuple2<String, String>(date + "_" + keyword, user);
}
});
// Group by key to get, for each keyword on each day, the users who searched it (not yet deduplicated)
JavaPairRDD<String, Iterable<String>> dateKeywordUsersRDD = dateKeywordUserRDD.groupByKey();
// Deduplicate the search users for each keyword on each day to obtain its UV
JavaPairRDD<String, Long> dateKeywordUvRDD = dateKeywordUsersRDD.mapToPair(
new PairFunction<Tuple2<String,Iterable<String>>, String, Long>() {
private static final long serialVersionUID = 1L;
@Override
public Tuple2<String, Long> call(
Tuple2<String, Iterable<String>> dateKeywordUsers) throws Exception {
String dateKeyword = dateKeywordUsers._1;
Iterator<String> users = dateKeywordUsers._2.iterator();
// Deduplicate the users with a Set and count the distinct ones
// (a HashSet avoids the O(n^2) cost of List.contains on large groups)
Set<String> distinctUsers = new HashSet<String>();
while(users.hasNext()) {
distinctUsers.add(users.next());
}
// The number of distinct users is the UV
long uv = distinctUsers.size();
return new Tuple2<String, Long>(dateKeyword, uv);
}
});
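// For illustration (made-up sample data): if on 2015-10-01 the keyword "barbecue"
// was searched by users [leo, leo, tom], groupByKey yields
// ("2015-10-01_barbecue", [leo, leo, tom]), and after deduplication uv = 2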
// 4. Convert the per-day, per-keyword UV data into a DataFrame
JavaRDD<Row> dateKeywordUvRowRDD = dateKeywordUvRDD.map(
new Function<Tuple2<String,Long>, Row>() {
private static final long serialVersionUID = 1L;
@Override
public Row call(Tuple2<String, Long> dateKeywordUv) throws Exception {
String date = dateKeywordUv._1.split("_")[0];
String keyword = dateKeywordUv._1.split("_")[1];
long uv = dateKeywordUv._2;
return RowFactory.create(date, keyword, uv);
}
});
List<StructField> structFields = Arrays.asList(
DataTypes.createStructField("date", DataTypes.StringType, true),
DataTypes.createStructField("keyword", DataTypes.StringType, true),
DataTypes.createStructField("uv", DataTypes.LongType, true));
StructType structType = DataTypes.createStructType(structFields);
// Convert the Row RDD into a DataFrame with the schema above
DataFrame dateKeywordUvDF = sqlContext.createDataFrame(dateKeywordUvRowRDD, structType);
// 5. Use a Spark SQL window function to find each day's top 3 hot search keywords by search UV
dateKeywordUvDF.registerTempTable("daily_keyword_uv");
DataFrame dailyTop3KeywordDF = sqlContext.sql(""
+ "SELECT date,keyword,uv "
+ "FROM ("
+ "SELECT "
+ "date,"
+ "keyword,"
+ "uv,"
+ "row_number() OVER (PARTITION BY date ORDER BY uv DESC) rank "
+ "FROM daily_keyword_uv"
+ ") tmp "
+ "WHERE rank<=3");
// 6. Convert the DataFrame to an RDD, then map it to compute the total UV of each day's top 3 keywords
JavaRDD<Row> dailyTop3KeywordRDD = dailyTop3KeywordDF.javaRDD();
JavaPairRDD<String, String> top3DateKeywordUvRDD = dailyTop3KeywordRDD.mapToPair(
new PairFunction<Row, String, String>() {
private static final long serialVersionUID = 1L;
@Override
public Tuple2<String, String> call(Row row)
throws Exception {
String date = String.valueOf(row.get(0));
String keyword = String.valueOf(row.get(1));
Long uv = Long.valueOf(String.valueOf(row.get(2)));
return new Tuple2<String, String>(date, keyword + "_" + uv);
}
});
JavaPairRDD<String, Iterable<String>> top3DateKeywordsRDD = top3DateKeywordUvRDD.groupByKey();
JavaPairRDD<Long, String> uvDateKeywordsRDD = top3DateKeywordsRDD.mapToPair(
new PairFunction<Tuple2<String,Iterable<String>>, Long, String>() {
private static final long serialVersionUID = 1L;
@Override
public Tuple2<Long, String> call(
Tuple2<String, Iterable<String>> tuple)
throws Exception {
String date = tuple._1;
Long totalUv = 0L;
String dateKeywords = date;
Iterator<String> keywordUvIterator = tuple._2.iterator();
while(keywordUvIterator.hasNext()) {
String keywordUv = keywordUvIterator.next();
Long uv = Long.valueOf(keywordUv.split("_")[1]);
totalUv += uv;
dateKeywords += "," + keywordUv;
}
return new Tuple2<Long, String>(totalUv, dateKeywords);
}
});
// 7. Sort by each day's total search UV, in descending order
JavaPairRDD<Long, String> sortedUvDateKeywordsRDD = uvDateKeywordsRDD.sortByKey(false);
// 8. Map the sorted data back into Rows of (date, keyword, uv)
JavaRDD<Row> sortedRowRDD = sortedUvDateKeywordsRDD.flatMap(
new FlatMapFunction<Tuple2<Long,String>, Row>() {
private static final long serialVersionUID = 1L;
@Override
public Iterable<Row> call(Tuple2<Long, String> tuple)
throws Exception {
String dateKeywords = tuple._2;
String[] dateKeywordsSplited = dateKeywords.split(",");
String date = dateKeywordsSplited[0];
List<Row> rows = new ArrayList<Row>();
// Each remaining element has the form "keyword_uv"; emit one Row per keyword
// (iterate instead of hard-coding three entries, in case a day has fewer than 3 keywords)
for(int i = 1; i < dateKeywordsSplited.length; i++) {
String keyword = dateKeywordsSplited[i].split("_")[0];
long uv = Long.valueOf(dateKeywordsSplited[i].split("_")[1]);
rows.add(RowFactory.create(date, keyword, uv));
}
return rows;
}
});
// 9. Convert the final data into a DataFrame and save it to a Hive table
DataFrame finalDF = sqlContext.createDataFrame(sortedRowRDD, structType);
finalDF.saveAsTable("daily_top3_keyword_uv");
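// Note: DataFrame.saveAsTable is deprecated as of Spark 1.4 (hence the
// @SuppressWarnings above); the equivalent call through the newer writer API
// would be finalDF.write().saveAsTable("daily_top3_keyword_uv");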
sc.close();
}
}
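Finally, to run the job you would package it and submit it with spark-submit. A minimal sketch, assuming a hypothetical jar path and the spark1 host used above as the standalone master (adjust the paths, master URL, and hive-site.xml location for your environment):

/usr/local/spark/bin/spark-submit \
--class cn.spark.study.sql.DailyTop3Keyword \
--master spark://spark1:7077 \
--files /usr/local/hive/conf/hive-site.xml \
/usr/local/spark-study/spark-study.jar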