When data is stored in Parquet format, files written at different times may have different columns. Spark SQL can merge their schemas at read time, so there is no need to join multiple tables the way you would in a relational database. The key point is sqlContext.read.option("mergeSchema", "true").
package com.spark.sql
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SaveMode}
/**
* Created by Administrator on 2017/3/12.
* Parquet schema merging: merges the schemas of Parquet files written with different columns.
*/
object ParquetSchemaMerge extends App{
val conf = new SparkConf()
.setMaster("local")
.setAppName("ParquetLoadData")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
//import implicit conversions so RDDs of tuples gain toDF
import sqlContext.implicits._
//create the first DataFrame: (name, age)
val studentWithNameAndAge = Array(("Tom", 13), ("Mary", 14))
val studentWithNameAndAgeDF = sc.parallelize(studentWithNameAndAge, 1).toDF("name", "age")
//save as Parquet in append mode
studentWithNameAndAgeDF.write.format("parquet").mode(SaveMode.Append)
.save("E:\\spark\\src\\main\\resources\\student")
//create the second DataFrame: (name, grade)
val studentWithNameAndGrade = Array(("Yangql", "A"), ("Test", "B"))
val studentWithNameAndGradeDF = sc.parallelize(studentWithNameAndGrade).toDF("name", "grade")
studentWithNameAndGradeDF.write.format("parquet").mode(SaveMode.Append)
.save("E:\\spark\\src\\main\\resources\\student")
//read back with mergeSchema enabled, merging the two file schemas into one
val studentsDF = sqlContext.read.option("mergeSchema", "true").parquet("E:\\spark\\src\\main\\resources\\student")
studentsDF.show()
studentsDF.printSchema()
}
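For reference, the merged result contains the union of the two schemas, with null filled in where a file lacks a column. The output below is a sketch of what show() and printSchema() produce on a fresh target directory; exact row and column order may vary by Spark version:

+------+----+-----+
|  name| age|grade|
+------+----+-----+
|   Tom|  13| null|
|  Mary|  14| null|
|Yangql|null|    A|
|  Test|null|    B|
+------+----+-----+

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- grade: string (nullable = true)

Instead of passing the option on every read, schema merging can also be enabled globally with sqlContext.setConf("spark.sql.parquet.mergeSchema", "true"). It is off by default because merging the footers of many Parquet files adds overhead to query planning.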