6.2 需求3實現

package com.atguigu.sparkmall.offline

/**
  * Page single-jump conversion rate (頁面單跳轉化率)
  */

import java.sql.{Connection, DriverManager, PreparedStatement}
import java.util.UUID

import com.atguigu.sparkmall.common.model.{CategoryTop10, CategoryTop10SessionTop10, UserVisitAction}
import com.atguigu.sparkmall.common.util.StringUtil
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.{immutable, mutable}


object Req3PageFlowApplication {
  def main(args: Array[String]): Unit = {

    // Fix: app name previously said "Req1CategoryTop10Application" (copy-paste
    // from Req1); it now matches this object.
    val conf = new SparkConf().setMaster("local[*]").setAppName("Req3PageFlowApplication")
    val sc = new SparkContext(conf)

    // Read the raw user-visit log from file.
    val lineDataRDD: RDD[String] = sc.textFile("input/user_visit_action.csv")

    // Parse each CSV line into a UserVisitAction record.
    // NOTE(review): assumes every line has >= 13 comma-separated fields and that
    // fields 3, 6 and 7 are numeric — a malformed line will throw. Confirm the
    // input file is clean before running against other data sets.
    val actionRDD: RDD[UserVisitAction] = lineDataRDD.map { line =>
      val datas: Array[String] = line.split(",")
      UserVisitAction(
        datas(0),
        datas(1),
        datas(2),
        datas(3).toLong,
        datas(4),
        datas(5),
        datas(6).toLong,
        datas(7).toLong,
        datas(8),
        datas(9),
        datas(10),
        datas(11),
        datas(12)
      )
    }

    // Cache the parsed RDD: it is traversed twice (denominator and numerator passes).
    actionRDD.cache()

    // TODO: 2. Denominator — per-page visit counts.
    // 2.1 Only pages 1..7 participate in the conversion-rate statistics.
    val pageids = List(1, 2, 3, 4, 5, 6, 7)

    // The only valid adjacent page-flow keys: "1-2", "2-3", ..., "6-7".
    val zipPageids: List[String] = pageids.zip(pageids.tail).map {
      case (pageid1, pageid2) => pageid1 + "-" + pageid2
    }

    // Keep only actions that hit one of the tracked pages
    // (page_id is a Long, hence the toInt before the List[Int] lookup).
    val filterRDD: RDD[UserVisitAction] = actionRDD.filter { action =>
      pageids.contains(action.page_id.toInt)
    }

    // (pageid, 1) => (pageid, totalVisits), collected to the driver as a lookup map.
    val pageIdToSums: Map[Long, Long] = filterRDD
      .map(action => (action.page_id, 1L))
      .reduceByKey(_ + _)
      .collect()
      .toMap

    /*************************************************************************/

    // TODO: Numerator — adjacent page-jump counts per session.
    // Group all actions by session: (session_id, Iterable[UserVisitAction]).
    val sessionGroupRDD: RDD[(String, Iterable[UserVisitAction])] =
      actionRDD.groupBy(action => action.session_id)

    // Within each session, sort by action time (ascending) and zip adjacent
    // page ids into flow keys: ("pageid1-pageid2", 1).
    // NOTE(review): action_time is compared lexicographically as a String —
    // correct only for zero-padded fixed-width timestamps; confirm the format.
    val sessionToZipRDD: RDD[(String, List[(String, Long)])] = sessionGroupRDD.mapValues { datas =>
      val actions: List[UserVisitAction] = datas.toList.sortWith { (left, right) =>
        left.action_time < right.action_time
      }
      val ids: List[Long] = actions.map(_.page_id)
      val zipList: List[(Long, Long)] = ids.zip(ids.tail)

      zipList.map {
        case (pageid1, pageid2) => (pageid1 + "-" + pageid2, 1L)
      }
    }

    // Flatten per-session lists into individual ("pageid1-pageid2", 1) pairs.
    val zipRDD: RDD[(String, Long)] = sessionToZipRDD.map(_._2).flatMap(list => list)

    // Drop flows that are not one of the tracked adjacent pairs (e.g. "3-5").
    val zipFilterRDD: RDD[(String, Long)] = zipRDD.filter {
      case (pageflow, _) => zipPageids.contains(pageflow)
    }

    // Aggregate: ("pageid1-pageid2", jumpCount).
    val pageFlowReduceRDD: RDD[(String, Long)] = zipFilterRDD.reduceByKey(_ + _)

    // TODO 4. Conversion rate = count(pageid1-pageid2) / count(pageid1).
    // Fix: collect() to the driver before printing — RDD.foreach runs on the
    // executors, so println there would not reach the driver console on a
    // real cluster (it only appeared to work under local[*]).
    pageFlowReduceRDD.collect().foreach {
      case (pageFlow, sum) =>
        val pageid1: String = pageFlow.split("-")(0)
        // getOrElse(_, 1L) guards against a missing denominator: should not
        // happen after filtering, but avoids NoSuchElementException / div-by-zero.
        println(pageFlow + "=" + sum.toDouble / pageIdToSums.getOrElse(pageid1.toLong, 1L))
    }

    // Release resources.
    sc.stop()
  }
}

©著作權歸作者所有,轉載或內容合作請聯系作者
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

推薦閱讀更多精彩內容