What is a secondary sort?
A secondary sort orders the values associated with each key during the reduce phase. For example, given the key 2012-01 with the values [45, 5, 35, 10], a secondary sort delivers them to the reducer as [5, 10, 35, 45].
The solution
There is more than one solution, but whichever you choose, keep in mind why Spark or Hadoop is being used in the first place: the data volume is large, so efficiency matters and the first thing to avoid is an OOM error. Buffering all of a key's values in an in-memory structure such as an array or other collection can exhaust a reducer's memory. The recommended approach is therefore to let the MapReduce framework or Spark do the sorting itself.
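To make the risk concrete, the following is a minimal sketch of the pattern to avoid; it assumes a hypothetical pairs: RDD[(String, Int)] of (year-month, temperature) records. groupByKey buffers every value for a key on a single executor before sorting, so one hot key can exhaust memory:
//Anti-pattern sketch (pairs is a hypothetical RDD[(String, Int)]): all values
//for a key are materialized in memory before the sort, which can OOM on a
//skewed key. The implementation below avoids this by sorting in the shuffle.
val sortedInMemory = pairs
.groupByKey() //buffers the full Iterable[Int] for each key on one executor
.mapValues(_.toList.sorted) //in-memory sort of the buffered values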
The design approach (see the Spark implementation below):
1. Use the value-to-key conversion design pattern: construct an intermediate key (k, v1), where v1 is the secondary key and k is the natural key. To inject a value v1 into the key seen by the reducer, you only need to build a composite key.
2. Let the MapReduce framework do the sorting (rather than sorting in memory).
3. Preserve state across multiple key-value pairs during processing; partitioning the mapper output appropriately makes this possible.
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.scalatest.{BeforeAndAfterAll, FunSpec, Matchers}
class SparkAlgorithms extends FunSpec with Matchers with BeforeAndAfterAll {
/** The concept of secondary sort:
* map(key1, value1) -> list(key2, value2)
* reduce(key2, list(value2)) -> list(key3, value3)
*
*/
describe("Secondary Sort") {
it("Secondary_Sort_1") {
val conf = new SparkConf().setAppName("test-algorithms").setMaster("local[2]")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
object data extends Serializable {
import sqlContext.implicits._
val ymd_tmperature=Seq(
("2012","01","01",5),
("2012","01","02",45),
("2012","01","03",35),
("2012","01","04",10),
("2001","11","01",46),
("2001","11","02",47),
("2001","11","01",48),
("2001","11","02",40),
("2005","08","20",50),
("2005","08","21",52),
("2005","08","22",38),
("2005","08","23",70)
).toDF("year","month","day","temperature")
}
/**
* Custom partitioner for the sort:
* partitions the composite keys passed to the reducer
**/
class SortPartitioner(partitions: Int) extends Partitioner {
require(partitions > 0, s"Number of partitions ($partitions) must be positive.")
def numPartitions: Int = partitions
def getPartition(key: Any): Int = key match {
//hash only the natural key k so that every record for a given natural key
//lands in the same partition, whatever its secondary key
case (k: String, _: Int) => math.abs(k.hashCode % numPartitions)
case null => 0
case _ => math.abs(key.hashCode % numPartitions)
}
override def equals(other: Any): Boolean = other match {
case o: SortPartitioner => o.numPartitions == numPartitions
case _ => false
}
override def hashCode: Int = numPartitions
}
//Ordering for the composite keys: ascending by natural key (year-month), then
//ascending by temperature; picked up implicitly by
//repartitionAndSortWithinPartitions so the framework does the sorting
implicit val tupleOrdering: Ordering[(String, Int)] = new Ordering[(String, Int)] {
override def compare(x: (String, Int), y: (String, Int)): Int = {
val byKey = x._1.compare(y._1)
if (byKey != 0) byKey else x._2.compare(y._2)
}
}
//Map phase: year-month forms the natural key, temperature the secondary key;
//emit a composite key ((year-month, temperature), temperature)
val valuetokey: RDD[((String, Int), Int)] = data.ymd_tmperature.rdd.map(x => {
((x(0) + "-" + x(1), x(3).asInstanceOf[Int]), x(3).asInstanceOf[Int])
})
//Sort: repartition by natural key and let the framework sort each partition
//by the composite key during the shuffle
val sorted = valuetokey.repartitionAndSortWithinPartitions(new SortPartitioner(3))
//Reduce: drop the secondary key and concatenate the values, which arrive
//already sorted, under each natural key
val result = sorted.map {
case (k, v) => (k._1, v.toString)
}.reduceByKey(_ + "," + _)
result.foreach(println)
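//Expected output, assuming the ascending ordering above (line order across
//partitions may vary between runs):
//(2001-11,40,46,47,48)
//(2005-08,38,50,52,70)
//(2012-01,5,10,35,45)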
}
}
}
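Two details make this approach scale. The partitioner hashes only the natural key, so every record for a given year-month lands in the same partition, and repartitionAndSortWithinPartitions performs the sort as part of the shuffle, spilling to disk when needed rather than buffering all of a key's values in memory. The final reduceByKey then merely concatenates values that are already in sorted order.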