美文网首页
排序及topN

排序及topN

作者: ibunny | 来源:发表于2017-03-28 14:27 被阅读653次

    基于排序机制的wordcount

    按照每个单词出现次数的顺序,降序排序

    import java.util.Arrays;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.FlatMapFunction;
    import org.apache.spark.api.java.function.Function2;
    import org.apache.spark.api.java.function.PairFunction;
    import org.apache.spark.api.java.function.VoidFunction;
    
    import scala.Tuple2;
    
    /**
     * Word count whose output is ordered by how often each word occurs,
     * most frequent word first.
     * @author Administrator
     *
     */
    public class SortWordCount {

        public static void main(String[] args) {
            // Configure a local Spark application and open its Java context
            SparkConf sparkConf = new SparkConf();
            sparkConf.setAppName("SortWordCount");
            sparkConf.setMaster("local");
            JavaSparkContext context = new JavaSparkContext(sparkConf);

            // Breaks one line of text into its space-separated tokens
            FlatMapFunction<String, String> splitIntoWords = new FlatMapFunction<String, String>() {

                private static final long serialVersionUID = 1L;

                @Override
                public Iterable<String> call(String line) throws Exception {
                    String[] tokens = line.split(" ");
                    return Arrays.asList(tokens);
                }

            };

            // Turns a word into the pair (word, 1)
            PairFunction<String, String, Integer> pairWithOne =
                    new PairFunction<String, String, Integer>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<String, Integer> call(String word) throws Exception {
                            return new Tuple2<String, Integer>(word, 1);
                        }

                    };

            // Adds up two partial counts of the same word
            Function2<Integer, Integer, Integer> addCounts =
                    new Function2<Integer, Integer, Integer>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Integer call(Integer left, Integer right) throws Exception {
                            return left + right;
                        }

                    };

            // (word, count) -> (count, word), so that sortByKey can order by count
            PairFunction<Tuple2<String, Integer>, Integer, String> countFirst =
                    new PairFunction<Tuple2<String, Integer>, Integer, String>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<Integer, String> call(Tuple2<String, Integer> wordAndCount)
                                throws Exception {
                            return new Tuple2<Integer, String>(wordAndCount._2(), wordAndCount._1());
                        }

                    };

            // (count, word) -> (word, count), restoring the original layout after sorting
            PairFunction<Tuple2<Integer, String>, String, Integer> wordFirst =
                    new PairFunction<Tuple2<Integer, String>, String, Integer>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<String, Integer> call(Tuple2<Integer, String> countAndWord)
                                throws Exception {
                            return new Tuple2<String, Integer>(countAndWord._2(), countAndWord._1());
                        }

                    };

            // Prints one "<word> appears <count> times." line
            VoidFunction<Tuple2<String, Integer>> printEntry =
                    new VoidFunction<Tuple2<String, Integer>>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public void call(Tuple2<String, Integer> entry) throws Exception {
                            System.out.println(entry._1() + " appears " + entry._2() + " times.");
                        }

                    };

            // Plain word count: e.g. (hello, 3) (you, 2)
            JavaRDD<String> words = context
                    .textFile("C://Users//Administrator//Desktop//spark.txt")
                    .flatMap(splitIntoWords);
            JavaPairRDD<String, Integer> wordCounts = words
                    .mapToPair(pairWithOne)
                    .reduceByKey(addCounts);

            // Swap to (3, hello) (2, you), sort by the count in descending order,
            // then swap back to (word, count)
            JavaPairRDD<String, Integer> sortedWordCounts = wordCounts
                    .mapToPair(countFirst)
                    .sortByKey(false)
                    .mapToPair(wordFirst);

            sortedWordCounts.foreach(printEntry);

            // Shut the context down
            context.close();
        }

    }
    
    
    import org.apache.spark.SparkConf
    import org.apache.spark.SparkContext
    
    /**
     * Word count that prints every word together with its number of
     * occurrences, most frequent word first.
     *
     * @author Administrator
     */
    object SortWordCount {

      /**
       * Reads the input text file, counts its space-separated words and prints
       * one line per word in descending order of count.
       *
       * @param args command-line arguments (not used)
       */
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf()
            .setAppName("SortWordCount")
            .setMaster("local")
        val sc = new SparkContext(conf)

        // Stop the context even when a job fails, mirroring sc.close() in the Java version.
        try {
          val lines = sc.textFile("C://Users//Administrator//Desktop//spark.txt", 1)
          val wordCounts = lines
            .flatMap(line => line.split(" "))
            .map(word => (word, 1))
            .reduceByKey(_ + _)

          // sortByKey orders by key only: make the count the key, sort in
          // descending order, then restore the (word, count) layout.
          val sortedWordCounts = wordCounts
            .map { case (word, count) => (count, word) }
            .sortByKey(ascending = false)
            .map { case (count, word) => (word, count) }

          sortedWordCounts.foreach { case (word, count) =>
            println(s"$word appear $count times.")
          }
        } finally {
          sc.stop()
        }
      }

    }
    

    二次排序

    按照文件的第一列排序,如果第一列相同,则按照第二列排序。

    /*******SecondarySortKey.java*******/
    
    import java.io.Serializable;
    
    import scala.math.Ordered;
    
    /**
     * Custom key for secondary sort: keys are ordered by the first column and,
     * when the first columns are equal, by the second column.
     * @author Administrator
     *
     */
    public class SecondarySortKey implements Ordered<SecondarySortKey>, Serializable {

        private static final long serialVersionUID = -2366006422945129991L;

        // The columns that take part in the ordering
        private int first;
        private int second;

        public SecondarySortKey(int first, int second) {
            this.first = first;
            this.second = second;
        }

        /** @return true when this key sorts strictly after {@code other} */
        @Override
        public boolean $greater(SecondarySortKey other) {
            return compare(other) > 0;
        }

        /** @return true when this key sorts after {@code other} or both columns are equal */
        @Override
        public boolean $greater$eq(SecondarySortKey other) {
            return compare(other) >= 0;
        }

        /** @return true when this key sorts strictly before {@code other} */
        @Override
        public boolean $less(SecondarySortKey other) {
            return compare(other) < 0;
        }

        /** @return true when this key sorts before {@code other} or both columns are equal */
        @Override
        public boolean $less$eq(SecondarySortKey other) {
            return compare(other) <= 0;
        }

        /**
         * Compares by the first column, falling back to the second column on a tie.
         *
         * @param other the key to compare against
         * @return a negative value, zero or a positive value when this key is
         *         less than, equal to or greater than {@code other}
         */
        @Override
        public int compare(SecondarySortKey other) {
            int byFirst = compareInts(this.first, other.getFirst());
            if (byFirst != 0) {
                return byFirst;
            }
            return compareInts(this.second, other.getSecond());
        }

        /** Same ordering as {@link #compare(SecondarySortKey)}. */
        @Override
        public int compareTo(SecondarySortKey other) {
            return compare(other);
        }

        // Explicit comparison instead of "left - right": the subtraction overflows
        // for distant values (e.g. Integer.MIN_VALUE - 1) and then has the wrong sign.
        private static int compareInts(int left, int right) {
            if (left < right) {
                return -1;
            }
            if (left > right) {
                return 1;
            }
            return 0;
        }

        // Getters and setters for the sorted columns, plus hashCode and equals
        public int getFirst() {
            return first;
        }

        public void setFirst(int first) {
            this.first = first;
        }

        public int getSecond() {
            return second;
        }

        public void setSecond(int second) {
            this.second = second;
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + first;
            result = prime * result + second;
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj)
                return true;
            if (obj == null)
                return false;
            if (getClass() != obj.getClass())
                return false;
            SecondarySortKey other = (SecondarySortKey) obj;
            if (first != other.first)
                return false;
            if (second != other.second)
                return false;
            return true;
        }

    }
    
    /**********SecondarySort.java***********/
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.api.java.function.PairFunction;
    import org.apache.spark.api.java.function.VoidFunction;
    
    import scala.Tuple2;
    
    /**
     * Secondary sort
     * 1. Implement a custom key that implements the Ordered and Serializable
     *    interfaces and holds the ordering logic for the columns
     * 2. Map the RDD of text lines to a JavaPairRDD of (custom key, line)
     * 3. Sort by the custom key with the sortByKey operator
     * 4. Map again to drop the custom key and keep only the text line
     * @author Administrator
     *
     */
    public class SecondarySort {

        public static void main(String[] args) {
            SparkConf sparkConf = new SparkConf();
            sparkConf.setAppName("SecondarySort");
            sparkConf.setMaster("local");
            JavaSparkContext context = new JavaSparkContext(sparkConf);

            // Builds (key, line), the key holding the first two space-separated columns
            PairFunction<String, SecondarySortKey, String> keyByColumns =
                    new PairFunction<String, SecondarySortKey, String>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<SecondarySortKey, String> call(String line) throws Exception {
                            String[] columns = line.split(" ");
                            Integer firstColumn = Integer.valueOf(columns[0]);
                            Integer secondColumn = Integer.valueOf(columns[1]);
                            return new Tuple2<SecondarySortKey, String>(
                                    new SecondarySortKey(firstColumn, secondColumn), line);
                        }

                    };

            // Keeps the text line and drops the key
            Function<Tuple2<SecondarySortKey, String>, String> lineOnly =
                    new Function<Tuple2<SecondarySortKey, String>, String>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public String call(Tuple2<SecondarySortKey, String> keyAndLine) throws Exception {
                            return keyAndLine._2();
                        }

                    };

            // Prints one line; sample output from the original article:
            // 1 3
            // 1 5
            // 2 1
            // 2 4
            VoidFunction<String> printLine = new VoidFunction<String>() {

                private static final long serialVersionUID = 1L;

                @Override
                public void call(String line) throws Exception {
                    System.out.println(line);
                }

            };

            JavaRDD<String> sortedLines = context
                    .textFile("C://Users//Administrator//Desktop//sort.txt")
                    .mapToPair(keyByColumns)
                    .sortByKey()
                    .map(lineOnly);

            sortedLines.foreach(printLine);

            context.close();
        }

    }
    
    
    /***********SecondSortKey.scala***********/
    
    /**
     * Key for secondary sort: ordered by `first`, ties broken by `second`.
     *
     * @param first  value of the first sorted column
     * @param second value of the second sorted column
     */
    class SecondSortKey(val first: Int, val second: Int)
        extends Ordered[SecondSortKey] with Serializable {

      /**
       * Compares two keys column by column.
       *
       * @param that the key to compare against
       * @return a negative value, zero or a positive value when this key is
       *         less than, equal to or greater than `that`
       */
      def compare(that: SecondSortKey): Int = {
        // Ordering.Int.compare instead of subtraction: `a - b` overflows for
        // distant values (e.g. Int.MinValue - 1) and then has the wrong sign.
        val byFirst = Ordering.Int.compare(this.first, that.first)
        if (byFirst != 0) byFirst
        else Ordering.Int.compare(this.second, that.second)
      }
    }
    
    /***********SecondSort.scala***********/
    
    import org.apache.spark.SparkConf
    import org.apache.spark.SparkContext
    
    /**
     * Secondary sort: prints the lines of the input file ordered by their first
     * column and, for equal first columns, by their second column.
     */
    object SecondSort {

      /**
       * Keys every line by a SecondSortKey built from its first two
       * space-separated integer columns, sorts by that key and prints the lines.
       *
       * @param args command-line arguments (not used)
       */
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf()
            .setAppName("SecondSort")
            .setMaster("local")
        val sc = new SparkContext(conf)

        // Stop the context even when a job fails.
        try {
          val lines = sc.textFile("C://Users//Administrator//Desktop//sort.txt", 1)
          val pairs = lines.map { line =>
            // Split once and reuse the columns for both key fields.
            val columns = line.split(" ")
            (new SecondSortKey(columns(0).toInt, columns(1).toInt), line)
          }
          val sortedPairs = pairs.sortByKey()
          // Drop the key, keep only the original text line.
          val sortedLines = sortedPairs.map { case (_, line) => line }

          sortedLines.foreach { sortedLine => println(sortedLine) }
        } finally {
          sc.stop()
        }
      }

    }
    

    topN

    对文件内的数字,取最大的前3个
    3
    5
    6
    7
    1
    4
    5

    import java.util.List;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.api.java.function.PairFunction;
    
    import scala.Tuple2;
    
    /**
     * Prints the three largest numbers of the input file
     * @author Administrator
     *
     */
    public class Top3 {

        public static void main(String[] args) {
            SparkConf sparkConf = new SparkConf();
            sparkConf.setAppName("Top3");
            sparkConf.setMaster("local");
            JavaSparkContext context = new JavaSparkContext(sparkConf);

            // Pairs every line with its numeric value: line -> (number, line)
            PairFunction<String, Integer, String> keyByNumber =
                    new PairFunction<String, Integer, String>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<Integer, String> call(String line) throws Exception {
                            Integer number = Integer.valueOf(line);
                            return new Tuple2<Integer, String>(number, line);
                        }

                    };

            // Keeps only the numeric key of a (number, line) pair
            Function<Tuple2<Integer, String>, Integer> numberOnly =
                    new Function<Tuple2<Integer, String>, Integer>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Integer call(Tuple2<Integer, String> numberAndLine) throws Exception {
                            return numberAndLine._1();
                        }

                    };

            // Sort in descending order and take the first three numbers
            List<Integer> largestThree = context
                    .textFile("C://Users//Administrator//Desktop//top.txt")
                    .mapToPair(keyByNumber)
                    .sortByKey(false)
                    .map(numberOnly)
                    .take(3);

            for (Integer number : largestThree) {
                System.out.println(number);
            }

            context.close();
        }

    }
    
    
    import org.apache.spark.SparkConf
    import org.apache.spark.SparkContext
    
    /** Prints the three largest numbers found in the input file. */
    object Top3 {

      /**
       * Parses every line of the input file as an integer, sorts the numbers in
       * descending order and prints the first three.
       *
       * @param args command-line arguments (not used)
       */
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf()
            .setAppName("Top3")
            .setMaster("local")
        val sc = new SparkContext(conf)

        // Stop the context even when a job fails.
        try {
          val lines = sc.textFile("C://Users//Administrator//Desktop//top.txt", 1)
          val top3Number = lines
            .map(line => (line.toInt, line))
            .sortByKey(ascending = false)
            .map { case (number, _) => number }
            .take(3)

          top3Number.foreach(num => println(num))
        } finally {
          sc.stop()
        }
      }

    }
    

    对每个班级内的学生成绩,取出前3名
    class1 90
    class2 56
    class1 87
    class1 76
    class2 88
    class1 95
    class1 74
    class2 87
    class2 67
    class2 77

    
    import java.util.Arrays;
    import java.util.Iterator;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.PairFunction;
    import org.apache.spark.api.java.function.VoidFunction;
    
    import scala.Tuple2;
    
    /**
     * Top 3 per group: prints the three highest scores of every class
     * @author Administrator
     *
     */
    public class GroupTop3 {

        /**
         * Collects the highest scores in descending order.
         *
         * @param scores the scores of one class
         * @param limit  maximum number of scores to keep
         * @return at most {@code limit} scores, highest first; shorter when fewer
         *         scores were supplied, so the result never contains null padding
         */
        private static Integer[] highestScores(Iterable<Integer> scores, int limit) {
            Integer[] top = new Integer[limit];
            int filled = 0;

            for (Integer score : scores) {
                // First slot that is empty or holds a strictly smaller score
                int pos = 0;
                while (pos < filled && score <= top[pos]) {
                    pos++;
                }
                if (pos == limit) {
                    continue;  // not among the highest scores seen so far
                }

                // Shift the lower entries down by one; the last one falls off when full
                int last = Math.min(filled, limit - 1);
                for (int j = last; j > pos; j--) {
                    top[j] = top[j - 1];
                }
                top[pos] = score;

                if (filled < limit) {
                    filled++;
                }
            }

            // Trim unused slots so callers never see null entries
            return Arrays.copyOf(top, filled);
        }

        public static void main(String[] args) {
            SparkConf conf = new SparkConf()
                    .setAppName("Top3")
                    .setMaster("local");
            JavaSparkContext sc = new JavaSparkContext(conf);

            JavaRDD<String> lines = sc.textFile("C://Users//Administrator//Desktop//score.txt");

            // "class1 90" -> (class1, 90)
            JavaPairRDD<String, Integer> pairs = lines.mapToPair(

                    new PairFunction<String, String, Integer>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<String, Integer> call(String line) throws Exception {
                            String[] lineSplited = line.split(" ");
                            return new Tuple2<String, Integer>(lineSplited[0],
                                    Integer.valueOf(lineSplited[1]));
                        }

                    });

            // Gather all scores of the same class
            JavaPairRDD<String, Iterable<Integer>> groupedPairs = pairs.groupByKey();

            // Reduce every class to its (at most) three highest scores
            JavaPairRDD<String, Iterable<Integer>> top3Score = groupedPairs.mapToPair(

                    new PairFunction<Tuple2<String,Iterable<Integer>>, String, Iterable<Integer>>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<String, Iterable<Integer>> call(
                                Tuple2<String, Iterable<Integer>> classScores)
                                throws Exception {
                            String className = classScores._1;
                            Integer[] top3 = highestScores(classScores._2, 3);

                            return new Tuple2<String,
                                    Iterable<Integer>>(className, Arrays.asList(top3));
                        }

                    });

            // Print the class name followed by its top scores, one per line
            top3Score.foreach(new VoidFunction<Tuple2<String,Iterable<Integer>>>() {

                private static final long serialVersionUID = 1L;

                @Override
                public void call(Tuple2<String, Iterable<Integer>> t) throws Exception {
                    System.out.println("class: " + t._1);
                    Iterator<Integer> scoreIterator = t._2.iterator();
                    while(scoreIterator.hasNext()) {
                        Integer score = scoreIterator.next();
                        System.out.println(score);
                    }
                    System.out.println("=======================================");
                }

            });

            sc.close();
        }

    }
    
    

    相关文章

      网友评论

          本文标题:排序及topN

          本文链接:https://www.haomeiwen.com/subject/wsejottx.html