spark shell

Author: zhangxu0636 | Published 2016-08-02 15:29

    1. Start the shell

    1. cd <your Spark directory>/bin
    2. ./spark-shell
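
By default the shell runs against a local master. A minimal sketch of passing an explicit master URL instead; the local[*] value is an assumption, substitute your own cluster URL as needed:

    # assumption: run locally with as many worker threads as there are cores
    ./spark-shell --master local[*]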
    

    2. Load the data to analyze

    // load a file into an RDD
    val textFile = sc.textFile("file:///xuzhang/home/Cloud/spark/README.md")
    // count the number of lines in the RDD
    textFile.count()
    // fetch the first line
    textFile.first()
    // filter the lines; in Spark this is an RDD transformation
    val lineWithSpark = textFile.filter(line => line.contains("Spark"))
    // equivalently, with placeholder syntax
    val lineWithSpark = textFile.filter(_.contains("Spark"))
    // map/reduce: find the word count of the longest line
    textFile.map(_.split(" ").size).reduce((a, b) => if (a > b) a else b)
    // the same, delegating the comparison to a Java method
    import java.lang.Math
    textFile.map(_.split(" ").size).reduce((a, b) => Math.max(a, b))
    // word count in Spark
    textFile.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey((a, b) => a + b)
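
These pieces compose. A minimal follow-up sketch, reusing the textFile and lineWithSpark values defined above, that caches the filtered RDD and collects the word counts to the driver:

    // cache the filtered lines so repeated actions reuse the in-memory copy
    lineWithSpark.cache()
    // the first action after cache() materializes it
    lineWithSpark.count()
    // build the (word, count) pairs and bring them back to the driver
    val wordCounts = textFile.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
    wordCounts.collect()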
    

    Standalone program: Scala version

    /* SimpleApp.scala */
    import org.apache.spark.SparkContext
    import org.apache.spark.SparkContext._
    import org.apache.spark.SparkConf

    object SimpleApp {
      def main(args: Array[String]) {
        val logFile = "YOUR_SPARK_HOME/README.md" // should be some file on your system
        val conf = new SparkConf().setAppName("Simple Application")
        val sc = new SparkContext(conf)
        val logData = sc.textFile(logFile, 2).cache()
        val numAs = logData.filter(line => line.contains("a")).count()
        val numBs = logData.filter(line => line.contains("b")).count()
        println("Lines with a: %s, Lines with b: %s".format(numAs, numBs))
      }
    }
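
To actually run SimpleApp, the usual route is to package it with sbt and launch it with spark-submit. A minimal sketch of a build file; the Scala and Spark versions below are assumptions, so match them to your installation:

    // simple.sbt -- the versions here are assumptions, align them with your install
    name := "Simple Application"
    version := "1.0"
    scalaVersion := "2.10.4"
    libraryDependencies += "org.apache.spark" %% "spark-core" % "1.6.0"

Then package and submit; the jar path depends on the versions chosen above:

    # sketch: build the jar, then submit it to a local master
    sbt package
    ./bin/spark-submit --class "SimpleApp" --master local[4] target/scala-2.10/simple-application_2.10-1.0.jar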
    

    Standalone program: Python version

    """SimpleApp.py"""
    from pyspark import SparkContext
    logFile = "YOUR_SPARK_HOME/README.md" # Should be some file on your system
    sc = SparkContext("local", "Simple App")
    logData = sc.textFile(logFile).cache()
    numAs = logData.filter(lambda s: 'a' in s).count()
    numBs = logData.filter(lambda s: 'b' in s).count()
    print("Lines with a: %i, lines with b: %i" % (numAs, numBs))
    
