
Common Spark Operations

Author: 行走于无形之中 | Published 2019-03-07 18:05
    # Spark 1.x-style setup; SQLContext and Row live in pyspark.sql, not the top-level pyspark package
    from pyspark import SparkConf, SparkContext
    from pyspark.sql import SQLContext, Row

    conf = SparkConf()
    sc = SparkContext(conf=conf)
    # Only keep log messages at ERROR level or above
    sc.setLogLevel("ERROR")
    sqlContext = SQLContext(sc)
    
    def run(outfile):
        # Read the tab-separated source file and split each line into fields
        origin_data = sc.textFile("filepath").map(lambda x: x.split("\t"))
        first = origin_data.first()
        # Filter out the header row
        whole = origin_data.filter(lambda x: x != first)
    
        # Cast fields to int, keep rows where the fourth field equals 3, the third field
        # is at most 100, and user_id is non-zero, then build Rows of (user_id, num)
        course_order = (whole
            .map(lambda x: (int(x[0]), int(x[1]), int(x[2]), int(x[3])))
            .filter(lambda x: x[3] == 3)
            .filter(lambda x: x[2] <= 100)
            .filter(lambda x: x[0] != 0)
            .map(lambda x: Row(user_id=x[0], num=x[1])))
    
        # Group by user_id, sum num, and rename the aggregated column "sum(num)" back to "num"
        out = (sqlContext.createDataFrame(course_order)
            .groupBy("user_id")
            .agg({"num": "sum"})
            .withColumnRenamed("sum(num)", "num"))
    
        # Write the result as a single CSV file
        out.repartition(1).write.format("csv").option("header", "false").mode("append").save(outfile)
        # Write one SQL statement per row; `sq` is an assumed example template here
        # (three placeholders: table shard by user_id % 10, num, user_id)
        sq = "UPDATE course_order_%d SET num = %d WHERE user_id = %d;"
        out.rdd.map(lambda x: sq % (x['user_id'] % 10, x['num'], x['user_id'])) \
            .repartition(1).saveAsTextFile('sql.csv')
    
    if __name__ == '__main__':
        run("out")
    
