案例1

作者: 7125messi | 来源:发表于2018-01-12 09:43 被阅读17次
#引入各种库
import os
import sys
# Point Spark at the local installation.
SPARK_HOME = "/opt/spark-2.1.1-bin-hadoop2.7/"
os.environ['SPARK_HOME'] = SPARK_HOME

# Make the bundled pyspark package importable.  The importable sources live
# under $SPARK_HOME/python (plus the py4j zip shipped in python/lib);
# $SPARK_HOME/bin/pyspark is only the launcher shell script, so putting it on
# sys.path has no effect.
import glob
sys.path.append(os.path.join(SPARK_HOME, "python"))
# The py4j version differs between Spark releases, so match it by pattern.
sys.path.extend(glob.glob(os.path.join(SPARK_HOME, "python", "lib", "py4j-*-src.zip")))



#初始化spark/sc
from pyspark.sql import SparkSession
from pyspark.sql import Row
# from pyspark.sql.types import *
# Create (or reuse) the SparkSession for this application, and keep a handle
# on its SparkContext for the RDD API used further down.
spark = (
    SparkSession.builder
    .appName("create")
    .getOrCreate()
)
sc = spark.sparkContext


import numpy as np
import pandas as pd

from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import array, col, count, mean, sum, udf, when
from pyspark.sql.types import DoubleType, IntegerType, StringType, Row
from pyspark.sql.functions import sum, col, udf


# Load the season totals CSV: the first line is treated as the header and
# column types are inferred from the data.
df = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('/home/ydzhao/Book/spark-nba-analytics-master/data/season_totals.csv')
)
df.show(6)
df.printSchema()
# describe() only returns a DataFrame of summary statistics; it has to be
# shown explicitly, otherwise nothing is displayed when run as a script.
df.describe().show()

# Rows ordered by 'pts', highest first.
by_pts = df.orderBy('pts', ascending=False)
by_pts.show(5)

# Build the top-10 frame once and reuse it instead of re-deriving it per view.
top10 = by_pts.limit(10)
top10.show()

# Convert to pandas a single time and print the results; a bare expression
# statement is silently discarded outside an interactive session.
top10_pd = top10.toPandas()
print(top10_pd)
print(top10_pd[['yr', 'player', 'age', 'pts']])

# Same column projection done on the Spark side.
top10.select('yr', 'player', 'age', 'pts').show()


# groupBy: per-'yr' sums of mp, fga and fg3a, turned into attempts per 36 units
# of 'mp' (presumably minutes played -- confirm against the dataset).
season_sums = df.groupBy('yr').agg({'mp': 'sum', 'fg3a': 'sum', 'fga': 'sum'})

# The dict form of agg() names its output columns 'sum(<column>)'.
fga_rate = (36 * col('sum(fga)') / col('sum(mp)')).alias('fga_pm')
fg3a_rate = (36 * col('sum(fg3a)') / col('sum(mp)')).alias('fg3a_pm')

fga_py = (
    season_sums
    .select(col('yr'), fga_rate, fg3a_rate)
    .orderBy('yr', ascending=False)
)
fga_py.show(6)


# sql: compute the fg3a rate per 'yr' through Spark SQL on a temp view named 'df'.
# This rebinds fga_py to a two-column frame (yr, fg3a_pm).
df.createOrReplaceTempView('df')
fg3a_query = "SELECT yr,sum(fg3a)/sum(mp)*36 as fg3a_pm FROM df GROUP BY yr ORDER BY yr desc"
fga_py = spark.sql(fg3a_query)
fga_py.show(6)


##############################################################   train the model
# Pack the 'yr' column into the single-element 'features' vector that the
# regression estimator reads.
t = VectorAssembler(inputCols=['yr'], outputCol='features')

# transform() keeps the input columns, so 'yr' is already present; only the
# 'label' column (a copy of fg3a_pm) has to be added.
training = t.transform(fga_py).withColumn('label', col('fg3a_pm'))
training.show(10)
# Print the pandas view; a bare expression would be discarded in a script.
print(training.toPandas())

# Fit label ~ features with at most 10 iterations.
lr = LinearRegression(maxIter=10)
model = lr.fit(training)
# Report the fitted line instead of evaluating `model` as a no-op expression.
print("coefficients: %s, intercept: %s" % (model.coefficients, model.intercept))

############## 1. collect the training years and append the years to predict
# Each collected Row has a single field, so index 0 is the column value.
training_yrs = [record[0] for record in training.select('yr').collect()]
training_y = [record[0] for record in training.select('fg3a_pm').collect()]

testing_yrs = list(range(2017, 2022))  # 2017 through 2021 inclusive
all_yrs = training_yrs + testing_yrs

############## 2. build the testing DataFrame
# Row('yr') is a row factory: calling it with a value yields Row(yr=value).
row = Row('yr')
test_rdd = sc.parallelize(all_yrs)
testing_input = test_rdd.map(row).toDF()
testing = t.transform(testing_input)
testing.show()

############## 3. apply the fitted linear regression model
df_results = model.transform(testing)
df_results.show()



# Stopping the session also stops its underlying SparkContext (`sc`), so a
# separate sc.stop() call is not needed.
spark.stop()

相关文章

  • 18.fetch使用案例

    案例1 案例2 案例3 案例4 案例5 案例6

  • 1、案例1

    画面较大 自我感觉良好,自信,也有一定的适应不良。 偏画纸下方,较为现实。 圆弧线条及部分短促线条 性格较为随和,...

  • 案例1

  • 案例1

  • 案例1

    客户获取来源:朋友圈广告引流,主动联系我 场景描述:当天发朋友圈客户刚好看到了,有这样的需求,他微信上联系我,告诉...

  • 案例1⃣️

    一、我们正在玩超市收银角色扮演,女儿把真钱放入收银机,一个小朋友一把把我们的钱拿过来,说钱!!!我笑眯眯的对他说,...

  • 案例1

  • 案例1

    1.你如何理解“双减”的意义?请结合个人工作实际,谈谈如何推进“双减”工作落地落实。 双减就是要有效减轻义务教育阶...

  • python中*args函数的使用

    案例1: 案例2:

  • SAP 学海拾遗-销售价格主数据创建(有效期重叠)

    1. 测试案例1 2.测试案例2 3.测试案例3 4.测试案例4

网友评论

    本文标题:案例1

    本文链接:https://www.haomeiwen.com/subject/gebxoxtx.html