HDFS cluster: three nodes with 8 GB, 4 GB, and 4 GB of memory.
Hive is configured to submit queries through the Spark execution engine (Hive on Spark).
Latest update: the query also executes successfully through Spark SQL. Spark SQL and Hive on Spark both use Spark as the compute engine, but the two differ in how they parse SQL.
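For reference, the execution engine is a per-session Hive setting (the MR fallback at the end of this post flips the same switch); a minimal sketch:
-- choose the engine for the current session: spark here, mr for the later fallback test
set hive.execution.engine=spark;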
Goal: write SQL that returns the total count and the average age for all users and for active users. (An active user is one with access records on two consecutive days.) The source columns are: date, user, age.
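Both solutions below collapse to a single result row, so the four output columns can be read off directly; roughly:
-- sum(allCount)   : number of distinct users
-- sum(allAvg)     : their average age
-- sum(twiceCount) : number of active users
-- sum(twiceAvg)   : average age of the active users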
Solution 1 (a worked illustration of the date_sub(dt, rn) trick follows the query):
select
sum(allCount) ,sum(allAvg) ,sum(twiceCount) ,sum(twiceAvg)
from
(select
count(*) as allCount,
-- AVG(t.age) as avgAge ,
sum(t.age) / count(*) as allAvg ,
0 as twiceCount ,
0 as twiceAvg
from
(select
user_id , age
from user_age group by user_id , age ) t
union all
select
0 as allCount ,
0 as allAvg ,
count(*) as twiceCount ,
sum(t4.age)/count(*) as twiceAvg
from
(select
t3.user_id ,
min(t3.age) as age
from
( select
t2.user_id ,
t2.age
from
( SELECT
t1.user_id ,
t1.age ,
date_sub(t1.dt , t1.rn) as diff
from
(select
t.user_id ,
t.age ,
t.dt ,
ROW_NUMBER () over(PARTITION by t.user_id order by t.dt ) rn
from
(select
user_id ,
age ,
dt
from user_age group by user_id , age , dt ) t ) t1 ) t2
group by t2.user_id , t2.age , t2.diff
HAVING count(t2.user_id) >= 2 ) t3
group by t3.user_id ) t4
group by t4.user_id ) t5
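Both solutions rely on the same trick: within one user, ROW_NUMBER() increases by exactly 1 per visit day, so date_sub(dt, rn) yields the same diff value for every day of a consecutive run. A worked illustration with user test_2 from the sample data further down:
-- dt          rn   diff = date_sub(dt, rn)
-- 2019-02-11   1   2019-02-10   (run of 2 consecutive days)
-- 2019-02-12   2   2019-02-10
-- 2019-02-15   3   2019-02-12   (run of 2 consecutive days)
-- 2019-02-16   4   2019-02-12
Any (user_id, diff) group with at least two rows therefore marks a run of two or more consecutive days, which is what the HAVING count(t2.user_id) >= 2 filter in solution 1 (and the days >= 2 filter in solution 2 below) checks.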
Solution 2 (identical to solution 1 except that the consecutive-day runs are found with COUNT() OVER (PARTITION BY user_id, diff) instead of GROUP BY ... HAVING):
select
sum(allCount) ,
sum(allAvg) ,
sum(twiceCount) ,
sum(twiceAvg)
from
(select
count(*) as allCount,
-- AVG(t.age) as avgAge ,
sum(t.age) / count(*) as allAvg ,
0 as twiceCount ,
0 as twiceAvg
from
(select
user_id , age
from user_age group by user_id , age ) t
union all
select
0 as allCount ,
0 as allAvg ,
count(*) as twiceCount ,
sum(t4.age)/count(*) as twiceAvg
from
(
select
t3.user_id ,
min(t3.age) as age
from
(
select
t2.user_id ,
t2.age ,
COUNT(t2.user_id) over(PARTITION BY t2.user_id,t2.diff) days
from
( SELECT
t1.user_id ,
t1.age ,
date_sub(t1.dt , t1.rn) as diff
from
(select
t.user_id ,
t.age ,
t.dt ,
ROW_NUMBER () over(PARTITION by t.user_id order by t.dt ) rn
from
(select
user_id ,
age ,
dt
from user_age group by user_id , age , dt ) t ) t1 ) t2
) t3
where t3.days >= 2
group by t3.user_id ) t4
group by t4.user_id ) t5 ;
Solution 2 fails with an error when executed on the Spark engine.
Source data
create table user_age(dt string, user_id string, age int) row format delimited fields terminated by ',';
2019-02-11,test_1,23
2019-02-11,test_2,19
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-12,test_2,19
2019-02-13,test_1,23
2019-02-15,test_2,19
2019-02-16,test_2,19
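Assuming the rows above are saved to a local CSV file (the path below is only an example), they can be loaded with:
-- hypothetical path; adjust to wherever the sample file actually lives
load data local inpath '/tmp/user_age.csv' into table user_age;
With this sample the expected answer is: 3 distinct users with average age (23 + 19 + 39) / 3 = 27, and 1 active user (test_2, who visited on 2019-02-11/12 and again on 2019-02-15/16) with average age 19. Note that because there is exactly one active user here, the trailing group by t4.user_id in the second union branch does not distort the result; with several active users, sum(twiceAvg) would add up per-user ages instead of averaging them.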
- Using the window function raises an exception:
select
t2.user_id
,t2.age
,COUNT(t2.user_id) over(PARTITION BY t2.user_id,t2.diff) as days
from
( SELECT
t1.user_id ,
t1.age ,
date_sub(t1.dt , t1.rn) as diff
from
(select
t.user_id ,
t.age ,
t.dt ,
ROW_NUMBER () over(PARTITION by t.user_id order by t.dt ) rn
from
(select
user_id ,
age ,
dt
from user_age group by user_id , age ,dt ) t) t1 ) t2
Exception shown in DBeaver:
org.jkiss.dbeaver.model.sql.DBSQLException: SQL 错误 [3] [42000]: Error while processing statement: FAILED: Execution Error, return code 3 from org.apache.hadoop.hive.ql.exec.spark.SparkTask. Spark job failed during runtime. Please check stacktrace for the root cause.
at org.jkiss.dbeaver.model.impl.jdbc.exec.JDBCStatementImpl.executeStatement(JDBCStatementImpl.java:134)
at org.jkiss.dbeaver.ui.editors.sql.execute.SQLQueryJob.executeStatement(SQLQueryJob.java:487)
at org.jkiss.dbeaver.ui.editors.sql.execute.SQLQueryJob.lambda$0(SQLQueryJob.java:424)
at org.jkiss.dbeaver.model.exec.DBExecUtils.tryExecuteRecover(DBExecUtils.java:164)
at org.jkiss.dbeaver.ui.editors.sql.execute.SQLQueryJob.executeSingleQuery(SQLQueryJob.java:416)
at org.jkiss.dbeaver.ui.editors.sql.execute.SQLQueryJob.extractData(SQLQueryJob.java:774)
at org.jkiss.dbeaver.ui.editors.sql.SQLEditor$QueryResultsContainer.readData(SQLEditor.java:2914)
at org.jkiss.dbeaver.ui.controls.resultset.ResultSetJobDataRead.lambda$0(ResultSetJobDataRead.java:110)
at org.jkiss.dbeaver.model.exec.DBExecUtils.tryExecuteRecover(DBExecUtils.java:164)
at org.jkiss.dbeaver.ui.controls.resultset.ResultSetJobDataRead.run(ResultSetJobDataRead.java:108)
at org.jkiss.dbeaver.ui.controls.resultset.ResultSetViewer$17.run(ResultSetViewer.java:3421)
at org.jkiss.dbeaver.model.runtime.AbstractJob.run(AbstractJob.java:103)
at org.eclipse.core.internal.jobs.Worker.run(Worker.java:63)
Caused by: java.sql.SQLException: Error while processing statement: FAILED: Execution Error, return code 3 from org.apache.hadoop.hive.ql.exec.spark.SparkTask. Spark job failed during runtime. Please check stacktrace for the root cause.
at org.apache.hive.jdbc.HiveStatement.waitForOperationToComplete(HiveStatement.java:354)
at org.apache.hive.jdbc.HiveStatement.execute(HiveStatement.java:245)
at org.jkiss.dbeaver.model.impl.jdbc.exec.JDBCStatementImpl.execute(JDBCStatementImpl.java:338)
at org.jkiss.dbeaver.model.impl.jdbc.exec.JDBCStatementImpl.executeStatement(JDBCStatementImpl.java:131)
... 12 more
YARN monitoring: job execution result (screenshot omitted)
Failure Reason
Job aborted due to stage failure: Task 0 in stage 7.0 failed 4 times, most recent failure: Lost task 0.3 in stage 7.0 (TID 18, hadoop102, executor 2): java.lang.RuntimeException: java.lang.NullPointerException
at org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processVectorRow(SparkReduceRecordHandler.java:543)
at org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processRow(SparkReduceRecordHandler.java:300)
at org.apache.hadoop.hive.ql.exec.spark.HiveReduceFunctionResultList.processNextRecord(HiveReduceFunctionResultList.java:56)
at org.apache.hadoop.hive.ql.exec.spark.HiveReduceFunctionResultList.processNextRecord(HiveReduceFunctionResultList.java:28)
at org.apache.hadoop.hive.ql.exec.spark.HiveBaseFunctionResultList.hasNext(HiveBaseFunctionResultList.java:85)
at scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:42)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1$$anonfun$apply$12.apply(AsyncRDDActions.scala:127)
at org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1$$anonfun$apply$12.apply(AsyncRDDActions.scala:127)
at org.apache.spark.SparkContext$$anonfun$37.apply(SparkContext.scala:2212)
at org.apache.spark.SparkContext$$anonfun$37.apply(SparkContext.scala:2212)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException
at org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFDateAddColCol.evaluate(VectorUDFDateAddColCol.java:96)
at org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFOperator.process(VectorPTFOperator.java:366)
at org.apache.hadoop.hive.ql.exec.Operator.vectorForward(Operator.java:966)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:939)
at org.apache.hadoop.hive.ql.exec.vector.VectorSelectOperator.process(VectorSelectOperator.java:158)
at org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.forwardBatch(SparkReduceRecordHandler.java:549)
at org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processVectorRow(SparkReduceRecordHandler.java:453)
... 19 more
Driver stacktrace:
Failed stage details (screenshots omitted)
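The NullPointerException is thrown in VectorUDFDateAddColCol inside Hive's vectorized PTF/reduce path, so besides the two workarounds below, an untested option is to turn vectorized execution off for the session:
-- untested sketch: fall back to row-mode operators so the vectorized date_sub/PTF code is not used
set hive.vectorized.execution.enabled=false;
set hive.vectorized.execution.reduce.enabled=false;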
- If GROUP BY ... HAVING is used instead of the window function (essentially the t3 subquery from solution 1), the query does not fail:
select
t2.user_id
,t2.age
from
( SELECT
t1.user_id ,
t1.age ,
date_sub(t1.dt , t1.rn) as diff
from
(select
t.user_id ,
t.age ,
t.dt ,
ROW_NUMBER () over(PARTITION by t.user_id order by t.dt ) rn
from
(select
user_id ,
age ,
dt
from user_age group by user_id , age ,dt ) t) t1 ) t2
group by t2.user_id ,
t2.age ,
t2.diff HAVING count(t2.user_id) >= 2
- Setting the execution engine to MR also makes the query run:
set hive.execution.engine=mr;
select
t2.user_id
,t2.age
,COUNT(t2.user_id) over(PARTITION BY t2.user_id,t2.diff) as days
from
( SELECT
t1.user_id ,
t1.age ,
date_sub(t1.dt , t1.rn) as diff
from
(select
t.user_id ,
t.age ,
t.dt ,
ROW_NUMBER () over(PARTITION by t.user_id order by t.dt ) rn
from
(select
user_id ,
age ,
dt
from user_age group by user_id , age ,dt ) t) t1 ) t2
All jobs submitted through the MR engine complete successfully.
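A further untested workaround, staying on the Spark engine, is to keep diff as an integer so that date_sub never reaches the vectorized PTF path; equal integer offsets within one user still mark consecutive days. Only the diff expression changes:
-- sketch: replace date_sub(t1.dt , t1.rn) as diff with an integer day offset
datediff(t1.dt , '1970-01-01') - t1.rn as diff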