一. MySQL同步到Hive
1.1 MySQL数据准备
1.2 Hive端提前建表
-- Target Hive table for the DataX MySQL -> Hive sync.
-- Columns mirror the MySQL source table `fact_sale` (id/sale_date/prod_name/sale_nums);
-- the source's extra `reserverd1` column is intentionally not synced.
-- IF NOT EXISTS makes this setup statement safe to re-run.
CREATE TABLE IF NOT EXISTS `ods_fact_sale_orc_datax`(
`id` bigint,
`sale_date` string,
`prod_name` string,
`sale_nums` int)
-- NOTE(review): ROW FORMAT DELIMITED only applies to text-based storage; for
-- STORED AS ORC (a self-describing columnar format) Hive ignores this clause.
-- Kept only to mirror the "fieldDelimiter": "\t" in the DataX hdfswriter config.
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS ORC;
1.3 json配置文件准备
{
"setting": {},
"job": {
"setting": {
"speed": {
"channel": 2
}
},
"content": [
{
"reader": {
"name": "mysqlreader",
"parameter": {
"username": "root",
"password": "abc123",
"column": [
"id",
"sale_date",
"prod_name",
"sale_nums"
],
"splitPk": "id",
"connection": [
{
"table": [
"fact_sale"
],
"jdbcUrl": [
"jdbc:mysql://10.31.1.122:3306/test"
]
}
]
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"defaultFS": "hdfs://10.31.1.123:8020",
"fileType": "orc",
"path": "/user/hive/warehouse/test.db/ods_fact_sale_orc_datax",
"fileName": "ods_fact_sale_datax",
"column": [
{
"name": "id",
"type": "bigint"
},
{
"name": "sale_date",
"type": "string"
},
{
"name": "prod_name",
"type": "string"
},
{
"name": "sale_nums",
"type": "int"
}
],
"writeMode": "append",
"fieldDelimiter": "\t",
"compress":"NONE"
}
}
}
]
}
}
1.4 测试记录
代码:
python datax.py --jvm="-Xms3G -Xmx3G" mysqltohdfs.json
测试记录:
[15:13:58] [root@10-31-1-119 bin]# python datax.py --jvm="-Xms3G -Xmx3G" mysqltohdfs.json
[15:13:58]
[15:13:58] DataX (DATAX-OPENSOURCE-3.0), From Alibaba !
[15:13:58] Copyright (C) 2010-2017, Alibaba Group. All Rights Reserved.
[15:13:58]
[15:13:58]
[15:13:59] 2021-11-24 15:14:06.967 [main] INFO VMInfo - VMInfo# operatingSystem class => sun.management.OperatingSystemImpl
[15:13:59] 2021-11-24 15:14:06.974 [main] INFO Engine - the machine info =>
[15:13:59]
[15:13:59] osInfo: Oracle Corporation 1.8 25.242-b08
[15:13:59] jvmInfo: Linux amd64 3.10.0-1127.el7.x86_64
[15:13:59] cpu num: 8
[15:13:59]
[15:13:59] totalPhysicalMemory: -0.00G
[15:13:59] freePhysicalMemory: -0.00G
[15:13:59] maxFileDescriptorCount: -1
[15:13:59] currentOpenFileDescriptorCount: -1
[15:13:59]
[15:13:59] GC Names [PS MarkSweep, PS Scavenge]
[15:13:59]
[15:13:59] MEMORY_NAME | allocation_size | init_size
[15:13:59] PS Eden Space | 768.00MB | 768.00MB
[15:13:59] Code Cache | 240.00MB | 2.44MB
[15:13:59] Compressed Class Space | 1,024.00MB | 0.00MB
[15:13:59] PS Survivor Space | 128.00MB | 128.00MB
[15:13:59] PS Old Gen | 2,048.00MB | 2,048.00MB
[15:13:59] Metaspace | -0.00MB | 0.00MB
[15:13:59]
[15:13:59]
[15:13:59] 2021-11-24 15:14:06.989 [main] INFO Engine -
[15:13:59] {
[15:13:59] "content":[
[15:13:59] {
[15:13:59] "reader":{
[15:13:59] "name":"mysqlreader",
[15:13:59] "parameter":{
[15:13:59] "column":[
[15:13:59] "id",
[15:13:59] "sale_date",
[15:13:59] "prod_name",
[15:13:59] "sale_nums"
[15:13:59] ],
[15:13:59] "connection":[
[15:13:59] {
[15:13:59] "jdbcUrl":[
[15:13:59] "jdbc:mysql://10.31.1.122:3306/test"
[15:13:59] ],
[15:13:59] "table":[
[15:13:59] "fact_sale"
[15:13:59] ]
[15:13:59] }
[15:13:59] ],
[15:13:59] "password":"******",
[15:13:59] "splitPk":"id",
[15:13:59] "username":"root"
[15:13:59] }
[15:13:59] },
[15:13:59] "writer":{
[15:13:59] "name":"hdfswriter",
[15:13:59] "parameter":{
[15:13:59] "column":[
[15:13:59] {
[15:13:59] "name":"id",
[15:13:59] "type":"bigint"
[15:13:59] },
[15:13:59] {
[15:13:59] "name":"sale_date",
[15:13:59] "type":"string"
[15:13:59] },
[15:13:59] {
[15:13:59] "name":"prod_name",
[15:13:59] "type":"string"
[15:13:59] },
[15:13:59] {
[15:13:59] "name":"sale_nums",
[15:13:59] "type":"int"
[15:13:59] }
[15:13:59] ],
[15:13:59] "compress":"NONE",
[15:13:59] "defaultFS":"hdfs://10.31.1.123:8020",
[15:13:59] "fieldDelimiter":"\t",
[15:13:59] "fileName":"ods_fact_sale_datax",
[15:13:59] "fileType":"orc",
[15:13:59] "path":"/user/hive/warehouse/test.db/ods_fact_sale_orc_datax",
[15:13:59] "writeMode":"append"
[15:13:59] }
[15:13:59] }
[15:13:59] }
[15:13:59] ],
[15:13:59] "setting":{
[15:13:59] "speed":{
[15:13:59] "channel":2
[15:13:59] }
[15:13:59] }
[15:13:59] }
[15:13:59]
[15:13:59] 2021-11-24 15:14:07.002 [main] WARN Engine - prioriy set to 0, because NumberFormatException, the value is: null
[15:13:59] 2021-11-24 15:14:07.005 [main] INFO PerfTrace - PerfTrace traceId=job_-1, isEnable=false, priority=0
[15:13:59] 2021-11-24 15:14:07.006 [main] INFO JobContainer - DataX jobContainer starts job.
[15:13:59] 2021-11-24 15:14:07.010 [main] INFO JobContainer - Set jobId = 0
[15:13:59] 2021-11-24 15:14:07.291 [job-0] INFO OriginalConfPretreatmentUtil - Available jdbcUrl:jdbc:mysql://10.31.1.122:3306/test?yearIsDateType=false&zeroDateTimeBehavior=convertToNull&tinyInt1isBit=false&rewriteBatchedStatements=true.
[15:13:59] 2021-11-24 15:14:07.308 [job-0] INFO OriginalConfPretreatmentUtil - table:[fact_sale] has columns:[id,sale_date,prod_name,sale_nums,reserverd1].
[15:13:59] 十一月 24, 2021 3:14:07 下午 org.apache.hadoop.util.NativeCodeLoader <clinit>
[15:13:59] 警告: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[15:14:00] 2021-11-24 15:14:08.104 [job-0] INFO JobContainer - jobContainer starts to do prepare ...
[15:14:00] 2021-11-24 15:14:08.105 [job-0] INFO JobContainer - DataX Reader.Job [mysqlreader] do prepare work .
[15:14:00] 2021-11-24 15:14:08.105 [job-0] INFO JobContainer - DataX Writer.Job [hdfswriter] do prepare work .
[15:14:00] 2021-11-24 15:14:08.178 [job-0] INFO HdfsWriter$Job - 由于您配置了writeMode append, 写入前不做清理工作, [/user/hive/warehouse/test.db/ods_fact_sale_orc_datax] 目录下写入相应文件名前缀 [ods_fact_sale_datax] 的文件
[15:14:00] 2021-11-24 15:14:08.179 [job-0] INFO JobContainer - jobContainer starts to do split ...
[15:14:00] 2021-11-24 15:14:08.179 [job-0] INFO JobContainer - Job set Channel-Number to 2 channels.
[15:14:00] 2021-11-24 15:14:08.194 [job-0] INFO SingleTableSplitUtil - split pk [sql=SELECT MIN(id),MAX(id) FROM fact_sale] is running...
[15:14:00] 2021-11-24 15:14:08.219 [job-0] INFO SingleTableSplitUtil - After split(), allQuerySql=[
[15:14:00] select id,sale_date,prod_name,sale_nums from fact_sale where (1 <= id AND id < 78762161)
[15:14:00] select id,sale_date,prod_name,sale_nums from fact_sale where (78762161 <= id AND id < 157524321)
[15:14:00] select id,sale_date,prod_name,sale_nums from fact_sale where (157524321 <= id AND id < 236286481)
[15:14:00] select id,sale_date,prod_name,sale_nums from fact_sale where (236286481 <= id AND id < 315048641)
[15:14:00] select id,sale_date,prod_name,sale_nums from fact_sale where (315048641 <= id AND id < 393810801)
[15:14:00] select id,sale_date,prod_name,sale_nums from fact_sale where (393810801 <= id AND id < 472572961)
[15:14:00] select id,sale_date,prod_name,sale_nums from fact_sale where (472572961 <= id AND id < 551335120)
[15:14:00] select id,sale_date,prod_name,sale_nums from fact_sale where (551335120 <= id AND id < 630097279)
[15:14:00] select id,sale_date,prod_name,sale_nums from fact_sale where (630097279 <= id AND id < 708859438)
[15:14:00] select id,sale_date,prod_name,sale_nums from fact_sale where (708859438 <= id AND id <= 787621597)
[15:14:00] select id,sale_date,prod_name,sale_nums from fact_sale where id IS NULL
[15:14:00] ].
[15:14:00] 2021-11-24 15:14:08.219 [job-0] INFO JobContainer - DataX Reader.Job [mysqlreader] splits to [11] tasks.
[15:14:00] 2021-11-24 15:14:08.219 [job-0] INFO HdfsWriter$Job - begin do split...
[15:14:00] 2021-11-24 15:14:08.223 [job-0] INFO HdfsWriter$Job - splited write file name:[hdfs://10.31.1.123:8020/user/hive/warehouse/test.db/ods_fact_sale_orc_datax__dcc27788_0305_445b_acaa_54c8e61a99c4/ods_fact_sale_datax__bd91f932_1225_4aa3_90cf_7e5c15c5537f]
[15:14:00] 2021-11-24 15:14:08.224 [job-0] INFO HdfsWriter$Job - splited write file name:[hdfs://10.31.1.123:8020/user/hive/warehouse/test.db/ods_fact_sale_orc_datax__dcc27788_0305_445b_acaa_54c8e61a99c4/ods_fact_sale_datax__328e36ec_e8e4_4147_95f4_83748bc18a73]
[15:14:00] 2021-11-24 15:14:08.224 [job-0] INFO HdfsWriter$Job - splited write file name:[hdfs://10.31.1.123:8020/user/hive/warehouse/test.db/ods_fact_sale_orc_datax__dcc27788_0305_445b_acaa_54c8e61a99c4/ods_fact_sale_datax__871e2e30_3818_4c36_87d8_3142bb1b2703]
[15:14:00] 2021-11-24 15:14:08.224 [job-0] INFO HdfsWriter$Job - splited write file name:[hdfs://10.31.1.123:8020/user/hive/warehouse/test.db/ods_fact_sale_orc_datax__dcc27788_0305_445b_acaa_54c8e61a99c4/ods_fact_sale_datax__13e6edef_d40a_4500_8b22_71588d17e4e9]
[15:14:00] 2021-11-24 15:14:08.225 [job-0] INFO HdfsWriter$Job - splited write file name:[hdfs://10.31.1.123:8020/user/hive/warehouse/test.db/ods_fact_sale_orc_datax__dcc27788_0305_445b_acaa_54c8e61a99c4/ods_fact_sale_datax__df33a43d_e258_4fb1_ae2c_d5259fd1a554]
[15:14:00] 2021-11-24 15:14:08.225 [job-0] INFO HdfsWriter$Job - splited write file name:[hdfs://10.31.1.123:8020/user/hive/warehouse/test.db/ods_fact_sale_orc_datax__dcc27788_0305_445b_acaa_54c8e61a99c4/ods_fact_sale_datax__3b0f910c_67fb_421b_a4f4_bed340f035db]
[15:14:00] 2021-11-24 15:14:08.225 [job-0] INFO HdfsWriter$Job - splited write file name:[hdfs://10.31.1.123:8020/user/hive/warehouse/test.db/ods_fact_sale_orc_datax__dcc27788_0305_445b_acaa_54c8e61a99c4/ods_fact_sale_datax__e904b458_efc3_4191_a1d5_6594c86f3f75]
[15:14:00] 2021-11-24 15:14:08.226 [job-0] INFO HdfsWriter$Job - splited write file name:[hdfs://10.31.1.123:8020/user/hive/warehouse/test.db/ods_fact_sale_orc_datax__dcc27788_0305_445b_acaa_54c8e61a99c4/ods_fact_sale_datax__fc598d87_4dc4_4c31_9289_59b6a143e471]
[15:14:00] 2021-11-24 15:14:08.226 [job-0] INFO HdfsWriter$Job - splited write file name:[hdfs://10.31.1.123:8020/user/hive/warehouse/test.db/ods_fact_sale_orc_datax__dcc27788_0305_445b_acaa_54c8e61a99c4/ods_fact_sale_datax__ac9c6e09_99d9_40a7_acd0_18032961cd15]
[15:14:00] 2021-11-24 15:14:08.226 [job-0] INFO HdfsWriter$Job - splited write file name:[hdfs://10.31.1.123:8020/user/hive/warehouse/test.db/ods_fact_sale_orc_datax__dcc27788_0305_445b_acaa_54c8e61a99c4/ods_fact_sale_datax__94aeb798_65df_4b7a_8ef4_97adaf99b5ab]
[15:14:00] 2021-11-24 15:14:08.227 [job-0] INFO HdfsWriter$Job - splited write file name:[hdfs://10.31.1.123:8020/user/hive/warehouse/test.db/ods_fact_sale_orc_datax__dcc27788_0305_445b_acaa_54c8e61a99c4/ods_fact_sale_datax__4b4a7925_88db_442f_80cb_c270dc034b6b]
[15:14:00] 2021-11-24 15:14:08.227 [job-0] INFO HdfsWriter$Job - end do split.
[15:14:00] 2021-11-24 15:14:08.227 [job-0] INFO JobContainer - DataX Writer.Job [hdfswriter] splits to [11] tasks.
[15:14:00] 2021-11-24 15:14:08.245 [job-0] INFO JobContainer - jobContainer starts to do schedule ...
[15:14:00] 2021-11-24 15:14:08.254 [job-0] INFO JobContainer - Scheduler starts [1] taskGroups.
[15:14:00] 2021-11-24 15:14:08.256 [job-0] INFO JobContainer - Running by standalone Mode.
[15:14:00] 2021-11-24 15:14:08.267 [taskGroup-0] INFO TaskGroupContainer - taskGroupId=[0] start [2] channels for [11] tasks.
[15:14:00] 2021-11-24 15:14:08.273 [taskGroup-0] INFO Channel - Channel set byte_speed_limit to -1, No bps activated.
[15:14:00] 2021-11-24 15:14:08.273 [taskGroup-0] INFO Channel - Channel set record_speed_limit to -1, No tps activated.
[15:14:00] 2021-11-24 15:14:08.286 [taskGroup-0] INFO TaskGroupContainer - taskGroup[0] taskId[0] attemptCount[1] is started
[15:14:00] 2021-11-24 15:14:08.289 [0-0-0-reader] INFO CommonRdbmsReader$Task - Begin to read record by Sql: [select id,sale_date,prod_name,sale_nums from fact_sale where (1 <= id AND id < 78762161)
[15:14:00] ] jdbcUrl:[jdbc:mysql://10.31.1.122:3306/test?yearIsDateType=false&zeroDateTimeBehavior=convertToNull&tinyInt1isBit=false&rewriteBatchedStatements=true].
[15:14:00] 2021-11-24 15:14:08.289 [taskGroup-0] INFO TaskGroupContainer - taskGroup[0] taskId[1] attemptCount[1] is started
[15:14:00] 2021-11-24 15:14:08.291 [0-0-1-reader] INFO CommonRdbmsReader$Task - Begin to read record by Sql: [select id,sale_date,prod_name,sale_nums from fact_sale where (78762161 <= id AND id < 157524321)
[15:14:00] ] jdbcUrl:[jdbc:mysql://10.31.1.122:3306/test?yearIsDateType=false&zeroDateTimeBehavior=convertToNull&tinyInt1isBit=false&rewriteBatchedStatements=true].
[15:14:00] 2021-11-24 15:14:08.312 [0-0-1-writer] INFO HdfsWriter$Task - begin do write...
[15:14:00] 2021-11-24 15:14:08.312 [0-0-1-writer] INFO HdfsWriter$Task - write to file : [hdfs://10.31.1.123:8020/user/hive/warehouse/test.db/ods_fact_sale_orc_datax__dcc27788_0305_445b_acaa_54c8e61a99c4/ods_fact_sale_datax__328e36ec_e8e4_4147_95f4_83748bc18a73]
[15:14:00] 2021-11-24 15:14:08.312 [0-0-0-writer] INFO HdfsWriter$Task - begin do write...
[15:14:00] 2021-11-24 15:14:08.312 [0-0-0-writer] INFO HdfsWriter$Task - write to file :
..................................................................
..................................................................
..................................................................
..................................................................
.................................................................
...................................................................
[15:42:10] 2021-11-24 15:42:18.638 [job-0] INFO JobContainer -
[15:42:10] [total cpu info] =>
[15:42:10] averageCpu | maxDeltaCpu | minDeltaCpu
[15:42:10] -1.00% | -1.00% | -1.00%
[15:42:10]
[15:42:10]
[15:42:10] [total gc info] =>
[15:42:10] NAME | totalGCCount | maxDeltaGCCount | minDeltaGCCount | totalGCTime | maxDeltaGCTime | minDeltaGCTime
[15:42:10] PS MarkSweep | 4 | 1 | 0 | 0.285s | 0.096s | 0.000s
[15:42:10] PS Scavenge | 3131 | 590 | 256 | 24.641s | 5.802s | 2.277s
[15:42:10]
[15:42:10] 2021-11-24 15:42:18.638 [job-0] INFO JobContainer - PerfTrace not enable!
[15:42:10] 2021-11-24 15:42:18.639 [job-0] INFO StandAloneJobContainerCommunicator - Total 767830000 records, 18367983662 bytes | Speed 10.37MB/s, 454337 records/s | Error 0 records, 0 bytes | All Task WaitWriterTime 2,272.990s | All Task WaitReaderTime 16.823s | Percentage 100.00%
[15:42:10] 2021-11-24 15:42:18.640 [job-0] INFO JobContainer -
[15:42:10] 任务启动时刻 : 2021-11-24 15:14:07
[15:42:10] 任务结束时刻 : 2021-11-24 15:42:18
[15:42:10] 任务总计耗时 : 1691s
[15:42:10] 任务平均流量 : 10.37MB/s
[15:42:10] 记录写入速度 : 454337rec/s
[15:42:10] 读出记录总数 : 767830000
[15:42:10] 读写失败总数 : 0
[15:42:10]
网友评论