美文网首页
iceberg 元数据

iceberg 元数据

作者: 淡水河谷123 | 来源:发表于2021-04-23 12:02 被阅读0次

以下为一个hive-catalog的iceberg表的所有存在hdfs目录中的文件
包含
1.parquet数据文件
2.json元数据文件
3.avro snapshot文件
4.avro manifest文件

hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00001.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00003.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00004.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00005.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00006.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00007.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00008.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00009.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00010.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00011.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00012.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-79d89118-5069-4877-8332-2a592c887fe3-00001.parquet

hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00000-f9a42593-ab76-4933-a739-8e10b476fc85.metadata.json
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00001-2002be31-0182-4085-9173-aee3e4facc0b.metadata.json
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00002-2c5e9702-a908-43a6-bbe8-0f0c6582e984.metadata.json
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00003-3db39d6b-6311-4bdb-9d7b-b56f2df74fb3.metadata.json
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00004-a5490f98-4daf-4592-abf1-fdcc408f1b0f.metadata.json
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00005-b13e2c1f-1383-43c3-a53c-832ed8c68fa8.metadata.json
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00006-68ce5b89-27fb-421a-8a49-42f383dfc587.metadata.json
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00007-b3430d66-c9fb-401c-b800-e2ea4ad70d8d.metadata.json

hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/09769592-109f-4f6e-ab46-9b597dacfd43-m0.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/1a49a079-d7cf-41a6-931d-15ad2a44914b-m0.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/1a49a079-d7cf-41a6-931d-15ad2a44914b-m1.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/2b1ddf19-5701-4c0b-ac6a-ea41fdab9c07-m0.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/2b1ddf19-5701-4c0b-ac6a-ea41fdab9c07-m1.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/bf413511-d1cf-407f-bcc9-b6960cde7898-m0.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/bf413511-d1cf-407f-bcc9-b6960cde7898-m1.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/e97d1919-f47d-40c0-9eb6-24bf68f96980-m0.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/e97d1919-f47d-40c0-9eb6-24bf68f96980-m1.avro

hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m0.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m1.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m2.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m3.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m4.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m5.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m6.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m7.avro

hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-1289984099921389549-1-1a49a079-d7cf-41a6-931d-15ad2a44914b.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-3921229567852426700-1-bf413511-d1cf-407f-bcc9-b6960cde7898.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-5386042144404510937-1-09769592-109f-4f6e-ab46-9b597dacfd43.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-7125662397327732785-1-2b1ddf19-5701-4c0b-ac6a-ea41fdab9c07.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-7329471080018208648-1-f0bd795c-6a10-41bc-8f79-437fef1ff5f9.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-7377732782289998100-1-e97d1919-f47d-40c0-9eb6-24bf68f96980.avro

以下为iceberg表在hive中的建表语句
-- Hive DDL of the Iceberg table registered through the Hive catalog.
-- 'table_type'='ICEBERG' marks the table as an Iceberg table;
-- 'metadata_location' names the current metadata JSON file and
-- 'previous_metadata_location' names the metadata JSON file before it.
CREATE EXTERNAL TABLE iceberg_cdc_table(
id string COMMENT 'unique ID',
data string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.FileInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.mapred.FileOutputFormat'
LOCATION
'hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table'
TBLPROPERTIES (
'COLUMN_STATS_ACCURATE'='false',
'metadata_location'='hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00007-b3430d66-c9fb-401c-b800-e2ea4ad70d8d.metadata.json',
'numFiles'='0',
'numRows'='-1',
'previous_metadata_location'='hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00006-68ce5b89-27fb-421a-8a49-42f383dfc587.metadata.json',
'rawDataSize'='-1',
'table_type'='ICEBERG',
'totalSize'='0',
'transient_lastDdlTime'='1619089695')

其中metadata_location为当前的元数据文件,查看该文件

{
  "format-version" : 2,
  "table-uuid" : "924ae1db-5aad-451a-ae3b-bd933296ea84",
  "location" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table",
  "last-sequence-number" : 6,
  "last-updated-ms" : 1619090084800,
  "last-column-id" : 2,
  "current-schema-id" : 0,
  "schemas" : [ {
    "type" : "struct",
    "schema-id" : 0,
    "fields" : [ {
      "id" : 1,
      "name" : "id",
      "required" : true,
      "type" : "string",
      "doc" : "unique ID"
    }, {
      "id" : 2,
      "name" : "data",
      "required" : true,
      "type" : "string"
    } ]
  } ],
  "default-spec-id" : 0,
  "partition-specs" : [ {
    "spec-id" : 0,
    "fields" : [ ]
  } ],
  "last-partition-id" : 999,
  "default-sort-order-id" : 0,
  "sort-orders" : [ {
    "order-id" : 0,
    "fields" : [ ]
  } ],
  "row-key" : {
    "identifier-fields" : [ {
      "source-id" : 1
    } ]
  },
  "properties" : { },
  "current-snapshot-id" : 7329471080018208648,
  "snapshots" : [ {
    "sequence-number" : 1,
    "snapshot-id" : 5386042144404510937,
    "timestamp-ms" : 1619089843403,
    "summary" : {
      "operation" : "append",
      "flink.job-id" : "94aed63193990d73442f8696c3eee136",
      "flink.max-committed-checkpoint-id" : "1",
      "added-data-files" : "1",
      "added-records" : "1000000",
      "added-files-size" : "3076138",
      "changed-partition-count" : "1",
      "total-records" : "1000000",
      "total-files-size" : "3076138",
      "total-data-files" : "1",
      "total-delete-files" : "0",
      "total-position-deletes" : "0",
      "total-equality-deletes" : "0"
    },
    "manifest-list" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-5386042144404510937-1-09769592-109f-4f6e-ab46-9b597dacfd43.avro"
  }, {
    "sequence-number" : 2,
    "snapshot-id" : 1289984099921389549,
    "parent-snapshot-id" : 5386042144404510937,
    "timestamp-ms" : 1619089902186,
    "summary" : {
      "operation" : "overwrite",
      "flink.job-id" : "94aed63193990d73442f8696c3eee136",
      "flink.max-committed-checkpoint-id" : "2",
      "added-data-files" : "1",
      "added-delete-files" : "1",
      "added-records" : "21892",
      "added-files-size" : "184249",
      "added-equality-deletes" : "21892",
      "changed-partition-count" : "1",
      "total-records" : "1021892",
      "total-files-size" : "3260387",
      "total-data-files" : "2",
      "total-delete-files" : "1",
      "total-position-deletes" : "0",
      "total-equality-deletes" : "21892"
    },
    "manifest-list" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-1289984099921389549-1-1a49a079-d7cf-41a6-931d-15ad2a44914b.avro"
  }, {
    "sequence-number" : 3,
    "snapshot-id" : 7377732782289998100,
    "parent-snapshot-id" : 1289984099921389549,
    "timestamp-ms" : 1619089962201,
    "summary" : {
      "operation" : "overwrite",
      "flink.job-id" : "94aed63193990d73442f8696c3eee136",
      "flink.max-committed-checkpoint-id" : "3",
      "added-data-files" : "1",
      "added-delete-files" : "1",
      "added-records" : "73302",
      "added-files-size" : "604308",
      "added-equality-deletes" : "73302",
      "changed-partition-count" : "1",
      "total-records" : "1095194",
      "total-files-size" : "3864695",
      "total-data-files" : "3",
      "total-delete-files" : "2",
      "total-position-deletes" : "0",
      "total-equality-deletes" : "95194"
    },
    "manifest-list" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-7377732782289998100-1-e97d1919-f47d-40c0-9eb6-24bf68f96980.avro"
  }, {
    "sequence-number" : 4,
    "snapshot-id" : 3921229567852426700,
    "parent-snapshot-id" : 7377732782289998100,
    "timestamp-ms" : 1619090021768,
    "summary" : {
      "operation" : "overwrite",
      "flink.job-id" : "94aed63193990d73442f8696c3eee136",
      "flink.max-committed-checkpoint-id" : "4",
      "added-data-files" : "1",
      "added-delete-files" : "1",
      "added-records" : "95137",
      "added-files-size" : "783498",
      "added-equality-deletes" : "95137",
      "changed-partition-count" : "1",
      "total-records" : "1190331",
      "total-files-size" : "4648193",
      "total-data-files" : "4",
      "total-delete-files" : "3",
      "total-position-deletes" : "0",
      "total-equality-deletes" : "190331"
    },
    "manifest-list" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-3921229567852426700-1-bf413511-d1cf-407f-bcc9-b6960cde7898.avro"
  }, {
    "sequence-number" : 5,
    "snapshot-id" : 7125662397327732785,
    "parent-snapshot-id" : 3921229567852426700,
    "timestamp-ms" : 1619090082142,
    "summary" : {
      "operation" : "overwrite",
      "flink.job-id" : "94aed63193990d73442f8696c3eee136",
      "flink.max-committed-checkpoint-id" : "5",
      "added-data-files" : "1",
      "added-delete-files" : "1",
      "added-records" : "2772",
      "added-files-size" : "25696",
      "added-equality-deletes" : "2772",
      "changed-partition-count" : "1",
      "total-records" : "1193103",
      "total-files-size" : "4673889",
      "total-data-files" : "5",
      "total-delete-files" : "4",
      "total-position-deletes" : "0",
      "total-equality-deletes" : "193103"
    },
    "manifest-list" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-7125662397327732785-1-2b1ddf19-5701-4c0b-ac6a-ea41fdab9c07.avro"
  }, {
    "sequence-number" : 6,
    "snapshot-id" : 7329471080018208648,
    "parent-snapshot-id" : 7125662397327732785,
    "timestamp-ms" : 1619090084800,
    "summary" : {
      "operation" : "replace",
      "added-data-files" : "1",
      "deleted-data-files" : "4",
      "removed-delete-files" : "3",
      "added-records" : "1000000",
      "deleted-records" : "1190331",
      "added-files-size" : "3293597",
      "removed-files-size" : "4648193",
      "removed-equality-deletes" : "190331",
      "changed-partition-count" : "1",
      "total-records" : "1002772",
      "total-files-size" : "3319293",
      "total-data-files" : "2",
      "total-delete-files" : "1",
      "total-position-deletes" : "0",
      "total-equality-deletes" : "2772"
    },
    "manifest-list" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-7329471080018208648-1-f0bd795c-6a10-41bc-8f79-437fef1ff5f9.avro"
  } ],
  "snapshot-log" : [ {
    "timestamp-ms" : 1619089843403,
    "snapshot-id" : 5386042144404510937
  }, {
    "timestamp-ms" : 1619089902186,
    "snapshot-id" : 1289984099921389549
  }, {
    "timestamp-ms" : 1619089962201,
    "snapshot-id" : 7377732782289998100
  }, {
    "timestamp-ms" : 1619090021768,
    "snapshot-id" : 3921229567852426700
  }, {
    "timestamp-ms" : 1619090082142,
    "snapshot-id" : 7125662397327732785
  }, {
    "timestamp-ms" : 1619090084800,
    "snapshot-id" : 7329471080018208648
  } ],
  "metadata-log" : [ {
    "timestamp-ms" : 1619089691387,
    "metadata-file" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00000-f9a42593-ab76-4933-a739-8e10b476fc85.metadata.json"
  }, {
    "timestamp-ms" : 1619089741748,
    "metadata-file" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00001-2002be31-0182-4085-9173-aee3e4facc0b.metadata.json"
  }, {
    "timestamp-ms" : 1619089843403,
    "metadata-file" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00002-2c5e9702-a908-43a6-bbe8-0f0c6582e984.metadata.json"
  }, {
    "timestamp-ms" : 1619089902186,
    "metadata-file" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00003-3db39d6b-6311-4bdb-9d7b-b56f2df74fb3.metadata.json"
  }, {
    "timestamp-ms" : 1619089962201,
    "metadata-file" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00004-a5490f98-4daf-4592-abf1-fdcc408f1b0f.metadata.json"
  }, {
    "timestamp-ms" : 1619090021768,
    "metadata-file" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00005-b13e2c1f-1383-43c3-a53c-832ed8c68fa8.metadata.json"
  }, {
    "timestamp-ms" : 1619090082142,
    "metadata-file" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00006-68ce5b89-27fb-421a-8a49-42f383dfc587.metadata.json"
  } ]
}

其中包含了所有的snapshot信息和所有的元数据文件信息
注意sequence-number和snapshot-id,它们是强关联的,
sequence-number在v2版本的表中会作为标识数据的序列号
读取的时候,从data文件中过滤掉equality-delete数据是按sequence-number过滤的
即只应用sequence-number比data文件大的equality-delete文件

小文件合并也和入数据checkpoint一样生成新的snapshot
如果入库snapshot是3 然后开始小文件合并 合并过程中入库生成snapshot 4
然后合并完成生成snapshot 5
snapshot5的文件只合并了snapshot3的文件,需要对snapshot 4中的equality-delete文件进行过滤,但是因为5比4大就不会过滤了

小文件合并跨了入库的snapshot数据就有问题了

当前的snapshotID和对应的文件,查看该文件snap-7329471080018208648-1-f0bd795c-6a10-41bc-8f79-437fef1ff5f9.avro

{
  "manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m7.avro",
  "manifest_length" : 6569,
  "partition_spec_id" : 0,
  "content" : 0,
  "sequence_number" : 6,
  "min_sequence_number" : 6,
  "added_snapshot_id" : 7329471080018208648,
  "added_data_files_count" : 1,
  "existing_data_files_count" : 0,
  "deleted_data_files_count" : 0,
  "added_rows_count" : 1000000,
  "existing_rows_count" : 0,
  "deleted_rows_count" : 0,
  "partitions" : {
    "array" : [ ]
  }
  
  00000-0-79d89118-5069-4877-8332-2a592c887fe3-00001.parquet  "status" : 1  "content" : 0
  
}
{
  "manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/2b1ddf19-5701-4c0b-ac6a-ea41fdab9c07-m0.avro",
  "manifest_length" : 6557,
  "partition_spec_id" : 0,
  "content" : 0,
  "sequence_number" : 5,
  "min_sequence_number" : 5,
  "added_snapshot_id" : 7125662397327732785,
  "added_data_files_count" : 1,
  "existing_data_files_count" : 0,
  "deleted_data_files_count" : 0,
  "added_rows_count" : 2772,
  "existing_rows_count" : 0,
  "deleted_rows_count" : 0,
  "partitions" : {
    "array" : [ ]
  }
  
  00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00009.parquet  "status" : 1  "content" : 0
  
}
{
  "manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m0.avro",
  "manifest_length" : 6553,
  "partition_spec_id" : 0,
  "content" : 0,
  "sequence_number" : 6,
  "min_sequence_number" : 6,
  "added_snapshot_id" : 7329471080018208648,
  "added_data_files_count" : 0,
  "existing_data_files_count" : 0,
  "deleted_data_files_count" : 1,
  "added_rows_count" : 0,
  "existing_rows_count" : 0,
  "deleted_rows_count" : 95137,
  "partitions" : {
    "array" : [ ]
  }
  
  00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00007.parquet  "status" : 2  "content" : 0
  
}
{
  "manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m3.avro",
  "manifest_length" : 6554,
  "partition_spec_id" : 0,
  "content" : 0,
  "sequence_number" : 6,
  "min_sequence_number" : 6,
  "added_snapshot_id" : 7329471080018208648,
  "added_data_files_count" : 0,
  "existing_data_files_count" : 0,
  "deleted_data_files_count" : 1,
  "added_rows_count" : 0,
  "existing_rows_count" : 0,
  "deleted_rows_count" : 73302,
  "partitions" : {
    "array" : [ ]
  }
  
  00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00005.parquet  "status" : 2  "content" : 0
  
}
{
  "manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m2.avro",
  "manifest_length" : 6553,
  "partition_spec_id" : 0,
  "content" : 0,
  "sequence_number" : 6,
  "min_sequence_number" : 6,
  "added_snapshot_id" : 7329471080018208648,
  "added_data_files_count" : 0,
  "existing_data_files_count" : 0,
  "deleted_data_files_count" : 1,
  "added_rows_count" : 0,
  "existing_rows_count" : 0,
  "deleted_rows_count" : 21892,
  "partitions" : {
    "array" : [ ]
  }
  
  00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00003.parquet  "status" : 2  "content" : 0
  
}
{
  "manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m1.avro",
  "manifest_length" : 6566,
  "partition_spec_id" : 0,
  "content" : 0,
  "sequence_number" : 6,
  "min_sequence_number" : 6,
  "added_snapshot_id" : 7329471080018208648,
  "added_data_files_count" : 0,
  "existing_data_files_count" : 0,
  "deleted_data_files_count" : 1,
  "added_rows_count" : 0,
  "existing_rows_count" : 0,
  "deleted_rows_count" : 1000000,
  "partitions" : {
    "array" : [ ]
  }
  
  00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00001.parquet  "status" : 2  "content" : 0
  
}
{
  "manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/2b1ddf19-5701-4c0b-ac6a-ea41fdab9c07-m1.avro",
  "manifest_length" : 6568,
  "partition_spec_id" : 0,
  "content" : 1,
  "sequence_number" : 5,
  "min_sequence_number" : 5,
  "added_snapshot_id" : 7125662397327732785,
  "added_data_files_count" : 1,
  "existing_data_files_count" : 0,
  "deleted_data_files_count" : 0,
  "added_rows_count" : 2772,
  "existing_rows_count" : 0,
  "deleted_rows_count" : 0,
  "partitions" : {
    "array" : [ ]
  }
  
  00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00010.parquet  "status" : 1  "content" : 2
  
}
{
  "manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m4.avro",
  "manifest_length" : 6568,
  "partition_spec_id" : 0,
  "content" : 1,
  "sequence_number" : 6,
  "min_sequence_number" : 6,
  "added_snapshot_id" : 7329471080018208648,
  "added_data_files_count" : 0,
  "existing_data_files_count" : 0,
  "deleted_data_files_count" : 1,
  "added_rows_count" : 0,
  "existing_rows_count" : 0,
  "deleted_rows_count" : 95137,
  "partitions" : {
    "array" : [ ]
  }
  
  00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00008.parquet  "status" : 2  "content" : 2
  
}
{
  "manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m5.avro",
  "manifest_length" : 6570,
  "partition_spec_id" : 0,
  "content" : 1,
  "sequence_number" : 6,
  "min_sequence_number" : 6,
  "added_snapshot_id" : 7329471080018208648,
  "added_data_files_count" : 0,
  "existing_data_files_count" : 0,
  "deleted_data_files_count" : 1,
  "added_rows_count" : 0,
  "existing_rows_count" : 0,
  "deleted_rows_count" : 73302,
  "partitions" : {
    "array" : [ ]
  }
  
  00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00006.parquet  "status" : 2  "content" : 2
  
}
{
  "manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m6.avro",
  "manifest_length" : 6567,
  "partition_spec_id" : 0,
  "content" : 1,
  "sequence_number" : 6,
  "min_sequence_number" : 6,
  "added_snapshot_id" : 7329471080018208648,
  "added_data_files_count" : 0,
  "existing_data_files_count" : 0,
  "deleted_data_files_count" : 1,
  "added_rows_count" : 0,
  "existing_rows_count" : 0,
  "deleted_rows_count" : 21892,
  "partitions" : {
    "array" : [ ]
  }
  
  00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00004.parquet  "status" : 2  "content" : 2
  
}

这其中包含了所有的manifest文件,注意content属性,在ManifestContent 中定义了其意义,0表示记录数据文件的Manifest(DATA),1表示记录删除文件的Manifest(DELETES)

/**
 * Kind of content tracked by a manifest file.
 *
 * <p>Each constant carries a fixed integer id: {@code DATA} is 0 and {@code DELETES} is 1.
 */
public enum ManifestContent {
  DATA(0),
  DELETES(1);

  /** Integer id assigned to this constant. */
  private final int contentId;

  ManifestContent(int contentId) {
    this.contentId = contentId;
  }

  /**
   * Returns the integer id assigned to this constant.
   *
   * @return 0 for {@code DATA}, 1 for {@code DELETES}
   */
  public int id() {
    return this.contentId;
  }
}

查看manifest文件

{
  "status" : 1,
  "snapshot_id" : {
    "long" : 7329471080018208648
  },
  "sequence_number" : null,
  "data_file" : {
    "content" : 0,
    "file_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-79d89118-5069-4877-8332-2a592c887fe3-00001.parquet",
    "file_format" : "PARQUET",
    "partition" : { },
    "record_count" : 1000000,
    "file_size_in_bytes" : 3293597,
    "column_sizes" : {
      "array" : [ {
        "key" : 1,
        "value" : 2554588
      }, {
        "key" : 2,
        "value" : 734455
      } ]
    },
    "value_counts" : {
      "array" : [ {
        "key" : 1,
        "value" : 1000000
      }, {
        "key" : 2,
        "value" : 1000000
      } ]
    },
    "null_value_counts" : {
      "array" : [ {
        "key" : 1,
        "value" : 0
      }, {
        "key" : 2,
        "value" : 0
      } ]
    },
    "nan_value_counts" : {
      "array" : [ ]
    },
    "lower_bounds" : {
      "array" : [ {
        "key" : 1,
        "value" : "0"
      }, {
        "key" : 2,
        "value" : "007-dacf7d6ae3f9"
      } ]
    },
    "upper_bounds" : {
      "array" : [ {
        "key" : 1,
        "value" : "999999"
      }, {
        "key" : 2,
        "value" : "ff3-e85ff5b95460"
      } ]
    },
    "key_metadata" : null,
    "split_offsets" : {
      "array" : [ 4 ]
    },
    "equality_ids" : null,
    "sort_order_id" : {
      "int" : 0
    }
  }
}

注意status属性,在ManifestEntry接口中定义了枚举

package org.apache.iceberg;

/**
 * Entry of a manifest file; this excerpt shows only the nested {@link Status} enum.
 *
 * @param <F> the content file type carried by the entry
 */
interface ManifestEntry<F extends ContentFile<F>> {
  /**
   * Status of an entry, each with a fixed integer id:
   * {@code EXISTING} is 0, {@code ADDED} is 1 and {@code DELETED} is 2.
   */
  enum Status {
    EXISTING(0),
    ADDED(1),
    DELETED(2);

    /** Integer id assigned to this status. */
    private final int statusId;

    Status(int statusId) {
      this.statusId = statusId;
    }

    /**
     * Returns the integer id assigned to this status.
     *
     * @return 0 for {@code EXISTING}, 1 for {@code ADDED}, 2 for {@code DELETED}
     */
    public int id() {
      return this.statusId;
    }
  }
}

0表示已存在的文件(EXISTING),1表示添加的文件(ADDED),2表示已经无效需要删除的文件(DELETED)

还有content属性,在FileContent 类中定义了其意义,0表示数据文件,1表示POSITION_DELETES文件,2表示 EQUALITY_DELETES文件

package org.apache.iceberg;

/**
 * Kind of content held by a single file.
 *
 * <p>Each constant carries a fixed integer id: {@code DATA} is 0,
 * {@code POSITION_DELETES} is 1 and {@code EQUALITY_DELETES} is 2.
 */
public enum FileContent {
  DATA(0),
  POSITION_DELETES(1),
  EQUALITY_DELETES(2);

  /** Integer id assigned to this constant. */
  private final int contentId;

  FileContent(int contentId) {
    this.contentId = contentId;
  }

  /**
   * Returns the integer id assigned to this constant.
   *
   * @return 0 for {@code DATA}, 1 for {@code POSITION_DELETES}, 2 for {@code EQUALITY_DELETES}
   */
  public int id() {
    return this.contentId;
  }
}

上面的snapshot文件snap-7329471080018208648-1-f0bd795c-6a10-41bc-8f79-437fef1ff5f9.avro是最新的snapshot文件,有6个content为0的文件和4个content为1的文件,因为我这里是初始入了100w条cdc数据生成一个data文件,然后经历了4次update,生成了4个data文件和4个delete文件,最后做了一个文件合并生成一个新的data文件。

我提取了其中对应的parquet文件和其status和content信息,status为1的有3个,即只有3个有效的文件,一个是进行小文件合并后生成的文件,两个是之后入库的更新文件,这两个一个是DATA文件,一个是EQUALITY_DELETES文件(content为2)。

而在小文件合并之前则是9个有效文件,5个data文件和4个EQUALITY_DELETES文件。

相关文章

网友评论

      本文标题:iceberg 元数据

      本文链接:https://www.haomeiwen.com/subject/cxnfrltx.html