下图包含有MP4标准文档:

mp4文件由一个个BOX组成,BOX可以嵌套,如图:

BOX主要由header和body两部分组成,body包含实际数据或其他BOX,header主要由4字节的size和4字节的type组成,size是整个BOX的大小,type是BOX的类型,如图:

- 如果size等于0,表示该BOX一直延伸到文件末尾,因此它只能是文件中的最后一个BOX(通常只用于mdat)。
- 如果size等于1,该BOX存在可选的8字节的largesize字段,表示整个BOX的大小。
- 如果type等于"uuid",该BOX存在可选的16字节的UUID字段。
- 有一部分box是fullbox,它在基础box上添加了1字节的version字段和3字节的flags字段。
下表列出了标准定义的一些BOX以及它们之间的嵌套关系,带*号的BOX是必选的。

常见的比较重要的几个box
// mp4文件的第一个box
// brand通常有 isom、iso2、iso6、mp41、mp42、avc1 等。
aligned(8) class FileTypeBox extends Box(‘ftyp’) {
unsigned int(32) major_brand; // 主brand
unsigned int(32) minor_version; // 主brand的次版本号(minor version),仅供参考
unsigned int(32) compatible_brands[]; // 兼容brand列表
}
// 整个视频的整体信息
aligned(8) class MovieHeaderBox extends FullBox(‘mvhd’, version, 0) {
if (version == 1) {
unsigned int(64) creation_time; // 创建时间,从UTC时间1904年1月1日00:00:00起的秒数。
unsigned int(64) modification_time; // 修改时间,计时起点同上。
unsigned int(32) timescale; // 一秒的时间刻度。
unsigned int(64) duration; // 总的持续时间。duration/timescale = 总秒数。
} else { // version==0
unsigned int(32) creation_time; // 创建时间,从UTC时间1904年1月1日00:00:00起的秒数。
unsigned int(32) modification_time; // 修改时间,计时起点同上。
unsigned int(32) timescale; // 一秒的时间刻度。
unsigned int(32) duration; // 总的持续时间。duration/timescale = 总秒数。
}
template int(32) rate; // 播放速度,16.16定点数,通常是0x00010000(1.0),也就是1.0正常速度播放。
template int(16) volume; // 音量大小,8.8定点数,通常是0x0100(1.0),也就是1.0全部音量。
const bit(16) reserved = 0; // 预留
const unsigned int(32)[2] reserved = 0; // 预留
template int(32)[9] matrix = {0x00010000,0,0,0,0x00010000,0,0,0,0x40000000 }; //Unity matrix
bit(32)[6] pre_defined = 0; // 预定义字段,固定为0
unsigned int(32) next_track_ID; // 下一个新增Track可使用的track ID,非0,且必须大于当前已使用的最大track ID
}
// Track 的整体信息
aligned(8) class TrackHeaderBox extends FullBox(‘tkhd’, version, flags){
if (version==1) {
unsigned int(64) creation_time; // 创建时间,从UTC时间1904年1月1日00:00:00起的秒数。
unsigned int(64) modification_time; // 修改时间,计时起点同上。
unsigned int(32) track_ID; // track ID
unsigned int(32) reserved = 0; // 预留
unsigned int(64) duration; // 该 Track 总时间,duration/timescale = 总秒数。
} else { // version==0
unsigned int(32) creation_time; // 创建时间,从UTC时间1904年1月1日00:00:00起的秒数。
unsigned int(32) modification_time; // 修改时间,计时起点同上。
unsigned int(32) track_ID; // track ID
unsigned int(32) reserved = 0; // 预留
unsigned int(32) duration; // 该 Track 总时间,duration/timescale = 总秒数。
}
unsigned int(32)[2] reserved = 0; // 预留
template int(16) layer = 0; // 视频Track的前后叠放顺序,数值越小越靠近观看者,通常为0
template int(16) alternate_group = 0; // 备选分组号,同一组内的Track互为备选;0表示与其他Track没有备选关系
template int(16) volume = {if track_is_audio 0x0100 else 0};
unsigned int(16) reserved = 0; // 预留
template int(32)[9] matrix={ 0x00010000,0,0,0,0x00010000,0,0,0,0x40000000 };// unity matrix
unsigned int(32) width; // 显示宽度,16.16定点数。对于视频Track是画面的显示宽度;对于文字或字幕类Track,它可以是显示文字区域建议的尺寸。
unsigned int(32) height; // 显示高度,16.16定点数。对于视频Track是画面的显示高度;对于文字或字幕类Track,它可以是显示文字区域建议的尺寸。
}
// 媒体头声明与Track中媒体特性相关的总体信息(与具体媒体类型无关)。
aligned(8) class MediaHeaderBox extends FullBox(‘mdhd’, version, 0) {
if (version==1) {
unsigned int(64) creation_time; // 创建时间,从UTC时间1904年1月1日00:00:00起的秒数。
unsigned int(64) modification_time; // 修改时间,计时起点同上。
unsigned int(32) timescale; // 一秒的时间刻度。
unsigned int(64) duration; // 总的持续时间。duration/timescale = 总秒数。
}
else {
unsigned int(32) creation_time; // 创建时间,从UTC时间1904年1月1日00:00:00起的秒数。
unsigned int(32) modification_time; // 修改时间,计时起点同上。
unsigned int(32) timescale; // 一秒的时间刻度。
unsigned int(32) duration; // 总的持续时间。duration/timescale = 总秒数。
}
bit(1) pad = 0;
unsigned int(5)[3] language; // ISO-639-2/T language code
unsigned int(16) pre_defined = 0;
}
// stsd box 有子box。
aligned(8) class SampleDescriptionBox (unsigned int(32) handler_type) extends FullBox('stsd', version, 0){
unsigned int(32) entry_count;
for (i=1; i<=entry_count; i++){
SampleEntry(); // an instance of a class derived from SampleEntry
}
}
// 记录每个sample的持续时间
aligned(8) class TimeToSampleBox extends FullBox(‘stts’, version = 0, 0) {
unsigned int(32) entry_count; // 项目个数
for (i=1; i<=entry_count; i++) { //
unsigned int(32) sample_count; // 连续相同的持续时间的数量
unsigned int(32) sample_delta; // sample的持续时间
}
}
// 记录关键帧的列表
aligned(8) class SyncSampleBox extends FullBox(‘stss’, version = 0, 0) {
unsigned int(32) entry_count; // 关键帧个数
for (i=1; i<=entry_count; i++) { //
unsigned int(32) sample_number; // 关键帧的帧号,从1数起,不是从0数起。
}
}
// 记录每个sample的显示时间和解码时间的时间差
aligned(8) class CompositionOffsetBox extends FullBox(‘ctts’, version, 0) {
unsigned int(32) entry_count; // 个数。
if (version == 0) {
for (i=1; i<=entry_count; i++) {
unsigned int(32) sample_count; // 连续相同的偏移量的数量
unsigned int(32) sample_offset; // 显示时间相对解码时间的偏移量,解码时间点+sample_offset=显示时间点。
}
}
else if (version == 1) {
for (i=1; i<=entry_count; i++) {
unsigned int(32) sample_count; // 连续相同的偏移量的数量
signed int(32) sample_offset; // 显示时间相对解码时间的偏移量,解码时间点+sample_offset=显示时间点。
}
}
}
// 记录每个 Chunk 中 sample 的个数,计算方法有点绕:
// entry_count是Chunk组的数量,每个Chunk组中第一个Chunk的序号是first_chunk,每个Chunk组中Chunk的数量
// 是下一个Chunk组的first_chunk减去本Chunk组的first_chunk,如果没有下一个Chunk组,那就从first_chunk数
// 到最后一个Chunk。Chunk总数可从 "stco" 或 "co64" box 获取。
aligned(8) class SampleToChunkBox extends FullBox(‘stsc’, version = 0, 0) {
unsigned int(32) entry_count; // Chunk组的数量
for (i=1; i<=entry_count; i++) { //
unsigned int(32) first_chunk; // 该Chunk组中第一个Chunk的序号
unsigned int(32) samples_per_chunk; // 该Chunk组中每一个Chunk中sample的个数
unsigned int(32) sample_description_index; // 该Chunk组中sample所使用的sample描述在 "stsd" box 中的序号,从1数起。
}
}
// 记录每一个 sample 的大小
aligned(8) class SampleSizeBox extends FullBox(‘stsz’, version = 0, 0) {
unsigned int(32) sample_size; // 如果每一个sample的大小都相同,那么sample的大小就是该值。
unsigned int(32) sample_count; // 该Track中sample的总个数;当sample_size为0时,后面跟着sample_count个entry_size。
if (sample_size == 0) { //
for (i=1; i<=sample_count; i++) { //
unsigned int(32) entry_size; // 各个sample的大小
}
}
}
// 记录每个 Chunk 在整个文件中的位置
aligned(8) class ChunkOffsetBox extends FullBox(‘stco’, version = 0, 0) {
unsigned int(32) entry_count; // Chunk 个数
for (i=1; i<=entry_count; i++) { //
unsigned int(32) chunk_offset; // 各个 Chunk 在整个mp4文件中的偏移量。
}
}
// 作用同 stco,不同的是每个偏移量数值是64位,为了应对大于4GB的mp4文件。
aligned(8) class ChunkLargeOffsetBox extends FullBox(‘co64’, version = 0, 0) {
unsigned int(32) entry_count;
for (i=1; i<=entry_count; i++) {
unsigned int(64) chunk_offset;
}
}
示例代码
以下示例代码可用于初步分析mp4文件结构:
#include <stdio.h>
#include <stdint.h>
/*
 * Box type codes. Each value is the four ASCII characters of the box type
 * packed into 32 bits with the first character in the most significant byte
 * (for example 'f','t','y','p' -> 0x66747970), so it can be compared directly
 * with a type field read through readU32BE().
 */
#define MP4BOXTYPE_ftyp 0x66747970
#define MP4BOXTYPE_moov 0x6d6f6f76
#define MP4BOXTYPE_mvhd 0x6d766864
#define MP4BOXTYPE_iods 0x696f6473
#define MP4BOXTYPE_trak 0x7472616b
#define MP4BOXTYPE_tkhd 0x746b6864
#define MP4BOXTYPE_edts 0x65647473
#define MP4BOXTYPE_elst 0x656c7374
#define MP4BOXTYPE_mdia 0x6d646961
#define MP4BOXTYPE_mdhd 0x6d646864
#define MP4BOXTYPE_hdlr 0x68646c72
#define MP4BOXTYPE_minf 0x6d696e66
#define MP4BOXTYPE_vmhd 0x766d6864
#define MP4BOXTYPE_smhd 0x736d6864
#define MP4BOXTYPE_dinf 0x64696e66
#define MP4BOXTYPE_dref 0x64726566
#define MP4BOXTYPE_stbl 0x7374626c
#define MP4BOXTYPE_stsd 0x73747364
#define MP4BOXTYPE_stts 0x73747473
#define MP4BOXTYPE_stss 0x73747373
#define MP4BOXTYPE_ctts 0x63747473
#define MP4BOXTYPE_stsc 0x73747363
#define MP4BOXTYPE_stsz 0x7374737a
#define MP4BOXTYPE_stco 0x7374636f
#define MP4BOXTYPE_co64 0x636f3634
#define MP4BOXTYPE_sgpd 0x73677064
#define MP4BOXTYPE_sbgp 0x73626770
#define MP4BOXTYPE_sdtp 0x73647470
#define MP4BOXTYPE_udta 0x75647461
#define MP4BOXTYPE_mdat 0x6d646174
#define MP4BOXTYPE_free 0x66726565
#define MP4BOXTYPE_uuid 0x75756964
/*
 * 64-bit file positioning wrappers. The MSVC CRT spells these
 * _fseeki64/_ftelli64; everywhere else the POSIX fseeko/ftello pair is used.
 * NOTE(review): on 32-bit POSIX targets off_t is only 64 bits wide when the
 * build defines _FILE_OFFSET_BITS=64 -- confirm the build flags there.
 */
#if defined(_WIN32)
static int mp4Seek(FILE *fp, int64_t offset, int origin){
    return _fseeki64(fp, offset, origin);
}
static int64_t mp4Tell(FILE *fp){
    return _ftelli64(fp);
}
#else
static int mp4Seek(FILE *fp, int64_t offset, int origin){
    return fseeko(fp, (off_t)offset, origin);
}
static int64_t mp4Tell(FILE *fp){
    return (int64_t)ftello(fp);
}
#endif

/*
 * Returns the size of the file in bytes, or -1 when the stream cannot be
 * positioned. The stream is left positioned at the end of the file.
 */
static int64_t getFileSize(FILE *fp){
    if(mp4Seek(fp, 0, SEEK_END) != 0){
        return -1;
    }
    return mp4Tell(fp);
}

/*
 * Reads a big-endian unsigned 32-bit integer.
 *
 * fp     - stream to read from.
 * offset - absolute file offset to read at; a negative value means "read at
 *          the current stream position".
 *
 * Returns the decoded value, or 0 when seeking fails or fewer than four
 * bytes are available (a short read never yields indeterminate data).
 */
static uint32_t readU32BE(FILE *fp, int64_t offset){
    uint8_t buffer[4] = {0, 0, 0, 0};
    if(offset >= 0 && mp4Seek(fp, offset, SEEK_SET) != 0){
        return 0;
    }
    if(fread(buffer, sizeof(buffer), 1, fp) != 1){
        return 0;
    }
    return ((uint32_t)buffer[0] << 24)
         | ((uint32_t)buffer[1] << 16)
         | ((uint32_t)buffer[2] <<  8)
         |  (uint32_t)buffer[3];
}

/*
 * Reads a big-endian 64-bit integer and returns it as a signed value.
 *
 * fp     - stream to read from.
 * offset - absolute file offset to read at; a negative value means "read at
 *          the current stream position".
 *
 * Returns the decoded value, or 0 when seeking fails or fewer than eight
 * bytes are available.
 */
static int64_t readI64BE(FILE *fp, int64_t offset){
    uint8_t buffer[8] = {0, 0, 0, 0, 0, 0, 0, 0};
    uint64_t value = 0;
    int i;
    if(offset >= 0 && mp4Seek(fp, offset, SEEK_SET) != 0){
        return 0;
    }
    if(fread(buffer, sizeof(buffer), 1, fp) != 1){
        return 0;
    }
    /* Accumulate in an unsigned type so no shift touches the sign bit. */
    for(i = 0; i < 8; i++){
        value = (value << 8) | buffer[i];
    }
    return (int64_t)value;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
/*
 * Per-box payload readers. All of them are placeholders that parse nothing
 * yet; they share one signature so the dispatcher can call them uniformly:
 *   fp        - the open input stream
 *   fileStart - file offset of the first payload byte (just past the header)
 *   fileEnd   - file offset just past the end of the box
 *   depth     - nesting level to use for anything printed from the payload
 */

/* 'ftyp' payload: not decoded yet. */
static void readBox_ftyp(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'mvhd' payload: not decoded yet. */
static void readBox_mvhd(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'iods' payload: not decoded yet. */
static void readBox_iods(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'tkhd' payload: not decoded yet. */
static void readBox_tkhd(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'udta' payload: not decoded yet. */
static void readBox_udta(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'elst' payload: not decoded yet. */
static void readBox_elst(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'mdhd' payload: not decoded yet. */
static void readBox_mdhd(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'hdlr' payload: not decoded yet. */
static void readBox_hdlr(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'vmhd' payload: not decoded yet. */
static void readBox_vmhd(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'smhd' payload: not decoded yet. */
static void readBox_smhd(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'dref' payload: not decoded yet. */
static void readBox_dref(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'stsd' payload: not decoded yet. */
static void readBox_stsd(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'stts' payload: not decoded yet. */
static void readBox_stts(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'stss' payload: not decoded yet. */
static void readBox_stss(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'ctts' payload: not decoded yet. */
static void readBox_ctts(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'stsc' payload: not decoded yet. */
static void readBox_stsc(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'stsz' payload: not decoded yet. */
static void readBox_stsz(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'stco' payload: not decoded yet. */
static void readBox_stco(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'co64' payload: not decoded yet. */
static void readBox_co64(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'sgpd' payload: not decoded yet. */
static void readBox_sgpd(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'sbgp' payload: not decoded yet. */
static void readBox_sbgp(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'sdtp' payload: not decoded yet. */
static void readBox_sdtp(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}

/* 'uuid' payload: not decoded yet. */
static void readBox_uuid(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth)
{
    (void)fp; (void)fileStart; (void)fileEnd; (void)depth;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
/*
 * Converts a 32-bit box type code into a NUL-terminated four-character
 * string, most significant byte first (0x66747970 -> "ftyp").
 *
 * buffer - receives the text; must have room for at least 5 bytes.
 * type   - packed box type code.
 *
 * The bytes are extracted with shifts rather than by aliasing the integer's
 * memory, so the result does not depend on host byte order.
 */
static void typeToStr(char *buffer, uint32_t type){
    buffer[0] = (char)((type >> 24) & 0xFF);
    buffer[1] = (char)((type >> 16) & 0xFF);
    buffer[2] = (char)((type >>  8) & 0xFF);
    buffer[3] = (char)( type        & 0xFF);
    buffer[4] = '\0';
}
/*
 * Prints one line describing a box:
 *   <offset> <"-----" repeated depth times> <type> (<header size>+<body size>)
 *
 * fileStart - file offset of the box.
 * bodySize  - number of bytes following the header.
 * headSize  - number of header bytes.
 * typeStr   - NUL-terminated box type text.
 * depth     - nesting level; controls how many "-----" groups are printed.
 */
static void printMp4Struct(int64_t fileStart, int64_t bodySize, uint32_t headSize, char *typeStr, uint8_t depth){
    /* int64_t is not long long on every platform; cast so the arguments
       really match the %lld conversions. */
    printf("%12lld ", (long long)fileStart);
    for(; depth > 0; depth--){
        printf("-----");
    }
    printf(" %s (%u+%lld)\n", typeStr, (unsigned)headSize, (long long)bodySize);
}
/*
 * Walks the sequence of boxes stored in [fileStart, fileEnd), prints one line
 * per box, and dispatches on the box type: container boxes are walked
 * recursively, known leaf boxes are handed to their readBox_<type>() reader,
 * 'mdat'/'free' are skipped, and unknown types print a ready-to-paste
 * "#define MP4BOXTYPE_..." line.
 *
 * fp        - open input stream.
 * fileStart - file offset of the first box in the range.
 * fileEnd   - file offset just past the last byte of the range.
 * depth     - nesting level used for the printed indentation.
 *
 * The walk stops at the first header that cannot be valid (declared size
 * smaller than its own header, or a header that does not fit in the range),
 * so a damaged file can neither stall the loop nor move it backwards.
 */
static void readBox_xxxx(FILE *fp, int64_t fileStart, int64_t fileEnd, uint8_t depth){
    char typeStr[8];
    /* A box header is at least 8 bytes: 32-bit size + 32-bit type. */
    while(fileEnd - fileStart >= 8){
        const int64_t remaining = fileEnd - fileStart;
        int64_t size = readU32BE(fp, fileStart+0);
        const uint32_t type = readU32BE(fp, fileStart+4);
        uint32_t head = 8;
        int64_t bodyStart;
        int64_t boxEnd;

        if(size == 1){
            /* The real size is the 64-bit largesize that follows the type. */
            if(remaining < 16){
                break;
            }
            size = readI64BE(fp, fileStart+8);
            head = 16;
        }else if(size == 0 && type != 0){
            /* Size 0 means "runs to the end"; an all-zero header is treated
               as padding and ends the walk below. */
            size = remaining;
        }
        if(type == MP4BOXTYPE_uuid){
            head += 16;
        }
        /* Reject sizes that cannot hold their own header (this also covers
           0, 2..7 and negative largesize values) and headers that do not fit
           in the enclosing range. */
        if(size < (int64_t)head || (int64_t)head > remaining){
            break;
        }

        bodyStart = fileStart + head;
        /* Clamp to the enclosing range: children never read past it and the
           addition cannot overflow for a huge declared size. */
        boxEnd = (size > remaining) ? fileEnd : fileStart + size;

        typeToStr(typeStr, type);
        printMp4Struct(fileStart, size-head, head, typeStr, depth);
        switch(type){
        /* Container boxes: their payload is itself a sequence of boxes. */
        case MP4BOXTYPE_moov:
        case MP4BOXTYPE_trak:
        case MP4BOXTYPE_edts:
        case MP4BOXTYPE_mdia:
        case MP4BOXTYPE_minf:
        case MP4BOXTYPE_dinf:
        case MP4BOXTYPE_stbl:
            /* depth is a uint8_t: stop descending before it would wrap, which
               also bounds the recursion on maliciously nested input. */
            if(depth < 0xFF){
                readBox_xxxx(fp, bodyStart, boxEnd, depth+1);
            }
            break;
        case MP4BOXTYPE_ftyp:
            readBox_ftyp(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_mvhd:
            readBox_mvhd(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_iods:
            readBox_iods(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_tkhd:
            readBox_tkhd(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_elst:
            readBox_elst(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_mdhd:
            readBox_mdhd(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_hdlr:
            readBox_hdlr(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_vmhd:
            readBox_vmhd(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_smhd:
            readBox_smhd(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_dref:
            readBox_dref(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_stsd:
            readBox_stsd(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_stts:
            readBox_stts(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_stss:
            readBox_stss(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_ctts:
            readBox_ctts(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_stsc:
            readBox_stsc(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_stsz:
            readBox_stsz(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_stco:
            readBox_stco(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_co64:
            readBox_co64(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_sgpd:
            readBox_sgpd(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_sbgp:
            readBox_sbgp(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_sdtp:
            readBox_sdtp(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_udta:
            readBox_udta(fp, bodyStart, boxEnd, depth+1);
            break;
        case MP4BOXTYPE_uuid:
            readBox_uuid(fp, bodyStart, boxEnd, depth+1);
            break;
        /* Payload is intentionally not inspected for these two. */
        case MP4BOXTYPE_mdat:
        case MP4BOXTYPE_free:
            break;
        default:
            /* Unknown type: emit a definition line that can be pasted into
               the MP4BOXTYPE_* list. */
            printf("#define MP4BOXTYPE_%s 0x%08x\n", typeStr, type);
            break;
        }
        fileStart = boxEnd;
    }
}
int main(int argc, char *argv[]){
if(argc < 2) return 0;
FILE *fp = fopen(argv[1], "rb");
if(fp == NULL) return -1;
int64_t fileSize = getFileSize(fp);
if(fileSize <= 8){
fclose(fp);
return -2;
}
readBox_xxxx(fp, 0, fileSize, 1);
fclose(fp);
return 0;
}
运行效果如下图:

网友评论