美文网首页
BoltDB(三)DB 结构以及open操作

BoltDB(三)DB 结构以及open操作

作者: wayyyy | 来源:发表于2022-08-15 00:31 被阅读0次
    // The largest step that can be taken when remapping the mmap.
    const maxMmapStep = 1 << 30 // 1GB
    // The data file format version.
    const version = 2
    // Represents a marker value to indicate that a file is a Bolt DB.
    const magic uint32 = 0xED0CDAED
    // IgnoreNoSync specifies whether the NoSync field of a DB is ignored when
    // syncing changes to a file.  This is required as some operating systems,
    // such as OpenBSD, do not have a unified buffer cache (UBC) and writes
    // must be synchronized using the msync(2) syscall.
    const IgnoreNoSync = runtime.GOOS == "openbsd"
    // Default values if not set in a DB instance.
    const (
        // DefaultMaxBatchSize is the value Open copies into DB.MaxBatchSize.
        DefaultMaxBatchSize  int = 1000
        // DefaultMaxBatchDelay is the value Open copies into DB.MaxBatchDelay.
        DefaultMaxBatchDelay     = 10 * time.Millisecond
        // DefaultAllocSize is the value Open copies into DB.AllocSize:
        // 16MB (16 * 1024 * 1024 bytes).
        DefaultAllocSize = 16 * 1024 * 1024
    )
    // default page size for db is set to the OS page size.
    var defaultPageSize = os.Getpagesize()
    // DB represents a collection of buckets persisted to a file on disk.
    // All data access is performed through transactions which can be obtained through the DB.
    // All the functions on DB will return a ErrDatabaseNotOpen if accessed before Open() is called.
    type DB struct {
        // When enabled, the database will perform a Check() after every commit.
        // A panic is issued if the database is in an inconsistent state. This
        // flag has a large performance impact so it should only be used for
        // debugging purposes.
        StrictMode bool
        // Setting the NoSync flag will cause the database to skip fsync()
        // calls after each commit. This can be useful when bulk loading data
        // into a database and you can restart the bulk load in the event of
        // a system failure or database corruption. Do not set this flag for
        // normal use.
        //
        // If the package global IgnoreNoSync constant is true, this value is
        // ignored.  See the comment on that constant for more details.
        //
        // THIS IS UNSAFE. PLEASE USE WITH CAUTION.
        NoSync bool
        // When true, skips the truncate call when growing the database.
        // Setting this to true is only safe on non-ext3/ext4 systems.
        // Skipping truncation avoids preallocation of hard drive space and
        // bypasses a truncate() and fsync() syscall on remapping.
        //
        // https://github.com/boltdb/bolt/issues/284
        NoGrowSync bool
        // If you want to read the entire database fast, you can set MmapFlag to
        // syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
        MmapFlags int
        // MaxBatchSize is the maximum size of a batch. Default value is
        // copied from DefaultMaxBatchSize in Open.
        //
        // If <=0, disables batching.
        //
        // Do not change concurrently with calls to Batch.
        MaxBatchSize int
        // MaxBatchDelay is the maximum delay before a batch starts.
        // Default value is copied from DefaultMaxBatchDelay in Open.
        //
        // If <=0, effectively disables batching.
        //
        // Do not change concurrently with calls to Batch.
        MaxBatchDelay time.Duration
        // AllocSize is the amount of space allocated when the database
        // needs to create new pages. This is done to amortize the cost
        // of truncate() and fsync() when growing the data file.
        AllocSize int
        // path is the file path passed to Open.
        path     string
        file     *os.File  // on-disk file that actually stores the data
        lockfile *os.File // windows only
        dataref  []byte   // mmap'ed readonly, write throws SEGV
        // Address of the region mapped in via mmap; pages are located by
        // indexing into it (see page()).
        data   *[maxMapSize]byte
        datasz int
        filesz int // current on disk file size
        // Meta pages: mmap() points these at page 0 and page 1 of the mapping.
        meta0 *meta
        meta1 *meta
        // pageSize is set by init() for new files, or by Open from the first
        // meta page (falling back to the OS page size) for existing files.
        pageSize int
        opened   bool
        rwtx     *Tx  // the read-write (writer) transaction; nil-checked before use in mmap()
        txs      []*Tx  // slice of read transactions
        freelist *freelist // free list, populated in Open from the page named by the meta
        stats    Stats
        pagePool sync.Pool
        batchMu sync.Mutex
        batch   *batch
        rwlock   sync.Mutex   // Allows only one writer at a time.
        metalock sync.Mutex   // Protects meta page access.
        mmaplock sync.RWMutex // Protects mmap access during remapping.
        statlock sync.RWMutex // Protects stats access.
        // ops holds overridable I/O operations; Open sets writeAt to
        // db.file.WriteAt as the default value for test hooks.
        ops struct {
            writeAt func(b []byte, off int64) (n int, err error)
        }
        // Read only mode.
        // When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately.
        readOnly bool
    }
    

    Open()方法主要用来创建一个boltdb的DB对象,底层会执行新建或者打开存储数据的文件,当指定的文件不存在时, boltdb就会新建一个数据文件。否则的话,就直接加载指定的数据库文件内容。

    // Open creates and opens a database at the given path.
    // If the file does not exist then it will be created automatically.
    // Passing in nil options will cause Bolt to open the database with the default options.
    //
    // path is the data file location, mode is the permission passed to
    // os.OpenFile (and to flock), and options selects read-only mode, the lock
    // timeout, mmap flags and the initial mmap size.
    //
    // On any failure after the file has been opened, the partially initialised
    // DB is closed (releasing the file descriptor and the file lock) before the
    // error is returned, and the returned *DB is nil.
    func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
        var db = &DB{opened: true}

        // Set default options if no options are provided.
        if options == nil {
            options = DefaultOptions
        }

        db.NoGrowSync = options.NoGrowSync

        db.MmapFlags = options.MmapFlags

        // Set default values for later DB operations.
        db.MaxBatchSize = DefaultMaxBatchSize
        db.MaxBatchDelay = DefaultMaxBatchDelay
        db.AllocSize = DefaultAllocSize
        flag := os.O_RDWR
        if options.ReadOnly {
            flag = os.O_RDONLY
            db.readOnly = true
        }

        // Open data file and separate sync handler for metadata writes.
        db.path = path
        var err error
        // Open (or create) the database file.
        if db.file, err = os.OpenFile(db.path, flag|os.O_CREATE, mode); err != nil {
            // Best-effort cleanup; the open error is what the caller needs.
            _ = db.close()
            return nil, err
        }

        // Lock file so that other processes using Bolt in read-write mode cannot
        // use the database  at the same time. This would cause corruption since
        // the two processes would write meta pages and free pages separately.
        // The database file is locked exclusively (only one process can grab the lock)
        // if !options.ReadOnly.
        // The database file is locked using the shared lock (more than one process may
        // hold a lock at the same time) otherwise (options.ReadOnly is set).
        if err := flock(db, mode, !db.readOnly, options.Timeout); err != nil {
            _ = db.close()
            return nil, err
        }
        // Default values for test hooks
        db.ops.writeAt = db.file.WriteAt
        // Initialize the database if it doesn't exist.
        if info, err := db.file.Stat(); err != nil {
            // Release the file descriptor and lock acquired above instead of
            // leaking them; the Stat error is the one worth reporting.
            _ = db.close()
            return nil, err
        } else if info.Size() == 0 {
            // Initialize new files with meta pages.
            if err := db.init(); err != nil {
                // Same cleanup as above: do not leave the file open and locked
                // when initialisation fails.
                _ = db.close()
                return nil, err
            }
        } else {
            // Not a new file: read the first meta page to determine the page size.
            // The buffer is 0x1000 = 2^12 bytes, i.e. exactly 4KB.
            //
            // NOTE(review): if ReadAt fails (e.g. the file is shorter than
            // 4KB) db.pageSize is left at its zero value here — confirm that
            // the later mmap/validate path rejects such files as intended.
            var buf [0x1000]byte
            if _, err := db.file.ReadAt(buf[:], 0); err == nil {
                // Only the page size is taken from this page.
                m := db.pageInBuffer(buf[:], 0).meta()
                if err := m.validate(); err != nil {
                    // If we can't read the page size, we can assume it's the same
                    // as the OS -- since that's how the page size was chosen in the
                    // first place.
                    //
                    // If the first page is invalid and this OS uses a different
                    // page size than what the database was created with then we
                    // are out of luck and cannot access the database.
                    db.pageSize = os.Getpagesize()
                } else {
                    db.pageSize = int(m.pageSize)
                }
            }
        }

        // Memory map the data file.
        if err := db.mmap(options.InitialMmapSize); err != nil {
            _ = db.close()
            return nil, err
        }

        // Read in the freelist from the page recorded in the current meta
        // (init() writes freelist = 2 for a freshly created file).
        db.freelist = newFreelist()
        db.freelist.read(db.page(db.meta().freelist))

        // Mark the database as opened and return.
        return db, nil
    }
    

    新建时:
    会调用init()方法(文件本身已在Open()中通过os.OpenFile创建)。init()内部先在内存中构造一个4页大小的缓冲区:第0页、第1页写入元数据信息;第2页写入空的freelist信息;第3页写入空的bucket leaf页。然后一次性写入文件,并最终刷盘。

    // init writes the initial four-page layout of a brand-new database file:
    // pages 0 and 1 are meta pages, page 2 is an empty freelist page and
    // page 3 is an empty leaf page. The pages are built in one in-memory
    // buffer, written at offset 0 through db.ops.writeAt and then synced with
    // fdatasync. It also sets db.pageSize to the OS page size.
    //
    // It returns the first error reported by the write or the sync.
    func (db *DB) init() error {
        // A new database always adopts the OS page size.
        db.pageSize = os.Getpagesize()

        // One contiguous image holding all four initial pages.
        image := make([]byte, db.pageSize*4)

        // Pages 0 and 1: the two meta pages. They differ only in page id and
        // transaction id.
        for n := 0; n < 2; n++ {
            pg := db.pageInBuffer(image, pgid(n))
            pg.id, pg.flags = pgid(n), metaPageFlag

            hdr := pg.meta()
            hdr.magic = magic
            hdr.version = version
            hdr.pageSize = uint32(db.pageSize)
            hdr.freelist = 2               // freelist lives on page 2
            hdr.root = bucket{root: 3}     // root bucket's leaf lives on page 3
            hdr.pgid = 4                   // presumably the next unused page id — confirm
            hdr.txid = txid(n)
            // The checksum is computed last, after the other fields are set.
            hdr.checksum = hdr.sum64()
        }

        // Page 2: an empty freelist.
        freePg := db.pageInBuffer(image, pgid(2))
        freePg.id, freePg.flags, freePg.count = pgid(2), freelistPageFlag, 0

        // Page 3: an empty leaf page.
        leafPg := db.pageInBuffer(image, pgid(3))
        leafPg.id, leafPg.flags, leafPg.count = pgid(3), leafPageFlag, 0

        // Write all four pages to the data file in one call.
        _, werr := db.ops.writeAt(image, 0)
        if werr != nil {
            return werr
        }

        // Flush the file contents to disk.
        if serr := fdatasync(db); serr != nil {
            return serr
        }
        return nil
    }
    

    加载时:
    会读取第0页内容,也就是元信息。然后对其进行校验和校验,当校验通过后获取pageSize。否则的话,读取操作系统默认的pagesize(一般4k)

    // page returns a pointer to the page with the given id inside the mmap'ed
    // data region. The page's byte offset is id multiplied by db.pageSize.
    func (db *DB) page(id pgid) *page {
        offset := id * pgid(db.pageSize)
        start := &db.data[offset]
        return (*page)(unsafe.Pointer(start))
    }
    
    // pageInBuffer returns a pointer to the page with the given id inside the
    // caller-supplied byte slice b, using db.pageSize to compute the offset
    // (id multiplied by db.pageSize).
    func (db *DB) pageInBuffer(b []byte, id pgid) *page {
        offset := id * pgid(db.pageSize)
        return (*page)(unsafe.Pointer(&b[offset]))
    }
    

    最后都会通过mmap来映射数据。

    // mmap (re)maps the data file into memory and refreshes db.meta0/db.meta1.
    // minsz is a lower bound on the size of the new mapping.
    //
    // It holds db.mmaplock exclusively for the whole call. It returns an error
    // if the file cannot be stat'ed, if the file is smaller than two pages, if
    // the size computation, unmap or map step fails, or if both meta pages fail
    // validation (in which case the meta0 error is returned).
    func (db *DB) mmap(minsz int) error {
        db.mmaplock.Lock()
        defer db.mmaplock.Unlock()

        fi, err := db.file.Stat()
        if err != nil {
            return fmt.Errorf("mmap stat error: %s", err)
        }

        // The file must hold at least the two meta pages.
        fileSize := int(fi.Size())
        if fileSize < db.pageSize*2 {
            return fmt.Errorf("file size too small")
        }

        // Map at least minsz bytes, then round up to an allowed mmap size.
        want := fileSize
        if minsz > want {
            want = minsz
        }
        mapSize, err := db.mmapSize(want)
        if err != nil {
            return err
        }

        // A live write transaction must drop its references into the old
        // mapping before that mapping goes away.
        if db.rwtx != nil {
            db.rwtx.root.dereference()
        }

        // Drop the old mapping first, then create the new one.
        if err := db.munmap(); err != nil {
            return err
        }
        if err := mmap(db, mapSize); err != nil {
            return err
        }

        // Point the meta references at pages 0 and 1 of the new mapping.
        db.meta0 = db.page(0).meta()
        db.meta1 = db.page(1).meta()

        // One valid meta page is enough to recover from the other being
        // badly saved, so fail only when both are invalid.
        err0 := db.meta0.validate()
        if err1 := db.meta1.validate(); err0 != nil && err1 != nil {
            return err0
        }
        return nil
    }
    
    // mmapSize computes the mmap size to use for a database of the given size.
    //
    // Sizes up to 1GB are rounded up to the next power of two, starting at
    // 32KB (1<<15). Larger sizes are rounded up to a multiple of maxMmapStep,
    // then to a multiple of the page size, and finally capped at maxMapSize.
    // An error is returned when the requested size itself exceeds maxMapSize.
    func (db *DB) mmapSize(size int) (int, error) {
        // Power-of-two buckets: 32KB, 64KB, ... up to 1GB.
        for shift := uint(15); shift <= 30; shift++ {
            if bucket := 1 << shift; size <= bucket {
                return bucket, nil
            }
        }

        // Beyond 1GB: reject requests above the platform maximum.
        if size > maxMapSize {
            return 0, fmt.Errorf("mmap too large")
        }

        // Round up to the next multiple of the remap step.
        step := int64(maxMmapStep)
        grown := int64(size)
        if over := grown % step; over > 0 {
            grown += step - over
        }

        // Keep the result page-aligned.
        ps := int64(db.pageSize)
        if grown%ps != 0 {
            grown = (grown/ps + 1) * ps
        }

        // Never grow past the maximum mappable size.
        if grown > maxMapSize {
            grown = maxMapSize
        }
        return int(grown), nil
    }
    

    为什么这里需要MMAP?
    一般来说,由于内存比磁盘小,数据库通常需要自己实现 page cache 来缓存部分 page(例如使用 LRU 算法)。boltdb 没有自己实现,而是使用 mmap() 创建共享、只读的文件映射,并调用 madvise(MADV_RANDOM),由操作系统来管理 page cache。

    相关文章

      网友评论

          本文标题:BoltDB(三)DB 结构以及open操作

          本文链接:https://www.haomeiwen.com/subject/mtrkgrtx.html