作者: Vackine | 来源:发表于2020-03-08 16:58

    /**
     * An abstraction used for storing documents in a collection or entries in an index.
     *
     * In storage engines implementing the KVEngine, record stores are also used for implementing
     * catalogs.
     *
     * Many methods take an OperationContext parameter. This contains the RecoveryUnit, with
     * all RecordStore specific transaction information, as well as the LockState. Methods that take
     * an OperationContext may throw a WriteConflictException.
     *
     * This class must be thread-safe for document-level locking storage engines. In addition, for
     * storage engines implementing the KVEngine some methods must be thread safe, see DurableCatalog.
     * Only for MMAPv1 is this class not thread-safe.
     */
    class RecordStore

    insert 一条记录相关的源码记录。


     /**通过复制传递的record data 将指定的记录插入到RecordStore中,并更新'inOutRecords'的值让其包含
     * 插入数据的id
         * Inserts the specified records into this RecordStore by copying the passed-in record data and
         * updates 'inOutRecords' to contain the ids of the inserted records.
        virtual Status insertRecords(OperationContext* opCtx, // 操作的上下文
                                     std::vector<Record>* inOutRecords, // inOutRecords对象
                                     const std::vector<Timestamp>& timestamps) = 0; // 时戳列表
         * A thin wrapper around insertRecords() to simplify handling of single document inserts.
        StatusWith<RecordId> insertRecord(OperationContext* opCtx,
                                          const char* data,
                                          int len,
                                          Timestamp timestamp) {
            std::vector<Record> inOutRecords{Record{RecordId(), RecordData(data, len)}}; //
            // Record{RecordId(), RecordData(data, len)} -> 通过构建recordid 和data构建Record 对象
            Status status = insertRecords(opCtx, &inOutRecords, std::vector<Timestamp>{timestamp});
            if (!status.isOK())
                return status;
            return inOutRecords.front().id;



     * The data items stored in a RecordStore.
    struct Record {
        RecordId id;
        RecordData data;


    RecordId 对象是 collection 或 RecordStore 中一条记录的唯一标识,其取值范围有明确划分(普通 normal 区间,以及位于取值空间顶部的保留 reserved 区间)。

     * The key that uniquely identifies a Record in a Collection or RecordStore.
    class RecordId {
        // This set of constants define the boundaries of the 'normal' and 'reserved' id ranges.
        static constexpr int64_t kNullRepr = 0;
        static constexpr int64_t kMinRepr = LLONG_MIN;
        static constexpr int64_t kMaxRepr = LLONG_MAX;
        static constexpr int64_t kMinReservedRepr = kMaxRepr - (1024 * 1024);
         * Enumerates all ids in the reserved range that have been allocated for a specific purpose.
        enum class ReservedId : int64_t { kWildcardMultikeyMetadataId = kMinReservedRepr };
         * Constructs a Null RecordId.
        RecordId() : _repr(kNullRepr) {}
        explicit RecordId(int64_t repr) : _repr(repr) {}
        explicit RecordId(ReservedId repr) : RecordId(static_cast<int64_t>(repr)) {}
         * Construct a RecordId from two halves.
         * TODO consider removing.
        RecordId(int high, int low) : _repr((uint64_t(high) << 32) | uint32_t(low)) {}
         * A RecordId that compares less than all ids that represent documents in a collection.
        static RecordId min() {
            return RecordId(kMinRepr);
         * A RecordId that compares greater than all ids that represent documents in a collection.
        static RecordId max() {
            return RecordId(kMaxRepr);
         * Returns the first record in the reserved id range at the top of the RecordId space.
        static RecordId minReserved() {
            return RecordId(kMinReservedRepr);
        bool isNull() const {
            return _repr == 0;
        int64_t repr() const {
            return _repr;
         * Valid RecordIds are the only ones which may be used to represent Records. The range of valid
         * RecordIds includes both "normal" ids that refer to user data, and "reserved" ids that are
         * used internally. All RecordIds outside of the valid range are sentinel values.
        bool isValid() const {
            return isNormal() || isReserved();
         * Normal RecordIds are those which fall within the range used to represent normal user data,
         * excluding the reserved range at the top of the RecordId space.
        bool isNormal() const {
            return _repr > 0 && _repr < kMinReservedRepr;
         * Returns true if this RecordId falls within the reserved range at the top of the record space.
        bool isReserved() const {
            return _repr >= kMinReservedRepr && _repr < kMaxRepr;
        int compare(RecordId rhs) const {
            return _repr == rhs._repr ? 0 : _repr < rhs._repr ? -1 : 1;
         * Hash value for this RecordId. The hash implementation may be modified, and its behavior
         * may differ across platforms. Hash values should not be persisted.
        struct Hasher {
            size_t operator()(RecordId rid) const {
                size_t hash = 0;
                // TODO consider better hashes
                boost::hash_combine(hash, rid.repr());
                return hash;
        /// members for Sorter
        struct SorterDeserializeSettings {};  // unused
        void serializeForSorter(BufBuilder& buf) const {
            buf.appendNum(static_cast<long long>(_repr));
        static RecordId deserializeForSorter(BufReader& buf, const SorterDeserializeSettings&) {
            return RecordId(buf.read<LittleEndian<int64_t>>());
        int memUsageForSorter() const {
            return sizeof(RecordId);
        RecordId getOwned() const {
            return *this;
        int64_t _repr;

    RecordData 对象

     * A replacement for the Record class. This class represents data in a record store.
     * The _ownedData attribute is used to manage memory ownership.
    class RecordData {
        RecordData() : _data(NULL), _size(0) {}
        RecordData(const char* data, int size) : _data(data), _size(size) {}
        RecordData(SharedBuffer ownedData, int size)
            : _data(ownedData.get()), _size(size), _ownedData(std::move(ownedData)) {}
        const char* data() const {
            return _data;
        int size() const {
            return _size;
         * Returns true if this owns its own memory, and false otherwise
        bool isOwned() const {
            return _ownedData.get();
        SharedBuffer releaseBuffer() {
            return std::move(_ownedData);
        BSONObj toBson() const& {
            return isOwned() ? BSONObj(_ownedData) : BSONObj(_data);
        BSONObj releaseToBson() {
            return isOwned() ? BSONObj(releaseBuffer()) : BSONObj(_data);
        BSONObj toBson() && {
            return releaseToBson();
        RecordData getOwned() const {
            if (isOwned())
                return *this;
            auto buffer = SharedBuffer::allocate(_size);
            memcpy(buffer.get(), _data, _size);
            return RecordData(buffer, _size);
        void makeOwned() {
            if (isOwned())
            *this = getOwned();
        const char* _data;
        int _size;
        SharedBuffer _ownedData;

    WiredTiger 中对 insert 接口的实现

    virtual Status insertRecords

    /**
     * WiredTiger implementation of the vector insert: unpacks both vectors into raw arrays and
     * forwards to _insertRecords().
     *
     * Only records->size() is passed as the element count, and _insertRecords() reads
     * timestamps[i] for every record i, so 'timestamps' must hold at least as many entries as
     * 'records'.
     */
    Status WiredTigerRecordStore::insertRecords(OperationContext* opCtx,
                                                std::vector<Record>* records,
                                                const std::vector<Timestamp>& timestamps) {
        return _insertRecords(opCtx, records->data(), timestamps.data(), records->size());
    }
    /**
     * Inserts 'nRecords' records through a WiredTiger cursor and updates the store's bookkeeping.
     *
     * @param opCtx       operation context; a debug assertion checks that it holds a write lock.
     * @param records     array of 'nRecords' records; each element's id is assigned here.
     * @param timestamps  array of 'nRecords' timestamps, timestamps[i] belonging to records[i].
     * @param nRecords    number of elements in both arrays; asserted non-zero in debug builds.
     * @return BadValue when the store is capped and the batch exceeds the capped maximum size,
     *         the oplog key-extraction error if that fails, the translated WiredTiger error if
     *         an insert fails, and Status::OK() otherwise.
     */
    Status WiredTigerRecordStore::_insertRecords(OperationContext* opCtx,
                                                 Record* records,
                                                 const Timestamp* timestamps,
                                                 size_t nRecords) {
        // Debug-only check that the caller already holds a write lock.
        dassert(opCtx->lockState()->isWriteLocked());

        // We are kind of cheating on capped collections since we write all of them at once ....
        // Simplest way out would be to just block vector writes for everything except oplog ?
        int64_t totalLength = 0;
        for (size_t i = 0; i < nRecords; i++)
            totalLength += records[i].data.size();

        // Reject the whole batch when this is a capped store and the combined size of the batch
        // is larger than the capped maximum. (_isCapped is a 'const bool' member of the class;
        // the capped settings should not be updated once operations have started.)
        // caller will retry one element at a time
        if (_isCapped && totalLength > _cappedMaxSize)
            return Status(ErrorCodes::BadValue, "object to insert exceeds cappedMaxSize");

        // Obtain a cursor wrapper for this table from its URI and table id.
        WiredTigerCursor curwrap(_uri, _tableId, true, opCtx);
        WT_CURSOR* c = curwrap.get();

        // First pass: assign an id to every record. highestId starts out as the null RecordId.
        RecordId highestId = RecordId();
        dassert(nRecords != 0);
        for (size_t i = 0; i < nRecords; i++) {
            auto& record = records[i];
            if (_isOplog) {
                // For the oplog the id is not generated; it is extracted from the record's data.
                StatusWith<RecordId> status =
                    oploghack::extractKey(record.data.data(), record.data.size());
                if (!status.isOK())
                    return status.getStatus();
                record.id = status.getValue();
            } else {
                // Ordinary insert: take a fresh id from _nextId(). Per the original author's
                // note this is the most recently used id plus one; _nextId() is not shown here.
                record.id = _nextId(opCtx);
            }
            // Ids within a batch are expected to be strictly increasing.
            dassert(record.id > highestId);
            highestId = record.id;
        }

        // Second pass: set the timestamp (if any) and write each record through the cursor.
        for (size_t i = 0; i < nRecords; i++) {
            auto& record = records[i];
            Timestamp ts;
            if (timestamps[i].isNull() && _isOplog) {
                // If the timestamp is 0, that probably means someone inserted a document directly
                // into the oplog.  In this case, use the RecordId as the timestamp, since they are
                // one and the same. Setting this transaction to be unordered will trigger a journal
                // flush. Because these are direct writes into the oplog, the machinery to trigger a
                // journal flush is bypassed. A followup oplog read will require a fresh visibility
                // value to make progress.
                ts = Timestamp(record.id.repr());
                opCtx->recoveryUnit()->setOrderedCommit(false);
            } else {
                ts = timestamps[i];
            }
            if (!ts.isNull()) {
                LOG(4) << "inserting record with timestamp " << ts;
                fassert(39001, opCtx->recoveryUnit()->setTimestamp(ts));
            }
            setKey(c, record.id);
            WiredTigerItem value(record.data.data(), record.data.size());
            c->set_value(c, value.Get());
            int ret = WT_OP_CHECK(c->insert(c));
            if (ret)
                return wtRCToStatus(ret, "WiredTigerRecordStore::insertRecord");
        }

        // Account for the new records in the store's record count and data size.
        _changeNumRecords(opCtx, nRecords);
        _increaseDataSize(opCtx, totalLength);

        if (_oplogStones) {
            _oplogStones->updateCurrentStoneAfterInsertOnCommit(
                opCtx, totalLength, highestId, nRecords);
        } else {
            _cappedDeleteAsNeeded(opCtx, highestId);
        }

        return Status::OK();
    }



