美文网首页Ceph
ceph rbd:Image create

ceph rbd:Image create

作者: chnmagnus | 来源:发表于2018-02-28 16:45 被阅读17次

    创建image过程的代码走读。过程中,发现自己对librados aio机制和cls 注册的函数调用机制不太了解,有空单独写篇文。

    浅析

    先走一遍流程,从宏观上看一下image创建的过程。
    初始化rbd并创建image。

    ceph osd pool create rbd 32
    rbd pool init rbd
    rbd create --size 1024 rbd/testimage
    

    查看已有的对象

    rados ls -p rbd
    
    rbd_directory
    rbd_id.testimage
    rbd_info
    rbd_object_map.105d2ae8944a
    rbd_header.105d2ae8944a
    

    一个pool中的rbd对象分成两类:
    第一类,整个pool的rbd元数据对象
    1.rbd_directory:在每个pool中都存在,用于保存该pool下所有的image的信息。该对象的omap中保存该pool中所有image的name和id。对于每一个image,保存两条信息,第一条key为id_<image id>,value为image name;第二条key为name_<image name>,value为image id。

    rados listomapvals rbd_directory -p rbd
    
    id_105d2ae8944a
    value (13 bytes) :
    00000000  09 00 00 00 74 65 73 74  69 6d 61 67 65           |....testimage|
    0000000d
    
    name_testimage
    value (16 bytes) :
    00000000  0c 00 00 00 31 30 35 64  32 61 65 38 39 34 34 61  |....105d2ae8944a|
    00000010
    

    2.rbd_info:正常情况下内容为overwrite validated,如果是v1 image,情况不同。暂时忽略。

    第二类,一个image的元数据对象
    文档描述如下:

    /* New-style rbd image 'testimage' consists of objects
     *   rbd_id.testimage        - id of image
     *   rbd_header.<id>         - image metadata
     *   rbd_object_map.<id>     - optional image object map
     *   rbd_data.<id>.00000000
     *   rbd_data.<id>.00000001
     *   ...                     - data
     */
    

    但之前的rados ls结果中只有前三类对象(没有rbd_data对象)。这是因为,为了加速image创建、节省空间,数据对象只有在使用时才会被分配。

    1.rbd_id.testimage:被称为image的id_obj对象,其内容为该image的id。

    2.rbd_header.105d2ae8944a:被称为image的header_obj对象,它的omap中保存了该image的元数据。

    rados listomapvals rbd_header.105d2ae8944a -p rbd
    
    create_timestamp - 创建时间
    value (8 bytes) :
    00000000  17 25 96 5a a7 bc 94 35                           |.%.Z...5|
    00000008
    
    features - 开启的特性
    value (8 bytes) :
    00000000  3d 00 00 00 00 00 00 00                           |=.......|
    00000008
    
    object_prefix - data对象的前缀
    value (25 bytes) :
    00000000  15 00 00 00 72 62 64 5f  64 61 74 61 2e 31 30 35  |....rbd_data.105|
    00000010  64 32 61 65 38 39 34 34  61                       |d2ae8944a|
    00000019
    
    order - 每个data对象的大小
    value (1 bytes) :
    00000000  16                                                |.|
    00000001
    
    size - image size
    value (8 bytes) :
    00000000  00 00 00 40 00 00 00 00                           |...@....|
    00000008
    
    snap_seq - 当前存在的最新的seq
    value (8 bytes) :
    00000000  00 00 00 00 00 00 00 00                           |........|
    00000008
    

    如果创建了快照会有快照相关的key value存在于omap中,暂且不表。

    3.rbd_object_map.105d2ae8944a:用于支持object map特性,开启object map时会创建。

    代码

    省略了部分代码,不影响阅读。

    image的创建从librbd.cc的create函数开始,调用了internal.cc中的create。注意,有多个版本的create函数,其区别主要在于指定选项的多寡,其最终实现是一致的。

      /*
        Public librbd entry point for creating an image.

        io_ctx - librados I/O context bound to the target pool, obtained by
                 the caller, e.g.:
                   librados::IoCtx io_ctx;
                   rados.ioctx_create(pool_name.c_str(), io_ctx);
        name   - name of the image to create
        size   - image size
        order  - in/out pointer to the object order; each backing RADOS
                 object holds 1 << order bytes (the article cites a 4 MB
                 default, i.e. 1 << 22)
      */
      int RBD::create(IoCtx& io_ctx, const char *name, uint64_t size, int *order)
      {
        // Thin wrapper: forward straight to the internal implementation.
        return librbd::create(io_ctx, name, size, order);
      }
    

    internal.cc中的create

      // Convenience overload: creates an image from just a name, a size and an
      // object order by delegating to the full-option create() overload.
      // On return, *order holds the order stored in the options after creation.
      int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size,
             int *order)
      {
        uint64_t requested_order = *order;

        // The order travels to the full implementation inside ImageOptions.
        ImageOptions opts;
        const int set_rc = opts.set(RBD_IMAGE_OPTION_ORDER, requested_order);
        assert(set_rc == 0);

        // Delegate with an empty image id, empty mirror identifiers and
        // skip_mirror_enable == false.
        const int create_rc =
          create(io_ctx, imgname, "", size, opts, "", "", false);

        // Report the order recorded in the options back to the caller,
        // regardless of whether creation succeeded.
        const int get_rc = opts.get(RBD_IMAGE_OPTION_ORDER, &requested_order);
        assert(get_rc == 0);
        *order = requested_order;

        return create_rc;
      }
      
      // Full-option create(): the overload that actually creates the image.
      // It resolves the image id, format and order, refuses to overwrite an
      // existing image, then either builds an old-format (v1) image or runs an
      // image::CreateRequest and blocks until that request finishes.
      // Returns the result of the creation step; the early exits shown here
      // return -EEXIST or the error from detect_format()/validate_order().
      int create(IoCtx& io_ctx, const std::string &image_name,
             const std::string &image_id, uint64_t size,
             ImageOptions& opts,
             const std::string &non_primary_global_image_id,
             const std::string &primary_mirror_uuid,
             bool skip_mirror_enable)
      {
        // Use the caller-supplied image id; generate one only when it is empty.
        std::string id = image_id.empty() ? util::generate_image_id(io_ctx)
                                          : image_id;

        CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
        ldout(cct, 10) << __func__ << " name=" << image_name << ", "
                       << "id= " << id << ", "
                       << "size=" << size << ", opts=" << opts << dendl;

        // Image format: taken from the options, else from rbd_default_format.
        uint64_t format;
        if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) {
          format = cct->_conf->get_val<int64_t>("rbd_default_format");
        }
        const bool use_v1_format = (format == 1);

        // Make sure the image does not already exist, in either format.
        // 0 means it was found; -ENOENT is the only result that lets us go on.
        int r = detect_format(io_ctx, image_name, nullptr, nullptr);
        if (r == 0) {
          lderr(cct) << "rbd image " << image_name << " already exists" << dendl;
          return -EEXIST;
        }
        if (r != -ENOENT) {
          lderr(cct) << "Could not tell if " << image_name << " already exists"
                     << dendl;
          return r;
        }

        // Object order: taken from the options, else (missing or zero) from
        // rbd_default_order.
        uint64_t order = 0;
        if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0 || order == 0) {
          order = cct->_conf->get_val<int64_t>("rbd_default_order");
        }
        r = image::CreateRequest<>::validate_order(cct, order);
        if (r < 0) {
          return r;
        }

        if (use_v1_format) {
          // Old format is kept only for backward compatibility.
          r = create_v1(io_ctx, image_name.c_str(), size, order);
        } else {
          // Fetch the shared thread pool and its ContextWQ; work queued there
          // ends by invoking the Context callback supplied with it.
          ThreadPool *pool = nullptr;
          ContextWQ *work_queue = nullptr;
          ImageCtx::get_thread_pool_instance(cct, &pool, &work_queue);

          // CreateRequest<> uses its default template argument. Per the
          // article, its constructor extracts name, id, size, features, order,
          // stripe_unit, stripe_count, journal_order, journal_splay_width,
          // journal_pool and data_pool from the arguments/options.
          C_SaferCond on_finish;
          image::CreateRequest<> *request = image::CreateRequest<>::create(
            io_ctx, image_name, id, size, opts, non_primary_global_image_id,
            primary_mirror_uuid, skip_mirror_enable, work_queue, &on_finish);

          // Start the request and block until it signals completion.
          request->send();
          r = on_finish.wait();
        }

        // Record the order that was actually used so callers can read it back.
        const int set_rc = opts.set(RBD_IMAGE_OPTION_ORDER, order);
        assert(set_rc == 0);

        return r;
      }
    

    CreateRequest.h/cc中定义了创建操作的具体实现,先贴出状态图。之后的代码执行流程与状态图一致。

      /**
       * @verbatim
       *
       *                                  <start> . . . . > . . . . .
       *                                     |                      .
       *                                     v                      .
       *                               VALIDATE POOL                v (pool validation
       *                                     |                      .  disabled)
       *                                     v                      .
       *                             VALIDATE OVERWRITE             .
       *                                     |                      .
       *                                     v                      .
       * (error: bottom up)           CREATE ID OBJECT. . < . . . . .
       *  _______<_______                    |
       * |               |                   v
       * |               |          ADD IMAGE TO DIRECTORY
       * |               |               /   |
       * |      REMOVE ID OBJECT<-------/    v
       * |               |           NEGOTIATE FEATURES (when using default features)
       * |               |                   |
       * |               |                   v         (stripingv2 disabled)
       * |               |              CREATE IMAGE. . . . > . . . .
       * v               |               /   |                      .
       * |      REMOVE FROM DIR<--------/    v                      .
       * |               |          SET STRIPE UNIT COUNT           .
       * |               |               /   |  \ . . . . . > . . . .
       * |      REMOVE HEADER OBJ<------/    v                     /. (object-map
       * |               |\           OBJECT MAP RESIZE . . < . . * v  disabled)
       * |               | \              /  |  \ . . . . . > . . . .
       * |               |  *<-----------/   v                     /. (journaling
       * |               |             FETCH MIRROR MODE. . < . . * v  disabled)
       * |               |                /   |                     .
       * |     REMOVE OBJECT MAP<--------/    v                     .
       * |               |\             JOURNAL CREATE              .
       * |               | \               /  |                     .
       * v               |  *<------------/   v                     .
       * |               |           MIRROR IMAGE ENABLE            .
       * |               |                /   |                     .
       * |        JOURNAL REMOVE*<-------/    |                     .
       * |                                    v                     .
       * |_____________>___________________<finish> . . . . < . . . .
       *
       * @endverbatim
       */
    

    对应于状态图每一步的函数如下:

    • send(),校验各种参数,开始流程
    • validate_pool(),检验rbd_directory是否存在
    • validate_overwrite(),检验rbd_info存在及内容。与image旧版本有关,略。
    • create_id_object(),创建rbd_id.<image name>对象,并设置其内容为image id
    • add_image_to_directory(),将image name和id加入rbd_directory的omap中
    • negotiate_features(),对features参数做一些处理
    • create_image(),创建rbd_header.<image id>对象,并存入各种元数据到其omap
    • set_stripe_unit_count(),设置stripe_unit和stripe_count到header omap
    • object_map_resize(),设置object_count和object_state到header omap
    • fetch_mirror_mode(),mirror特性,暂时略
    • journal_create(),journal特性,暂时略
    • mirror_image_enable(),mirror特性,暂时略
    • complete(),完成流程

    下面是上述函数的详细代码:

    send,状态机的入口函数,在这个函数中验证各种参数的正确性,如果出错,则调用complete函数,complete函数最终会调用继承自Context::finish()的回调函数,进行错误处理。如果没有出错,则调用validate_pool()函数,进入下一状态。

    // Entry point of the create state machine. Runs every synchronous
    // parameter check; the first failing check ends the request through
    // complete() with its error code. When all checks pass, the state machine
    // advances to validate_pool().
    template<typename I>
    void CreateRequest<I>::send() {
      ldout(m_cct, 20) << dendl;

      // Run the checks in order; once one fails the remaining ones are skipped.
      int r = validate_features(m_cct, m_features, m_force_non_primary);
      if (r >= 0) {
        r = validate_order(m_cct, m_order);
      }
      if (r >= 0) {
        r = validate_striping(m_cct, m_order, m_stripe_unit, m_stripe_count);
      }
      if (r >= 0) {
        r = validate_data_pool(m_cct, m_ioctx, m_features, m_data_pool,
                               &m_data_pool_id);
      }
      if (r < 0) {
        complete(r);
        return;
      }

      // The layout is only checked when the object-map feature is requested.
      const bool wants_object_map = (m_features & RBD_FEATURE_OBJECT_MAP) != 0;
      if (wants_object_map && !validate_layout(m_cct, m_size, m_layout)) {
        complete(-EINVAL);
        return;
      }

      // All parameters are acceptable: move to the next state.
      validate_pool();
    }
    

    validate_pool,校验rbd_directory对象是否存在

    // VALIDATE POOL state: asynchronously stats the RBD_DIRECTORY object to
    // learn whether it exists; the result arrives in handle_validate_pool().
    // When the "rbd_validate_pool" option is false the state is skipped and
    // the request jumps straight to create_id_object().
    template<typename I>
    void CreateRequest<I>::validate_pool() {
      const bool validation_enabled =
        m_cct->_conf->get_val<bool>("rbd_validate_pool");
      if (!validation_enabled) {
        create_id_object();
        return;
      }

      // A stat with no output pointers: only the return code is of interest.
      librados::ObjectReadOperation stat_op;
      stat_op.stat(nullptr, nullptr, nullptr);

      // Wrap handle_validate_pool() as the completion callback of the read.
      using self_t = CreateRequest<I>;
      librados::AioCompletion *completion =
        create_rados_callback<self_t, &self_t::handle_validate_pool>(this);

      m_outbl.clear();
      int r = m_ioctx.aio_operate(RBD_DIRECTORY, completion, &stat_op, &m_outbl);
      assert(r == 0);
      completion->release();
    }
    
    // Completion handler for validate_pool().
    //   r == 0             : the directory exists -> validate_overwrite()
    //   r < 0, != -ENOENT  : the stat failed -> the request completes with r
    //   otherwise          : the pool is treated as fresh; a self-managed
    //                        snapshot id is allocated and released before
    //                        moving on to validate_overwrite()
    template<typename I>
    void CreateRequest<I>::handle_validate_pool(int r) {
      ldout(m_cct, 20) << "r=" << r << dendl;

      if (r == 0) {
        validate_overwrite();
        return;
      }
      if ((r < 0) && (r != -ENOENT)) {
        lderr(m_cct) << "failed to stat RBD directory: " << cpp_strerror(r)
                     << dendl;
        complete(r);
        return;
      }

      // Allocating a self-managed snapshot id on a new pool forces the pool
      // into self-managed snapshot mode. This happens once per fresh pool, so
      // the calls are made synchronously rather than being turned into
      // further asynchronous states.
      uint64_t snap_id;
      r = m_ioctx.selfmanaged_snap_create(&snap_id);
      if (r < 0) {
        if (r == -EINVAL) {
          lderr(m_cct) << "pool not configured for self-managed RBD snapshot support"
                       << dendl;
        } else {
          lderr(m_cct) << "failed to allocate self-managed snapshot: "
                       << cpp_strerror(r) << dendl;
        }
        complete(r);
        return;
      }

      // The pool has already switched to self-managed snapshots, so a failure
      // to release the probe snapshot is only logged, not treated as an error.
      const int remove_rc = m_ioctx.selfmanaged_snap_remove(snap_id);
      if (remove_rc < 0) {
        ldout(m_cct, 10) << "failed to release self-managed snapshot " << snap_id
                         << ": " << cpp_strerror(remove_rc) << dendl;
      }

      validate_overwrite();
    }
    
    

    validate_overwrite,校验rbd_info对象的内容,与新旧版本image有关,可以忽略。

    // VALIDATE OVERWRITE state: issues an asynchronous read of the RBD_INFO
    // object through m_data_io_ctx. The read result lands in m_outbl and is
    // examined by handle_validate_overwrite().
    template <typename I>
    void CreateRequest<I>::validate_overwrite() {
      // NOTE: the "..." below marks code that the article omitted from the
      // original function body; it is not valid C++ on its own.
      ...
      // handle_validate_overwrite() is registered as the completion callback
      // of aio_operate(); it calls create_id_object() to enter the next state.
      using klass = CreateRequest<I>;
      librados::AioCompletion *comp =
        create_rados_callback<klass, &klass::handle_validate_overwrite>(this);

      librados::ObjectReadOperation op;
      op.read(0, 0, nullptr, nullptr);

      m_outbl.clear();
      // Read the rbd_info object to find out whether it exists (and, in the
      // handler, what it contains).
      int r = m_data_io_ctx.aio_operate(RBD_INFO, comp, &op, &m_outbl);
      assert(r == 0);
      comp->release();
    }
    
    // Completion handler for validate_overwrite().
    // If RBD_INFO was read successfully and already contains the marker
    // "overwrite validated", the request goes straight to create_id_object().
    // A read error other than -ENOENT fails the request. In every other case
    // the pool is probed by writing RBD_INFO twice at offset 0 (first
    // "validate", then the marker); -EOPNOTSUPP from either write is reported
    // as -EINVAL.
    template <typename I>
    void CreateRequest<I>::handle_validate_overwrite(int r) {
      ldout(m_cct, 20) << "r=" << r << dendl;

      bufferlist validated_bl;
      validated_bl.append("overwrite validated");

      const bool already_validated =
        (r == 0) && m_outbl.contents_equal(validated_bl);
      if (already_validated) {
        create_id_object();
        return;
      }
      if ((r < 0) && (r != -ENOENT)) {
        lderr(m_cct) << "failed to read RBD info: " << cpp_strerror(r) << dendl;
        complete(r);
        return;
      }

      // Validate that the pool supports overwrites. rbd_directory cannot be
      // used for this because v1 images store the directory as tmap data
      // within that object.
      ldout(m_cct, 10) << "validating overwrite support" << dendl;
      bufferlist probe_bl;
      probe_bl.append("validate");
      r = m_data_io_ctx.write(RBD_INFO, probe_bl, probe_bl.length(), 0);
      if (r >= 0) {
        // Second write to the same offset: this is the actual overwrite.
        r = m_data_io_ctx.write(RBD_INFO, validated_bl, validated_bl.length(), 0);
      }

      if (r < 0) {
        if (r == -EOPNOTSUPP) {
          lderr(m_cct) << "pool missing required overwrite support" << dendl;
          complete(-EINVAL);
        } else {
          lderr(m_cct) << "failed to validate overwrite support: " << cpp_strerror(r)
                       << dendl;
          complete(r);
        }
        return;
      }

      create_id_object();
    }
    

    create_id_object,创建rbd_id.<image name>对象

    // CREATE ID OBJECT state: creates the image's id object (m_id_obj, i.e.
    // rbd_id.<image name>) and stores the image id in it. The outcome is
    // delivered to handle_create_id_object().
    template<typename I>
    void CreateRequest<I>::create_id_object() {
      ldout(m_cct, 20) << dendl;

      // Build a compound write: create the object, then call the set_id class
      // method (per the article, registered on the OSD) to set the object's
      // content to the image id.
      librados::ObjectWriteOperation write_op;
      write_op.create(true);
      cls_client::set_id(&write_op, m_image_id);

      // handle_create_id_object() runs when the operation completes and
      // advances the state machine to add_image_to_directory().
      using self_t = CreateRequest<I>;
      librados::AioCompletion *completion =
        create_rados_callback<self_t, &self_t::handle_create_id_object>(this);

      // Answering the question raised in the article: the calls above appear
      // to only record steps in write_op, and aio_operate() is what submits
      // them against m_id_obj (confirm against the librados documentation).
      int r = m_ioctx.aio_operate(m_id_obj, completion, &write_op);
      assert(r == 0);
      completion->release();
    }
    
    // Completion handler for create_id_object(): on success continues with
    // add_image_to_directory(); on failure logs the error and finishes the
    // request with r (nothing has been created yet, so there is no rollback).
    template<typename I>
    void CreateRequest<I>::handle_create_id_object(int r) {
      ldout(m_cct, 20) << "r=" << r << dendl;

      if (r >= 0) {
        add_image_to_directory();
        return;
      }

      lderr(m_cct) << "error creating RBD id object: " << cpp_strerror(r)
                   << dendl;
      complete(r);
    }
    

    add_image_to_directory,在rbd_directory对象中加入该image的id和name

    // ADD IMAGE TO DIRECTORY state: registers the image's name and id in the
    // RBD_DIRECTORY object. The outcome is delivered to
    // handle_add_image_to_directory().
    template<typename I>
    void CreateRequest<I>::add_image_to_directory() {
      ldout(m_cct, 20) << dendl;

      // dir_add_image is a class method invoked through cls_client; per the
      // article it adds two key/value pairs to the directory's omap.
      librados::ObjectWriteOperation write_op;
      cls_client::dir_add_image(&write_op, m_image_name, m_image_id);

      using self_t = CreateRequest<I>;
      librados::AioCompletion *completion =
        create_rados_callback<self_t,
                              &self_t::handle_add_image_to_directory>(this);

      int r = m_ioctx.aio_operate(RBD_DIRECTORY, completion, &write_op);
      assert(r == 0);
      completion->release();
    }
    
    // Completion handler for add_image_to_directory().
    // On failure the error is saved in m_r_saved and the id object created by
    // the previous state is rolled back through remove_id_object(). On success
    // the state machine advances to negotiate_features().
    template<typename I>
    void CreateRequest<I>::handle_add_image_to_directory(int r) {
      ldout(m_cct, 20) << "r=" << r << dendl;

      if (r < 0) {
        lderr(m_cct) << "error adding image to directory: " << cpp_strerror(r)
                     << dendl;

        m_r_saved = r;
        remove_id_object();
        // Stop here. Without this return the request would also continue
        // forward into negotiate_features() while the rollback is running,
        // unlike handle_create_image(), which returns after starting its
        // rollback.
        return;
      }

      negotiate_features();
    }
    

    negotiate_features,当使用默认features时,向服务端查询其支持的features,并将m_features裁剪为服务端支持的子集

    // NEGOTIATE FEATURES state: when m_negotiate_features is set, asks for the
    // full set of supported features (a class-method read issued against
    // RBD_DIRECTORY) and lets handle_negotiate_features() process the reply.
    // Otherwise the state is skipped and create_image() runs immediately.
    template<typename I>
    void CreateRequest<I>::negotiate_features() {
      if (m_negotiate_features) {
        ldout(m_cct, 20) << dendl;

        // Queue the "get all features" request.
        librados::ObjectReadOperation read_op;
        cls_client::get_all_features_start(&read_op);

        using self_t = CreateRequest<I>;
        librados::AioCompletion *completion =
          create_rados_callback<self_t, &self_t::handle_negotiate_features>(this);

        // Submit the read; the encoded reply is collected in m_outbl.
        m_outbl.clear();
        int r = m_ioctx.aio_operate(RBD_DIRECTORY, completion, &read_op, &m_outbl);
        assert(r == 0);
        completion->release();
      } else {
        create_image();
      }
    }
    
    // Completion handler for negotiate_features().
    // Decodes the supported feature set from m_outbl and masks m_features down
    // to it. Any failure here is only logged: the request always continues
    // with create_image(), keeping m_features unchanged on error.
    template<typename I>
    void CreateRequest<I>::handle_negotiate_features(int r) {
      ldout(m_cct, 20) << "r=" << r << dendl;

      // Only read below when decoding succeeded.
      uint64_t supported_features = 0;
      if (r >= 0) {
        bufferlist::iterator reply_it = m_outbl.begin();
        r = cls_client::get_all_features_finish(&reply_it, &supported_features);
      }

      if (r < 0) {
        ldout(m_cct, 10) << "error retrieving server supported features set: "
                         << cpp_strerror(r) << dendl;
      } else {
        const auto negotiated = m_features & supported_features;
        if (negotiated != m_features) {
          // Drop every requested feature the server side does not support.
          m_features = negotiated;
          ldout(m_cct, 10) << "limiting default features set to server supported: "
                           << m_features << dendl;
        }
      }

      create_image();
    }
    

    create_image,创建rbd_header.<image id>对象,并将size、order、features、object_prefix等元数据写入其omap

    // CREATE IMAGE state: builds the data-object name prefix and creates the
    // header object (m_header_obj), storing size, order, features, the prefix
    // and the data pool id via the create_image class method. The outcome is
    // delivered to handle_create_image().
    //
    // By the time this state runs, the id object and the directory entry have
    // already been created, so a failure detected here must roll them back
    // rather than finish the request directly.
    template<typename I>
    void CreateRequest<I>::create_image() {
      ldout(m_cct, 20) << dendl;
      assert(m_data_pool.empty() || m_data_pool_id != -1);

      // Prefix for the data objects: RBD_DATA_PREFIX, then "<pool id>." when
      // a separate data pool is in use, then the image id.
      ostringstream oss;
      oss << RBD_DATA_PREFIX;
      if (m_data_pool_id != -1) {
        oss << stringify(m_ioctx.get_id()) << ".";
      }
      oss << m_image_id;

      // Materialize the prefix once instead of calling oss.str() repeatedly.
      const std::string object_prefix = oss.str();
      if (object_prefix.length() > RBD_MAX_BLOCK_NAME_PREFIX_LENGTH) {
        lderr(m_cct) << "object prefix '" << object_prefix << "' too large" << dendl;
        // Use the same rollback as handle_create_image(): save the error and
        // unwind the directory entry (and, through the error path, the id
        // object) instead of leaking them.
        m_r_saved = -EINVAL;
        remove_from_dir();
        return;
      }

      // Create the header object and populate its metadata in one operation.
      librados::ObjectWriteOperation op;
      op.create(true);
      cls_client::create_image(&op, m_size, m_order, m_features, object_prefix,
                               m_data_pool_id);

      using klass = CreateRequest<I>;
      librados::AioCompletion *comp =
        create_rados_callback<klass, &klass::handle_create_image>(this);
      int r = m_ioctx.aio_operate(m_header_obj, comp, &op);
      assert(r == 0);
      comp->release();
    }
    
    template<typename I>
    void CreateRequest<I>::handle_create_image(int r) {
      ldout(m_cct, 20) << "r=" << r << dendl;
    
      if (r < 0) {
        lderr(m_cct) << "error writing header: " << cpp_strerror(r) << dendl;
        m_r_saved = r;
        remove_from_dir();
        return;
      }
    
      set_stripe_unit_count();
    }
    

    以下函数代码暂时省略。流程类似。

    set_stripe_unit_count
    object_map_resize
    fetch_mirror_mode
    journal_create
    mirror_image_enable

    最后调用complete函数,传入的参数为0

    // Final step of the state machine, reached on success (r == 0) and on
    // failure (r < 0) alike: closes the data I/O context, reports r to the
    // finish callback and destroys the request object.
    template<typename I>
    void CreateRequest<I>::complete(int r) {
      if (r == 0) {
        ldout(m_cct, 20) << "done." << dendl;
      }
      // Release the I/O context used for the data pool.
      m_data_io_ctx.close();
      // Invoke the request's finish callback with the final result
      // (presumably the Context handed to CreateRequest::create() — confirm).
      m_on_finish->complete(r);
      // The request owns itself; no member may be touched after this line.
      delete this;
    }
    
    

    相关文章

      网友评论

        本文标题:ceph rbd:Image create

        本文链接:https://www.haomeiwen.com/subject/mtlixftx.html