The previous posts covered deploying the Ceph cluster and RGW. They also noted that the RGW bucket-index objects have high performance requirements, and that different users have different storage-performance needs. This post therefore describes how to design the crush map so that RGW can serve data with different storage-performance requirements.
Adding OSDs to the existing cluster
Add one OSD on each of the three nodes.
#qemu-img create -f qcow2 lm1_journal2.qcow2 50G
#qemu-img create -f qcow2 lm2_journal2.qcow2 50G
#qemu-img create -f qcow2 lm3_journal2.qcow2 50G
#qemu-img create -f qcow2 lm1_osd3.qcow2 100G
#qemu-img create -f qcow2 lm2_osd3.qcow2 100G
#qemu-img create -f qcow2 lm3_osd3.qcow2 100G
Attach the block devices to the VMs through the management UI.
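As an alternative to the UI, a sketch with virsh attach-disk can do the same thing; the libvirt domain names lm1/lm2/lm3 and the target devices vde/vdf (matching the /dev/vde and /dev/vdf used below) are assumptions about this particular environment. For example, on the host running lm1:
#virsh attach-disk lm1 $PWD/lm1_journal2.qcow2 vde --driver qemu --subdriver qcow2 --persistent
#virsh attach-disk lm1 $PWD/lm1_osd3.qcow2 vdf --driver qemu --subdriver qcow2 --persistent
Repeat accordingly for lm2 and lm3.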
OSD data disk setup:
#ansible ceph -m command -a "pvcreate /dev/vdf"
#ansible ceph -m command -a "vgcreate datavg3 /dev/vdf"
#ansible ceph -m command -a "lvcreate -n datalv3 -l 100%Free datavg3"
SSD disk setup (for block.db and block.wal):
#ansible ceph -m command -a "parted /dev/vde mklabel gpt"
#ansible ceph -m command -a "parted /dev/vde mkpart primary 2048s 50%"
#ansible ceph -m command -a "parted /dev/vde mkpart primary 50% 100%"
#ansible ceph -m command -a "pvcreate /dev/vde1"
#ansible ceph -m command -a "pvcreate /dev/vde2"
#ansible ceph -m command -a "vgcreate block_db_vg3 /dev/vde1"
#ansible ceph -m command -a "vgcreate block_wal_vg3 /dev/vde2"
#ansible ceph -m command -a "lvcreate -n dblv3 -l 100%Free block_db_vg3"
#ansible ceph -m command -a "lvcreate -n wallv3 -l 100%Free block_wal_vg3"
Run on luminous1/2/3 respectively:
ceph-volume lvm prepare --data /dev/datavg3/datalv3 --block.wal block_wal_vg3/wallv3 --block.db block_db_vg3/dblv3
ceph-volume lvm activate {osd_id} {osd_fsid}
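The {osd_id} and {osd_fsid} values needed by the activate step are printed by the prepare command and can also be looked up afterwards; alternatively, ceph-volume lvm create combines prepare and activate in one step:
#ceph-volume lvm list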
Updating the crush map
Add the following to the ceph configuration file:
[osd]
osd crush update on start = false
Restart all OSDs so the setting takes effect; it stops OSDs from updating their own crush location at startup, which would otherwise undo the manual layout below.
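For example, restarting them across the cluster with ansible (assuming the stock systemd units, where ceph-osd.target groups all OSD daemons on a node):
#ansible ceph -m command -a "systemctl restart ceph-osd.target"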
Add virtual nodes (three host buckets and one root bucket):
#ceph osd crush add-bucket ssd-luminous1 host
#ceph osd crush add-bucket ssd-luminous2 host
#ceph osd crush add-bucket ssd-luminous3 host
#ceph osd crush add-bucket ssd-root root
Move the corresponding OSDs under the designated hosts:
# ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-11 0 host ssd-luminous3
-10 0 host ssd-luminous2
-9 0 host ssd-luminous1
-1 0.58498 root default
-3 0.19499 host luminous1
0 hdd 0.09799 osd.0 up 1.00000 1.00000
1 hdd 0.09799 osd.1 up 1.00000 1.00000
-5 0.19499 host luminous2
2 hdd 0.09799 osd.2 up 1.00000 1.00000
3 hdd 0.09799 osd.3 up 1.00000 1.00000
-7 0.19499 host luminous3
4 hdd 0.09799 osd.4 up 1.00000 1.00000
5 hdd 0.09799 osd.5 up 1.00000 1.00000
6 hdd 0 osd.6 up 1.00000 1.00000
7 hdd 0 osd.7 up 1.00000 1.00000
8 hdd 0 osd.8 up 1.00000 1.00000
#ceph osd crush set osd.6 0.09798 root=ssd-root host=ssd-luminous1
#ceph osd crush set osd.7 0.09798 root=ssd-root host=ssd-luminous2
#ceph osd crush set osd.8 0.09798 root=ssd-root host=ssd-luminous3
# ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-11 0.09798 host ssd-luminous3
8 hdd 0.09798 osd.8 up 1.00000 1.00000
-10 0.09798 host ssd-luminous2
7 hdd 0.09798 osd.7 up 1.00000 1.00000
-9 0.09798 host ssd-luminous1
6 hdd 0.09798 osd.6 up 1.00000 1.00000
-1 0.58498 root default
-3 0.19499 host luminous1
0 hdd 0.09799 osd.0 up 1.00000 1.00000
1 hdd 0.09799 osd.1 up 1.00000 1.00000
-5 0.19499 host luminous2
2 hdd 0.09799 osd.2 up 1.00000 1.00000
3 hdd 0.09799 osd.3 up 1.00000 1.00000
-7 0.19499 host luminous3
4 hdd 0.09799 osd.4 up 1.00000 1.00000
5 hdd 0.09799 osd.5 up 1.00000 1.00000
As shown above, three virtual hosts have been added, and the three new OSDs have been placed under the corresponding virtual hosts.
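Note that osd.6/7/8 are still reported with device class hdd at this point, while the final tree further below shows them as ssd. If the class needs to be corrected explicitly, it can be reset per OSD (commands available since Luminous):
#ceph osd crush rm-device-class osd.6 osd.7 osd.8
#ceph osd crush set-device-class ssd osd.6 osd.7 osd.8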
Creating a new rule set
#ceph osd getcrushmap -o /tmp/mycrushmap
#crushtool -d /tmp/mycrushmap > /tmp/mycrushmapcp
Edit /tmp/mycrushmapcp as follows:
root ssd-root {
        id -15          # do not change unnecessarily
        id -16 class hdd        # do not change unnecessarily
        id -17 class ssd        # do not change unnecessarily
        # weight 0.000
        alg straw2
        hash 0  # rjenkins1
        item ssd-luminous1 weight 0.098
        item ssd-luminous2 weight 0.098
        item ssd-luminous3 weight 0.098
}
#ssd-rules
rule index_rule {
        id 1
        type replicated
        min_size 1
        max_size 10
        step take ssd-root
        step chooseleaf firstn 0 type host
        step emit
}
#crushtool -c /tmp/mycrushmapcp -o /tmp/mycrushmapnew
#ceph osd setcrushmap -i /tmp/mycrushmapnew
# ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-15 0 root ssd-root
-11 0.09798 host ssd-luminous3
8 ssd 0.09798 osd.8 up 1.00000 1.00000
-10 0.09798 host ssd-luminous2
7 ssd 0.09798 osd.7 up 1.00000 1.00000
-9 0.09798 host ssd-luminous1
6 ssd 0.09798 osd.6 up 1.00000 1.00000
-1 0.58498 root default
-3 0.19499 host luminous1
0 hdd 0.09799 osd.0 up 1.00000 1.00000
1 hdd 0.09799 osd.1 up 1.00000 1.00000
-5 0.19499 host luminous2
2 hdd 0.09799 osd.2 up 1.00000 1.00000
3 hdd 0.09799 osd.3 up 1.00000 1.00000
-7 0.19499 host luminous3
4 hdd 0.09799 osd.4 up 1.00000 1.00000
5 hdd 0.09799 osd.5 up 1.00000 1.00000
This completes the creation of a rule set backed by the SSD-based OSDs.
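As a side note, on Luminous and later an equivalent replicated rule can usually be created directly, without decompiling and recompiling the map (an optional device class can be appended as a fourth argument):
#ceph osd crush rule create-replicated index_rule ssd-root host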
Test
#ceph osd pool create test 32 32 index_rule
pool 'test' created
#ceph osd pool get test crush_rule
crush_rule: index_rule
# ceph pg dump | grep '^12\.' | awk 'BEGIN{print "PG_id","\t","copy_set"}{print $1,"\t",$17}' | less
PG_id copy_set
dumped all
12.1d [7,6,8]
12.1c [6,7,8]
12.1a [8,7,6]
...
All of the new pool's PGs land on the designated OSDs (osd.6/7/8).
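An individual PG mapping can also be queried directly, taking one of the PG ids from the listing above:
#ceph pg map 12.1d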
Setting up different RGW placement policies on top of the customized crush map
Above, we created an independent rule set; data governed by it is placed on a separate group of OSDs.
Requirements:
- index pool: placed under this customized rule set;
- data pool: different users can choose pools with different performance characteristics.
Index pool: the desired crush rule can be specified directly when the pool is created.
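For example, if the index pool already exists (upc.rgw.buckets.index is referenced later in this post), it can also be switched over to the new rule after the fact:
#ceph osd pool set upc.rgw.buckets.index crush_rule index_rule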
For giving different users different data pools, there are two approaches. First, create a data pool (it is used by both approaches below):
# ceph osd pool create upc.rgw.test.data 32 32 index_rule
# ceph osd pool application enable upc.rgw.test.data rgw
- Based on a placement policy: the placement target is associated with buckets.
Create the placement target:
Set the default zone (this should normally already be configured at deployment time):
#radosgw-admin zone default --rgw-zone=upc --rgw-zonegroup=pd --rgw-realm=sh
#radosgw-admin period update --commit
#radosgw-admin zonegroup placement add --rgw-zonegroup=pd --placement-id=temp
#radosgw-admin zonegroup placement modify --rgw-zonegroup=pd --placement-id=temp --tags="Tag"
#radosgw-admin zone placement add --rgw-zone upc --placement-id temp --data-pool upc.rgw.test.data --index-pool upc.rgw.buckets.index --data-extra-pool upc.rgw.buckets.non-ec
#radosgw-admin period update --commit
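The resulting zonegroup and zone placement configuration can be reviewed with (output omitted):
#radosgw-admin zonegroup get --rgw-zonegroup=pd
#radosgw-admin zone get --rgw-zone=upc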
Configure the user's placement:
#radosgw-admin metadata get user:ups302 > ups3.json
Edit ups3.json: set default_placement to temp, and add a tag used for access control:
"default_placement": "temp",
"default_storage_class": "",
"placement_tags": [
"Tag"
],
Update the user metadata:
#radosgw-admin metadata put user:ups302 < ups3.json
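A quick way to confirm the new default_placement is in effect:
#radosgw-admin user info --uid=ups302 | grep default_placement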
测试"storage class"及"compress"功能
Create a bucket:
#s3cmd mb s3://second //created under the user's default_placement, which is currently temp
# s3cmd put ups3.json s3://second
upload: 'ups3.json' -> 's3://second/ups3.json' [1 of 1]
1199 of 1199 100% in 0s 25.21 kB/s done
Use --bucket-location to override the user's default_placement:
#s3cmd mb s3://second --bucket-location=":default-placement"
The data is indeed stored in the pool we specified:
# rados -p upc.rgw.test.data ls
1c60b268-0a5d-4718-ad02-e4b5bce824bf.136021.1_ups3.json
- Based on a Storage Class: associated directly with the object data.
RGW Nautilus introduced the storage class feature, which lets different objects follow different storage policies.
Add a storage class; here we add it to default-placement:
# radosgw-admin zonegroup placement add --rgw-zonegroup pd --placement-id default-placement --storage-class COLD
[
{
"key": "default-placement",
"val": {
"name": "default-placement",
"tags": [],
"storage_classes": [
"COLD",
"STANDARD"
]
}
},
{
"key": "temp",
"val": {
"name": "temp",
"tags": [
"Tag"
],
"storage_classes": [
"STANDARD"
]
}
}
]
Set the data pool for the new storage class and configure a compression policy:
# radosgw-admin zone placement add --rgw-zone upc --placement-id default-placement --storage-class COLD --data-pool upc.rgw.test.data --compression zlib
...
"placement_pools": [
{
"key": "default-placement",
"val": {
"index_pool": "upc.rgw.buckets.index",
"storage_classes": {
"COLD": {
"data_pool": "upc.rgw.test.data",
"compression_type": "zlib"
},
"STANDARD": {
"data_pool": "upc.rgw.buckets.data"
}
},
"data_extra_pool": "upc.rgw.buckets.non-ec",
"index_type": 0
}
},
...
#radosgw-admin period update --commit
Test
There are two ways to make use of a storage class:
- Edit the user's metadata and set:
"default_storage_class": ""
- Add an HTTP header to the request, e.g. with s3cmd:
#s3cmd --storage-class=COLD put cold_data s3://second
[root@luminous1 ~]# rados -p upc.rgw.test.data ls
1c60b268-0a5d-4718-ad02-e4b5bce824bf.136021.4__shadow_.6sYlWrtESVuQNVMTsCNSToM_MB293Ah_0
[root@luminous1 ~]# rados -p upc.rgw.buckets.data ls
1c60b268-0a5d-4718-ad02-e4b5bce824bf.136021.4_cold_data
The head object remains in the pool of the "STANDARD" class, while the tail (shadow) objects are in the data pool of the "COLD" class; the head object itself stores no data here.
# radosgw-admin bucket stats --bucket=second
"usage": {
"rgw.main": {
"size": 10485760,
"size_actual": 10485760,
"size_utilized": 10212,
"size_kb": 10240,
"size_kb_actual": 10240,
"size_kb_utilized": 10,
"num_objects": 1
}
},
Here size_utilized/size_kb_utilized is the disk space actually consumed: the compressed size is about 10 KB, while the original "size" is 10 MB.
Take a look at the head object's xattrs:
# rados -p upc.rgw.buckets.data listxattr 1c60b268-0a5d-4718-ad02-e4b5bce824bf.136021.4_cold_data
user.rgw.acl
user.rgw.compression
user.rgw.content_type
user.rgw.etag
user.rgw.idtag
user.rgw.manifest
user.rgw.pg_ver
user.rgw.source_zone
user.rgw.storage_class
user.rgw.tail_tag
user.rgw.x-amz-content-sha256
user.rgw.x-amz-date
user.rgw.x-amz-meta-s3cmd-attrs
The compression information is stored in the xattr user.rgw.compression; reading it with getxattr shows "zlib", the compression algorithm used.
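For reference, that check looks like this (the attribute value is a binary-encoded structure, but the algorithm name is readable inside it):
#rados -p upc.rgw.buckets.data getxattr 1c60b268-0a5d-4718-ad02-e4b5bce824bf.136021.4_cold_data user.rgw.compression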