CentOS Stream 9
Sponsored Link

Ceph Pacific : Add or Remove OSDs
2022/03/31
 
This is how to add OSDs to, or remove OSDs from, an existing cluster.
                                         |
        +--------------------+           |           +----------------------+
        |   [dlp.srv.world]  |10.0.0.30  |  10.0.0.31|    [www.srv.world]   |
        |     Ceph Client    +-----------+-----------+        RADOSGW       |
        |                    |           |           |                      |
        +--------------------+           |           +----------------------+
            +----------------------------+----------------------------+
            |                            |                            |
            |10.0.0.51                   |10.0.0.52                   |10.0.0.53 
+-----------+-----------+    +-----------+-----------+    +-----------+-----------+
|   [node01.srv.world]  |    |   [node02.srv.world]  |    |   [node03.srv.world]  |
|     Object Storage    +----+     Object Storage    +----+     Object Storage    |
|     Monitor Daemon    |    |                       |    |                       |
|     Manager Daemon    |    |                       |    |                       |
+-----------------------+    +-----------------------+    +-----------------------+

[1] For example, add a new [node04] node as an OSD node, working from the Admin Node.
In this example, [/dev/sdb] is used as the block device on the new [node04] node.
# transfer public key

[root@node01 ~]#
ssh-copy-id node04

# if Firewalld is running, allow service

[root@node01 ~]#
ssh node04 "firewall-cmd --add-service=ceph; firewall-cmd --runtime-to-permanent"

# install required packages

[root@node01 ~]#
ssh node04 "dnf -y install centos-release-ceph-pacific; dnf -y install ceph"
# transfer required files

[root@node01 ~]#
scp /etc/ceph/ceph.conf node04:/etc/ceph/ceph.conf

[root@node01 ~]#
scp /etc/ceph/ceph.client.admin.keyring node04:/etc/ceph

[root@node01 ~]#
scp /var/lib/ceph/bootstrap-osd/ceph.keyring node04:/var/lib/ceph/bootstrap-osd
# configure OSD

[root@node01 ~]# ssh node04 \
"chown ceph. /etc/ceph/ceph.* /var/lib/ceph/bootstrap-osd/*; \
parted --script /dev/sdb 'mklabel gpt'; \
parted --script /dev/sdb "mkpart primary 0% 100%"; \
ceph-volume lvm create --data /dev/sdb1" 

Running command: /usr/bin/ceph-authtool --gen-print-key
Running command: /usr/bin/ceph --cluster ceph --name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/ceph.keyring -i - osd new 36c64bed-ab6d-44dc-97f6-d08ea4e5bb90
Running command: /usr/sbin/vgcreate --force --yes ceph-90aa1c1d-4ffa-49b3-9b8c-5461b37b4225 /dev/sdb1

.....
.....

Running command: /usr/bin/systemctl enable ceph-volume@lvm-3-36c64bed-ab6d-44dc-97f6-d08ea4e5bb90
 stderr: Created symlink /etc/systemd/system/multi-user.target.wants/ceph-volume@lvm-3-36c64bed-ab6d-44dc-97f6-d08ea4e5bb90.service → /usr/lib/systemd/system/ceph-volume@.service.
Running command: /usr/bin/systemctl enable --runtime ceph-osd@3
 stderr: Created symlink /run/systemd/system/ceph-osd.target.wants/ceph-osd@3.service → /usr/lib/systemd/system/ceph-osd@.service.
Running command: /usr/bin/systemctl start ceph-osd@3
--> ceph-volume lvm activate successful for osd ID: 3
--> ceph-volume lvm create successful for: /dev/sdb1

[root@node01 ~]# ceph -s 
  cluster:
    id:     7912846f-a2bd-407d-8032-0bdb9adf2c50
    health: HEALTH_OK

  services:
    mon: 1 daemons, quorum node01 (age 95m)
    mgr: node01(active, since 93m)
    mds: 1/1 daemons up
    osd: 4 osds: 4 up (since 108s), 4 in (since 2m)
    rgw: 1 daemon active (1 hosts, 1 zones)

  data:
    volumes: 1/1 healthy
    pools:   8 pools, 193 pgs
    objects: 217 objects, 52 KiB
    usage:   144 MiB used, 320 GiB / 320 GiB avail
    pgs:     193 active+clean
[2] To remove an OSD node from an existing cluster, run the commands as follows.
For example, remove the [node04] node.
[root@node01 ~]#
ceph -s

  cluster:
    id:     7912846f-a2bd-407d-8032-0bdb9adf2c50
    health: HEALTH_OK

  services:
    mon: 1 daemons, quorum node01 (age 95m)
    mgr: node01(active, since 93m)
    mds: 1/1 daemons up
    osd: 4 osds: 4 up (since 108s), 4 in (since 2m)
    rgw: 1 daemon active (1 hosts, 1 zones)

  data:
    volumes: 1/1 healthy
    pools:   8 pools, 193 pgs
    objects: 217 objects, 52 KiB
    usage:   144 MiB used, 320 GiB / 320 GiB avail
    pgs:     193 active+clean

[root@node01 ~]#
ceph osd tree

ID  CLASS  WEIGHT   TYPE NAME        STATUS  REWEIGHT  PRI-AFF
-1         0.31238  root default
-3         0.07809      host node01
 0    hdd  0.07809          osd.0        up   1.00000  1.00000
-5         0.07809      host node02
 1    hdd  0.07809          osd.1        up   1.00000  1.00000
-7         0.07809      host node03
 2    hdd  0.07809          osd.2        up   1.00000  1.00000
-9         0.07809      host node04
 3    hdd  0.07809          osd.3        up   1.00000  1.00000

# specify OSD ID of a node you'd like to remove

[root@node01 ~]#
ceph osd out 3

marked out osd.3.
# live watch cluster status
# after running [ceph osd out ***], rebalancing is executed automatically
# to quit the live watch, press [Ctrl + c]

[root@node01 ~]#
ceph -w

  cluster:
    id:     7912846f-a2bd-407d-8032-0bdb9adf2c50
    health: HEALTH_WARN
            Degraded data redundancy: 139/651 objects degraded (21.352%), 29 pgs degraded

  services:
    mon: 1 daemons, quorum node01 (age 96m)
    mgr: node01(active, since 95m)
    mds: 1/1 daemons up
    osd: 4 osds: 4 up (since 3m), 3 in (since 16s); 17 remapped pgs
    rgw: 1 daemon active (1 hosts, 1 zones)

  data:
    volumes: 1/1 healthy
    pools:   8 pools, 193 pgs
    objects: 217 objects, 52 KiB
    usage:   124 MiB used, 240 GiB / 240 GiB avail
    pgs:     139/651 objects degraded (21.352%)
             24/651 objects misplaced (3.687%)
             144 active+clean
             29  active+recovery_wait+degraded
             17  active+remapped+backfill_wait
             2   active+recovery_wait
             1   active+recovering

  io:
    recovery: 0 B/s, 0 objects/s

  progress:
    Global Recovery Event (10s)
      [====================........] (remaining: 3s)

# after status turns to [HEALTH_OK], disable OSD service on the target node

[root@node01 ~]#
ssh node04 "systemctl disable --now ceph-osd@3.service"

Removed /run/systemd/system/ceph-osd.target.wants/ceph-osd@3.service.
# remove the OSD from the cluster by specifying the target OSD ID

[root@node01 ~]#
ceph osd purge 3 --yes-i-really-mean-it

purged osd.3
[root@node01 ~]#
ceph -s

  cluster:
    id:     7912846f-a2bd-407d-8032-0bdb9adf2c50
    health: HEALTH_OK

  services:
    mon: 1 daemons, quorum node01 (age 98m)
    mgr: node01(active, since 96m)
    mds: 1/1 daemons up
    osd: 3 osds: 3 up (since 10s), 3 in (since 101s)
    rgw: 1 daemon active (1 hosts, 1 zones)

  data:
    volumes: 1/1 healthy
    pools:   8 pools, 193 pgs
    objects: 217 objects, 52 KiB
    usage:   126 MiB used, 240 GiB / 240 GiB avail
    pgs:     193 active+clean

  io:
    client:   5.0 KiB/s rd, 0 B/s wr, 4 op/s rd, 3 op/s wr
Matched Content