Ceph Squid : Add or Remove OSDs (2024/05/03)

This is how to add OSDs to, or remove OSDs from, an existing Cluster.

                                         |
        +--------------------+           |           +----------------------+
        |   [dlp.srv.world]  |10.0.0.30  |  10.0.0.31|    [www.srv.world]   |
        |     Ceph Client    +-----------+-----------+        RADOSGW       |
        |                    |           |           |                      |
        +--------------------+           |           +----------------------+
            +----------------------------+----------------------------+
            |                            |                            |
            |10.0.0.51                   |10.0.0.52                   |10.0.0.53 
+-----------+-----------+    +-----------+-----------+    +-----------+-----------+
|   [node01.srv.world]  |    |   [node02.srv.world]  |    |   [node03.srv.world]  |
|     Object Storage    +----+     Object Storage    +----+     Object Storage    |
|     Monitor Daemon    |    |                       |    |                       |
|     Manager Daemon    |    |                       |    |                       |
+-----------------------+    +-----------------------+    +-----------------------+

[1]	For example, add a new [node04] node to the OSDs, working from the Admin Node. For the block device on the new [node04] node, [/dev/sdb] is used in this example.

# transfer public key

root@node01:~#

ssh-copy-id node04

# install required packages

root@node01:~#

ssh node04 "apt update; apt -y install ceph python3-packaging"

# transfer required files

root@node01:~#

scp /etc/ceph/ceph.conf node04:/etc/ceph/ceph.conf

root@node01:~#

scp /etc/ceph/ceph.client.admin.keyring node04:/etc/ceph

root@node01:~#

scp /var/lib/ceph/bootstrap-osd/ceph.keyring node04:/var/lib/ceph/bootstrap-osd

# configure OSD

root@node01:~# ssh node04 \
"chown ceph:ceph /etc/ceph/ceph.* /var/lib/ceph/bootstrap-osd/*; \
parted --script /dev/sdb 'mklabel gpt'; \
parted --script /dev/sdb "mkpart primary 0% 100%"; \
ceph-volume lvm create --data /dev/sdb1" 
Running command: /usr/bin/ceph-authtool --gen-print-key
Running command: /usr/bin/ceph --cluster ceph --name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/ceph.keyring -i - osd new 7229113d-beb3-40ce-bcef-ed266dfef5e0
Running command: vgcreate --force --yes ceph-8ea4d664-ff40-43e1-b7ca-70d26978d73f /dev/sdb1
 stdout: Physical volume "/dev/sdb1" successfully created.
 stdout: Volume group "ceph-8ea4d664-ff40-43e1-b7ca-70d26978d73f" successfully created
Running command: lvcreate --yes -l 40959 -n osd-block-7229113d-beb3-40ce-bcef-ed266dfef5e0 ceph-8ea4d664-ff40-43e1-b7ca-70d26978d73f
.....
.....
Running command: /usr/bin/systemctl start ceph-osd@3
--> ceph-volume lvm activate successful for osd ID: 3
--> ceph-volume lvm create successful for: /dev/sdb1

# after a few minutes, check the cluster status; it's OK if it shows [HEALTH_OK]
root@node01:~# ceph -s 
  cluster:
    id:     3666a474-14e0-4c5f-ad1e-daf2e30aed8f
    health: HEALTH_OK

  services:
    mon: 1 daemons, quorum node01 (age 51m)
    mgr: node01(active, since 50m)
    mds: 1/1 daemons up
    osd: 4 osds: 4 up (since 4m), 4 in (since 4m)
    rgw: 1 daemon active (1 hosts, 1 zones)

  data:
    volumes: 1/1 healthy
    pools:   7 pools, 289 pgs
    objects: 248 objects, 456 KiB
    usage:   211 MiB used, 640 GiB / 640 GiB avail
    pgs:     289 active+clean

[2]	To remove an OSD Node from an existing Cluster, run commands as follows. For example, remove the [node04] node.

root@node01:~#

ceph -s

  cluster:
    id:     3666a474-14e0-4c5f-ad1e-daf2e30aed8f
    health: HEALTH_OK

  services:
    mon: 1 daemons, quorum node01 (age 65m)
    mgr: node01(active, since 64m)
    mds: 1/1 daemons up
    osd: 4 osds: 4 up (since 18m), 4 in (since 18m)
    rgw: 1 daemon active (1 hosts, 1 zones)

  data:
    volumes: 1/1 healthy
    pools:   7 pools, 289 pgs
    objects: 248 objects, 456 KiB
    usage:   215 MiB used, 640 GiB / 640 GiB avail
    pgs:     289 active+clean

root@node01:~#

ceph osd tree

ID  CLASS  WEIGHT   TYPE NAME        STATUS  REWEIGHT  PRI-AFF
-1         0.62476  root default
-3         0.15619      host node01
 0    hdd  0.15619          osd.0        up   1.00000  1.00000
-5         0.15619      host node02
 1    hdd  0.15619          osd.1        up   1.00000  1.00000
-7         0.15619      host node03
 2    hdd  0.15619          osd.2        up   1.00000  1.00000
-9         0.15619      host node04
 3    hdd  0.15619          osd.3        up   1.00000  1.00000

# specify the OSD ID of the node you'd like to remove

root@node01:~#

ceph osd out 3

marked out osd.3.

# watch the cluster status live
# after running [ceph osd out ***], rebalancing is executed automatically
# to quit the live watch, press [Ctrl + c]

root@node01:~#

ceph -w

  cluster:
    id:     3666a474-14e0-4c5f-ad1e-daf2e30aed8f
    health: HEALTH_WARN
            too many PGs per OSD (289 > max 250)

  services:
    mon: 1 daemons, quorum node01 (age 66m)
    mgr: node01(active, since 65m)
    mds: 1/1 daemons up
    osd: 4 osds: 4 up (since 19m), 3 in (since 7s)
    rgw: 1 daemon active (1 hosts, 1 zones)

  data:
    volumes: 1/1 healthy
    pools:   7 pools, 289 pgs
    objects: 249 objects, 456 KiB
    usage:   170 MiB used, 480 GiB / 480 GiB avail
    pgs:     289 active+clean

  io:
    client:   0 B/s wr, 0 op/s rd, 0 op/s wr
    recovery: 1003 B/s, 1 keys/s, 36 objects/s


2024-05-03T04:01:27.510202+0000 mon.node01 [INF] Health check cleared: PG_AVAILABILITY (was: Reduced data availability: 7 pgs peering)
.....
.....

# after the status turns to [HEALTH_OK], disable the OSD service on the target node

root@node01:~#

ssh node04 "systemctl disable --now ceph-osd@3.service"

# remove the OSD from the cluster by specifying the target OSD ID

root@node01:~#

ceph osd purge 3 --yes-i-really-mean-it

purged osd.3

root@node01:~#

ceph -s

  cluster:
    id:     3666a474-14e0-4c5f-ad1e-daf2e30aed8f
    health: HEALTH_OK

  services:
    mon: 1 daemons, quorum node01 (age 70m)
    mgr: node01(active, since 69m)
    osd: 3 osds: 3 up (since 2m), 3 in (since 4m)
    rgw: 1 daemon active (1 hosts, 1 zones)

  data:
    pools:   5 pools, 129 pgs
    objects: 226 objects, 454 KiB
    usage:   175 MiB used, 480 GiB / 480 GiB avail
    pgs:     129 active+clean

Matched Content