validate_cluster.sh 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. #!/usr/bin/env bash
  2. # Copyright 2021 The Rook Authors. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  # Trace every command (-x), stop on the first unhandled failure (-e) and let
  # functions and subshells inherit a trap on ERR (-E).
  set -e -E -x
  # Daemons to validate: a non-empty DAEMON_TO_VALIDATE from the environment
  # wins, then the first argument, and finally "all".
  DAEMON_TO_VALIDATE=${DAEMON_TO_VALIDATE:-${1:-all}}
  # Number of OSDs, taken from the second argument (empty when not given).
  OSD_COUNT=${2}
  21. #############
  22. # FUNCTIONS #
  23. #############
  # Name(s) of the pod(s) labelled app=rook-ceph-tools in the rook-ceph
  # namespace. A failing lookup makes this assignment fail as well.
  toolbox_pod=$(kubectl get pod -l app=rook-ceph-tools -n rook-ceph -o jsonpath='{.items[*].metadata.name}')
  # Command line prefix that runs ceph inside that pod; the functions of this
  # script append the ceph sub-command to it.
  EXEC_COMMAND="kubectl -n rook-ceph exec ${toolbox_pod} -- ceph --connect-timeout 10"
  #######################################
  # Evaluate a command once per second until it succeeds or the attempts
  # are used up.
  # Globals:
  #   EXEC_COMMAND (read) - command prefix, run with `-s` for the failure report
  # Arguments:
  #   $1 - command line, evaluated with `eval` on every attempt
  #   $2 - optional number of attempts, one second apart (default: 90)
  # Outputs:
  #   When every attempt failed, writes "current status:" followed by the
  #   output of `$EXEC_COMMAND -s` to stderr. Nothing of its own goes to
  #   stdout, so callers that capture stdout do not swallow the report.
  # Returns:
  #   0 as soon as the command succeeds, 1 when all attempts failed
  #######################################
  function wait_for_daemon() {
    local probe_cmd=$1
    local remaining=${2:-90}
    while ((remaining > 0)); do
      # Quoted so the command line reaches eval unchanged (no word splitting
      # or globbing of patterns such as "up.*,").
      if eval "$probe_cmd"; then
        return 0
      fi
      sleep 1
      # A plain assignment always succeeds. `let` reports failure when the
      # result is 0, which aborts the script under `set -e`.
      remaining=$((remaining - 1))
    done
    echo "current status:" >&2
    # EXEC_COMMAND holds a whole command line and must be split into words.
    # A failing status command must not hide the timeout, hence `|| true`.
    # shellcheck disable=SC2086
    $EXEC_COMMAND -s >&2 || true
    return 1
  }
  #######################################
  # Wait until the output of `$EXEC_COMMAND -s` contains "quorum".
  # Globals:
  #   EXEC_COMMAND (read)
  # Outputs:
  #   Whatever wait_for_daemon prints.
  # Returns:
  #   The exit status of wait_for_daemon.
  #######################################
  function test_demo_mon {
    # Called directly so that its exit status is the function's status and
    # its output stays visible. `return $(wait_for_daemon ...)` handed the
    # captured output to `return` instead.
    wait_for_daemon "$EXEC_COMMAND -s | grep -sq quorum"
  }
  #######################################
  # Wait until the output of `$EXEC_COMMAND -s` contains "mgr:".
  # Globals:
  #   EXEC_COMMAND (read)
  # Outputs:
  #   Whatever wait_for_daemon prints.
  # Returns:
  #   The exit status of wait_for_daemon.
  #######################################
  function test_demo_mgr {
    # Called directly so that its exit status is the function's status and
    # its output stays visible. `return $(wait_for_daemon ...)` handed the
    # captured output to `return` instead.
    wait_for_daemon "$EXEC_COMMAND -s | grep -sq 'mgr:'"
  }
  #######################################
  # Wait until `$EXEC_COMMAND -s` reports OSD_COUNT osds, all up and in.
  # Globals:
  #   EXEC_COMMAND (read), OSD_COUNT (read)
  # Outputs:
  #   "Return value = <status>" on stdout, plus whatever wait_for_daemon
  #   prints; an error on stderr when OSD_COUNT is empty.
  # Returns:
  #   The exit status of wait_for_daemon, or 1 when OSD_COUNT is empty.
  #######################################
  function test_demo_osd {
    # With an empty count the pattern below can never match, so fail right
    # away instead of polling until the timeout.
    if [[ -z "$OSD_COUNT" ]]; then
      echo "ERROR: OSD_COUNT is empty, pass the expected number of OSDs as the second argument" >&2
      return 1
    fi
    local ret_val=0
    # `|| ret_val=$?` records the exit status (not the output) and keeps
    # `set -e` from ending the script before the debug line is printed.
    wait_for_daemon "$EXEC_COMMAND -s | grep -sq \"$OSD_COUNT osds: $OSD_COUNT up.*, $OSD_COUNT in.*\"" || ret_val=$?
    # debug info for an intermittent failure
    echo "Return value = $ret_val"
    return "$ret_val"
  }
  #######################################
  # Wait, for at most 360 seconds, until every pod labelled
  # app=rook-ceph-rgw in the rook-ceph namespace reports Ready=True.
  # Outputs:
  #   A progress line on stdout for every unsuccessful poll; the child
  #   bash runs with -x and therefore traces its commands on stderr.
  # Returns:
  #   0 once all rgw pods are ready, non-zero if the time limit is hit.
  #######################################
  function test_demo_rgw {
    # `timeout` supervises an external command, so the polling loop is fed to
    # a child bash on stdin. The quoted delimiter keeps the script literal.
    timeout 360 bash -x <<'EOF'
# Succeeds when at least one Ready status is reported and all of them are
# "True". The query prints one status per pod, so the whole string must
# not be compared to a single "True".
rgw_pods_ready() {
  local statuses status
  statuses=$(kubectl -n rook-ceph get pods -l app=rook-ceph-rgw -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}')
  [[ -n "$statuses" ]] || return 1
  for status in $statuses; do
    [[ "$status" == "True" ]] || return 1
  done
}
until rgw_pods_ready; do
  echo "waiting for rgw pods to be ready"
  sleep 5
done
EOF
  }
  #######################################
  # Wait until `$EXEC_COMMAND osd dump` mentions cephfs and
  # `$EXEC_COMMAND -s` contains "up".
  # Globals:
  #   EXEC_COMMAND (read)
  # Outputs:
  #   "Waiting for the MDS to be ready" plus whatever wait_for_daemon prints.
  # Returns:
  #   The exit status of wait_for_daemon.
  #######################################
  function test_demo_mds {
    echo "Waiting for the MDS to be ready"
    # NOTE: metadata server always takes up to 5 sec to run
    # so we first check if the pools exist, from that we assume that
    # the process will start. How long we keep polling is decided by
    # wait_for_daemon (90 attempts, one second apart).
    #
    # Called directly so that its exit status is the function's status and
    # its output stays visible. `return $(wait_for_daemon ...)` handed the
    # captured output to `return` instead.
    wait_for_daemon "$EXEC_COMMAND osd dump | grep -sq cephfs && $EXEC_COMMAND -s | grep -sq up"
  }
  #######################################
  # Wait until the output of `$EXEC_COMMAND -s` contains "rbd-mirror:".
  # Globals:
  #   EXEC_COMMAND (read)
  # Outputs:
  #   Whatever wait_for_daemon prints.
  # Returns:
  #   The exit status of wait_for_daemon.
  #######################################
  function test_demo_rbd_mirror {
    # Called directly so that its exit status is the function's status and
    # its output stays visible. `return $(wait_for_daemon ...)` handed the
    # captured output to `return` instead.
    wait_for_daemon "$EXEC_COMMAND -s | grep -sq 'rbd-mirror:'"
  }
  #######################################
  # Wait until the output of `$EXEC_COMMAND -s` contains "cephfs-mirror:".
  # Globals:
  #   EXEC_COMMAND (read)
  # Outputs:
  #   Whatever wait_for_daemon prints.
  # Returns:
  #   The exit status of wait_for_daemon.
  #######################################
  function test_demo_fs_mirror {
    # Called directly so that its exit status is the function's status and
    # its output stays visible. `return $(wait_for_daemon ...)` handed the
    # captured output to `return` instead.
    wait_for_daemon "$EXEC_COMMAND -s | grep -sq 'cephfs-mirror:'"
  }
  #######################################
  # Wait until the output of `$EXEC_COMMAND -s` contains "11 pools".
  # Globals:
  #   EXEC_COMMAND (read)
  # Outputs:
  #   Whatever wait_for_daemon prints.
  # Returns:
  #   The exit status of wait_for_daemon.
  #######################################
  function test_demo_pool {
    # Called directly so that its exit status is the function's status and
    # its output stays visible. `return $(wait_for_daemon ...)` handed the
    # captured output to `return` instead.
    wait_for_daemon "$EXEC_COMMAND -s | grep -sq '11 pools'"
  }
  #######################################
  # Wait, for at most 360 seconds, until the expected number of csi-* pods
  # is in phase Running, then optionally check the csi holder daemonsets.
  # Environment (read by the child bash, so the variables must be exported):
  #   IS_POD_NETWORK - when empty 6 csi pods are expected, otherwise 9
  #   IS_MULTUS      - when non-empty, eth0 must be listed in /proc/net/dev
  #                    of the rbd, cephfs and nfs holder daemonsets
  # Outputs:
  #   The two variables, progress lines and the grep results on stdout; the
  #   child bash runs with -x and therefore traces its commands on stderr.
  # Returns:
  #   0 on success, non-zero if the time limit is hit or a holder check fails.
  #######################################
  function test_csi {
    # `timeout` supervises an external command, so the script is fed to a
    # child bash on stdin. The quoted delimiter keeps the script literal.
    timeout 360 bash -x <<'EOF'
echo "$IS_POD_NETWORK"
echo "$IS_MULTUS"
if [ -z "$IS_POD_NETWORK" ]; then
  expected_csi_pods=6
  wait_message="waiting for csi pods to be ready"
else
  expected_csi_pods=9
  wait_message="waiting for csi pods to be ready with multus or pod networking"
fi
until [[ "$(kubectl -n rook-ceph get pods --field-selector=status.phase=Running | grep -c ^csi-)" -eq "$expected_csi_pods" ]]; do
  echo "$wait_message"
  sleep 5
done
if [ -n "$IS_MULTUS" ]; then
  echo "verifying csi holder interfaces (multus ones must be present)"
  for holder in csi-rbdplugin-holder-my-cluster csi-cephfsplugin-holder-my-cluster csi-nfsplugin-holder-my-cluster; do
    # This child shell does not run with errexit: without the explicit exit
    # only the last holder would decide the result.
    kubectl -n rook-ceph exec -t "ds/$holder" -- grep eth0 /proc/net/dev || exit 1
  done
fi
EOF
  }
  #######################################
  # Wait, for at most 360 seconds, until exactly one pod whose name starts
  # with rook-ceph-nfs- is in phase Running in the rook-ceph namespace.
  # Outputs:
  #   A progress line on stdout for every unsuccessful poll.
  # Returns:
  #   0 once the pod is running, non-zero if the time limit is hit.
  #######################################
  function test_nfs {
    # `timeout` supervises an external command, so the polling loop is fed to
    # a child bash on stdin. The quoted delimiter keeps the script literal.
    timeout 360 bash <<'EOF'
while true; do
  running_nfs_pods=$(kubectl -n rook-ceph get pods --field-selector=status.phase=Running | grep -c ^rook-ceph-nfs-)
  if [[ "$running_nfs_pods" -eq 1 ]]; then
    break
  fi
  echo "waiting for nfs pods to be ready"
  sleep 5
done
EOF
  }
  113. ########
  114. # MAIN #
  115. ########
  # The csi pods, the mons and the mgr are validated on every run, whatever
  # DAEMON_TO_VALIDATE asks for.
  test_csi
  test_demo_mon
  test_demo_mgr

  if [[ "$DAEMON_TO_VALIDATE" == "all" ]]; then
    daemons_list="osd mds rgw rbd_mirror fs_mirror nfs"
  else
    # change commas to space
    comma_to_space=${DAEMON_TO_VALIDATE//,/ }
    # transform to an array
    IFS=" " read -r -a array <<<"$comma_to_space"
    # sort and remove potential duplicate
    daemons_list=$(printf '%s\n' "${array[@]}" | sort -u | tr '\n' ' ')
  fi

  # daemons_list is a space separated list: splitting it into words is intended.
  for daemon in $daemons_list; do
    case "$daemon" in
      mon | mgr)
        # already validated unconditionally above
        continue
        ;;
      osd)
        test_demo_osd
        ;;
      mds)
        test_demo_mds
        ;;
      rgw)
        test_demo_rgw
        ;;
      rbd_mirror)
        test_demo_rbd_mirror
        ;;
      fs_mirror)
        test_demo_fs_mirror
        ;;
      nfs)
        test_nfs
        ;;
      *)
        # Reported with echo: this script defines no `log` command, so calling
        # it ended in "command not found" instead of printing the message.
        echo "ERROR: unknown daemon to validate!" >&2
        echo "Available daemon are: mon mgr osd mds rgw rbd_mirror fs_mirror nfs" >&2
        exit 1
        ;;
    esac
  done

  # Everything requested was validated: print the cluster state for the record.
  echo "Ceph is up and running, have a look!"
  # EXEC_COMMAND holds a whole command line and must be split into words.
  # shellcheck disable=SC2086
  $EXEC_COMMAND -s
  kubectl -n rook-ceph get pods
  kubectl -n rook-ceph logs "$(kubectl -n rook-ceph -l app=rook-ceph-operator get pods -o jsonpath='{.items[*].metadata.name}')"
  kubectl -n rook-ceph get cephcluster -o yaml