auto-grow-storage.sh (9.8 KB)
  1. #!/usr/bin/env bash
  2. #############
  3. # FUNCTIONS #
  4. #############
  5. function calculateSize() {
  6. local currentsize=$2
  7. local unit=$1
  8. rawsizeValue=0 # rawsizeValue is a global variable
  9. if [[ "$currentsize" == *"Mi" ]]
  10. then
  11. rawSize=${currentsize//Mi} # rawSize is a global variable
  12. unitSize="Mi"
  13. rawsizeValue=$rawSize
  14. elif [[ "$currentsize" == *"Gi" ]]
  15. then
  16. rawSize=${currentsize//Gi}
  17. unitSize="Gi"
  18. rawsizeValue=$(( rawSize * 1000 ))
  19. elif [[ "$currentsize" == *"Ti" ]]
  20. then
  21. rawSize=${currentsize//Ti}
  22. unitSize="Ti"
  23. rawsizeValue=$(( rawSize * 1000000 ))
  24. else
  25. echo "Unknown unit of $unit : ${currentsize}"
  26. echo "Supported units are 'Mi','Gi','Ti'"
  27. exit 1
  28. fi
  29. }
  30. function compareSizes() {
  31. local newsize=$1
  32. local maxsize=$2
  33. calculateSize newsize "${newsize}" # rawsizeValue is calculated and used for further process
  34. local newsize=$rawsizeValue
  35. calculateSize maxsize "${maxsize}"
  36. local maxsize=$rawsizeValue
  37. if [ "${newsize}" -ge "${maxsize}" ]
  38. then
  39. return "1"
  40. fi
  41. return "0"
  42. }
  43. function growVertically() {
  44. local growRate=$1
  45. local pvc=$2
  46. local ns=$3
  47. local maxSize=$4
  48. local currentSize
  49. currentSize=$(kubectl get pvc "${pvc}" -n "${ns}" -o json | jq -r '.spec.resources.requests.storage')
  50. echo "PVC(OSD) current size is ${currentSize} and will be increased by ${growRate}%."
  51. calculateSize "${pvc}" "${currentSize}" # rawSize is calculated and used for further process
  52. if ! [[ "${rawSize}" =~ ^[0-9]+$ ]]
  53. then
  54. echo "disk size should be an integer"
  55. else
  56. newSize=$(echo "${rawSize}+(${rawSize} * ${growRate})/100" | bc | cut -f1 -d'.')
  57. if [ "${newSize}" = "${rawSize}" ]
  58. then
  59. newSize=$(( rawSize + 1 ))
  60. echo "New adjusted calculated size for the PVC is ${newSize}${unitSize}"
  61. else
  62. echo "New calculated size for the PVC is ${newSize}${unitSize}"
  63. fi
  64. compareSizes ${newSize}${unitSize} "${maxSize}"
  65. if [ "1" = $? ]
  66. then
  67. newSize=${maxSize}
  68. echo "Disk has reached it's MAX capacity ${maxSize}, add a new disk to it"
  69. result=$(kubectl patch pvc "${pvc}" -n "${ns}" --type json --patch "[{ op: replace, path: /spec/resources/requests/storage, value: ${newSize} }]")
  70. else
  71. result=$(kubectl patch pvc "${pvc}" -n "${ns}" --type json --patch "[{ op: replace, path: /spec/resources/requests/storage, value: ${newSize}${unitSize} }]")
  72. fi
  73. echo "${result}"
  74. fi
  75. }
#######################################
# Scale the cluster horizontally by raising the OSD count of the
# storageClassDeviceSet that owns the given PVC, capped at a maximum.
# Arguments:
#   $1 - number of OSDs to add
#   $2 - PVC name (used to look up its DeviceSet label)
#   $3 - namespace
#   $4 - maximum OSD count allowed for the device set
# Outputs: progress messages and the kubectl patch result on stdout.
#######################################
function growHorizontally() {
  local increaseOSDCount=$1
  local pvc=$2
  local ns=$3
  local maxOSDCount=$4
  local deviceSetName
  local cluster=""
  local deviceSet=""
  local currentOSDCount=0
  local clusterCount=0
  local deviceSetCount=0
  # The PVC carries a label naming the device set it belongs to.
  deviceSetName=$(kubectl get pvc "${pvc}" -n "${ns}" -o json | jq -r '.metadata.labels."ceph.rook.io/DeviceSet"')
  # Walk every CephCluster in the namespace; jq prints "null" once the
  # index runs past the end of the items array, which ends the loop.
  while [ "$cluster" != "null" ]
  do
    cluster=$(kubectl get CephCluster -n "${ns}" -o json | jq -r ".items[${clusterCount}]")
    # Walk every device set of the current cluster the same way.
    while [ "$deviceSet" != "null" ]
    do
      deviceSet=$(kubectl get CephCluster -n "${ns}" -o json | jq -r ".items[${clusterCount}].spec.storage.storageClassDeviceSets[${deviceSetCount}].name")
      if [[ $deviceSet == "${deviceSetName}" ]]
      then
        currentOSDCount=$(kubectl get CephCluster -n "${ns}" -o json | jq -r ".items[${clusterCount}].spec.storage.storageClassDeviceSets[${deviceSetCount}].count")
        finalCount=$(( "${currentOSDCount}" + "${increaseOSDCount}" ))
        echo "OSD count: ${currentOSDCount}. OSD count will be increased by ${increaseOSDCount}."
        # Clamp the new count to the configured maximum.
        if [ "${finalCount}" -ge "${maxOSDCount}" ]
        then
          finalCount=${maxOSDCount}
          echo "DeviceSet ${deviceSet} capacity is full, cannot add more OSD to it"
        fi
        echo "Total count of OSDs for deviceset ${deviceSetName} is set to ${finalCount}."
        clusterName=$(kubectl get CephCluster -n "${ns}" -o json | jq -r ".items[${clusterCount}].metadata.name" )
        result=$(kubectl patch CephCluster "${clusterName}" -n "${ns}" --type json --patch "[{ op: replace, path: /spec/storage/storageClassDeviceSets/${deviceSetCount}/count, value: ${finalCount} }]")
        echo "${result}"
        # Matching device set found and patched; stop scanning this cluster.
        break
      fi
      deviceSetCount=$((deviceSetCount+1))
      deviceSet=$(kubectl get CephCluster -n "${ns}" -o json | jq -r ".items[${clusterCount}].spec.storage.storageClassDeviceSets[${deviceSetCount}].name")
    done
    clusterCount=$((clusterCount+1))
    cluster=$(kubectl get CephCluster -n "${ns}" -o json | jq -r ".items[${clusterCount}]")
  done
}
#######################################
# Poll Prometheus forever and react to Ceph OSD near-full / full alerts
# by growing storage horizontally or vertically.
# Arguments:
#   $1 - mode: "count" grows horizontally; any other value grows vertically
#   $2 - growth amount (OSD count or percent, depending on mode)
#   $3 - maximum (OSD count or size string, depending on mode)
# Never returns; exits 1 on missing alert manager or non-PVC cluster.
#######################################
function growOSD(){
  itr=0
  # Prometheus is reachable via the node hosting prometheus-rook-prometheus-0
  # on NodePort 30900.
  alertmanagerroute=$(kubectl -n rook-ceph -o jsonpath="{.status.hostIP}" get pod prometheus-rook-prometheus-0)
  route=${alertmanagerroute}:30900
  toolbox=$(kubectl get pods -n rook-ceph | grep -i rook-ceph-tools | awk '{ print $1 }')
  # Query the alerts endpoint from inside the toolbox pod.
  alerts=$(kubectl exec -it "${toolbox}" -n rook-ceph -- bash -c "curl -s http://${route}/api/v1/alerts")
  export total_alerts
  total_alerts=$( jq '.data.alerts | length' <<< "${alerts}")
  echo "Looping at $(date +"%Y-%m-%d %H:%M:%S")"
  while true
  do
    # Empty length means the curl produced no parsable alert payload.
    if [ "${total_alerts}" == "" ]
    then
      echo "Alert manager not configured,re-run the script"
      exit 1
    fi
    export entry
    entry=$( jq ".data.alerts[$itr]" <<< "${alerts}")
    thename=$(echo "${entry}" | jq -r '.labels.alertname')
    if [ "${thename}" = "CephOSDNearFull" ] || [ "${thename}" = "CephOSDCriticallyFull" ]
    then
      echo "${entry}"
      ns=$(echo "${entry}" | jq -r '.labels.namespace')
      osdID=$(echo "${entry}" | jq -r '.labels.ceph_daemon')
      # "osd.3" -> "osd-3", matching the OSD deployment name suffix.
      osdID=${osdID/./-}
      pvc=$(kubectl get deployment -n "${ns}" rook-ceph-"${osdID}" -o json | jq -r '.metadata.labels."ceph.rook.io/pvc"')
      if [[ $pvc == null ]]
      then
        echo "PVC not found, script can only run on PVC-based cluster"
        exit 1
      fi
      echo "Processing NearFull or Full alert for PVC ${pvc} in namespace ${ns}"
      if [[ $1 == "count" ]]
      then
        growHorizontally "$2" "${pvc}" "${ns}" "$3"
      else
        growVertically "$2" "${pvc}" "${ns}" "$3"
      fi
    fi
    (( itr = itr + 1 ))
    # All fetched alerts processed (or none found): sleep 10 minutes,
    # re-fetch the alert list and start over from index 0.
    if [[ "${itr}" == "${total_alerts}" ]] || [[ "${total_alerts}" == "0" ]]
    then
      sleep 600
      alerts=$(kubectl exec -it "${toolbox}" -n rook-ceph -- bash -c "curl -s http://${route}/api/v1/alerts")
      total_alerts=$( jq '.data.alerts | length' <<< "${alerts}")
      itr=0
      echo "Looping at $(date +"%Y-%m-%d %H:%M:%S")"
    fi
  done
}
  167. function creatingPrerequisites(){
  168. echo "creating Prerequisites deployments - Prometheus Operator and Prometheus Instances"
  169. # creating Prometheus operator
  170. kubectl apply -f https://raw.githubusercontent.com/coreos/prometheus-operator/v0.40.0/bundle.yaml
  171. # waiting for Prometheus operator to get ready
  172. timeout 30 sh -c "until [ $(kubectl get pod -l app.kubernetes.'io/name'=prometheus-operator -o json | jq -r '.items[0].status.phase') = Running ]; do echo 'waiting for prometheus-operator to get created' && sleep 1; done"
  173. # creating a service monitor that will watch the Rook cluster and collect metrics regularly
  174. kubectl create -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/monitoring/service-monitor.yaml
  175. # create the PrometheusRule for Rook alerts.
  176. kubectl create -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/monitoring/prometheus-ceph-v14-rules.yaml
  177. # create prometheus-rook-prometheus-0 pod
  178. kubectl create -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/monitoring/prometheus.yaml
  179. # create prometheus-service
  180. kubectl create -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/monitoring/prometheus-service.yaml
  181. # waiting for prometheus-rook-prometheus-0 pod to get ready
  182. timeout 60 sh -c "until [ $(kubectl get pod -l prometheus=rook-prometheus -nrook-ceph -o json | jq -r '.items[0].status.phase') = Running ]; do echo 'waiting for prometheus-rook-prometheus-0 pod to get created' && sleep 1; done"
  183. if [ "$(kubectl get pod -l prometheus=rook-prometheus -nrook-ceph)" == "" ]
  184. then
  185. echo "prometheus-rook-prometheus-0 pod not created, re-run the script"
  186. exit 1
  187. fi
  188. echo "Prerequisites deployments created"
  189. }
  190. function invalidCall(){
  191. echo " $0 [command]
  192. Available Commands for normal cluster:
  193. ./auto-grow-storage.sh count --max maxCount --count rate Scale horizontally by adding more OSDs to the cluster
  194. ./auto-grow-storage.sh size --max maxSize --growth-rate percent Scale vertically by increasing the size of existing OSDs
  195. " >&2
  196. }
  197. case "${1:-}" in
  198. count)
  199. if [[ $# -ne 5 ]]; then
  200. echo "incorrect command to run the script"
  201. invalidCall
  202. exit 1
  203. fi
  204. max=$3
  205. count=$5
  206. if ! [[ "${max}" =~ ^[0-9]+$ ]]
  207. then
  208. echo "maxCount should be an integer"
  209. invalidCall
  210. exit 1
  211. fi
  212. if ! [[ "${count}" =~ ^[0-9]+$ ]]
  213. then
  214. echo "rate should be an integer"
  215. invalidCall
  216. exit 1
  217. fi
  218. creatingPrerequisites
  219. echo "Adding on nearfull and full alert and number of OSD to add is ${count}"
  220. growOSD count "${count}" "${max}"
  221. ;;
  222. size)
  223. if [[ $# -ne 5 ]]; then
  224. echo "incorrect command to run the script"
  225. invalidCall
  226. exit 1
  227. fi
  228. max=$3
  229. growRate=$5
  230. if [[ "${max}" =~ ^[0-9]+$ ]]
  231. then
  232. echo "maxSize should be an string"
  233. invalidCall
  234. exit 1
  235. fi
  236. if ! [[ "${growRate}" =~ ^[0-9]+$ ]]
  237. then
  238. echo "growth-rate should be an integer"
  239. invalidCall
  240. exit 1
  241. fi
  242. creatingPrerequisites
  243. echo "Resizing on nearfull and full alert and Expansion percentage set to ${growRate}%"
  244. growOSD size "${growRate}" "${max}"
  245. ;;
  246. *)
  247. invalidCall
  248. ;;
  249. esac