status.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
  1. /*
  2. Copyright 2016 The Rook Authors. All rights reserved.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package client
  14. import (
  15. "encoding/json"
  16. "fmt"
  17. "regexp"
  18. "github.com/pkg/errors"
  19. "github.com/rook/rook/pkg/clusterd"
  20. )
// Overall Ceph health strings. isCephHealthy in this file matches
// HealthStatus.Status against the same OK and WARN values.
const (
	// CephHealthOK denotes the status of ceph cluster when healthy.
	CephHealthOK = "HEALTH_OK"
	// CephHealthWarn denotes the status of ceph cluster when unhealthy but recovering.
	CephHealthWarn = "HEALTH_WARN"
	// CephHealthErr denotes the status of ceph cluster when unhealthy but usually needs
	// manual intervention.
	CephHealthErr = "HEALTH_ERR"
)
// Placement group (PG) state names and the default pattern that describes a
// healthy PG state.
const (
	activeClean              = "active+clean"
	activeCleanScrubbing     = "active+clean+scrubbing"
	activeCleanScrubbingDeep = "active+clean+scrubbing+deep"
	// defaultPgHealthyRegex matches exactly one of the three state names above.
	// It is anchored at both ends, so a state carrying any other suffix does not match.
	defaultPgHealthyRegex = `^(active\+clean|active\+clean\+scrubbing|active\+clean\+scrubbing\+deep)$`
)
var (
	// defaultPgHealthyRegexCompiled is defaultPgHealthyRegex compiled once at package
	// initialization. IsClusterClean uses it when the caller passes an empty pattern.
	defaultPgHealthyRegexCompiled = regexp.MustCompile(defaultPgHealthyRegex)
)
// CephStatus is the structure that Status and StatusWithUser decode the JSON
// output of the ceph "status" command into.
//
// Within this file, Health is read by isCephHealthy, PgMap by isClusterClean
// and Fsmap by getMDSRank and MdsActiveOrStandbyReplay.
type CephStatus struct {
	Health        HealthStatus `json:"health"`
	FSID          string       `json:"fsid"`
	ElectionEpoch int          `json:"election_epoch"`
	Quorum        []int        `json:"quorum"`
	QuorumNames   []string     `json:"quorum_names"`
	MonMap        MonMap       `json:"monmap"`
	OsdMap        OsdMap       `json:"osdmap"`
	PgMap         PgMap        `json:"pgmap"`
	MgrMap        MgrMap       `json:"mgrmap"`
	Fsmap         Fsmap        `json:"fsmap"`
}
// HealthStatus is the "health" section of CephStatus.
//
// Status is the overall health string that isCephHealthy compares against
// "HEALTH_OK" and "HEALTH_WARN". Checks holds the members of the "checks"
// JSON object, keyed by their member names.
type HealthStatus struct {
	Status string                  `json:"status"`
	Checks map[string]CheckMessage `json:"checks"`
}
// CheckMessage is one value of HealthStatus.Checks: a severity string and a
// summary message.
type CheckMessage struct {
	Severity string  `json:"severity"`
	Summary  Summary `json:"summary"`
}
// Summary wraps the "message" text of a CheckMessage.
type Summary struct {
	Message string `json:"message"`
}
// MonMap is the "monmap" section of CephStatus.
//
// CreatedTime and ModifiedTime are kept as the raw strings found under the
// "created" and "modified" keys; they are not parsed into time values here.
// MonMapEntry is declared outside the part of the file shown here.
type MonMap struct {
	Epoch        int           `json:"epoch"`
	NumMons      int           `json:"num_mons"`
	FSID         string        `json:"fsid"`
	CreatedTime  string        `json:"created"`
	ModifiedTime string        `json:"modified"`
	Mons         []MonMapEntry `json:"mons"`
}
// MgrStat is a summary of manager (mgr) state decoded from JSON: the map
// epoch, whether a manager is available, the active manager's name and the
// number of standbys.
//
// NOTE(review): nothing in the visible part of this file references this type;
// presumably it decodes the output of a mgr stat query — confirm against callers.
type MgrStat struct {
	Epoch      int    `json:"epoch"`
	Available  bool   `json:"available"`
	ActiveName string `json:"active_name"`
	NumStandby int    `json:"num_standby"`
}
// MgrMap is the "mgrmap" section of CephStatus: the active manager (GID, name,
// address), whether a manager is available, and the list of standbys.
type MgrMap struct {
	Epoch      int          `json:"epoch"`
	ActiveGID  int          `json:"active_gid"`
	ActiveName string       `json:"active_name"`
	ActiveAddr string       `json:"active_addr"`
	Available  bool         `json:"available"`
	Standbys   []MgrStandby `json:"standbys"`
}
// MgrStandby is one entry of MgrMap.Standbys, identified by GID and name.
type MgrStandby struct {
	GID  int    `json:"gid"`
	Name string `json:"name"`
}
// OsdMap is the "osdmap" section of CephStatus: the OSD counts (total, up,
// in), the full and nearfull flags, and the number of remapped PGs.
type OsdMap struct {
	Epoch          int  `json:"epoch"`
	NumOsd         int  `json:"num_osds"`
	NumUpOsd       int  `json:"num_up_osds"`
	NumInOsd       int  `json:"num_in_osds"`
	Full           bool `json:"full"`
	NearFull       bool `json:"nearfull"`
	NumRemappedPgs int  `json:"num_remapped_pgs"`
}
// PgMap is the "pgmap" section of CephStatus: placement group counts per
// state, capacity figures and I/O, recovery and cache rate counters.
//
// isClusterClean compares NumPgs with the sum of the Count values of the
// PgsByState entries whose state name matches the healthy pattern.
type PgMap struct {
	PgsByState            []PgStateEntry `json:"pgs_by_state"`
	Version               int            `json:"version"`
	NumPgs                int            `json:"num_pgs"`
	DataBytes             uint64         `json:"data_bytes"`
	UsedBytes             uint64         `json:"bytes_used"`
	AvailableBytes        uint64         `json:"bytes_avail"`
	TotalBytes            uint64         `json:"bytes_total"`
	ReadBps               uint64         `json:"read_bytes_sec"`
	WriteBps              uint64         `json:"write_bytes_sec"`
	ReadOps               uint64         `json:"read_op_per_sec"`
	WriteOps              uint64         `json:"write_op_per_sec"`
	RecoveryBps           uint64         `json:"recovering_bytes_per_sec"`
	RecoveryObjectsPerSec uint64         `json:"recovering_objects_per_sec"`
	RecoveryKeysPerSec    uint64         `json:"recovering_keys_per_sec"`
	CacheFlushBps         uint64         `json:"flush_bytes_sec"`
	CacheEvictBps         uint64         `json:"evict_bytes_sec"`
	// NOTE(review): despite the "Bps" suffix, this field is decoded from the
	// "promote_op_per_sec" key, which reads as operations rather than bytes — confirm.
	CachePromoteBps uint64 `json:"promote_op_per_sec"`
}
// PgStateEntry is one entry of PgMap.PgsByState: a PG state name and the
// number of PGs reported in that state.
type PgStateEntry struct {
	StateName string `json:"state_name"`
	Count     int    `json:"count"`
}
// Fsmap is a struct representing the filesystem map (the "fsmap" section of
// CephStatus).
//
// ByRank lists one entry per reported MDS. getMDSRank looks entries up by
// Name, and MdsActiveOrStandbyReplay inspects the Status of the entry found.
// UpStandby is decoded from the "up:standby" key; getMDSRank treats a value
// below 1 as "no standby MDS available".
type Fsmap struct {
	Epoch  int `json:"epoch"`
	ID     int `json:"id"`
	Up     int `json:"up"`
	In     int `json:"in"`
	Max    int `json:"max"`
	ByRank []struct {
		FilesystemID int    `json:"filesystem_id"`
		Rank         int    `json:"rank"`
		Name         string `json:"name"`
		Status       string `json:"status"`
		Gid          int    `json:"gid"`
	} `json:"by_rank"`
	UpStandby int `json:"up:standby"`
}
  136. func Status(context *clusterd.Context, clusterInfo *ClusterInfo) (CephStatus, error) {
  137. args := []string{"status"}
  138. cmd := NewCephCommand(context, clusterInfo, args)
  139. buf, err := cmd.Run()
  140. if err != nil {
  141. return CephStatus{}, errors.Wrapf(err, "failed to get status. %s", string(buf))
  142. }
  143. var status CephStatus
  144. if err := json.Unmarshal(buf, &status); err != nil {
  145. return CephStatus{}, errors.Wrap(err, "failed to unmarshal status response")
  146. }
  147. return status, nil
  148. }
  149. func StatusWithUser(context *clusterd.Context, clusterInfo *ClusterInfo) (CephStatus, error) {
  150. args := []string{"status", "--format", "json"}
  151. command, args := FinalizeCephCommandArgs("ceph", clusterInfo, args, context.ConfigDir)
  152. buf, err := context.Executor.ExecuteCommandWithOutput(command, args...)
  153. if err != nil {
  154. if buf != "" {
  155. return CephStatus{}, errors.Wrapf(err, "failed to get status. %s", string(buf))
  156. }
  157. return CephStatus{}, errors.Wrap(err, "failed to get ceph status")
  158. }
  159. var status CephStatus
  160. if err := json.Unmarshal([]byte(buf), &status); err != nil {
  161. return CephStatus{}, errors.Wrap(err, "failed to unmarshal status response")
  162. }
  163. return status, nil
  164. }
  165. // IsClusterClean returns msg (string), clean (bool), err (error)
  166. // msg describes the state of the PGs
  167. // clean is true if the cluster is clean
  168. // err is not nil if getting the status failed.
  169. func IsClusterClean(context *clusterd.Context, clusterInfo *ClusterInfo, pgHealthyRegex string) (string, bool, error) {
  170. status, err := Status(context, clusterInfo)
  171. if err != nil {
  172. return "unable to get PG health", false, err
  173. }
  174. pgHealthyRegexCompiled := defaultPgHealthyRegexCompiled
  175. if pgHealthyRegex != "" {
  176. pgHealthyRegexCompiled, err = regexp.Compile(pgHealthyRegex)
  177. if err != nil {
  178. return "unable to compile pgHealthyRegex", false, err
  179. }
  180. }
  181. msg, clean := isClusterClean(status, pgHealthyRegexCompiled)
  182. if !clean {
  183. return msg, false, nil
  184. }
  185. return msg, true, nil
  186. }
  187. // IsClusterCleanError returns an error indicating if the cluster is fully clean yet (i.e., all placement
  188. // groups are in the active+clean state). It returns nil if the cluster is clean.
  189. // Using IsClusterClean is recommended if you want to differentiate between a failure of the status query and
  190. // an unclean cluster.
  191. func IsClusterCleanError(context *clusterd.Context, clusterInfo *ClusterInfo, pgHealthyRegex string) error {
  192. msg, clean, err := IsClusterClean(context, clusterInfo, pgHealthyRegex)
  193. if err != nil {
  194. return err
  195. }
  196. if !clean {
  197. return errors.New(msg)
  198. }
  199. return nil
  200. }
  201. func isClusterClean(status CephStatus, pgHealthyRegex *regexp.Regexp) (string, bool) {
  202. if status.PgMap.NumPgs == 0 {
  203. // there are no PGs yet, that still counts as clean
  204. return "cluster has no PGs", true
  205. }
  206. cleanPGs := 0
  207. for _, pg := range status.PgMap.PgsByState {
  208. if pgHealthyRegex.MatchString(pg.StateName) {
  209. cleanPGs += pg.Count
  210. }
  211. }
  212. if cleanPGs == status.PgMap.NumPgs {
  213. // all PGs in the cluster are in a clean state
  214. logger.Debugf("all placement groups have reached a clean state: %+v", status.PgMap.PgsByState)
  215. return "all PGs in cluster are clean", true
  216. }
  217. return fmt.Sprintf("cluster is not fully clean. PGs: %+v", status.PgMap.PgsByState), false
  218. }
  219. // getMDSRank returns the rank of a given MDS
  220. func getMDSRank(status CephStatus, fsName string) (int, error) {
  221. // dummy rank
  222. mdsRank := -1000
  223. for r := range status.Fsmap.ByRank {
  224. if status.Fsmap.ByRank[r].Name == fsName {
  225. mdsRank = r
  226. }
  227. }
  228. // if the mds is not shown in the map one reason might be because it's in standby
  229. // if not in standby there is something else going wrong
  230. if mdsRank < 0 && status.Fsmap.UpStandby < 1 {
  231. // it might seem strange to log an error since this could be a warning too
  232. // it is a warning until we reach the timeout, this should give enough time to the mds to transition its state
  233. // after the timeout we consider that the mds might be gone or the timeout was not long enough...
  234. return mdsRank, errors.Errorf("mds %s not found in fsmap, this likely means mdss are transitioning between active and standby states", fsName)
  235. }
  236. return mdsRank, nil
  237. }
  238. // MdsActiveOrStandbyReplay returns whether a given MDS is active or in standby
  239. func MdsActiveOrStandbyReplay(context *clusterd.Context, clusterInfo *ClusterInfo, fsName string) error {
  240. status, err := Status(context, clusterInfo)
  241. if err != nil {
  242. return errors.Wrap(err, "failed to get ceph status")
  243. }
  244. mdsRank, err := getMDSRank(status, fsName)
  245. if err != nil {
  246. return errors.Cause(err)
  247. }
  248. // this MDS is in standby so let's return immediately
  249. if mdsRank < 0 {
  250. logger.Infof("mds %s is in standby, nothing to check", fsName)
  251. return nil
  252. }
  253. if status.Fsmap.ByRank[mdsRank].Status == "up:active" || status.Fsmap.ByRank[mdsRank].Status == "up:standby-replay" || status.Fsmap.ByRank[mdsRank].Status == "up:standby" {
  254. logger.Infof("mds %s is %s", fsName, status.Fsmap.ByRank[mdsRank].Status)
  255. return nil
  256. }
  257. return errors.Errorf("mds %s is %s, bad state", fsName, status.Fsmap.ByRank[mdsRank].Status)
  258. }
  259. // IsCephHealthy verifies Ceph is healthy, useful when performing an upgrade
  260. // check if it's a minor or major upgrade... too!
  261. func IsCephHealthy(context *clusterd.Context, clusterInfo *ClusterInfo) bool {
  262. cephStatus, err := Status(context, clusterInfo)
  263. if err != nil {
  264. logger.Errorf("failed to detect if Ceph is healthy. failed to get ceph status. %v", err)
  265. return false
  266. }
  267. return isCephHealthy(cephStatus)
  268. }
  269. func isCephHealthy(status CephStatus) bool {
  270. s := status.Health.Status
  271. if s == "HEALTH_WARN" || s == "HEALTH_OK" {
  272. return true
  273. }
  274. return false
  275. }