123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322 |
- /*
- Copyright 2016 The Rook Authors. All rights reserved.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
- package client
- import (
- "encoding/json"
- "fmt"
- "regexp"
- "github.com/pkg/errors"
- "github.com/rook/rook/pkg/clusterd"
- )
// Overall health strings that isCephHealthy compares against the
// "health.status" value decoded from the cluster status.
const (
	// CephHealthOK denotes the status of ceph cluster when healthy.
	CephHealthOK = "HEALTH_OK"

	// CephHealthWarn denotes the status of ceph cluster when unhealthy but
	// recovering.
	CephHealthWarn = "HEALTH_WARN"

	// CephHealthErr denotes the status of ceph cluster when unhealthy but
	// usually needs manual intervention.
	CephHealthErr = "HEALTH_ERR"
)
// Placement group (PG) state names that count as healthy by default.
const (
	activeClean              = "active+clean"
	activeCleanScrubbing     = "active+clean+scrubbing"
	activeCleanScrubbingDeep = "active+clean+scrubbing+deep"

	// defaultPgHealthyRegex is an anchored pattern that matches exactly the
	// three state names above and nothing else.
	defaultPgHealthyRegex = `^(active\+clean|active\+clean\+scrubbing|active\+clean\+scrubbing\+deep)$`
)

// defaultPgHealthyRegexCompiled is defaultPgHealthyRegex compiled once at
// package initialization. IsClusterClean falls back to it when the caller
// passes an empty pattern.
var defaultPgHealthyRegexCompiled = regexp.MustCompile(defaultPgHealthyRegex)
- type CephStatus struct {
- Health HealthStatus `json:"health"`
- FSID string `json:"fsid"`
- ElectionEpoch int `json:"election_epoch"`
- Quorum []int `json:"quorum"`
- QuorumNames []string `json:"quorum_names"`
- MonMap MonMap `json:"monmap"`
- OsdMap OsdMap `json:"osdmap"`
- PgMap PgMap `json:"pgmap"`
- MgrMap MgrMap `json:"mgrmap"`
- Fsmap Fsmap `json:"fsmap"`
- }
- type HealthStatus struct {
- Status string `json:"status"`
- Checks map[string]CheckMessage `json:"checks"`
- }
- type CheckMessage struct {
- Severity string `json:"severity"`
- Summary Summary `json:"summary"`
- }
// Summary wraps the "message" text of a CheckMessage.
type Summary struct {
	Message string `json:"message"`
}
- type MonMap struct {
- Epoch int `json:"epoch"`
- NumMons int `json:"num_mons"`
- FSID string `json:"fsid"`
- CreatedTime string `json:"created"`
- ModifiedTime string `json:"modified"`
- Mons []MonMapEntry `json:"mons"`
- }
// MgrStat is a compact manager summary: map epoch, availability flag, name
// of the active manager and number of standbys.
// NOTE(review): nothing in the visible part of this file references MgrStat;
// confirm where it is decoded before changing it.
type MgrStat struct {
	Epoch      int    `json:"epoch"`
	Available  bool   `json:"available"`
	ActiveName string `json:"active_name"`
	NumStandby int    `json:"num_standby"`
}
- type MgrMap struct {
- Epoch int `json:"epoch"`
- ActiveGID int `json:"active_gid"`
- ActiveName string `json:"active_name"`
- ActiveAddr string `json:"active_addr"`
- Available bool `json:"available"`
- Standbys []MgrStandby `json:"standbys"`
- }
// MgrStandby is one element of MgrMap.Standbys, identified by GID and name.
type MgrStandby struct {
	GID  int    `json:"gid"`
	Name string `json:"name"`
}
// OsdMap is the "osdmap" section of CephStatus: OSD counts (total, up, in),
// the full/nearfull flags and the number of remapped PGs.
type OsdMap struct {
	Epoch          int  `json:"epoch"`
	NumOsd         int  `json:"num_osds"`
	NumUpOsd       int  `json:"num_up_osds"`
	NumInOsd       int  `json:"num_in_osds"`
	Full           bool `json:"full"`
	NearFull       bool `json:"nearfull"`
	NumRemappedPgs int  `json:"num_remapped_pgs"`
}
- type PgMap struct {
- PgsByState []PgStateEntry `json:"pgs_by_state"`
- Version int `json:"version"`
- NumPgs int `json:"num_pgs"`
- DataBytes uint64 `json:"data_bytes"`
- UsedBytes uint64 `json:"bytes_used"`
- AvailableBytes uint64 `json:"bytes_avail"`
- TotalBytes uint64 `json:"bytes_total"`
- ReadBps uint64 `json:"read_bytes_sec"`
- WriteBps uint64 `json:"write_bytes_sec"`
- ReadOps uint64 `json:"read_op_per_sec"`
- WriteOps uint64 `json:"write_op_per_sec"`
- RecoveryBps uint64 `json:"recovering_bytes_per_sec"`
- RecoveryObjectsPerSec uint64 `json:"recovering_objects_per_sec"`
- RecoveryKeysPerSec uint64 `json:"recovering_keys_per_sec"`
- CacheFlushBps uint64 `json:"flush_bytes_sec"`
- CacheEvictBps uint64 `json:"evict_bytes_sec"`
- CachePromoteBps uint64 `json:"promote_op_per_sec"`
- }
// PgStateEntry is one element of PgMap.PgsByState: a PG state name and the
// number of PGs currently in that state.
type PgStateEntry struct {
	StateName string `json:"state_name"`
	Count     int    `json:"count"`
}
// Fsmap is the "fsmap" (filesystem map) section of CephStatus.
//
// ByRank holds the entries found under "by_rank"; getMDSRank searches it by
// Name and MdsActiveOrStandbyReplay reads the Status of the matching entry.
// UpStandby is decoded from the "up:standby" key; getMDSRank treats a value
// below 1 as "no standby available".
type Fsmap struct {
	Epoch  int `json:"epoch"`
	ID     int `json:"id"`
	Up     int `json:"up"`
	In     int `json:"in"`
	Max    int `json:"max"`
	ByRank []struct {
		FilesystemID int    `json:"filesystem_id"`
		Rank         int    `json:"rank"`
		Name         string `json:"name"`
		Status       string `json:"status"`
		Gid          int    `json:"gid"`
	} `json:"by_rank"`
	UpStandby int `json:"up:standby"`
}
- func Status(context *clusterd.Context, clusterInfo *ClusterInfo) (CephStatus, error) {
- args := []string{"status"}
- cmd := NewCephCommand(context, clusterInfo, args)
- buf, err := cmd.Run()
- if err != nil {
- return CephStatus{}, errors.Wrapf(err, "failed to get status. %s", string(buf))
- }
- var status CephStatus
- if err := json.Unmarshal(buf, &status); err != nil {
- return CephStatus{}, errors.Wrap(err, "failed to unmarshal status response")
- }
- return status, nil
- }
- func StatusWithUser(context *clusterd.Context, clusterInfo *ClusterInfo) (CephStatus, error) {
- args := []string{"status", "--format", "json"}
- command, args := FinalizeCephCommandArgs("ceph", clusterInfo, args, context.ConfigDir)
- buf, err := context.Executor.ExecuteCommandWithOutput(command, args...)
- if err != nil {
- if buf != "" {
- return CephStatus{}, errors.Wrapf(err, "failed to get status. %s", string(buf))
- }
- return CephStatus{}, errors.Wrap(err, "failed to get ceph status")
- }
- var status CephStatus
- if err := json.Unmarshal([]byte(buf), &status); err != nil {
- return CephStatus{}, errors.Wrap(err, "failed to unmarshal status response")
- }
- return status, nil
- }
- // IsClusterClean returns msg (string), clean (bool), err (error)
- // msg describes the state of the PGs
- // clean is true if the cluster is clean
- // err is not nil if getting the status failed.
- func IsClusterClean(context *clusterd.Context, clusterInfo *ClusterInfo, pgHealthyRegex string) (string, bool, error) {
- status, err := Status(context, clusterInfo)
- if err != nil {
- return "unable to get PG health", false, err
- }
- pgHealthyRegexCompiled := defaultPgHealthyRegexCompiled
- if pgHealthyRegex != "" {
- pgHealthyRegexCompiled, err = regexp.Compile(pgHealthyRegex)
- if err != nil {
- return "unable to compile pgHealthyRegex", false, err
- }
- }
- msg, clean := isClusterClean(status, pgHealthyRegexCompiled)
- if !clean {
- return msg, false, nil
- }
- return msg, true, nil
- }
- // IsClusterCleanError returns an error indicating if the cluster is fully clean yet (i.e., all placement
- // groups are in the active+clean state). It returns nil if the cluster is clean.
- // Using IsClusterClean is recommended if you want to differentiate between a failure of the status query and
- // an unclean cluster.
- func IsClusterCleanError(context *clusterd.Context, clusterInfo *ClusterInfo, pgHealthyRegex string) error {
- msg, clean, err := IsClusterClean(context, clusterInfo, pgHealthyRegex)
- if err != nil {
- return err
- }
- if !clean {
- return errors.New(msg)
- }
- return nil
- }
- func isClusterClean(status CephStatus, pgHealthyRegex *regexp.Regexp) (string, bool) {
- if status.PgMap.NumPgs == 0 {
- // there are no PGs yet, that still counts as clean
- return "cluster has no PGs", true
- }
- cleanPGs := 0
- for _, pg := range status.PgMap.PgsByState {
- if pgHealthyRegex.MatchString(pg.StateName) {
- cleanPGs += pg.Count
- }
- }
- if cleanPGs == status.PgMap.NumPgs {
- // all PGs in the cluster are in a clean state
- logger.Debugf("all placement groups have reached a clean state: %+v", status.PgMap.PgsByState)
- return "all PGs in cluster are clean", true
- }
- return fmt.Sprintf("cluster is not fully clean. PGs: %+v", status.PgMap.PgsByState), false
- }
- // getMDSRank returns the rank of a given MDS
- func getMDSRank(status CephStatus, fsName string) (int, error) {
- // dummy rank
- mdsRank := -1000
- for r := range status.Fsmap.ByRank {
- if status.Fsmap.ByRank[r].Name == fsName {
- mdsRank = r
- }
- }
- // if the mds is not shown in the map one reason might be because it's in standby
- // if not in standby there is something else going wrong
- if mdsRank < 0 && status.Fsmap.UpStandby < 1 {
- // it might seem strange to log an error since this could be a warning too
- // it is a warning until we reach the timeout, this should give enough time to the mds to transition its state
- // after the timeout we consider that the mds might be gone or the timeout was not long enough...
- return mdsRank, errors.Errorf("mds %s not found in fsmap, this likely means mdss are transitioning between active and standby states", fsName)
- }
- return mdsRank, nil
- }
- // MdsActiveOrStandbyReplay returns whether a given MDS is active or in standby
- func MdsActiveOrStandbyReplay(context *clusterd.Context, clusterInfo *ClusterInfo, fsName string) error {
- status, err := Status(context, clusterInfo)
- if err != nil {
- return errors.Wrap(err, "failed to get ceph status")
- }
- mdsRank, err := getMDSRank(status, fsName)
- if err != nil {
- return errors.Cause(err)
- }
- // this MDS is in standby so let's return immediately
- if mdsRank < 0 {
- logger.Infof("mds %s is in standby, nothing to check", fsName)
- return nil
- }
- if status.Fsmap.ByRank[mdsRank].Status == "up:active" || status.Fsmap.ByRank[mdsRank].Status == "up:standby-replay" || status.Fsmap.ByRank[mdsRank].Status == "up:standby" {
- logger.Infof("mds %s is %s", fsName, status.Fsmap.ByRank[mdsRank].Status)
- return nil
- }
- return errors.Errorf("mds %s is %s, bad state", fsName, status.Fsmap.ByRank[mdsRank].Status)
- }
- // IsCephHealthy verifies Ceph is healthy, useful when performing an upgrade
- // check if it's a minor or major upgrade... too!
- func IsCephHealthy(context *clusterd.Context, clusterInfo *ClusterInfo) bool {
- cephStatus, err := Status(context, clusterInfo)
- if err != nil {
- logger.Errorf("failed to detect if Ceph is healthy. failed to get ceph status. %v", err)
- return false
- }
- return isCephHealthy(cephStatus)
- }
- func isCephHealthy(status CephStatus) bool {
- s := status.Health.Status
- if s == "HEALTH_WARN" || s == "HEALTH_OK" {
- return true
- }
- return false
- }
|