osd.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385
  /*
  Copyright 2016 The Rook Authors. All rights reserved.

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

  	http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
  */
  13. package client
  14. import (
  15. "encoding/json"
  16. "fmt"
  17. "strconv"
  18. "strings"
  19. "github.com/pkg/errors"
  20. "github.com/rook/rook/pkg/clusterd"
  21. )
  22. type OSDUsage struct {
  23. OSDNodes []OSDNodeUsage `json:"nodes"`
  24. Summary struct {
  25. TotalKB json.Number `json:"total_kb"`
  26. TotalUsedKB json.Number `json:"total_kb_used"`
  27. TotalAvailKB json.Number `json:"total_kb_avail"`
  28. AverageUtil json.Number `json:"average_utilization"`
  29. } `json:"summary"`
  30. }
  // OSDNodeUsage is a single entry of the "nodes" array in OSDUsage.
  //
  // Apart from ID and Name, every value is kept as json.Number (the literal
  // numeric text from the JSON document) and is converted by the consumer.
  type OSDNodeUsage struct {
  	// Identity of the entry.
  	ID   int    `json:"id"`
  	Name string `json:"name"`

  	// Placement-related values.
  	CrushWeight json.Number `json:"crush_weight"`
  	Depth       json.Number `json:"depth"`
  	Reweight    json.Number `json:"reweight"`

  	// Capacity values; the JSON keys indicate kilobytes.
  	KB      json.Number `json:"kb"`
  	UsedKB  json.Number `json:"kb_used"`
  	AvailKB json.Number `json:"kb_avail"`

  	// Derived statistics.
  	Utilization json.Number `json:"utilization"`
  	Variance    json.Number `json:"var"`
  	Pgs         json.Number `json:"pgs"`
  }
  // OSDPerfStats is the decoded form of the "osd perf" response read by
  // GetOSDPerfStats.
  type OSDPerfStats struct {
  	// PerfInfo holds the entries of the "osd_perf_infos" array.
  	PerfInfo []struct {
  		// ID is the "id" value of the entry, kept as its JSON numeric text.
  		ID json.Number `json:"id"`

  		// Stats holds the "perf_stats" object; the JSON keys indicate
  		// milliseconds.
  		Stats struct {
  			CommitLatency json.Number `json:"commit_latency_ms"`
  			ApplyLatency  json.Number `json:"apply_latency_ms"`
  		} `json:"perf_stats"`
  	} `json:"osd_perf_infos"`
  }
  // OSDDump is the decoded form of the "osd dump" response read by GetOSDDump.
  type OSDDump struct {
  	// OSDs holds the entries of the "osds" array. All three values are kept as
  	// json.Number (the literal numeric text) and parsed on demand.
  	OSDs []struct {
  		OSD json.Number `json:"osd"`
  		Up  json.Number `json:"up"`
  		In  json.Number `json:"in"`
  	} `json:"osds"`

  	// Flags is the "flags" value: a single comma-separated string.
  	Flags string `json:"flags"`

  	// CrushNodeFlags maps a crush unit name to the flags listed for it.
  	CrushNodeFlags map[string][]string `json:"crush_node_flags"`
  }

  // IsFlagSet reports whether checkFlag is one of the comma-separated entries of
  // dump.Flags. Entries are compared exactly; no whitespace trimming or case
  // folding is applied.
  func (dump *OSDDump) IsFlagSet(checkFlag string) bool {
  	for _, flag := range strings.Split(dump.Flags, ",") {
  		if flag == checkFlag {
  			return true
  		}
  	}
  	return false
  }

  // IsFlagSetOnCrushUnit reports whether checkFlag is listed for crushUnit in
  // dump.CrushNodeFlags. It returns false when the unit has no entry.
  func (dump *OSDDump) IsFlagSetOnCrushUnit(checkFlag, crushUnit string) bool {
  	// Direct lookup instead of scanning every key. Indexing a nil map or a
  	// missing key yields a nil slice, so the loop simply does not run.
  	for _, flag := range dump.CrushNodeFlags[crushUnit] {
  		if flag == checkFlag {
  			return true
  		}
  	}
  	return false
  }
  85. // UpdateFlagOnCrushUnit checks if the flag is in the desired state and sets/unsets if it isn't. Mitigates redundant calls
  86. // it returns true if the value was changed
  87. func (dump *OSDDump) UpdateFlagOnCrushUnit(context *clusterd.Context, clusterInfo *ClusterInfo, set bool, crushUnit, flag string) (bool, error) {
  88. flagSet := dump.IsFlagSetOnCrushUnit(flag, crushUnit)
  89. if flagSet && !set {
  90. err := UnsetFlagOnCrushUnit(context, clusterInfo, crushUnit, flag)
  91. if err != nil {
  92. return true, err
  93. }
  94. return true, nil
  95. }
  96. if !flagSet && set {
  97. err := SetFlagOnCrushUnit(context, clusterInfo, crushUnit, flag)
  98. if err != nil {
  99. return true, err
  100. }
  101. return true, nil
  102. }
  103. return false, nil
  104. }
  105. // SetFlagOnCrushUnit sets the specified flag on the crush unit
  106. func SetFlagOnCrushUnit(context *clusterd.Context, clusterInfo *ClusterInfo, crushUnit, flag string) error {
  107. args := []string{"osd", "set-group", flag, crushUnit}
  108. cmd := NewCephCommand(context, clusterInfo, args)
  109. _, err := cmd.Run()
  110. if err != nil {
  111. return errors.Wrapf(err, "failed to set flag %s on %s", crushUnit, flag)
  112. }
  113. return nil
  114. }
  115. // UnsetFlagOnCrushUnit unsets the specified flag on the crush unit
  116. func UnsetFlagOnCrushUnit(context *clusterd.Context, clusterInfo *ClusterInfo, crushUnit, flag string) error {
  117. args := []string{"osd", "unset-group", flag, crushUnit}
  118. cmd := NewCephCommand(context, clusterInfo, args)
  119. _, err := cmd.Run()
  120. if err != nil {
  121. return errors.Wrapf(err, "failed to unset flag %s on %s", crushUnit, flag)
  122. }
  123. return nil
  124. }
  // SafeToDestroyStatus is the decoded form of the "osd safe-to-destroy"
  // response read by OsdSafeToDestroy.
  type SafeToDestroyStatus struct {
  	// SafeToDestroy holds the IDs listed under "safe_to_destroy".
  	SafeToDestroy []int `json:"safe_to_destroy"`
  }
  // OsdTree represents the CRUSH hierarchy as decoded from the "osd tree"
  // response read by HostTree.
  type OsdTree struct {
  	// Nodes holds the entries of the "nodes" array. Children lists the IDs an
  	// entry refers to as its children.
  	Nodes []struct {
  		ID       int    `json:"id"`
  		Name     string `json:"name"`
  		Type     string `json:"type"`
  		TypeID   int    `json:"type_id"`
  		Children []int  `json:"children,omitempty"`
  		// PoolWeights is declared without fields, so nothing inside the
  		// "pool_weights" object is retained when decoding.
  		PoolWeights struct {
  		} `json:"pool_weights,omitempty"`
  		CrushWeight     float64 `json:"crush_weight,omitempty"`
  		Depth           int     `json:"depth,omitempty"`
  		Exists          int     `json:"exists,omitempty"`
  		Status          string  `json:"status,omitempty"`
  		Reweight        float64 `json:"reweight,omitempty"`
  		PrimaryAffinity float64 `json:"primary_affinity,omitempty"`
  	} `json:"nodes"`

  	// Stray holds the entries of the "stray" array.
  	Stray []struct {
  		ID              int     `json:"id"`
  		Name            string  `json:"name"`
  		Type            string  `json:"type"`
  		TypeID          int     `json:"type_id"`
  		CrushWeight     float64 `json:"crush_weight"`
  		Depth           int     `json:"depth"`
  		Exists          int     `json:"exists"`
  		Status          string  `json:"status"`
  		Reweight        float64 `json:"reweight"`
  		PrimaryAffinity float64 `json:"primary_affinity"`
  	} `json:"stray"`
  }
  // OsdList is a list of OSD IDs, as decoded from the "osd ls" response read by
  // OsdListNum.
  type OsdList []int
  160. // StatusByID returns status and inCluster states for given OSD id
  161. func (dump *OSDDump) StatusByID(id int64) (int64, int64, error) {
  162. for _, d := range dump.OSDs {
  163. i, err := d.OSD.Int64()
  164. if err != nil {
  165. return 0, 0, err
  166. }
  167. if id == i {
  168. in, err := d.In.Int64()
  169. if err != nil {
  170. return 0, 0, err
  171. }
  172. up, err := d.Up.Int64()
  173. if err != nil {
  174. return 0, 0, err
  175. }
  176. return up, in, nil
  177. }
  178. }
  179. return 0, 0, errors.Errorf("not found osd.%d in OSDDump", id)
  180. }
  181. func GetOSDUsage(context *clusterd.Context, clusterInfo *ClusterInfo) (*OSDUsage, error) {
  182. args := []string{"osd", "df"}
  183. buf, err := NewCephCommand(context, clusterInfo, args).Run()
  184. if err != nil {
  185. return nil, errors.Wrap(err, "failed to get osd df")
  186. }
  187. var osdUsage OSDUsage
  188. if err := json.Unmarshal(buf, &osdUsage); err != nil {
  189. return nil, errors.Wrap(err, "failed to unmarshal osd df response")
  190. }
  191. return &osdUsage, nil
  192. }
  193. func GetOSDPerfStats(context *clusterd.Context, clusterInfo *ClusterInfo) (*OSDPerfStats, error) {
  194. args := []string{"osd", "perf"}
  195. buf, err := NewCephCommand(context, clusterInfo, args).Run()
  196. if err != nil {
  197. return nil, errors.Wrap(err, "failed to get osd perf")
  198. }
  199. var osdPerfStats OSDPerfStats
  200. if err := json.Unmarshal(buf, &osdPerfStats); err != nil {
  201. return nil, errors.Wrap(err, "failed to unmarshal osd perf response")
  202. }
  203. return &osdPerfStats, nil
  204. }
  205. func GetOSDDump(context *clusterd.Context, clusterInfo *ClusterInfo) (*OSDDump, error) {
  206. args := []string{"osd", "dump"}
  207. cmd := NewCephCommand(context, clusterInfo, args)
  208. buf, err := cmd.Run()
  209. if err != nil {
  210. return nil, errors.Wrap(err, "failed to get osd dump")
  211. }
  212. var osdDump OSDDump
  213. if err := json.Unmarshal(buf, &osdDump); err != nil {
  214. return nil, errors.Wrap(err, "failed to unmarshal osd dump response")
  215. }
  216. return &osdDump, nil
  217. }
  218. func OSDOut(context *clusterd.Context, clusterInfo *ClusterInfo, osdID int) (string, error) {
  219. args := []string{"osd", "out", strconv.Itoa(osdID)}
  220. buf, err := NewCephCommand(context, clusterInfo, args).Run()
  221. return string(buf), err
  222. }
  223. func OsdSafeToDestroy(context *clusterd.Context, clusterInfo *ClusterInfo, osdID int) (bool, error) {
  224. args := []string{"osd", "safe-to-destroy", strconv.Itoa(osdID)}
  225. cmd := NewCephCommand(context, clusterInfo, args)
  226. buf, err := cmd.Run()
  227. if err != nil {
  228. return false, errors.Wrap(err, "failed to get safe-to-destroy status")
  229. }
  230. var output SafeToDestroyStatus
  231. if err := json.Unmarshal(buf, &output); err != nil {
  232. return false, errors.Wrapf(err, "failed to unmarshal safe-to-destroy response. %s", string(buf))
  233. }
  234. if len(output.SafeToDestroy) != 0 && output.SafeToDestroy[0] == osdID {
  235. return true, nil
  236. }
  237. return false, nil
  238. }
  239. // HostTree returns the osd tree
  240. func HostTree(context *clusterd.Context, clusterInfo *ClusterInfo) (OsdTree, error) {
  241. var output OsdTree
  242. args := []string{"osd", "tree"}
  243. buf, err := NewCephCommand(context, clusterInfo, args).Run()
  244. if err != nil {
  245. return output, errors.Wrap(err, "failed to get osd tree")
  246. }
  247. err = json.Unmarshal(buf, &output)
  248. if err != nil {
  249. return output, errors.Wrap(err, "failed to unmarshal 'osd tree' response")
  250. }
  251. return output, nil
  252. }
  253. // OsdListNum returns the list of OSDs
  254. func OsdListNum(context *clusterd.Context, clusterInfo *ClusterInfo) (OsdList, error) {
  255. var output OsdList
  256. args := []string{"osd", "ls"}
  257. buf, err := NewCephCommand(context, clusterInfo, args).Run()
  258. if err != nil {
  259. return output, errors.Wrap(err, "failed to get osd list")
  260. }
  261. err = json.Unmarshal(buf, &output)
  262. if err != nil {
  263. return output, errors.Wrap(err, "failed to unmarshal 'osd ls' response")
  264. }
  265. return output, nil
  266. }
  // OSDDeviceClass pairs an OSD ID with its device class; it is the element type
  // of the list returned by OSDDeviceClasses.
  type OSDDeviceClass struct {
  	// ID is read from the "osd" key.
  	ID int `json:"osd"`
  	// DeviceClass is read from the "device_class" key.
  	DeviceClass string `json:"device_class"`
  }
  272. // OSDDeviceClasses returns the device classes for particular OsdIDs
  273. func OSDDeviceClasses(context *clusterd.Context, clusterInfo *ClusterInfo, osdIds []string) ([]OSDDeviceClass, error) {
  274. var deviceClasses []OSDDeviceClass
  275. args := []string{"osd", "crush", "get-device-class"}
  276. args = append(args, osdIds...)
  277. buf, err := NewCephCommand(context, clusterInfo, args).Run()
  278. if err != nil {
  279. return deviceClasses, errors.Wrap(err, "failed to get device-class info")
  280. }
  281. err = json.Unmarshal(buf, &deviceClasses)
  282. if err != nil {
  283. return deviceClasses, errors.Wrap(err, "failed to unmarshal 'osd crush get-device-class' response")
  284. }
  285. return deviceClasses, nil
  286. }
  // OSDOkToStopStats is the decoded form of the "osd ok-to-stop" response read by
  // OSDOkToStop. It carries detailed information about which OSDs are okay to
  // stop.
  type OSDOkToStopStats struct {
  	// OkToStop is read from the "ok_to_stop" key.
  	OkToStop bool `json:"ok_to_stop"`
  	// OSDs holds the OSD IDs listed under "osds".
  	OSDs []int `json:"osds"`

  	// Counters read from "num_ok_pgs" and "num_not_ok_pgs".
  	NumOkPGs    int `json:"num_ok_pgs"`
  	NumNotOkPGs int `json:"num_not_ok_pgs"`

  	// String lists read from "bad_become_inactive" and "ok_become_degraded".
  	BadBecomeInactive []string `json:"bad_become_inactive"`
  	OkBecomeDegraded  []string `json:"ok_become_degraded"`
  }
  296. // OSDOkToStop returns a list of OSDs that can be stopped that includes the OSD ID given.
  297. // This is relevant, for example, when checking which OSDs can be updated.
  298. // The number of OSDs returned is limited by the value set in maxReturned.
  299. // maxReturned=0 is the same as maxReturned=1.
  300. func OSDOkToStop(context *clusterd.Context, clusterInfo *ClusterInfo, osdID, maxReturned int) ([]int, error) {
  301. args := []string{"osd", "ok-to-stop", strconv.Itoa(osdID)}
  302. // NOTE: if the number of OSD IDs given in the CLI arg query is Q and --max=N is given, if
  303. // N < Q, Ceph treats the query as though max=Q instead, always returning at least Q OSDs.
  304. args = append(args, fmt.Sprintf("--max=%d", maxReturned))
  305. buf, err := NewCephCommand(context, clusterInfo, args).Run()
  306. if err != nil {
  307. // is not ok to stop (or command error)
  308. return []int{}, errors.Wrapf(err, "OSD %d is not ok to stop", osdID)
  309. }
  310. var stats OSDOkToStopStats
  311. err = json.Unmarshal(buf, &stats)
  312. if err != nil {
  313. // Since the command succeeded we still know that at least the given OSD ID is ok to
  314. // stop, so we do not *have* to return an error. However, it is good to do it anyway so
  315. // that we can catch breaking changes to JSON output in CI testing. As a middle ground
  316. // here, return error but also return the given OSD ID in the output in case the calling
  317. // function wants to recover from this case.
  318. return []int{osdID}, errors.Wrapf(err, "failed to unmarshal 'osd ok-to-stop %d' response", osdID)
  319. }
  320. return stats.OSDs, nil
  321. }
  322. // SetPrimaryAffinity assigns primary-affinity (within range [0.0, 1.0]) to a specific OSD.
  323. func SetPrimaryAffinity(context *clusterd.Context, clusterInfo *ClusterInfo, osdID int, affinity string) error {
  324. logger.Infof("setting osd.%d with primary-affinity %q", osdID, affinity)
  325. args := []string{"osd", "primary-affinity", fmt.Sprintf("osd.%d", osdID), affinity}
  326. _, err := NewCephCommand(context, clusterInfo, args).Run()
  327. if err != nil {
  328. return errors.Wrapf(err, "failed to set osd.%d with primary-affinity %q", osdID, affinity)
  329. }
  330. logger.Infof("successfully applied osd.%d primary-affinity %q", osdID, affinity)
  331. return nil
  332. }