config.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307
  1. /*
  2. Copyright 2023 The Rook Authors. All rights reserved.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package multus
  14. import (
  15. _ "embed"
  16. "encoding/json"
  17. "fmt"
  18. "os"
  19. "strings"
  20. "time"
  21. "github.com/rook/rook/pkg/operator/k8sutil"
  22. "gopkg.in/yaml.v2"
  23. corev1 "k8s.io/api/core/v1"
  24. metavalidation "k8s.io/apimachinery/pkg/util/validation"
  25. )
var (
	// ConfigYaml holds the text of the config.yaml file located next to this source file. The
	// content is embedded at build time by the go:embed directive below, and it is passed to
	// loadTemplate as the template text by ValidationTestConfig.ToYAML().
	//
	//go:embed config.yaml
	ConfigYaml string
)
// Default values used by the validation test config constructors in this file.
var (
	// DefaultValidationNamespace is the namespace used by the constructors in this file. init()
	// replaces this value with the operator pod's namespace when that is available from the
	// environment.
	DefaultValidationNamespace = "rook-ceph"
	// DefaultValidationOSDsPerNode is the OSDsPerNode value used for node types that host OSDs.
	DefaultValidationOSDsPerNode = 3
	// DefaultValidationOtherDaemonsPerNode is the OtherDaemonsPerNode value used for the default
	// node type and for dedicated storage nodes.
	DefaultValidationOtherDaemonsPerNode = 16
	// DefaultValidationNginxImage is the default value of ValidationTestConfig.NginxImage.
	DefaultValidationNginxImage = "nginxinc/nginx-unprivileged:stable-alpine"
	// DefaultValidationResourceTimeout is the default value of
	// ValidationTestConfig.ResourceTimeout.
	DefaultValidationResourceTimeout = 3 * time.Minute
	// DefaultValidationFlakyThreshold is the default value of ValidationTestConfig.FlakyThreshold.
	DefaultValidationFlakyThreshold = 30 * time.Second

	// DefaultStorageNodeLabelKey and DefaultStorageNodeLabelValue are used both as the node
	// selector and as the toleration key/value of dedicatedStorageNodeConfig.
	DefaultStorageNodeLabelKey   = "storage-node"
	DefaultStorageNodeLabelValue = "true"

	// DefaultArbiterNodeLabelKey and DefaultArbiterNodeLabelValue form the node selector of the
	// arbiter node type built by NewArbiterValidationTestConfig.
	DefaultArbiterNodeLabelKey   = "topology.kubernetes.io/zone"
	DefaultArbiterNodeLabelValue = "arbiter"
	// DefaultArbiterTolerationKey is the key of the "Exists" toleration applied to the arbiter
	// node type built by NewArbiterValidationTestConfig.
	DefaultArbiterTolerationKey = "node-role.kubernetes.io/control-plane"
)

// DefaultValidationNodeType is the name of the single node type in the config returned by
// NewDefaultValidationTestConfig.
const DefaultValidationNodeType = "shared-storage-and-worker-nodes"
  44. func init() {
  45. // the default namespace is the current namespace the operator pod is running in if possible
  46. ns := os.Getenv(k8sutil.PodNamespaceEnvVar)
  47. if ns != "" {
  48. DefaultValidationNamespace = ns
  49. }
  50. }
// ValidationTestConfig is a configuration for a Multus validation test. To prevent documentation
// for this struct from getting out of date, see the output of ValidationTestConfig.ToYAML() for
// usage text for each field.
//
// The constraints noted on the fields below are the ones enforced by Validate().
type ValidationTestConfig struct {
	// Namespace must be non-empty.
	Namespace string `yaml:"namespace"`
	// PublicNetwork and ClusterNetwork may each be empty, but not both at once.
	PublicNetwork  string `yaml:"publicNetwork"`
	ClusterNetwork string `yaml:"clusterNetwork"`
	// ResourceTimeout must be at least one minute.
	ResourceTimeout time.Duration `yaml:"resourceTimeout"`
	// FlakyThreshold must be at least five seconds.
	FlakyThreshold time.Duration `yaml:"flakyThreshold"`
	// NginxImage must be non-empty.
	NginxImage string `yaml:"nginxImage"`
	// NodeTypes maps a node type name to the daemon counts and placement for that type. Each
	// name must be a valid DNS-1123 subdomain, and at least one entry must have OSDs.
	NodeTypes map[string]NodeConfig `yaml:"nodeTypes"`
}
// NodeConfig describes one type of node in a validation test: how many daemons of each kind the
// test accounts for on such a node, and how nodes of this type are selected.
type NodeConfig struct {
	// OSD daemons per node
	OSDsPerNode int `yaml:"osdsPerNode"`
	// Non-OSD daemons per node.
	OtherDaemonsPerNode int `yaml:"otherDaemonsPerNode"`
	// Placement holds the node selector and tolerations for this node type. The zero value means
	// no placement restrictions are configured.
	Placement PlacementConfig `yaml:"placement"`
}
// PlacementConfig selects the nodes that make up a node type.
//
// NodeSelector and Tolerations are intentionally the only configurable parameters here.
// Affinity/Anti-Affinity is too relaxed of a specification to ensure the validation test runs the
// exact number of daemons per node that it should be running. Only allow the minimum selection
// configs that can be used to define nodes this test can run on.
type PlacementConfig struct {
	// NodeSelector is a set of label key/value pairs.
	NodeSelector map[string]string `yaml:"nodeSelector"`
	// Tolerations is a list of tolerations; see TolerationType.
	Tolerations []TolerationType `yaml:"tolerations"`
}
// TolerationType is a locally defined type whose underlying type is corev1.Toleration. Defining
// it here allows this package to attach its own methods (see ToJSON) to tolerations.
type TolerationType corev1.Toleration
  79. // ToJSON renders a toleration as a single-line JSON string. The JSON rendering is just as easy to
  80. // read as the YAML rendering and is easier to format in the config.yaml template using Golang
  81. // text templating compared to YAML.
  82. // Need to define our own because corev1.Toleration.Marshal() does not render as expected.
  83. func (t *TolerationType) ToJSON() (string, error) {
  84. j, err := json.Marshal(*t)
  85. if err != nil {
  86. return "", fmt.Errorf("failed to convert toleration into JSON: %w", err)
  87. }
  88. return string(j), nil
  89. }
  90. // NewDefaultValidationTestConfig returns a new ValidationTestConfig with default values.
  91. // The default test is a converged-node test with no placement.
  92. func NewDefaultValidationTestConfig() *ValidationTestConfig {
  93. return &ValidationTestConfig{
  94. Namespace: DefaultValidationNamespace,
  95. ResourceTimeout: DefaultValidationResourceTimeout,
  96. FlakyThreshold: DefaultValidationFlakyThreshold,
  97. NginxImage: DefaultValidationNginxImage,
  98. NodeTypes: map[string]NodeConfig{
  99. DefaultValidationNodeType: {
  100. OSDsPerNode: DefaultValidationOSDsPerNode,
  101. OtherDaemonsPerNode: DefaultValidationOtherDaemonsPerNode,
  102. // Placement empty
  103. },
  104. },
  105. }
  106. }
  107. // ToYAML converts the validation test config into a YAML representation with user-readable comments
  108. // describing how to use the various parameters.
  109. func (c *ValidationTestConfig) ToYAML() (string, error) {
  110. // No Go YAML libraries seem to support fields with default-comments attached to them. It would
  111. // be silly to use some super-advanced reflection techniques or to extend our own YAML library,
  112. // so it is at least straightforward to render the config file from a Go template.
  113. t, err := loadTemplate("config.yaml", ConfigYaml, c)
  114. if err != nil {
  115. return "", fmt.Errorf("failed to load config into yaml template: %w", err)
  116. }
  117. return string(t), nil
  118. }
  119. // String implements the Stringer interface
  120. func (c *ValidationTestConfig) String() string {
  121. out, err := yaml.Marshal(c)
  122. if err != nil {
  123. return "failed quick marshal of validation test config!"
  124. }
  125. return string(out)
  126. }
  127. // ValidationTestConfigFromYAML loads a YAML-formatted string into a new ValidationTestConfig.
  128. func ValidationTestConfigFromYAML(y string) (*ValidationTestConfig, error) {
  129. c := &ValidationTestConfig{}
  130. err := yaml.Unmarshal([]byte(y), c)
  131. if err != nil {
  132. return nil, fmt.Errorf("failed to unmarshal config from yaml: %w", err)
  133. }
  134. return c, nil
  135. }
  136. func (c *ValidationTestConfig) TotalDaemonsPerNode() int {
  137. return c.TotalOSDsPerNode() + c.TotalOtherDaemonsPerNode()
  138. }
  139. func (c *ValidationTestConfig) TotalOSDsPerNode() int {
  140. t := 0
  141. for _, config := range c.NodeTypes {
  142. t += config.OSDsPerNode
  143. }
  144. return t
  145. }
  146. func (c *ValidationTestConfig) TotalOtherDaemonsPerNode() int {
  147. t := 0
  148. for _, config := range c.NodeTypes {
  149. t += config.OtherDaemonsPerNode
  150. }
  151. return t
  152. }
  153. func (c *ValidationTestConfig) BestNodePlacementForServer() (PlacementConfig, error) {
  154. // the web server MUST be placed on a node with both public and cluster networks available
  155. // since OSDs must have both, picking a node type with OSDs is a GOOD guess
  156. // BEST can't be determined easily, but a good approximation of BEST is the node likely to have
  157. // the most system resources available
  158. // the node type with the most OSDs will have high overall resource needs in production, which
  159. // is a good approximation of most overall system resources
  160. // in the case of a tie for num OSDs, more overall daemons means more resources
  161. best := NodeConfig{}
  162. for _, config := range c.NodeTypes {
  163. if (config.OSDsPerNode > best.OSDsPerNode) ||
  164. (config.OSDsPerNode == best.OSDsPerNode && config.OtherDaemonsPerNode > best.OtherDaemonsPerNode) {
  165. best = config
  166. }
  167. }
  168. if best.OSDsPerNode == 0 {
  169. return PlacementConfig{}, fmt.Errorf("cannot place web server in cluster with no OSDs")
  170. }
  171. return best.Placement, nil
  172. }
  173. // Validate reports any validation test configuration problems as errors.
  174. func (c *ValidationTestConfig) Validate() error {
  175. errs := []string{}
  176. if c.Namespace == "" {
  177. errs = append(errs, "namespace must be specified")
  178. }
  179. if c.PublicNetwork == "" && c.ClusterNetwork == "" {
  180. errs = append(errs, "at least one of publicNetwork and clusterNetwork must be specified")
  181. }
  182. if c.ResourceTimeout < 1*time.Minute {
  183. errs = append(errs, "resourceTimeout must be at least one minute (two or more are recommended)")
  184. }
  185. if c.FlakyThreshold < 5*time.Second {
  186. errs = append(errs, "flaky threshold must be at least 5 seconds")
  187. }
  188. if c.NginxImage == "" {
  189. errs = append(errs, "nginxImage must be specified")
  190. }
  191. if c.TotalOSDsPerNode() == 0 {
  192. errs = append(errs, "osdsPerNode must be set in at least one config")
  193. }
  194. // Do not care if the total number of OtherDaemonsPerNode is zero. OSDs run on both public and
  195. // cluster network, so OSDsPerNode can test all daemon types, but not vice-versa.
  196. for nodeType := range c.NodeTypes {
  197. mvErrs := metavalidation.IsDNS1123Subdomain(nodeType)
  198. if len(mvErrs) > 0 {
  199. errs = append(errs, fmt.Sprintf("nodeType identifier %q must meet RFC 1123 requirements: %v", nodeType, mvErrs))
  200. }
  201. }
  202. if len(errs) > 0 {
  203. return fmt.Errorf("validation test config is invalid: %s", strings.Join(errs, ", "))
  204. }
  205. return nil
  206. }
  207. func NewSharedStorageAndWorkerNodesValidationTestConfig() *ValidationTestConfig {
  208. return NewDefaultValidationTestConfig()
  209. }
// Node type names used as NodeTypes keys by NewDedicatedStorageNodesValidationTestConfig and
// NewArbiterValidationTestConfig.
const (
	// DedicatedStorageNodeType is the key under which dedicatedStorageNodeConfig is registered.
	DedicatedStorageNodeType = "storage-nodes"
	// DedicatedWorkerNodeType is the key under which dedicatedWorkerNodeConfig is registered.
	DedicatedWorkerNodeType = "worker-nodes"
)
// dedicatedStorageNodeConfig is the node config used for dedicated storage nodes: the default
// OSD and other-daemon counts, placed onto nodes selected by the default storage node label and
// tolerating a taint with that same key and value.
//
// NOTE(review): this value is copied into every config built from it, but the NodeSelector map
// and Tolerations slice are reference types and remain shared between those configs. Mutating
// them through one config would presumably affect the others; confirm callers treat them as
// read-only.
var dedicatedStorageNodeConfig = NodeConfig{
	OSDsPerNode:         DefaultValidationOSDsPerNode,
	OtherDaemonsPerNode: DefaultValidationOtherDaemonsPerNode,
	Placement: PlacementConfig{
		NodeSelector: map[string]string{
			DefaultStorageNodeLabelKey: DefaultStorageNodeLabelValue,
		},
		Tolerations: []TolerationType{
			// Operator and Effect are left at their zero values.
			{Key: DefaultStorageNodeLabelKey, Value: DefaultStorageNodeLabelValue},
		},
	},
}
// dedicatedWorkerNodeConfig is the node config used for dedicated worker nodes: no OSDs, a small
// number of other daemons, and no placement restrictions.
var dedicatedWorkerNodeConfig = NodeConfig{
	OSDsPerNode:         0,
	OtherDaemonsPerNode: 6, // CSI plugins only
	// Placement empty
}
  231. func NewDedicatedStorageNodesValidationTestConfig() *ValidationTestConfig {
  232. return &ValidationTestConfig{
  233. Namespace: DefaultValidationNamespace,
  234. ResourceTimeout: DefaultValidationResourceTimeout,
  235. FlakyThreshold: DefaultValidationFlakyThreshold,
  236. NginxImage: DefaultValidationNginxImage,
  237. NodeTypes: map[string]NodeConfig{
  238. DedicatedStorageNodeType: dedicatedStorageNodeConfig,
  239. DedicatedWorkerNodeType: dedicatedWorkerNodeConfig,
  240. },
  241. }
  242. }
const (
	// DedicatedArbiterNodeType is the NodeTypes key under which NewArbiterValidationTestConfig
	// registers the arbiter node config.
	DedicatedArbiterNodeType = "arbiter-node"
)
  246. func NewArbiterValidationTestConfig() *ValidationTestConfig {
  247. return &ValidationTestConfig{
  248. Namespace: DefaultValidationNamespace,
  249. ResourceTimeout: DefaultValidationResourceTimeout,
  250. FlakyThreshold: DefaultValidationFlakyThreshold,
  251. NginxImage: DefaultValidationNginxImage,
  252. NodeTypes: map[string]NodeConfig{
  253. DedicatedStorageNodeType: dedicatedStorageNodeConfig,
  254. DedicatedWorkerNodeType: dedicatedWorkerNodeConfig,
  255. DedicatedArbiterNodeType: {
  256. OSDsPerNode: 0,
  257. OtherDaemonsPerNode: 10, // 1 mon, plus all 9 CSI provisioners and plugins (optional)
  258. Placement: PlacementConfig{
  259. NodeSelector: map[string]string{
  260. DefaultArbiterNodeLabelKey: DefaultArbiterNodeLabelValue,
  261. },
  262. Tolerations: []TolerationType{
  263. {Key: DefaultArbiterTolerationKey, Operator: corev1.TolerationOpExists},
  264. },
  265. },
  266. },
  267. },
  268. }
  269. }