validation.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405
  1. /*
  2. Copyright 2023 The Rook Authors. All rights reserved.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package multus
import (
	"context"
	"fmt"
	"strings"
	"time"

	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
)
  21. const (
  22. // the name of the config object that "owns" all other resources created for this test
  23. // this does not need to be unique per test. per-namespace is fine
  24. ownerConfigMapName = "multus-validation-test-owner"
  25. flakyNetworkSuggestion = "the underlying network may be flaky or not have the bandwidth to support a production ceph cluster; " +
  26. "even if the validation test passes, this could still be an issue"
  27. )
  28. // A Multus ValidationTest runs a number of Multus-connected pods to validate that a Kubernetes
  29. // environment is suitable for Rook to run Ceph in.
  30. type ValidationTest struct {
  31. Clientset kubernetes.Interface
  32. // The Logger will be used to render ongoing status by this library.
  33. Logger Logger
  34. ValidationTestConfig
  35. }
  36. // ValidationTestResults contains results from a validation test.
  37. type ValidationTestResults struct {
  38. suggestedDebugging []string
  39. }
  40. func (vtr *ValidationTestResults) SuggestedDebuggingReport() string {
  41. if vtr == nil || len(vtr.suggestedDebugging) == 0 {
  42. return ""
  43. }
  44. out := fmt.Sprintln("Suggested things to investigate before installing with Multus:")
  45. for _, s := range vtr.suggestedDebugging {
  46. out = out + " - " + fmt.Sprintln(s)
  47. }
  48. return out
  49. }
  50. func (vtr *ValidationTestResults) addSuggestions(s ...string) {
  51. for _, sug := range s {
  52. if sug == "" {
  53. continue
  54. }
  55. vtr.suggestedDebugging = append(vtr.suggestedDebugging, sug)
  56. }
  57. }
  58. /*
  59. * Validation state machine state definitions
  60. */
  61. /*
  62. * > Determine how many pods the image pull daemonset schedules
  63. * -- next state --> Ensure node type definitions don't overlap
  64. */
  65. type getExpectedNumberOfImagePullPodsState struct {
  66. expectedNumPodsPerNodeType perNodeTypeCount
  67. expectedNumPodsValueChanged time.Time
  68. }
  69. // the length of time to wait for daemonset scheduler to stabilize to a specific number of pods
  70. // started. must be lower than the state timeout duration
  71. var podSchedulerDebounceTime = 30 * time.Second
  72. func (s *getExpectedNumberOfImagePullPodsState) Run(
  73. ctx context.Context, vsm *validationStateMachine,
  74. ) (suggestions []string, err error) {
  75. expectedPerNodeType, err := vsm.vt.getImagePullPodCountPerNodeType(ctx)
  76. if err != nil {
  77. return []string{"inability to schedule DaemonSets is likely an issue with the Kubernetes cluster itself"},
  78. fmt.Errorf("expected number of image pull pods not yet ready: %w", err)
  79. }
  80. if !s.expectedNumPodsPerNodeType.Equal(&expectedPerNodeType) {
  81. s.expectedNumPodsPerNodeType = expectedPerNodeType
  82. s.expectedNumPodsValueChanged = time.Now()
  83. }
  84. if time.Since(s.expectedNumPodsValueChanged) < podSchedulerDebounceTime {
  85. vsm.vt.Logger.Infof("waiting to ensure num expected image pull pods per node type to stabilize at %v", s.expectedNumPodsPerNodeType)
  86. return []string{}, nil
  87. }
  88. vsm.SetNextState(&ensureNodeTypesDoNotOverlapState{
  89. ImagePullPodsPerNodeType: s.expectedNumPodsPerNodeType,
  90. })
  91. return []string{}, nil
  92. }
  93. /*
  94. * > Ensure node type definitions don't overlap
  95. * -- next state --> Verify all image pull pods are running (verifies all images are pulled)
  96. */
  97. type ensureNodeTypesDoNotOverlapState struct {
  98. ImagePullPodsPerNodeType perNodeTypeCount
  99. }
  100. func (s *ensureNodeTypesDoNotOverlapState) Run(ctx context.Context, vsm *validationStateMachine) (suggestions []string, err error) {
  101. err = vsm.vt.ensureOneImagePullPodPerNode(ctx)
  102. if err != nil {
  103. vsm.Exit() // checking in a loop won't change the result
  104. return []string{}, err
  105. }
  106. vsm.vt.Logger.Infof("expecting image pull pods to be 'Ready': expected count per node type: %v", s.ImagePullPodsPerNodeType)
  107. vsm.SetNextState(&verifyAllPodsRunningState{
  108. AppType: imagePullDaemonSetAppType,
  109. ImagePullPodsPerNodeType: s.ImagePullPodsPerNodeType,
  110. ExpectedNumPods: s.ImagePullPodsPerNodeType.Total(),
  111. })
  112. return []string{}, nil
  113. }
  114. /*
  115. * Reusable state to verify that expected number of pods are "Running" but not necessarily "Ready"
  116. * > Verify all image pull pods are running
  117. * -- next state --> Delete image pull daemonset
  118. * > Verify all client pods are running
  119. * -- next state --> Verify all client pods are "Ready"
  120. */
  121. type verifyAllPodsRunningState struct {
  122. AppType daemonsetAppType
  123. ImagePullPodsPerNodeType perNodeTypeCount
  124. ExpectedNumPods int
  125. }
  126. func (s *verifyAllPodsRunningState) Run(ctx context.Context, vsm *validationStateMachine) (suggestions []string, err error) {
  127. var podSelectorLabel string
  128. suggestions = []string{}
  129. switch s.AppType {
  130. case imagePullDaemonSetAppType:
  131. podSelectorLabel = imagePullAppLabel()
  132. // image pull pods don't have multus labels, so this can't be a multus issue
  133. suggestions = append(suggestions, "inability to run image pull pods is likely an issue with Nginx image or Kubernetes itself")
  134. case clientDaemonSetAppType:
  135. podSelectorLabel = clientAppLabel()
  136. suggestions = append(suggestions, "clients not being able to run can mean multus is unable to provide them with addresses")
  137. suggestions = append(suggestions, unableToProvideAddressSuggestions...)
  138. default:
  139. return []string{}, fmt.Errorf("internal error; unknown daemonset type %q", s.AppType)
  140. }
  141. numRunning, err := vsm.vt.getNumRunningPods(ctx, podSelectorLabel)
  142. errMsg := fmt.Sprintf("all %d %s pods are not yet 'Running'", s.ExpectedNumPods, s.AppType)
  143. if err != nil {
  144. return suggestions, fmt.Errorf("%s: %w", errMsg, err)
  145. }
  146. if numRunning != s.ExpectedNumPods {
  147. return suggestions, fmt.Errorf("%s: found %d", errMsg, numRunning)
  148. }
  149. switch s.AppType {
  150. case imagePullDaemonSetAppType:
  151. vsm.vt.Logger.Infof("cleaning up all %d 'Running' image pull pods", s.ExpectedNumPods)
  152. vsm.SetNextState(&deleteImagePullersState{
  153. ImagePullPodsPerNodeType: s.ImagePullPodsPerNodeType,
  154. })
  155. case clientDaemonSetAppType:
  156. vsm.vt.Logger.Infof("verifying all %d 'Running' client pods reach 'Ready' state", s.ExpectedNumPods)
  157. vsm.SetNextState(&verifyAllClientsReadyState{
  158. ExpectedNumClients: s.ExpectedNumPods,
  159. })
  160. }
  161. return []string{}, nil
  162. }
  163. /*
  164. * > Delete image pull daemonset
  165. * -- next state --> Get web server info
  166. */
  167. type deleteImagePullersState struct {
  168. // keeps track of the number of image pull pods that ran. this will directly affect the number
  169. // of client pods that can be expected to run later on
  170. ImagePullPodsPerNodeType perNodeTypeCount
  171. }
  172. func (s *deleteImagePullersState) Run(ctx context.Context, vsm *validationStateMachine) (suggestions []string, err error) {
  173. err = vsm.vt.deleteImagePullers(ctx)
  174. if err != nil {
  175. // erroring here is not strictly necessary but does indicate a k8s issue that probably affects future test steps
  176. return []string{"inability to delete resources is likely an issue with Kubernetes itself"}, err
  177. }
  178. vsm.vt.Logger.Infof("getting web server info for clients")
  179. vsm.SetNextState(&getWebServerInfoState{
  180. ImagePullPodsPerNodeType: s.ImagePullPodsPerNodeType,
  181. })
  182. return []string{}, nil
  183. }
  184. /*
  185. * > Get web server info
  186. * -- next state --> Start clients
  187. */
  188. type getWebServerInfoState struct {
  189. ImagePullPodsPerNodeType perNodeTypeCount
  190. }
  191. func (s *getWebServerInfoState) Run(ctx context.Context, vsm *validationStateMachine) (suggestions []string, err error) {
  192. var desiredPublicNet *types.NamespacedName = nil
  193. var desiredClusterNet *types.NamespacedName = nil
  194. if vsm.vt.PublicNetwork != "" {
  195. n, err := networkNamespacedName(vsm.vt.PublicNetwork, vsm.vt.Namespace)
  196. if err != nil {
  197. return nil, fmt.Errorf("public network is an invalid NAD name: %w", err)
  198. }
  199. desiredPublicNet = &n
  200. }
  201. if vsm.vt.ClusterNetwork != "" {
  202. n, err := networkNamespacedName(vsm.vt.ClusterNetwork, vsm.vt.Namespace)
  203. if err != nil {
  204. return nil, fmt.Errorf("cluster network is an invalid NAD name: %w", err)
  205. }
  206. desiredClusterNet = &n
  207. }
  208. info, suggestions, err := vsm.vt.getWebServerInfo(ctx, desiredPublicNet, desiredClusterNet)
  209. if err != nil {
  210. return suggestions, err
  211. }
  212. vsm.vt.Logger.Infof("starting clients on each node")
  213. vsm.SetNextState(&startClientsState{
  214. WebServerInfo: info,
  215. ImagePullPodsPerNodeType: s.ImagePullPodsPerNodeType,
  216. })
  217. return []string{}, nil
  218. }
  219. /*
  220. * Start clients
  221. * -- next state --> Verify all client pods are running
  222. */
  223. type startClientsState struct {
  224. WebServerInfo podNetworkInfo
  225. ImagePullPodsPerNodeType perNodeTypeCount
  226. }
  227. func (s *startClientsState) Run(ctx context.Context, vsm *validationStateMachine) (suggestions []string, err error) {
  228. podsPerNodeType := perNodeTypeCount{}
  229. for nodeType := range vsm.vt.NodeTypes {
  230. numClientDaemonsetsStarted, err := vsm.vt.startClients(ctx, vsm.resourceOwnerRefs, s.WebServerInfo.publicAddr, s.WebServerInfo.clusterAddr, nodeType)
  231. if err != nil {
  232. eErr := fmt.Errorf("failed to start clients: %w", err)
  233. vsm.Exit() // this is a whole validation test failure if we can't start clients
  234. return []string{}, eErr
  235. }
  236. // Use num image pull pods that ran for this node type as the expectation for how many pods
  237. // will run for every daemonset of this node type.
  238. podsPerNodeType[nodeType] = numClientDaemonsetsStarted * s.ImagePullPodsPerNodeType[nodeType]
  239. }
  240. vsm.vt.Logger.Infof("verifying %d client pods begin 'Running': count per node type: %v", podsPerNodeType.Total(), podsPerNodeType)
  241. vsm.SetNextState(&verifyAllPodsRunningState{
  242. AppType: clientDaemonSetAppType,
  243. ImagePullPodsPerNodeType: s.ImagePullPodsPerNodeType,
  244. ExpectedNumPods: podsPerNodeType.Total(),
  245. })
  246. return []string{}, nil
  247. }
  248. /*
  249. * > Verify all client pods are "Ready"
  250. * -- next state --> Exit / Done
  251. */
  252. type verifyAllClientsReadyState struct {
  253. ExpectedNumClients int
  254. // keep some info to heuristically determine if the network might be flaky/overloaded
  255. prevNumReady int
  256. timeClientsStartedBecomingReady time.Time
  257. suggestFlaky bool
  258. }
  259. func (s *verifyAllClientsReadyState) Run(ctx context.Context, vsm *validationStateMachine) (suggestions []string, err error) {
  260. numReady, err := vsm.vt.numClientsReady(ctx, s.ExpectedNumClients)
  261. collocationSuggestion := "if clients on the same node as the web server become ready but not others, " +
  262. "there may be a network firewall or security policy blocking inter-node traffic on multus networks"
  263. defaultSuggestions := append([]string{collocationSuggestion, flakyNetworkSuggestion}, unableToProvideAddressSuggestions...)
  264. if err != nil {
  265. return defaultSuggestions, err
  266. }
  267. s.checkIfFlaky(vsm, numReady)
  268. if numReady != s.ExpectedNumClients {
  269. return defaultSuggestions, fmt.Errorf("number of 'Ready' clients [%d] is not the number expected [%d]", numReady, s.ExpectedNumClients)
  270. }
  271. vsm.vt.Logger.Infof("all %d clients are 'Ready'", s.ExpectedNumClients)
  272. suggestionsOnSuccess := []string{}
  273. if s.suggestFlaky {
  274. suggestionsOnSuccess = append(suggestionsOnSuccess,
  275. fmt.Sprintf("not all clients became ready within %s; %s", vsm.vt.FlakyThreshold.String(), flakyNetworkSuggestion))
  276. }
  277. vsm.Exit() // DONE!
  278. return suggestionsOnSuccess, nil
  279. }
  280. // clients should all become ready within a pretty short amount of time since they all should start
  281. // pretty simultaneously
  282. func (s *verifyAllClientsReadyState) checkIfFlaky(vsm *validationStateMachine, numReady int) {
  283. if s.suggestFlaky {
  284. return // no need to do any checks if network is already found flaky
  285. }
  286. if numReady > s.prevNumReady && s.timeClientsStartedBecomingReady.IsZero() {
  287. vsm.vt.Logger.Debugf("clients started becoming ready")
  288. s.timeClientsStartedBecomingReady = time.Now()
  289. return
  290. }
  291. if !s.timeClientsStartedBecomingReady.IsZero() {
  292. // check to see how long it took since clients first started becoming ready. if the time is
  293. // longer than the flaky threshold, warn the user, and record that the network is flaky
  294. if time.Since(s.timeClientsStartedBecomingReady) > vsm.vt.FlakyThreshold {
  295. vsm.vt.Logger.Warningf(
  296. "network seems flaky; the time since clients started becoming ready until now is greater than %s", vsm.vt.FlakyThreshold.String())
  297. s.suggestFlaky = true
  298. }
  299. }
  300. }
  301. // Run the Multus validation test.
  302. func (vt *ValidationTest) Run(ctx context.Context) (*ValidationTestResults, error) {
  303. if vt.Logger == nil {
  304. vt.Logger = &SimpleStderrLogger{}
  305. vt.Logger.Infof("no logger was specified; using a simple stderr logger")
  306. }
  307. vt.Logger.Infof("starting multus validation test with the following config:\n%s", &vt.ValidationTestConfig)
  308. if err := vt.ValidationTestConfig.Validate(); err != nil {
  309. return nil, err
  310. }
  311. testResults := &ValidationTestResults{
  312. suggestedDebugging: []string{},
  313. }
  314. // configmap's purpose is to serve as the owner resource object for all other test resources.
  315. // this allows users to clean up a botched test easily just by deleting this configmap
  316. owningConfigMap, err := vt.createOwningConfigMap(ctx)
  317. if err != nil {
  318. testResults.addSuggestions(previousTestSuggestion)
  319. return testResults, fmt.Errorf("failed to create validation test config object: %w", err)
  320. }
  321. err = vt.startWebServer(ctx, owningConfigMap)
  322. if err != nil {
  323. testResults.addSuggestions(previousTestSuggestion)
  324. return testResults, fmt.Errorf("failed to start web server: %w", err)
  325. }
  326. err = vt.startImagePullers(ctx, owningConfigMap)
  327. if err != nil {
  328. testResults.addSuggestions(previousTestSuggestion)
  329. return testResults, fmt.Errorf("failed to start image pulls: %w", err)
  330. }
  331. // start the state machine
  332. vsm := &validationStateMachine{
  333. vt: vt,
  334. resourceOwnerRefs: owningConfigMap,
  335. testResults: testResults,
  336. lastSuggestions: []string{},
  337. }
  338. startingState := &getExpectedNumberOfImagePullPodsState{}
  339. vsm.SetNextState(startingState)
  340. return vsm.Run(ctx)
  341. }
  342. // CleanUp cleans up Multus validation test resources. It returns a suggestion for manual action if
  343. // clean up was unsuccessful.
  344. func (vt *ValidationTest) CleanUp(ctx context.Context) (*ValidationTestResults, error) {
  345. var err error
  346. res := ValidationTestResults{
  347. suggestedDebugging: []string{},
  348. }
  349. suggestions, err := vt.cleanUpTestResources()
  350. res.addSuggestions(suggestions)
  351. return &res, err
  352. }