resources.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414
  1. /*
  2. Copyright 2023 The Rook Authors. All rights reserved.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package multus
  14. import (
  15. "context"
  16. "fmt"
  17. "time"
  18. core "k8s.io/api/core/v1"
  19. kerrors "k8s.io/apimachinery/pkg/api/errors"
  20. meta "k8s.io/apimachinery/pkg/apis/meta/v1"
  21. "k8s.io/apimachinery/pkg/types"
  22. "k8s.io/apimachinery/pkg/util/wait"
  23. )
  24. type podNetworkInfo struct {
  25. // node the pod is running on
  26. nodeName string
  27. // multus public addr attached (if any)
  28. publicAddr string
  29. // multus cluster addr attached (if any)
  30. clusterAddr string
  31. }
  32. var manualCleanupInstructions = fmt.Sprintf(
  33. "manually delete owner configmap %q, and wait for all multus-validation-test resources to be deleted", ownerConfigMapName)
  34. var previousTestSuggestion = "there could be a past test preventing this one from proceeding; " + manualCleanupInstructions
  35. var unableToProvideAddressSuggestions = []string{
  36. "multus may be unable to provide addresses for pods",
  37. "check networking events on the pod and multus logs",
  38. "macvlan: NIC or switch hardware/software may block the association of some number of additional MAC addresses on an interface",
  39. "macvlan: interfaces and network switching must enable promiscuous mode to allow receiving packets for unknown (Multus) MACs",
  40. "macvlan/ipvlan: switch hardware/software may block an interface from receiving packets to an unknown (Multus) IP",
  41. }
  42. // create a validation test config object that stores the configuration of the running validation
  43. // test. this object serves as the owner of all associated test objects. when this object is
  44. // deleted, all validation test objects should also be deleted, effectively cleaning up all
  45. // components of this test.
  46. func (vt *ValidationTest) createOwningConfigMap(ctx context.Context) ([]meta.OwnerReference, error) {
  47. c := core.ConfigMap{
  48. ObjectMeta: meta.ObjectMeta{
  49. Name: ownerConfigMapName,
  50. },
  51. }
  52. configObject, err := vt.Clientset.CoreV1().ConfigMaps(vt.Namespace).Create(ctx, &c, meta.CreateOptions{})
  53. if err != nil {
  54. return nil, fmt.Errorf("failed to create validation test config object [%+v]: %w", c, err)
  55. }
  56. // for cleanup, we want to make sure all children are deleted
  57. BlockOwnerDeletion := true
  58. refToConfigObject := meta.OwnerReference{
  59. APIVersion: "v1",
  60. Kind: "ConfigMap",
  61. Name: configObject.GetName(),
  62. UID: configObject.GetUID(),
  63. BlockOwnerDeletion: &BlockOwnerDeletion,
  64. }
  65. return []meta.OwnerReference{refToConfigObject}, nil
  66. }
  67. func (vt *ValidationTest) startWebServer(ctx context.Context, owners []meta.OwnerReference) error {
  68. placement, err := vt.BestNodePlacementForServer()
  69. if err != nil {
  70. return fmt.Errorf("failed to place web server pod: %w", err)
  71. }
  72. // infer good placement for web server pod from the node type with the most OSDs
  73. pod, err := vt.generateWebServerPod(placement)
  74. if err != nil {
  75. return fmt.Errorf("failed to generate web server pod: %w", err)
  76. }
  77. pod.SetOwnerReferences(owners) // set owner refs so cleanup is easier
  78. configMap, err := vt.generateWebServerConfigMap()
  79. if err != nil {
  80. return fmt.Errorf("failed to generate web server config: %w", err)
  81. }
  82. configMap.SetOwnerReferences(owners) // set owner refs so cleanup is easier
  83. // create configmap before pod so pod doesn't crashloopbackoff on first creation
  84. _, err = vt.Clientset.CoreV1().ConfigMaps(vt.Namespace).Create(ctx, configMap, meta.CreateOptions{})
  85. if err != nil {
  86. return fmt.Errorf("failed to create web server config: %w", err)
  87. }
  88. _, err = vt.Clientset.CoreV1().Pods(vt.Namespace).Create(ctx, pod, meta.CreateOptions{})
  89. if err != nil {
  90. return fmt.Errorf("failed to create web server pod: %w", err)
  91. }
  92. return nil
  93. }
  94. func (vt *ValidationTest) getWebServerInfo(
  95. ctx context.Context,
  96. desiredPublicNet, desiredClusterNet *types.NamespacedName,
  97. ) (podNetworkInfo, []string, error) {
  98. podInfo := podNetworkInfo{}
  99. pod, err := vt.Clientset.CoreV1().Pods(vt.Namespace).Get(ctx, webServerPodName(), meta.GetOptions{})
  100. if err != nil {
  101. return podInfo, []string{}, fmt.Errorf("unexpected error when getting web server pod: %w", err)
  102. }
  103. var publicAddr, clusterAddr string
  104. publicAddr, clusterAddr, networkSuggestions, err := getNetworksFromPod(pod, desiredPublicNet, desiredClusterNet)
  105. if err != nil {
  106. return podInfo, networkSuggestions, fmt.Errorf("no web server network info: %w", err)
  107. }
  108. if !podIsReady(*pod) {
  109. return podInfo, []string{}, fmt.Errorf("web server pod is not ready yet")
  110. }
  111. podInfo.nodeName = pod.Spec.NodeName
  112. podInfo.publicAddr = publicAddr
  113. podInfo.clusterAddr = clusterAddr
  114. return podInfo, []string{}, nil // no suggestions if successful
  115. }
  116. func (vt *ValidationTest) startImagePullers(ctx context.Context, owners []meta.OwnerReference) error {
  117. for typeName, nodeType := range vt.NodeTypes {
  118. ds, err := vt.generateImagePullDaemonSet(typeName, nodeType.Placement)
  119. if err != nil {
  120. return fmt.Errorf("failed to generate image pull daemonset: %w", err)
  121. }
  122. ds.SetOwnerReferences(owners) // set owner so cleanup is easier
  123. _, err = vt.Clientset.AppsV1().DaemonSets(vt.Namespace).Create(ctx, ds, meta.CreateOptions{})
  124. if err != nil {
  125. return fmt.Errorf("failed to create image pull daemonset: %w", err)
  126. }
  127. }
  128. return nil
  129. }
  130. func (vt *ValidationTest) deleteImagePullers(ctx context.Context) error {
  131. noGracePeriod := int64(0)
  132. delOpts := meta.DeleteOptions{
  133. GracePeriodSeconds: &noGracePeriod,
  134. }
  135. listOpts := meta.ListOptions{
  136. LabelSelector: imagePullAppLabel(),
  137. }
  138. err := vt.Clientset.AppsV1().DaemonSets(vt.Namespace).DeleteCollection(ctx, delOpts, listOpts)
  139. if err != nil {
  140. if kerrors.IsNotFound(err) {
  141. return nil // already deleted
  142. }
  143. return fmt.Errorf("failed to delete image pullers: %w", err)
  144. }
  145. return nil
  146. }
  147. func (vt *ValidationTest) startClients(
  148. ctx context.Context,
  149. owners []meta.OwnerReference,
  150. serverPublicAddr, serverClusterAddr string,
  151. nodeType string,
  152. ) (int, error) {
  153. numDaemonsetsCreated := 0
  154. nodeConfig := vt.NodeTypes[nodeType]
  155. // start clients that simulate OSDs (connected to both public and cluster nets)
  156. osdsPerNode := nodeConfig.OSDsPerNode
  157. vt.Logger.Infof("starting %d %s validation clients for node type %q", osdsPerNode, ClientTypeOSD, nodeType)
  158. for i := 0; i < osdsPerNode; i++ {
  159. attachToClusterNet := true
  160. ds, err := vt.generateClientDaemonSet(true, attachToClusterNet, serverPublicAddr, serverClusterAddr, nodeType, ClientTypeOSD, i, nodeConfig.Placement)
  161. if err != nil {
  162. return numDaemonsetsCreated, fmt.Errorf("failed to generate client daemonset for node type %q, client type %q, client #%d: %w", nodeType, ClientTypeOSD, i, err)
  163. }
  164. ds.SetOwnerReferences(owners) // set owner refs so cleanup is easier
  165. _, err = vt.Clientset.AppsV1().DaemonSets(vt.Namespace).Create(ctx, ds, meta.CreateOptions{})
  166. if err != nil {
  167. return numDaemonsetsCreated, fmt.Errorf("failed to create client daemonset for node type %q, client type %q, client #%d: %w", nodeType, ClientTypeOSD, i, err)
  168. }
  169. numDaemonsetsCreated++
  170. }
  171. // start clients that simulate non-OSD daemons (connected only to public net)
  172. if serverPublicAddr == "" {
  173. return numDaemonsetsCreated, nil // no public net; thus, no public-net-only clients to run
  174. }
  175. otherPerNode := nodeConfig.OtherDaemonsPerNode
  176. vt.Logger.Infof("starting %d %s (non-OSD) validation clients for node type %q", otherPerNode, ClientTypeNonOSD, nodeType)
  177. for i := 0; i < otherPerNode; i++ {
  178. attachToClusterNet := false
  179. ds, err := vt.generateClientDaemonSet(true, attachToClusterNet, serverPublicAddr, serverClusterAddr, nodeType, ClientTypeNonOSD, i, nodeConfig.Placement)
  180. if err != nil {
  181. return numDaemonsetsCreated, fmt.Errorf("failed to generate client daemonset for node type %q, client type %q, client #%d: %w", nodeType, ClientTypeNonOSD, i, err)
  182. }
  183. ds.SetOwnerReferences(owners) // set owner refs so cleanup is easier
  184. _, err = vt.Clientset.AppsV1().DaemonSets(vt.Namespace).Create(ctx, ds, meta.CreateOptions{})
  185. if err != nil {
  186. return numDaemonsetsCreated, fmt.Errorf("failed to create client daemonset for node type %q, client type %q, client #%d: %w", nodeType, ClientTypeNonOSD, i, err)
  187. }
  188. numDaemonsetsCreated++
  189. }
  190. return numDaemonsetsCreated, nil
  191. }
  192. type perNodeTypeCount map[string]int
  193. func (a *perNodeTypeCount) Increment(nodeType string) {
  194. current, ok := (*a)[nodeType]
  195. if !ok {
  196. current = 0
  197. }
  198. (*a)[nodeType] = current + 1
  199. }
  200. func (a *perNodeTypeCount) Total() int {
  201. t := 0
  202. for _, c := range *a {
  203. t += c
  204. }
  205. return t
  206. }
  207. func (a *perNodeTypeCount) Equal(b *perNodeTypeCount) bool {
  208. if len(*a) != len(*b) {
  209. return false
  210. }
  211. for nodeType, numA := range *a {
  212. numB, ok := (*b)[nodeType]
  213. if !ok {
  214. return false
  215. }
  216. if numA != numB {
  217. return false
  218. }
  219. }
  220. return true
  221. }
  222. func (vt *ValidationTest) getImagePullPodCountPerNodeType(
  223. ctx context.Context,
  224. ) (perNodeTypeCount, error) {
  225. emptyCount := perNodeTypeCount{}
  226. listOpts := meta.ListOptions{
  227. LabelSelector: imagePullAppLabel(),
  228. }
  229. dsets, err := vt.Clientset.AppsV1().DaemonSets(vt.Namespace).List(ctx, listOpts)
  230. if err != nil {
  231. return emptyCount, fmt.Errorf("unexpected error listing daemonsets: %w", err)
  232. }
  233. expectedNumDaemonsets := len(vt.NodeTypes)
  234. if len(dsets.Items) != expectedNumDaemonsets {
  235. return emptyCount, fmt.Errorf("got %d daemonsets when %d should exist", len(dsets.Items), expectedNumDaemonsets)
  236. }
  237. numsScheduled := perNodeTypeCount{}
  238. for i, d := range dsets.Items {
  239. nodeType := getNodeType(&dsets.Items[i].ObjectMeta)
  240. numScheduled := d.Status.CurrentNumberScheduled
  241. if numScheduled == 0 {
  242. return emptyCount, fmt.Errorf("image pull daemonset for node type %q expects zero scheduled pods", nodeType)
  243. }
  244. numsScheduled[nodeType] = int(numScheduled)
  245. }
  246. return numsScheduled, nil
  247. }
  248. func (vt *ValidationTest) ensureOneImagePullPodPerNode(ctx context.Context) error {
  249. listOpts := meta.ListOptions{
  250. LabelSelector: imagePullAppLabel(),
  251. }
  252. pods, err := vt.Clientset.CoreV1().Pods(vt.Namespace).List(ctx, listOpts)
  253. if err != nil {
  254. return fmt.Errorf("failed to list pods: %w", err)
  255. }
  256. nodesFound := map[string]string{}
  257. for _, p := range pods.Items {
  258. nodeName := p.Spec.NodeName
  259. nodeType := p.GetLabels()["nodeType"]
  260. if otherNodeType, ok := nodesFound[nodeName]; ok {
  261. return fmt.Errorf("node types must not overlap: node type %q has overlap with node type %q", nodeType, otherNodeType)
  262. }
  263. nodesFound[nodeName] = nodeType
  264. }
  265. return nil
  266. }
  267. func (vt *ValidationTest) getNumRunningPods(
  268. ctx context.Context,
  269. podSelectorLabel string,
  270. ) (int, error) {
  271. listOpts := meta.ListOptions{
  272. LabelSelector: podSelectorLabel,
  273. }
  274. pods, err := vt.Clientset.CoreV1().Pods(vt.Namespace).List(ctx, listOpts)
  275. if err != nil {
  276. return 0, fmt.Errorf("failed to list pods: %w", err)
  277. }
  278. numRunning := 0
  279. for _, p := range pods.Items {
  280. if podIsRunning(p) {
  281. numRunning++
  282. }
  283. }
  284. return numRunning, nil
  285. }
  286. func (vt *ValidationTest) numClientsReady(ctx context.Context, expectedNumPods int) (int, error) {
  287. pods, err := vt.getClientPods(ctx, expectedNumPods)
  288. if err != nil {
  289. return 0, fmt.Errorf("unexpected error getting client pods: %w", err)
  290. }
  291. numReady := 0
  292. for _, p := range pods.Items {
  293. if podIsReady(p) {
  294. numReady++
  295. }
  296. }
  297. return numReady, nil
  298. }
  299. func (vt *ValidationTest) getClientPods(ctx context.Context, expectedNumPods int) (*core.PodList, error) {
  300. listOpts := meta.ListOptions{
  301. LabelSelector: clientAppLabel(),
  302. }
  303. pods, err := vt.Clientset.CoreV1().Pods(vt.Namespace).List(ctx, listOpts)
  304. if err != nil {
  305. return nil, fmt.Errorf("failed to list client pods: %w", err)
  306. }
  307. if len(pods.Items) != expectedNumPods {
  308. return nil, fmt.Errorf("the number of pods listed [%d] does not match the number expected [%d]", len(pods.Items), expectedNumPods)
  309. }
  310. return pods, err
  311. }
  312. func (vt *ValidationTest) cleanUpTestResources() (string, error) {
  313. // need a clean, non-canceled context in case the test is canceled by ctrl-c
  314. ctx := context.Background()
  315. // delete the config object in the foreground so we wait until all validation test resources are
  316. // gone before stopping, and do it now because there's no need to wait for just a test
  317. var gracePeriodZero int64 = 0
  318. deleteForeground := meta.DeletePropagationForeground
  319. delOpts := meta.DeleteOptions{
  320. PropagationPolicy: &deleteForeground,
  321. GracePeriodSeconds: &gracePeriodZero,
  322. }
  323. err := vt.Clientset.CoreV1().ConfigMaps(vt.Namespace).Delete(ctx, ownerConfigMapName, delOpts)
  324. if err != nil {
  325. if !kerrors.IsNotFound(err) {
  326. return manualCleanupInstructions, fmt.Errorf("failed to clean up multus validation test resources: %w", err)
  327. }
  328. return "", nil
  329. }
  330. // clients take a long time to terminate, and the 0 grace period set on the configmap doesn't
  331. // propagate to dependents. make a best-effort attempt to delete client pods with 0 grace period
  332. listOpts := meta.ListOptions{
  333. LabelSelector: clientAppLabel(),
  334. }
  335. // ignore errors for a best-effort attempt, they will delete eventually
  336. _ = vt.Clientset.CoreV1().Pods(vt.Namespace).DeleteCollection(ctx, delOpts, listOpts)
  337. // wait for resources to be cleaned up
  338. ctx, cancel := context.WithTimeout(ctx, vt.ResourceTimeout)
  339. defer cancel()
  340. lastSuggestion := ""
  341. err = wait.PollUntilContextCancel(ctx, 2*time.Second, true, func(ctx context.Context) (done bool, err error) {
  342. _, getErr := vt.Clientset.CoreV1().ConfigMaps(vt.Namespace).Get(ctx, ownerConfigMapName, meta.GetOptions{})
  343. if getErr != nil {
  344. if kerrors.IsNotFound(getErr) {
  345. return true, nil
  346. }
  347. lastSuggestion = fmt.Sprintf("unexpected error when cleaning up multus validation test resources; attempting to continue: %v", err)
  348. }
  349. return false, nil
  350. })
  351. if err != nil {
  352. return lastSuggestion + "; " + manualCleanupInstructions,
  353. fmt.Errorf("failed waiting for multus validation test resources to be deleted: %w", err)
  354. }
  355. return "", nil
  356. }