gpu-discovery-common.sh 3.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. #!/usr/bin/env bash
  2. ################################################################################
  3. # Licensed to the Apache Software Foundation (ASF) under one
  4. # or more contributor license agreements. See the NOTICE file
  5. # distributed with this work for additional information
  6. # regarding copyright ownership. The ASF licenses this file
  7. # to you under the Apache License, Version 2.0 (the
  8. # "License"); you may not use this file except in compliance
  9. # with the License. You may obtain a copy of the License at
  10. #
  11. # http://www.apache.org/licenses/LICENSE-2.0
  12. #
  13. # Unless required by applicable law or agreed to in writing, software
  14. # distributed under the License is distributed on an "AS IS" BASIS,
  15. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. # See the License for the specific language governing permissions and
  17. # limitations under the License.
  18. ################################################################################
  ################################################################################
  # Allocates the first N GPU indexes of the candidate list, without coordinating
  # with any other process.
  #
  # Arguments:
  #   $1 - whitespace-separated list of candidate GPU indexes
  #   $2 - number of GPUs to allocate (non-negative integer)
  # Outputs:
  #   The allocated indexes as a comma-separated list on stdout. If there are
  #   fewer candidates than requested, "Could not get enough GPU resources." is
  #   written to stdout instead.
  # Exits:
  #   Calls `exit 1` (terminating the calling shell, or the enclosing subshell
  #   when used in a command substitution) when the amount is invalid or cannot
  #   be satisfied.
  ################################################################################
  non_coordination_allocate() {
    # Keep all working variables local so the caller's globals are not clobbered.
    local -a indexes to_occupy_indexes
    local amount=$2

    # Word splitting is intentional here: the candidates arrive as one
    # whitespace-separated string.
    indexes=($1)

    # Reject empty, non-numeric and negative amounts up front; otherwise the
    # numeric tests and the array slice below fail with confusing errors.
    if ! [ "$amount" -ge 0 ] 2>/dev/null; then
      echo "Invalid GPU amount: '$amount'." >&2
      exit 1
    fi

    to_occupy_indexes=("${indexes[@]:0:$amount}")
    if [ "$amount" -gt "${#to_occupy_indexes[@]}" ]; then
      echo "Could not get enough GPU resources."
      exit 1
    fi

    # "${array[*]}" joins the elements with the first character of IFS.
    local IFS=,
    printf '%s\n' "${to_occupy_indexes[*]}"
  }
  ################################################################################
  # Allocates GPU indexes while coordinating with other processes through a
  # shared coordination file that is protected by an exclusive flock.
  #
  # Every line of the coordination file has the form "<gpu index> <owner pid>".
  # Candidate indexes without a record are taken first. If those are not enough,
  # indexes whose recorded owner processes are no longer alive are reclaimed.
  # Every allocated index is recorded with $PPID as its owner.
  #
  # Arguments:
  #   $1 - whitespace-separated list of candidate GPU indexes
  #   $2 - number of GPUs to allocate (non-negative integer)
  #   $3 - coordination file (default: /var/tmp/flink-gpu-coordination)
  # Outputs:
  #   The allocated indexes as a comma-separated list on stdout. If not enough
  #   indexes are available, "Could not get enough GPU resources." is written to
  #   stdout instead.
  # Returns:
  #   0 on success; 1 if the amount is invalid, the lock cannot be acquired or
  #   not enough indexes are available.
  ################################################################################
  coordination_allocate() {
    # Keep the arguments local so the caller's globals are not clobbered.
    local -a indexes
    local amount=$2
    local coordination_file=${3:-/var/tmp/flink-gpu-coordination}

    # Word splitting is intentional here: the candidates arrive as one
    # whitespace-separated string.
    indexes=($1)

    # A negative amount would never match the "enough indexes" checks below and
    # would silently occupy every free GPU, so reject bad amounts up front.
    if ! [ "$amount" -ge 0 ] 2>/dev/null; then
      echo "Invalid GPU amount: '$amount'." >&2
      return 1
    fi

    # Everything below runs in a subshell holding an exclusive lock on fd 200,
    # which is opened on the coordination file by the redirection at the end.
    # Variables assigned inside the subshell never reach the caller.
    (
      if ! flock -x 200; then
        echo "Could not lock coordination file $coordination_file." >&2
        exit 1
      fi

      # GPU indexes to be occupied.
      to_occupy_indexes=()
      # GPU indexes which are already recorded in the coordination file. These
      # indexes should not be occupied unless the associated processes are no
      # longer alive.
      recorded_indexes=()

      for idx in "${indexes[@]}"; do
        if [ "${#to_occupy_indexes[@]}" -eq "$amount" ]; then
          break
        elif grep -q "^${idx} " "$coordination_file"; then
          recorded_indexes+=("$idx")
        else
          to_occupy_indexes+=("$idx")
        fi
      done

      # If there are not enough indexes, try to occupy indexes whose associated
      # processes are dead.
      for idx in "${recorded_indexes[@]}"; do
        if [ "${#to_occupy_indexes[@]}" -eq "$amount" ]; then
          break
        fi

        owners=$(grep "^${idx} " "$coordination_file" | awk '{print $2}')
        # A record without an owner pid cannot be proven stale; leave it alone.
        if [ -z "$owners" ]; then
          continue
        fi

        # The index is reclaimable only if none of its recorded owners is alive
        # (normally there is exactly one record per index).
        owner_alive=false
        for owner in $owners; do
          if ps -p "$owner" >/dev/null 2>&1; then
            owner_alive=true
            break
          fi
        done
        if [ "$owner_alive" = true ]; then
          continue
        fi

        # Drop the stale record(s). The pattern is anchored so that removing
        # index 1 does not also remove the records of 11, 21, ... . The file is
        # truncated and rewritten in place rather than edited with `sed -i`,
        # because `sed -i` swaps in a new file (new inode) and the lock held on
        # fd 200 would then no longer guard the path other processes open.
        remaining=$(grep -v "^${idx} " "$coordination_file")
        if [ -n "$remaining" ]; then
          printf '%s\n' "$remaining" > "$coordination_file"
        else
          : > "$coordination_file"
        fi
        to_occupy_indexes+=("$idx")
      done

      if [ "$amount" -gt "${#to_occupy_indexes[@]}" ]; then
        echo "Could not get enough GPU resources."
        exit 1
      fi

      for idx in "${to_occupy_indexes[@]}"; do
        echo "$idx $PPID" >> "$coordination_file"
      done

      # "${array[*]}" joins the elements with the first character of IFS.
      IFS=,
      printf '%s\n' "${to_occupy_indexes[*]}"
    ) 200<> "$coordination_file"
  }
  ################################################################################
  # Entry point of the GPU discovery: dispatches to the coordinated or the
  # non-coordinated allocation strategy.
  #
  # Arguments:
  #   $1 - whitespace-separated list of candidate GPU indexes
  #   $2 - number of GPUs to allocate
  #   $3 - coordination mode; "coordination" selects coordination_allocate, any
  #        other value selects non_coordination_allocate
  #   $4 - coordination file (default: /var/tmp/flink-gpu-coordination); only
  #        used in coordination mode
  # Outputs / Returns:
  #   Whatever the selected allocation function prints and returns.
  ################################################################################
  gpu_discovery() {
    local indexes=$1
    local amount=$2
    local coordination_mode=$3
    local coordination_file=${4:-/var/tmp/flink-gpu-coordination}

    # All arguments are forwarded quoted so that a coordination file path with
    # spaces stays one argument and an empty amount cannot shift the file path
    # into the amount position.
    if [ "$coordination_mode" == "coordination" ]; then
      coordination_allocate "$indexes" "$amount" "$coordination_file"
    else
      non_coordination_allocate "$indexes" "$amount"
    fi
  }