Based on nvidia-smi and Python. I have only tested it on Linux.
import os
import time
import yagmail
The libraries we need.
# Calculate the number of GPUs.
#
# `nvidia-smi | grep %` keeps the status row of each GPU.  Splitting the whole
# output on "|" puts the "<used>MiB / <total>MiB" cell of the first GPU at
# index 2 and of every further GPU four cells later (6, 10, 14, ...), so a
# stride of 4 handles any number of GPUs instead of at most four.
with os.popen("nvidia-smi | grep %") as smi_pipe:
    gpu_status = smi_pipe.read().split("|")

# One used/total memory ratio per GPU; its length is the number of GPUs.
gpu_info = []
for memory_cell in gpu_status[2::4]:
    memory_parts = memory_cell.split("/")
    used_mib = int(memory_parts[0].split("M")[0].strip())
    total_mib = int(memory_parts[1].split("M")[0].strip())
    gpu_info.append(used_mib / total_mib)
# Find working GPU
#
# A GPU counts as "working" when its memory ratio reaches the threshold.
# `working_gpu_id` is only bound when at least one GPU is working.
threshold = 0.1
if any(ratio >= threshold for ratio in info_ratio):
    working_gpu_id = [
        index for index, ratio in enumerate(info_ratio) if ratio >= threshold
    ]
else:
    print("no working GPU")
# Continuous judge patience times
#
# A GPU is only reported as finished after it has been below the threshold for
# `patience` consecutive checks, so a short dip in usage does not trigger a
# notification.
patience = 2
if info_ratio[id] < threshold:
    # One more consecutive check below the threshold for this GPU.
    judge_num[id] += 1
    if judge_num[id] >= patience:
        # The GPU has been idle long enough: this is where the notification
        # e-mail is sent (the sending code is omitted from this excerpt).
        pass
else:
    # Usage is at or above the threshold again, so the streak is broken.
    judge_num[id] = 0
Send email
First, get the SMTP password of your email account.
# Open an SMTP connection (STARTTLS on port 587) with the sender's account.
mail_server = yagmail.SMTP(
    user="email@address",
    password="SMTP password",
    port=587,
    smtp_starttls=True,
    smtp_ssl=None,
)
email_address = ["receive email@address"]
title = ["%s experiment on GPU %d finished" % (name, i)]
content = [" "]
try:
    mail_server.send(to=email_address, subject=title, contents=content)
finally:
    # Close the connection even when sending fails, so it is never leaked.
    mail_server.close()
print("sending finished")
# Entire code
r"""Watch GPU memory usage and send an e-mail when a busy GPU becomes idle.

Author: Ni Runyu & MonkeyDC
Date: 2023-09-12 17:24:48
LastEditors: Ni Runyu & MonkeyDC
LastEditTime: 2023-09-14 11:11:22
FilePath: \undefinedd:\check_gpu.py
Description: Check gpu whether working

Copyright (c) 2023 by Ni Runyu, All Rights Reserved.
"""
# NOTE: the docstring above is a raw string on purpose: the "\u" in FilePath
# is otherwise read as a (truncated) unicode escape and the file fails to
# compile.

import os
import time

try:
    import yagmail
except ImportError:
    # Reported by main(); keeps the parsing helpers importable without it.
    yagmail = None

# Name of the experiment, used in the e-mail subject.
EXPERIMENT_NAME = "ni3"
# A GPU whose used/total memory ratio is below this value counts as idle.
THRESHOLD = 0.1
# Number of consecutive idle checks before a GPU is reported as finished.
PATIENCE = 2
# Seconds to wait between two checks.
POLL_INTERVAL_SECONDS = 120


def parse_gpu_memory_ratios(smi_output):
    """Return the used/total memory ratio of every GPU row in *smi_output*.

    Args:
        smi_output: Text produced by ``nvidia-smi | grep %``; each GPU row is
            expected to carry a ``<used>MiB / <total>MiB`` cell as its third
            ``|``-separated field.

    Returns:
        A list of floats, one per parsable row, in the order the rows appear.
        Rows without such a memory cell are skipped, so empty output (for
        example when ``nvidia-smi`` is unavailable) yields an empty list.
    """
    ratios = []
    for row in smi_output.splitlines():
        cells = row.split("|")
        if len(cells) <= 2:
            continue
        used_text, _, total_text = cells[2].partition("/")
        try:
            used_mib = int(used_text.split("M")[0].strip())
            total_mib = int(total_text.split("M")[0].strip())
        except ValueError:
            # Not a "<used>MiB / <total>MiB" cell: not a GPU status row.
            continue
        if total_mib <= 0:
            continue
        ratios.append(used_mib / total_mib)
    return ratios


def gpu_info():
    """Query ``nvidia-smi`` and return one memory-usage ratio per GPU."""
    with os.popen("nvidia-smi | grep %") as smi_pipe:
        return parse_gpu_memory_ratios(smi_pipe.read())


def find_working_gpus(ratios, threshold):
    """Return the indices of the GPUs whose ratio is at least *threshold*."""
    return [index for index, ratio in enumerate(ratios) if ratio >= threshold]


def send_finished_mail(name, gpu_id):
    """Send the "experiment finished" notification for GPU *gpu_id*.

    Fill in the sender account, its SMTP password and the recipient below.
    Any error raised while connecting or sending propagates to the caller;
    the connection is closed in either case.
    """
    mail_server = yagmail.SMTP(
        user="",
        password="",
        port=587,
        smtp_starttls=True,
        smtp_ssl=None,
    )
    email_address = [""]
    title = ["%s experiment on GPU %d finished" % (name, gpu_id)]
    content = [" "]
    try:
        mail_server.send(to=email_address, subject=title, contents=content)
    finally:
        mail_server.close()
    print("sending finished")


def main():
    """Watch the GPUs that are busy at start-up until all of them are idle.

    A watched GPU is reported by e-mail once its memory ratio has stayed
    below ``THRESHOLD`` for ``PATIENCE`` consecutive checks; it is then
    dropped from the watch list. The function returns when no GPU is left.
    """
    if yagmail is None:
        raise SystemExit("yagmail is not installed; run `pip install yagmail`")

    watched = find_working_gpus(gpu_info(), THRESHOLD)
    if not watched:
        print("no working GPU")
        return

    idle_checks = dict.fromkeys(watched, 0)
    while True:
        print("time: ", time.asctime())
        ratios = gpu_info()
        # Iterate over a copy because finished GPUs are removed on the way.
        for gpu_id in list(watched):
            if gpu_id >= len(ratios):
                # nvidia-smi returned fewer rows than expected (transient
                # failure): skip this check instead of crashing.
                print(f"GPU {gpu_id} usage: unavailable")
                continue
            print(f"GPU {gpu_id} usage:", ratios[gpu_id])
            if ratios[gpu_id] >= THRESHOLD:
                # Busy again: the idle streak has to start over.
                idle_checks[gpu_id] = 0
                continue
            idle_checks[gpu_id] += 1
            if idle_checks[gpu_id] < PATIENCE:
                continue
            try:
                send_finished_mail(EXPERIMENT_NAME, gpu_id)
            except Exception as exc:
                # Best effort: a mail failure must not kill the monitor.
                print(repr(exc))
            watched.remove(gpu_id)
        if not watched:
            break
        time.sleep(POLL_INTERVAL_SECONDS)


if __name__ == "__main__":
    main()
Reference: https://blog.csdn.net/lqy845650069/article/details/117321971