Check whether a GPU is working and then send an email

Based on nvidia-smi and Python. I have only tested it on Linux.

1
2
3
import os
import time
import yagmail

The libraries we need.

Calculate the memory-usage ratio (used / total) of each GPU.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Run nvidia-smi and keep only the rows that contain a "%" sign.
# NOTE(review): assumes the usual nvidia-smi table, where each such row is
# one GPU and its third "|"-separated column is "<used>MiB / <total>MiB".
with os.popen("nvidia-smi | grep %") as pipe:  # close the pipe when done
    gpu_status = pipe.read().splitlines()

# One used/total memory ratio per GPU row, in the order nvidia-smi prints
# them. Any number of GPUs is supported.
gpu_info = []
for row in gpu_status:
    columns = row.split("|")
    if len(columns) < 3:
        continue  # no memory column on this row
    used, total = columns[2].split("/")[:2]
    gpu_info.append(
        int(used.split("M")[0].strip()) / int(total.split("M")[0].strip())
    )

Find the working GPUs

1
2
3
4
5
6
7
8
threshold = 0.1
# Indices of the GPUs whose memory-usage ratio reaches the threshold.
# Always defined (possibly empty), so later code can iterate over it safely.
working_gpu_id = [
    index for index, gpu_ratio in enumerate(info_ratio) if gpu_ratio >= threshold
]
if not working_gpu_id:
    print("no working GPU")

Require the usage to stay below the threshold for `patience` consecutive checks

1
2
3
4
5
6
7
patience = 2
if info_ratio[id] < threshold:
    # One more consecutive check below the threshold for this GPU.
    judge_num[id] += 1
    if judge_num[id] >= patience:
        pass  # send email here (the code is in the "Send email" section)
else:
    # Usage is back above the threshold: restart the consecutive count.
    judge_num[id] = 0

Send email

Get the SMTP password of your email

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
mail_server = yagmail.SMTP(
    user="email@address",
    password="SMTP password",
    port=587,
    smtp_starttls=True,
    smtp_ssl=None,
)
email_address = ["receive email@address"]
# Report the GPU that was just judged (`id`), not the index `i` left over
# from the loop that collected the working GPUs.
title = ["%s experiment on GPU %d finished" % (name, id)]
content = [" "]
try:
    mail_server.send(to=email_address, subject=title, contents=content)
finally:
    # Close the SMTP connection even if sending fails.
    mail_server.close()
print("sending finished")

Entire code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
r'''
Author: Ni Runyu & MonkeyDC
Date: 2023-09-12 17:24:48
LastEditors: Ni Runyu & MonkeyDC
LastEditTime: 2023-09-14 11:11:22
FilePath: \undefinedd:\check_gpu.py
Description: Check whether the GPUs are working

Copyright (c) 2023 by Ni Runyu, All Rights Reserved.
'''

import os
import time

import yagmail


def _memory_ratio(memory_field):
    """Turn one memory column such as ``" 1234MiB / 11178MiB "`` into used/total.

    On each side of the ``/``, everything from the first ``M`` onward is
    dropped and the remainder is converted with ``int``.
    """
    used, total = memory_field.split("/")[:2]
    return int(used.split("M")[0].strip()) / int(total.split("M")[0].strip())


def gpu_info(smi_output=None):
    """Return the memory-usage ratio (used / total) of every GPU.

    Args:
        smi_output: Text to parse instead of running ``nvidia-smi``. When
            ``None`` (the default) the output of ``nvidia-smi | grep %`` is
            used.

    Returns:
        A list of floats, one per parsed row, in the order the rows appear.
        The list is empty when there is no output at all.

    Raises:
        ValueError: If a row's memory column cannot be parsed.
    """
    if smi_output is None:
        # The context manager closes the pipe once the output has been read.
        with os.popen("nvidia-smi | grep %") as pipe:
            smi_output = pipe.read()

    ratios = []
    for row in smi_output.splitlines():
        columns = row.split("|")
        # NOTE(review): assumes the usual nvidia-smi table, where the third
        # "|"-separated column of a GPU row is "<used>MiB / <total>MiB".
        if len(columns) > 2:
            ratios.append(_memory_ratio(columns[2]))
    return ratios


def _send_finished_email(server_name, gpu_id):
    """Send the "experiment finished" email for ``gpu_id`` on ``server_name``.

    The account, password and recipient are blank placeholders that must be
    filled in before use. The SMTP connection is closed even if sending fails.
    """
    mail_server = yagmail.SMTP(
        user="",
        password="",
        port=587,
        smtp_starttls=True,
        smtp_ssl=None,
    )
    email_address = [""]
    title = ["%s experiment on GPU %d finished" % (server_name, gpu_id)]
    content = [" "]
    try:
        mail_server.send(to=email_address, subject=title, contents=content)
    finally:
        mail_server.close()
    print("sending finished")


def main():
    """Watch the GPUs that are busy at start-up and email when one goes idle.

    A GPU counts as finished once its memory-usage ratio has been below
    ``threshold`` for ``patience`` consecutive polls. The script stops after
    the first poll in which at least one GPU finishes.
    """
    name = "ni3"  # server's name
    threshold = 0.1
    patience = 2
    poll_interval = 120  # seconds between two polls

    # find working gpu
    info_ratio = gpu_info()
    working_gpu_id = [
        index for index, ratio in enumerate(info_ratio) if ratio >= threshold
    ]
    if not working_gpu_id:
        print("no working GPU")
        return  # nothing to watch

    # consecutive below-threshold polls per watched GPU
    judge_num = {gpu_id: 0 for gpu_id in working_gpu_id}

    while True:
        print("time: ", time.asctime())
        info_ratio = gpu_info()
        any_finished = False

        for gpu_id in working_gpu_id:
            if gpu_id >= len(info_ratio):
                # This poll returned fewer GPUs than at start-up; skip the
                # reading instead of crashing and leave the counter untouched.
                print(f"GPU {gpu_id} usage: unavailable")
                continue

            print(f"GPU {gpu_id} usage:", info_ratio[gpu_id])
            if info_ratio[gpu_id] >= threshold:
                judge_num[gpu_id] = 0
                continue

            # ratio should be continuously smaller than threshold patience times
            judge_num[gpu_id] += 1
            if judge_num[gpu_id] >= patience:
                any_finished = True
                try:
                    _send_finished_email(name, gpu_id)
                except Exception as e:
                    # Best effort: report the mail failure but still exit.
                    print(repr(e))

        if any_finished:
            break
        time.sleep(poll_interval)


if __name__ == "__main__":
    main()

Reference: https://blog.csdn.net/lqy845650069/article/details/117321971