-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmonitor.py
More file actions
139 lines (123 loc) · 5.3 KB
/
monitor.py
File metadata and controls
139 lines (123 loc) · 5.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import argparse
import csv
import os
import re
import subprocess
import time
from collections import defaultdict
from datetime import datetime
from threading import Event, Thread

import GPUtil
import psutil
class Monitor(Thread):
    """Background thread that periodically samples CPU/RAM/GPU usage into CSV files.

    On every cycle the thread appends one row per GPU to ``GPU_<id>.csv``,
    one row to ``CPU.csv`` and one row per user to ``USAGE.csv`` inside
    ``path``. The thread starts itself on construction and keeps sampling
    until :meth:`stop` is called.

    Args:
        delay: Pause between two samples, in minutes.
        path: Existing directory the CSV files are appended to.
    """

    # Matches a process row of the plain ``nvidia-smi`` table, e.g.
    #   |    0   N/A  N/A   2426920      C   ...python      4792MiB |
    # The two columns after the GPU index are skipped; group 1 is the PID and
    # group 2 the used GPU memory in MiB. The type column may be C, G or C+G.
    _PROC_ROW_RE = re.compile(
        r"\|\s*\d+\s+(?:\S+\s+){2}(\d+)\s+(?:C\+G|[CG])\s+.*?\s+(\d+)MiB"
    )

    def __init__(self, delay, path):
        super().__init__()
        self.stopped = False
        self.delay = delay
        self.path = path
        # Lets stop() interrupt the pause between samples instead of having
        # to wait for a full interval to elapse.
        self._stop_event = Event()
        self.start()

    @staticmethod
    def _has_home_dir(username):
        """Return True when ``/home/<username>`` exists (filter for tracked users)."""
        return os.path.isdir(f"/home/{username}")

    @staticmethod
    def _parse_gpu_process_rows(smi_output):
        """Extract ``(pid, used_mib)`` pairs from the text printed by ``nvidia-smi``."""
        rows = []
        for line in smi_output.splitlines():
            # Process rows start with '|' and contain a MiB figure.
            if not line.startswith("|") or "MiB" not in line:
                continue
            match = Monitor._PROC_ROW_RE.search(line)
            if match:
                rows.append((int(match.group(1)), float(match.group(2))))
        return rows

    def _append_rows(self, filename, rows):
        """Append ``rows`` to the CSV file ``filename`` inside ``self.path``."""
        # newline='' is what the csv module asks for so it controls line endings.
        with open(f"{self.path}/{filename}", 'a', newline='') as f:
            csv.writer(f).writerows(rows)

    def get_cpu_usage_by_user(self):
        """Sum per-process CPU and memory percentages for each tracked user.

        Only users that have a directory under ``/home`` are included.

        Returns:
            A ``(user_cpu, user_mem)`` tuple of dicts mapping username to the
            summed ``cpu_percent`` and ``memory_percent`` of their processes.
        """
        user_cpu = defaultdict(float)
        user_mem = defaultdict(float)
        attrs = ["username", "cpu_percent", "memory_percent"]
        for proc in psutil.process_iter(attrs=attrs):
            info = proc.info
            username = info["username"]
            if not username or not self._has_home_dir(username):
                continue
            # process_iter() stores None for attributes it was denied access
            # to rather than raising, so guard the additions against None.
            user_cpu[username] += info["cpu_percent"] or 0.0
            user_mem[username] += info["memory_percent"] or 0.0
        return dict(user_cpu), dict(user_mem)

    def get_gpu_usage_by_user(self):
        """Sum GPU memory used by each tracked user's processes.

        Runs ``nvidia-smi``, maps every listed PID to its owner and keeps
        only users that have a directory under ``/home``.

        Returns:
            Dict mapping username to used GPU memory in GiB, rounded to two
            decimals. Empty when ``nvidia-smi`` is missing or exits non-zero.
        """
        try:
            result = subprocess.run(
                ["nvidia-smi"],
                stdout=subprocess.PIPE,
                stderr=subprocess.DEVNULL,
                text=True,
                check=True,
            )
        except (subprocess.CalledProcessError, OSError):
            # Best effort: a failing or absent nvidia-smi (OSError covers
            # FileNotFoundError) must not bring the monitor thread down.
            return {}

        user_gpu_mem = defaultdict(float)
        for pid, used_mib in self._parse_gpu_process_rows(result.stdout):
            try:
                username = psutil.Process(pid).username()
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # The process ended since nvidia-smi ran, or we may not inspect it.
                continue
            if self._has_home_dir(username):
                user_gpu_mem[username] += used_mib

        # Convert MiB -> GiB and round to 2 decimals.
        return {user: round(mib / 1024, 2) for user, mib in user_gpu_mem.items()}

    def run(self):
        """Sample usage and append it to the CSV files until stopped."""
        while not self.stopped:
            cpu_usage = psutil.cpu_percent()
            mem_usage = psutil.virtual_memory().percent
            print(f'CPU: {cpu_usage:.1f}%, RAM: {mem_usage:.1f}%')
            # Called for its console output; its return value was never used.
            GPUtil.showUtilization()
            gpus = GPUtil.getGPUs()
            current_time = datetime.now().strftime("%H:%M-%d-%m-%Y")

            for gpu in gpus:
                self._append_rows(
                    f"GPU_{gpu.id}.csv",
                    [[current_time, gpu.load * 100, gpu.memoryUtil * 100]],
                )
            self._append_rows("CPU.csv", [[current_time, cpu_usage, mem_usage]])

            # Per-user CPU and GPU usage
            user_cpu, user_mem = self.get_cpu_usage_by_user()
            user_gpu_mem = self.get_gpu_usage_by_user()
            all_users = set(user_cpu) | set(user_mem) | set(user_gpu_mem)
            # Sorted so the row order within one sample is deterministic.
            self._append_rows(
                "USAGE.csv",
                [
                    [
                        current_time,
                        user,
                        round(user_cpu.get(user, 0.0), 2),
                        round(user_mem.get(user, 0.0), 2),
                        round(user_gpu_mem.get(user, 0.0), 2),
                    ]
                    for user in sorted(all_users)
                ],
            )
            print("\n")
            # minutes to seconds; returns early as soon as stop() is called
            self._stop_event.wait(self.delay * 60)

    def stop(self):
        """Ask the thread to finish; wakes it up if it is pausing between samples."""
        self.stopped = True
        self._stop_event.set()
if __name__ == '__main__':
    # Command-line entry point: create a timestamped results directory, write
    # the CSV headers, then let a Monitor thread sample for the given period.
    parser = argparse.ArgumentParser()
    parser.add_argument('-p','--period', type=float, default=7, help="Specify the period (days) to monitor the server")
    parser.add_argument('-i','--interval', type=float, default=30, help="Specify the interval (mins) to monitor the server")
    parser.add_argument('--path', type=str, default='./results', help="Specify the path to write the results")
    args = parser.parse_args()

    gpus = GPUtil.getGPUs()
    current_time = datetime.now().strftime("%H:%M-%d-%m-%Y")
    results_dir = os.path.join(args.path, current_time)
    # makedirs also creates args.path when it is missing. It still raises
    # FileExistsError if a run directory for this minute already exists, so
    # an earlier run's files are never truncated by the header writes below.
    os.makedirs(results_dir)

    headers = ['Time','Util','Mem']
    for gpu in gpus:
        with open(f"{results_dir}/GPU_{gpu.id}.csv", 'w', newline='') as f:
            csv.writer(f).writerow(headers)
    with open(f"{results_dir}/CPU.csv", 'w', newline='') as f:
        csv.writer(f).writerow(headers)
    with open(f"{results_dir}/USAGE.csv", 'w', newline='') as f:
        csv.writer(f).writerow(['Time', 'User', 'CPU_Util', 'CPU_Mem', 'GPU_Mem_GB'])

    # The Monitor thread starts itself in its constructor.
    monitor = Monitor(args.interval, results_dir)
    try:
        time.sleep(args.period*24*60*60)  # days to seconds
    finally:
        # Always signal the thread, even on Ctrl-C or an error; otherwise the
        # non-daemon monitor thread would keep the process alive.
        monitor.stop()
        monitor.join()