1. 安装MUNGE
安装MUNGE用于身份验证。集群中所有节点必须使用相同的munge.key,并且MUNGE的守护进程munged必须先于Slurm的守护进程启动。(由于我是在本地单节点测试,没有配置多个节点;多节点环境下可通过scp将/etc/munge/munge.key同步到各节点。)
sudo apt-get install munge # 安装munge
sudo /usr/sbin/create-munge-key # 生成munge密钥
2. 安装SLURM
sudo apt-get install slurm-llnl
3. 配置SLURM
进入/etc/slurm-llnl/目录下,创建slurm.conf,可按需自定义配置,示例如下:
ControlMachine=localhost
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
StateSaveLocation=/tmp
SlurmdSpoolDir=/tmp/slurmd
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurm-llnl/slurmctld.pid
SlurmdPidFile=/var/run/slurm-llnl/slurmd.pid
ProctrackType=proctrack/pgid
CacheGroups=0
ReturnToService=2
TaskPlugin=task/affinity
# default memory per node, in MB (DefMemPerNode is per node, not per core)
DefMemPerNode=1024
MaxJobCount=20
MinJobAge=180
# TIMERS
SlurmctldTimeout=120
SlurmdTimeout=120
InactiveLimit=0
KillWait=30
Waittime=0
# SCHEDULING
SchedulerType=sched/backfill
#SchedulerPort=7321
SelectType=select/cons_res
SelectTypeParameters=CR_CPU_Memory
FastSchedule=0
# LOGGING
SlurmctldDebug=3
#SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=3
#SlurmdLogFile=/var/log/slurmd.log
JobCompType=jobcomp/none
#JobCompLoc=
JobAcctGatherType=jobacct_gather/none
# COMPUTE NODES
NodeName=DEFAULT
PartitionName=DEFAULT MaxTime=INFINITE State=UP
# NODES
NodeName=localhost CPUs=1 RealMemory=1024
PartitionName=compute Nodes=ALL Default=YES Shared=YES
4. 启动MUNGE
systemctl start munge
systemctl status munge
systemctl enable munge
5. 查看slurmd检测到的本机硬件配置(输出可用于核对slurm.conf中的NodeName行)
slurmd -C
6. 开启slurmctld服务(计算节点上还需以同样方式启动slurmd,本例中计算节点即本机)
systemctl start slurmctld
systemctl status slurmctld
systemctl enable slurmctld
7. 测试命令
scontrol show nodes
sinfo
以上!
网友评论