HPC Quick Notes
misc., misc., 2025
Some useful HPC commands
Check CPU usage - live view
# Interactive, full-screen process viewer.
htop
# Live view limited to processes owned by the current user.
top -u "$USER"
# Live view of a single process; replace 78309 with the PID you want to watch.
top -p 78309
Top 10 CPU-consuming tasks (11 lines including the header), or one-off checks:
# Lists the current user's processes sorted by CPU usage, highest first;
# head keeps the header line plus the first 10 processes.
ps -u "$USER" -o pid,ppid,%cpu,%mem,etime,cmd --sort=-%cpu | head -n 11
- Basic status snapshot:
# One-shot status of a single process; replace 78309 with the target PID.
ps -p 78309 -o pid,ppid,sid,tty,stat,%cpu,%mem,etime,cmd
Download and Unzip
- download an archive (here a gzipped tarball, .tar.gz):
# Saves the file into the current directory under its remote name.
wget https://www2.mmm.ucar.edu/projects/mpas/real/cfsr.2010102300.tar.gz
- extract the archive (it is a .tar.gz, so use tar rather than unzip):
tar xf cfsr.2010102300.tar.gz
Close VS Code and go home without the task being stopped
# Start the download detached from the terminal so it keeps running after logout.
# The log redirection fails if logs/ does not exist, so create it first.
mkdir -p logs
nohup "$(command -v python)" -u download_era5_pressure_levels.py > "logs/era5_$(date +%F_%H%M).log" 2>&1 &
# $! is the PID of the job just put in the background; keep it for later checks.
echo $! > era5.pid
# Remove the job from the shell's job table so the shell does not HUP it on exit.
disown
Another example
# Make the driver script executable, then launch it detached from the terminal
# with a timestamped log file and a matching PID file next to it.
chmod +x blocking_clim_detect.sh

LOGDIR="/net/flood/data/users/x_yan/isobaric_era5/tracking_tmppp"
mkdir -p "$LOGDIR"
LOG="$LOGDIR/blocking_clim_detect_$(date +%F_%H%M%S).log"

# The job runs in a login shell (-l); stdout and stderr both go to $LOG.
nohup bash -lc '
source ~/.bashrc
source activate tempest_extreme_new
export OMP_NUM_THREADS=1
./blocking_clim_detect.sh
' >"$LOG" 2>&1 &
# Record the PID of the background job, then drop it from the job table.
echo $! >"${LOG}.pid" && disown
To watch progress
# Follow the log as it is written (Ctrl-C stops following, not the job).
tail -f "$LOG"
# or only show warnings/errors
# (grep -E instead of egrep, which GNU grep has marked obsolescent):
tail -f "$LOG" | grep -Ei 'error|warn|fail|missing'
# Check if running
ps -fp "$(cat "${LOG}.pid")"
To stop
# Read back the recorded PID and look up the process group it belongs to.
PID=$(<"${LOG}.pid")
PGID=$(ps -o pgid= -p "$PID" | tr -d ' ')
# TERM first, then KILL if needed:
# signal the whole group; if that fails, fall back to the single PID.
if ! kill -TERM "-$PGID" 2>/dev/null; then
  kill -TERM "$PID"
fi
# Grace period, then force-kill whatever is left of the group (errors ignored).
sleep 5
kill -KILL "-$PGID" 2>/dev/null || :
Another way is to submit a batch job
To see resource name
# Columns: partition, time limit, node count, node list, features, generic resources (GRES).
sinfo -o "%P %l %D %N %f %G"
Sample blocking_clim_detect.sbatch
#!/usr/bin/env bash
#SBATCH -J blocking_clim_detect
#SBATCH -o /net/flood/data/users/x_yan/isobaric_era5/tracking_tmppp/slurm-%j.out
#SBATCH -e /net/flood/data/users/x_yan/isobaric_era5/tracking_tmppp/slurm-%j.err
#SBATCH -t 24:00:00 # walltime
#SBATCH -p standard # partition/queue name
#SBATCH --cpus-per-task=16 # match the NPROC
#SBATCH --mem=32G # adjust
#SBATCH --mail-type=END,FAIL # optional
#SBATCH --mail-user=x_yan@mit.edu
# Batch job: load the conda environment, then run blocking_clim_detect.sh
# at reduced CPU and I/O priority.

# errexit and pipefail are on from the start. nounset (-u) is switched on only
# after the environment is loaded: rc files and conda activation scripts
# commonly read unset variables, which would abort the job under -u.
set -eo pipefail
source ~/.bashrc
source activate tempest_extreme_new
set -u

# Let the script pick up SLURM's CPU count automatically
# (falls back to 16 when SLURM_CPUS_PER_TASK is unset or empty).
export OMP_NUM_THREADS=1
export NPROC=${SLURM_CPUS_PER_TASK:-16}
# Optionally make I/O nicer to shared filesystems
export IONICE="ionice -c2 -n7"
export NICE="nice -n 10"
# Run
# NICE and IONICE each hold a command plus its options, so they are left
# unquoted on purpose to be split into separate words.
# shellcheck disable=SC2086
$NICE $IONICE ./blocking_clim_detect.sh