This page covers common issues and solutions when working with containers on the Euler cluster.
Cannot connect to the Docker daemon at unix:///var/run/docker.sock
Solution: Start the Docker daemon
sudo systemctl start docker
# or on macOS:
open -a Docker
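To confirm the daemon is actually reachable before rebuilding or exporting images, a quick check with the standard Docker CLI (no cluster-specific assumptions):
docker info                   # should print server details instead of a connection error
docker run --rm hello-world   # minimal end-to-end smoke test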
bash: apptainer: command not found
Solution: Install Apptainer (v1.2.5 recommended)
# Ubuntu/Debian
sudo apt-get update
sudo apt-get install -y apptainer
# From source (recommended when you need to pin an exact version)
wget https://github.com/apptainer/apptainer/releases/download/v1.2.5/apptainer-1.2.5.tar.gz
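The release tarball still has to be built and installed; a sketch of the remaining steps, assuming the standard Apptainer source build (requires Go and the usual build tools, see the Apptainer INSTALL documentation for the authoritative list):
tar -xzf apptainer-1.2.5.tar.gz
cd apptainer-1.2.5
./mconfig
make -C builddir
sudo make -C builddir install
apptainer --version   # should report 1.2.5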
ssh: connect to host euler.ethz.ch port 22: Connection timed out
Solution: Euler is only reachable from inside the ETH network, so connect to the ETH VPN first if you are working off campus. For outbound connections from Euler itself (e.g. pulling images on a node), load the proxy module:
module load eth_proxy
# If a large transfer was interrupted, resume it with rsync instead of restarting scp
rsync -avP --append-verify container.tar euler:/cluster/work/rsl/$USER/containers/
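To confirm the tarball arrived intact after a resumed transfer, compare checksums on both ends (plain coreutils; the paths match the examples above):
# On your workstation
sha256sum container.tar
# On Euler
sha256sum /cluster/work/rsl/$USER/containers/container.tar
# The two hashes must be identical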
scp: write: No space left on device
Solution: Check quotas and clean up
# Check your quotas
lquota
# Clean old containers
rm /cluster/work/rsl/$USER/containers/old-*.tar
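To see what is actually consuming the quota before deleting anything, a quick survey of the container directory (plain du/sort, no cluster-specific tools assumed):
du -sh /cluster/work/rsl/$USER/containers/* | sort -h | tail -20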
Job stays pending (PD) with reason (Resources):
JOBID  PARTITION  NAME       USER  ST  TIME  NODES  NODELIST(REASON)
12345  gpu.4h     container  user  PD  0:00  1      (Resources)
Solution: Check what the partitions currently offer and, if the short GPU partition is full, submit to a longer one:
# Show partition limits and available resources
sinfo -o "%P %a %l %D %G"
# In the job script, request a less contended partition
#SBATCH --partition=gpu.24h
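Slurm can also estimate when a pending job is expected to start, which helps decide whether to wait or resubmit to another partition (estimates are rough):
squeue -u $USER --start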
tar: my-app.sif: Cannot open: No such file or directory
Solution: Verify the container path
# List available containers
ls -la /cluster/work/rsl/$USER/containers/
# Check if extraction completed
ls -la $TMPDIR/
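If the tarball is present but extraction still fails, list its contents to confirm the .sif is actually inside and the archive is not truncated (assuming the my-app naming from the error above):
tar -tf /cluster/work/rsl/$USER/containers/my-app.tar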
CUDA available: False
Solution:
echo $CUDA_VISIBLE_DEVICES # Should show GPU ID
nvidia-smi # Should list GPUs
Also make sure the --nv flag is passed to every singularity command that needs GPU access:
singularity exec --nv container.sif nvidia-smi
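A quick end-to-end check that the framework inside the container sees the GPU (assumes PyTorch is installed in the image, as in the examples further down):
singularity exec --nv container.sif python3 -c "import torch; print(torch.cuda.is_available())"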
RuntimeError: CUDA out of memory
Solution: This error refers to GPU memory, not host RAM, so reduce the batch size or move to a GPU with more memory. Raising host memory only helps if the job is being killed by the system OOM killer instead:
#SBATCH --mem-per-cpu=8G
Monitor GPU memory from inside the job to see where it goes:
import torch
print(f"Allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
print(f"Reserved: {torch.cuda.memory_reserved()/1024**3:.2f} GB")
Permission denied: '/output/results.txt'
Solution: Create the output directory on permanent storage and make sure it is writable:
mkdir -p /cluster/project/rsl/$USER/output
chmod 755 /cluster/project/rsl/$USER/output
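If /output only exists inside the container, the host directory also has to be bound to it at run time; a sketch using the paths above (train.py is a placeholder for your entrypoint):
singularity exec --nv \
    --bind /cluster/project/rsl/$USER/output:/output \
    $TMPDIR/container.sif python3 train.py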
Jobs run slowly due to heavy network filesystem I/O. Solution: Always stage data to local scratch ($TMPDIR) and read it from there:
# Good - use $TMPDIR
cp /cluster/scratch/$USER/data.tar $TMPDIR/
tar -xf $TMPDIR/data.tar -C $TMPDIR/
# Bad - network I/O
tar -xf /cluster/scratch/$USER/data.tar -C /cluster/work/$USER/
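The same idea as a complete stage-in / compute / stage-out pattern inside a job script (a sketch; the train.py entrypoint and results path are placeholders):
# Stage in: copy once over the network, then read locally
cp /cluster/scratch/$USER/data.tar $TMPDIR/
tar -xf $TMPDIR/data.tar -C $TMPDIR/
# Compute against local scratch
singularity exec --nv $TMPDIR/container.sif python3 train.py --data $TMPDIR/dataset
# Stage out: copy only the results back to permanent storage
cp -r $TMPDIR/results /cluster/project/rsl/$USER/output/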
Start an interactive session:
# Request resources
srun --gpus=1 --mem=16G --tmp=50G --time=1:00:00 --pty bash
# Extract container
tar -xf /cluster/work/rsl/$USER/containers/debug.tar -C $TMPDIR
# Enter container interactively
singularity shell --nv $TMPDIR/debug.sif
# Test commands manually
python3 -c "import torch; print(torch.cuda.is_available())"
Add debugging to job scripts:
#!/bin/bash
#SBATCH --job-name=debug-job
# Enable bash debugging
set -x
# Print environment
echo "=== Environment ==="
env | grep -E "(CUDA|SINGULARITY|SLURM)" | sort
# Check allocations
echo "=== Allocations ==="
echo "GPUs: $CUDA_VISIBLE_DEVICES"
echo "CPUs: $SLURM_CPUS_PER_TASK"
echo "Memory: $SLURM_MEM_PER_CPU MB per CPU"
echo "Tmp space: $(df -h $TMPDIR | tail -1)"
# Time each step
echo "=== Extraction ==="
time tar -xf container.tar -C $TMPDIR
echo "=== Container Info ==="
singularity inspect $TMPDIR/container.sif
# Check job details
scontrol show job $SLURM_JOB_ID
# Monitor resource usage of a running job (replace $SLURM_JOB_ID with the job ID when run from a login node)
watch -n 1 'sstat -j $SLURM_JOB_ID --format=JobID,MaxRSS,MaxDiskRead,MaxDiskWrite'
# Check GPU usage on the node running the job; find the node name with: squeue -j <jobid> -o %N
ssh $NODE 'nvidia-smi -l 1'
# View detailed job info after completion
sacct -j $SLURM_JOB_ID --format=JobID,JobName,Partition,State,ExitCode,Elapsed,MaxRSS,AllocGRES
# Check why job failed
scontrol show job $SLURM_JOB_ID | grep -E "(Reason|ExitCode)"
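The job's stdout/stderr file usually contains the most direct evidence; unless the script sets --output, Slurm writes it to slurm-<jobid>.out in the submission directory (<jobid> is a placeholder for your job ID):
grep -iE "error|killed|out of memory" slurm-<jobid>.out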
Reduce container size:
# Multi-stage build
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS builder
RUN apt-get update && apt-get install -y build-essential
# Build steps...
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
COPY --from=builder /app/bin /app/bin
# Minimal runtime dependencies only
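After trimming the image, it still has to be converted to a SIF and tarred for transfer; a sketch assuming a local Docker daemon holds the freshly built image my-app:latest:
docker build -t my-app:latest .
apptainer build my-app.sif docker-daemon://my-app:latest
tar -cf my-app.tar my-app.sif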
# Use local scratch for datasets
import os
import shutil

# Copy dataset to local scratch at job start
if os.environ.get('SLURM_JOB_ID'):
    local_data = f"{os.environ['TMPDIR']}/dataset"
    if not os.path.exists(local_data):
        shutil.copytree('/cluster/scratch/user/dataset', local_data)
    data_path = local_data
else:
    data_path = './dataset'
# Use multiple workers for data loading
from torch.utils.data import DataLoader

dataloader = DataLoader(dataset,
                        batch_size=32,
                        num_workers=8,  # Match CPU count
                        pin_memory=True,
                        persistent_workers=True)
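Rather than hard-coding the worker count, it can be derived from the Slurm allocation so the loader always matches the CPUs the job actually got (a small sketch; the fallback of 4 outside Slurm is arbitrary):
import os
from torch.utils.data import DataLoader

num_workers = int(os.environ.get('SLURM_CPUS_PER_TASK', 4))

dataloader = DataLoader(dataset,
                        batch_size=32,
                        num_workers=num_workers,
                        pin_memory=True,
                        persistent_workers=num_workers > 0)  # persistent workers need at least one worker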