This page provides a detailed, tested workflow for deploying containerized applications on the Euler cluster.
The workflow consists of four main steps:
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04

# Install system dependencies.
# --no-install-recommends keeps the image lean; the apt cache is removed
# in the same layer so it never ends up baked into the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 \
    python3-pip \
    git \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Install Python packages.
# torch is pinned and pulled from the cu118 wheel index so it matches the
# base image's CUDA 11.8 runtime.
# NOTE(review): numpy/scipy/matplotlib are unpinned — pin exact versions
# for reproducible builds.
RUN pip3 install --no-cache-dir \
    torch==2.0.1 --index-url https://download.pytorch.org/whl/cu118 \
    numpy \
    scipy \
    matplotlib

# Working directory for subsequent COPY/RUN and for containers at runtime.
WORKDIR /workspace

# Copy application code (add a .dockerignore to keep the build context small).
COPY . .

# Everything runs through python3: arguments given to `docker run`
# (e.g. `--version` or a script path) are passed to the interpreter.
ENTRYPOINT ["python3"]
# Build the image
docker build -t my-app:latest .
# Test locally (optional)
docker run --rm -it my-app:latest --version
# Build stage: the heavy 'devel' image provides compilers and CUDA headers
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS builder
RUN apt-get update && apt-get install -y build-essential
# ... compile dependencies ...
# Runtime stage: the slim 'runtime' image; only the compiled artifacts are
# copied across, so build tooling never ships in the final image
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
COPY --from=builder /compiled/libs /usr/local/lib
# ... rest of the Dockerfile ...
Ensure Apptainer is installed:
apptainer --version # Should show 1.2.5 or compatible
# Create directory for exports
mkdir -p container-exports
cd container-exports
# Convert the local Docker image into a single-file SIF image.
# Do NOT pass --sandbox here: --sandbox produces a *directory* tree, which
# contradicts the .sif name; a plain build yields a genuine .sif file
# (matching the automated build script later in this page).
# APPTAINER_NOHTTPS only affects registry pulls; harmless for docker-daemon.
APPTAINER_NOHTTPS=1 apptainer build --fakeroot \
  my-app.sif docker-daemon://my-app:latest
# Compress for transfer (required for efficient copying)
tar -czf my-app.tar.gz my-app.sif
Timing: for an 8GB container, expect the conversion and compression to take several minutes; measured figures are listed in the performance section below.
First, set up your directories on Euler:
# Create the directory layout on Euler. The quoted 'EOF' delimiter prevents
# local expansion, so $USER is expanded *remotely* (your Euler username).
ssh euler << 'EOF'
mkdir -p /cluster/work/rsl/$USER/containers
mkdir -p /cluster/project/rsl/$USER/results
mkdir -p /cluster/scratch/$USER/datasets
EOF
# Transfer compressed container
# NOTE(review): here $USER expands *locally* — if your local username
# differs from your Euler username, write the remote path explicitly.
scp container-exports/my-app.tar.gz \
euler:/cluster/work/rsl/$USER/containers/
# For large transfers, use rsync with resume capability
# (-a archive, -v verbose, -P = --partial --progress: resumes interrupted copies)
rsync -avP container-exports/my-app.tar.gz \
euler:/cluster/work/rsl/$USER/containers/
Create `job_script.sh`:
#!/bin/bash
#SBATCH --job-name=my-container-job
#SBATCH --output=logs/job_%j.out
#SBATCH --error=logs/job_%j.err
#SBATCH --time=24:00:00
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mem-per-cpu=4G
#SBATCH --gpus=1
#SBATCH --tmp=100G

# Fail fast: abort on errors, unset variables, and mid-pipeline failures.
set -euo pipefail

# NOTE(review): logs/ must exist *before* submission or Slurm silently
# drops the output/error files — run `mkdir -p logs` once before sbatch.

# Load modules (eth_proxy enables outbound HTTP from compute nodes)
module load eth_proxy

# Job info
echo "Job started on $(hostname) at $(date)"
echo "Job ID: $SLURM_JOB_ID"
echo "GPU: ${CUDA_VISIBLE_DEVICES:-none}"

# Extract container to node-local scratch (CRITICAL for performance:
# seconds to $TMPDIR vs minutes to network storage — see benchmarks below).
echo "Extracting container..."
time tar -xzf "/cluster/work/rsl/$USER/containers/my-app.tar.gz" -C "$TMPDIR"

# Per-job results directory on persistent project storage.
RESULTS_DIR="/cluster/project/rsl/$USER/results/$SLURM_JOB_ID"
mkdir -p "$RESULTS_DIR"

# Run container.
#   --nv        expose the host NVIDIA driver/GPU inside the container
#   --bind ...  mount results read-write and the dataset read-only
echo "Running application..."
time singularity exec \
  --nv \
  --bind "$RESULTS_DIR":/output \
  --bind "/cluster/scratch/$USER":/data:ro \
  "$TMPDIR/my-app.sif" \
  python3 /workspace/main.py \
  --data-dir /data \
  --output-dir /output

echo "Job completed at $(date)"
For parallel GPU training:
#!/bin/bash
#SBATCH --job-name=multi-gpu-training
#SBATCH --output=logs/job_%j.out
#SBATCH --error=logs/job_%j.err
#SBATCH --time=72:00:00
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --mem-per-cpu=8G
#SBATCH --gpus=4
#SBATCH --tmp=200G

# Fail fast on errors, unset variables, and pipeline failures.
set -euo pipefail

module load eth_proxy

# Extract container to fast node-local scratch
tar -xzf "/cluster/work/rsl/$USER/containers/my-app.tar.gz" -C "$TMPDIR"

# Run distributed training: torch.distributed.run spawns one worker
# process per GPU (4 here, matching --gpus=4).
# NOTE(review): train.py is resolved relative to the container's working
# directory (/workspace in the example image) — use an absolute path if unsure.
singularity exec \
  --nv \
  --bind "/cluster/project/rsl/$USER/checkpoints":/checkpoints \
  --bind "/cluster/scratch/$USER/datasets":/data:ro \
  "$TMPDIR/my-app.sif" \
  python3 -m torch.distributed.run \
  --nproc_per_node=4 \
  train.py --distributed
For debugging and development:
# Request interactive session
srun --gpus=1 --mem=32G --tmp=50G --pty bash
# Extract and run container interactively
tar -xzf /cluster/work/rsl/$USER/containers/my-app.tar.gz -C $TMPDIR
singularity shell --nv $TMPDIR/my-app.sif
| Data Type     | Location                              | Purpose                  |
|---------------|---------------------------------------|--------------------------|
| Containers    | /cluster/work/rsl/$USER/containers/   | Long-term storage        |
| Results       | /cluster/project/rsl/$USER/results/   | Persistent outputs       |
| Datasets      | /cluster/scratch/$USER/               | Large data, auto-cleaned |
| Working files | $TMPDIR                               | Fast local scratch       |
From our testing with 8GB containers:
Container extraction to $TMPDIR: 10-15 seconds
Container extraction to /cluster/work: 2-5 minutes (avoid!)
Container startup overhead: ~2 seconds
GPU initialization: <1 second
Always extract containers to `$TMPDIR` (node-local scratch), never to network storage:
# Good - fast local storage
tar -xzf /cluster/work/.../container.tar.gz -C $TMPDIR
# Bad - slow network storage
tar -xzf container.tar.gz -C /cluster/work/...
# Copy frequently accessed data to $TMPDIR
cp -r /cluster/scratch/$USER/dataset $TMPDIR/
# ...or bind-mount it read-only straight into the container:
--bind /cluster/scratch/$USER/data:/data:ro
# In another terminal while job runs
ssh euler squeue -j $JOBID
# $NODE is the compute node reported by squeue; nvidia-smi shows GPU usage
ssh $NODE nvidia-smi
**Container extraction is very slow** — verify you are extracting to `$TMPDIR`, not network storage, and that it has enough free space (`df -h $TMPDIR`). If extraction is still the bottleneck, archive without compression (`tar -cf` instead of `tar -czf`) to trade transfer size for extraction speed.
GPU not detected in container
# Check if --nv flag is present (it injects the host NVIDIA driver)
# Verify CUDA versions match between host driver and container runtime
singularity exec --nv container.sif nvidia-smi
Permission denied errors
# Build with fakeroot (unprivileged builds need user-namespace emulation)
apptainer build --fakeroot ...
# Ensure directories exist and are writable before binding them
mkdir -p /cluster/project/rsl/$USER/results
Out of memory during extraction
# NOTE(review): this usually manifests as "No space left on device" on
# $TMPDIR — request more local scratch at submission time:
#SBATCH --tmp=200G # Increase local scratch
# Check job details
scontrol show job $SLURM_JOB_ID
# Monitor job resource usage
sstat -j $SLURM_JOB_ID
# View node status
sinfo -N -l
# Check your disk quotas
lquota
For frequently used containers, consider keeping extracted versions:
# One-time extraction (use project for extracted containers)
tar -xzf /cluster/work/rsl/$USER/containers/container.tar.gz \
-C /cluster/project/rsl/$USER/containers/extracted/
# In job script, just copy the ready-made image — skips per-job decompression
cp -r /cluster/project/rsl/$USER/containers/extracted/my-app.sif $TMPDIR/
Example automation script:
#!/bin/bash
# build_and_deploy.sh — build the Docker image, convert it to a SIF,
# ship it to Euler, and submit a job generated from a template.
set -euo pipefail

readonly IMAGE_NAME="my-app"
VERSION=$(date +%Y%m%d-%H%M%S)
readonly VERSION

# Build
docker build -t "${IMAGE_NAME}:${VERSION}" .

# Convert the local Docker image to a single-file Singularity image
apptainer build --fakeroot "${IMAGE_NAME}-${VERSION}.sif" \
  "docker-daemon://${IMAGE_NAME}:${VERSION}"

# Compress and transfer
# NOTE(review): $USER expands locally here — spell out the remote path if
# your local and Euler usernames differ.
tar -czf "${IMAGE_NAME}-${VERSION}.tar.gz" "${IMAGE_NAME}-${VERSION}.sif"
scp "${IMAGE_NAME}-${VERSION}.tar.gz" "euler:/cluster/work/rsl/$USER/containers/"

# Create job script from template.
# NOTE(review): this substitutes *every* literal "VERSION" in the template;
# prefer a distinctive placeholder such as @VERSION@ to avoid accidental hits.
sed "s/VERSION/${VERSION}/g" job_template.sh > "job_${VERSION}.sh"

# Submit
ssh euler "cd /cluster/work/rsl/$USER && sbatch job_${VERSION}.sh"
Complete test files for the container workflow:
To test the complete workflow:
# 1. Build Docker image locally
docker build -t euler-test:latest -f Dockerfile .
# 2. Convert to a single-file Singularity image.
#    Do NOT pass --sandbox: it creates a directory tree, not a .sif file,
#    contradicting the .sif name used below.
apptainer build --fakeroot euler-test.sif docker-daemon://euler-test:latest
# 3. Compress and transfer
tar -czf euler-test.tar.gz euler-test.sif
scp euler-test.tar.gz euler:/cluster/work/rsl/$USER/containers/
# 4. Submit test job (log in to Euler, then run sbatch in that remote shell
#    from the directory containing the job script)
ssh euler
sbatch test_job_project.sh
Back to Home | View Scripts | Troubleshooting |