Deploy ML training jobs and inference services on Vast.ai GPU cloud. Covers instance provisioning via the REST API, Docker image configuration, data transfer strategies, and automated deployment scripts for GPU workloads.
Requires the VASTAI_API_KEY environment variable and the Vast.ai CLI (install with `pip install vastai`).
set -euo pipefail
# Search for available GPUs: RTX 4090s with >95% machine reliability
# and >50 GB disk, sorted ascending by total $/hr, top 5 shown.
vastai search offers 'gpu_name=RTX_4090 reliability2>0.95 disk_space>50' \
-o 'dph_total' --limit 5

# Create an instance from the chosen offer.
# Quote $OFFER_ID so an unset/empty variable fails loudly under
# `set -u` instead of silently mangling the argument list.
vastai create instance "$OFFER_ID" \
--image pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime \
--disk 100 \
--onstart-cmd "cd /workspace && git clone https://github.com/myorg/project.git && pip install -r project/requirements.txt"
# Dockerfile.gpu
# Training image: PyTorch 2.1 runtime with CUDA 12.1 / cuDNN 8.
FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime
WORKDIR /workspace
# Copy requirements first so the pip layer is cached when only source changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy the project source last (invalidated most often).
COPY . .
# Default command; override config.yaml via `docker run ... --config other.yaml`.
CMD ["python", "train.py", "--config", "config.yaml"]
set -euo pipefail

# Build and push the training image
docker build -f Dockerfile.gpu -t myregistry/ml-trainer:latest .
docker push myregistry/ml-trainer:latest

# Deploy on Vast.ai.
# The --env string is passed through to Docker, so each variable needs
# its own -e flag; a bare "KEY=val KEY2=val2" list is not parsed.
# $OFFER_ID is quoted so an unset value fails under `set -u`.
vastai create instance "$OFFER_ID" \
--image myregistry/ml-trainer:latest \
--disk 100 \
--env "-e WANDB_API_KEY=$WANDB_API_KEY -e HF_TOKEN=$HF_TOKEN"
import requests
import json
import os
VASTAI_API = "https://cloud.vast.ai/api/v0"
API_KEY = os.environ["VASTAI_API_KEY"]
def deploy_training_job(gpu_type="RTX_4090", disk_gb=100):
    """Rent the cheapest matching Vast.ai offer and start a training instance.

    Args:
        gpu_type: Vast.ai GPU name filter, e.g. "RTX_4090" or "A100_SXM4".
        disk_gb: Disk space in GB — both the search minimum and the
            amount provisioned on the instance.

    Returns:
        Parsed JSON response from the provisioning call; on success it
        contains a "new_contract" key holding the new instance ID.

    Raises:
        ValueError: if no rentable offer matches the filters.
        requests.HTTPError: if either API call returns an error status.
    """
    # Find the cheapest matching offer (sorted ascending by $/hr, take 1).
    response = requests.get(
        f"{VASTAI_API}/bundles",
        params={
            "api_key": API_KEY,
            "q": json.dumps({
                "gpu_name": {"eq": gpu_type},
                "rentable": {"eq": True},
                "disk_space": {"gte": disk_gb},
                "reliability2": {"gte": 0.95},  # machine reliability score, 0-1
            }),
            "order": "dph_total",
            "limit": 1,
        },
        timeout=30,  # never hang indefinitely on a slow/unreachable API
    )
    # Fail with a clear HTTP error rather than a confusing KeyError below.
    response.raise_for_status()
    offers = response.json()["offers"]
    if not offers:
        raise ValueError(f"No {gpu_type} available")

    # Provision an instance on the selected offer.
    result = requests.put(
        f"{VASTAI_API}/asks/{offers[0]['id']}/",
        params={"api_key": API_KEY},
        json={
            "image": "myregistry/ml-trainer:latest",
            "disk": disk_gb,
            # Empty default keeps the payload valid when W&B is not configured.
            "env": {"WANDB_API_KEY": os.environ.get("WANDB_API_KEY", "")},
        },
        timeout=30,
    )
    result.raise_for_status()
    instance = result.json()
    print(f"Instance {instance['new_contract']} created at ${offers[0]['dph_total']}/hr")
    return instance
# List all running instances on this account
vastai show instances

# Check a specific instance's status (wait for "running" before SSH/scp).
# IDs are quoted so an unset variable fails loudly under `set -u`.
vastai show instance "$INSTANCE_ID"

# Download training results from the instance
vastai scp "$INSTANCE_ID:/workspace/output" ./results/

# Destroy the instance when done — billing continues until it is destroyed
vastai destroy instance "$INSTANCE_ID"
| Issue | Cause | Solution |
|---|---|---|
| No GPU available | High demand | Try different GPU type or region |
| Instance preempted | Outbid on spot | Use on-demand or increase bid |
| SSH connection refused | Instance still booting | Wait for running status |
| Out of disk | Large dataset | Increase --disk parameter |
# Show the three cheapest single-GPU A100 SXM4 offers
vastai search offers 'gpu_name=A100_SXM4 num_gpus=1' -o 'dph_total' --limit 3

# Rent the chosen offer with a 200 GB disk
vastai create instance "$BEST_OFFER" --image myregistry/trainer:latest --disk 200
For multi-environment setup, see vastai-multi-env-setup.