version: '3'

services:
  # Service to download a model file.
  downloader:
    image: busybox
    # Extract the model filename from MODEL_URL and download it if it doesn't
    # exist yet. Shell variables are written with $$ so Compose doesn't try to
    # interpolate them itself; ${MODEL_URL} is interpolated by Compose from
    # the environment.
    command: /bin/sh -c "LLM_MODEL_FILE=$$(basename ${MODEL_URL}); if [ ! -f /models/$$LLM_MODEL_FILE ]; then wget -O /models/$$LLM_MODEL_FILE ${MODEL_URL}; fi"
    # Mount a local directory to store the downloaded model.
    volumes:
      - ./models:/models
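
  # MODEL_URL must be provided by the shell or by an .env file next to this
  # compose file (Compose reads .env automatically). A minimal sketch, with a
  # placeholder URL; point it at wherever your model is actually hosted:
  #
  #   MODEL_URL=https://example.com/models/llama-2-7b-chat.ggmlv3.q4_1.bin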

  # Service to wait for the downloader service to finish downloading the model.
  wait-for-downloader:
    image: busybox
    # Wait until the model file (derived from MODEL_URL) exists. As above,
    # $$ escapes the shell variable from Compose interpolation.
    command: /bin/sh -c "LLM_MODEL_FILE=$$(basename ${MODEL_URL}); echo 'Waiting for downloader to finish'; while [ ! -f /models/$$LLM_MODEL_FILE ]; do sleep 1; done; echo 'Model downloaded!'"
    # Start this service after the downloader service has started.
    depends_on:
      downloader:
        condition: service_started
    # Mount the same local directory to check for the downloaded model.
    volumes:
      - ./models:/models
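
  # To follow the wait loop from the host (standard Compose CLI), something
  # like this should show the echo output above:
  #
  #   docker compose logs -f wait-for-downloader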

  # Service to run the Llama web application.
  llm:
    image: ghcr.io/abetlen/llama-cpp-python:latest
    # Mount the directory that contains the downloaded model.
    volumes:
      - ./models:/models
    ports:
      - "8000:8000"
    environment:
      # Specify the path to the model for the web application.
      MODEL: /models/llama-2-7b-chat.ggmlv3.q4_1.bin
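      # Note: this filename must match the basename of MODEL_URL, or the
      # server won't find the downloaded file. Keeping the two in sync by
      # hand is assumed here; Compose can't derive one from the other.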
      PYTHONUNBUFFERED: "1"
    # Health check configuration (commented out; enable it to let Compose
    # track whether the server is actually responding).
    # healthcheck:
    #   test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:8000"]
    #   interval: 30s
    #   timeout: 10s
    #   retries: 3
    #   start_period: 30s
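    # If the healthcheck is enabled, a downstream service could gate on it
    # with depends_on: { llm: { condition: service_healthy } }; a sketch only,
    # since nothing in this file currently depends on llm.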
    # Restart policy configuration
    restart: on-failure
    # Start this service only after wait-for-downloader has completed successfully.
    depends_on:
      wait-for-downloader:
        condition: service_completed_successfully
    # Connect this service to the public-facing traefik_public network.
    networks:
      - traefik_public

# Define networks used in this docker-compose file.
networks:
  # Public-facing network that Traefik uses. Marked as external, so Compose
  # won't create it; it must already exist before this stack starts.
  traefik_public:
    external: true
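
# Usage sketch (assumes Compose V2; the URL below is a placeholder, swap in
# your real model link). The external network must exist first:
#
#   docker network create traefik_public
#   MODEL_URL=https://example.com/models/llama-2-7b-chat.ggmlv3.q4_1.bin \
#     docker compose up -d
#
# Once the llm service is up it should answer on port 8000; llama-cpp-python
# serves an OpenAI-compatible API, so for example:
#
#   curl http://localhost:8000/v1/models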