feat(inference): Init commit for inference llm using python llama ggml

parent b87564fba9
commit 6ef61c45ec

.gitignore (vendored, new file, +3)
@@ -0,0 +1,3 @@
# Jan inference
# jan-inference/llm/models/**
jan-inference/llm/.env
jan-inference/llm/.env.example (new file, +2)
@@ -0,0 +1,2 @@
MODEL_URL=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_1.bin
MODEL_FILE=$(basename $MODEL_URL)
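Note that Docker Compose does not perform command substitution when reading a `.env` file, so the `MODEL_FILE=$(basename $MODEL_URL)` line stays literal; the compose services below recompute the file name themselves. A minimal shell sketch of that derivation:

```bash
# Derive the model file name from MODEL_URL, much as the downloader service does.
MODEL_URL=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_1.bin
MODEL_FILE=$(basename "$MODEL_URL")
echo "$MODEL_FILE"   # llama-2-7b-chat.ggmlv3.q4_1.bin
```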
jan-inference/llm/README.md (new file, +8)
@@ -0,0 +1,8 @@
# Inference - LLM

```bash
docker network create traefik_public
cp .env.example .env
# -> Update MODEL_URL in the `.env` file
docker compose up -d --scale llm=2
```
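Once the stack is up, the `llm` replicas sit behind Traefik under the `/inference/llm` path prefix (see the compose file below). A usage sketch, assuming the `ghcr.io/abetlen/llama-cpp-python` image serves its OpenAI-compatible completions endpoint at `/v1/completions` on port 8000 (paths can vary between image versions):

```bash
# Query the model through Traefik on port 80; the strip-llm-prefix middleware
# removes /inference/llm before the request reaches the llama-cpp-python server.
curl http://localhost/inference/llm/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Hello, how are you?", "max_tokens": 32}'
```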
jan-inference/llm/docker-compose.yml (new file, +94)
@@ -0,0 +1,94 @@
version: '3'

services:

  # Service to download a model file.
  downloader:
    image: busybox
    # The command extracts the model filename from MODEL_URL and downloads it if it doesn't exist.
    # ($$ keeps the dollar sign for the container shell instead of Compose interpolation.)
    command: /bin/sh -c "MODEL_FILE=$$(basename ${MODEL_URL}); if [ ! -f /models/$$MODEL_FILE ]; then wget -O /models/$$MODEL_FILE ${MODEL_URL}; fi"
    # Mount a local directory to store the downloaded model.
    volumes:
      - ./models:/models

  # Service to wait for the downloader service to finish downloading the model.
  wait-for-downloader:
    image: busybox
    # The command waits until the model file (specified in MODEL_URL) exists.
    command: /bin/sh -c "MODEL_FILE=$$(basename ${MODEL_URL}); echo 'Waiting for downloader to finish'; while [ ! -f /models/$$MODEL_FILE ]; do sleep 1; done; echo 'Model downloaded!'"
    # Specifies that this service should start after the downloader service has started.
    depends_on:
      downloader:
        condition: service_started
    # Mount the same local directory to check for the downloaded model.
    volumes:
      - ./models:/models

  # Service to run the Llama web application.
  llm:
    image: ghcr.io/abetlen/llama-cpp-python:latest
    # Mount the directory that contains the downloaded model.
    volumes:
      - ./models:/models
    environment:
      # Specify the path to the model for the web application.
      MODEL: /models/llama-2-7b-chat.ggmlv3.q4_1.bin
    labels:
      # Explicitly enable Traefik for this container (exposedbydefault is false).
      - "traefik.enable=true"
      # Instead of using the Host rule, set a PathPrefix rule
      - "traefik.http.routers.web.rule=PathPrefix(`/inference/llm`)"
      # This tells Traefik where to forward the traffic for this route.
      - "traefik.http.routers.web.service=llm-service"
      # Define a service for the llm and specify its load balancer configuration
      - "traefik.http.services.llm-service.loadbalancer.server.port=8000"
      # Strip the /inference/llm prefix before forwarding requests to the container.
      - "traefik.http.middlewares.strip-llm-prefix.stripprefix.prefixes=/inference/llm"
      - "traefik.http.routers.web.middlewares=strip-llm-prefix"
    # Health check configuration
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:8000"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    # Restart policy configuration
    restart: on-failure
    # Specifies that this service should start only after wait-for-downloader has completed successfully.
    depends_on:
      wait-for-downloader:
        condition: service_completed_successfully
    # Connect this service to two networks: inference_net and traefik_public.
    networks:
      - inference_net
      - traefik_public

  # Service for Traefik, a modern HTTP reverse proxy and load balancer.
  traefik:
    image: traefik:v2.5
    command:
      # Enable the Traefik API dashboard without TLS (not recommended for production).
      - "--api.insecure=true"
      # Enable Traefik to use Docker as a provider.
      - "--providers.docker=true"
      # Do not expose services by default. Explicitly specify in each service if it should be exposed.
      - "--providers.docker.exposedbydefault=false"
      # Specify the default entry point on port 80.
      - "--entrypoints.web.address=:80"
    ports:
      # Map port 80 in the container to port 80 on the host.
      - "80:80"
      # Map port 8080 in the container (Traefik's dashboard) to port 8080 on the host.
      - "8080:8080"
    # Mount the Docker socket to allow Traefik to listen to Docker's API.
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    # Connect this service to the traefik_public network.
    networks:
      - traefik_public

# Define networks used in this docker-compose file.
networks:
  # Network for the llm service (used for inference).
  inference_net:
  # Public-facing network that Traefik uses. Marked as external to indicate it may be defined outside this file.
  traefik_public:
    external: true
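Because `--api.insecure=true` is set, Traefik's dashboard and API are exposed on port 8080, which is a quick way to confirm that the router, service, and middleware defined above were picked up. A small verification sketch (the `/api/http/...` endpoints are the Traefik v2 API; adjust if your version differs):

```bash
# List the HTTP routers and services Traefik discovered from the Docker provider.
curl -s http://localhost:8080/api/http/routers
curl -s http://localhost:8080/api/http/services

# The dashboard itself is served at http://localhost:8080/dashboard/
```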
jan-inference/llm/models/.gitkeep (new file, empty)