Merge pull request #18 from janhq/fix_inference

Fix inference service - ggml (LLM + SD)
2023-08-30 16:45:25 +07:00 · 2023-08-30 16:45:25 +07:00 · b61536aa67
commit b61536aa67
parent 914e7663dd 858c872b1b
10 changed files with 101 additions and 229 deletions
--- a/README.md
+++ b/README.md
@ -52,6 +52,15 @@ Jan is a free, source-available and [fair code licensed](https://faircode.io/) A

 Jan offers an [Docker Compose](https://docs.docker.com/compose/) deployment that automates the setup process.

+```bash
+# Download models
+# Runway SD 1.5
+wget https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors -P jan-inference/sd/models
+
+# Download LLM
+wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_1.bin -P jan-inference/llm/models
+```
+
 Run the following command to start all the services defined in the `docker-compose.yml`

 ```shell
@ -102,14 +111,4 @@ Jan is a monorepo that pulls in the following submodules

 ## Live Demo

-You can access the live demo at https://cloud.jan.ai.
-
-## Common Issues and Troubleshooting
-
-**Error in `jan-inference` service** ![](images/download-model-error.png)
-
- Error: download model incomplete
- Solution:
-  - Manually download the LLM model using the URL specified in the environment variable `MODEL_URL` within the `.env` file. The URL is typically https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_1.bin
-  - Copy the downloaded file `llama-2-7b-chat.ggmlv3.q4_1.bin` to the folder `jan-inference/llm/models`
-  - Run `docker compose down` followed by `docker compose up -d` again to restart the services.
+You can access the live demo at https://cloud.jan.ai.
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -125,7 +125,6 @@ services:
      timeout: 10s
      retries: 5
      start_period: 5s
-
    networks:
      jan_community:
        ipv4_address: 172.20.0.14
@ -152,39 +151,9 @@ services:
      jan_community:
        ipv4_address: 172.20.0.15

-  # Service to download a model file.
-  downloader:
-    image: busybox
-    # The command extracts the model filename from MODEL_URL and downloads it if it doesn't exist.
-    command: /bin/sh -c "LLM_MODEL_FILE=$(basename ${MODEL_URL}); if [ ! -f /models/$LLM_MODEL_FILE ]; then wget -O /models/$LLM_MODEL_FILE ${MODEL_URL}; fi"
-    # Mount a local directory to store the downloaded model.
-    volumes:
-      - ./jan-inference/llm/models:/models
-
-    networks:
-      jan_community:
-        ipv4_address: 172.20.0.16
-
-  # Service to wait for the downloader service to finish downloading the model.
-  wait-for-downloader:
-    image: busybox
-    # The command waits until the model file (specified in MODEL_URL) exists.
-    command: /bin/sh -c "LLM_MODEL_FILE=$(basename ${MODEL_URL}); echo 'Waiting for downloader to finish'; while [ ! -f /models/$LLM_MODEL_FILE ]; do sleep 1; done; echo 'Model downloaded!'"
-    # Specifies that this service should start after the downloader service has started.
-    depends_on:
-      downloader:
-        condition: service_started
-    # Mount the same local directory to check for the downloaded model.
-    volumes:
-      - ./jan-inference/llm/models:/models
-
-    networks:
-      jan_community:
-        ipv4_address: 172.20.0.17
-
  # Service to run the Llama web application.
  llm:
-    image: ghcr.io/abetlen/llama-cpp-python:latest
+    image: ghcr.io/abetlen/llama-cpp-python@sha256:b6d21ff8c4d9baad65e1fa741a0f8c898d68735fff3f3cd777e3f0c6a1839dd4
    # Mount the directory that contains the downloaded model.
    volumes:
      - ./jan-inference/llm/models:/models
@ -192,20 +161,74 @@ services:
      - 8000:8000
    environment:
      # Specify the path to the model for the web application.
-      MODEL: /models/llama-2-7b-chat.ggmlv3.q4_1.bin
+      MODEL: /models/${LLM_MODEL_FILE}
      PYTHONUNBUFFERED: 1
    # Restart policy configuration
    restart: on-failure
    # Specifies that this service should start only after wait-for-downloader has completed successfully.
-    depends_on:
-      wait-for-downloader:
-        condition: service_completed_successfully
-    # Connect this service to two networks: inference_net and traefik_public.
-
    networks:
      jan_community:
        ipv4_address: 172.20.0.18

+  sd-downloader:
+    build:
+      context: ./jan-inference/sd/
+      dockerfile: compile.Dockerfile
+    # Converts the SD_MODEL_FILE checkpoint in /models to a q4_0 .bin, skipping the work when that exact output file already exists.
+    command: /bin/sh -c "if [ ! -f /models/${SD_MODEL_FILE}.q4_0.bin ]; then python /sd.cpp/sd_cpp/models/convert.py  --out_type q4_0 --out_file /models/${SD_MODEL_FILE}.q4_0.bin  /models/${SD_MODEL_FILE}; fi"
+    # Mount the local directory that holds the downloaded checkpoint and receives the converted model.
+    volumes:
+      - ./jan-inference/sd/models:/models
+    networks:
+      jan_community:
+        ipv4_address: 172.20.0.19
+
+  # Service to run the SD web application.
+  sd:
+    build:
+      context: ./jan-inference/sd/
+      dockerfile: inference.Dockerfile
+    # Mount the directory that contains the downloaded model.
+    volumes:
+      - ./jan-inference/sd/models:/models
+      - ./jan-inference/sd/output/:/serving/output
+    command:  /bin/bash -c "python -m uvicorn main:app --proxy-headers --host 0.0.0.0 --port 8000"
+    environment:
+      # Specify the path to the model for the web application.
+      BASE_URL: http://0.0.0.0:8000
+      MODEL_NAME: ${SD_MODEL_FILE}.q4_0.bin
+      MODEL_DIR: "/models"
+      SD_PATH: "/sd"
+      PYTHONUNBUFFERED: 1
+    ports:
+      - 8001:8000
+    # Restart policy configuration
+    restart: on-failure
+    # Specifies that this service should start only after sd-downloader has completed successfully.
+    depends_on:
+      sd-downloader:
+        condition: service_completed_successfully
+    networks:
+      jan_community:
+        ipv4_address: 172.20.0.21
+
+  # Service for Traefik, a modern HTTP reverse proxy and load balancer.
+  # traefik:
+  #   image: traefik:v2.10
+  #   ports:
+  #     # Map port 80 in the container to port 80 on the host.
+  #     - "80:80"
+  #     # Map port 8080 in the container (Traefik's dashboard) to port 8080 on the host.
+  #     - "8080:8080"
+  #   # Mount the Docker socket to allow Traefik to listen to Docker's API.
+  #   volumes:
+  #     - /var/run/docker.sock:/var/run/docker.sock:ro
+  #     - ./traefik/traefik.yml:/traefik.yml:ro
+  #     - ./traefik/config.yml:/config.yml:ro
+  #   networks:
+  #     jan_community:
+  #       ipv4_address: 172.20.0.22
+
 networks:  
  jan_community:
    driver: bridge
--- a/jan-inference/docker-compose.yml
+++ b/jan-inference/docker-compose.yml
@ -1,25 +0,0 @@
-version: '3'
-
-services:
-  # Service for Traefik, a modern HTTP reverse proxy and load balancer.
-  traefik:
-    image: traefik:v2.10
-    ports:
-      # Map port 80 in the container to port 80 on the host.
-      - "80:80"
-      # Map port 8080 in the container (Traefik's dashboard) to port 8080 on the host.
-      - "8080:8080"
-    # Mount the Docker socket to allow Traefik to listen to Docker's API.
-    volumes:
-      - /var/run/docker.sock:/var/run/docker.sock:ro
-      - ./traefik/traefik.yml:/traefik.yml:ro
-      - ./traefik/config.yml:/config.yml:ro
-    # Connect this service to the traefik_public network.
-    networks:
-      - traefik_public
-
-# Define networks used in this docker-compose file.
-networks:
-  # Public-facing network that Traefik uses. Marked as external to indicate it may be defined outside this file.
-  traefik_public:
-    external: true
--- a/jan-inference/llm/README.md
+++ b/jan-inference/llm/README.md
@ -1,8 +0,0 @@
-# Inference - LLM
-
-```bash
-docker network create traefik_public
-cp .env.example .env
-# -> Update MODEL_URL in `.env` file
-docker compose up -d --scale llm=2
-``````
--- a/jan-inference/llm/docker-compose.yml
+++ b/jan-inference/llm/docker-compose.yml
@ -1,60 +0,0 @@
-version: '3'
-
-services:
-
-  # Service to download a model file.
-  downloader:
-    image: busybox
-    # The command extracts the model filename from MODEL_URL and downloads it if it doesn't exist.
-    command: /bin/sh -c "LLM_MODEL_FILE=$(basename ${MODEL_URL}); if [ ! -f /models/$LLM_MODEL_FILE ]; then wget -O /models/$LLM_MODEL_FILE ${MODEL_URL}; fi"
-    # Mount a local directory to store the downloaded model.
-    volumes:
-      - ./models:/models
-
-  # Service to wait for the downloader service to finish downloading the model.
-  wait-for-downloader:
-    image: busybox
-    # The command waits until the model file (specified in MODEL_URL) exists.
-    command: /bin/sh -c "LLM_MODEL_FILE=$(basename ${MODEL_URL}); echo 'Waiting for downloader to finish'; while [ ! -f /models/$LLM_MODEL_FILE ]; do sleep 1; done; echo 'Model downloaded!'"
-    # Specifies that this service should start after the downloader service has started.
-    depends_on:
-      downloader:
-        condition: service_started
-    # Mount the same local directory to check for the downloaded model.
-    volumes:
-      - ./models:/models
-
-  # Service to run the Llama web application.
-  llm:
-    image: ghcr.io/abetlen/llama-cpp-python:latest
-    # Mount the directory that contains the downloaded model.
-    volumes:
-      - ./models:/models
-    ports:
-      - 8000:8000
-    environment:
-      # Specify the path to the model for the web application.
-      MODEL: /models/llama-2-7b-chat.ggmlv3.q4_1.bin
-      PYTHONUNBUFFERED: 1
-    # Health check configuration
-    # healthcheck:
-    #   test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:8000"]
-    #   interval: 30s
-    #   timeout: 10s
-    #   retries: 3
-    #   start_period: 30s
-    # Restart policy configuration
-    restart: on-failure
-    # Specifies that this service should start only after wait-for-downloader has completed successfully.
-    depends_on:
-      wait-for-downloader:
-        condition: service_completed_successfully
-    # Connect this service to two networks: inference_net and traefik_public.
-    networks:
-      - traefik_public
-
-# Define networks used in this docker-compose file.
-networks:
-  # Public-facing network that Traefik uses. Marked as external to indicate it may be defined outside this file.
-  traefik_public:
-    external: true
--- a/jan-inference/sd/docker-compose.yml
+++ b/jan-inference/sd/docker-compose.yml
@ -1,71 +0,0 @@
-version: '3'
-
-services:
-
-  # Service to download a model file.
-  downloader:
-    build:
-      context: ./
-      dockerfile: compile.Dockerfile
-    # platform: "linux/amd64"
-    # The command extracts the model filename from MODEL_URL and downloads it if it doesn't exist.
-    command: /bin/sh -c "SD_MODEL_FILE=$(basename ${MODEL_URL}); if [ ! -f /converted_models/*.bin ]; then wget -O /converted_models/$SD_MODEL_FILE ${MODEL_URL}; python /sd.cpp/models/convert.py --out_type q4_0 --out_file /converted_models/$SD_MODEL_FILE; fi"
-    # Mount a local directory to store the downloaded model.
-    volumes:
-      - ./models:/converted_models
-
-  # Service to wait for the downloader service to finish downloading the model.
-  wait-for-downloader:
-    image: busybox
-    # The command waits until the model file (specified in MODEL_URL) exists.
-    command: /bin/sh -c "SD_MODEL_FILE=$(basename ${MODEL_URL}); echo 'Waiting for downloader to finish'; while [ ! -f /models/*.bin ]; do sleep 1; done; echo 'Model downloaded and converted!'"
-    # Specifies that this service should start after the downloader service has started.
-    depends_on:
-      downloader:
-        condition: service_started
-    # Mount the same local directory to check for the downloaded model.
-    volumes:
-      - ./models:/models
-
-  # Service to run the SD web application.
-  sd:
-    build:
-      context: ./
-      dockerfile: inference.Dockerfile
-    # Mount the directory that contains the downloaded model.
-    volumes:
-      - ./models:/models
-      - ./output/:/serving/output
-    command:  /bin/bash -c "python -m uvicorn main:app --proxy-headers --host 0.0.0.0 --port 8000"
-    # platform: "linux/amd64"
-    environment:
-      # Specify the path to the model for the web application.
-      BASE_URL: http://0.0.0.0:8000
-      MODEL_NAME: "v1-5-pruned-emaonly-ggml-model-q5_0.bin"
-      MODEL_DIR: "/models"
-      SD_PATH: "/sd"
-      PYTHONUNBUFFERED: 1
-    ports:
-      - 8000:8000
-    # Health check configuration
-    # healthcheck:
-    #   test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:8000"]
-    #   interval: 30s
-    #   timeout: 10s
-    #   retries: 3
-    #   start_period: 30s
-    # Restart policy configuration
-    restart: on-failure
-    # Specifies that this service should start only after wait-for-downloader has completed successfully.
-    depends_on:
-      wait-for-downloader:
-        condition: service_completed_successfully
-    # Connect this service to two networks: inference_net and traefik_public.
-    networks:
-      - traefik_public
-
-# Define networks used in this docker-compose file.
-networks:
-  # Public-facing network that Traefik uses. Marked as external to indicate it may be defined outside this file.
-  traefik_public:
-    external: true
--- a/jan-inference/sd/main.py
+++ b/jan-inference/sd/main.py
@ -4,16 +4,26 @@ from fastapi.staticfiles import StaticFiles
 import subprocess
 import os
 from uuid import uuid4
+from pydantic import BaseModel

 app = FastAPI()

 OUTPUT_DIR = "output"
 SD_PATH = os.environ.get("SD_PATH", "./sd")
 MODEL_DIR = os.environ.get("MODEL_DIR", "./models")
-BASE_URL = os.environ.get("BASE_URL", "http://localhost:8000")
 MODEL_NAME = os.environ.get(
    "MODEL_NAME", "v1-5-pruned-emaonly-ggml-model-q5_0.bin")

+
+class Payload(BaseModel):
+    prompt: str
+    neg_prompt: str
+    seed: int
+    steps: int
+    width: int
+    height: int
+
+
 # Create the OUTPUT_DIR directory if it does not exist
 if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
@ -26,33 +36,37 @@ if not os.path.exists(MODEL_DIR):
 app.mount("/output", StaticFiles(directory=OUTPUT_DIR), name="output")


-def run_command(prompt: str, filename: str):
+def run_command(payload: Payload, filename: str):
    # Construct the command based on your provided example
    command = [SD_PATH,
-               "-m", os.path.join(MODEL_DIR, MODEL_NAME),
-               "-p", prompt,
-               "-o", os.path.join(OUTPUT_DIR, filename)
+               "--model", os.path.join(MODEL_DIR, MODEL_NAME),
+               "--prompt", payload.prompt,  # argv list, no shell: added quotes would reach sd literally
+               "--negative-prompt", payload.neg_prompt,
+               "--height", str(payload.height),
+               "--width", str(payload.width),
+               "--steps", str(payload.steps),
+               "--seed", str(payload.seed),
+               "--mode", 'txt2img',
+               "-o", os.path.join(OUTPUT_DIR, filename),
               ]

    try:
-        sub_output = subprocess.run(command, timeout=5*60, capture_output=True,
-                                    check=True, encoding="utf-8")
-        print(sub_output.stdout)
+        subprocess.run(command, timeout=5*60, check=True)  # check=True so a non-zero exit reaches the handler below
    except subprocess.CalledProcessError:
        raise HTTPException(
            status_code=500, detail="Failed to execute the command.")


-@app.post("/inference/")
-async def run_inference(background_tasks: BackgroundTasks, prompt: str = Form()):
+@app.post("/inferences/txt2img")
+async def run_inference(background_tasks: BackgroundTasks, payload: Payload):
    # Generate a unique filename using uuid4()
    filename = f"{uuid4()}.png"

    # We will use background task to run the command so it won't block
-    background_tasks.add_task(run_command, prompt, filename)
+    background_tasks.add_task(run_command, payload, filename)

    # Return the expected path of the output file
-    return {"url": f'{BASE_URL}/serve/{filename}'}
+    return {"url": f'/serve/{filename}'}


@app.get("/serve/{filename}")
--- a/sample.env
+++ b/sample.env
@ -9,9 +9,9 @@ KEYCLOAK_ADMIN_PASSWORD=admin

 # Inference
 ## LLM
-MODEL_URL=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_1.bin
-LLM_MODEL_FILE=$(basename $MODEL_URL)
+LLM_MODEL_URL=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_1.bin
+LLM_MODEL_FILE=llama-2-7b-chat.ggmlv3.q4_1.bin

 ## SD
-MODEL_URL=https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
-SD_MODEL_FILE=$(basename $MODEL_URL)
+SD_MODEL_URL=https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
+SD_MODEL_FILE=v1-5-pruned-emaonly.safetensors
--- a/jan-inference/traefik/config.yml
+++ b/jan-inference/traefik/config.yml
--- a/jan-inference/traefik/traefik.yml
+++ b/jan-inference/traefik/traefik.yml