feat(inference): Init commit for inference llm using python llama ggml

parent b87564fba9
commit 6ef61c45ec

.gitignore (vendored, new file, +3)
@@ -0,0 +1,3 @@
# Jan inference
# jan-inference/llm/models/**
jan-inference/llm/.env
jan-inference/llm/.env.example (new file, +2)
@@ -0,0 +1,2 @@
MODEL_URL=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_1.bin
MODEL_FILE=$(basename $MODEL_URL)
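Note that Docker Compose does not perform command substitution when reading a `.env` file, so the `MODEL_FILE=$(basename $MODEL_URL)` line stays literal; the compose services below recompute the file name themselves. A minimal shell sketch of that derivation:

```bash
# Derive the model file name from MODEL_URL, much as the downloader service does.
MODEL_URL=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_1.bin
MODEL_FILE=$(basename "$MODEL_URL")
echo "$MODEL_FILE"   # llama-2-7b-chat.ggmlv3.q4_1.bin
```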
jan-inference/llm/README.md (new file, +8)
@@ -0,0 +1,8 @@
# Inference - LLM

```bash
docker network create traefik_public
cp .env.example .env
# -> Update MODEL_URL in the `.env` file
docker compose up -d --scale llm=2
```
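Once the stack is up, the `llm` replicas sit behind Traefik under the `/inference/llm` path prefix (see the compose file below). A usage sketch, assuming the `ghcr.io/abetlen/llama-cpp-python` image serves its OpenAI-compatible completions endpoint at `/v1/completions` on port 8000 (paths can vary between image versions):

```bash
# Query the model through Traefik on port 80; the strip-llm-prefix middleware
# removes /inference/llm before the request reaches the llama-cpp-python server.
curl http://localhost/inference/llm/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Hello, how are you?", "max_tokens": 32}'
```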
jan-inference/llm/docker-compose.yml (new file, +94)
@@ -0,0 +1,94 @@
version: '3'

services:

  # Service to download a model file.
  downloader:
    image: busybox
    # The command extracts the model filename from MODEL_URL and downloads it if it doesn't exist.
    # ($$ keeps the dollar sign for the container shell instead of Compose interpolation.)
    command: /bin/sh -c "MODEL_FILE=$$(basename ${MODEL_URL}); if [ ! -f /models/$$MODEL_FILE ]; then wget -O /models/$$MODEL_FILE ${MODEL_URL}; fi"
    # Mount a local directory to store the downloaded model.
    volumes:
      - ./models:/models

  # Service to wait for the downloader service to finish downloading the model.
  wait-for-downloader:
    image: busybox
    # The command waits until the model file (specified in MODEL_URL) exists.
    command: /bin/sh -c "MODEL_FILE=$$(basename ${MODEL_URL}); echo 'Waiting for downloader to finish'; while [ ! -f /models/$$MODEL_FILE ]; do sleep 1; done; echo 'Model downloaded!'"
    # Specifies that this service should start after the downloader service has started.
    depends_on:
      downloader:
        condition: service_started
    # Mount the same local directory to check for the downloaded model.
    volumes:
      - ./models:/models

  # Service to run the Llama web application.
  llm:
    image: ghcr.io/abetlen/llama-cpp-python:latest
    # Mount the directory that contains the downloaded model.
    volumes:
      - ./models:/models
    environment:
      # Specify the path to the model for the web application.
      MODEL: /models/llama-2-7b-chat.ggmlv3.q4_1.bin
    labels:
      # Explicitly enable Traefik for this container (exposedbydefault is false).
      - "traefik.enable=true"
      # Instead of using the Host rule, set a PathPrefix rule
      - "traefik.http.routers.web.rule=PathPrefix(`/inference/llm`)"
      # This tells Traefik where to forward the traffic for this route.
      - "traefik.http.routers.web.service=llm-service"
      # Define a service for the llm and specify its load balancer configuration
      - "traefik.http.services.llm-service.loadbalancer.server.port=8000"
      # Strip the /inference/llm prefix before forwarding requests to the container.
      - "traefik.http.middlewares.strip-llm-prefix.stripprefix.prefixes=/inference/llm"
      - "traefik.http.routers.web.middlewares=strip-llm-prefix"
    # Health check configuration
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:8000"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    # Restart policy configuration
    restart: on-failure
    # Specifies that this service should start only after wait-for-downloader has completed successfully.
    depends_on:
      wait-for-downloader:
        condition: service_completed_successfully
    # Connect this service to two networks: inference_net and traefik_public.
    networks:
      - inference_net
      - traefik_public

  # Service for Traefik, a modern HTTP reverse proxy and load balancer.
  traefik:
    image: traefik:v2.5
    command:
      # Enable the Traefik API dashboard without TLS (not recommended for production).
      - "--api.insecure=true"
      # Enable Traefik to use Docker as a provider.
      - "--providers.docker=true"
      # Do not expose services by default. Explicitly specify in each service if it should be exposed.
      - "--providers.docker.exposedbydefault=false"
      # Specify the default entry point on port 80.
      - "--entrypoints.web.address=:80"
    ports:
      # Map port 80 in the container to port 80 on the host.
      - "80:80"
      # Map port 8080 in the container (Traefik's dashboard) to port 8080 on the host.
      - "8080:8080"
    # Mount the Docker socket to allow Traefik to listen to Docker's API.
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    # Connect this service to the traefik_public network.
    networks:
      - traefik_public

# Define networks used in this docker-compose file.
networks:
  # Network for the llm service (used for inference).
  inference_net:
  # Public-facing network that Traefik uses. Marked as external to indicate it may be defined outside this file.
  traefik_public:
    external: true
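Because `--api.insecure=true` is set, Traefik's dashboard and API are exposed on port 8080, which is a quick way to confirm that the router, service, and middleware defined above were picked up. A small verification sketch (the `/api/http/...` endpoints are the Traefik v2 API; adjust if your version differs):

```bash
# List the HTTP routers and services Traefik discovered from the Docker provider.
curl -s http://localhost:8080/api/http/routers
curl -s http://localhost:8080/api/http/services

# The dashboard itself is served at http://localhost:8080/dashboard/
```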
jan-inference/llm/models/.gitkeep (new file, empty)