add llamacpp-extension. can list some models

2025-05-07 15:23:40 +07:00 · 2025-05-07 15:23:40 +07:00 · 3f082372fd
commit 3f082372fd
parent 15f0b11c0d
6 changed files with 289 additions and 0 deletions
--- a/extensions/llamacpp-extension/package.json
+++ b/extensions/llamacpp-extension/package.json
@ -0,0 +1,42 @@
 {
  "name": "@janhq/llamacpp-extension",
  "productName": "llama.cpp Inference Engine",
  "version": "1.0.0",
  "description": "This extension enables llama.cpp chat completion API calls",
  "main": "dist/index.js",
  "module": "dist/module.js",
  "engine": "llama.cpp",
  "author": "Jan <service@jan.ai>",
  "license": "AGPL-3.0",
  "scripts": {
    "build": "rolldown -c rolldown.config.mjs",
    "build:publish": "rimraf *.tgz --glob || true && yarn build && npm pack && cpx *.tgz ../../pre-install"
  },
  "devDependencies": {
    "cpx": "^1.5.0",
    "rimraf": "^3.0.2",
    "rolldown": "1.0.0-beta.1",
    "ts-loader": "^9.5.0",
    "typescript": "^5.7.2"
  },
  "dependencies": {
    "@janhq/core": "../../core/package.tgz",
    "fetch-retry": "^5.0.6",
    "ulidx": "^2.3.0"
  },
  "engines": {
    "node": ">=18.0.0"
  },
  "files": [
    "dist/*",
    "package.json",
    "README.md"
  ],
  "bundleDependencies": [
    "fetch-retry"
  ],
  "installConfig": {
    "hoistingLimits": "workspaces"
  },
  "packageManager": "yarn@4.5.3"
 }
--- a/extensions/llamacpp-extension/rolldown.config.mjs
+++ b/extensions/llamacpp-extension/rolldown.config.mjs
@ -0,0 +1,17 @@
 import { defineConfig } from 'rolldown'
 import pkgJson from './package.json' with { type: 'json' }
 import settingJson from './settings.json' with { type: 'json' }
 // Identifiers replaced at bundle time; each value is the JSON text of the constant.
 const buildTimeConstants = {
  SETTINGS: JSON.stringify(settingJson),
  ENGINE: JSON.stringify(pkgJson.engine),
 }
 // Single ESM bundle written to dist/index.js.
 const bundleOutput = { format: 'esm', file: 'dist/index.js' }
 // Bundles src/index.ts for the browser platform.
 export default defineConfig({
  input: 'src/index.ts',
  output: bundleOutput,
  platform: 'browser',
  define: buildTimeConstants,
 })
--- a/extensions/llamacpp-extension/settings.json
+++ b/extensions/llamacpp-extension/settings.json
@ -0,0 +1,98 @@
 [
  {
    "key": "port",
    "title": "Port",
    "description": "Port",
    "controllerType": "input",
    "controllerProps": {
      "value": "8080",
      "placeholder": "8080",
      "type": "number",
      "textAlign": "right"
    }
  },
  {
    "key": "cont_batching",
    "title": "Continuous Batching",
    "description": "Allows processing prompts in parallel with text generation, which usually improves performance.",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": true
    }
  },
  {
    "key": "n_parallel",
    "title": "Parallel Operations",
    "description": "Number of prompts that can be processed simultaneously by the model.",
    "controllerType": "input",
    "controllerProps": {
      "value": "4",
      "placeholder": "4",
      "type": "number",
      "textAlign": "right"
    }
  },
  {
    "key": "cpu_threads",
    "title": "CPU Threads",
    "description": "Number of CPU cores used for model processing when running without GPU.",
    "controllerType": "input",
    "controllerProps": {
      "value": "",
      "placeholder": "Number of CPU threads",
      "type": "number",
      "textAlign": "right"
    }
  },
  {
    "key": "flash_attn",
    "title": "Flash Attention",
    "description": "Optimizes memory usage and speeds up model inference using an efficient attention implementation.",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": true
    }
  },
  {
    "key": "caching_enabled",
    "title": "Caching",
    "description": "Stores recent prompts and responses to improve speed when similar questions are asked.",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": true
    }
  },
  {
    "key": "cache_type",
    "title": "KV Cache Type",
    "description": "Controls memory usage and precision trade-off.",
    "controllerType": "dropdown",
    "controllerProps": {
      "value": "f16",
      "options": [
        {
          "value": "q4_0",
          "name": "q4_0"
        },
        {
          "value": "q8_0",
          "name": "q8_0"
        },
        {
          "value": "f16",
          "name": "f16"
        }
      ]
    }
  },
  {
    "key": "use_mmap",
    "title": "mmap",
    "description": "Loads model files more efficiently by mapping them to memory, reducing RAM usage.",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": true
    }
  }
 ]
--- a/extensions/llamacpp-extension/src/env.d.ts
+++ b/extensions/llamacpp-extension/src/env.d.ts
@ -0,0 +1,2 @@
 // Build-time constants: rolldown.config.mjs supplies both through its `define` option.
 // SETTINGS is the JSON-serialized content of settings.json.
 // NOTE(review): SettingComponentProps is not imported or declared in this file — confirm it resolves as a global type.
 declare const SETTINGS: SettingComponentProps[]
 // ENGINE is the JSON-serialized `engine` field of package.json.
 declare const ENGINE: string
--- a/extensions/llamacpp-extension/src/index.ts
+++ b/extensions/llamacpp-extension/src/index.ts
@ -0,0 +1,115 @@
 /**
 * @file This file exports a class that implements the InferenceExtension interface from the @janhq/core package.
 * The class provides methods for initializing and stopping a model, and for making inference requests.
 * It also subscribes to events emitted by the @janhq/core package and handles new message requests.
 * @version 1.0.0
 * @module llamacpp-extension/src/index
 */
 import { RemoteOAIEngine, getJanDataFolderPath, fs, ModelCapability, Model } from '@janhq/core'
 /**
 * Keys of the extension settings that this file looks up by name.
 * Each value matches a `key` entry in settings.json.
 */
 export enum Settings {
  // Key of the "Port" input setting.
  port = 'port',
 }
 /**
 * llama.cpp provider built on top of RemoteOAIEngine from the @janhq/core package.
 * On load it registers the extension settings, registers the GGUF models found under
 * the Jan data folder, and points the engine at a server listening on 127.0.0.1.
 */
 export default class LlamacppProvider extends RemoteOAIEngine {
  inferenceUrl: string = ''
  baseURL: string = ''
  provider: string = ENGINE
  /**
  * Registers settings and models, then derives the server URLs from the port setting.
  */
  override async onLoad(): Promise<void> {
    super.onLoad()
    // Register Settings
    this.registerSettings(SETTINGS)
    // register models
    const models = await this.listModels()
    this.registerModels(models)
    // NOTE: port 0 may mean request free port from OS. we may want
    // to take advantage of this. llama-server --port 0 on macOS works.
    const port = await this.getSetting<number>(Settings.port, 0)
    this.updateBaseUrl(port)
  }
  // TODO: not wired up yet; the keys referenced below do not exist in Settings.
  // onSettingUpdate<T>(key: string, value: T): void {
  //   if (key === Settings.apiKey) {
  //     this.apiKey = value as string
  //   } else if (key === Settings.baseUrl) {
  //     if (typeof value !== 'string') return
  //     this.updateBaseUrl(value)
  //   }
  // }
  /**
  * Sets `baseURL` and `inferenceUrl` for a server on 127.0.0.1.
  * @param value Port to use. `0` (or a value that is not a finite number) selects the
  * default port declared for the `port` entry in SETTINGS.
  */
  updateBaseUrl(value: number): void {
    // Stored settings may arrive as strings (the default is "8080"), so normalize first.
    let port = Number(value)
    if (!Number.isFinite(port) || port === 0) {
      // set to default value
      const portSetting = SETTINGS.find((setting) => setting.key === Settings.port)
      if (portSetting) {
        port = Number(portSetting.controllerProps.value)
      }
    }
    this.baseURL = `http://127.0.0.1:${port}`
    this.inferenceUrl = `${this.baseURL}/chat/completions`
  }
  /**
  * Lists models stored as `<data folder>/models/cortex.so/<model>/<variant>/model.gguf`.
  * @returns One Model per variant that has a `model.gguf`, with id `<model>/<variant>`;
  * an empty array when the cortex.so folder does not exist.
  */
  async listModels(): Promise<Model[]> {
    // Directory entries are handled as paths; accept both separators when taking the last segment.
    const lastSegment = (path: string): string | undefined => path.split(/[\\/]/).pop()
    const modelIds: string[] = []
    const modelsFolder = `${await getJanDataFolderPath()}/models`
    // cortexso models
    const cortexsoFolder = `${modelsFolder}/cortex.so`
    // The folder may be absent (e.g. nothing downloaded yet): report no models instead of throwing.
    const modelDirs = (await fs.existsSync(cortexsoFolder))
      ? await fs.readdirSync(cortexsoFolder)
      : []
    for (const modelDir of modelDirs) {
      const modelName = lastSegment(modelDir)
      // TODO: try removing this check
      // skip files start with . e.g. .DS_store
      if (!modelName || modelName.startsWith('.')) continue
      try {
        const variantDirs = await fs.readdirSync(modelDir)
        for (const variantDir of variantDirs) {
          // NOTE: we can't detect unfinished download here
          const ggufPath = `${variantDir}/model.gguf`
          if (await fs.existsSync(ggufPath)) {
            modelIds.push(`${modelName}/${lastSegment(variantDir)}`)
          }
        }
      } catch (error) {
        // A stray entry that cannot be read as a directory must not hide the other models.
        console.warn(`[${this.provider}] skipping unreadable model entry ${modelDir}:`, error)
      }
    }
    // TODO: list models under huggingface.co
    const models = modelIds.map((modelId) => {
      return {
        sources: [],
        object: 'model',
        version: '1.0',
        format: 'api',
        id: modelId,
        name: modelId,
        created: 0,
        description: '',
        settings: {},
        parameters: {},
        metadata: {
          author: '',
          tags: [],
          size: 0,
        },
        engine: this.provider,
        capabilities: [ModelCapability.completion],
      }
    })
    return models
  }
 }
--- a/extensions/llamacpp-extension/tsconfig.json
+++ b/extensions/llamacpp-extension/tsconfig.json
@ -0,0 +1,15 @@
 {
  "compilerOptions": {
    "target": "es2016",
    "module": "ES6",
    "moduleResolution": "node",
    "outDir": "./dist",
    "esModuleInterop": true,
    "forceConsistentCasingInFileNames": true,
    "strict": false,
    "skipLibCheck": true,
    "rootDir": "./src"
  },
  "include": ["./src"],
  "exclude": ["**/*.test.ts"]
 }
		`@ -0,0 +1,2 @@`
							`declare const SETTINGS: SettingComponentProps[]`
							`declare const ENGINE: string`