LM-Studio


Ollama has a few weaknesses, for example:

  1. It cannot load GGUF files directly (I have downloaded several terabytes of those for use with TGWUI)
  2. Controlling the context length is (on Linux) very cumbersome and cannot easily be configured per model. (?)
  3. Controlling other sampling parameters such as "temperature" is not possible at all. (?)

This is where LM-Studio comes in.

Drawbacks:

In the My Models tab you can set the path under which you have stored your GGUF files. LM-Studio expects two parent directory levels above the model files (i.e. /models/text/...); a possible layout is sketched below.
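
For illustration, a layout with the two required directory levels could look like this (the file name is just an example, taken from the default used in the proxy further down):

/models
└── text
    └── hermes-4-70b-q4_k_m.gguf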

To work around the problem that the API is not available for Java, I wrote myself a "Python proxy" through which it works after all:

lms_proxy.py:

import asyncio
from time import time
from typing import Sequence, Literal, Optional

import lmstudio as lms
import uvicorn
from fastapi import FastAPI
from lmstudio import LlmLoadModelConfig, LlmInstanceInfo
from lmstudio._sdk_models import GpuSetting
from pydantic import BaseModel, Field
from starlette.responses import JSONResponse

app = FastAPI(title="LmsBackend")

# Response schema used by the proxy endpoints to describe a model.
class Model(BaseModel):
    model_key: str
    display_name: Optional[str] = None
    format: Optional[str] = None
    context_length: Optional[int] = None
    max_context_length: Optional[int] = None
    params_string: Optional[str] = None
    vision: Optional[bool] = None
    trained_for_tool_use: Optional[bool] = None
    architecture: Optional[str] = None
    ratio: Optional[float] = None

    @classmethod
    def from_llm_instance_info(cls, llm_instance_info: LlmInstanceInfo):
        return cls(
            model_key=llm_instance_info.model_key,
            display_name=llm_instance_info.display_name,
            format=llm_instance_info.format,
            context_length=llm_instance_info.context_length,
            max_context_length=llm_instance_info.max_context_length,
            params_string=llm_instance_info.params_string,
            vision=llm_instance_info.vision,
            trained_for_tool_use=llm_instance_info.trained_for_tool_use,
            architecture=llm_instance_info.architecture
        )

# Request body for /lms/loadModel/ and /lms/unloadModel/ (the default model_key is just an example).
class LoadModelParams(BaseModel):
    model_key: str = "text/models/hermes-4-70b-q4_k_m.gguf"
    contextLength: int = 131072
    flashAttention: bool = True
    llamaKCacheQuantizationType: Literal["f32", "f16", "q8_0", "q4_0", "q4_1", "iq4_nl", "q5_0", "q5_1"] = "q8_0"
    llamaVCacheQuantizationType: Literal["f32", "f16", "q8_0", "q4_0", "q4_1", "iq4_nl", "q5_0", "q5_1"] = "q8_0"
    ratio: float = Field(default=1.0, json_schema_extra={"format": "double"})
    offload_kv_cache_to_gpu: bool = True

    def config(self) -> LlmLoadModelConfig:
        result = LlmLoadModelConfig()
        result.context_length = self.contextLength
        result.flash_attention = self.flashAttention
        result.llama_k_cache_quantization_type = self.llamaKCacheQuantizationType
        result.llama_v_cache_quantization_type = self.llamaVCacheQuantizationType
        gpu = GpuSetting()
        gpu.ratio = self.ratio
        result.gpu = gpu
        return result

@app.post("/lms/loadModel/")
async def load_model(load_model_params: LoadModelParams):
    async with lms.AsyncClient() as client:
        await client.llm.model(load_model_params.model_key, config=load_model_params.config())

    start_time = time()
    model_key = load_model_params.model_key
    timeout = 60  # one minute, in seconds
    check_interval = 5  # poll every 5 seconds

    while time() - start_time < timeout:
        if check_model_loaded(model_key):
            return JSONResponse(content={"message": "Model successfully loaded"}, status_code=200)
        await asyncio.sleep(check_interval)

    # If the model has still not been loaded after one minute, return 408
    return JSONResponse(content={"message": "Timeout: Model not loaded"}, status_code=408)

@app.post("/lms/unloadModel/")
async def unload_model(unload_model_params: LoadModelParams):
    async with lms.AsyncClient() as client:
        await client.llm.unload(unload_model_params.model_key)

    start_time = time()
    model_key = unload_model_params.model_key
    timeout = 60  # one minute, in seconds
    check_interval = 5  # poll every 5 seconds

    while time() - start_time < timeout:
        if not check_model_loaded(model_key):
            return JSONResponse(content={"message": "Model successfully unloaded"}, status_code=200)
        await asyncio.sleep(check_interval)

    # If the model is still loaded after one minute, return 408
    return JSONResponse(content={"message": "Timeout: Model still loaded"}, status_code=408)

@app.get("/lms/getDownloadedModels")
async def get_downloaded_models() -> Sequence[Model]:
    with lms.Client() as client:
        downloaded = client.llm.list_downloaded()
    models = []
    for downloaded_model in downloaded:
        if downloaded_model.model_key.startswith("models@"):
            continue
        models.append(Model(model_key=downloaded_model.model_key))
    return models

@app.get("/lms/getLoadedModels")
async def get_loaded_models() -> Sequence[Model]:
    with lms.Client() as client:
        loaded = client.llm.list_loaded()
    models: list[Model] = []
    for loaded_model in loaded:
        print(loaded_model)
        models.append(Model(model_key=loaded_model.identifier))
    return models

@app.get("/lms/getModelInfo")
async def get_model_info():
    with lms.Client() as client:
        model = client.llm.model()
        info = model.get_info()
        print(info)
        return Model.from_llm_instance_info(info)

# Returns True if a model with the given identifier is currently loaded.
def check_model_loaded(model_key: str) -> bool:
    with lms.Client() as client:
        loaded = client.llm.list_loaded()
    for loaded_model in loaded:
        if loaded_model.identifier == model_key:
            return True
    return False

def main():
    uvicorn.run(app, host="0.0.0.0", port=55555)

if __name__ == '__main__':
    main()
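
Once the proxy is running (python lms_proxy.py, listening on port 55555), its endpoints can be called from any HTTP client, including Java. As a quick sketch, this is how the load and list endpoints could be exercised from Python with requests (the model key and context length are just example values):

import requests

BASE_URL = "http://localhost:55555"  # the proxy started by lms_proxy.py

# Ask the proxy to load a model with a reduced context length.
response = requests.post(
    f"{BASE_URL}/lms/loadModel/",
    json={
        "model_key": "text/models/hermes-4-70b-q4_k_m.gguf",
        "contextLength": 32768,
    },
    timeout=120,
)
print(response.status_code, response.json())

# List the models that are currently loaded.
print(requests.get(f"{BASE_URL}/lms/getLoadedModels", timeout=30).json())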

requirements.txt:

aiohappyeyeballs==2.6.1
aiohttp==3.13.2
aiosignal==1.4.0
annotated-doc==0.0.3
annotated-types==0.7.0
anyio==4.11.0
attrs==25.4.0
boto3==1.40.62
botocore==1.40.62
certifi==2025.10.5
charset-normalizer==3.4.4
click==8.3.0
dataclasses-json==0.6.7
distro==1.9.0
fastapi==0.120.2
frozenlist==1.8.0
greenlet==3.2.4
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
httpx-sse==0.4.3
httpx-ws==0.8.1
idna==3.11
jiter==0.11.1
jmespath==1.0.1
jsonpatch==1.33
jsonpointer==3.0.0
langchain==0.3.27
langchain-community==0.3.31
langchain-core==0.3.79
langchain-experimental==0.3.4
langchain-text-splitters==0.3.11
langsmith==0.4.38
llmstudio==1.0.6
llmstudio-core==1.0.4
lmstudio==1.5.0
marshmallow==3.26.1
msgspec==0.19.0
multidict==6.7.0
mypy==1.18.2
mypy_extensions==1.1.0
numpy==2.3.4
openai==1.109.1
orjson==3.11.4
packaging==25.0
pathspec==0.12.1
propcache==0.4.1
pydantic==2.12.3
pydantic-settings==2.11.0
pydantic_core==2.41.4
python-dateutil==2.9.0.post0
python-dotenv==1.2.1
PyYAML==6.0.3
regex==2025.10.23
requests==2.32.5
requests-toolbelt==1.0.0
s3transfer==0.14.0
six==1.17.0
sniffio==1.3.1
SQLAlchemy==2.0.44
starlette==0.49.1
tenacity==9.1.2
tiktoken==0.7.0
tqdm==4.67.1
typing-inspect==0.9.0
typing-inspection==0.4.2
typing_extensions==4.15.0
urllib3==2.5.0
uvicorn==0.38.0
wsproto==1.2.0
yarl==1.22.0
zstandard==0.25.0