diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 5a5fff39..86f5ceb3 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -7,7 +7,7 @@ on: jobs: run-notebooks: - name: Run example Python notebooks + name: Cookbook notebooks runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -18,8 +18,8 @@ jobs: cwd: "./dial-cookbook/ci" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" - run-quickstart: - name: Run quickstart examples + run-quickstart-model: + name: Quickstart model runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -28,19 +28,44 @@ jobs: with: cwd: "./dial-docker-compose/ci/model" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" + + run-quickstart-application: + name: Quickstart application + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - uses: isbang/compose-action@802a148945af6399a338c7906c267331b39a71af # v2.0.0 - name: Run quickstart application example with: cwd: "./dial-docker-compose/ci/application" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" + + run-quickstart-addon: + name: Quickstart addon + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - uses: isbang/compose-action@802a148945af6399a338c7906c267331b39a71af # v2.0.0 - name: Run quickstart addon example with: cwd: "./dial-docker-compose/ci/addon" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" + run-quickstart-self-hosted-model: + name: Quickstart self-hosted model + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - uses: isbang/compose-action@802a148945af6399a338c7906c267331b39a71af # v2.0.0 + with: + cwd: "./dial-docker-compose/ci/ollama" + up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" + build: - needs: [run-notebooks, run-quickstart] + needs: + - run-notebooks + - run-quickstart-model + - run-quickstart-application + - run-quickstart-addon + - run-quickstart-self-hosted-model runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 38cdb0ba..e3828138 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -7,7 +7,7 @@ on: jobs: run-notebooks: - name: Run example Python notebooks + name: Cookbook notebooks runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -18,8 +18,9 @@ jobs: cwd: "./dial-cookbook/ci" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" - run-quickstart: - name: Run quickstart examples + + run-quickstart-model: + name: Quickstart model runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -28,19 +29,44 @@ jobs: with: cwd: "./dial-docker-compose/ci/model" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" + + run-quickstart-application: + name: Quickstart application + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - uses: isbang/compose-action@802a148945af6399a338c7906c267331b39a71af # v2.0.0 - name: Run quickstart application example with: cwd: 
"./dial-docker-compose/ci/application" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" + + run-quickstart-addon: + name: Quickstart addon + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - uses: isbang/compose-action@802a148945af6399a338c7906c267331b39a71af # v2.0.0 - name: Run quickstart addon example with: cwd: "./dial-docker-compose/ci/addon" up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" + run-quickstart-self-hosted-model: + name: Quickstart self-hosted model + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - uses: isbang/compose-action@802a148945af6399a338c7906c267331b39a71af # v2.0.0 + with: + cwd: "./dial-docker-compose/ci/ollama" + up-flags: "--abort-on-container-exit --exit-code-from test --timeout 300" + build-and-deploy: - needs: [run-notebooks, run-quickstart] + needs: + - run-notebooks + - run-quickstart-model + - run-quickstart-application + - run-quickstart-addon + - run-quickstart-self-hosted-model runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 diff --git a/.gitignore b/.gitignore index 288b27ff..1c0d3cb8 100644 --- a/.gitignore +++ b/.gitignore @@ -23,8 +23,11 @@ yarn-error.log* .venv __pycache__ -# DIAL Core logs -*.log +# Docker container volumes +core-data +core-logs +.ollama + /.quarto/ # Autogenerated files by Quarto diff --git a/README.md b/README.md index c169e8f2..db8095ca 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,11 @@ ## Helm Deployment * [AI DIAL Generic Installation Simple Guide](https://github.com/epam/ai-dial-helm/tree/main/charts/dial/examples/generic/simple) - + ## Tutorials * [Launch AI DIAL Chat with an Azure model](./docs/tutorials/quick-start-model.md) +* [Launch AI DIAL Chat with a self-hosted model](./docs/tutorials/quick-start-with-self-hosted-model.md) * [Launch AI DIAL Chat with a Sample Application](./docs/tutorials/quick-start-with-application.md) * [Launch AI DIAL Chat with a Sample Addon](./docs/tutorials/quick-start-with-addon.md) @@ -31,7 +32,7 @@ ## Configuration * Refer to [Configuration](./docs/Deployment/configuration.md) - + ## Other AI DIAL Project Open Source Repositories Here is the current list of repositories where you can find more details. You can also refer to [repository map](https://epam-rail.com/open-source). 
diff --git a/dial-docker-compose/addon/docker-compose.yml b/dial-docker-compose/addon/docker-compose.yml index 9df5bd32..65c3715d 100644 --- a/dial-docker-compose/addon/docker-compose.yml +++ b/dial-docker-compose/addon/docker-compose.yml @@ -4,7 +4,7 @@ include: services: adapter-openai: - image: epam/ai-dial-adapter-openai:0.11.0 + image: epam/ai-dial-adapter-openai:0.14.0 environment: WEB_CONCURRENCY: "3" diff --git a/dial-docker-compose/ci/ollama/.env b/dial-docker-compose/ci/ollama/.env new file mode 100644 index 00000000..15ace1c5 --- /dev/null +++ b/dial-docker-compose/ci/ollama/.env @@ -0,0 +1,4 @@ +DIAL_DIR="./ollama" +OLLAMA_CHAT_MODEL=llama3.1:8b-instruct-q4_0 +OLLAMA_VISION_MODEL=llava-phi3:3.8b-mini-q4_0 +OLLAMA_EMBEDDING_MODEL=nomic-embed-text:137m-v1.5-fp16 \ No newline at end of file diff --git a/dial-docker-compose/ci/ollama/docker-compose.yml b/dial-docker-compose/ci/ollama/docker-compose.yml new file mode 100644 index 00000000..aff7e6cc --- /dev/null +++ b/dial-docker-compose/ci/ollama/docker-compose.yml @@ -0,0 +1,16 @@ +include: + - path: ../../ollama/docker-compose.yml + env_file: ./.env + +services: + test: + build: test + environment: + DIAL_URL: "http://core:8080" + DIAL_API_KEY: "dial_api_key" + DIAL_API_VERSION: "2024-02-01" + depends_on: + ollama-setup: + condition: service_healthy + core: + condition: service_healthy \ No newline at end of file diff --git a/dial-docker-compose/ci/ollama/test/.dockerignore b/dial-docker-compose/ci/ollama/test/.dockerignore new file mode 100644 index 00000000..1d1fe94d --- /dev/null +++ b/dial-docker-compose/ci/ollama/test/.dockerignore @@ -0,0 +1 @@ +Dockerfile \ No newline at end of file diff --git a/dial-docker-compose/ci/ollama/test/Dockerfile b/dial-docker-compose/ci/ollama/test/Dockerfile new file mode 100644 index 00000000..c41fc702 --- /dev/null +++ b/dial-docker-compose/ci/ollama/test/Dockerfile @@ -0,0 +1,7 @@ +FROM python:3.11-alpine + +WORKDIR /app +COPY * /app +RUN pip install -r requirements.txt + +CMD ["python", "app.py"] \ No newline at end of file diff --git a/dial-docker-compose/ci/ollama/test/app.py b/dial-docker-compose/ci/ollama/test/app.py new file mode 100644 index 00000000..66e49085 --- /dev/null +++ b/dial-docker-compose/ci/ollama/test/app.py @@ -0,0 +1,141 @@ +import base64 +import os +from pathlib import Path +from typing import Any +import aiohttp +import asyncio +import backoff + +import logging +import time +from contextlib import asynccontextmanager + + +def get_env(name: str) -> str: + value = os.environ.get(name) + if value is None: + raise ValueError(f"'{name}' environment variable must be defined") + return value + + +DIAL_URL = get_env("DIAL_URL") +DIAL_API_KEY = get_env("DIAL_API_KEY") +DIAL_API_VERSION = get_env("DIAL_API_VERSION") + +logging.basicConfig(level=logging.DEBUG) +log = logging.getLogger(__name__) + + +@asynccontextmanager +async def timer(name: str): + log.debug(f"[{name}] Starting...") + start = time.perf_counter() + yield + elapsed = time.perf_counter() - start + log.debug(f"[{name}] Executed in {elapsed:.2f} seconds") + + +@backoff.on_exception( + backoff.expo, + (aiohttp.ClientError, aiohttp.ServerTimeoutError), + max_time=60, +) +async def post_with_retry(url: str, payload: dict, headers: dict, params: dict): + async with aiohttp.ClientSession() as session: + async with session.post( + url, json=payload, headers=headers, params=params + ) as response: + response.raise_for_status() + return await response.json() + + +def read_image_base64(png_file: Path) -> str: + return 
base64.b64encode(png_file.read_bytes()).decode("utf-8")
+
+async def dial_chat_completion(deployment_id: str, messages: list) -> str:
+    api_url = f"{DIAL_URL}/openai/deployments/{deployment_id}/chat/completions"
+
+    payload = {
+        "model": deployment_id,
+        "messages": messages,
+        "stream": False,
+    }
+    headers = {"api-key": DIAL_API_KEY}
+    params = {"api-version": DIAL_API_VERSION}
+
+    body = await post_with_retry(api_url, payload, headers, params)
+    log.debug(f"Response: {body}")
+
+    content = body.get("choices", [])[0].get("message", {}).get("content", "")
+
+    log.debug(f"Content: {content}")
+
+    return content
+
+async def dial_embeddings(deployment_id: str, input: Any) -> list[float]:
+    api_url = f"{DIAL_URL}/openai/deployments/{deployment_id}/embeddings"
+
+    payload = {
+        "model": deployment_id,
+        "input": input,
+    }
+    headers = {"api-key": DIAL_API_KEY}
+    params = {"api-version": DIAL_API_VERSION}
+
+    body = await post_with_retry(api_url, payload, headers, params)
+    log.debug(f"Response: {body}")
+
+    embedding = body.get("data", [])[0].get("embedding", [])
+
+    log.debug(f"Len embedding vector: {len(embedding)}")
+
+    return embedding
+
+async def test_chat_model(deployment_id: str):
+    message = "2 + 3 = ? Reply with a single number:"
+    messages = [{"role": "user", "content": message}]
+    content = await dial_chat_completion(deployment_id, messages)
+
+    if "5" not in content:
+        raise ValueError(f"Test failed for {deployment_id!r}")
+
+
+async def test_vision_model(deployment_id: str):
+    base64_data = read_image_base64(Path("./image.png"))
+    base64_image = f"data:image/png;base64,{base64_data}"
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Describe the image"},
+                {"type": "image_url", "image_url": {"url": base64_image}},
+            ],
+        }
+    ]
+
+    content = await dial_chat_completion(deployment_id, messages)
+
+    if "vision" not in content.lower():
+        raise ValueError(f"Test failed for {deployment_id!r}")
+
+async def test_embedding_model(deployment_id: str):
+    embeddings = await dial_embeddings(deployment_id, "cat")
+
+    if len(embeddings) == 0 or not isinstance(embeddings[0], float):
+        raise ValueError(f"Test failed for {deployment_id!r}")
+
+
+async def tests():
+    async with timer("Testing chat-model"):
+        await test_chat_model("chat-model")
+
+    async with timer("Testing vision-model"):
+        await test_vision_model("vision-model")
+
+    async with timer("Testing embedding-model"):
+        await test_embedding_model("embedding-model")
+
+if __name__ == "__main__":
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(tests())
diff --git a/dial-docker-compose/ci/ollama/test/image.png b/dial-docker-compose/ci/ollama/test/image.png
new file mode 100644
index 00000000..bfbcfc7e
Binary files /dev/null and b/dial-docker-compose/ci/ollama/test/image.png differ
diff --git a/dial-docker-compose/ci/ollama/test/requirements.txt b/dial-docker-compose/ci/ollama/test/requirements.txt
new file mode 100644
index 00000000..1c6d30a5
--- /dev/null
+++ b/dial-docker-compose/ci/ollama/test/requirements.txt
@@ -0,0 +1,2 @@
+aiohttp==3.9.4
+backoff==2.2.1
\ No newline at end of file
diff --git a/dial-docker-compose/common.yml b/dial-docker-compose/common.yml
index 6da6adbb..8e2ae74a 100644
--- a/dial-docker-compose/common.yml
+++ b/dial-docker-compose/common.yml
@@ -1,13 +1,13 @@
 services:
   themes:
-    image: epam/ai-dial-chat-themes:0.4.0
+    image: epam/ai-dial-chat-themes:0.6.0
     ports:
      - "3001:8080"
  chat:
    ports:
      - "3000:3000"
-    image: epam/ai-dial-chat:0.10.0
+    image: epam/ai-dial-chat:0.17.0
depends_on: - themes - core @@ -36,7 +36,7 @@ services: user: ${UID:-root} ports: - "8080:8080" - image: epam/ai-dial-core:0.9.0 + image: epam/ai-dial-core:0.16.0 environment: 'AIDIAL_SETTINGS': '/opt/settings/settings.json' 'JAVA_OPTS': '-Dgflog.config=/opt/settings/gflog.xml' diff --git a/dial-docker-compose/model/docker-compose.yml b/dial-docker-compose/model/docker-compose.yml index 1dd03ae0..a918cb85 100644 --- a/dial-docker-compose/model/docker-compose.yml +++ b/dial-docker-compose/model/docker-compose.yml @@ -4,6 +4,6 @@ include: services: adapter-openai: - image: epam/ai-dial-adapter-openai:0.11.0 + image: epam/ai-dial-adapter-openai:0.14.0 environment: WEB_CONCURRENCY: "3" \ No newline at end of file diff --git a/dial-docker-compose/ollama/.env b/dial-docker-compose/ollama/.env new file mode 100644 index 00000000..cdabc6b5 --- /dev/null +++ b/dial-docker-compose/ollama/.env @@ -0,0 +1,4 @@ +DIAL_DIR="./ollama" +OLLAMA_CHAT_MODEL= +OLLAMA_VISION_MODEL= +OLLAMA_EMBEDDING_MODEL= \ No newline at end of file diff --git a/dial-docker-compose/ollama/.env.example b/dial-docker-compose/ollama/.env.example new file mode 100644 index 00000000..06216195 --- /dev/null +++ b/dial-docker-compose/ollama/.env.example @@ -0,0 +1,3 @@ +OLLAMA_CHAT_MODEL=llama3.1:8b-instruct-q4_0 +OLLAMA_VISION_MODEL=llava-phi3:3.8b-mini-q4_0 +OLLAMA_EMBEDDING_MODEL=bge-m3:567m-fp16 \ No newline at end of file diff --git a/dial-docker-compose/ollama/core/config.json b/dial-docker-compose/ollama/core/config.json new file mode 100644 index 00000000..e4b370cc --- /dev/null +++ b/dial-docker-compose/ollama/core/config.json @@ -0,0 +1,44 @@ +{ + "routes": {}, + "models": { + "chat-model": { + "type": "chat", + "displayName": "Self-hosted chat model", + "endpoint": "http://ollama:11434/v1/chat/completions" + }, + "vision-model": { + "type": "chat", + "displayName": "Self-hosted vision model", + "endpoint": "http://adapter-openai:5000/openai/deployments/vision-model/chat/completions", + "inputAttachmentTypes": [ + "image/png", + "image/jpeg" + ], + "upstreams": [ + { + "endpoint": "http://ollama:11434/v1/chat/completions", + "key": "dummy-key" + } + ] + }, + "embedding-model": { + "type": "embedding", + "endpoint": "http://ollama:11434/v1/embeddings" + } + }, + "keys": { + "dial_api_key": { + "project": "TEST-PROJECT", + "role": "default" + } + }, + "roles": { + "default": { + "limits": { + "chat-model": {}, + "vision-model": {}, + "embedding-model": {} + } + } + } +} \ No newline at end of file diff --git a/dial-docker-compose/ollama/docker-compose.yml b/dial-docker-compose/ollama/docker-compose.yml new file mode 100644 index 00000000..68e316f0 --- /dev/null +++ b/dial-docker-compose/ollama/docker-compose.yml @@ -0,0 +1,35 @@ +include: + - path: ../common.yml + env_file: ./.env + +services: + ollama: + image: ollama/ollama:0.3.10 + volumes: + - ./.ollama:/root/.ollama + ports: + - "11434:11434" + + ollama-setup: + depends_on: + ollama: + condition: service_started + build: ./ollama_setup + environment: + - OLLAMA_URL=http://ollama:11434 + - OLLAMA_CHAT_MODEL=${OLLAMA_CHAT_MODEL} + - OLLAMA_VISION_MODEL=${OLLAMA_VISION_MODEL} + - OLLAMA_EMBEDDING_MODEL=${OLLAMA_EMBEDDING_MODEL} + healthcheck: + test: ["CMD", "test", "-f", "/healthy"] + interval: 10s + start_period: 10s + retries: 10 + + adapter-openai: + image: epam/ai-dial-adapter-openai:0.14.0 + environment: + WEB_CONCURRENCY: "3" + DIAL_URL: "http://core:8080" + DIAL_USE_FILE_STORAGE: "True" + GPT4_VISION_DEPLOYMENTS: "vision-model" \ No newline at end of file diff --git 
a/dial-docker-compose/ollama/ollama_setup/Dockerfile b/dial-docker-compose/ollama/ollama_setup/Dockerfile
new file mode 100644
index 00000000..91b223bf
--- /dev/null
+++ b/dial-docker-compose/ollama/ollama_setup/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:3.11-alpine
+
+WORKDIR /app
+COPY * /app
+RUN pip install -r requirements.txt
+
+CMD ["sh", "-c", "python setup.py && tail -f /dev/null"]
diff --git a/dial-docker-compose/ollama/ollama_setup/requirements.txt b/dial-docker-compose/ollama/ollama_setup/requirements.txt
new file mode 100644
index 00000000..ac6a93f1
--- /dev/null
+++ b/dial-docker-compose/ollama/ollama_setup/requirements.txt
@@ -0,0 +1,3 @@
+httpx==0.27.2
+tqdm==4.66.5
+ollama==0.3.3
\ No newline at end of file
diff --git a/dial-docker-compose/ollama/ollama_setup/setup.py b/dial-docker-compose/ollama/ollama_setup/setup.py
new file mode 100755
index 00000000..3e27663e
--- /dev/null
+++ b/dial-docker-compose/ollama/ollama_setup/setup.py
@@ -0,0 +1,126 @@
+import asyncio
+from contextlib import asynccontextmanager
+import os
+import sys
+import time
+from ollama import AsyncClient
+from tqdm import tqdm
+
+OLLAMA_URL = os.getenv("OLLAMA_URL")
+if OLLAMA_URL is None:
+    raise RuntimeError("OLLAMA_URL env var isn't set")
+
+OLLAMA_CHAT_MODEL = os.getenv("OLLAMA_CHAT_MODEL")
+OLLAMA_VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL")
+OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL")
+
+HEALTH_FILE = "/healthy"
+
+
+class Writer:
+    @classmethod
+    def write(cls, s: str):
+        # NOTE: every tqdm progress bar update is deliberately ended with "\n",
+        # otherwise one wouldn't see the bar running in console upon running `docker compose up`.
+        print(s, file=sys.stdout, flush=True, end="\n")
+
+    @classmethod
+    def flush(cls):
+        sys.stdout.flush()
+
+
+print_info = Writer.write
+
+print_info(f"OLLAMA_URL = {OLLAMA_URL}")
+print_info(f"OLLAMA_CHAT_MODEL = {OLLAMA_CHAT_MODEL}")
+print_info(f"OLLAMA_VISION_MODEL = {OLLAMA_VISION_MODEL}")
+print_info(f"OLLAMA_EMBEDDING_MODEL = {OLLAMA_EMBEDDING_MODEL}")
+
+
+@asynccontextmanager
+async def timer(name: str):
+    print_info(f"[{name}] Starting...")
+    start = time.perf_counter()
+    yield
+    elapsed = time.perf_counter() - start
+    print_info(f"[{name}] Finished in {elapsed:.2f} seconds")
+
+
+async def wait_for_startup():
+    attempt = 0
+    while True:
+        attempt += 1
+        try:
+            await AsyncClient(host=OLLAMA_URL, timeout=5).ps()
+        except Exception:
+            print_info(f"[{attempt:>3}] Waiting for Ollama to start...")
+            await asyncio.sleep(5)
+        else:
+            break
+
+
+async def pull_model(client: AsyncClient, model: str):
+    response = await client.pull(model, stream=True)
+
+    progress_bar = None
+    prev_status = None
+
+    async for chunk in response:
+        status = chunk["status"]
+        total = chunk.get("total")
+        completed = chunk.get("completed")
+
+        if status != prev_status and total:
+            prev_status = status
+            if progress_bar:
+                progress_bar.close()
+            progress_bar = tqdm(
+                total=total, unit="B", unit_scale=True, desc=f"[{status}]", file=Writer
+            )
+
+        if completed and progress_bar and total:
+            progress_bar.n = completed
+            progress_bar.set_description(f"[{status}]")
+            progress_bar.refresh()
+
+        if total and total == completed and progress_bar:
+            progress_bar.close()
+
+        if not completed and not total:
+            print_info(f"[{status}]")
+
+
+async def create_health_mark():
+    open(HEALTH_FILE, "w").close()
+
+
+async def main():
+    client = AsyncClient(host=OLLAMA_URL, timeout=300000)
+
+    async with timer("Waiting for Ollama to start"):
+        await wait_for_startup()
+
+    for 
model, alias in [
+        (OLLAMA_CHAT_MODEL, "chat-model"),
+        (OLLAMA_VISION_MODEL, "vision-model"),
+        (OLLAMA_EMBEDDING_MODEL, "embedding-model"),
+    ]:
+        if model:
+            async with timer(f"Pulling model {model}"):
+                await pull_model(client, model)
+
+            async with timer(f"Creating alias for {model}: {alias}"):
+                await client.copy(model, alias)
+
+    if model_to_load := (OLLAMA_CHAT_MODEL or OLLAMA_VISION_MODEL):
+        async with timer(f"Loading model {model_to_load} into memory"):
+            await client.generate(model_to_load)
+
+    await create_health_mark()
+
+    print_info("The Ollama server is up and running.")
+
+
+if __name__ == "__main__":
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(main())
diff --git a/dial-docker-compose/settings/settings.json b/dial-docker-compose/settings/settings.json
index 446b1b6b..0471610d 100644
--- a/dial-docker-compose/settings/settings.json
+++ b/dial-docker-compose/settings/settings.json
@@ -9,7 +9,7 @@
     }
   },
   "encryption": {
-    "salt": "salt",
-    "password": "password"
+    "secret": "salt",
+    "key": "password"
   }
 }
\ No newline at end of file
diff --git a/docs/README.md b/docs/README.md
index a4beeb25..93b9a2de 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -33,8 +33,9 @@
 ## Tutorials
 
 * [Launch AI DIAL Chat with an Azure model](./tutorials/quick-start-model.md)
-* [Launch AI DIAL Chat with a Sample Application](./tutorials/quick-start-with-application.md)
-* [Launch AI DIAL Chat with a Sample Addon](./tutorials/quick-start-with-addon.md)
+* [Launch AI DIAL Chat with a self-hosted model](./tutorials/quick-start-with-self-hosted-model.md)
+* [Launch AI DIAL Chat with a sample application](./tutorials/quick-start-with-application.md)
+* [Launch AI DIAL Chat with a sample addon](./tutorials/quick-start-with-addon.md)
 
 ## AI DIAL Chat Application User Manual
diff --git a/docs/tutorials/quick-start-with-self-hosted-model.md b/docs/tutorials/quick-start-with-self-hosted-model.md
new file mode 100644
index 00000000..77b18828
--- /dev/null
+++ b/docs/tutorials/quick-start-with-self-hosted-model.md
@@ -0,0 +1,79 @@
+# Launch AI DIAL Chat with a Self-Hosted Model
+
+## Introduction
+
+In this tutorial, you will learn how to quickly launch AI DIAL Chat with a self-hosted model powered by [Ollama](https://ollama.com/).
+
+## Prerequisites
+
+Docker Engine installed on your machine (Docker Compose version 2.20.0+).
+
+> Refer to the [Docker](https://docs.docker.com/desktop/) documentation.
+
+## Step 1: Get AI DIAL
+
+Clone [the repository](https://github.com/epam/ai-dial/) with the tutorials and change directory to the following folder:
+
+```sh
+cd dial-docker-compose/ollama
+```
+
+## Step 2: Choose a model to run
+
+Ollama supports a wide range of popular open-source models.
+
+First, consider the modality you are interested in: is it a regular text-to-text chat model, a multi-modal vision model, or an embedding model?
+
+Use the feature tags _(`Embeddings`, `Code`, `Tools`, `Vision`)_ at [Ollama Search](https://ollama.com/search) to find an appropriate model.
+
+We recommend choosing one of the following models, all of which have been tested.
+
+### Chat models
+
+|Model|Tools|
+|----|----|
+|[llama3.1:8b-instruct-q4_0](https://ollama.com/library/llama3.1:8b-instruct-q4_0)|✅ *(only in non-streaming mode)*|
+|[mistral:7b-instruct-q4_0](https://ollama.com/library/mistral:7b-instruct-q4_0)|❌|
+|[phi3.5:3.8b-mini-instruct-q4_0](https://ollama.com/library/phi3.5:3.8b-mini-instruct-q4_0)|❌|
+|[gemma2:2b-instruct-q4_0](https://ollama.com/library/gemma2:2b-instruct-q4_0)|❌|
+
+All the listed models support streaming.
+
+### Vision models
+
+* [llava:7b-v1.6-mistral-q4_0](https://ollama.com/library/llava:7b-v1.6-mistral-q4_0)
+* [llava-phi3:3.8b-mini-q4_0](https://ollama.com/library/llava-phi3:3.8b-mini-q4_0)
+
+### Embedding models
+
+* [nomic-embed-text:137m-v1.5-fp16](https://ollama.com/library/nomic-embed-text:137m-v1.5-fp16)
+* [bge-m3:567m-fp16](https://ollama.com/library/bge-m3:567m-fp16)
+
+## Step 3: Launch AI DIAL Chat
+
+1. Configure the `.env` file in the current directory according to the type of model you've chosen:
+
+    * Set `OLLAMA_CHAT_MODEL` to the name of a text model.
+    * Set `OLLAMA_VISION_MODEL` to the name of a vision model.
+    * Set `OLLAMA_EMBEDDING_MODEL` to the name of an embedding model.
+
+    **Note**: It's not necessary to configure all the models. If a model isn't set, it won't be downloaded.
+
+2. Then run the following command to pull the specified models and load them into the memory of the Ollama server:
+
+    ```sh
+    docker compose up --abort-on-container-exit
+    ```
+
+    > Keep in mind that a lightweight Ollama model typically takes up a few gigabytes, so the first run may take a few minutes _(or tens of minutes)_ to download it, depending on your Internet bandwidth.
+    >
+    > The model is fully loaded once the `ollama-setup` service prints `The Ollama server is up and running.`
+
+3. Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select the appropriate AI DIAL deployment to converse with:
+
+    * `Self-hosted chat model` deployment for the `OLLAMA_CHAT_MODEL`
+    * `Self-hosted vision model` deployment for the `OLLAMA_VISION_MODEL`
+
+> Note that the vision models we tested do not support response streaming. Moreover, they are typically more computationally expensive than the chat models, so it may take minutes for a vision model to respond.
+
+The embedding model will become available in AI DIAL under the deployment name `embedding-model` and can be called via the endpoint: `localhost:8080/openai/deployments/embedding-model/embeddings`.
diff --git a/sidebars.js b/sidebars.js
index a0ab1bff..dc81800b 100644
--- a/sidebars.js
+++ b/sidebars.js
@@ -102,6 +102,11 @@ const sidebars = {
         id: 'tutorials/quick-start-model',
         label: 'Chat with OpenAI Model',
       },
+      {
+        type: 'doc',
+        id: 'tutorials/quick-start-with-self-hosted-model',
+        label: 'Chat with a Self-Hosted Model',
+      },
       {
         type: 'doc',
         id: 'tutorials/quick-start-with-addon',
@@ -197,13 +202,13 @@ const sidebars = {
     Demos: [
       {
         type: 'autogenerated',
-        dirName: 'video demos/demos', 
+        dirName: 'video demos/demos',
       },
     ],
     'Demos For Developers': [
      {
        type: 'autogenerated',
-        dirName: 'video demos/demos-for-developers', 
+        dirName: 'video demos/demos-for-developers',
      },
    ],
  },
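For completeness, the embeddings endpoint mentioned at the end of the new tutorial can be exercised directly. A minimal sketch, assuming the default `dial_api_key` defined in the sample `ollama/core/config.json` and the `api-version` value used by the CI test:

```sh
# Request an embedding vector for the word "cat" from the self-hosted model
# exposed by DIAL Core under the deployment name `embedding-model`.
curl "http://localhost:8080/openai/deployments/embedding-model/embeddings?api-version=2024-02-01" \
  -H "api-key: dial_api_key" \
  -H "Content-Type: application/json" \
  -d '{"input": "cat"}'
```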