Source code for lmdeploy.api

# Copyright (c) OpenMMLab. All rights reserved.
import os
from typing import List, Literal, Optional, Union

from .archs import autoget_backend_config, get_task
from .messages import PytorchEngineConfig, TurbomindEngineConfig
from .model import ChatTemplateConfig


def pipeline(model_path: str,
             backend_config: Optional[Union[TurbomindEngineConfig, PytorchEngineConfig]] = None,
             chat_template_config: Optional[ChatTemplateConfig] = None,
             log_level: str = 'WARNING',
             max_log_len: int = None,
             **kwargs):
    """
    Args:
        model_path (str): the path of a model. It could be one of the
            following options:
                - i) A local directory path of a turbomind model which is
                    converted by the `lmdeploy convert` command or downloaded
                    from ii) and iii).
                - ii) The model_id of an lmdeploy-quantized model hosted
                    inside a model repo on huggingface.co, such as
                    "InternLM/internlm-chat-20b-4bit",
                    "lmdeploy/llama2-chat-70b-4bit", etc.
                - iii) The model_id of a model hosted inside a model repo
                    on huggingface.co, such as "internlm/internlm-chat-7b",
                    "Qwen/Qwen-7B-Chat", "baichuan-inc/Baichuan2-7B-Chat"
                    and so on.
        backend_config (TurbomindEngineConfig | PytorchEngineConfig): backend
            config instance. Default to None.
        chat_template_config (ChatTemplateConfig): chat template
            configuration. Default to None.
        log_level (str): set log level whose value is among
            [CRITICAL, ERROR, WARNING, INFO, DEBUG]
        max_log_len (int): Max number of prompt characters or prompt tokens
            being printed in log

    Examples:
        >>> # LLM
        >>> import lmdeploy
        >>> pipe = lmdeploy.pipeline('internlm/internlm-chat-7b')
        >>> response = pipe(['hi', 'say this is a test'])
        >>> print(response)
        >>>
        >>> # VLM
        >>> from lmdeploy.vl import load_image
        >>> from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig
        >>> pipe = pipeline('liuhaotian/llava-v1.5-7b',
        ...                 backend_config=TurbomindEngineConfig(session_len=8192),
        ...                 chat_template_config=ChatTemplateConfig(model_name='vicuna'))
        >>> im = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg')
        >>> response = pipe([('describe this image', [im])])
        >>> print(response)
    """  # noqa E501
    if os.getenv('TM_LOG_LEVEL') is None:
        os.environ['TM_LOG_LEVEL'] = log_level
    from lmdeploy.utils import get_logger, get_model
    logger = get_logger('lmdeploy')
    logger.setLevel(log_level)

    # model_path is not a local path: download the model from the hub
    if not os.path.exists(model_path):
        download_dir = backend_config.download_dir \
            if backend_config is not None else None
        revision = backend_config.revision \
            if backend_config is not None else None
        model_path = get_model(model_path, download_dir, revision)

    task, pipeline_class = get_task(model_path)
    if task == 'vlm':
        if backend_config and backend_config.enable_prefix_caching:
            backend_config.enable_prefix_caching = False
            logger.warning('VLM does not support prefix caching.')

    if type(backend_config) is not PytorchEngineConfig:
        # set auto backend mode
        backend_config = autoget_backend_config(model_path, backend_config)
    backend = 'pytorch' if type(backend_config) is PytorchEngineConfig else 'turbomind'
    logger.info(f'Using {backend} engine')

    return pipeline_class(model_path,
                          backend=backend,
                          backend_config=backend_config,
                          chat_template_config=chat_template_config,
                          max_log_len=max_log_len,
                          **kwargs)
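
The docstring above covers the default call path. As a complement, the sketch below shows how a caller might pin the PyTorch engine and control sampling per request. It is not part of this module; the `GenerationConfig` class and the `gen_config` keyword of the pipeline call are assumptions based on recent lmdeploy releases.

from lmdeploy import GenerationConfig, PytorchEngineConfig, pipeline

# Force the PyTorch engine instead of letting autoget_backend_config decide.
pipe = pipeline('internlm/internlm-chat-7b',
                backend_config=PytorchEngineConfig(tp=1, session_len=4096),
                log_level='INFO')

# GenerationConfig and the `gen_config` keyword are the assumptions noted above.
gen_config = GenerationConfig(max_new_tokens=256, top_p=0.8, temperature=0.7)
responses = pipe(['hi', 'say this is a test'], gen_config=gen_config)
for resp in responses:
    print(resp.text)
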
def serve(model_path: str,
          model_name: Optional[str] = None,
          backend: Literal['turbomind', 'pytorch'] = 'turbomind',
          backend_config: Optional[Union[TurbomindEngineConfig, PytorchEngineConfig]] = None,
          chat_template_config: Optional[ChatTemplateConfig] = None,
          server_name: str = '0.0.0.0',
          server_port: int = 23333,
          log_level: str = 'ERROR',
          api_keys: Optional[Union[List[str], str]] = None,
          ssl: bool = False,
          **kwargs):
    """This will run the api_server in a subprocess.

    Args:
        model_path (str): the path of a model. It could be one of the
            following options:
                - i) A local directory path of a turbomind model which is
                    converted by the `lmdeploy convert` command or downloaded
                    from ii) and iii).
                - ii) The model_id of an lmdeploy-quantized model hosted
                    inside a model repo on huggingface.co, such as
                    "InternLM/internlm-chat-20b-4bit",
                    "lmdeploy/llama2-chat-70b-4bit", etc.
                - iii) The model_id of a model hosted inside a model repo
                    on huggingface.co, such as "internlm/internlm-chat-7b",
                    "Qwen/Qwen-7B-Chat", "baichuan-inc/Baichuan2-7B-Chat"
                    and so on.
        model_name (str): the name of the served model. It can be accessed
            by the RESTful API `/v1/models`. If it is not specified,
            `model_path` will be adopted.
        backend (str): either `turbomind` or `pytorch` backend. Default to
            `turbomind` backend.
        backend_config (TurbomindEngineConfig | PytorchEngineConfig): backend
            config instance. Default to None.
        chat_template_config (ChatTemplateConfig): chat template
            configuration. Default to None.
        server_name (str): host ip for serving
        server_port (int): server port
        log_level (str): set log level whose value is among
            [CRITICAL, ERROR, WARNING, INFO, DEBUG]
        api_keys (List[str] | str | None): Optional list of API keys.
            Accepts string type as a single api_key. Default to None,
            which means no api key applied.
        ssl (bool): Enable SSL. Requires OS environment variables
            'SSL_KEYFILE' and 'SSL_CERTFILE'.

    Return:
        APIClient: A client chatbot for LLaMA series models.

    Examples:
        >>> import lmdeploy
        >>> client = lmdeploy.serve('internlm/internlm-chat-7b', 'internlm-chat-7b')
        >>> for output in client.chat('hi', 1):
        ...     print(output)
    """  # noqa E501
    import time
    from multiprocessing import Process

    from lmdeploy.serve.openai.api_client import APIClient
    from lmdeploy.serve.openai.api_server import serve

    if type(backend_config) is not PytorchEngineConfig:
        # set auto backend mode
        backend_config = autoget_backend_config(model_path, backend_config)
    backend = 'pytorch' if type(backend_config) is PytorchEngineConfig else 'turbomind'

    task = Process(target=serve,
                   args=(model_path, ),
                   kwargs=dict(model_name=model_name,
                               backend=backend,
                               backend_config=backend_config,
                               chat_template_config=chat_template_config,
                               server_name=server_name,
                               server_port=server_port,
                               log_level=log_level,
                               api_keys=api_keys,
                               ssl=ssl,
                               **kwargs),
                   daemon=True)
    task.start()
    client = APIClient(f'http://{server_name}:{server_port}')
    while True:
        time.sleep(1)
        try:
            client.available_models
            print(f'Launched the api_server in process {task.pid}, user can '
                  f'kill the server by:\nimport os,signal\nos.kill({task.pid}, '
                  'signal.SIGKILL)')
            return client
        except:  # noqa
            pass
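
Beyond the docstring example, a typical `serve` lifecycle is: launch the api_server subprocess with an explicit engine config, wait for the returned `APIClient` to see the served models, and later stop the subprocess using the pid printed at launch. The sketch below illustrates that flow; the engine parameters shown are illustrative assumptions, not defaults of this module.

import lmdeploy
from lmdeploy import TurbomindEngineConfig

# serve() blocks until the returned APIClient can list the served models.
client = lmdeploy.serve('internlm/internlm-chat-7b',
                        'internlm-chat-7b',
                        backend_config=TurbomindEngineConfig(tp=1, session_len=4096),
                        server_name='0.0.0.0',
                        server_port=23333)
print(client.available_models)  # names of the served models
# The launch message printed above includes the subprocess pid; the server can
# be stopped with os.kill(<pid>, signal.SIGKILL), as that message suggests.
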
def client(api_server_url: str = 'http://0.0.0.0:23333',
           api_key: Optional[str] = None,
           **kwargs):
    """
    Args:
        api_server_url (str): communicating address 'http://<ip>:<port>' of
            the api_server
        api_key (str | None): api key. Default to None, which means no
            api key will be used.

    Return:
        Chatbot for LLaMA series models with turbomind as inference engine.
    """
    from lmdeploy.serve.openai.api_client import APIClient
    return APIClient(api_server_url, api_key, **kwargs)
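
When an api_server is already running (started by `serve` above or by the `lmdeploy serve api_server` CLI), `client` is the thin entry point to it. In the sketch below, `available_models` is used as it is in this module, while the OpenAI-style `chat_completions_v1` call on `APIClient` is an assumption based on current lmdeploy and may differ across versions.

import lmdeploy

api_client = lmdeploy.client('http://0.0.0.0:23333', api_key=None)
model_name = api_client.available_models[0]  # requires a running api_server

# `chat_completions_v1` is the assumed OpenAI-compatible entrypoint; with
# stream=True it yields incremental chunks instead of one full response.
for chunk in api_client.chat_completions_v1(
        model=model_name,
        messages=[{'role': 'user', 'content': 'say this is a test'}],
        stream=True):
    print(chunk)
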