From 792736d70a4a5acb7a30f57bba3149ea58dad995 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 11 Apr 2024 23:39:26 +0800 Subject: [PATCH 01/67] add build, dependency --- requirements-xpu.txt | 30 ++++++++++++++++++++++++++++++ setup.py | 9 ++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 requirements-xpu.txt diff --git a/requirements-xpu.txt b/requirements-xpu.txt new file mode 100644 index 0000000000000..6f6b6ed5642bf --- /dev/null +++ b/requirements-xpu.txt @@ -0,0 +1,30 @@ +cmake>=3.21 +ninja # For faster builds. +psutil +ray >= 2.9 +sentencepiece # Required for LLaMA tokenizer. +numpy + +torch @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww13/py39/torch-2.1.0a0+gitc61d29a-cp39-cp39-linux_x86_64.whl +# intel_extension_for_pytorch @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww13/py39/intel_extension_for_pytorch-2.1.30+gitcdec5e9-cp39-cp39-linux_x86_64.whl +oneccl_bind_pt @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww13/py39/oneccl_bind_pt-2.1.0+gpu-cp39-cp39-linux_x86_64.whl +# torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/torch-2.1.0a0%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl +# intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/intel_extension_for_pytorch-2.1.10%2Bxpu-cp310-cp310-linux_x86_64.whl +# oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.100%2Bxpu-cp310-cp310-linux_x86_64.whl +#torch == 2.1.0a0+cxx11.abi +#intel_extension_for_pytorch == 2.1.10+xpu +#oneccl_bind_pt == 2.1.100+xpu + +transformers >= 4.39.1 # Required for StarCoder2 & Llava. +fastapi == 0.109.0 +uvicorn[standard] +pydantic >= 2.0 # Required for OpenAI server. 
+prometheus_client >= 0.18.0 +pynvml == 11.5.0 +# outlines == 0.0.34 + +#triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl +# triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + +wheel +einops # Required for phi-1_5 diff --git a/setup.py b/setup.py index 12a704e08eedb..17e5ae9ec137f 100644 --- a/setup.py +++ b/setup.py @@ -233,10 +233,13 @@ def _is_cpu() -> bool: return VLLM_TARGET_DEVICE == "cpu" +def _is_xpu() -> bool: + return VLLM_TARGET_DEVICE == "xpu" + + def _build_custom_ops() -> bool: return _is_cuda() or _is_hip() or _is_cpu() - def _install_punica() -> bool: return envs.VLLM_INSTALL_PUNICA_KERNELS @@ -337,6 +340,8 @@ def get_vllm_version() -> str: version += "+tpu" elif _is_cpu(): version += "+cpu" + elif _is_xpu(): + version += "+xpu" else: raise RuntimeError("Unknown runtime environment") @@ -386,6 +391,8 @@ def _read_requirements(filename: str) -> List[str]: requirements = _read_requirements("requirements-tpu.txt") elif _is_cpu(): requirements = _read_requirements("requirements-cpu.txt") + elif _is_xpu(): + requirements = _read_requirements("requirements-xpu.txt") else: raise ValueError( "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.") From f762493e0fe8afeac9a790855d5d3f6c0c6edc43 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 11 Apr 2024 23:45:04 +0800 Subject: [PATCH 02/67] add python layer support for xpu --- vllm/attention/backends/torch_sdpa.py | 58 +++- vllm/attention/selector.py | 4 +- vllm/config.py | 4 +- vllm/engine/arg_utils.py | 3 +- vllm/engine/async_llm_engine.py | 7 + vllm/engine/llm_engine.py | 8 + vllm/executor/ray_utils.py | 4 +- vllm/executor/ray_xpu_executor.py | 416 ++++++++++++++++++++++++++ vllm/executor/xpu_executor.py | 151 ++++++++++ vllm/utils.py | 38 ++- vllm/worker/cache_engine.py | 7 +- vllm/worker/model_runner.py | 2 +- vllm/worker/worker.py | 3 +- vllm/worker/xpu_model_runner.py | 218 ++++++++++++++ vllm/worker/xpu_worker.py | 330 ++++++++++++++++++++ 15 files changed, 1236 insertions(+), 17 deletions(-) create mode 100644 vllm/executor/ray_xpu_executor.py create mode 100644 vllm/executor/xpu_executor.py create mode 100644 vllm/worker/xpu_model_runner.py create mode 100644 vllm/worker/xpu_worker.py diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 4b08cce99afb0..480188561bf7a 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -121,6 +121,7 @@ def __init__( self.alibi_slopes = alibi_slopes self.sliding_window = sliding_window self.kv_cache_dtype = kv_cache_dtype + self.fuse_batch = is_xpu() assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -137,6 +138,22 @@ def __init__( "Torch SDPA backend does not support FP8 KV cache. 
" "Please use xFormers backend instead.") + def split_kv_cache( + self, + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = 1 + num_blocks = kv_cache.shape[1] + + key_cache = kv_cache[0] + key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, + -1, x) + value_cache = kv_cache[1] + value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) + return key_cache, value_cache + def forward( self, query: torch.Tensor, @@ -165,7 +182,7 @@ def forward( value = value.view(-1, self.num_kv_heads, self.head_size) if kv_cache is not None: - key_cache, value_cache = PagedAttention.split_kv_cache( + key_cache, value_cache = self.split_kv_cache( kv_cache, self.num_kv_heads, self.head_size) PagedAttention.write_to_paged_cache(key, value, key_cache, value_cache, @@ -190,12 +207,21 @@ def forward( attn_metadata.seq_lens, self.sliding_window, query.dtype) # type: ignore else: - att_masks = [None] * len(attn_metadata.seq_lens) + if self.fuse_batch: + att_masks = _make_sliding_window_bias( + attn_metadata.seq_lens, + None, + dtype=query.dtype) + else: + att_masks = [None] * len(attn_metadata.seq_lens) attn_metadata.attn_bias = att_masks - query = query.movedim(0, query.dim() - 2) - key = key.movedim(0, key.dim() - 2) - value = value.movedim(0, value.dim() - 2) + query = query.unsqueeze(0) + key = key.unsqueeze(0) + value = value.unsqueeze(0) + query = query.movedim(1, query.dim() - 2) + key = key.movedim(1, key.dim() - 2) + value = value.movedim(1, value.dim() - 2) start = 0 output = torch.empty( @@ -240,6 +266,28 @@ def forward( return output.view(-1, self.num_heads * self.head_size) +def _make_attention_mask( + att_bias: List[torch.Tensor], + seq_lens: List[int], + prompt_token_num: int, + dtype: torch.dtype, +) -> torch.Tensor: + assert att_bias[0].dim() == 3 + assert len(att_bias) == len(seq_lens) + head_size, _, _ = att_bias[0].size() + mask = torch.empty(head_size, + prompt_token_num, + prompt_token_num, + dtype=dtype) + mask.fill_(-torch.inf) + start = 0 + for seq_len, sub_mask in zip(seq_lens, att_bias): + end = start + seq_len + mask[:, start:end, start:end] = sub_mask + start += seq_len + return mask + + def _make_alibi_bias( alibi_slopes: torch.Tensor, dtype: torch.dtype, diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 8b07fb2d768f5..abc4c26ffe94b 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -7,7 +7,7 @@ import vllm.envs as envs from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger -from vllm.utils import is_cpu, is_hip, is_tpu +from vllm.utils import is_cpu, is_hip, is_tpu, is_xpu logger = init_logger(__name__) @@ -102,7 +102,7 @@ def which_attn_to_use( "(case-sensitive).") selected_backend = _Backend[backend_by_env_var] - if is_cpu(): + if is_cpu() or is_xpu(): if selected_backend != _Backend.TORCH_SDPA: logger.info("Cannot use %s backend on CPU.", selected_backend) return _Backend.TORCH_SDPA diff --git a/vllm/config.py b/vllm/config.py index d9e4a619ee010..6232d733538de 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -12,7 +12,7 @@ from vllm.model_executor.models import ModelRegistry from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu, - is_hip, is_neuron, is_tpu) + is_hip, is_neuron, is_tpu, is_xpu) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -752,6 +752,8 @@ def __init__(self, 
device: str = "auto") -> None: self.device_type = "tpu" elif is_cpu(): self.device_type = "cpu" + elif is_xpu(): + self.device_type = "xpu" else: # We don't call torch.cuda.is_available() here to # avoid initializing CUDA before workers are forked diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 227de5475b949..16bf7e5d66e3f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -504,7 +504,8 @@ def add_cli_args( parser.add_argument("--device", type=str, default=EngineArgs.device, - choices=["auto", "cuda", "neuron", "cpu", "tpu"], + choices=["auto", "cuda", "neuron", "cpu", "tpu", + "xpu"], help='Device type for vLLM execution.') # Related to Vision-language models such as llava diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 943402c865bd2..84e92096040fa 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -20,6 +20,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.usage.usage_lib import UsageContext +from vllm.utils import is_xpu logger = init_logger(__name__) ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S @@ -383,6 +384,12 @@ def from_engine_args( "Distributed execution is not supported with the CPU backend.") from vllm.executor.cpu_executor import CPUExecutorAsync executor_class = CPUExecutorAsync + elif engine_config.device_config.device_type == "xpu" and is_xpu(): + if (engine_config.parallel_config.worker_use_ray): + logger.warning("not support ray yet") + else: + from vllm.executor.xpu_executor import XPUExecutorAsync + executor_class = XPUExecutorAsync elif distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index b2f6478cbfd7b..045eb93ccfee2 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -347,6 +347,14 @@ def from_engine_args( elif engine_config.device_config.device_type == "cpu": from vllm.executor.cpu_executor import CPUExecutor executor_class = CPUExecutor + elif engine_config.device_config.device_type == "xpu": + if engine_config.parallel_config.worker_use_ray: + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_xpu_executor import RayXPUExecutor + executor_class = RayXPUExecutor + else: + from vllm.executor.xpu_executor import XPUExecutor + executor_class = XPUExecutor elif distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) from vllm.executor.ray_gpu_executor import RayGPUExecutor diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 4704f5f1b1a10..495fddd175dd4 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -3,7 +3,7 @@ from vllm.config import ParallelConfig from vllm.logger import init_logger -from vllm.utils import get_ip, is_hip +from vllm.utils import get_ip, is_hip, is_xpu from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -71,7 +71,7 @@ def initialize_ray_cluster( "serving.") # Connect to a ray cluster. 
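# Condensed sketch (an assumed simplification, not taken verbatim from the
# patch) of the executor dispatch added to LLMEngine.from_engine_args earlier
# in this patch; error handling and the non-XPU backends are omitted.
def select_xpu_executor_class(engine_config):
    assert engine_config.device_config.device_type == "xpu"
    if engine_config.parallel_config.worker_use_ray:
        from vllm.executor.ray_xpu_executor import RayXPUExecutor  # added later in this patch
        return RayXPUExecutor
    from vllm.executor.xpu_executor import XPUExecutor  # added later in this patch
    return XPUExecutor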
- if is_hip(): + if is_hip() or is_xpu(): ray.init(address=ray_address, ignore_reinit_error=True, num_gpus=parallel_config.world_size) diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py new file mode 100644 index 0000000000000..c1b4027bce247 --- /dev/null +++ b/vllm/executor/ray_xpu_executor.py @@ -0,0 +1,416 @@ +import asyncio +import copy +import os +import pickle +from collections import defaultdict +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig, VisionLanguageConfig) +from vllm.engine.ray_utils import RayWorkerVllm, ray +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.utils import check_block_size_valid +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + make_async) + +if ray is not None: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + +# If the env var is set, it uses the Ray's compiled DAG API +# which optimizes the control plane overhead. +# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. +USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) + + +class RayXPUExecutor(ExecutorBase): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + vision_language_config: Optional[VisionLanguageConfig], + ) -> None: + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.vision_language_config = vision_language_config + + assert self.parallel_config.worker_use_ray + placement_group = self.parallel_config.placement_group + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel GPU workers. + self._init_workers_ray(placement_group) + + # Profile the memory usage and initialize the cache. + self._init_cache() + + self.forward_dag = None + if USE_RAY_COMPILED_DAG: + self.forward_dag = self._compiled_ray_dag() + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + if self.parallel_config.tensor_parallel_size == 1: + # For single GPU case, we use a ray worker with constrained memory. + num_gpus = self.cache_config.gpu_memory_utilization + else: + # Otherwise, the ray workers are allocated with a full GPU. + num_gpus = 1 + + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: RayWorkerVllm = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerVllm] = [] + + # Create the workers. 
+ driver_ip = get_ip() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("GPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + worker = ray.remote( + num_cpus=0, + num_gpus=num_gpus, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerVllm).remote(self.model_config.trust_remote_code) + + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + else: + # Else, added to the list of workers. + self.workers.append(worker) + + if self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any GPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "GPU node.") + + # Get the set of GPU IDs used on each node. + driver_node_id, driver_gpu_ids = ray.get( + self.driver_dummy_worker.get_node_and_gpu_ids.remote()) + worker_node_and_gpu_ids = ray.get( + [worker.get_node_and_gpu_ids.remote() for worker in self.workers]) + + node_workers = defaultdict(list) + node_gpus = defaultdict(list) + + node_workers[driver_node_id].append(0) + node_gpus[driver_node_id].extend(driver_gpu_ids) + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids, + start=1): + node_workers[node_id].append(i) + node_gpus[node_id].extend(gpu_ids) + for node_id, gpu_ids in node_gpus.items(): + node_gpus[node_id] = sorted(gpu_ids) + + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + from vllm.worker.xpu_worker import XPUWorker + + model_config = copy.deepcopy(self.model_config) + parallel_config = copy.deepcopy(self.parallel_config) + scheduler_config = copy.deepcopy(self.scheduler_config) + device_config = copy.deepcopy(self.device_config) + lora_config = copy.deepcopy(self.lora_config) + kv_cache_dtype = self.cache_config.cache_dtype + + # Initialize the actual workers with the Worker class. + for rank, (worker, (node_id, _)) in enumerate( + zip(self.workers, worker_node_and_gpu_ids), + start=1, + ): + local_rank = node_workers[node_id].index(rank) + worker.init_worker.remote( + lambda rank=rank, local_rank=local_rank: XPUWorker( + model_config, + parallel_config, + scheduler_config, + device_config, + local_rank, + rank, + distributed_init_method, + lora_config=lora_config, + kv_cache_dtype=kv_cache_dtype, + )) + + # Initialize the driver worker with the Worker class. + driver_rank = 0 + driver_local_rank = node_workers[driver_node_id].index(driver_rank) + self.driver_worker = XPUWorker( + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, + driver_local_rank, + driver_rank, + distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=kv_cache_dtype, + is_driver_worker=True, + ) + + # FIXME(woosuk): We are not properly initializing cupy NCCL when + # we have multiple nodes. + self._run_workers("init_device") + self._run_workers( + "load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers, + ) + + def _init_cache(self) -> None: + """Profiles the memory usage and initializes the KV cache. 
+ + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. + More details can be found in the + :meth:`~vllm.worker.worker.xpu_Worker.determine_num_available_blocks` + method from class :class:`~vllm.worker.xpu_Worker`. + + Afterwards, as there may be multiple workers, + we take the minimum number of blocks across all workers + to ensure this can be applied to all of them. + + Finally, the engine will initialize the KV cache + with the calculated number of blocks. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_blocks = self._run_workers("determine_num_available_blocks", ) + + # Since we use a shared centralized controller, we take the minimum + # number of blocks across all workers to make sure all the memory + # operators can be applied to all workers. + num_gpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + self._run_workers("init_cache_engine", cache_config=self.cache_config) + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. + self._run_workers("warm_up_model") + + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + all_outputs = self._run_workers( + "execute_model", + driver_kwargs={ + "seq_group_metadata_list": seq_group_metadata_list, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + }, + use_ray_compiled_dag=USE_RAY_COMPILED_DAG) + + # Only the driver worker returns the sampling results. + output = all_outputs[0] + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "add_lora", + lora_request=lora_request, + ) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "remove_lora", + lora_id=lora_id, + ) + + def list_loras(self) -> List[int]: + return self._run_workers("list_loras") + + def _run_workers( + self, + method: str, + *args, + driver_args: Optional[List[Any]] = None, + driver_kwargs: Optional[Dict[str, Any]] = None, + max_concurrent_workers: Optional[int] = None, + use_ray_compiled_dag: bool = False, + **kwargs, + ) -> Any: + """Runs the given method on all workers.""" + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + if use_ray_compiled_dag: + # Right now, compiled DAG can only accept a single + # input. TODO(sang): Fix it. + output_channels = self.forward_dag.execute(1) + else: + # Start the ray workers first. 
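# Sketch (an assumed simplification, not taken verbatim from the patch) of the
# fan-out pattern _run_workers uses just below: remote calls are dispatched to
# the Ray workers first so they execute concurrently with the driver worker's
# local call, and the Ray futures are gathered last.
import ray

def run_on_driver_and_workers(method, driver_worker, ray_workers, *args, **kwargs):
    futures = [w.execute_method.remote(method, *args, **kwargs) for w in ray_workers]
    driver_output = getattr(driver_worker, method)(*args, **kwargs)  # overlaps with the remote workers
    return [driver_output] + ray.get(futures)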
+ ray_worker_outputs = [ + worker.execute_method.remote(method, *args, **kwargs) + for worker in self.workers + ] + + if driver_args is None: + driver_args = args + if driver_kwargs is None: + driver_kwargs = kwargs + + # Start the driver worker after all the ray workers. + driver_worker_output = getattr(self.driver_worker, + method)(*driver_args, **driver_kwargs) + + # Get the results of the ray workers. + if self.workers: + if use_ray_compiled_dag: + try: + ray_worker_outputs = [ + pickle.loads(chan.begin_read()) + for chan in output_channels + ] + finally: + # Has to call end_read in order to reuse the DAG. + for chan in output_channels: + chan.end_read() + else: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return [driver_worker_output] + ray_worker_outputs + + def _compiled_ray_dag(self): + import pkg_resources + required_version = "2.9" + current_version = pkg_resources.get_distribution("ray").version + if current_version < required_version: + raise ValueError(f"Ray version {required_version} or greater is " + f"required, but found {current_version}") + + from ray.dag import InputNode, MultiOutputNode + assert self.parallel_config.worker_use_ray + + # Right now, compiled DAG requires at least 1 arg. We send + # a dummy value for now. It will be fixed soon. + with InputNode() as input_data: + forward_dag = MultiOutputNode([ + worker.execute_model_compiled_dag_remote.bind(input_data) + for worker in self.workers + ]) + return forward_dag.experimental_compile() + + def check_health(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + + def _check_if_any_actor_is_dead(self): + if not self.workers: + return + + dead_actors = [] + for actor in self.workers: + actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access + if actor_state["State"] == "DEAD": + dead_actors.append(actor) + if dead_actors: + raise RuntimeError("At least one Worker is dead. " + f"Dead Workers: {dead_actors}. ") + + +class RayXPUExecutorAsync(RayXPUExecutor, ExecutorAsyncBase): + + async def _run_workers_async( + self, + method: str, + *args, + driver_args: Optional[List[Any]] = None, + driver_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> Any: + """Runs the given method on all workers.""" + coros = [] + + if driver_args is None: + driver_args = args + if driver_kwargs is None: + driver_kwargs = kwargs + + # Run the driver worker asynchronously. + driver_executor = make_async(getattr(self.driver_worker, method)) + coros.append(driver_executor(*driver_args, **driver_kwargs)) + + # Run the ray workers asynchronously. + for worker in self.workers: + coros.append(worker.execute_method.remote(method, *args, **kwargs)) + + all_outputs = await asyncio.gather(*coros) + return all_outputs + + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + all_outputs = await self._run_workers_async( + "execute_model", + driver_kwargs={ + "seq_group_metadata_list": seq_group_metadata_list, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + }) + + # Only the driver worker returns the sampling results. 
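# Sketch (an assumed simplification, not vLLM's exact implementation) of the
# make_async helper imported from vllm.utils and used in _run_workers_async
# above: the blocking driver-worker call is handed to the default thread-pool
# executor so the event loop stays free while the Ray futures are awaited.
import asyncio
from functools import partial

def make_async(func):
    async def _async_wrapper(*args, **kwargs):
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, partial(func, *args, **kwargs))
    return _async_wrapper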
+ output = all_outputs[0] + return output + + async def check_health_async(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py new file mode 100644 index 0000000000000..687e61ec0cb3a --- /dev/null +++ b/vllm/executor/xpu_executor.py @@ -0,0 +1,151 @@ +from typing import Dict, List, Optional + +import torch + +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig, SpeculativeConfig, + VisionLanguageConfig) +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + make_async) + +logger = init_logger(__name__) + + +class XPUExecutor(ExecutorBase): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + vision_language_config: Optional[VisionLanguageConfig], + speculative_config: Optional[SpeculativeConfig], + ) -> None: + assert device_config.device_type == "xpu" + assert (not speculative_config + ), "Speculative decoding not yet supported for XPU backend" + + model_config = _verify_and_get_model_config(model_config) + + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.vision_language_config = vision_language_config + + # Instantiate the worker and load the model to GPU. + self._init_worker() + + def _init_worker(self): + from vllm.worker.xpu_worker import XPUWorker + + assert self.parallel_config.world_size == 1, ( + "XPUExecutor only supports single GPU.") + + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + self.driver_worker = XPUWorker( + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + is_driver_worker=True, + ) + self.driver_worker.init_device() + self.driver_worker.load_model() + + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: + """Initialize the KV cache by invoking the underlying worker. + """ + # NOTE: This is logged in the executor because there can be >1 worker + # with other executors. We could log in the engine level, but work + # remains to abstract away the device for non-GPU configurations. + logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + + def determine_num_available_blocks(self) -> tuple[int, int]: + """Determine the number of available KV blocks by invoking the + underlying worker. 
+ """ + return self.driver_worker.determine_num_available_blocks() + + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + output = self.driver_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self.driver_worker.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self.driver_worker.remove_lora(lora_id) + + def list_loras(self) -> List[int]: + return self.driver_worker.list_loras() + + def check_health(self) -> None: + # XPUExecutor will always be healthy as long as + # it's running. + return + + +class XPUExecutorAsync(XPUExecutor, ExecutorAsyncBase): + + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + output = await make_async(self.driver_worker.execute_model)( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy) + return output + + async def check_health_async(self) -> None: + # XPUExecutor will always be healthy as long as + # it's running. + return + + +def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: + if config.dtype == torch.bfloat16: + logger.warning( + "bfloat16 is not fully supported on XPU, casting to float16.") + config.dtype = torch.float16 + if not config.enforce_eager: + logger.warning( + "CUDA graph is not supported on CPU, fallback to the eager " + "mode.") + config.enforce_eager = True + return config diff --git a/vllm/utils.py b/vllm/utils.py index b5c42605ba358..b7b020648da42 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -21,6 +21,12 @@ import psutil import torch +try: + import intel_extension_for_pytorch # noqa: F401 + _import_ipex = True +except ImportError: + _import_ipex = False + import vllm.envs as envs from vllm import _custom_ops as ops from vllm.logger import enable_trace_function_call, init_logger @@ -155,6 +161,20 @@ def is_tpu() -> bool: return libtpu is not None +@lru_cache(maxsize=None) +def is_xpu() -> bool: + from importlib.metadata import version + is_xpu_flag = "xpu" in version("vllm") + # vllm is not build with xpu + if not is_xpu_flag: + return False + # ipex dependency is not ready + if not _import_ipex: + logger.warning("not found ipex lib") + return False + return hasattr(torch, "xpu") and torch.xpu.is_available() + + @lru_cache(maxsize=None) def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" @@ -474,6 +494,9 @@ def is_pin_memory_available() -> bool: print_warning_once("Using 'pin_memory=False' as WSL is detected. 
" "This may slow down the performance.") return False + elif is_xpu(): + print_warning_once("Pin memory is not supported on XPU.") + return False elif is_neuron(): print_warning_once("Pin memory is not supported on Neuron.") return False @@ -482,6 +505,13 @@ def is_pin_memory_available() -> bool: return True +def device_sync(): + if torch.cuda.is_available(): + torch.cuda.synchronize() + elif is_xpu(): + torch.xpu.synchronize() + + class CudaMemoryProfiler: def __init__(self, device=None): @@ -489,8 +519,12 @@ def __init__(self, device=None): def current_memory_usage(self) -> float: # Return the memory usage in bytes. - torch.cuda.reset_peak_memory_stats(self.device) - mem = torch.cuda.max_memory_allocated(self.device) + if torch.cuda.is_available(): + torch.cuda.reset_peak_memory_stats(self.device) + mem = torch.cuda.max_memory_allocated(self.device) + elif is_xpu(): + torch.xpu.reset_peak_memory_stats(self.device) + mem = torch.xpu.max_memory_allocated(self.device) return mem def __enter__(self): diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 341b177d4af2a..fbd1343fea19c 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -4,7 +4,7 @@ import torch from vllm.attention import get_attn_backend -from vllm.config import CacheConfig, ModelConfig, ParallelConfig +from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_pin_memory_available) @@ -25,10 +25,12 @@ def __init__( cache_config: CacheConfig, model_config: ModelConfig, parallel_config: ParallelConfig, + device_config: DeviceConfig, ) -> None: self.cache_config = cache_config self.model_config = model_config self.parallel_config = parallel_config + self.device_config = device_config self.head_size = model_config.get_head_size() self.num_layers = model_config.get_num_layers(parallel_config) @@ -55,7 +57,8 @@ def __init__( ) # Initialize the cache. 
- self.gpu_cache = self._allocate_kv_cache(self.num_gpu_blocks, "cuda") + self.gpu_cache = self._allocate_kv_cache( + self.num_gpu_blocks, self.device_config.device_type) self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks, "cpu") def _allocate_kv_cache( diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 476e9ba3bb463..81a63cde64256 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -842,7 +842,7 @@ def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers self.execute_model(seqs, kv_caches) - torch.cuda.synchronize() + device_sync() return def remove_all_loras(self): diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 7a378a862d0c0..f9b8a065a8b24 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -205,7 +205,8 @@ def initialize_cache(self, num_gpu_blocks: int, def _init_cache_engine(self): assert self.cache_config.num_gpu_blocks is not None self.cache_engine = CacheEngine(self.cache_config, self.model_config, - self.parallel_config) + self.parallel_config, + self.device_config) self.gpu_cache = self.cache_engine.gpu_cache def _warm_up_model(self) -> None: diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py new file mode 100644 index 0000000000000..715ef82d9d26b --- /dev/null +++ b/vllm/worker/xpu_model_runner.py @@ -0,0 +1,218 @@ +from typing import List, Set + +import torch + +from vllm.lora.request import LoRARequest +from vllm.sequence import SequenceGroupMetadata +from vllm.utils import make_tensor_with_pad +from vllm.worker.model_runner import ModelRunner, PreparePromptMetadata + +_PAD_SLOT_ID = -1 + + +class XPUModelRunner(ModelRunner): + + def _prepare_prompt( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> PreparePromptMetadata: + input_tokens: List[int] = [] + input_positions: List[int] = [] + slot_mapping: List[int] = [] + lora_index_mapping: List[int] = [] + lora_prompt_mapping: List[int] = [] + lora_requests: Set[LoRARequest] = set() + + prompt_lens: List[int] = [] + context_lens: List[int] = [] + subquery_lens: List[int] = [] + prefix_block_tables: List[List[int]] = [] + multi_modal_input_list: List[torch.Tensor] = [] + + if len(seq_group_metadata_list) == 0: + return PreparePromptMetadata.empty() + + for seq_group_metadata in seq_group_metadata_list: + assert seq_group_metadata.is_prompt + seq_ids = list(seq_group_metadata.seq_data.keys()) + assert len(seq_ids) == 1 + seq_id = seq_ids[0] + + computed_block_nums = seq_group_metadata.computed_block_nums + if (self.scheduler_config is not None + and self.scheduler_config.chunked_prefill_enabled + and computed_block_nums is not None): + raise RuntimeError( + "chunked prefill cannot be used with prefix caching " + "now.") + + token_chunk_size = seq_group_metadata.token_chunk_size + seq_data = seq_group_metadata.seq_data[seq_id] + computed_len = seq_data.get_num_computed_tokens() + # We should use get_len here because in case of preemption + # it contains output tokens. + prefill_end = min(seq_data.get_len(), + computed_len + token_chunk_size) + # TODO(sang): Rename it after chunked prefill is introduced. + prompt_tokens = seq_data.get_token_ids()[computed_len:prefill_end] + prompt_len = len(prompt_tokens) + # Right now, the prefill_end is always same as the length of + # sequence. However, once chunked prefill is introduced, this + # assumption can be changed. 
+ assert prefill_end == seq_data.get_len() + prompt_lens.append(prompt_len) + + # NOTE: This only works for oooooooxxx style attention. + if computed_block_nums is not None and len( + computed_block_nums) > 0 and self.sliding_window is None: + # Prefix is not supported with sliding_window + computed_len = len(computed_block_nums) * self.block_size + prompt_tokens = prompt_tokens[computed_len:] + prefix_block_tables.append(computed_block_nums) + else: + prefix_block_tables.append([]) + # Right now, prefill start is always 0. However, this + # assumption can be changed once chunked prefill is introduced. + assert computed_len == 0 + + # actual prompt lens + context_lens.append(computed_len) + subquery_lens.append(prompt_len - computed_len) + + input_tokens.extend(prompt_tokens) + # NOTE(woosuk): Here we assume that the first token in the prompt + # is always the first token in the sequence. + input_positions.extend(list(range(computed_len, prefill_end))) + + lora_id = seq_group_metadata.lora_int_id + + if lora_id > 0: + lora_requests.add(seq_group_metadata.lora_request) + + lora_index_mapping += [lora_id] * (prompt_len - computed_len) + lora_prompt_mapping.extend( + [lora_id] * + (prompt_len - computed_len + if seq_group_metadata.sampling_params.prompt_logprobs else 1)) + + if seq_group_metadata.multi_modal_data: + multi_modal_input_list.append( + seq_group_metadata.multi_modal_data.data) + + if seq_group_metadata.block_tables is None: + # During memory profiling, the block tables are not initialized + # yet. In this case, we just use a dummy slot mapping. + slot_mapping.extend([_PAD_SLOT_ID] * prompt_len) + continue + + # Compute the slot mapping. + block_table = seq_group_metadata.block_tables[seq_id] + # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, + # where start_idx is max(0, prompt_len - sliding_window). + # For example, if the prompt len is 10, sliding window is 8, and + # block size is 4, the first two tokens are masked and the slot + # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. 
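# Worked example (a sketch, not taken verbatim from the patch) reproducing the
# mapping described in the comment above. The block table [0, 1, 0] is an
# assumed value chosen so that prompt_len=10, sliding_window=8, block_size=4
# yields the documented result.
def sliding_window_slot_mapping(block_table, prompt_len, block_size, sliding_window, pad_slot_id=-1):
    start_idx = max(0, prompt_len - sliding_window)
    slots = []
    for i in range(prompt_len):
        if i < start_idx:
            slots.append(pad_slot_id)  # token falls outside the sliding window
        else:
            slots.append(block_table[i // block_size] * block_size + i % block_size)
    return slots

assert sliding_window_slot_mapping([0, 1, 0], 10, 4, 8) == [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]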
+ start_idx = 0 + if self.sliding_window is not None: + assert computed_len == 0, ( + "Prefix caching is currently not supported with " + "sliding window attention") + start_idx = max(0, prompt_len - self.sliding_window) + + for i in range(computed_len, prefill_end): + if i < start_idx: + slot_mapping.append(_PAD_SLOT_ID) + continue + + block_number = block_table[i // self.block_size] + block_offset = i % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping.append(slot) + + # padding to 8 bytes for XeTLA + padding_num = (8 - len(input_tokens) % 8) % 8 + input_tokens.extend([0] * padding_num) + input_positions.extend([0] * padding_num) + slot_mapping.extend([_PAD_SLOT_ID] * padding_num) + prompt_lens[-1] += padding_num + + max_subquery_len = max(subquery_lens) + max_prompt_len = max(prompt_lens) + assert max_subquery_len > 0 + + context_lens_tensor = torch.tensor(context_lens, + dtype=torch.int, + device=self.device) + + if multi_modal_input_list: + assert self.vision_language_config, ( + "Multi-modal inputs are only supported by " + "vision language models.") + multi_modal_input = torch.cat(multi_modal_input_list, + dim=0).to(self.device) + else: + multi_modal_input = None + + # Prepare prefix block tables + max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) + block_tables = make_tensor_with_pad( + prefix_block_tables, + max_len=max_prompt_block_table_len, + pad=0, + dtype=torch.int, + device=self.device, + ) + + # Query length can be shorter than key (i.e., prompt) when prefill + # is chunked or prefix cached. + subquery_lens_tensor = torch.tensor(subquery_lens, + dtype=torch.long, + device=self.device) + subquery_start_loc = torch.zeros(subquery_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=self.device) + + prompt_lens_tensor = torch.tensor(prompt_lens, + dtype=torch.long, + device=self.device) + seq_start_loc = torch.zeros(prompt_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=self.device) + + torch.cumsum(subquery_lens_tensor, + dim=0, + dtype=subquery_start_loc.dtype, + out=subquery_start_loc[1:]) + + torch.cumsum(prompt_lens_tensor, + dim=0, + dtype=seq_start_loc.dtype, + out=seq_start_loc[1:]) + + attn_metadata = self.attn_backend.make_metadata( + is_prompt=True, + # slot_mapping=slot_mapping, + prompt_lens=prompt_lens, + prompt_lens_tensor=prompt_lens_tensor, + # num_prompt_tokens=num_prompt_tokens, + # num_generation_tokens=0, + max_subquery_len=max_subquery_len, + max_context_len=None, + max_prompt_len=max_prompt_len, + subquery_start_loc=subquery_start_loc, + seq_start_loc=seq_start_loc, + context_lens=context_lens_tensor, + block_tables=block_tables, + use_cuda_graph=False, + # kv_cache_dtype=self.kv_cache_dtype, + ) + return PreparePromptMetadata(input_tokens, + input_positions, + attn_metadata, + prompt_lens, + subquery_lens, + lora_index_mapping, + lora_prompt_mapping, + lora_requests, + multi_modal_input, + slot_mapping=slot_mapping) diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py new file mode 100644 index 0000000000000..8194ef36ba589 --- /dev/null +++ b/vllm/worker/xpu_worker.py @@ -0,0 +1,330 @@ +"""A XPU worker class.""" +import gc +from typing import Dict, List, Optional, Tuple + +import intel_extension_for_pytorch # noqa: F401 +import oneccl_bindings_for_pytorch # noqa: F401 +import torch +import torch.distributed + +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig, VisionLanguageConfig) +from vllm.distributed import 
(broadcast_tensor_dict, + ensure_model_parallel_initialized) +from vllm.logger import init_logger +from vllm.model_executor import set_random_seed +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import is_xpu +from vllm.worker.cache_engine import CacheEngine +from vllm.worker.worker import raise_if_cache_size_invalid +from vllm.worker.worker_base import LoraNotSupportedWorkerBase +from vllm.worker.xpu_model_runner import XPUModelRunner + +logger = init_logger(__name__) + + +class XPUWorker(LoraNotSupportedWorkerBase): + """A worker class that executes (a partition of) the model on a GPU. + + Each worker is associated with a single XPU device. The worker is + responsible for maintaining the KV cache and executing the model on the + XPU. In case of distributed inference, each worker is assigned a partition + of the model. + """ + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + cache_config: CacheConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + lora_config: Optional[LoRAConfig] = None, + vision_language_config: Optional[VisionLanguageConfig] = None, + is_driver_worker: bool = False, + ) -> None: + assert device_config.device_type == "xpu" + assert is_xpu() + + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.cache_config = cache_config + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + self.lora_config = lora_config + self.is_driver_worker = is_driver_worker + if self.is_driver_worker: + assert self.rank == 0, "The driver worker must have rank 0." + + self.vision_language_config = vision_language_config + if self.vision_language_config: + assert not self.lora_config, ( + "To be tested: vision language model with LoRA settings.") + + self.model_runner = XPUModelRunner( + model_config, + parallel_config, + scheduler_config, + device_config, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=is_driver_worker, + vision_language_config=vision_language_config) + # Uninitialized cache engine. Will be initialized by + # initialize_cache. + self.cache_engine = None + self.gpu_cache = None + + def init_device(self) -> None: + if self.device_config.device.type == "xpu" and is_xpu(): + self.device = torch.device(f"xpu:{self.local_rank}") + torch.xpu.set_device(self.device) + torch.xpu.empty_cache() + self.init_gpu_memory = torch.xpu.get_device_properties( + self.local_rank).total_memory + else: + raise RuntimeError( + f"Not support device type: {self.device_config.device}") + # Initialize the distributed environment. + self.init_distributed_environment() + # Initialize the model. + set_random_seed(self.model_config.seed) + + def load_model(self): + self.model_runner.load_model() + + @torch.inference_mode() + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Profiles the peak memory usage of the model to determine how many + KV blocks may be allocated without OOMs. + + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. 
+ """ + # Profile the memory usage of the model and get the maximum number of + # cache blocks that can be allocated with the remaining free memory. + torch.xpu.empty_cache() + + # Execute a forward pass with dummy inputs to profile the memory usage + # of the model. + self.model_runner.profile_run() + + # Calculate the number of blocks that can be allocated with the + # profiled peak memory. + torch.xpu.synchronize() + used_memory = torch.xpu.memory_allocated() + total_gpu_memory = torch.xpu.get_device_properties( + self.local_rank).total_memory + free_gpu_memory = total_gpu_memory - used_memory + + # NOTE(woosuk): Here we assume that the other processes using the same + # GPU did not change their memory usage during the profiling. + peak_memory = self.init_gpu_memory - free_gpu_memory + assert peak_memory > 0, ( + "Error in memory profiling. This happens when the GPU memory was " + "not properly cleaned up before initializing the vLLM instance.") + + cache_block_size = self.get_cache_block_size_bytes() + num_gpu_blocks = int( + (total_gpu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) // cache_block_size) + num_cpu_blocks = int(self.cache_config.swap_space_bytes // + cache_block_size) + num_gpu_blocks = max(num_gpu_blocks, 0) + num_cpu_blocks = max(num_cpu_blocks, 0) + if self.model_runner.lora_manager: + self.model_runner.remove_all_loras() + gc.collect() + torch.xpu.empty_cache() + return num_gpu_blocks, num_cpu_blocks + + @torch.inference_mode() + def profile_num_available_blocks( + self, + block_size: int, + gpu_memory_utilization: float, + cpu_swap_space: int, + cache_dtype: str, + ) -> Tuple[int, int]: + """Profiles the peak memory usage of the model and returns the maximum + number of GPU and CPU cache blocks that can be allocated. + + Args: + block_size: The size of the cache block. + gpu_memory_utilization: The fraction of the total GPU memory to use. + cpu_swap_space: The size of the CPU swap space in bytes. + """ + # Profile the memory usage of the model and get the maximum number of + # cache blocks that can be allocated with the remaining free memory. + torch.xpu.empty_cache() + + # Execute a forward pass with dummy inputs to profile the memory usage + # of the model. + self.model_runner.profile_run() + + # Calculate the number of blocks that can be allocated with the + # profiled peak memory. + torch.xpu.synchronize() + + used_memory = torch.xpu.memory_allocated() + total_gpu_memory = torch.xpu.get_device_properties( + self.local_rank).total_memory + # print(f"rank:{self.local_rank}, used_memory:{used_memory}") + + free_gpu_memory = total_gpu_memory - used_memory + # NOTE(woosuk): Here we assume that the other processes using the same + # GPU did not change their memory usage during the profiling. + peak_memory = self.init_gpu_memory - free_gpu_memory + + cache_block_size = self.get_cache_block_size_bytes( + block_size, cache_dtype) + num_gpu_blocks = int( + (total_gpu_memory * gpu_memory_utilization - peak_memory) // + cache_block_size) + num_cpu_blocks = int(cpu_swap_space // cache_block_size) + num_gpu_blocks = max(num_gpu_blocks, 0) + num_cpu_blocks = max(num_cpu_blocks, 0) + if self.model_runner.lora_manager: + self.model_runner.remove_all_loras() + gc.collect() + torch.xpu.empty_cache() + return num_gpu_blocks, num_cpu_blocks + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + """Allocate GPU and CPU KV cache with the specified number of blocks. + + This also warms up the model, which may record CUDA graphs. 
+ """ + raise_if_cache_size_invalid(num_gpu_blocks, + self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + self._init_cache_engine() + self._warm_up_model() + + def _init_cache_engine(self) -> None: + assert self.cache_config.num_gpu_blocks is not None + self.cache_engine = CacheEngine(self.cache_config, self.model_config, + self.parallel_config, + self.device_config) + self.gpu_cache = self.cache_engine.gpu_cache + self.model_runner.set_block_size(self.cache_engine.block_size) + + def _warm_up_model(self) -> None: + if not self.model_config.enforce_eager: + self.model_runner.capture_model(self.gpu_cache) + # Reset the seed to ensure that the random state is not affected by + # the model initialization and profiling. + set_random_seed(self.model_config.seed) + + def get_cache_block_size_bytes(self) -> int: + """Get the size of the KV cache block size in bytes. + """ + return CacheEngine.get_cache_block_size(self.cache_config, + self.model_config, + self.parallel_config) + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None, + blocks_to_swap_in: Optional[Dict[int, int]] = None, + blocks_to_swap_out: Optional[Dict[int, int]] = None, + blocks_to_copy: Optional[Dict[int, List[int]]] = None, + ) -> Optional[SamplerOutput]: + if self.is_driver_worker: + assert seq_group_metadata_list is not None + num_seq_groups = len(seq_group_metadata_list) + assert blocks_to_swap_in is not None + assert blocks_to_swap_out is not None + assert blocks_to_copy is not None + data = { + "num_seq_groups": num_seq_groups, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + } + broadcast_tensor_dict(data, src=0) + else: + data = broadcast_tensor_dict(src=0) + num_seq_groups = data["num_seq_groups"] + blocks_to_swap_in = data["blocks_to_swap_in"] + blocks_to_swap_out = data["blocks_to_swap_out"] + blocks_to_copy = data["blocks_to_copy"] + + self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) + + # If there is no input, we don't need to execute the model. + if num_seq_groups == 0: + return {} + + output = self.model_runner.execute_model(seq_group_metadata_list, + self.gpu_cache) + return output + + def cache_swap( + self, + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> None: + # Issue cache operations. + # TODO(woosuk): Profile swapping overhead and optimize if needed. + if blocks_to_swap_in: + self.cache_engine.swap_in(blocks_to_swap_in) + if blocks_to_swap_out: + self.cache_engine.swap_out(blocks_to_swap_out) + if blocks_to_copy: + self.cache_engine.copy(blocks_to_copy) + + def init_distributed_environment(self) -> None: + """Initialize the distributed environment.""" + + parallel_config = self.parallel_config + rank = self.rank + distributed_init_method = self.distributed_init_method + + if torch.distributed.is_initialized(): + torch_world_size = torch.distributed.get_world_size() + if torch_world_size != parallel_config.world_size: + raise RuntimeError( + "torch.distributed is already initialized but the torch " + "world size does not match parallel_config.world_size " + f"({torch_world_size} vs. 
{parallel_config.world_size}).") + elif not distributed_init_method: + raise ValueError( + "distributed_init_method must be set if torch.distributed " + "is not already initialized") + else: + import os + ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", + "sockets") + os.environ['CCL_ZE_IPC_EXCHANGE'] = ENV_CCL_ZE_IPC_EXCHANGE + torch.distributed.init_process_group( + backend="ccl", + world_size=parallel_config.world_size, + rank=rank, + init_method=distributed_init_method, + ) + + # A small all_reduce for warmup. + torch.distributed.all_reduce(torch.zeros(1).xpu()) + + ensure_model_parallel_initialized( + parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) From 068d34b2bff955ec38a5c8551539901b691d5c29 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 12 Apr 2024 01:31:47 +0800 Subject: [PATCH 03/67] add ipex ops --- vllm/_custom_ops.py | 6 ++ vllm/_ipex_ops.py | 214 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 220 insertions(+) create mode 100644 vllm/_ipex_ops.py diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 2f84b8bde6b57..430f18cce9338 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -13,6 +13,12 @@ except ImportError as e: logger.warning("Failed to import from vllm._C with %r", e) +# todo: how to support cpu+ipex? +from vllm.utils import is_xpu +if is_xpu(): + from vllm._ipex_ops import ipex_ops as vllm_ops + from vllm._ipex_ops import ipex_cache_ops as vllm_cache_ops + with contextlib.suppress(ImportError): import vllm._moe_C diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py new file mode 100644 index 0000000000000..df86120720c68 --- /dev/null +++ b/vllm/_ipex_ops.py @@ -0,0 +1,214 @@ +import torch +import intel_extension_for_pytorch as ipex +from typing import Dict, Optional + + +class ipex_ops: + + @staticmethod + def reshape_activation_tensor(x: torch.Tensor): + num = x.size(0) + d = x.size(1) // 2 + x = x.reshape(num, 2, d) + x1, x2 = torch.chunk(x, chunks=2, dim=1) + x1 = x1.reshape(num, d) + x2 = x2.reshape(num, d) + return x1, x2 + + def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + x1, x2 = ipex_ops.reshape_activation_tensor(x) + ipex.llm.functional.silu_mul(x1, x2, out) + + def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + x1, x2 = ipex_ops.reshape_activation_tensor(x) + ipex.llm.functional.gelu_mul(x1, x2, out, "none") + + def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + x1, x2 = ipex_ops.reshape_activation_tensor(x) + ipex.llm.functional.gelu_mul(x1, x2, out, "tanh") + + def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + out.copy_(torch.nn.functional.gelu(x)) + + def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + out.copy_(torch.nn.functional.gelu(x)) + + def paged_attention_v1( + out: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + num_kv_heads: int, + scale: float, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + block_size: int, + max_context_len: int, + alibi_slopes: Optional[torch.Tensor], + kv_cache_dtype: str, + kv_scale: float, + ) -> None: + assert kv_cache_dtype == "auto" + num_heads = out.size(1) + num_queries_per_tokens = num_heads // num_kv_heads + head_mapping = torch.arange( + 0, + num_kv_heads, + device=query.device, + dtype=torch.int32, + ).view(num_kv_heads, + 1).repeat_interleave(num_queries_per_tokens).flatten() + # todo: ipex will refactor namespace + torch.xpu.paged_attention_v1(out, query.contiguous(), + key_cache.view_as(value_cache), + 
value_cache, head_mapping, scale, + block_tables, context_lens, block_size, + max_context_len, alibi_slopes) + + def paged_attention_v2( + out: torch.Tensor, + exp_sum: torch.Tensor, + max_logits: torch.Tensor, + tmp_out: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + num_kv_heads: int, + scale: float, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + block_size: int, + max_context_len: int, + alibi_slopes: Optional[torch.Tensor], + kv_cache_dtype: str, + kv_scale: float, + ) -> None: + assert kv_cache_dtype == "auto" + num_heads = out.size(1) + num_queries_per_tokens = num_heads // num_kv_heads + head_mapping = torch.arange( + 0, + num_kv_heads, + dtype=torch.int32, + ).view(num_kv_heads, + 1).repeat_interleave(num_queries_per_tokens).flatten() + # todo: ipex will refactor namespace + torch.xpu.paged_attention_v2(out, exp_sum, max_logits, tmp_out, + query.contiguous(), + key_cache.view_as(value_cache), + value_cache, head_mapping, block_tables, + context_lens, scale, block_size, + max_context_len, alibi_slopes) + + def rotary_embedding( + positions: torch.Tensor, # [batch_size, seq_len] + query: torch.Tensor, # [batch_size, seq_len, num_heads*head_size] + key: torch.Tensor, # [batch_size, seq_len, num_kv_heads*head_size] + head_size: int, + cos_sin_cache: torch.Tensor, # [cos_sin_dim, rot_dim] + is_neox: bool, + ) -> None: + if positions.dim() == 1: + positions = positions.unsqueeze(0) + query = query.unsqueeze(0) + key = key.unsqueeze(0) + + rotary_dim = cos_sin_cache.size(1) + query = query.view(*query.shape[:-1], -1, head_size) + key = key.view(*key.shape[:-1], -1, head_size) + + query_rot = query[..., :rotary_dim] + key_rot = key[..., :rotary_dim] + + cos_sin = cos_sin_cache[positions.long()] + cos, sin = cos_sin.chunk(2, dim=-1) + + if is_neox: + cos = cos.repeat(1, 1, 2).unsqueeze(-2) + sin = sin.repeat(1, 1, 2).unsqueeze(-2) + else: + cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) + sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) + ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos, + rotary_dim, is_neox, positions) + + def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, + key: torch.Tensor, head_size: int, + cos_sin_cache: torch.Tensor, is_neox: bool, + rot_dim: int, + cos_sin_cache_offsets: torch.Tensor) -> None: + if positions.dim() == 1: + positions = positions.unsqueeze(0) + query = query.unsqueeze(0) + key = key.unsqueeze(0) + cos_sin_cache_offsets = cos_sin_cache_offsets.view_as(positions) + rotary_dim = cos_sin_cache.size(1) + query = query.view(*query.shape[:-1], -1, head_size) + key = key.view(*key.shape[:-1], -1, head_size) + + query_rot = query[..., :rotary_dim] + key_rot = key[..., :rotary_dim] + + cos_sin = cos_sin_cache[torch.add(positions, + cos_sin_cache_offsets).long()] + cos, sin = cos_sin.chunk(2, dim=-1) + + if is_neox: + cos = cos.repeat(1, 1, 2).unsqueeze(-2) + sin = sin.repeat(1, 1, 2).unsqueeze(-2) + else: + cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) + sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) + + ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos, + rotary_dim, is_neox, positions) + + def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, + epsilon: float) -> None: + tmp = ipex.llm.functional.rms_norm(input, weight, epsilon) + out.copy_(tmp) + + def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, + weight: torch.Tensor, epsilon: float) -> None: + tmp = 
ipex.llm.functional.add_rms_norm(residual, input, weight, None, + epsilon, True) + input.copy_(tmp) + + +class ipex_cache_ops: + + def reshape_and_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + kv_scale: float, + ) -> None: + assert kv_cache_dtype == "auto" + torch.ops.torch_ipex.reshape_and_cache(key, value, key_cache, + value_cache, slot_mapping) + + def copy_blocks(key_caches: torch.Tensor, value_caches: torch.Tensor, + block_mapping: torch.Tensor) -> None: + block_mapping_tensor = [] + for key, values in block_mapping.items(): + if hasattr(values, "__iter__"): + for value in values: + block_mapping_tensor.append([key, value]) + block_mapping = torch.Tensor(block_mapping_tensor, + device=key_caches.device, + dtype=torch.int64) + torch.ops.torch_ipex.copy_blocks(key_caches, value_caches, + block_mapping) + + def swap_blocks(src: torch.Tensor, dst: torch.Tensor, + block_mapping: Dict[int, int]) -> None: + keys = list(block_mapping.keys()) + values = list(block_mapping.values()) + key_tensor = torch.Tensor(keys) + value_tensor = torch.Tensor(values) + block_mapping_tensor = torch.stack([key_tensor, value_tensor], dim=1) + + torch.ops.torch_ipex.swap_blocks(src, dst, block_mapping_tensor) From 19bc1777dd7885d624b319948eb206d5fa655bee Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 12 Apr 2024 01:31:58 +0800 Subject: [PATCH 04/67] add test --- tests/kernels/test_activation.py | 8 ++- tests/kernels/test_attention.py | 15 +++-- tests/kernels/test_cache.py | 100 ++++++++++++++++++++++++++--- tests/kernels/test_layernorm.py | 4 +- tests/kernels/test_pos_encoding.py | 46 +++++++------ 5 files changed, 134 insertions(+), 39 deletions(-) diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index a4b9f91c7688b..c69074392bbe5 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -5,6 +5,7 @@ from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, NewGELU, SiluAndMul) +from vllm.utils import is_xpu from .allclose_default import get_default_atol, get_default_rtol @@ -15,6 +16,7 @@ CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] +SYCL_DEVICES = [f"xpu:0"] if is_xpu() else [] @pytest.mark.parametrize("activation", ["silu", "gelu", "gelu_tanh"]) @@ -22,7 +24,7 @@ @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", SYCL_DEVICES) @torch.inference_mode() def test_act_and_mul( activation: str, @@ -47,7 +49,7 @@ def test_act_and_mul( ref_out = layer.forward_native(x) # The SiLU and GELU implementations are equivalent to the native PyTorch # implementations, so we can do exact comparison. 
- assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0) + assert torch.allclose(out, ref_out, atol=0.001, rtol=0.01) @pytest.mark.parametrize("activation", [FastGELU, NewGELU]) @@ -55,7 +57,7 @@ def test_act_and_mul( @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", SYCL_DEVICES) @torch.inference_mode() def test_activation( activation: Type[torch.nn.Module], diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 8bc4766fc93c4..dfc629abfbde0 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -7,14 +7,14 @@ from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask from vllm import _custom_ops as ops -from vllm.utils import get_max_shared_memory_bytes, is_hip +from vllm.utils import get_max_shared_memory_bytes, is_hip, is_xpu from .allclose_default import get_default_atol, get_default_rtol FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. # - 512 as a buffer -MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 +MAX_SEQ_LEN = 1024 # get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 # There may not be enough gpu memory due to large NUM_BLOCKS. # Reduce NUM_BLOCKS when it happens. NUM_BLOCKS = 4321 # Arbitrary values for testing @@ -31,13 +31,16 @@ HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256 ] if not is_hip() else [64, 80, 96, 112, 128] -BLOCK_SIZES = [16, 32] +BLOCK_SIZES = [ + 16, +] #32] USE_ALIBI = [False, True] -KV_CACHE_DTYPE = ["auto", "fp8"] +KV_CACHE_DTYPE = ["auto"] #, "fp8"] SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] +SYCL_DEVICES = [f"xpu:0"] if is_xpu() else [] def ref_masked_attention( @@ -120,7 +123,7 @@ def ref_single_query_cached_kv_attention( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", SYCL_DEVICES) def test_paged_attention( kv_cache_factory, version: str, @@ -313,7 +316,7 @@ def ref_multi_query_kv_attention( @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", SYCL_DEVICES) @torch.inference_mode() def test_multi_query_kv_attention( num_seqs: int, diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 29572cfa57499..30d9423d4f478 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -5,8 +5,12 @@ import torch from vllm import _custom_ops as ops +from vllm._C import cache_ops +from vllm.utils import is_hip, is_xpu -COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] +COPYING_DIRECTION = [ + ('xpu', 'cpu'), +] # ('xpu', 'xpu'), ('cpu', 'xpu')] DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing @@ -16,7 +20,7 @@ # Arbitrary values for testing # don't make it too large. e.g. 
[1024, 36000] will OOM -NUM_BLOCKS = [1024, 10000] +NUM_BLOCKS = [1024] #, 10000] NUM_MAPPINGS = [256] # Arbitrary values for testing SEEDS = [0] @@ -24,8 +28,9 @@ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] +SYCL_DEVICES = [f"xpu:0"] if is_xpu() else [] # We assume fp8 is always enabled for testing. -KV_CACHE_DTYPE = ["auto", "fp8"] +KV_CACHE_DTYPE = ["auto"] #, "fp8"] @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) @@ -36,7 +41,7 @@ @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", SYCL_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_copy_blocks( @@ -109,7 +114,7 @@ def test_copy_blocks( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", SYCL_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_reshape_and_cache( @@ -271,8 +276,85 @@ def test_reshape_and_cache_flash( assert torch.allclose(value_cache, cloned_value_cache) -@pytest.mark.parametrize("direction", COPYING_DIRECTION) -@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) +@pytest.mark.parametrize("num_tokens", NUM_TOKENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) +@torch.inference_mode() +def test_reshape_and_cache_flash( + kv_cache_factory_flashinfer, + num_tokens: int, + num_heads: int, + head_size: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + seed: int, + device: str, + kv_cache_dtype: str, +) -> None: + if kv_cache_dtype == "fp8": + pytest.skip() + random.seed(seed) + torch.random.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + + # Create a random slot mapping. + num_slots = block_size * num_blocks + slot_mapping = random.sample(range(num_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=device) + + qkv = torch.randn(num_tokens, + 3, + num_heads, + head_size, + dtype=dtype, + device=device) + _, key, value = qkv.unbind(dim=1) + + # Create the KV caches. + key_caches, value_caches = kv_cache_factory_flashinfer( + num_blocks, + block_size, + 1, + num_heads, + head_size, + kv_cache_dtype, + dtype, + device=device, + ) + key_cache, value_cache = key_caches[0], value_caches[0] + + # Clone the KV caches. + cloned_key_cache = key_cache.clone() + cloned_value_cache = value_cache.clone() + + # Call the reshape_and_cache kernel. + ops.reshape_and_cache_flash(key, value, key_cache, value_cache, + slot_mapping, kv_cache_dtype) + + # Run the reference implementation. 
+ block_indicies = torch.div(slot_mapping, block_size, rounding_mode='floor') + block_indicies = block_indicies.cpu().tolist() + block_offsets = slot_mapping % block_size + block_offsets = block_offsets.cpu().tolist() + for i in range(num_tokens): + block_idx = block_indicies[i] + block_offset = block_offsets[i] + cloned_key_cache[block_idx, block_offset, :, :] = key[i] + cloned_value_cache[block_idx, block_offset, :, :] = value[i] + + assert torch.allclose(key_cache, cloned_key_cache) + assert torch.allclose(value_cache, cloned_value_cache) + + +@pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) @@ -302,8 +384,8 @@ def test_swap_blocks( if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - src_device = device if direction[0] == "cuda" else 'cpu' - dst_device = device if direction[1] == "cuda" else 'cpu' + src_device = device if direction[0] == "xpu" else 'cpu' + dst_device = device if direction[1] == "xpu" else 'cpu' src_blocks = random.sample(range(num_blocks), num_mappings) # For the same device, mapping must not overlap diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index a635e6c12c594..544d5dc2fea9d 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -2,6 +2,7 @@ import torch from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.utils import is_xpu DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing @@ -12,6 +13,7 @@ CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] +SYCL_DEVICES = ["xpu:0"] if is_xpu() else [] @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @@ -19,7 +21,7 @@ @pytest.mark.parametrize("add_residual", ADD_RESIDUAL) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", SYCL_DEVICES) @torch.inference_mode() def test_rms_norm( num_tokens: int, diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index e564e325112a6..d38a3e77053eb 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -5,6 +5,7 @@ import torch from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.utils import is_xpu from .allclose_default import get_default_atol, get_default_rtol @@ -14,11 +15,12 @@ ROTARY_DIMS = [None, 32] # None means rotary dim == head size NUM_HEADS = [7, 17] # Arbitrary values for testing BATCH_SIZES = [1, 5] # Arbitrary values for testing -SEQ_LENS = [11, 8192] # Arbitrary values for testing +SEQ_LENS = [11, 512] # Arbitrary values for testing SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] +SYCL_DEVICES = [f"xpu:0"] if is_xpu() else [] @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @@ -29,7 +31,7 @@ @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", SYCL_DEVICES) @torch.inference_mode() def test_rotary_embedding( is_neox_style: bool, @@ -67,14 +69,16 @@ def test_rotary_embedding( ref_query, ref_key = rope.forward_native(positions, query, key) out_query, out_key = rope.forward(positions, query, key) # Compare the results. 
- assert torch.allclose(out_query, - ref_query, - atol=get_default_atol(out_query), - rtol=get_default_rtol(out_query)) - assert torch.allclose(out_key, - ref_key, - atol=get_default_atol(out_key), - rtol=get_default_rtol(out_key)) + assert torch.allclose( + out_query, + ref_query, + atol=0.01, #get_default_atol(out_query), + rtol=0.01) #get_default_rtol(out_query)) + assert torch.allclose( + out_key, + ref_key, + atol=0.01, #get_default_atol(out_query), + rtol=0.01) #get_default_rtol(out_query)) @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @@ -85,7 +89,7 @@ def test_rotary_embedding( @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", SYCL_DEVICES) @torch.inference_mode() def test_batched_rotary_embedding( is_neox_style: bool, @@ -129,14 +133,16 @@ def test_batched_rotary_embedding( dtype=int, device=device)) # Compare the results. - assert torch.allclose(out_query, - ref_query, - atol=get_default_atol(out_query), - rtol=get_default_rtol(out_query)) - assert torch.allclose(out_key, - ref_key, - atol=get_default_atol(out_key), - rtol=get_default_rtol(out_key)) + assert torch.allclose( + out_query, + ref_query, + atol=0.01, #get_default_atol(out_query), + rtol=0.01) #get_default_rtol(out_query)) + assert torch.allclose( + out_key, + ref_key, + atol=0.01, #get_default_atol(out_key), + rtol=0.01) #get_default_rtol(out_key)) @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @@ -147,7 +153,7 @@ def test_batched_rotary_embedding( @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", SYCL_DEVICES) @torch.inference_mode() def test_batched_rotary_embedding_multi_lora( is_neox_style: bool, From df77d6f76922cc4339324e766ce5879df22b6d56 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 12 Apr 2024 17:46:34 +0800 Subject: [PATCH 05/67] revert prepare_prompt --- vllm/worker/xpu_model_runner.py | 126 +++++++++++++++++++++++++------- 1 file changed, 101 insertions(+), 25 deletions(-) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 715ef82d9d26b..ba9457f16b25d 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -1,11 +1,11 @@ -from typing import List, Set +from typing import List, Set, Tuple import torch from vllm.lora.request import LoRARequest from vllm.sequence import SequenceGroupMetadata from vllm.utils import make_tensor_with_pad -from vllm.worker.model_runner import ModelRunner, PreparePromptMetadata +from vllm.worker.model_runner import ModelRunner, PreparePromptMetadata, AttentionMetadata _PAD_SLOT_ID = -1 @@ -15,6 +15,90 @@ class XPUModelRunner(ModelRunner): def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int]]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[int] = [] + input_positions: List[int] = [] + slot_mapping: List[int] = [] + prompt_lens: List[int] = [] + + for seq_group_metadata in seq_group_metadata_list: + assert seq_group_metadata.is_prompt + seq_ids = list(seq_group_metadata.seq_data.keys()) + assert len(seq_ids) == 1 + seq_id = seq_ids[0] + + seq_data = seq_group_metadata.seq_data[seq_id] + prompt_tokens = seq_data.get_token_ids() + computed_len = 
seq_data.get_num_computed_tokens() + prompt_len = len(prompt_tokens) + + prompt_lens.append(prompt_len) # Prompt token num + input_tokens.extend(prompt_tokens) # Token ids + + # Token position ids + # NOTE(woosuk): Here we assume that the first token in the prompt + # is always the first token in the sequence. + input_positions.extend(list(range(computed_len, prompt_len))) + + # Compute the slot mapping. + block_table = seq_group_metadata.block_tables[seq_id] + # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, + # where start_idx is max(0, prompt_len - sliding_window). + # For example, if the prompt len is 10, sliding window is 8, and + # block size is 4, the first two tokens are masked and the slot + # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. + start_idx = 0 + if self.sliding_window is not None: + start_idx = max(0, prompt_len - self.sliding_window) + + for i in range(computed_len, prompt_len): + if i < start_idx: + slot_mapping.append(_PAD_SLOT_ID) + continue + + block_number = block_table[i // + self.block_size] # type: ignore + block_offset = i % self.block_size # type: ignore + slot = block_number * self.block_size + block_offset + slot_mapping.append(slot) + + num_prompt_tokens = len(input_tokens) + + input_tokens = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) # type: ignore + input_positions = torch.tensor(input_positions, + dtype=torch.long, + device=self.device) # type: ignore + slot_mapping = torch.tensor(slot_mapping, + dtype=torch.long, + device=self.device) # type: ignore + + attn_metadata = self.attn_backend.make_metadata( + is_prompt=True, + prompt_lens=prompt_lens, + num_prefills=len(prompt_lens), + num_prefill_tokens=num_prompt_tokens, + num_decode_tokens=0, + prefill_metadata=None, + decode_metadata=None, + max_context_len=None, + context_lens=None, + block_tables=torch.tensor([]), + slot_mapping=slot_mapping, + kv_cache_dtype=self.kv_cache_dtype, + ) + return ( + input_tokens, + input_positions, + attn_metadata, + prompt_lens, + ) + + def a_prepare_prompt( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], ) -> PreparePromptMetadata: input_tokens: List[int] = [] input_positions: List[int] = [] @@ -191,28 +275,20 @@ def _prepare_prompt( attn_metadata = self.attn_backend.make_metadata( is_prompt=True, - # slot_mapping=slot_mapping, - prompt_lens=prompt_lens, - prompt_lens_tensor=prompt_lens_tensor, - # num_prompt_tokens=num_prompt_tokens, - # num_generation_tokens=0, - max_subquery_len=max_subquery_len, + num_prefills=len(prompt_lens), + num_prefill_tokens=num_prompt_tokens, + num_decode_tokens=0, + prefill_metadata=None, + decode_metadata=None, max_context_len=None, - max_prompt_len=max_prompt_len, - subquery_start_loc=subquery_start_loc, - seq_start_loc=seq_start_loc, - context_lens=context_lens_tensor, - block_tables=block_tables, - use_cuda_graph=False, - # kv_cache_dtype=self.kv_cache_dtype, + context_lens=None, + block_tables=torch.tensor([]), + slot_mapping=slot_mapping, + kv_cache_dtype=self.kv_cache_dtype, + ) + return ( + input_tokens, + input_positions, + attn_metadata, + prompt_lens, ) - return PreparePromptMetadata(input_tokens, - input_positions, - attn_metadata, - prompt_lens, - subquery_lens, - lora_index_mapping, - lora_prompt_mapping, - lora_requests, - multi_modal_input, - slot_mapping=slot_mapping) From a5f2c850248f7a9d21ccec1bc49251f89f2c2d91 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 12 Apr 2024 18:59:42 +0800 Subject: [PATCH 06/67] revert prefill --- 
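Notes: this change adds a self-contained prepare/execute path to XPUModelRunner
(prepare_input_tensors, _prepare_decode, _prepare_sample, execute_model). Both
the prompt and decode paths compute KV-cache slot mappings with the same
block-table arithmetic; the short sketch below illustrates only that
arithmetic, and slot_for_token, token_index and block_table are names made up
for this note rather than vLLM APIs.

    from typing import List

    def slot_for_token(token_index: int, block_table: List[int],
                       block_size: int) -> int:
        """Map a token position to a physical KV-cache slot.

        Each logical block holds block_size tokens; block_table[i] is the
        physical block backing the i-th logical block of the sequence.
        """
        block_number = block_table[token_index // block_size]
        block_offset = token_index % block_size
        return block_number * block_size + block_offset

    # Example: block_size=4 and physical blocks [7, 2] -> token 5 falls in
    # logical block 1 (physical block 2) at offset 1, i.e. slot 2 * 4 + 1 = 9.
    assert slot_for_token(5, [7, 2], 4) == 9
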
vllm/worker/xpu_model_runner.py | 276 +++++++++++++++++++++++++++++++- vllm/worker/xpu_worker.py | 2 +- 2 files changed, 273 insertions(+), 5 deletions(-) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index ba9457f16b25d..88b823b78f8fd 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -1,17 +1,284 @@ -from typing import List, Set, Tuple +from typing import Dict, List, Optional, Tuple import torch from vllm.lora.request import LoRARequest -from vllm.sequence import SequenceGroupMetadata -from vllm.utils import make_tensor_with_pad -from vllm.worker.model_runner import ModelRunner, PreparePromptMetadata, AttentionMetadata +from vllm.distributed import broadcast_tensor_dict +from vllm.sequence import SequenceGroupMetadata, SamplerOutput, SequenceData +from vllm.utils import make_tensor_with_pad, maybe_expand_dim +from vllm.worker.model_runner import ModelRunner, PreparePromptMetadata, AttentionMetadata, SamplingMetadata +from vllm.sampling_params import SamplingParams, SamplingType _PAD_SLOT_ID = -1 class XPUModelRunner(ModelRunner): + def prepare_input_tensors( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, + SamplingMetadata]: + if self.is_driver_worker: + # NOTE: We assume that all sequences in the group are all prompts or + # all decodes. + is_prompt = seq_group_metadata_list[0].is_prompt + # Prepare input tensors. + if is_prompt: + (input_tokens, input_positions, attn_metadata, + prompt_lens) = self._prepare_prompt(seq_group_metadata_list) + else: + (input_tokens, input_positions, + attn_metadata) = self._prepare_decode(seq_group_metadata_list) + prompt_lens = [] + sampling_metadata = self._prepare_sample(seq_group_metadata_list, + prompt_lens) + # Broadcast the metadata. 
+ metadata_dict = { + "input_tokens": input_tokens, + "input_positions": input_positions, + "selected_token_indices": + sampling_metadata.selected_token_indices, + } + metadata_dict.update(attn_metadata.asdict_zerocopy()) + broadcast_tensor_dict(metadata_dict, src=0) + else: + metadata_dict = broadcast_tensor_dict(src=0) + input_tokens = metadata_dict.pop("input_tokens") + input_positions = metadata_dict.pop("input_positions") + selected_token_indices = metadata_dict.pop( + "selected_token_indices") + attn_metadata = self.attn_backend.make_metadata(**metadata_dict) + sampling_metadata = SamplingMetadata( + seq_groups=None, + seq_data=None, + prompt_lens=None, + selected_token_indices=selected_token_indices, + categorized_sample_indices=None, + generators=None, + perform_sampling=False, + ) + + return ( + input_tokens, + input_positions, + attn_metadata, + sampling_metadata, + ) + + def _prepare_decode( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[int] = [] + input_positions: List[int] = [] + slot_mapping: List[int] = [] + context_lens: List[int] = [] + block_tables: List[List[int]] = [] + + for seq_group_metadata in seq_group_metadata_list: + assert not seq_group_metadata.is_prompt + assert seq_group_metadata.token_chunk_size == 1 + + seq_ids = list(seq_group_metadata.seq_data.keys()) + + for seq_id in seq_ids: + seq_data = seq_group_metadata.seq_data[seq_id] + generation_token = seq_data.get_last_token_id() + input_tokens.append(generation_token) + + seq_len = seq_data.get_len() + position = seq_len - 1 + input_positions.append(position) + + context_len = seq_len if self.sliding_window is None else min( + seq_len, self.sliding_window) + context_lens.append(context_len) + + block_table = seq_group_metadata.block_tables[seq_id] + block_number = block_table[position // self.block_size] + block_offset = position % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping.append(slot) + + if self.sliding_window is not None: + sliding_window_blocks = (self.sliding_window // + self.block_size) + block_table = block_table[-sliding_window_blocks:] + block_tables.append(block_table) + + max_context_len = max(context_lens) + + input_tokens = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) + input_positions = torch.tensor(input_positions, + dtype=torch.long, + device=self.device) + slot_mapping = torch.tensor(slot_mapping, + dtype=torch.long, + device=self.device) + context_lens = torch.tensor(context_lens, + dtype=torch.int, + device=self.device) + + max_block_table_len = max( + len(block_table) for block_table in block_tables) + block_tables = make_tensor_with_pad( + block_tables, + max_len=max_block_table_len, + pad=0, + dtype=torch.int, + device=self.device, + ) + + attn_metadata = self.attn_backend.make_metadata( + is_prompt=False, + slot_mapping=slot_mapping, + prompt_lens=None, + num_prefill_tokens=0, + num_decode_tokens=len(input_tokens), + max_context_len=max_context_len, + num_prefills=0, + prefill_metadata=None, + decode_metadata=None, + context_lens=context_lens, + block_tables=block_tables, + kv_cache_dtype=self.kv_cache_dtype, + ) + return ( + input_tokens, + input_positions, + attn_metadata, + ) + + def _prepare_sample( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + prompt_lens: List[int], + ) -> SamplingMetadata: + seq_groups: List[Tuple[List[int], SamplingParams]] = [] + 
selected_token_indices: List[int] = [] + generators: List[torch.Generator] = [] + selected_token_start_idx = 0 + categorized_sample_indices = {t: [] for t in SamplingType} + categorized_sample_indices_start_idx = 0 + categorized_sampled_token_indices_start_idx = 0 + + for i, seq_group_metadata in enumerate(seq_group_metadata_list): + seq_ids = list(seq_group_metadata.seq_data.keys()) + sampling_params = seq_group_metadata.sampling_params + seq_groups.append((seq_ids, sampling_params)) + + if seq_group_metadata.is_prompt: + assert len(seq_ids) == 1 + subquery_len = prompt_lens[i] + if sampling_params.prompt_logprobs is not None: + # NOTE: prompt token positions do not need sample, skip + categorized_sample_indices_start_idx += subquery_len - 1 + + categorized_sample_indices[ + sampling_params.sampling_type].append([ + categorized_sample_indices_start_idx, + categorized_sampled_token_indices_start_idx + ]) + categorized_sample_indices_start_idx += 1 + categorized_sampled_token_indices_start_idx += 1 + + if sampling_params.prompt_logprobs is not None: + selected_token_indices.extend( + range(selected_token_start_idx, + selected_token_start_idx + subquery_len - 1)) + selected_token_indices.append(selected_token_start_idx + + subquery_len - 1) + selected_token_start_idx += subquery_len + + if sampling_params.seed is not None: + seq_group_metadata.state.generator = torch.Generator( + device=self.device).manual_seed(sampling_params.seed) + else: + num_seqs = len(seq_ids) + selected_token_indices.extend( + range(selected_token_start_idx, + selected_token_start_idx + num_seqs)) + selected_token_start_idx += num_seqs + + categorized_sample_indices[ + sampling_params.sampling_type].extend( + zip( + range( + categorized_sample_indices_start_idx, + categorized_sample_indices_start_idx + + num_seqs), + range( + categorized_sampled_token_indices_start_idx, + categorized_sampled_token_indices_start_idx + + num_seqs))) + categorized_sample_indices_start_idx += num_seqs + categorized_sampled_token_indices_start_idx += num_seqs + + if sampling_params.seed is not None: + generators.append(seq_group_metadata.state.generator) + + selected_token_indices = torch.tensor(selected_token_indices, + dtype=torch.long, + device="xpu") + + categorized_sample_indices = { + t: maybe_expand_dim(torch.tensor(seq_ids, dtype=torch.int), 2, 2) + for t, seq_ids in categorized_sample_indices.items() + } + + seq_data: Dict[int, SequenceData] = {} + for seq_group_metadata in seq_group_metadata_list: + seq_data.update(seq_group_metadata.seq_data) + + sampling_metadata = SamplingMetadata( + seq_groups=seq_groups, + seq_data=seq_data, + prompt_lens=prompt_lens, + selected_token_indices=selected_token_indices, + categorized_sample_indices=categorized_sample_indices, + generators=generators, + ) + return sampling_metadata + + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + kv_caches: List[torch.Tensor], + ) -> Optional[SamplerOutput]: + (input_tokens, input_positions, attn_metadata, sampling_metadata + ) = self.prepare_input_tensors(seq_group_metadata_list) + + model_executable = self.model + execute_model_kwargs = { + "input_ids": input_tokens, + "positions": input_positions, + "kv_caches": kv_caches, + "attn_metadata": attn_metadata, + } + + hidden_states = model_executable(**execute_model_kwargs) + + # Compute the logits. + logits = self.model.compute_logits(hidden_states, sampling_metadata) + + # Only perform sampling in the driver worker. 
+ if not sampling_metadata.perform_sampling: + return None + + # Sample the next token. + output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) + return output + + def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -41,6 +308,7 @@ def _prepare_prompt( # is always the first token in the sequence. input_positions.extend(list(range(computed_len, prompt_len))) + print(seq_group_metadata.block_tables) # Compute the slot mapping. block_table = seq_group_metadata.block_tables[seq_id] # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 8194ef36ba589..bf3ec89a24936 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -118,7 +118,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. - self.model_runner.profile_run() + # self.model_runner.profile_run() # Calculate the number of blocks that can be allocated with the # profiled peak memory. From 1c3a527fac24eeb818f20976842111fb144ef630 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 12 Apr 2024 21:16:25 +0800 Subject: [PATCH 07/67] fix --- vllm/worker/xpu_model_runner.py | 210 ++------------------------------ vllm/worker/xpu_worker.py | 2 +- 2 files changed, 9 insertions(+), 203 deletions(-) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 88b823b78f8fd..67041989a1ca2 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -2,11 +2,11 @@ import torch -from vllm.lora.request import LoRARequest from vllm.distributed import broadcast_tensor_dict from vllm.sequence import SequenceGroupMetadata, SamplerOutput, SequenceData from vllm.utils import make_tensor_with_pad, maybe_expand_dim -from vllm.worker.model_runner import ModelRunner, PreparePromptMetadata, AttentionMetadata, SamplingMetadata +from vllm.worker.model_runner import (ModelRunner, AttentionMetadata, + SamplingMetadata) from vllm.sampling_params import SamplingParams, SamplingType _PAD_SLOT_ID = -1 @@ -244,7 +244,6 @@ def _prepare_sample( ) return sampling_metadata - @torch.inference_mode() def execute_model( self, @@ -278,7 +277,6 @@ def execute_model( ) return output - def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -308,7 +306,12 @@ def _prepare_prompt( # is always the first token in the sequence. input_positions.extend(list(range(computed_len, prompt_len))) - print(seq_group_metadata.block_tables) + if seq_group_metadata.block_tables is None: + # During memory profiling, the block tables are not initialized + # yet. In this case, we just use a dummy slot mapping. + slot_mapping.extend([_PAD_SLOT_ID] * prompt_len) + continue + # Compute the slot mapping. 
block_table = seq_group_metadata.block_tables[seq_id] # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, @@ -363,200 +366,3 @@ def _prepare_prompt( attn_metadata, prompt_lens, ) - - def a_prepare_prompt( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> PreparePromptMetadata: - input_tokens: List[int] = [] - input_positions: List[int] = [] - slot_mapping: List[int] = [] - lora_index_mapping: List[int] = [] - lora_prompt_mapping: List[int] = [] - lora_requests: Set[LoRARequest] = set() - - prompt_lens: List[int] = [] - context_lens: List[int] = [] - subquery_lens: List[int] = [] - prefix_block_tables: List[List[int]] = [] - multi_modal_input_list: List[torch.Tensor] = [] - - if len(seq_group_metadata_list) == 0: - return PreparePromptMetadata.empty() - - for seq_group_metadata in seq_group_metadata_list: - assert seq_group_metadata.is_prompt - seq_ids = list(seq_group_metadata.seq_data.keys()) - assert len(seq_ids) == 1 - seq_id = seq_ids[0] - - computed_block_nums = seq_group_metadata.computed_block_nums - if (self.scheduler_config is not None - and self.scheduler_config.chunked_prefill_enabled - and computed_block_nums is not None): - raise RuntimeError( - "chunked prefill cannot be used with prefix caching " - "now.") - - token_chunk_size = seq_group_metadata.token_chunk_size - seq_data = seq_group_metadata.seq_data[seq_id] - computed_len = seq_data.get_num_computed_tokens() - # We should use get_len here because in case of preemption - # it contains output tokens. - prefill_end = min(seq_data.get_len(), - computed_len + token_chunk_size) - # TODO(sang): Rename it after chunked prefill is introduced. - prompt_tokens = seq_data.get_token_ids()[computed_len:prefill_end] - prompt_len = len(prompt_tokens) - # Right now, the prefill_end is always same as the length of - # sequence. However, once chunked prefill is introduced, this - # assumption can be changed. - assert prefill_end == seq_data.get_len() - prompt_lens.append(prompt_len) - - # NOTE: This only works for oooooooxxx style attention. - if computed_block_nums is not None and len( - computed_block_nums) > 0 and self.sliding_window is None: - # Prefix is not supported with sliding_window - computed_len = len(computed_block_nums) * self.block_size - prompt_tokens = prompt_tokens[computed_len:] - prefix_block_tables.append(computed_block_nums) - else: - prefix_block_tables.append([]) - # Right now, prefill start is always 0. However, this - # assumption can be changed once chunked prefill is introduced. - assert computed_len == 0 - - # actual prompt lens - context_lens.append(computed_len) - subquery_lens.append(prompt_len - computed_len) - - input_tokens.extend(prompt_tokens) - # NOTE(woosuk): Here we assume that the first token in the prompt - # is always the first token in the sequence. - input_positions.extend(list(range(computed_len, prefill_end))) - - lora_id = seq_group_metadata.lora_int_id - - if lora_id > 0: - lora_requests.add(seq_group_metadata.lora_request) - - lora_index_mapping += [lora_id] * (prompt_len - computed_len) - lora_prompt_mapping.extend( - [lora_id] * - (prompt_len - computed_len - if seq_group_metadata.sampling_params.prompt_logprobs else 1)) - - if seq_group_metadata.multi_modal_data: - multi_modal_input_list.append( - seq_group_metadata.multi_modal_data.data) - - if seq_group_metadata.block_tables is None: - # During memory profiling, the block tables are not initialized - # yet. In this case, we just use a dummy slot mapping. 
- slot_mapping.extend([_PAD_SLOT_ID] * prompt_len) - continue - - # Compute the slot mapping. - block_table = seq_group_metadata.block_tables[seq_id] - # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, - # where start_idx is max(0, prompt_len - sliding_window). - # For example, if the prompt len is 10, sliding window is 8, and - # block size is 4, the first two tokens are masked and the slot - # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. - start_idx = 0 - if self.sliding_window is not None: - assert computed_len == 0, ( - "Prefix caching is currently not supported with " - "sliding window attention") - start_idx = max(0, prompt_len - self.sliding_window) - - for i in range(computed_len, prefill_end): - if i < start_idx: - slot_mapping.append(_PAD_SLOT_ID) - continue - - block_number = block_table[i // self.block_size] - block_offset = i % self.block_size - slot = block_number * self.block_size + block_offset - slot_mapping.append(slot) - - # padding to 8 bytes for XeTLA - padding_num = (8 - len(input_tokens) % 8) % 8 - input_tokens.extend([0] * padding_num) - input_positions.extend([0] * padding_num) - slot_mapping.extend([_PAD_SLOT_ID] * padding_num) - prompt_lens[-1] += padding_num - - max_subquery_len = max(subquery_lens) - max_prompt_len = max(prompt_lens) - assert max_subquery_len > 0 - - context_lens_tensor = torch.tensor(context_lens, - dtype=torch.int, - device=self.device) - - if multi_modal_input_list: - assert self.vision_language_config, ( - "Multi-modal inputs are only supported by " - "vision language models.") - multi_modal_input = torch.cat(multi_modal_input_list, - dim=0).to(self.device) - else: - multi_modal_input = None - - # Prepare prefix block tables - max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) - block_tables = make_tensor_with_pad( - prefix_block_tables, - max_len=max_prompt_block_table_len, - pad=0, - dtype=torch.int, - device=self.device, - ) - - # Query length can be shorter than key (i.e., prompt) when prefill - # is chunked or prefix cached. - subquery_lens_tensor = torch.tensor(subquery_lens, - dtype=torch.long, - device=self.device) - subquery_start_loc = torch.zeros(subquery_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=self.device) - - prompt_lens_tensor = torch.tensor(prompt_lens, - dtype=torch.long, - device=self.device) - seq_start_loc = torch.zeros(prompt_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=self.device) - - torch.cumsum(subquery_lens_tensor, - dim=0, - dtype=subquery_start_loc.dtype, - out=subquery_start_loc[1:]) - - torch.cumsum(prompt_lens_tensor, - dim=0, - dtype=seq_start_loc.dtype, - out=seq_start_loc[1:]) - - attn_metadata = self.attn_backend.make_metadata( - is_prompt=True, - num_prefills=len(prompt_lens), - num_prefill_tokens=num_prompt_tokens, - num_decode_tokens=0, - prefill_metadata=None, - decode_metadata=None, - max_context_len=None, - context_lens=None, - block_tables=torch.tensor([]), - slot_mapping=slot_mapping, - kv_cache_dtype=self.kv_cache_dtype, - ) - return ( - input_tokens, - input_positions, - attn_metadata, - prompt_lens, - ) diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index bf3ec89a24936..8194ef36ba589 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -118,7 +118,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. 
- # self.model_runner.profile_run() + self.model_runner.profile_run() # Calculate the number of blocks that can be allocated with the # profiled peak memory. From feb6d663fcf70e855fe6503d76d4fc5203b2ac1c Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 15 Apr 2024 18:48:51 +0800 Subject: [PATCH 08/67] fix alibi device support --- vllm/attention/backends/torch_sdpa.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 480188561bf7a..9c3eba5e79c47 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -295,7 +295,9 @@ def _make_alibi_bias( ) -> List[torch.Tensor]: attn_biases = [] for seq_len in seq_lens: - bias = torch.arange(seq_len, dtype=dtype) + bias = torch.arange(seq_len, + dtype=dtype, + device=alibi_slopes.device) # NOTE(zhuohan): HF uses # `bias = bias[None, :].repeat(seq_len, 1)` # here. We find that both biases give the same results, but @@ -308,7 +310,8 @@ def _make_alibi_bias( bias.mul_(alibi_slopes[:, None, None]).unsqueeze_(0) inf_mask = torch.empty( (1, seq_len, seq_len), - dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1) + dtype=bias.dtype, + device=alibi_slopes.device).fill_(-torch.inf).triu_(diagonal=1) attn_biases.append((bias + inf_mask).to(dtype)) return attn_biases From 607c46eedab37fe094662e02c9983ff5d3534215 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 15 Apr 2024 18:50:03 +0800 Subject: [PATCH 09/67] remove --- vllm/worker/xpu_worker.py | 52 --------------------------------------- 1 file changed, 52 deletions(-) diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 8194ef36ba589..7ee9f7972fe53 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -149,58 +149,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: torch.xpu.empty_cache() return num_gpu_blocks, num_cpu_blocks - @torch.inference_mode() - def profile_num_available_blocks( - self, - block_size: int, - gpu_memory_utilization: float, - cpu_swap_space: int, - cache_dtype: str, - ) -> Tuple[int, int]: - """Profiles the peak memory usage of the model and returns the maximum - number of GPU and CPU cache blocks that can be allocated. - - Args: - block_size: The size of the cache block. - gpu_memory_utilization: The fraction of the total GPU memory to use. - cpu_swap_space: The size of the CPU swap space in bytes. - """ - # Profile the memory usage of the model and get the maximum number of - # cache blocks that can be allocated with the remaining free memory. - torch.xpu.empty_cache() - - # Execute a forward pass with dummy inputs to profile the memory usage - # of the model. - self.model_runner.profile_run() - - # Calculate the number of blocks that can be allocated with the - # profiled peak memory. - torch.xpu.synchronize() - - used_memory = torch.xpu.memory_allocated() - total_gpu_memory = torch.xpu.get_device_properties( - self.local_rank).total_memory - # print(f"rank:{self.local_rank}, used_memory:{used_memory}") - - free_gpu_memory = total_gpu_memory - used_memory - # NOTE(woosuk): Here we assume that the other processes using the same - # GPU did not change their memory usage during the profiling. 
- peak_memory = self.init_gpu_memory - free_gpu_memory - - cache_block_size = self.get_cache_block_size_bytes( - block_size, cache_dtype) - num_gpu_blocks = int( - (total_gpu_memory * gpu_memory_utilization - peak_memory) // - cache_block_size) - num_cpu_blocks = int(cpu_swap_space // cache_block_size) - num_gpu_blocks = max(num_gpu_blocks, 0) - num_cpu_blocks = max(num_cpu_blocks, 0) - if self.model_runner.lora_manager: - self.model_runner.remove_all_loras() - gc.collect() - torch.xpu.empty_cache() - return num_gpu_blocks, num_cpu_blocks - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: """Allocate GPU and CPU KV cache with the specified number of blocks. From 09d0382c48ac04e9781dd00a41186aaff354156d Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 15 Apr 2024 22:26:36 +0800 Subject: [PATCH 10/67] add tensorizer config, fix format --- vllm/executor/ray_xpu_executor.py | 7 ++++--- vllm/executor/xpu_executor.py | 9 ++++++--- vllm/worker/xpu_worker.py | 9 +++++++-- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index c1b4027bce247..41a1d5e9703c8 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -3,7 +3,7 @@ import os import pickle from collections import defaultdict -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) @@ -275,7 +275,7 @@ def _run_workers( self, method: str, *args, - driver_args: Optional[List[Any]] = None, + driver_args: Optional[Tuple[Any]] = None, driver_kwargs: Optional[Dict[str, Any]] = None, max_concurrent_workers: Optional[int] = None, use_ray_compiled_dag: bool = False, @@ -290,6 +290,7 @@ def _run_workers( if use_ray_compiled_dag: # Right now, compiled DAG can only accept a single # input. TODO(sang): Fix it. + assert self.forward_dag is not None output_channels = self.forward_dag.execute(1) else: # Start the ray workers first. 
@@ -368,7 +369,7 @@ async def _run_workers_async( self, method: str, *args, - driver_args: Optional[List[Any]] = None, + driver_args: Optional[Tuple[Any]] = None, driver_kwargs: Optional[Dict[str, Any]] = None, **kwargs, ) -> Any: diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index 687e61ec0cb3a..f29da2176203d 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -1,10 +1,10 @@ -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple import torch from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, - VisionLanguageConfig) + TensorizerConfig, VisionLanguageConfig) from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -27,6 +27,7 @@ def __init__( lora_config: Optional[LoRAConfig], vision_language_config: Optional[VisionLanguageConfig], speculative_config: Optional[SpeculativeConfig], + tensorizer_config: Optional[TensorizerConfig], ) -> None: assert device_config.device_type == "xpu" assert (not speculative_config @@ -41,6 +42,7 @@ def __init__( self.scheduler_config = scheduler_config self.device_config = device_config self.vision_language_config = vision_language_config + self.tensorizer_config = tensorizer_config # Instantiate the worker and load the model to GPU. self._init_worker() @@ -65,6 +67,7 @@ def _init_worker(self): lora_config=self.lora_config, vision_language_config=self.vision_language_config, is_driver_worker=True, + tensorizer_config=self.tensorizer_config, ) self.driver_worker.init_device() self.driver_worker.load_model() @@ -80,7 +83,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - def determine_num_available_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of available KV blocks by invoking the underlying worker. """ diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 7ee9f7972fe53..f1b2c791e3e10 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -8,7 +8,8 @@ import torch.distributed from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, VisionLanguageConfig) + ParallelConfig, SchedulerConfig, TensorizerConfig, + VisionLanguageConfig) from vllm.distributed import (broadcast_tensor_dict, ensure_model_parallel_initialized) from vllm.logger import init_logger @@ -44,6 +45,7 @@ def __init__( distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, vision_language_config: Optional[VisionLanguageConfig] = None, + tensorizer_config: Optional[TensorizerConfig] = None, is_driver_worker: bool = False, ) -> None: assert device_config.device_type == "xpu" @@ -58,6 +60,7 @@ def __init__( self.rank = rank self.distributed_init_method = distributed_init_method self.lora_config = lora_config + self.tensorizer_config = tensorizer_config self.is_driver_worker = is_driver_worker if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." 
@@ -75,7 +78,9 @@ def __init__( lora_config=self.lora_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker, - vision_language_config=vision_language_config) + vision_language_config=vision_language_config, + tensorizer_config=tensorizer_config, + ) # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine = None From ef280e08cc48437b0b30775809f9a3ff8a536ee2 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 16 Apr 2024 16:10:35 +0800 Subject: [PATCH 11/67] update wheel, fix typo --- requirements-xpu.txt | 33 ++++++--------------------------- vllm/executor/xpu_executor.py | 2 +- 2 files changed, 7 insertions(+), 28 deletions(-) diff --git a/requirements-xpu.txt b/requirements-xpu.txt index 6f6b6ed5642bf..3f63176edb7fb 100644 --- a/requirements-xpu.txt +++ b/requirements-xpu.txt @@ -1,30 +1,9 @@ -cmake>=3.21 -ninja # For faster builds. -psutil -ray >= 2.9 -sentencepiece # Required for LLaMA tokenizer. -numpy +# Common dependencies +-r requirements-common.txt -torch @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww13/py39/torch-2.1.0a0+gitc61d29a-cp39-cp39-linux_x86_64.whl -# intel_extension_for_pytorch @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww13/py39/intel_extension_for_pytorch-2.1.30+gitcdec5e9-cp39-cp39-linux_x86_64.whl -oneccl_bind_pt @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww13/py39/oneccl_bind_pt-2.1.0+gpu-cp39-cp39-linux_x86_64.whl -# torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/torch-2.1.0a0%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl -# intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/intel_extension_for_pytorch-2.1.10%2Bxpu-cp310-cp310-linux_x86_64.whl -# oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.100%2Bxpu-cp310-cp310-linux_x86_64.whl -#torch == 2.1.0a0+cxx11.abi -#intel_extension_for_pytorch == 2.1.10+xpu -#oneccl_bind_pt == 2.1.100+xpu +torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl +intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl +oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp311-cp311-linux_x86_64.whl -transformers >= 4.39.1 # Required for StarCoder2 & Llava. -fastapi == 0.109.0 -uvicorn[standard] -pydantic >= 2.0 # Required for OpenAI server. 
-prometheus_client >= 0.18.0 -pynvml == 11.5.0 -# outlines == 0.0.34 +triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl -#triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl -# triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - -wheel -einops # Required for phi-1_5 diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index f29da2176203d..85f06293d72b9 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -148,7 +148,7 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: config.dtype = torch.float16 if not config.enforce_eager: logger.warning( - "CUDA graph is not supported on CPU, fallback to the eager " + "CUDA graph is not supported on XPU, fallback to the eager " "mode.") config.enforce_eager = True return config From d5f3e1f619a298dda2a4e4f74c5157eb389acbd6 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 16 Apr 2024 16:39:45 +0800 Subject: [PATCH 12/67] fix --- requirements-xpu.txt | 2 +- vllm/executor/ray_xpu_executor.py | 4 ++-- vllm/executor/xpu_executor.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements-xpu.txt b/requirements-xpu.txt index 3f63176edb7fb..3eec032234b5a 100644 --- a/requirements-xpu.txt +++ b/requirements-xpu.txt @@ -3,7 +3,7 @@ torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl -oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp311-cp311-linux_x86_64.whl +oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index 41a1d5e9703c8..9b249f106078a 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -3,7 +3,7 @@ import os import pickle from collections import defaultdict -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Set from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) @@ -268,7 +268,7 @@ def remove_lora(self, lora_id: int) -> bool: lora_id=lora_id, ) - def list_loras(self) -> List[int]: + def list_loras(self) -> Set[int]: return self._run_workers("list_loras") def _run_workers( diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index 85f06293d72b9..abe563f5356d8 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Set import torch @@ -110,7 +110,7 @@ def remove_lora(self, lora_id: int) -> bool: assert lora_id > 0, "lora_id must be greater 
than 0." return self.driver_worker.remove_lora(lora_id) - def list_loras(self) -> List[int]: + def list_loras(self) -> Set[int]: return self.driver_worker.list_loras() def check_health(self) -> None: From 6b5f58c7bd82774df7aff38a254c9d4472547e86 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 16 Apr 2024 16:46:19 +0800 Subject: [PATCH 13/67] fix xpu_executor --- vllm/executor/xpu_executor.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index abe563f5356d8..f8313a42d3b12 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -45,8 +45,18 @@ def __init__( self.tensorizer_config = tensorizer_config # Instantiate the worker and load the model to GPU. + self._init_executor() + + def _init_executor(self) -> None: + assert (self.lora_config is + None), "LoRA is not supported for XPU backend." + assert (not self.speculative_config + ), "Speculative decoding not yet supported for XPU backend." + + # Instantiate the worker and load the model to the device. self._init_worker() + def _init_worker(self): from vllm.worker.xpu_worker import XPUWorker From 27e2dcf4601ca550d76251430a246933a6386f9a Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 16 Apr 2024 18:22:18 +0800 Subject: [PATCH 14/67] more fix --- vllm/_ipex_ops.py | 29 ++++++++++++++++------------- vllm/executor/xpu_executor.py | 7 ------- vllm/worker/xpu_model_runner.py | 7 +++++++ 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index df86120720c68..5cdc10187610b 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -1,6 +1,6 @@ import torch import intel_extension_for_pytorch as ipex -from typing import Dict, Optional +from typing import Dict, List, Optional class ipex_ops: @@ -177,6 +177,7 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, class ipex_cache_ops: + @staticmethod def reshape_and_cache( key: torch.Tensor, value: torch.Tensor, @@ -187,28 +188,30 @@ def reshape_and_cache( kv_scale: float, ) -> None: assert kv_cache_dtype == "auto" - torch.ops.torch_ipex.reshape_and_cache(key, value, key_cache, - value_cache, slot_mapping) + ipex.llm.modules.PageAttention.reshape_and_cache( + key, value, key_cache, value_cache, slot_mapping) - def copy_blocks(key_caches: torch.Tensor, value_caches: torch.Tensor, - block_mapping: torch.Tensor) -> None: + @staticmethod + def copy_blocks(key_caches: List[torch.Tensor], + value_caches: List[torch.Tensor], + block_mapping: Dict[int, List[int]]) -> None: block_mapping_tensor = [] for key, values in block_mapping.items(): if hasattr(values, "__iter__"): for value in values: block_mapping_tensor.append([key, value]) - block_mapping = torch.Tensor(block_mapping_tensor, - device=key_caches.device, + block_mapping = torch.tensor(block_mapping_tensor, + device=key_caches[0].device, dtype=torch.int64) - torch.ops.torch_ipex.copy_blocks(key_caches, value_caches, - block_mapping) + torch.xpu.copy_blocks(key_caches, value_caches, block_mapping) - def swap_blocks(src: torch.Tensor, dst: torch.Tensor, + @staticmethod + def swap_blocks(src: List[torch.Tensor], dst: List[torch.Tensor], block_mapping: Dict[int, int]) -> None: keys = list(block_mapping.keys()) values = list(block_mapping.values()) - key_tensor = torch.Tensor(keys) - value_tensor = torch.Tensor(values) + key_tensor = torch.tensor(keys, dtype=torch.int64) + value_tensor = torch.tensor(values, dtype=torch.int64) block_mapping_tensor = torch.stack([key_tensor, 
value_tensor], dim=1) - torch.ops.torch_ipex.swap_blocks(src, dst, block_mapping_tensor) + torch.xpu.swap_blocks(src, dst, block_mapping_tensor) diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index f8313a42d3b12..16389e7efcd38 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -48,15 +48,8 @@ def __init__( self._init_executor() def _init_executor(self) -> None: - assert (self.lora_config is - None), "LoRA is not supported for XPU backend." - assert (not self.speculative_config - ), "Speculative decoding not yet supported for XPU backend." - - # Instantiate the worker and load the model to the device. self._init_worker() - def _init_worker(self): from vllm.worker.xpu_worker import XPUWorker diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 67041989a1ca2..8db38184c4513 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -334,6 +334,13 @@ def _prepare_prompt( slot = block_number * self.block_size + block_offset slot_mapping.append(slot) + # padding to 8 bytes for XeTLA + padding_num = (8 - len(input_tokens) % 8) % 8 + input_tokens.extend([0] * padding_num) + input_positions.extend([0] * padding_num) + slot_mapping.extend([_PAD_SLOT_ID] * padding_num) + prompt_lens[-1] += padding_num + num_prompt_tokens = len(input_tokens) input_tokens = torch.tensor(input_tokens, From 7a6e6cdb29bcf6d7c4ec60a8bde51cf41f503250 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 16 Apr 2024 18:33:40 +0800 Subject: [PATCH 15/67] typo --- vllm/_ipex_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 5cdc10187610b..2ae92daccda71 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -188,7 +188,7 @@ def reshape_and_cache( kv_scale: float, ) -> None: assert kv_cache_dtype == "auto" - ipex.llm.modules.PageAttention.reshape_and_cache( + ipex.llm.modules.PagedAttention.reshape_and_cache( key, value, key_cache, value_cache, slot_mapping) @staticmethod From a63dbf8289f34e5b80c974e51b93b7cb30596e5c Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 16 Apr 2024 18:33:50 +0800 Subject: [PATCH 16/67] add co-author Co-authored-by: Jiang Li Co-authored-by: Abhilash Majumder From 0acfe755b5f768c1d5515090e548b27e76c6fd5f Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 16 Apr 2024 19:39:00 +0800 Subject: [PATCH 17/67] revert test, fix format --- tests/kernels/allclose_default.py | 11 +++++-- tests/kernels/test_activation.py | 14 ++++++--- tests/kernels/test_attention.py | 22 +++++++------- tests/kernels/test_cache.py | 20 ++++++------ tests/kernels/test_layernorm.py | 5 +-- tests/kernels/test_pos_encoding.py | 49 ++++++++++++++---------------- vllm/_custom_ops.py | 4 +-- vllm/_ipex_ops.py | 5 +-- vllm/executor/ray_xpu_executor.py | 2 +- vllm/executor/xpu_executor.py | 2 +- vllm/utils.py | 3 +- vllm/worker/xpu_model_runner.py | 6 ++-- 12 files changed, 76 insertions(+), 67 deletions(-) diff --git a/tests/kernels/allclose_default.py b/tests/kernels/allclose_default.py index 175cfe82fb74e..db38b5af4ff99 100644 --- a/tests/kernels/allclose_default.py +++ b/tests/kernels/allclose_default.py @@ -1,5 +1,7 @@ import torch +from vllm.utils import is_xpu + # Reference default values of atol and rtol are from # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67 default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} @@ -9,10 +11,15 @@ torch.float: 1.3e-6 } 
+ipex_xpu_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} +ipex_xpu_rtol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} + def get_default_atol(output) -> float: - return default_atol[output.dtype] + return default_atol[output.dtype] if not is_xpu() else ipex_xpu_atol[ + output.dtype] def get_default_rtol(output) -> float: - return default_rtol[output.dtype] + return default_rtol[output.dtype] if not is_xpu() else ipex_xpu_rtol[ + output.dtype] diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index c69074392bbe5..7b29f94df59dd 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -15,8 +15,9 @@ SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] -SYCL_DEVICES = [f"xpu:0"] if is_xpu() else [] +] if torch.cuda.is_available() else [] +SYCL_DEVICES = ["xpu:0"] if is_xpu() else [] +DEVICES = CUDA_DEVICES + SYCL_DEVICES @pytest.mark.parametrize("activation", ["silu", "gelu", "gelu_tanh"]) @@ -24,7 +25,7 @@ @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", SYCL_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_act_and_mul( activation: str, @@ -49,7 +50,10 @@ def test_act_and_mul( ref_out = layer.forward_native(x) # The SiLU and GELU implementations are equivalent to the native PyTorch # implementations, so we can do exact comparison. - assert torch.allclose(out, ref_out, atol=0.001, rtol=0.01) + assert torch.allclose(out, + ref_out, + atol=get_default_atol(out), + rtol=get_default_rtol(out)) @pytest.mark.parametrize("activation", [FastGELU, NewGELU]) @@ -57,7 +61,7 @@ def test_act_and_mul( @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", SYCL_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_activation( activation: Type[torch.nn.Module], diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index dfc629abfbde0..cbee305c8707f 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -14,10 +14,11 @@ FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. # - 512 as a buffer -MAX_SEQ_LEN = 1024 # get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 +MAX_SEQ_LEN = (get_max_shared_memory_bytes() // FLOAT32_BYTES - + 512) if not is_xpu else 1024 # There may not be enough gpu memory due to large NUM_BLOCKS. # Reduce NUM_BLOCKS when it happens. 
-NUM_BLOCKS = 4321 # Arbitrary values for testing +NUM_BLOCKS = 4321 if not is_xpu() else 500 # Arbitrary values for testing PARTITION_SIZE = 512 # flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16} DTYPES = [torch.half, torch.bfloat16, torch.float @@ -31,16 +32,15 @@ HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256 ] if not is_hip() else [64, 80, 96, 112, 128] -BLOCK_SIZES = [ - 16, -] #32] -USE_ALIBI = [False, True] -KV_CACHE_DTYPE = ["auto"] #, "fp8"] +BLOCK_SIZES = [16, 32] +USE_ALIBI = [False, True] if not is_xpu() else [True] +KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] -SYCL_DEVICES = [f"xpu:0"] if is_xpu() else [] +] if torch.cuda.is_available() else [] +SYCL_DEVICES = ["xpu:0"] if is_xpu() else [] +DEVICES = CUDA_DEVICES + SYCL_DEVICES def ref_masked_attention( @@ -123,7 +123,7 @@ def ref_single_query_cached_kv_attention( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", SYCL_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_paged_attention( kv_cache_factory, version: str, @@ -316,7 +316,7 @@ def ref_multi_query_kv_attention( @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", SYCL_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_multi_query_kv_attention( num_seqs: int, diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 30d9423d4f478..e934a0e50244f 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -8,9 +8,8 @@ from vllm._C import cache_ops from vllm.utils import is_hip, is_xpu -COPYING_DIRECTION = [ - ('xpu', 'cpu'), -] # ('xpu', 'xpu'), ('cpu', 'xpu')] +COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] \ + if not is_xpu() else [('xpu', 'cpu'), ('xpu', 'xpu'), ('cpu', 'xpu')] DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing @@ -20,17 +19,16 @@ # Arbitrary values for testing # don't make it too large. e.g. [1024, 36000] will OOM -NUM_BLOCKS = [1024] #, 10000] +NUM_BLOCKS = [1024, 10000] NUM_MAPPINGS = [256] # Arbitrary values for testing SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - -SYCL_DEVICES = [f"xpu:0"] if is_xpu() else [] -# We assume fp8 is always enabled for testing. 
-KV_CACHE_DTYPE = ["auto"] #, "fp8"] +] if torch.cuda.is_available() else [] +SYCL_DEVICES = ["xpu:0"] if is_xpu() else [] +DEVICES = CUDA_DEVICES + SYCL_DEVICES +KV_CACHE_DTYPE = ["auto", "fp8"] @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) @@ -384,8 +382,8 @@ def test_swap_blocks( if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - src_device = device if direction[0] == "xpu" else 'cpu' - dst_device = device if direction[1] == "xpu" else 'cpu' + src_device = device if direction[0] != "cpu" else 'cpu' + dst_device = device if direction[1] != "cpu" else 'cpu' src_blocks = random.sample(range(num_blocks), num_mappings) # For the same device, mapping must not overlap diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index 544d5dc2fea9d..7e7175962c76d 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -12,8 +12,9 @@ SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +] if torch.cuda.is_available() else [] SYCL_DEVICES = ["xpu:0"] if is_xpu() else [] +DEVICES = CUDA_DEVICES + SYCL_DEVICES @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @@ -21,7 +22,7 @@ @pytest.mark.parametrize("add_residual", ADD_RESIDUAL) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", SYCL_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_rms_norm( num_tokens: int, diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index d38a3e77053eb..84c6596ce8407 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -15,12 +15,13 @@ ROTARY_DIMS = [None, 32] # None means rotary dim == head size NUM_HEADS = [7, 17] # Arbitrary values for testing BATCH_SIZES = [1, 5] # Arbitrary values for testing -SEQ_LENS = [11, 512] # Arbitrary values for testing +SEQ_LENS = [11, 8192] # Arbitrary values for testing SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] -SYCL_DEVICES = [f"xpu:0"] if is_xpu() else [] +] if torch.cuda.is_available() else [] +SYCL_DEVICES = ["xpu:0"] if is_xpu() else [] +DEVICES = CUDA_DEVICES + SYCL_DEVICES @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @@ -31,7 +32,7 @@ @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", SYCL_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_rotary_embedding( is_neox_style: bool, @@ -69,16 +70,14 @@ def test_rotary_embedding( ref_query, ref_key = rope.forward_native(positions, query, key) out_query, out_key = rope.forward(positions, query, key) # Compare the results. 
- assert torch.allclose( - out_query, - ref_query, - atol=0.01, #get_default_atol(out_query), - rtol=0.01) #get_default_rtol(out_query)) - assert torch.allclose( - out_key, - ref_key, - atol=0.01, #get_default_atol(out_query), - rtol=0.01) #get_default_rtol(out_query)) + assert torch.allclose(out_query, + ref_query, + atol=get_default_atol(out_query), + rtol=get_default_rtol(out_query)) + assert torch.allclose(out_key, + ref_key, + atol=get_default_atol(out_query), + rtol=get_default_rtol(out_query)) @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @@ -89,7 +88,7 @@ def test_rotary_embedding( @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", SYCL_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_batched_rotary_embedding( is_neox_style: bool, @@ -133,16 +132,14 @@ def test_batched_rotary_embedding( dtype=int, device=device)) # Compare the results. - assert torch.allclose( - out_query, - ref_query, - atol=0.01, #get_default_atol(out_query), - rtol=0.01) #get_default_rtol(out_query)) - assert torch.allclose( - out_key, - ref_key, - atol=0.01, #get_default_atol(out_key), - rtol=0.01) #get_default_rtol(out_key)) + assert torch.allclose(out_query, + ref_query, + atol=get_default_atol(out_query), + rtol=get_default_rtol(out_query)) + assert torch.allclose(out_key, + ref_key, + atol=get_default_atol(out_key), + rtol=get_default_rtol(out_key)) @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @@ -153,7 +150,7 @@ def test_batched_rotary_embedding( @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", SYCL_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_batched_rotary_embedding_multi_lora( is_neox_style: bool, diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 430f18cce9338..73b40f7a33bbd 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -13,11 +13,11 @@ except ImportError as e: logger.warning("Failed to import from vllm._C with %r", e) -# todo: how to support cpu+ipex? 
from vllm.utils import is_xpu + if is_xpu(): - from vllm._ipex_ops import ipex_ops as vllm_ops from vllm._ipex_ops import ipex_cache_ops as vllm_cache_ops + from vllm._ipex_ops import ipex_ops as vllm_ops with contextlib.suppress(ImportError): import vllm._moe_C diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 2ae92daccda71..628f061bbb736 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -1,7 +1,8 @@ -import torch -import intel_extension_for_pytorch as ipex from typing import Dict, List, Optional +import intel_extension_for_pytorch as ipex +import torch + class ipex_ops: diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index 9b249f106078a..6cd140ec97f7e 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -3,7 +3,7 @@ import os import pickle from collections import defaultdict -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Set +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index 16389e7efcd38..3bfd227f65d5e 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Tuple, Set +from typing import Dict, List, Optional, Set, Tuple import torch diff --git a/vllm/utils.py b/vllm/utils.py index b7b020648da42..c7f8a14400dc8 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -24,7 +24,8 @@ try: import intel_extension_for_pytorch # noqa: F401 _import_ipex = True -except ImportError: +except ImportError as e: + print(f"Import Error for IPEX: {e.msg}") _import_ipex = False import vllm.envs as envs diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 8db38184c4513..2805e6de734d6 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -3,11 +3,11 @@ import torch from vllm.distributed import broadcast_tensor_dict -from vllm.sequence import SequenceGroupMetadata, SamplerOutput, SequenceData +from vllm.sampling_params import SamplingParams, SamplingType +from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata from vllm.utils import make_tensor_with_pad, maybe_expand_dim -from vllm.worker.model_runner import (ModelRunner, AttentionMetadata, +from vllm.worker.model_runner import (AttentionMetadata, ModelRunner, SamplingMetadata) -from vllm.sampling_params import SamplingParams, SamplingType _PAD_SLOT_ID = -1 From 58eafd0083f241d974fc660a0c425a8036a922e4 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 16 Apr 2024 22:28:56 +0800 Subject: [PATCH 18/67] fix ray_xpu_executor --- vllm/executor/ray_xpu_executor.py | 81 ++++++++++++++++--------------- 1 file changed, 42 insertions(+), 39 deletions(-) diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index 6cd140ec97f7e..99734c225994b 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -6,10 +6,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, VisionLanguageConfig) + ParallelConfig, SchedulerConfig, SpeculativeConfig, + TensorizerConfig, VisionLanguageConfig) from vllm.engine.ray_utils import RayWorkerVllm, ray from vllm.executor.executor_base import 
ExecutorAsyncBase, ExecutorBase -from vllm.executor.utils import check_block_size_valid from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import SamplerOutput, SequenceGroupMetadata @@ -41,7 +41,13 @@ def __init__( device_config: DeviceConfig, lora_config: Optional[LoRAConfig], vision_language_config: Optional[VisionLanguageConfig], + speculative_config: Optional[SpeculativeConfig], + tensorizer_config: Optional[TensorizerConfig], ) -> None: + assert device_config.device_type == "xpu" + assert (not speculative_config + ), "Speculative decoding not yet supported for XPU backend" + self.model_config = model_config self.cache_config = cache_config self.lora_config = lora_config @@ -49,6 +55,7 @@ def __init__( self.scheduler_config = scheduler_config self.device_config = device_config self.vision_language_config = vision_language_config + self.tensorizer_config = tensorizer_config assert self.parallel_config.worker_use_ray placement_group = self.parallel_config.placement_group @@ -62,11 +69,33 @@ def __init__( self._init_workers_ray(placement_group) # Profile the memory usage and initialize the cache. - self._init_cache() - self.forward_dag = None if USE_RAY_COMPILED_DAG: self.forward_dag = self._compiled_ray_dag() + + def _init_executor(self) -> None: + pass + + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Determine the number of available KV blocks. + + This invokes `determine_num_available_blocks` on each worker and takes + the min of the results, guaranteeing that the selected cache sizes are + compatible with all workers. + + Returns: + - Tuple[num_gpu_blocks, num_cpu_blocks] + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_blocks = self._run_workers("determine_num_available_blocks", ) + + # Since we use a shared centralized controller, we take the minimum + # number of blocks across all workers to make sure all the memory + # operators can be applied to all workers. + num_gpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + + return num_gpu_blocks, num_cpu_blocks def _init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwargs): @@ -191,49 +220,23 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", max_parallel_loading_workers, ) - def _init_cache(self) -> None: - """Profiles the memory usage and initializes the KV cache. - - The engine will first conduct a profiling of the existing memory usage. - Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. - More details can be found in the - :meth:`~vllm.worker.worker.xpu_Worker.determine_num_available_blocks` - method from class :class:`~vllm.worker.xpu_Worker`. - - Afterwards, as there may be multiple workers, - we take the minimum number of blocks across all workers - to ensure this can be applied to all of them. - - Finally, the engine will initialize the KV cache - with the calculated number of blocks. - - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + """Initialize the KV cache in all workers. """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. 
- num_blocks = self._run_workers("determine_num_available_blocks", ) - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. - num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) + # NOTE: We log here to avoid multiple logs when number of workers is + # greater than one. We could log in the engine, but not all executors + # have GPUs. logger.info(f"# GPU blocks: {num_gpu_blocks}, " f"# CPU blocks: {num_cpu_blocks}") - check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, - self.model_config.max_model_len) - self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks - # Initialize the cache. - self._run_workers("init_cache_engine", cache_config=self.cache_config) - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - self._run_workers("warm_up_model") + self._run_workers("initialize_cache", + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], From bc45bce1dcedd66a7c6d2ef155d603206e0ccbb4 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Wed, 17 Apr 2024 22:13:42 +0800 Subject: [PATCH 19/67] use varlen_fwd --- vllm/attention/backends/torch_sdpa.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 9c3eba5e79c47..72afac8403598 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -216,12 +216,9 @@ def forward( att_masks = [None] * len(attn_metadata.seq_lens) attn_metadata.attn_bias = att_masks - query = query.unsqueeze(0) - key = key.unsqueeze(0) - value = value.unsqueeze(0) - query = query.movedim(1, query.dim() - 2) - key = key.movedim(1, key.dim() - 2) - value = value.movedim(1, value.dim() - 2) + # query = query.unsqueeze(0) # [batch_size, num_tokens, num_heads, head_size] + # key = key.unsqueeze(0) + # value = value.unsqueeze(0) start = 0 output = torch.empty( From 56b0016e1a03071a13c83da0e71482e436302419 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Wed, 17 Apr 2024 23:42:59 +0800 Subject: [PATCH 20/67] use varlen_attention --- tests/kernels/test_attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index cbee305c8707f..c037648b4e493 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -33,7 +33,7 @@ ] if not is_hip() else [64, 80, 96, 112, 128] BLOCK_SIZES = [16, 32] -USE_ALIBI = [False, True] if not is_xpu() else [True] +USE_ALIBI = [False, True] if not is_xpu() else [False] KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] CUDA_DEVICES = [ From ded32a28e1749446259fe474033562b3c62a6c76 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 18 Apr 2024 23:47:09 +0800 Subject: [PATCH 21/67] fix --- benchmarks/benchmark_throughput.py | 2 +- vllm/_ipex_ops.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 07b2f85410e3c..bb1e806588607 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -346,7 +346,7 @@ def main(args: argparse.Namespace): "--device", type=str, default="cuda", - choices=["cuda", "cpu", "tpu"], + 
choices=["cuda", "cpu", "tpu", "xpu"], help='device type for vLLM execution, supporting CUDA and CPU.') parser.add_argument( "--enable-prefix-caching", diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 628f061bbb736..8fb74b27e7454 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -207,7 +207,7 @@ def copy_blocks(key_caches: List[torch.Tensor], torch.xpu.copy_blocks(key_caches, value_caches, block_mapping) @staticmethod - def swap_blocks(src: List[torch.Tensor], dst: List[torch.Tensor], + def swap_blocks(src: torch.Tensor, dst: torch.Tensor, block_mapping: Dict[int, int]) -> None: keys = list(block_mapping.keys()) values = list(block_mapping.values()) From 25b368b819a88042506d259e20862c6150e60431 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 19 Apr 2024 00:41:42 +0800 Subject: [PATCH 22/67] remove paading --- vllm/executor/ray_xpu_executor.py | 30 +++++++++++++++--------------- vllm/worker/xpu_model_runner.py | 7 ------- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index 99734c225994b..f3df4014834ec 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -47,7 +47,7 @@ def __init__( assert device_config.device_type == "xpu" assert (not speculative_config ), "Speculative decoding not yet supported for XPU backend" - + self.model_config = model_config self.cache_config = cache_config self.lora_config = lora_config @@ -72,9 +72,9 @@ def __init__( self.forward_dag = None if USE_RAY_COMPILED_DAG: self.forward_dag = self._compiled_ray_dag() - + def _init_executor(self) -> None: - pass + pass def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of available KV blocks. @@ -188,26 +188,26 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", parallel_config, scheduler_config, device_config, - local_rank, - rank, - distributed_init_method, + cache_config=self.cache_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, lora_config=lora_config, - kv_cache_dtype=kv_cache_dtype, )) # Initialize the driver worker with the Worker class. 
driver_rank = 0 driver_local_rank = node_workers[driver_node_id].index(driver_rank) self.driver_worker = XPUWorker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, - driver_local_rank, - driver_rank, - distributed_init_method, + model_config, + parallel_config, + scheduler_config, + device_config, + cache_config=self.cache_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, lora_config=self.lora_config, - kv_cache_dtype=kv_cache_dtype, is_driver_worker=True, ) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 2805e6de734d6..51b784f4d3eb9 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -334,13 +334,6 @@ def _prepare_prompt( slot = block_number * self.block_size + block_offset slot_mapping.append(slot) - # padding to 8 bytes for XeTLA - padding_num = (8 - len(input_tokens) % 8) % 8 - input_tokens.extend([0] * padding_num) - input_positions.extend([0] * padding_num) - slot_mapping.extend([_PAD_SLOT_ID] * padding_num) - prompt_lens[-1] += padding_num - num_prompt_tokens = len(input_tokens) input_tokens = torch.tensor(input_tokens, From 3519897e465dcd027303a66ecdb34b37dbd48187 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 19 Apr 2024 16:29:44 +0800 Subject: [PATCH 23/67] avoid using page attention v2 for ipex --- vllm/attention/ops/paged_attn.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index a214f40d16514..64707791b25d3 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -5,6 +5,7 @@ from vllm import _custom_ops as ops from vllm.attention.ops.prefix_prefill import context_attention_fwd +from vllm.utils import is_xpu # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. _PARTITION_SIZE = 512 @@ -119,6 +120,7 @@ def forward_decode( # For context len > 8192, use V2 kernel to avoid shared memory shortage. use_v1 = (max_seq_len <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512)) + use_v1 = use_v1 or is_xpu() # ipex page_attn v2 is not ready yet. if use_v1: # Run PagedAttention V1. 
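
Note on the dispatch change above: with the XPU override in place, the V1/V2 choice inside PagedAttention.forward_decode reduces to roughly the standalone sketch below. The 8192/512 thresholds and the override come straight from the diff context; the ceil-division partition count is an assumption about the unchanged part of paged_attn.py, so treat this as an illustration of the dispatch rule rather than the launcher itself.

    # Minimal sketch of the paged-attention kernel dispatch as of this patch.
    _PARTITION_SIZE = 512  # same value as the V2 launcher's PARTITION_SIZE

    def use_paged_attention_v1(num_seqs: int, num_heads: int,
                               max_seq_len: int, on_xpu: bool) -> bool:
        # V2 splits long sequences into partitions and needs a reduction pass;
        # V1 handles each sequence in a single kernel launch.
        max_num_partitions = (max_seq_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE
        use_v1 = (max_seq_len <= 8192
                  and (max_num_partitions == 1 or num_seqs * num_heads > 512))
        # The IPEX V2 kernel is not ready yet, so XPU always takes the V1 path.
        return use_v1 or on_xpu

For example, a decode step with max_seq_len = 2048 and num_seqs * num_heads = 256 would normally be sent to V2 (four partitions, not enough parallel work to justify V1), but with this patch an XPU device is forced onto the V1 path regardless.
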
From 36fae836847835593234e901573cafc4867d40ad Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 19 Apr 2024 18:47:30 +0800 Subject: [PATCH 24/67] refactor --- vllm/_ipex_ops.py | 8 ++++---- vllm/attention/ops/paged_attn.py | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 8fb74b27e7454..d92d1e40d6884 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -94,11 +94,11 @@ def paged_attention_v2( ).view(num_kv_heads, 1).repeat_interleave(num_queries_per_tokens).flatten() # todo: ipex will refactor namespace - torch.xpu.paged_attention_v2(out, exp_sum, max_logits, tmp_out, - query.contiguous(), + # ipex cpp layer unified paged_attention v1 and v2 + torch.xpu.paged_attention_v1(out, query.contiguous(), key_cache.view_as(value_cache), - value_cache, head_mapping, block_tables, - context_lens, scale, block_size, + value_cache, head_mapping, scale, + block_tables, context_lens, block_size, max_context_len, alibi_slopes) def rotary_embedding( diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 64707791b25d3..4cbdf7395aeed 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -121,7 +121,6 @@ def forward_decode( use_v1 = (max_seq_len <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512)) use_v1 = use_v1 or is_xpu() # ipex page_attn v2 is not ready yet. - if use_v1: # Run PagedAttention V1. ops.paged_attention_v1( From 851443290d654229ce5c595cec2b053977579636 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 19 Apr 2024 19:04:09 +0800 Subject: [PATCH 25/67] minor --- vllm/_ipex_ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index d92d1e40d6884..95f3355baf0d2 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -91,6 +91,7 @@ def paged_attention_v2( 0, num_kv_heads, dtype=torch.int32, + device=query.device, ).view(num_kv_heads, 1).repeat_interleave(num_queries_per_tokens).flatten() # todo: ipex will refactor namespace From e1a42dad7802d173479363964d21830a7ad083d9 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 19 Apr 2024 19:56:53 +0800 Subject: [PATCH 26/67] add xpu test --- .buildkite/run-xpu-test.sh | 14 ++++++++++++++ Dockerfile.xpu | 17 +++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 .buildkite/run-xpu-test.sh create mode 100644 Dockerfile.xpu diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh new file mode 100644 index 0000000000000..5d4311b696d2e --- /dev/null +++ b/.buildkite/run-xpu-test.sh @@ -0,0 +1,14 @@ +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Try building the docker image +docker build -t xpu-test -f Dockerfile.xpu . 
+ +# Setup cleanup +remove_docker_container() { docker rm -f xpu-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and launch offline inference +docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py \ No newline at end of file diff --git a/Dockerfile.xpu b/Dockerfile.xpu new file mode 100644 index 0000000000000..52b06e14dd4a1 --- /dev/null +++ b/Dockerfile.xpu @@ -0,0 +1,17 @@ +FROM intel/intel-optimized-pytorch:2.1.10-xpu-pip-base + +RUN apt update && apt install -y intel-basekit + +RUN apt-get update -y \ +&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip + + +COPY ./ /workspace/vllm + +WORKDIR /workspace/vllm + +RUN pip install -v -r requirements-xpu.txt + +RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install + +CMD ["/bin/bash"] From 917b74a8d3deacdc784b9cd0ab739005ba4cc14f Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 19 Apr 2024 21:45:48 +0800 Subject: [PATCH 27/67] use oneapi base docker imahe --- Dockerfile.xpu | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Dockerfile.xpu b/Dockerfile.xpu index 52b06e14dd4a1..15c4a50116d68 100644 --- a/Dockerfile.xpu +++ b/Dockerfile.xpu @@ -1,11 +1,8 @@ -FROM intel/intel-optimized-pytorch:2.1.10-xpu-pip-base - -RUN apt update && apt install -y intel-basekit +FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 RUN apt-get update -y \ && apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip - COPY ./ /workspace/vllm WORKDIR /workspace/vllm From 80961f79862fa4a3ca72727b162113e7b19194ca Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 19 Apr 2024 23:03:41 +0800 Subject: [PATCH 28/67] format --- vllm/attention/ops/paged_attn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 4cbdf7395aeed..c30e0d78a4935 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -5,7 +5,6 @@ from vllm import _custom_ops as ops from vllm.attention.ops.prefix_prefill import context_attention_fwd -from vllm.utils import is_xpu # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. 
_PARTITION_SIZE = 512 From 903756481354a91b74335f1434be0f9193e88a2f Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 19 Apr 2024 23:58:44 +0800 Subject: [PATCH 29/67] rebase, remove some config --- vllm/executor/ray_xpu_executor.py | 12 ++- vllm/executor/xpu_executor.py | 8 +- vllm/utils.py | 7 -- vllm/worker/model_runner.py | 2 +- vllm/worker/xpu_model_runner.py | 136 +++++++++++++++++++++++++++--- vllm/worker/xpu_worker.py | 30 +++---- 6 files changed, 149 insertions(+), 46 deletions(-) diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index f3df4014834ec..c36e23269d848 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -7,7 +7,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, - TensorizerConfig, VisionLanguageConfig) + VisionLanguageConfig) from vllm.engine.ray_utils import RayWorkerVllm, ray from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger @@ -42,7 +42,6 @@ def __init__( lora_config: Optional[LoRAConfig], vision_language_config: Optional[VisionLanguageConfig], speculative_config: Optional[SpeculativeConfig], - tensorizer_config: Optional[TensorizerConfig], ) -> None: assert device_config.device_type == "xpu" assert (not speculative_config @@ -55,7 +54,6 @@ def __init__( self.scheduler_config = scheduler_config self.device_config = device_config self.vision_language_config = vision_language_config - self.tensorizer_config = tensorizer_config assert self.parallel_config.worker_use_ray placement_group = self.parallel_config.placement_group @@ -174,7 +172,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", scheduler_config = copy.deepcopy(self.scheduler_config) device_config = copy.deepcopy(self.device_config) lora_config = copy.deepcopy(self.lora_config) - kv_cache_dtype = self.cache_config.cache_dtype # Initialize the actual workers with the Worker class. 
for rank, (worker, (node_id, _)) in enumerate( @@ -204,8 +201,8 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", scheduler_config, device_config, cache_config=self.cache_config, - local_rank=local_rank, - rank=rank, + local_rank=driver_local_rank, + rank=driver_rank, distributed_init_method=distributed_init_method, lora_config=self.lora_config, is_driver_worker=True, @@ -242,7 +239,8 @@ def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + blocks_to_copy: Dict[int, List[int]], + num_lookahead_slots: int) -> List[SamplerOutput]: all_outputs = self._run_workers( "execute_model", driver_kwargs={ diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index 3bfd227f65d5e..d906a8e6528b6 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -4,7 +4,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, - TensorizerConfig, VisionLanguageConfig) + VisionLanguageConfig) from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -27,7 +27,6 @@ def __init__( lora_config: Optional[LoRAConfig], vision_language_config: Optional[VisionLanguageConfig], speculative_config: Optional[SpeculativeConfig], - tensorizer_config: Optional[TensorizerConfig], ) -> None: assert device_config.device_type == "xpu" assert (not speculative_config @@ -42,7 +41,6 @@ def __init__( self.scheduler_config = scheduler_config self.device_config = device_config self.vision_language_config = vision_language_config - self.tensorizer_config = tensorizer_config # Instantiate the worker and load the model to GPU. 
self._init_executor() @@ -70,7 +68,6 @@ def _init_worker(self): lora_config=self.lora_config, vision_language_config=self.vision_language_config, is_driver_worker=True, - tensorizer_config=self.tensorizer_config, ) self.driver_worker.init_device() self.driver_worker.load_model() @@ -96,7 +93,8 @@ def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + blocks_to_copy: Dict[int, List[int]], + num_lookahead_slots: int) -> List[SamplerOutput]: output = self.driver_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, diff --git a/vllm/utils.py b/vllm/utils.py index c7f8a14400dc8..5598c38d07e98 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -506,13 +506,6 @@ def is_pin_memory_available() -> bool: return True -def device_sync(): - if torch.cuda.is_available(): - torch.cuda.synchronize() - elif is_xpu(): - torch.xpu.synchronize() - - class CudaMemoryProfiler: def __init__(self, device=None): diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 81a63cde64256..476e9ba3bb463 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -842,7 +842,7 @@ def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers self.execute_model(seqs, kv_caches) - device_sync() + torch.cuda.synchronize() return def remove_all_loras(self): diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 51b784f4d3eb9..239ae3f661d9a 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -1,22 +1,135 @@ from typing import Dict, List, Optional, Tuple +import numpy as np import torch +import torch.nn as nn +from vllm.attention import get_attn_backend +from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig) from vllm.distributed import broadcast_tensor_dict +from vllm.model_executor.model_loader import get_model from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata from vllm.utils import make_tensor_with_pad, maybe_expand_dim -from vllm.worker.model_runner import (AttentionMetadata, ModelRunner, - SamplingMetadata) +from vllm.worker.model_runner import (AttentionMetadata, SamplingMetadata, + _prepare_fake_inputs) _PAD_SLOT_ID = -1 +_BATCH_SIZE_ALIGNMENT = 8 +_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ + _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33) +] -class XPUModelRunner(ModelRunner): +class XPUModelRunner(): + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + load_config: LoadConfig, + lora_config: Optional[LoRAConfig], + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + *args, + **kwargs, + ): + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.lora_config = lora_config + self.load_config = load_config + self.is_driver_worker = is_driver_worker + + # model_config can be None in tests/samplers/test_sampler.py. + # FIXME(woosuk): This is a hack to make the tests work. Refactor this. 
+ self.sliding_window = (model_config.get_sliding_window() + if model_config is not None else None) + self.device_config = (device_config + if device_config is not None else DeviceConfig()) + self.device = self.device_config.device + + self.kv_cache_dtype = kv_cache_dtype + self.max_context_len_to_capture = ( + self.model_config.max_context_len_to_capture + if self.model_config is not None else 0) + + self.attn_backend = get_attn_backend( + self.model_config.dtype if model_config is not None else None) + + # Lazy initialization. + self.model: nn.Module # Set after init_Model + self.block_size: int # Set after initial profiling. + + def load_model(self) -> None: + self.model = get_model(model_config=self.model_config, + load_config=self.load_config, + device_config=self.device_config, + vision_language_config=None, + lora_config=self.lora_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config) + + @property + def vocab_size(self) -> int: + return self.model_config.get_vocab_size() + + @torch.inference_mode() + def profile_run(self) -> None: + # Enable top-k sampling to reflect the accurate memory usage. + sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) + max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens + max_num_seqs = self.scheduler_config.max_num_seqs + + # Profile memory usage with max_num_sequences sequences and the total + # number of tokens equal to max_num_batched_tokens. + seqs: List[SequenceGroupMetadata] = [] + # Additional GPU memory may be needed for vision encoding, which needs + # to be accounted for when calculating the GPU blocks for + # vLLM blocker manager. + # To exercise the worst scenario for GPU memory consumption, + # the number of seqs (batch_size) is chosen to maximize the number + # of images processed. + for group_id in range(max_num_seqs): + seq_len = (max_num_batched_tokens // max_num_seqs + + (group_id < max_num_batched_tokens % max_num_seqs)) + seq_data, fake_multi_modal_input = _prepare_fake_inputs( + seq_len, None) + seq = SequenceGroupMetadata( + request_id=str(group_id), + is_prompt=True, + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=None, + lora_request=None, + multi_modal_data=fake_multi_modal_input, + ) + seqs.append(seq) + + # Run the model with the dummy inputs. 
+ num_layers = self.model_config.get_num_layers(self.parallel_config) + kv_caches = [None] * num_layers + self.execute_model(seqs, kv_caches) + torch.xpu.synchronize() + return + + def set_block_size(self, block_size: int) -> None: + self.block_size = block_size + + self.graph_block_tables = np.zeros( + (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()), + dtype=np.int32) + + def get_max_block_per_batch(self) -> int: + block_size = self.block_size + return (self.max_context_len_to_capture + block_size - 1) // block_size def prepare_input_tensors( self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + seq_group_metadata_list: List[SequenceGroupMetadata], ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata]: if self.is_driver_worker: @@ -162,7 +275,11 @@ def _prepare_sample( selected_token_indices: List[int] = [] generators: List[torch.Generator] = [] selected_token_start_idx = 0 - categorized_sample_indices = {t: [] for t in SamplingType} + categorized_sample_indices: Dict[SamplingType, + List[Tuple[int, int]]] = { + t: [] + for t in SamplingType + } categorized_sample_indices_start_idx = 0 categorized_sampled_token_indices_start_idx = 0 @@ -179,10 +296,9 @@ def _prepare_sample( categorized_sample_indices_start_idx += subquery_len - 1 categorized_sample_indices[ - sampling_params.sampling_type].append([ - categorized_sample_indices_start_idx, - categorized_sampled_token_indices_start_idx - ]) + sampling_params.sampling_type].append( + (categorized_sample_indices_start_idx, + categorized_sampled_token_indices_start_idx)) categorized_sample_indices_start_idx += 1 categorized_sampled_token_indices_start_idx += 1 @@ -247,7 +363,7 @@ def _prepare_sample( @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + seq_group_metadata_list: List[SequenceGroupMetadata], kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: (input_tokens, input_positions, attn_metadata, sampling_metadata diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index f1b2c791e3e10..5bb6b322d705d 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -1,14 +1,14 @@ """A XPU worker class.""" import gc -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import intel_extension_for_pytorch # noqa: F401 import oneccl_bindings_for_pytorch # noqa: F401 import torch import torch.distributed -from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, TensorizerConfig, +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import (broadcast_tensor_dict, ensure_model_parallel_initialized) @@ -40,12 +40,12 @@ def __init__( scheduler_config: SchedulerConfig, device_config: DeviceConfig, cache_config: CacheConfig, + load_config: LoadConfig, local_rank: int, rank: int, distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, vision_language_config: Optional[VisionLanguageConfig] = None, - tensorizer_config: Optional[TensorizerConfig] = None, is_driver_worker: bool = False, ) -> None: assert device_config.device_type == "xpu" @@ -56,11 +56,11 @@ def __init__( self.scheduler_config = scheduler_config self.device_config = device_config self.cache_config = cache_config + self.load_config = load_config self.local_rank = local_rank self.rank = rank 
self.distributed_init_method = distributed_init_method self.lora_config = lora_config - self.tensorizer_config = tensorizer_config self.is_driver_worker = is_driver_worker if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." @@ -75,16 +75,16 @@ def __init__( parallel_config, scheduler_config, device_config, + load_config=self.load_config, lora_config=self.lora_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker, vision_language_config=vision_language_config, - tensorizer_config=tensorizer_config, ) # Uninitialized cache engine. Will be initialized by # initialize_cache. - self.cache_engine = None - self.gpu_cache = None + self.cache_engine: CacheEngine + self.gpu_cache: List[torch.Tensor] def init_device(self) -> None: if self.device_config.device.type == "xpu" and is_xpu(): @@ -148,8 +148,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: cache_block_size) num_gpu_blocks = max(num_gpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) - if self.model_runner.lora_manager: - self.model_runner.remove_all_loras() gc.collect() torch.xpu.empty_cache() return num_gpu_blocks, num_cpu_blocks @@ -179,11 +177,8 @@ def _init_cache_engine(self) -> None: self.model_runner.set_block_size(self.cache_engine.block_size) def _warm_up_model(self) -> None: - if not self.model_config.enforce_eager: - self.model_runner.capture_model(self.gpu_cache) - # Reset the seed to ensure that the random state is not affected by - # the model initialization and profiling. - set_random_seed(self.model_config.seed) + # IPEX don't support capture graph yet + pass def get_cache_block_size_bytes(self) -> int: """Get the size of the KV cache block size in bytes. @@ -206,7 +201,7 @@ def execute_model( assert blocks_to_swap_in is not None assert blocks_to_swap_out is not None assert blocks_to_copy is not None - data = { + data: Dict[str, Any] = { "num_seq_groups": num_seq_groups, "blocks_to_swap_in": blocks_to_swap_in, "blocks_to_swap_out": blocks_to_swap_out, @@ -220,6 +215,9 @@ def execute_model( blocks_to_swap_out = data["blocks_to_swap_out"] blocks_to_copy = data["blocks_to_copy"] + assert blocks_to_swap_in is not None + assert blocks_to_swap_out is not None + assert blocks_to_copy is not None self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) # If there is no input, we don't need to execute the model. 
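
Note on the worker changes above: the XPU worker keeps the same driver-broadcast protocol as the CUDA worker, where rank 0 publishes the per-step cache bookkeeping before every rank swaps blocks and runs the model. A minimal sketch of that handshake, assuming the process group set up in init_device and using broadcast_tensor_dict as in the diff (the helper name exchange_schedule is illustrative, not part of the patch):

    # Sketch of the control-plane exchange used by XPUWorker.execute_model.
    from typing import Any, Dict, List, Optional

    from vllm.distributed import broadcast_tensor_dict

    def exchange_schedule(is_driver: bool,
                          num_seq_groups: Optional[int] = None,
                          blocks_to_swap_in: Optional[Dict[int, int]] = None,
                          blocks_to_swap_out: Optional[Dict[int, int]] = None,
                          blocks_to_copy: Optional[Dict[int, List[int]]] = None
                          ) -> Dict[str, Any]:
        if is_driver:
            # The driver owns the scheduler output and broadcasts the metadata
            # every rank needs before the forward pass.
            data: Dict[str, Any] = {
                "num_seq_groups": num_seq_groups,
                "blocks_to_swap_in": blocks_to_swap_in,
                "blocks_to_swap_out": blocks_to_swap_out,
                "blocks_to_copy": blocks_to_copy,
            }
            broadcast_tensor_dict(data, src=0)
            return data
        # Follower ranks block here until rank 0 publishes the step metadata.
        return broadcast_tensor_dict(src=0)

After the exchange, every rank performs cache_swap with the received block maps and skips the forward pass when num_seq_groups is zero, which is exactly the shape of the execute_model body in the hunk above.
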
From 4e3d1ed90d409a97c8467f5309749bc1939298f6 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sat, 20 Apr 2024 00:14:10 +0800 Subject: [PATCH 30/67] add LoadConfig --- vllm/executor/xpu_executor.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index d906a8e6528b6..4c36fe872c761 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -2,9 +2,9 @@ import torch -from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, SpeculativeConfig, - VisionLanguageConfig) +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + SpeculativeConfig, VisionLanguageConfig) from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -24,6 +24,7 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, device_config: DeviceConfig, + load_config: LoadConfig, lora_config: Optional[LoRAConfig], vision_language_config: Optional[VisionLanguageConfig], speculative_config: Optional[SpeculativeConfig], @@ -36,6 +37,7 @@ def __init__( self.model_config = model_config self.cache_config = cache_config + self.load_config = load_config self.lora_config = lora_config self.parallel_config = parallel_config self.scheduler_config = scheduler_config @@ -62,6 +64,7 @@ def _init_worker(self): scheduler_config=self.scheduler_config, device_config=self.device_config, cache_config=self.cache_config, + load_config=self.load_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, From e42f23a232b635401a383271395b0a7058448749 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sat, 20 Apr 2024 00:24:50 +0800 Subject: [PATCH 31/67] fix execute_model --- vllm/worker/xpu_worker.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 5bb6b322d705d..e85a30ae6f26b 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -194,7 +194,7 @@ def execute_model( blocks_to_swap_in: Optional[Dict[int, int]] = None, blocks_to_swap_out: Optional[Dict[int, int]] = None, blocks_to_copy: Optional[Dict[int, List[int]]] = None, - ) -> Optional[SamplerOutput]: + ) -> List[SamplerOutput]: if self.is_driver_worker: assert seq_group_metadata_list is not None num_seq_groups = len(seq_group_metadata_list) @@ -222,11 +222,11 @@ def execute_model( # If there is no input, we don't need to execute the model. 
if num_seq_groups == 0: - return {} + return [] output = self.model_runner.execute_model(seq_group_metadata_list, self.gpu_cache) - return output + return [output] def cache_swap( self, From 8d9ef99492d2dcd55f1f7e59d17e3c787bcfcc82 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sat, 20 Apr 2024 01:04:00 +0800 Subject: [PATCH 32/67] use v2 --- vllm/_ipex_ops.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 95f3355baf0d2..1c95a729e7394 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -95,11 +95,11 @@ def paged_attention_v2( ).view(num_kv_heads, 1).repeat_interleave(num_queries_per_tokens).flatten() # todo: ipex will refactor namespace - # ipex cpp layer unified paged_attention v1 and v2 - torch.xpu.paged_attention_v1(out, query.contiguous(), + torch.xpu.paged_attention_v2(out, exp_sum, max_logits, tmp_out, + query.contiguous(), key_cache.view_as(value_cache), - value_cache, head_mapping, scale, - block_tables, context_lens, block_size, + value_cache, head_mapping, block_tables, + context_lens, scale, block_size, max_context_len, alibi_slopes) def rotary_embedding( From d4dd31e8b7aa47b0d237d12663fe96b441b16662 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 22 Apr 2024 18:10:01 +0800 Subject: [PATCH 33/67] revert torch sdpa cpu path --- vllm/attention/backends/torch_sdpa.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 72afac8403598..4539cdea876f1 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -263,28 +263,6 @@ def forward( return output.view(-1, self.num_heads * self.head_size) -def _make_attention_mask( - att_bias: List[torch.Tensor], - seq_lens: List[int], - prompt_token_num: int, - dtype: torch.dtype, -) -> torch.Tensor: - assert att_bias[0].dim() == 3 - assert len(att_bias) == len(seq_lens) - head_size, _, _ = att_bias[0].size() - mask = torch.empty(head_size, - prompt_token_num, - prompt_token_num, - dtype=dtype) - mask.fill_(-torch.inf) - start = 0 - for seq_len, sub_mask in zip(seq_lens, att_bias): - end = start + seq_len - mask[:, start:end, start:end] = sub_mask - start += seq_len - return mask - - def _make_alibi_bias( alibi_slopes: torch.Tensor, dtype: torch.dtype, From b4ca33035f1821f519aebebe2e9adb14782b4fac Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 23 Apr 2024 16:36:26 +0800 Subject: [PATCH 34/67] fix sdpa split cache on cpu path --- vllm/attention/backends/torch_sdpa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 4539cdea876f1..493009185dac6 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -144,7 +144,7 @@ def split_kv_cache( num_kv_heads: int, head_size: int, ) -> Tuple[torch.Tensor, torch.Tensor]: - x = 1 + x = 1 if is_xpu() else (16 // kv_cache.element_size()) num_blocks = kv_cache.shape[1] key_cache = kv_cache[0] From eaec862b9032519ad7399d990029e27851a745be Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 23 Apr 2024 16:56:26 +0800 Subject: [PATCH 35/67] add vision model support --- vllm/worker/xpu_model_runner.py | 67 ++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 239ae3f661d9a..8efadd3ea74e0 100644 --- a/vllm/worker/xpu_model_runner.py 
+++ b/vllm/worker/xpu_model_runner.py @@ -6,7 +6,7 @@ from vllm.attention import get_attn_backend from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig) + ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict from vllm.model_executor.model_loader import get_model from vllm.sampling_params import SamplingParams, SamplingType @@ -32,6 +32,7 @@ def __init__( device_config: DeviceConfig, load_config: LoadConfig, lora_config: Optional[LoRAConfig], + vision_language_config: Optional[VisionLanguageConfig], kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, *args, @@ -42,6 +43,7 @@ def __init__( self.scheduler_config = scheduler_config self.lora_config = lora_config self.load_config = load_config + self.vision_language_config = vision_language_config self.is_driver_worker = is_driver_worker # model_config can be None in tests/samplers/test_sampler.py. @@ -65,13 +67,14 @@ def __init__( self.block_size: int # Set after initial profiling. def load_model(self) -> None: - self.model = get_model(model_config=self.model_config, - load_config=self.load_config, - device_config=self.device_config, - vision_language_config=None, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config) + self.model = get_model( + model_config=self.model_config, + load_config=self.load_config, + device_config=self.device_config, + vision_language_config=self.vision_language_config, + lora_config=self.lora_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config) @property def vocab_size(self) -> int: @@ -130,16 +133,18 @@ def get_max_block_per_batch(self) -> int: def prepare_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, - SamplingMetadata]: + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, + Optional[torch.Tensor]]: + multi_modal_input = None if self.is_driver_worker: # NOTE: We assume that all sequences in the group are all prompts or # all decodes. is_prompt = seq_group_metadata_list[0].is_prompt # Prepare input tensors. 
if is_prompt: - (input_tokens, input_positions, attn_metadata, - prompt_lens) = self._prepare_prompt(seq_group_metadata_list) + (input_tokens, input_positions, attn_metadata, prompt_lens, + multi_modal_input + ) = self._prepare_prompt(seq_group_metadata_list) else: (input_tokens, input_positions, attn_metadata) = self._prepare_decode(seq_group_metadata_list) @@ -172,12 +177,8 @@ def prepare_input_tensors( perform_sampling=False, ) - return ( - input_tokens, - input_positions, - attn_metadata, - sampling_metadata, - ) + return (input_tokens, input_positions, attn_metadata, + sampling_metadata, multi_modal_input) def _prepare_decode( self, @@ -366,7 +367,8 @@ def execute_model( seq_group_metadata_list: List[SequenceGroupMetadata], kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: - (input_tokens, input_positions, attn_metadata, sampling_metadata + (input_tokens, input_positions, attn_metadata, sampling_metadata, + multi_modal_input ) = self.prepare_input_tensors(seq_group_metadata_list) model_executable = self.model @@ -376,6 +378,8 @@ def execute_model( "kv_caches": kv_caches, "attn_metadata": attn_metadata, } + if self.vision_language_config: + execute_model_kwargs.update({"image_input": multi_modal_input}) hidden_states = model_executable(**execute_model_kwargs) @@ -396,12 +400,14 @@ def execute_model( def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int]]: + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], + Optional[torch.Tensor]]: assert len(seq_group_metadata_list) > 0 input_tokens: List[int] = [] input_positions: List[int] = [] slot_mapping: List[int] = [] prompt_lens: List[int] = [] + multi_modal_input_list: List[torch.Tensor] = [] for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt @@ -422,6 +428,10 @@ def _prepare_prompt( # is always the first token in the sequence. input_positions.extend(list(range(computed_len, prompt_len))) + if seq_group_metadata.multi_modal_data: + multi_modal_input_list.append( + seq_group_metadata.multi_modal_data.data) + if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized # yet. In this case, we just use a dummy slot mapping. 
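The hunk above only collects each request's multi_modal_data tensor into multi_modal_input_list; the hunk below concatenates that list into a single batch tensor and moves it onto the target device. A minimal sketch of the same batching pattern in isolation (not code from the patch; the helper name and image shape are purely illustrative):

    import torch

    def batch_multi_modal(per_request_data, device):
        # An empty list means a text-only batch; mirror the runner by
        # returning None instead of an empty tensor.
        if not per_request_data:
            return None
        return torch.cat(per_request_data, dim=0).to(device)

    # Two requests, each carrying one preprocessed [1, 3, 336, 336] image tensor.
    imgs = [torch.randn(1, 3, 336, 336), torch.randn(1, 3, 336, 336)]
    batched = batch_multi_modal(imgs, device="cpu")
    assert batched.shape == (2, 3, 336, 336)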
@@ -450,6 +460,15 @@ def _prepare_prompt( slot = block_number * self.block_size + block_offset slot_mapping.append(slot) + if multi_modal_input_list: + assert self.vision_language_config, ( + "Multi-modal inputs are only supported by " + "vision language models.") + multi_modal_input = torch.cat(multi_modal_input_list, + dim=0).to(self.device) + else: + multi_modal_input = None + num_prompt_tokens = len(input_tokens) input_tokens = torch.tensor(input_tokens, @@ -476,9 +495,5 @@ def _prepare_prompt( slot_mapping=slot_mapping, kv_cache_dtype=self.kv_cache_dtype, ) - return ( - input_tokens, - input_positions, - attn_metadata, - prompt_lens, - ) + return (input_tokens, input_positions, attn_metadata, prompt_lens, + multi_modal_input) From 7d76334a8b283490e788ec696dedd842239aa619 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 23 Apr 2024 17:30:28 +0800 Subject: [PATCH 36/67] fix ray xpu executor --- vllm/executor/ray_xpu_executor.py | 153 +++++++++++++++++------------- 1 file changed, 89 insertions(+), 64 deletions(-) diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index c36e23269d848..3fdf6a4daf3d1 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -1,20 +1,19 @@ import asyncio -import copy import os import pickle from collections import defaultdict from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple -from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, SpeculativeConfig, - VisionLanguageConfig) -from vllm.engine.ray_utils import RayWorkerVllm, ray +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + SpeculativeConfig, VisionLanguageConfig) +from vllm.engine.ray_utils import RayWorkerWrapper, ray from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async) + get_vllm_instance_id, make_async) if ray is not None: from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -39,6 +38,7 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, device_config: DeviceConfig, + load_config: LoadConfig, lora_config: Optional[LoRAConfig], vision_language_config: Optional[VisionLanguageConfig], speculative_config: Optional[SpeculativeConfig], @@ -49,6 +49,7 @@ def __init__( self.model_config = model_config self.cache_config = cache_config + self.load_config = load_config self.lora_config = lora_config self.parallel_config = parallel_config self.scheduler_config = scheduler_config @@ -106,9 +107,9 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # The driver dummy worker does not actually use any resources. # It holds the resource for the driver worker. - self.driver_dummy_worker: RayWorkerVllm = None + self.driver_dummy_worker: RayWorkerWrapper = None # The remaining workers are the actual ray actors. - self.workers: List[RayWorkerVllm] = [] + self.workers: List[RayWorkerWrapper] = [] # Create the workers. 
driver_ip = get_ip() @@ -125,17 +126,23 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", num_gpus=num_gpus, scheduling_strategy=scheduling_strategy, **ray_remote_kwargs, - )(RayWorkerVllm).remote(self.model_config.trust_remote_code) + )(RayWorkerWrapper).remote( + worker_module_name="vllm.worker.xpu_worker", + worker_class_name="XPUWorker", + ) worker_ip = ray.get(worker.get_node_ip.remote()) if worker_ip == driver_ip and self.driver_dummy_worker is None: # If the worker is on the same node as the driver, we use it # as the resource holder for the driver process. self.driver_dummy_worker = worker + self.driver_worker = RayWorkerWrapper( + worker_module_name="vllm.worker.xpu_worker", + worker_class_name="XPUWorker", + ) else: # Else, added to the list of workers. self.workers.append(worker) - if self.driver_dummy_worker is None: raise ValueError( "Ray does not allocate any GPUs on the driver node. Consider " @@ -143,73 +150,63 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", "GPU node.") # Get the set of GPU IDs used on each node. - driver_node_id, driver_gpu_ids = ray.get( - self.driver_dummy_worker.get_node_and_gpu_ids.remote()) - worker_node_and_gpu_ids = ray.get( - [worker.get_node_and_gpu_ids.remote() for worker in self.workers]) + worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", + use_dummy_driver=True) node_workers = defaultdict(list) node_gpus = defaultdict(list) - node_workers[driver_node_id].append(0) - node_gpus[driver_node_id].extend(driver_gpu_ids) - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids, - start=1): + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): node_workers[node_id].append(i) node_gpus[node_id].extend(gpu_ids) for node_id, gpu_ids in node_gpus.items(): node_gpus[node_id] = sorted(gpu_ids) + VLLM_INSTANCE_ID = get_vllm_instance_id() + + # Set environment variables for the driver and workers. + all_args_to_update_environment_variables = [] + for (node_id, _) in worker_node_and_gpu_ids: + all_args_to_update_environment_variables.append([{ + "CUDA_VISIBLE_DEVICES": + ",".join(map(str, node_gpus[node_id])), + "VLLM_INSTANCE_ID": + VLLM_INSTANCE_ID, + "VLLM_TRACE_FUNCTION": + os.getenv("VLLM_TRACE_FUNCTION", "0"), + }]) + self._run_workers("update_environment_variables", + all_args=all_args_to_update_environment_variables) + distributed_init_method = get_distributed_init_method( driver_ip, get_open_port()) - # Lazy import the Worker to avoid importing torch.cuda/xformers - # before CUDA_VISIBLE_DEVICES is set in the Worker - from vllm.worker.xpu_worker import XPUWorker - - model_config = copy.deepcopy(self.model_config) - parallel_config = copy.deepcopy(self.parallel_config) - scheduler_config = copy.deepcopy(self.scheduler_config) - device_config = copy.deepcopy(self.device_config) - lora_config = copy.deepcopy(self.lora_config) - - # Initialize the actual workers with the Worker class. - for rank, (worker, (node_id, _)) in enumerate( - zip(self.workers, worker_node_and_gpu_ids), - start=1, - ): + def collect_arg_helper_func(**kwargs): + # avoid writing `{"name": value}` manually + return kwargs + + init_worker_all_kwargs = [] + + # Initialize the actual workers inside worker wrapper. 
+ for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids, ): local_rank = node_workers[node_id].index(rank) - worker.init_worker.remote( - lambda rank=rank, local_rank=local_rank: XPUWorker( - model_config, - parallel_config, - scheduler_config, - device_config, + init_worker_all_kwargs.append( + collect_arg_helper_func( + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, cache_config=self.cache_config, + load_config=self.load_config, local_rank=local_rank, rank=rank, distributed_init_method=distributed_init_method, - lora_config=lora_config, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + is_driver_worker=rank == 0, )) + self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) - # Initialize the driver worker with the Worker class. - driver_rank = 0 - driver_local_rank = node_workers[driver_node_id].index(driver_rank) - self.driver_worker = XPUWorker( - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config=self.cache_config, - local_rank=driver_local_rank, - rank=driver_rank, - distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - is_driver_worker=True, - ) - - # FIXME(woosuk): We are not properly initializing cupy NCCL when - # we have multiple nodes. self._run_workers("init_device") self._run_workers( "load_model", @@ -278,11 +275,33 @@ def _run_workers( *args, driver_args: Optional[Tuple[Any]] = None, driver_kwargs: Optional[Dict[str, Any]] = None, + all_args: Optional[List[List[Any]]] = None, + all_kwargs: Optional[List[Dict[str, Any]]] = None, + use_dummy_driver: bool = False, max_concurrent_workers: Optional[int] = None, use_ray_compiled_dag: bool = False, **kwargs, ) -> Any: - """Runs the given method on all workers.""" + """Runs the given method on all workers. + all_args and all_kwargs are used to pass heterogeneous arguments, + i.e. different arguments for each worker. + """ + if driver_args is None: + driver_args = args + if driver_kwargs is None: + driver_kwargs = kwargs + + # for mypy type checking + assert driver_args is not None + assert driver_kwargs is not None + if all_args is None: + all_args = [driver_args] + [args] * len(self.workers) + if all_kwargs is None: + all_kwargs = [driver_kwargs] + [kwargs] * len(self.workers) + + # for mypy type checking + assert all_args is not None + assert all_kwargs is not None if max_concurrent_workers: raise NotImplementedError( @@ -296,8 +315,10 @@ def _run_workers( else: # Start the ray workers first. ray_worker_outputs = [ - worker.execute_method.remote(method, *args, **kwargs) - for worker in self.workers + worker.execute_method.remote(method, *worker_args, + **worker_kwargs) + for (worker, worker_args, worker_kwargs + ) in zip(self.workers, all_args[1:], all_kwargs[1:]) ] if driver_args is None: @@ -306,9 +327,13 @@ def _run_workers( driver_kwargs = kwargs # Start the driver worker after all the ray workers. - driver_worker_output = getattr(self.driver_worker, - method)(*driver_args, **driver_kwargs) - + if not use_dummy_driver: + driver_worker_output = self.driver_worker.execute_method( + method, *all_args[0], **all_kwargs[0]) + else: + driver_worker_output = ray.get( + self.driver_dummy_worker.execute_method.remote( + method, *all_args[0], **all_kwargs[0])) # Get the results of the ray workers. 
if self.workers: if use_ray_compiled_dag: From ce55b60cd0324ed9410be771574f26e7e62aafba Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 23 Apr 2024 23:41:52 +0800 Subject: [PATCH 37/67] fix block table device --- vllm/worker/xpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 8efadd3ea74e0..4b2d0430c78d9 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -491,7 +491,7 @@ def _prepare_prompt( decode_metadata=None, max_context_len=None, context_lens=None, - block_tables=torch.tensor([]), + block_tables=torch.tensor([], device=self.device, dtype=torch.int), slot_mapping=slot_mapping, kv_cache_dtype=self.kv_cache_dtype, ) From 39c07d991253da8de181c5f902a7e58bad3c9989 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 26 Apr 2024 01:04:23 +0800 Subject: [PATCH 38/67] add intel xpu test --- .buildkite/test-template.j2 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 4a20a462b98ec..3bd1e90c2b711 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -45,6 +45,11 @@ steps: queue: intel command: bash .buildkite/run-cpu-test.sh + - label: "XPU Test" + agents: + queue: intel + command: bash .buildkite/run-xpu-test.sh + {% for step in steps %} - label: "{{ step.label }}" agents: From 88d1b6eea4fc497446ded37cdcd5fb35982a6b28 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 30 Apr 2024 00:29:39 +0800 Subject: [PATCH 39/67] address comments --- .buildkite/run-xpu-test.sh | 2 +- vllm/_custom_ops.py | 31 ++++++ vllm/_ipex_ops.py | 33 +++++- vllm/attention/backends/torch_sdpa.py | 1 + vllm/engine/async_llm_engine.py | 12 +-- vllm/executor/ray_xpu_executor.py | 4 +- vllm/executor/xpu_executor.py | 48 ++------- vllm/utils.py | 14 +-- vllm/worker/xpu_model_runner.py | 140 ++++++-------------------- 9 files changed, 114 insertions(+), 171 deletions(-) diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index 5d4311b696d2e..22a7e76937a76 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -11,4 +11,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py \ No newline at end of file +docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 73b40f7a33bbd..96a44f80f0652 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -32,6 +32,37 @@ def is_custom_op_supported(op_name: str) -> bool: return op is not None +def hint_on_error(fn): + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + try: + return fn(*args, **kwargs) + except AttributeError as e: + msg = ( + "Error in calling custom op %s: %s\n" + "Possibly you have built or installed an obsolete version of vllm.\n" + "Please try a clean build and install of vllm," + "or remove old built files such as vllm/*cpython*.so and build/ ." 
+ ) + logger.error(msg, fn.__name__, e) + raise e + + return wrapper + +with contextlib.suppress(ImportError): + import vllm._moe_C + +with contextlib.suppress(ImportError): + # ruff: noqa: F401 + import vllm._punica_C + + +def is_custom_op_supported(op_name: str) -> bool: + op, overloads = torch._C._jit_get_operation(op_name) + return op is not None + + def hint_on_error(fn): @functools.wraps(fn) diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 1c95a729e7394..27795f7377843 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple import intel_extension_for_pytorch as ipex import torch @@ -7,7 +7,8 @@ class ipex_ops: @staticmethod - def reshape_activation_tensor(x: torch.Tensor): + def _reshape_activation_tensor( + x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: num = x.size(0) d = x.size(1) // 2 x = x.reshape(num, 2, d) @@ -17,15 +18,15 @@ def reshape_activation_tensor(x: torch.Tensor): return x1, x2 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - x1, x2 = ipex_ops.reshape_activation_tensor(x) + x1, x2 = ipex_ops._reshape_activation_tensor(x) ipex.llm.functional.silu_mul(x1, x2, out) def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - x1, x2 = ipex_ops.reshape_activation_tensor(x) + x1, x2 = ipex_ops._reshape_activation_tensor(x) ipex.llm.functional.gelu_mul(x1, x2, out, "none") def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - x1, x2 = ipex_ops.reshape_activation_tensor(x) + x1, x2 = ipex_ops._reshape_activation_tensor(x) ipex.llm.functional.gelu_mul(x1, x2, out, "tanh") def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: @@ -176,6 +177,28 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, epsilon, True) input.copy_(tmp) + def varlen_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + out: torch.Tensor, + seqlen_q: torch.Tensor, + seqlen_k: torch.Tensor, + max_seqlen_q: int, + max_seqlen_k: int, + pdropout: float, + softmax_scale: float, + zero_tensors: bool, + is_causal: bool, + return_softmax: bool, + gen_: torch.Generator, + ) -> None: + ipex.llm.functional.varlen_attention(query, key, value, out, seqlen_q, + seqlen_k, max_seqlen_q, + max_seqlen_k, pdropout, + softmax_scale, zero_tensors, + is_causal, return_softmax, gen_) + class ipex_cache_ops: diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 493009185dac6..6a8430a70a7f3 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -6,6 +6,7 @@ import torch from torch.nn.functional import scaled_dot_product_attention +from vllm._ipex_ops import ipex_ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata) from vllm.attention.ops.paged_attn import PagedAttentionMetadata diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 84e92096040fa..82a572b5e7f1a 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -20,7 +20,6 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.usage.usage_lib import UsageContext -from vllm.utils import is_xpu logger = init_logger(__name__) ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S @@ -384,12 +383,11 @@ def from_engine_args( "Distributed execution is not supported with the CPU backend.") from 
vllm.executor.cpu_executor import CPUExecutorAsync executor_class = CPUExecutorAsync - elif engine_config.device_config.device_type == "xpu" and is_xpu(): - if (engine_config.parallel_config.worker_use_ray): - logger.warning("not support ray yet") - else: - from vllm.executor.xpu_executor import XPUExecutorAsync - executor_class = XPUExecutorAsync + elif engine_config.device_config.device_type == "xpu": + assert not engine_config.parallel_config.worker_use_ray, ( + "Ray is not supported with the XPU backend.") + from vllm.executor.xpu_executor import XPUExecutorAsync + executor_class = XPUExecutorAsync elif distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index 3fdf6a4daf3d1..5c2919a782e99 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -222,8 +222,8 @@ def initialize_cache(self, num_gpu_blocks: int, # NOTE: We log here to avoid multiple logs when number of workers is # greater than one. We could log in the engine, but not all executors # have GPUs. - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") + logger.info("# GPU blocks: %d, " + "# CPU blocks: %d", num_gpu_blocks, num_cpu_blocks) self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index 4c36fe872c761..1735858d6cb9a 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -1,13 +1,13 @@ -from typing import Dict, List, Optional, Set, Tuple +from typing import Dict, List, Optional import torch from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) -from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.executor_base import ExecutorAsyncBase +from vllm.executor.gpu_executor import GPUExecutor from vllm.logger import init_logger -from vllm.lora.request import LoRARequest from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, make_async) @@ -15,7 +15,7 @@ logger = init_logger(__name__) -class XPUExecutor(ExecutorBase): +class XPUExecutor(GPUExecutor): def __init__( self, @@ -43,14 +43,15 @@ def __init__( self.scheduler_config = scheduler_config self.device_config = device_config self.vision_language_config = vision_language_config + self.speculative_config = None # Instantiate the worker and load the model to GPU. self._init_executor() - def _init_executor(self) -> None: - self._init_worker() + def _init_spec_worker(self): + logger.error("not support speculative for XPU executor!") - def _init_worker(self): + def _init_non_spec_worker(self): from vllm.worker.xpu_worker import XPUWorker assert self.parallel_config.world_size == 1, ( @@ -75,23 +76,6 @@ def _init_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: - """Initialize the KV cache by invoking the underlying worker. - """ - # NOTE: This is logged in the executor because there can be >1 worker - # with other executors. We could log in the engine level, but work - # remains to abstract away the device for non-GPU configurations. 
- logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - - self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of available KV blocks by invoking the - underlying worker. - """ - return self.driver_worker.determine_num_available_blocks() - def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], @@ -106,22 +90,6 @@ def execute_model(self, ) return output - def add_lora(self, lora_request: LoRARequest) -> bool: - assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." - return self.driver_worker.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." - return self.driver_worker.remove_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.driver_worker.list_loras() - - def check_health(self) -> None: - # XPUExecutor will always be healthy as long as - # it's running. - return - class XPUExecutorAsync(XPUExecutor, ExecutorAsyncBase): diff --git a/vllm/utils.py b/vllm/utils.py index 5598c38d07e98..e2ff019634926 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -21,13 +21,6 @@ import psutil import torch -try: - import intel_extension_for_pytorch # noqa: F401 - _import_ipex = True -except ImportError as e: - print(f"Import Error for IPEX: {e.msg}") - _import_ipex = False - import vllm.envs as envs from vllm import _custom_ops as ops from vllm.logger import enable_trace_function_call, init_logger @@ -35,6 +28,13 @@ T = TypeVar("T") logger = init_logger(__name__) +try: + import intel_extension_for_pytorch as ipex # noqa: F401 + _import_ipex = True +except ImportError as e: + logger.warning("Import Error for IPEX: %s", e.msg) + _import_ipex = False + STR_DTYPE_TO_TORCH_DTYPE = { "half": torch.half, "bfloat16": torch.bfloat16, diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 4b2d0430c78d9..99a400fbcc404 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Tuple +from typing import List, Optional, Tuple import numpy as np import torch @@ -8,13 +8,16 @@ from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict +from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model -from vllm.sampling_params import SamplingParams, SamplingType -from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata -from vllm.utils import make_tensor_with_pad, maybe_expand_dim +from vllm.sampling_params import SamplingParams +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import CudaMemoryProfiler, make_tensor_with_pad from vllm.worker.model_runner import (AttentionMetadata, SamplingMetadata, _prepare_fake_inputs) +logger = init_logger(__name__) + _PAD_SLOT_ID = -1 _BATCH_SIZE_ALIGNMENT = 8 _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ @@ -67,14 +70,20 @@ def __init__( self.block_size: int # Set after initial profiling. 
def load_model(self) -> None: - self.model = get_model( - model_config=self.model_config, - load_config=self.load_config, - device_config=self.device_config, - vision_language_config=self.vision_language_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config) + with CudaMemoryProfiler() as m: + self.model = get_model( + model_config=self.model_config, + device_config=self.device_config, + load_config=self.load_config, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + ) + + self.model_memory_usage = m.consumed_memory + logger.info("Loading model weights took %.4f GB", + self.model_memory_usage / float(2**30)) @property def vocab_size(self) -> int: @@ -149,8 +158,15 @@ def prepare_input_tensors( (input_tokens, input_positions, attn_metadata) = self._prepare_decode(seq_group_metadata_list) prompt_lens = [] - sampling_metadata = self._prepare_sample(seq_group_metadata_list, - prompt_lens) + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + prompt_lens, + # subquery_lens is not needed if chunked prefill is not + # supported. Since CPU worker doesn't support chunked prefill + # just use prompt_lens instead. + prompt_lens, + self.device, + pin_memory=False) # Broadcast the metadata. metadata_dict = { "input_tokens": input_tokens, @@ -267,100 +283,6 @@ def _prepare_decode( attn_metadata, ) - def _prepare_sample( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - prompt_lens: List[int], - ) -> SamplingMetadata: - seq_groups: List[Tuple[List[int], SamplingParams]] = [] - selected_token_indices: List[int] = [] - generators: List[torch.Generator] = [] - selected_token_start_idx = 0 - categorized_sample_indices: Dict[SamplingType, - List[Tuple[int, int]]] = { - t: [] - for t in SamplingType - } - categorized_sample_indices_start_idx = 0 - categorized_sampled_token_indices_start_idx = 0 - - for i, seq_group_metadata in enumerate(seq_group_metadata_list): - seq_ids = list(seq_group_metadata.seq_data.keys()) - sampling_params = seq_group_metadata.sampling_params - seq_groups.append((seq_ids, sampling_params)) - - if seq_group_metadata.is_prompt: - assert len(seq_ids) == 1 - subquery_len = prompt_lens[i] - if sampling_params.prompt_logprobs is not None: - # NOTE: prompt token positions do not need sample, skip - categorized_sample_indices_start_idx += subquery_len - 1 - - categorized_sample_indices[ - sampling_params.sampling_type].append( - (categorized_sample_indices_start_idx, - categorized_sampled_token_indices_start_idx)) - categorized_sample_indices_start_idx += 1 - categorized_sampled_token_indices_start_idx += 1 - - if sampling_params.prompt_logprobs is not None: - selected_token_indices.extend( - range(selected_token_start_idx, - selected_token_start_idx + subquery_len - 1)) - selected_token_indices.append(selected_token_start_idx + - subquery_len - 1) - selected_token_start_idx += subquery_len - - if sampling_params.seed is not None: - seq_group_metadata.state.generator = torch.Generator( - device=self.device).manual_seed(sampling_params.seed) - else: - num_seqs = len(seq_ids) - selected_token_indices.extend( - range(selected_token_start_idx, - selected_token_start_idx + num_seqs)) - selected_token_start_idx += num_seqs - - categorized_sample_indices[ - sampling_params.sampling_type].extend( - zip( - range( - categorized_sample_indices_start_idx, - 
categorized_sample_indices_start_idx + - num_seqs), - range( - categorized_sampled_token_indices_start_idx, - categorized_sampled_token_indices_start_idx + - num_seqs))) - categorized_sample_indices_start_idx += num_seqs - categorized_sampled_token_indices_start_idx += num_seqs - - if sampling_params.seed is not None: - generators.append(seq_group_metadata.state.generator) - - selected_token_indices = torch.tensor(selected_token_indices, - dtype=torch.long, - device="xpu") - - categorized_sample_indices = { - t: maybe_expand_dim(torch.tensor(seq_ids, dtype=torch.int), 2, 2) - for t, seq_ids in categorized_sample_indices.items() - } - - seq_data: Dict[int, SequenceData] = {} - for seq_group_metadata in seq_group_metadata_list: - seq_data.update(seq_group_metadata.seq_data) - - sampling_metadata = SamplingMetadata( - seq_groups=seq_groups, - seq_data=seq_data, - prompt_lens=prompt_lens, - selected_token_indices=selected_token_indices, - categorized_sample_indices=categorized_sample_indices, - generators=generators, - ) - return sampling_metadata - @torch.inference_mode() def execute_model( self, @@ -387,7 +309,7 @@ def execute_model( logits = self.model.compute_logits(hidden_states, sampling_metadata) # Only perform sampling in the driver worker. - if not sampling_metadata.perform_sampling: + if not self.is_driver_worker: return None # Sample the next token. From e00fbce0c6642e71a5480e0f82438b919a57eeba Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 30 Apr 2024 01:36:12 +0800 Subject: [PATCH 40/67] fix import, add doc --- .../getting_started/xpu-installation.rst | 61 +++++++++++++++++++ vllm/_ipex_ops.py | 10 ++- 2 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 docs/source/getting_started/xpu-installation.rst diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst new file mode 100644 index 0000000000000..8a61c3003a9af --- /dev/null +++ b/docs/source/getting_started/xpu-installation.rst @@ -0,0 +1,61 @@ +.. _installation_xpu: + +Installation with XPU +======================== + +vLLM initially supports basic model inferencing and serving on Intel GPU platform. + +Table of contents: + +#. :ref:`Requirements ` +#. :ref:`Quick start using Dockerfile ` +#. :ref:`Build from source ` + +.. _xpu_backend_requirements: + +Requirements +------------ + +* OS: Linux +* Supported Hardware: Intel Data Center GPU (Intel ARC GPU WIP) +* OneAPI requirements: oneAPI 2024.1 + +.. _xpu_backend_quick_start_dockerfile: + +Quick start using Dockerfile +---------------------------- + +.. code-block:: console + + $ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . + $ docker run -it \ + --rm \ + --network=host \ + --device /dev/dri \ + -v /dev/dri/by-path:/dev/dri/by-path \ + vllm-xpu-env + +.. _build_xpu_backend_from_source: + +Build from source +----------------- + +- First, install required driver and intel OneAPI 2024.1. + +- Second, install Python packages for vLLM XPU backend building: + +.. code-block:: console + + $ pip install --upgrade pip + $ pip install wheel packaging ninja setuptools>=49.4.0 numpy + $ pip install -v -r requirements-xpu.txt + +- Finally, build and install vLLM XPU backend: + +.. code-block:: console + + $ VLLM_TARGET_DEVICE=xpu python setup.py install + +.. note:: + - FP16 is the default data type in the current XPU backend. 
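The document above points readers at the Dockerfile and source-build paths; the CI script added earlier in this series exercises the result by running examples/offline_inference.py inside the container. A standalone smoke test looks roughly like the following, assuming a build done with VLLM_TARGET_DEVICE=xpu selects the XPU device automatically; the model name is only an example:

    from vllm import LLM, SamplingParams

    prompts = ["Hello, my name is", "The capital of France is"]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Any small model is enough to confirm the backend loads and generates.
    llm = LLM(model="facebook/opt-125m")
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        print(output.prompt, "->", output.outputs[0].text)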
+ diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 27795f7377843..89865fabb5da7 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -1,8 +1,16 @@ from typing import Dict, List, Optional, Tuple -import intel_extension_for_pytorch as ipex import torch +from vllm.logger import init_logger + +logger = init_logger(__name__) + +try: + import intel_extension_for_pytorch as ipex +except ImportError as e: + logger.warning("Import error msg: %s", e.msg) + class ipex_ops: From 50719c42073939f7096a7cc399cf64ba287903cd Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 30 Apr 2024 16:28:16 +0800 Subject: [PATCH 41/67] fix doc --- docs/source/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index b7c0d5b880079..8d87c16c6113f 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -66,6 +66,7 @@ Documentation getting_started/cpu-installation getting_started/neuron-installation getting_started/tpu-installation + getting_started/xpu-installation getting_started/quickstart getting_started/debugging getting_started/examples/examples_index From 342ea727a1d074597e1e83aa911e0d963b3a406f Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Wed, 1 May 2024 22:30:00 +0800 Subject: [PATCH 42/67] fix --- vllm/executor/ray_xpu_executor.py | 35 +++++++------------------------ vllm/executor/xpu_executor.py | 12 +++++------ vllm/worker/xpu_worker.py | 21 ++++++++++--------- 3 files changed, 24 insertions(+), 44 deletions(-) diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index 5c2919a782e99..c3002f3e49a87 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -8,7 +8,8 @@ ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) from vllm.engine.ray_utils import RayWorkerWrapper, ray -from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.distributed_gpu_executor import ( + DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import SamplerOutput, SequenceGroupMetadata @@ -29,7 +30,7 @@ USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) -class RayXPUExecutor(ExecutorBase): +class RayXPUExecutor(DistributedGPUExecutor): def __init__( self, @@ -389,7 +390,11 @@ def _check_if_any_actor_is_dead(self): f"Dead Workers: {dead_actors}. ") -class RayXPUExecutorAsync(RayXPUExecutor, ExecutorAsyncBase): +class RayXPUExecutorAsync(RayXPUExecutor, DistributedGPUExecutorAsync): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.driver_executor = make_async(self.driver_worker.execute_method) async def _run_workers_async( self, @@ -417,27 +422,3 @@ async def _run_workers_async( all_outputs = await asyncio.gather(*coros) return all_outputs - - async def execute_model_async( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - ) -> SamplerOutput: - all_outputs = await self._run_workers_async( - "execute_model", - driver_kwargs={ - "seq_group_metadata_list": seq_group_metadata_list, - "blocks_to_swap_in": blocks_to_swap_in, - "blocks_to_swap_out": blocks_to_swap_out, - "blocks_to_copy": blocks_to_copy, - }) - - # Only the driver worker returns the sampling results. 
- output = all_outputs[0] - return output - - async def check_health_async(self) -> None: - """Raises an error if engine is unhealthy.""" - self._check_if_any_actor_is_dead() diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index 1735858d6cb9a..ecacb4958f276 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -87,6 +87,7 @@ def execute_model(self, blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, + num_lookahead_slots=num_lookahead_slots, ) return output @@ -99,19 +100,16 @@ async def execute_model_async( blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], blocks_to_copy: Dict[int, List[int]], - ) -> SamplerOutput: + num_lookahead_slots: int, + ) -> List[SamplerOutput]: output = await make_async(self.driver_worker.execute_model)( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy) + blocks_to_copy=blocks_to_copy, + num_lookahead_slots=num_lookahead_slots) return output - async def check_health_async(self) -> None: - # XPUExecutor will always be healthy as long as - # it's running. - return - def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: if config.dtype == torch.bfloat16: diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index e85a30ae6f26b..e1494936934ce 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -11,7 +11,8 @@ ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import (broadcast_tensor_dict, - ensure_model_parallel_initialized) + ensure_model_parallel_initialized, + init_distributed_environment) from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.sequence import SamplerOutput, SequenceGroupMetadata @@ -190,10 +191,11 @@ def get_cache_block_size_bytes(self) -> int: @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None, - blocks_to_swap_in: Optional[Dict[int, int]] = None, - blocks_to_swap_out: Optional[Dict[int, int]] = None, - blocks_to_copy: Optional[Dict[int, List[int]]] = None, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + num_lookahead_slots: int = 0, ) -> List[SamplerOutput]: if self.is_driver_worker: assert seq_group_metadata_list is not None @@ -266,13 +268,12 @@ def init_distributed_environment(self) -> None: ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", "sockets") os.environ['CCL_ZE_IPC_EXCHANGE'] = ENV_CCL_ZE_IPC_EXCHANGE - torch.distributed.init_process_group( - backend="ccl", + init_distributed_environment( world_size=parallel_config.world_size, rank=rank, - init_method=distributed_init_method, - ) - + distributed_init_method=distributed_init_method, + local_rank=self.local_rank, + backend="ccl") # A small all_reduce for warmup. 
torch.distributed.all_reduce(torch.zeros(1).xpu()) From b3231b75f12c716f472c4c5d6f6eec86e2291e63 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Wed, 1 May 2024 23:15:16 +0800 Subject: [PATCH 43/67] format --- vllm/executor/ray_xpu_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index c3002f3e49a87..8562eab1c257d 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -8,7 +8,7 @@ ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) from vllm.engine.ray_utils import RayWorkerWrapper, ray -from vllm.executor.distributed_gpu_executor import ( +from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.logger import init_logger from vllm.lora.request import LoRARequest From 765fc2ee6ab3951e5b620f2a9f2eb86fa112667d Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 6 May 2024 19:03:18 +0800 Subject: [PATCH 44/67] fix rebase issues --- vllm/attention/backends/torch_sdpa.py | 4 +- vllm/executor/ray_xpu_executor.py | 18 ++----- vllm/executor/xpu_executor.py | 35 ++++---------- vllm/worker/xpu_model_runner.py | 67 ++++++++++++--------------- vllm/worker/xpu_worker.py | 20 ++++---- 5 files changed, 55 insertions(+), 89 deletions(-) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 6a8430a70a7f3..0e3e8309dc4a0 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -271,9 +271,7 @@ def _make_alibi_bias( ) -> List[torch.Tensor]: attn_biases = [] for seq_len in seq_lens: - bias = torch.arange(seq_len, - dtype=dtype, - device=alibi_slopes.device) + bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device) # NOTE(zhuohan): HF uses # `bias = bias[None, :].repeat(seq_len, 1)` # here. We find that both biases give the same results, but diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index 8562eab1c257d..dd0f7f2568b8a 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -12,7 +12,7 @@ DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, get_vllm_instance_id, make_async) @@ -233,20 +233,12 @@ def initialize_cache(self, num_gpu_blocks: int, num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) - def execute_model(self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - num_lookahead_slots: int) -> List[SamplerOutput]: + def execute_model( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: all_outputs = self._run_workers( "execute_model", - driver_kwargs={ - "seq_group_metadata_list": seq_group_metadata_list, - "blocks_to_swap_in": blocks_to_swap_in, - "blocks_to_swap_out": blocks_to_swap_out, - "blocks_to_copy": blocks_to_copy, - }, + driver_kwargs={"execute_model_req": execute_model_req}, use_ray_compiled_dag=USE_RAY_COMPILED_DAG) # Only the driver worker returns the sampling results. 
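The executor diffs in this commit collapse the old per-call arguments (seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy, num_lookahead_slots) into a single ExecuteModelRequest handed straight to the driver worker, which is why the xpu_executor.py diff just below reduces execute_model to a plain delegation call. A rough sketch of the unpacking the worker side now performs, using only the attribute names that appear in the xpu_worker.py changes in this commit (the helper itself is illustrative, not code from the patch):

    from typing import Any, Dict, Optional

    def unpack_request(execute_model_req: Optional[Any]) -> Dict[str, Any]:
        # Flatten the single request object back into the keyword arguments
        # that the pre-refactor execute_model() signatures took individually.
        if execute_model_req is None:
            # Non-driver workers receive their inputs via broadcast instead.
            return {}
        return {
            "seq_group_metadata_list": execute_model_req.seq_group_metadata_list,
            "blocks_to_swap_in": execute_model_req.blocks_to_swap_in,
            "blocks_to_swap_out": execute_model_req.blocks_to_swap_out,
            "blocks_to_copy": execute_model_req.blocks_to_copy,
        }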
diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index ecacb4958f276..3e6791280079d 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional +from typing import List, Optional import torch @@ -8,7 +8,7 @@ from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.gpu_executor import GPUExecutor from vllm.logger import init_logger -from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, make_async) @@ -76,19 +76,10 @@ def _init_non_spec_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() - def execute_model(self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - num_lookahead_slots: int) -> List[SamplerOutput]: - output = self.driver_worker.execute_model( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - num_lookahead_slots=num_lookahead_slots, - ) + def execute_model( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + output = self.driver_worker.execute_model(execute_model_req) return output @@ -96,18 +87,10 @@ class XPUExecutorAsync(XPUExecutor, ExecutorAsyncBase): async def execute_model_async( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - num_lookahead_slots: int, + execute_model_req: ExecuteModelRequest, ) -> List[SamplerOutput]: - output = await make_async(self.driver_worker.execute_model)( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - num_lookahead_slots=num_lookahead_slots) + output = await make_async(self.driver_worker.execute_model + )(execute_model_req=execute_model_req, ) return output diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 99a400fbcc404..df1f6f232b94b 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -1,6 +1,5 @@ from typing import List, Optional, Tuple -import numpy as np import torch import torch.nn as nn @@ -131,14 +130,6 @@ def profile_run(self) -> None: def set_block_size(self, block_size: int) -> None: self.block_size = block_size - self.graph_block_tables = np.zeros( - (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()), - dtype=np.int32) - - def get_max_block_per_batch(self) -> int: - block_size = self.block_size - return (self.max_context_len_to_capture + block_size - 1) // block_size - def prepare_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -151,20 +142,20 @@ def prepare_input_tensors( is_prompt = seq_group_metadata_list[0].is_prompt # Prepare input tensors. 
if is_prompt: - (input_tokens, input_positions, attn_metadata, prompt_lens, + (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_input ) = self._prepare_prompt(seq_group_metadata_list) else: (input_tokens, input_positions, attn_metadata) = self._prepare_decode(seq_group_metadata_list) - prompt_lens = [] + seq_lens = [] sampling_metadata = SamplingMetadata.prepare( seq_group_metadata_list, - prompt_lens, + seq_lens, # subquery_lens is not needed if chunked prefill is not # supported. Since CPU worker doesn't support chunked prefill - # just use prompt_lens instead. - prompt_lens, + # just use seq_lens instead. + seq_lens, self.device, pin_memory=False) # Broadcast the metadata. @@ -186,7 +177,7 @@ def prepare_input_tensors( sampling_metadata = SamplingMetadata( seq_groups=None, seq_data=None, - prompt_lens=None, + seq_lens=None, selected_token_indices=selected_token_indices, categorized_sample_indices=None, generators=None, @@ -204,7 +195,7 @@ def _prepare_decode( input_tokens: List[int] = [] input_positions: List[int] = [] slot_mapping: List[int] = [] - context_lens: List[int] = [] + seq_lens: List[int] = [] block_tables: List[List[int]] = [] for seq_group_metadata in seq_group_metadata_list: @@ -222,9 +213,9 @@ def _prepare_decode( position = seq_len - 1 input_positions.append(position) - context_len = seq_len if self.sliding_window is None else min( + seq_len = seq_len if self.sliding_window is None else min( seq_len, self.sliding_window) - context_lens.append(context_len) + seq_lens.append(seq_len) block_table = seq_group_metadata.block_tables[seq_id] block_number = block_table[position // self.block_size] @@ -238,7 +229,7 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) - max_context_len = max(context_lens) + max_seq_len = max(seq_lens) input_tokens = torch.tensor(input_tokens, dtype=torch.long, @@ -249,9 +240,9 @@ def _prepare_decode( slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=self.device) - context_lens = torch.tensor(context_lens, - dtype=torch.int, - device=self.device) + seq_lens_tensor = torch.tensor(seq_lens, + dtype=torch.int, + device=self.device) max_block_table_len = max( len(block_table) for block_table in block_tables) @@ -266,14 +257,14 @@ def _prepare_decode( attn_metadata = self.attn_backend.make_metadata( is_prompt=False, slot_mapping=slot_mapping, - prompt_lens=None, + seq_lens=seq_lens, + seq_lens_tensor=seq_lens_tensor, + max_seq_len=max_seq_len, num_prefill_tokens=0, num_decode_tokens=len(input_tokens), - max_context_len=max_context_len, num_prefills=0, prefill_metadata=None, decode_metadata=None, - context_lens=context_lens, block_tables=block_tables, kv_cache_dtype=self.kv_cache_dtype, ) @@ -328,7 +319,7 @@ def _prepare_prompt( input_tokens: List[int] = [] input_positions: List[int] = [] slot_mapping: List[int] = [] - prompt_lens: List[int] = [] + seq_lens: List[int] = [] multi_modal_input_list: List[torch.Tensor] = [] for seq_group_metadata in seq_group_metadata_list: @@ -340,15 +331,15 @@ def _prepare_prompt( seq_data = seq_group_metadata.seq_data[seq_id] prompt_tokens = seq_data.get_token_ids() computed_len = seq_data.get_num_computed_tokens() - prompt_len = len(prompt_tokens) + seq_len = len(prompt_tokens) - prompt_lens.append(prompt_len) # Prompt token num + seq_lens.append(seq_len) # Prompt token num input_tokens.extend(prompt_tokens) # Token ids # Token position ids # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first 
token in the sequence. - input_positions.extend(list(range(computed_len, prompt_len))) + input_positions.extend(list(range(computed_len, seq_len))) if seq_group_metadata.multi_modal_data: multi_modal_input_list.append( @@ -357,21 +348,21 @@ def _prepare_prompt( if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized # yet. In this case, we just use a dummy slot mapping. - slot_mapping.extend([_PAD_SLOT_ID] * prompt_len) + slot_mapping.extend([_PAD_SLOT_ID] * seq_len) continue # Compute the slot mapping. block_table = seq_group_metadata.block_tables[seq_id] # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, - # where start_idx is max(0, prompt_len - sliding_window). + # where start_idx is max(0, seq_len - sliding_window). # For example, if the prompt len is 10, sliding window is 8, and # block size is 4, the first two tokens are masked and the slot # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. start_idx = 0 if self.sliding_window is not None: - start_idx = max(0, prompt_len - self.sliding_window) + start_idx = max(0, seq_len - self.sliding_window) - for i in range(computed_len, prompt_len): + for i in range(computed_len, seq_len): if i < start_idx: slot_mapping.append(_PAD_SLOT_ID) continue @@ -405,17 +396,17 @@ def _prepare_prompt( attn_metadata = self.attn_backend.make_metadata( is_prompt=True, - prompt_lens=prompt_lens, - num_prefills=len(prompt_lens), + seq_lens=seq_lens, + seq_lens_tensor=None, + max_seq_len=None, + num_prefills=len(seq_lens), num_prefill_tokens=num_prompt_tokens, num_decode_tokens=0, prefill_metadata=None, decode_metadata=None, - max_context_len=None, - context_lens=None, block_tables=torch.tensor([], device=self.device, dtype=torch.int), slot_mapping=slot_mapping, kv_cache_dtype=self.kv_cache_dtype, ) - return (input_tokens, input_positions, attn_metadata, prompt_lens, + return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_input) diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index e1494936934ce..925d8d9aabe34 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -15,7 +15,7 @@ init_distributed_environment) from vllm.logger import init_logger from vllm.model_executor import set_random_seed -from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import is_xpu from vllm.worker.cache_engine import CacheEngine from vllm.worker.worker import raise_if_cache_size_invalid @@ -191,18 +191,20 @@ def get_cache_block_size_bytes(self) -> int: @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - blocks_to_swap_in: Optional[Dict[int, int]], - blocks_to_swap_out: Optional[Dict[int, int]], - blocks_to_copy: Optional[Dict[int, List[int]]], - num_lookahead_slots: int = 0, + execute_model_req: Optional[ExecuteModelRequest] = None, ) -> List[SamplerOutput]: + if execute_model_req is None: + seq_group_metadata_list = None + else: + seq_group_metadata_list = execute_model_req.seq_group_metadata_list + if self.is_driver_worker: assert seq_group_metadata_list is not None num_seq_groups = len(seq_group_metadata_list) - assert blocks_to_swap_in is not None - assert blocks_to_swap_out is not None - assert blocks_to_copy is not None + assert execute_model_req is not None + blocks_to_swap_in = execute_model_req.blocks_to_swap_in + blocks_to_swap_out = execute_model_req.blocks_to_swap_out + blocks_to_copy = 
execute_model_req.blocks_to_copy data: Dict[str, Any] = { "num_seq_groups": num_seq_groups, "blocks_to_swap_in": blocks_to_swap_in, From ba7c162c360750cf96dd9451d2d30b7e59293a7f Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 7 May 2024 00:06:27 +0800 Subject: [PATCH 45/67] fix ray_xpu_executor --- vllm/executor/ray_xpu_executor.py | 86 ++++++++++++++----------------- vllm/worker/xpu_model_runner.py | 5 +- 2 files changed, 40 insertions(+), 51 deletions(-) diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index dd0f7f2568b8a..a889c7d4abddb 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -2,19 +2,20 @@ import os import pickle from collections import defaultdict +from itertools import islice, repeat from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) -from vllm.engine.ray_utils import RayWorkerWrapper, ray from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) +from vllm.executor.ray_utils import RayWorkerWrapper, ray from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - get_vllm_instance_id, make_async) + make_async) if ray is not None: from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -108,7 +109,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # The driver dummy worker does not actually use any resources. # It holds the resource for the driver worker. - self.driver_dummy_worker: RayWorkerWrapper = None + self.driver_dummy_worker: Optional[RayWorkerWrapper] = None # The remaining workers are the actual ray actors. self.workers: List[RayWorkerWrapper] = [] @@ -130,6 +131,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", )(RayWorkerWrapper).remote( worker_module_name="vllm.worker.xpu_worker", worker_class_name="XPUWorker", + trust_remote_code=self.model_config.trust_remote_code, ) worker_ip = ray.get(worker.get_node_ip.remote()) @@ -140,6 +142,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", self.driver_worker = RayWorkerWrapper( worker_module_name="vllm.worker.xpu_worker", worker_class_name="XPUWorker", + trust_remote_code=self.model_config.trust_remote_code, ) else: # Else, added to the list of workers. @@ -163,21 +166,13 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", for node_id, gpu_ids in node_gpus.items(): node_gpus[node_id] = sorted(gpu_ids) - VLLM_INSTANCE_ID = get_vllm_instance_id() + # VLLM_INSTANCE_ID = get_vllm_instance_id() + # TODO: add env var for xpu # Set environment variables for the driver and workers. 
- all_args_to_update_environment_variables = [] - for (node_id, _) in worker_node_and_gpu_ids: - all_args_to_update_environment_variables.append([{ - "CUDA_VISIBLE_DEVICES": - ",".join(map(str, node_gpus[node_id])), - "VLLM_INSTANCE_ID": - VLLM_INSTANCE_ID, - "VLLM_TRACE_FUNCTION": - os.getenv("VLLM_TRACE_FUNCTION", "0"), - }]) - self._run_workers("update_environment_variables", - all_args=all_args_to_update_environment_variables) + # all_args_to_update_environment_variables = [] + # self._run_workers("update_environment_variables", + # all_args=all_args_to_update_environment_variables) distributed_init_method = get_distributed_init_method( driver_ip, get_open_port()) @@ -266,40 +261,40 @@ def _run_workers( self, method: str, *args, - driver_args: Optional[Tuple[Any]] = None, + driver_args: Optional[Tuple[Any, ...]] = None, driver_kwargs: Optional[Dict[str, Any]] = None, - all_args: Optional[List[List[Any]]] = None, + all_args: Optional[List[Tuple[Any, ...]]] = None, all_kwargs: Optional[List[Dict[str, Any]]] = None, use_dummy_driver: bool = False, max_concurrent_workers: Optional[int] = None, use_ray_compiled_dag: bool = False, **kwargs, ) -> Any: - """Runs the given method on all workers. - all_args and all_kwargs are used to pass heterogeneous arguments, - i.e. different arguments for each worker. + """Runs the given method on all workers. Can be used in the following + ways: + + - args/kwargs: All workers share the same args/kwargs + - args/kwargs and driver_args/driver_kwargs: Driver worker has + different args + - all_args/all_kwargs: args/kwargs for each worker are specified + individually """ - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - - # for mypy type checking - assert driver_args is not None - assert driver_kwargs is not None - if all_args is None: - all_args = [driver_args] + [args] * len(self.workers) - if all_kwargs is None: - all_kwargs = [driver_kwargs] + [kwargs] * len(self.workers) - - # for mypy type checking - assert all_args is not None - assert all_kwargs is not None if max_concurrent_workers: raise NotImplementedError( "max_concurrent_workers is not supported yet.") + if driver_args is None: + driver_args = args if all_args is None else all_args[0] + if driver_kwargs is None: + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] + + count = len(self.workers) + all_worker_args = repeat(args, count) if all_args is None \ + else islice(all_args, 1, None) + all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ + else islice(all_kwargs, 1, None) + if use_ray_compiled_dag: # Right now, compiled DAG can only accept a single # input. TODO(sang): Fix it. @@ -311,22 +306,18 @@ def _run_workers( worker.execute_method.remote(method, *worker_args, **worker_kwargs) for (worker, worker_args, worker_kwargs - ) in zip(self.workers, all_args[1:], all_kwargs[1:]) + ) in zip(self.workers, all_worker_args, all_worker_kwargs) ] - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - # Start the driver worker after all the ray workers. if not use_dummy_driver: driver_worker_output = self.driver_worker.execute_method( - method, *all_args[0], **all_kwargs[0]) + method, *driver_args, **driver_kwargs) else: + assert self.driver_dummy_worker is not None driver_worker_output = ray.get( self.driver_dummy_worker.execute_method.remote( - method, *all_args[0], **all_kwargs[0])) + method, *driver_args, **driver_kwargs)) # Get the results of the ray workers. 
if self.workers: if use_ray_compiled_dag: @@ -359,8 +350,9 @@ def _compiled_ray_dag(self): # a dummy value for now. It will be fixed soon. with InputNode() as input_data: forward_dag = MultiOutputNode([ - worker.execute_model_compiled_dag_remote.bind(input_data) - for worker in self.workers + worker.execute_model_compiled_dag_remote. + bind( # type: ignore[attr-defined] + input_data) for worker in self.workers ]) return forward_dag.experimental_compile() diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index df1f6f232b94b..c193ccb0465a0 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -176,12 +176,9 @@ def prepare_input_tensors( attn_metadata = self.attn_backend.make_metadata(**metadata_dict) sampling_metadata = SamplingMetadata( seq_groups=None, - seq_data=None, - seq_lens=None, selected_token_indices=selected_token_indices, categorized_sample_indices=None, - generators=None, - perform_sampling=False, + num_prompts=0, ) return (input_tokens, input_positions, attn_metadata, From 4d0ab33e7b8f5366307258653cab146fcde6d2d6 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 10 May 2024 18:39:43 +0800 Subject: [PATCH 46/67] fix rebase issue, copy/swap_blocks --- tests/kernels/test_cache.py | 87 +++---------------------------------- vllm/_ipex_ops.py | 20 ++------- 2 files changed, 8 insertions(+), 99 deletions(-) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index e934a0e50244f..c5f46b381c72a 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -5,8 +5,7 @@ import torch from vllm import _custom_ops as ops -from vllm._C import cache_ops -from vllm.utils import is_hip, is_xpu +from vllm.utils import is_xpu COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] \ if not is_xpu() else [('xpu', 'cpu'), ('xpu', 'xpu'), ('cpu', 'xpu')] @@ -131,6 +130,7 @@ def test_reshape_and_cache( torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) + torch.xpu.empty_cache() torch.set_default_device(device) # Create a random slot mapping. num_slots = block_size * num_blocks @@ -274,92 +274,15 @@ def test_reshape_and_cache_flash( assert torch.allclose(value_cache, cloned_value_cache) -@pytest.mark.parametrize("num_tokens", NUM_TOKENS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) -@torch.inference_mode() -def test_reshape_and_cache_flash( - kv_cache_factory_flashinfer, - num_tokens: int, - num_heads: int, - head_size: int, - block_size: int, - num_blocks: int, - dtype: torch.dtype, - seed: int, - device: str, - kv_cache_dtype: str, -) -> None: - if kv_cache_dtype == "fp8": - pytest.skip() - random.seed(seed) - torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - torch.set_default_device(device) - - # Create a random slot mapping. - num_slots = block_size * num_blocks - slot_mapping = random.sample(range(num_slots), num_tokens) - slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=device) - - qkv = torch.randn(num_tokens, - 3, - num_heads, - head_size, - dtype=dtype, - device=device) - _, key, value = qkv.unbind(dim=1) - - # Create the KV caches. 
- key_caches, value_caches = kv_cache_factory_flashinfer( - num_blocks, - block_size, - 1, - num_heads, - head_size, - kv_cache_dtype, - dtype, - device=device, - ) - key_cache, value_cache = key_caches[0], value_caches[0] - - # Clone the KV caches. - cloned_key_cache = key_cache.clone() - cloned_value_cache = value_cache.clone() - - # Call the reshape_and_cache kernel. - ops.reshape_and_cache_flash(key, value, key_cache, value_cache, - slot_mapping, kv_cache_dtype) - - # Run the reference implementation. - block_indicies = torch.div(slot_mapping, block_size, rounding_mode='floor') - block_indicies = block_indicies.cpu().tolist() - block_offsets = slot_mapping % block_size - block_offsets = block_offsets.cpu().tolist() - for i in range(num_tokens): - block_idx = block_indicies[i] - block_offset = block_offsets[i] - cloned_key_cache[block_idx, block_offset, :, :] = key[i] - cloned_value_cache[block_idx, block_offset, :, :] = value[i] - - assert torch.allclose(key_cache, cloned_key_cache) - assert torch.allclose(value_cache, cloned_value_cache) - - -@pytest.mark.parametrize("num_tokens", NUM_TOKENS) +@pytest.mark.parametrize("direction", COPYING_DIRECTION) +@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_swap_blocks( diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 89865fabb5da7..dc2fdffbcb6f1 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -227,24 +227,10 @@ def reshape_and_cache( @staticmethod def copy_blocks(key_caches: List[torch.Tensor], value_caches: List[torch.Tensor], - block_mapping: Dict[int, List[int]]) -> None: - block_mapping_tensor = [] - for key, values in block_mapping.items(): - if hasattr(values, "__iter__"): - for value in values: - block_mapping_tensor.append([key, value]) - block_mapping = torch.tensor(block_mapping_tensor, - device=key_caches[0].device, - dtype=torch.int64) + block_mapping: torch.Tensor) -> None: torch.xpu.copy_blocks(key_caches, value_caches, block_mapping) @staticmethod def swap_blocks(src: torch.Tensor, dst: torch.Tensor, - block_mapping: Dict[int, int]) -> None: - keys = list(block_mapping.keys()) - values = list(block_mapping.values()) - key_tensor = torch.tensor(keys, dtype=torch.int64) - value_tensor = torch.tensor(values, dtype=torch.int64) - block_mapping_tensor = torch.stack([key_tensor, value_tensor], dim=1) - - torch.xpu.swap_blocks(src, dst, block_mapping_tensor) + block_mapping: torch.Tensor) -> None: + torch.xpu.swap_blocks(src, dst, block_mapping) From dc4d41ae330f8a3f6513c97958bf4a87bf2a6f51 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 10 May 2024 18:51:04 +0800 Subject: [PATCH 47/67] fix format --- tests/kernels/test_cache.py | 5 +++-- vllm/_custom_ops.py | 5 +++-- vllm/_ipex_ops.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index c5f46b381c72a..cfb860cbf6cbf 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -27,6 +27,7 @@ ] if torch.cuda.is_available() else [] SYCL_DEVICES = ["xpu:0"] if is_xpu() else [] DEVICES = CUDA_DEVICES + 
SYCL_DEVICES +# We assume fp8 is always enabled for testing. KV_CACHE_DTYPE = ["auto", "fp8"] @@ -38,7 +39,7 @@ @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", SYCL_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_copy_blocks( @@ -111,7 +112,7 @@ def test_copy_blocks( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", SYCL_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_reshape_and_cache( diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 96a44f80f0652..20eef1a893ca4 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -410,9 +410,10 @@ def reshape_and_cache_flash( kv_cache_dtype) -def copy_blocks(key_caches: torch.Tensor, value_caches: torch.Tensor, +def copy_blocks(key_caches: List[torch.Tensor], + value_caches: List[torch.Tensor], block_mapping: torch.Tensor) -> None: - torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) + vllm_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) def swap_blocks(src: torch.Tensor, dst: torch.Tensor, diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index dc2fdffbcb6f1..997009c751e28 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Tuple +from typing import List, Optional, Tuple import torch From f505011bb9f30b9d7492df87c821abb710ef8f06 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 14 May 2024 23:23:38 +0530 Subject: [PATCH 48/67] add xpu in benchmark_latency.py --- benchmarks/benchmark_latency.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 17edb7515964a..fd63b40f521d7 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -189,7 +189,7 @@ def run_to_completion(profile_dir: Optional[str] = None): "--device", type=str, default="cuda", - choices=["cuda", "cpu", "tpu"], + choices=["cuda", "cpu", "tpu", "xpu"], help='device type for vLLM execution, supporting CUDA and CPU.') parser.add_argument('--block-size', type=int, From d23aec692a585be0b7b51c0f1ef1e270d89c653e Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 16 May 2024 17:31:19 +0800 Subject: [PATCH 49/67] fix --- vllm/engine/async_llm_engine.py | 4 ++-- vllm/engine/llm_engine.py | 2 +- vllm/worker/xpu_model_runner.py | 35 +++++++++++++++++++++------------ 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 82a572b5e7f1a..7f0f381f51cc7 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -384,8 +384,8 @@ def from_engine_args( from vllm.executor.cpu_executor import CPUExecutorAsync executor_class = CPUExecutorAsync elif engine_config.device_config.device_type == "xpu": - assert not engine_config.parallel_config.worker_use_ray, ( - "Ray is not supported with the XPU backend.") + assert distributed_executor_backend is None, ( + "Distributed execution is not supported with the XPU backend.") from vllm.executor.xpu_executor import XPUExecutorAsync executor_class 
= XPUExecutorAsync elif distributed_executor_backend == "ray": diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 045eb93ccfee2..12cc877a05c92 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -348,7 +348,7 @@ def from_engine_args( from vllm.executor.cpu_executor import CPUExecutor executor_class = CPUExecutor elif engine_config.device_config.device_type == "xpu": - if engine_config.parallel_config.worker_use_ray: + if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) from vllm.executor.ray_xpu_executor import RayXPUExecutor executor_class = RayXPUExecutor diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index c193ccb0465a0..047acab67a7c9 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -4,8 +4,9 @@ import torch.nn as nn from vllm.attention import get_attn_backend -from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, VisionLanguageConfig) +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model @@ -32,6 +33,7 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, device_config: DeviceConfig, + cache_config: CacheConfig, load_config: LoadConfig, lora_config: Optional[LoRAConfig], vision_language_config: Optional[VisionLanguageConfig], @@ -45,6 +47,7 @@ def __init__( self.scheduler_config = scheduler_config self.lora_config = lora_config self.load_config = load_config + self.cache_config = cache_config self.vision_language_config = vision_language_config self.is_driver_worker = is_driver_worker @@ -57,16 +60,27 @@ def __init__( self.device = self.device_config.device self.kv_cache_dtype = kv_cache_dtype + self.block_size = cache_config.block_size self.max_context_len_to_capture = ( self.model_config.max_context_len_to_capture if self.model_config is not None else 0) self.attn_backend = get_attn_backend( - self.model_config.dtype if model_config is not None else None) + self.model_config.get_num_attention_heads(self.parallel_config), + self.model_config.get_head_size(), + self.model_config.get_num_kv_heads(self.parallel_config), + self.model_config.get_sliding_window(), + self.model_config.dtype, + self.kv_cache_dtype, + self.block_size, + ) + + # self.attn_backend = get_attn_backend( + # self.model_config.dtype if model_config is not None else None) # Lazy initialization. self.model: nn.Module # Set after init_Model - self.block_size: int # Set after initial profiling. + # self.block_size: int # Set after initial profiling. 
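+        # NOTE: block_size is now taken from cache_config in __init__ (see
+        # above), so the lazily-set block_size attribute is no longer needed
+        # and is kept only as a commented-out reference here.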
def load_model(self) -> None: with CudaMemoryProfiler() as m: @@ -78,6 +92,7 @@ def load_model(self) -> None: vision_language_config=self.vision_language_config, parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, + cache_config=self.cache_config, ) self.model_memory_usage = m.consumed_memory @@ -226,7 +241,7 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) - max_seq_len = max(seq_lens) + max_decode_seq_len = max(seq_lens) input_tokens = torch.tensor(input_tokens, dtype=torch.long, @@ -256,14 +271,11 @@ def _prepare_decode( slot_mapping=slot_mapping, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, - max_seq_len=max_seq_len, + max_decode_seq_len=max_decode_seq_len, num_prefill_tokens=0, num_decode_tokens=len(input_tokens), num_prefills=0, - prefill_metadata=None, - decode_metadata=None, block_tables=block_tables, - kv_cache_dtype=self.kv_cache_dtype, ) return ( input_tokens, @@ -395,15 +407,12 @@ def _prepare_prompt( is_prompt=True, seq_lens=seq_lens, seq_lens_tensor=None, - max_seq_len=None, + max_decode_seq_len=None, num_prefills=len(seq_lens), num_prefill_tokens=num_prompt_tokens, num_decode_tokens=0, - prefill_metadata=None, - decode_metadata=None, block_tables=torch.tensor([], device=self.device, dtype=torch.int), slot_mapping=slot_mapping, - kv_cache_dtype=self.kv_cache_dtype, ) return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_input) From 2d86f22eb1a83744ccc187e2a7d5e1305d88b621 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 16 May 2024 18:12:09 +0800 Subject: [PATCH 50/67] fix format --- vllm/worker/xpu_worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 925d8d9aabe34..529f416232423 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -76,6 +76,7 @@ def __init__( parallel_config, scheduler_config, device_config, + cache_config, load_config=self.load_config, lora_config=self.lora_config, kv_cache_dtype=self.cache_config.cache_dtype, From fc0a8b8859bf80f2472a2b77e776eea498462263 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 16 May 2024 23:12:59 +0800 Subject: [PATCH 51/67] address comments --- .../getting_started/xpu-installation.rst | 2 +- tests/kernels/allclose_default.py | 11 +- tests/kernels/test_activation.py | 14 +-- tests/kernels/test_attention.py | 17 ++- tests/kernels/test_cache.py | 20 ++- tests/kernels/test_layernorm.py | 7 +- tests/kernels/test_pos_encoding.py | 15 +-- vllm/executor/xpu_executor.py | 6 +- vllm/worker/xpu_model_runner.py | 17 +-- vllm/worker/xpu_worker.py | 114 ++---------------- 10 files changed, 45 insertions(+), 178 deletions(-) diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst index 8a61c3003a9af..48932582c68ed 100644 --- a/docs/source/getting_started/xpu-installation.rst +++ b/docs/source/getting_started/xpu-installation.rst @@ -47,7 +47,7 @@ Build from source .. 
code-block:: console $ pip install --upgrade pip - $ pip install wheel packaging ninja setuptools>=49.4.0 numpy + $ pip install -v -r requirements-dev.txt $ pip install -v -r requirements-xpu.txt - Finally, build and install vLLM XPU backend: diff --git a/tests/kernels/allclose_default.py b/tests/kernels/allclose_default.py index db38b5af4ff99..175cfe82fb74e 100644 --- a/tests/kernels/allclose_default.py +++ b/tests/kernels/allclose_default.py @@ -1,7 +1,5 @@ import torch -from vllm.utils import is_xpu - # Reference default values of atol and rtol are from # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67 default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} @@ -11,15 +9,10 @@ torch.float: 1.3e-6 } -ipex_xpu_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} -ipex_xpu_rtol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} - def get_default_atol(output) -> float: - return default_atol[output.dtype] if not is_xpu() else ipex_xpu_atol[ - output.dtype] + return default_atol[output.dtype] def get_default_rtol(output) -> float: - return default_rtol[output.dtype] if not is_xpu() else ipex_xpu_rtol[ - output.dtype] + return default_rtol[output.dtype] diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 7b29f94df59dd..a4b9f91c7688b 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -5,7 +5,6 @@ from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, NewGELU, SiluAndMul) -from vllm.utils import is_xpu from .allclose_default import get_default_atol, get_default_rtol @@ -15,9 +14,7 @@ SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] if torch.cuda.is_available() else [] -SYCL_DEVICES = ["xpu:0"] if is_xpu() else [] -DEVICES = CUDA_DEVICES + SYCL_DEVICES +] @pytest.mark.parametrize("activation", ["silu", "gelu", "gelu_tanh"]) @@ -25,7 +22,7 @@ @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_act_and_mul( activation: str, @@ -50,10 +47,7 @@ def test_act_and_mul( ref_out = layer.forward_native(x) # The SiLU and GELU implementations are equivalent to the native PyTorch # implementations, so we can do exact comparison. 
- assert torch.allclose(out, - ref_out, - atol=get_default_atol(out), - rtol=get_default_rtol(out)) + assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0) @pytest.mark.parametrize("activation", [FastGELU, NewGELU]) @@ -61,7 +55,7 @@ def test_act_and_mul( @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_activation( activation: Type[torch.nn.Module], diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index c037648b4e493..8bc4766fc93c4 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -7,18 +7,17 @@ from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask from vllm import _custom_ops as ops -from vllm.utils import get_max_shared_memory_bytes, is_hip, is_xpu +from vllm.utils import get_max_shared_memory_bytes, is_hip from .allclose_default import get_default_atol, get_default_rtol FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. # - 512 as a buffer -MAX_SEQ_LEN = (get_max_shared_memory_bytes() // FLOAT32_BYTES - - 512) if not is_xpu else 1024 +MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 # There may not be enough gpu memory due to large NUM_BLOCKS. # Reduce NUM_BLOCKS when it happens. -NUM_BLOCKS = 4321 if not is_xpu() else 500 # Arbitrary values for testing +NUM_BLOCKS = 4321 # Arbitrary values for testing PARTITION_SIZE = 512 # flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16} DTYPES = [torch.half, torch.bfloat16, torch.float @@ -33,14 +32,12 @@ ] if not is_hip() else [64, 80, 96, 112, 128] BLOCK_SIZES = [16, 32] -USE_ALIBI = [False, True] if not is_xpu() else [False] +USE_ALIBI = [False, True] KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] if torch.cuda.is_available() else [] -SYCL_DEVICES = ["xpu:0"] if is_xpu() else [] -DEVICES = CUDA_DEVICES + SYCL_DEVICES +] def ref_masked_attention( @@ -123,7 +120,7 @@ def ref_single_query_cached_kv_attention( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_paged_attention( kv_cache_factory, version: str, @@ -316,7 +313,7 @@ def ref_multi_query_kv_attention( @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_multi_query_kv_attention( num_seqs: int, diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index cfb860cbf6cbf..29572cfa57499 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -5,10 +5,8 @@ import torch from vllm import _custom_ops as ops -from vllm.utils import is_xpu -COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] \ - if not is_xpu() else [('xpu', 'cpu'), ('xpu', 'xpu'), ('cpu', 'xpu')] +COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing @@ -24,9 +22,8 @@ 
SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] if torch.cuda.is_available() else [] -SYCL_DEVICES = ["xpu:0"] if is_xpu() else [] -DEVICES = CUDA_DEVICES + SYCL_DEVICES +] + # We assume fp8 is always enabled for testing. KV_CACHE_DTYPE = ["auto", "fp8"] @@ -39,7 +36,7 @@ @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_copy_blocks( @@ -112,7 +109,7 @@ def test_copy_blocks( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_reshape_and_cache( @@ -131,7 +128,6 @@ def test_reshape_and_cache( torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - torch.xpu.empty_cache() torch.set_default_device(device) # Create a random slot mapping. num_slots = block_size * num_blocks @@ -283,7 +279,7 @@ def test_reshape_and_cache_flash( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_swap_blocks( @@ -306,8 +302,8 @@ def test_swap_blocks( if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - src_device = device if direction[0] != "cpu" else 'cpu' - dst_device = device if direction[1] != "cpu" else 'cpu' + src_device = device if direction[0] == "cuda" else 'cpu' + dst_device = device if direction[1] == "cuda" else 'cpu' src_blocks = random.sample(range(num_blocks), num_mappings) # For the same device, mapping must not overlap diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index 7e7175962c76d..a635e6c12c594 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -2,7 +2,6 @@ import torch from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.utils import is_xpu DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing @@ -12,9 +11,7 @@ SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] if torch.cuda.is_available() else [] -SYCL_DEVICES = ["xpu:0"] if is_xpu() else [] -DEVICES = CUDA_DEVICES + SYCL_DEVICES +] @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @@ -22,7 +19,7 @@ @pytest.mark.parametrize("add_residual", ADD_RESIDUAL) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_rms_norm( num_tokens: int, diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 84c6596ce8407..e564e325112a6 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -5,7 +5,6 @@ import torch from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.utils import is_xpu from .allclose_default import get_default_atol, 
get_default_rtol @@ -19,9 +18,7 @@ SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] if torch.cuda.is_available() else [] -SYCL_DEVICES = ["xpu:0"] if is_xpu() else [] -DEVICES = CUDA_DEVICES + SYCL_DEVICES +] @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @@ -32,7 +29,7 @@ @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_rotary_embedding( is_neox_style: bool, @@ -76,8 +73,8 @@ def test_rotary_embedding( rtol=get_default_rtol(out_query)) assert torch.allclose(out_key, ref_key, - atol=get_default_atol(out_query), - rtol=get_default_rtol(out_query)) + atol=get_default_atol(out_key), + rtol=get_default_rtol(out_key)) @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @@ -88,7 +85,7 @@ def test_rotary_embedding( @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_batched_rotary_embedding( is_neox_style: bool, @@ -150,7 +147,7 @@ def test_batched_rotary_embedding( @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_batched_rotary_embedding_multi_lora( is_neox_style: bool, diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index 3e6791280079d..8974a7d37eaac 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -49,13 +49,13 @@ def __init__( self._init_executor() def _init_spec_worker(self): - logger.error("not support speculative for XPU executor!") + raise NotImplementedError("XPU does not support speculative decoding") def _init_non_spec_worker(self): from vllm.worker.xpu_worker import XPUWorker assert self.parallel_config.world_size == 1, ( - "XPUExecutor only supports single GPU.") + "XPUExecutor only supports a single XPU.") distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) @@ -90,7 +90,7 @@ async def execute_model_async( execute_model_req: ExecuteModelRequest, ) -> List[SamplerOutput]: output = await make_async(self.driver_worker.execute_model - )(execute_model_req=execute_model_req, ) + )(execute_model_req=execute_model_req) return output diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 047acab67a7c9..606eee31c1db9 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -25,7 +25,7 @@ ] -class XPUModelRunner(): +class XPUModelRunner: def __init__( self, @@ -51,12 +51,8 @@ def __init__( self.vision_language_config = vision_language_config self.is_driver_worker = is_driver_worker - # model_config can be None in tests/samplers/test_sampler.py. - # FIXME(woosuk): This is a hack to make the tests work. Refactor this. 
- self.sliding_window = (model_config.get_sliding_window() - if model_config is not None else None) - self.device_config = (device_config - if device_config is not None else DeviceConfig()) + self.sliding_window = model_config.get_sliding_window() + self.device_config = device_config self.device = self.device_config.device self.kv_cache_dtype = kv_cache_dtype @@ -75,12 +71,8 @@ def __init__( self.block_size, ) - # self.attn_backend = get_attn_backend( - # self.model_config.dtype if model_config is not None else None) - # Lazy initialization. self.model: nn.Module # Set after init_Model - # self.block_size: int # Set after initial profiling. def load_model(self) -> None: with CudaMemoryProfiler() as m: @@ -142,9 +134,6 @@ def profile_run(self) -> None: torch.xpu.synchronize() return - def set_block_size(self, block_size: int) -> None: - self.block_size = block_size - def prepare_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 529f416232423..2e096106026cb 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -1,6 +1,7 @@ """A XPU worker class.""" import gc -from typing import Any, Dict, List, Optional, Tuple +import os +from typing import List, Optional, Tuple import intel_extension_for_pytorch # noqa: F401 import oneccl_bindings_for_pytorch # noqa: F401 @@ -10,22 +11,20 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) -from vllm.distributed import (broadcast_tensor_dict, - ensure_model_parallel_initialized, +from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.logger import init_logger from vllm.model_executor import set_random_seed -from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import is_xpu from vllm.worker.cache_engine import CacheEngine -from vllm.worker.worker import raise_if_cache_size_invalid +from vllm.worker.worker import Worker from vllm.worker.worker_base import LoraNotSupportedWorkerBase from vllm.worker.xpu_model_runner import XPUModelRunner logger = init_logger(__name__) -class XPUWorker(LoraNotSupportedWorkerBase): +class XPUWorker(LoraNotSupportedWorkerBase, Worker): """A worker class that executes (a partition of) the model on a GPU. Each worker is associated with a single XPU device. The worker is @@ -71,7 +70,7 @@ def __init__( assert not self.lora_config, ( "To be tested: vision language model with LoRA settings.") - self.model_runner = XPUModelRunner( + self.model_runner = XPUModelRunner( # type: ignore model_config, parallel_config, scheduler_config, @@ -99,13 +98,11 @@ def init_device(self) -> None: raise RuntimeError( f"Not support device type: {self.device_config.device}") # Initialize the distributed environment. - self.init_distributed_environment() + self.init_worker_distributed_environment() # Initialize the model. 
set_random_seed(self.model_config.seed) - def load_model(self): - self.model_runner.load_model() - + # keep this method for `empty_cache` and `synchronize` api @torch.inference_mode() def determine_num_available_blocks(self) -> Tuple[int, int]: """Profiles the peak memory usage of the model to determine how many @@ -154,101 +151,11 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: torch.xpu.empty_cache() return num_gpu_blocks, num_cpu_blocks - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Allocate GPU and CPU KV cache with the specified number of blocks. - - This also warms up the model, which may record CUDA graphs. - """ - raise_if_cache_size_invalid(num_gpu_blocks, - self.cache_config.block_size, - self.model_config.max_model_len) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - self._init_cache_engine() - self._warm_up_model() - - def _init_cache_engine(self) -> None: - assert self.cache_config.num_gpu_blocks is not None - self.cache_engine = CacheEngine(self.cache_config, self.model_config, - self.parallel_config, - self.device_config) - self.gpu_cache = self.cache_engine.gpu_cache - self.model_runner.set_block_size(self.cache_engine.block_size) - def _warm_up_model(self) -> None: # IPEX don't support capture graph yet pass - def get_cache_block_size_bytes(self) -> int: - """Get the size of the KV cache block size in bytes. - """ - return CacheEngine.get_cache_block_size(self.cache_config, - self.model_config, - self.parallel_config) - - @torch.inference_mode() - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> List[SamplerOutput]: - if execute_model_req is None: - seq_group_metadata_list = None - else: - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - - if self.is_driver_worker: - assert seq_group_metadata_list is not None - num_seq_groups = len(seq_group_metadata_list) - assert execute_model_req is not None - blocks_to_swap_in = execute_model_req.blocks_to_swap_in - blocks_to_swap_out = execute_model_req.blocks_to_swap_out - blocks_to_copy = execute_model_req.blocks_to_copy - data: Dict[str, Any] = { - "num_seq_groups": num_seq_groups, - "blocks_to_swap_in": blocks_to_swap_in, - "blocks_to_swap_out": blocks_to_swap_out, - "blocks_to_copy": blocks_to_copy, - } - broadcast_tensor_dict(data, src=0) - else: - data = broadcast_tensor_dict(src=0) - num_seq_groups = data["num_seq_groups"] - blocks_to_swap_in = data["blocks_to_swap_in"] - blocks_to_swap_out = data["blocks_to_swap_out"] - blocks_to_copy = data["blocks_to_copy"] - - assert blocks_to_swap_in is not None - assert blocks_to_swap_out is not None - assert blocks_to_copy is not None - self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) - - # If there is no input, we don't need to execute the model. - if num_seq_groups == 0: - return [] - - output = self.model_runner.execute_model(seq_group_metadata_list, - self.gpu_cache) - return [output] - - def cache_swap( - self, - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - ) -> None: - # Issue cache operations. - # TODO(woosuk): Profile swapping overhead and optimize if needed. 
- if blocks_to_swap_in: - self.cache_engine.swap_in(blocks_to_swap_in) - if blocks_to_swap_out: - self.cache_engine.swap_out(blocks_to_swap_out) - if blocks_to_copy: - self.cache_engine.copy(blocks_to_copy) - - def init_distributed_environment(self) -> None: + def init_worker_distributed_environment(self) -> None: """Initialize the distributed environment.""" parallel_config = self.parallel_config @@ -267,7 +174,6 @@ def init_distributed_environment(self) -> None: "distributed_init_method must be set if torch.distributed " "is not already initialized") else: - import os ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", "sockets") os.environ['CCL_ZE_IPC_EXCHANGE'] = ENV_CCL_ZE_IPC_EXCHANGE @@ -277,8 +183,6 @@ def init_distributed_environment(self) -> None: distributed_init_method=distributed_init_method, local_rank=self.local_rank, backend="ccl") - # A small all_reduce for warmup. - torch.distributed.all_reduce(torch.zeros(1).xpu()) ensure_model_parallel_initialized( parallel_config.tensor_parallel_size, From 07c139b544ecd04b13cdbaf9b090bc4d925b13f7 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 16 May 2024 23:49:34 +0800 Subject: [PATCH 52/67] fix tp issues --- vllm/attention/backends/torch_sdpa.py | 1 - vllm/executor/ray_xpu_executor.py | 7 ------- vllm/worker/xpu_worker.py | 3 +++ 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 0e3e8309dc4a0..0fec1a466629b 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -6,7 +6,6 @@ import torch from torch.nn.functional import scaled_dot_product_attention -from vllm._ipex_ops import ipex_ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata) from vllm.attention.ops.paged_attn import PagedAttentionMetadata diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index a889c7d4abddb..4382a7a4a814b 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -58,7 +58,6 @@ def __init__( self.device_config = device_config self.vision_language_config = vision_language_config - assert self.parallel_config.worker_use_ray placement_group = self.parallel_config.placement_group # Disable Ray usage stats collection. @@ -166,13 +165,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", for node_id, gpu_ids in node_gpus.items(): node_gpus[node_id] = sorted(gpu_ids) - # VLLM_INSTANCE_ID = get_vllm_instance_id() - # TODO: add env var for xpu - # Set environment variables for the driver and workers. - # all_args_to_update_environment_variables = [] - # self._run_workers("update_environment_variables", - # all_args=all_args_to_update_environment_variables) distributed_init_method = get_distributed_init_method( driver_ip, get_open_port()) diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 2e096106026cb..6cc7b1c5cfb2a 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -174,6 +174,9 @@ def init_worker_distributed_environment(self) -> None: "distributed_init_method must be set if torch.distributed " "is not already initialized") else: + # use sockets as default Level zero IPC exchange backend. By + # default oneccl will use `drmfd` as mechanism which need extra + # dependency (libdrm and drm headers) on your system. 
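+            # The mechanism can still be overridden at runtime by exporting
+            # CCL_ZE_IPC_EXCHANGE (e.g. CCL_ZE_IPC_EXCHANGE=drmfd) before
+            # launching; "sockets" is only the fallback default read below.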
ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", "sockets") os.environ['CCL_ZE_IPC_EXCHANGE'] = ENV_CCL_ZE_IPC_EXCHANGE From 634c9511127445be940ace217a27d4d435b94770 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 17 May 2024 01:55:49 +0800 Subject: [PATCH 53/67] fix worker --- vllm/executor/xpu_executor.py | 47 ++++++++++++++--------------------- vllm/worker/xpu_worker.py | 3 ++- 2 files changed, 21 insertions(+), 29 deletions(-) diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index 8974a7d37eaac..d37200bd02de3 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -9,8 +9,8 @@ from vllm.executor.gpu_executor import GPUExecutor from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async) +from vllm.utils import make_async +from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -48,33 +48,24 @@ def __init__( # Instantiate the worker and load the model to GPU. self._init_executor() - def _init_spec_worker(self): - raise NotImplementedError("XPU does not support speculative decoding") - - def _init_non_spec_worker(self): - from vllm.worker.xpu_worker import XPUWorker - - assert self.parallel_config.world_size == 1, ( - "XPUExecutor only supports a single XPU.") - - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - self.driver_worker = XPUWorker( - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - load_config=self.load_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - vision_language_config=self.vision_language_config, - is_driver_worker=True, + def _create_worker(self, + local_rank: int = 0, + rank: int = 0, + distributed_init_method: Optional[str] = None): + if self.speculative_config is None: + worker_module_name = "vllm.worker.xpu_worker" + worker_class_name = "XPUWorker" + else: + raise NotImplementedError( + "XPU does not support speculative decoding") + + wrapper = WorkerWrapperBase( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, ) - self.driver_worker.init_device() - self.driver_worker.load_model() + wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank, + distributed_init_method)) + return wrapper.worker def execute_model( self, diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 6cc7b1c5cfb2a..773ee9f8159e1 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -10,7 +10,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, - VisionLanguageConfig) + SpeculativeConfig, VisionLanguageConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.logger import init_logger @@ -46,6 +46,7 @@ def __init__( distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, vision_language_config: Optional[VisionLanguageConfig] = None, + speculative_config: Optional[SpeculativeConfig] = None, is_driver_worker: bool = False, ) -> None: assert device_config.device_type == "xpu" From f0e6407275d7a4e046023193216274fa1b732568 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 23 May 2024 18:20:04 +0800 Subject: [PATCH 54/67] 
fix ray xpu executor --- vllm/executor/ray_xpu_executor.py | 88 +++++++++++++++---------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index 4382a7a4a814b..dd7c82289341e 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -3,7 +3,8 @@ import pickle from collections import defaultdict from itertools import islice, repeat -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple +from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set, + Tuple, Union) from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, @@ -73,6 +74,13 @@ def __init__( if USE_RAY_COMPILED_DAG: self.forward_dag = self._compiled_ray_dag() + # This is non-None when the execute model loop is running + # in the parallel workers. It's a coroutine in the AsyncLLMEngine case. + self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None + # Updated by implementations that require additional args to be passed + # to the _run_workers execute_model call + self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {} + def _init_executor(self) -> None: pass @@ -221,17 +229,17 @@ def initialize_cache(self, num_gpu_blocks: int, num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) - def execute_model( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - all_outputs = self._run_workers( - "execute_model", - driver_kwargs={"execute_model_req": execute_model_req}, - use_ray_compiled_dag=USE_RAY_COMPILED_DAG) + def _driver_execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + """Run execute_model in the driver worker. - # Only the driver worker returns the sampling results. - output = all_outputs[0] - return output + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. + """ + return self.driver_worker.execute_method("execute_model", + execute_model_req) def add_lora(self, lora_request: LoRARequest) -> bool: assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." @@ -254,8 +262,7 @@ def _run_workers( self, method: str, *args, - driver_args: Optional[Tuple[Any, ...]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, + async_run_remote_workers_only: bool = False, all_args: Optional[List[Tuple[Any, ...]]] = None, all_kwargs: Optional[List[Dict[str, Any]]] = None, use_dummy_driver: bool = False, @@ -277,11 +284,6 @@ def _run_workers( raise NotImplementedError( "max_concurrent_workers is not supported yet.") - if driver_args is None: - driver_args = args if all_args is None else all_args[0] - if driver_kwargs is None: - driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] - count = len(self.workers) all_worker_args = repeat(args, count) if all_args is None \ else islice(all_args, 1, None) @@ -301,6 +303,12 @@ def _run_workers( for (worker, worker_args, worker_kwargs ) in zip(self.workers, all_worker_args, all_worker_kwargs) ] + if async_run_remote_workers_only: + # Just return futures + return ray_worker_outputs + + driver_args = args if all_args is None else all_args[0] + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] # Start the driver worker after all the ray workers. 
if not use_dummy_driver: @@ -328,6 +336,11 @@ def _run_workers( return [driver_worker_output] + ray_worker_outputs + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + ray.get(parallel_worker_tasks) + def _compiled_ray_dag(self): import pkg_resources required_version = "2.9" @@ -371,31 +384,18 @@ class RayXPUExecutorAsync(RayXPUExecutor, DistributedGPUExecutorAsync): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.driver_executor = make_async(self.driver_worker.execute_method) + self.driver_exec_method = make_async(self.driver_worker.execute_method) - async def _run_workers_async( + async def _driver_execute_model_async( self, - method: str, - *args, - driver_args: Optional[Tuple[Any]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, - **kwargs, - ) -> Any: - """Runs the given method on all workers.""" - coros = [] - - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - - # Run the driver worker asynchronously. - driver_executor = make_async(getattr(self.driver_worker, method)) - coros.append(driver_executor(*driver_args, **driver_kwargs)) - - # Run the ray workers asynchronously. - for worker in self.workers: - coros.append(worker.execute_method.remote(method, *args, **kwargs)) - - all_outputs = await asyncio.gather(*coros) - return all_outputs + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + return await self.driver_exec_method("execute_model", + execute_model_req) + + async def _start_worker_execution_loop(self): + coros = [ + worker.execute_method.remote("start_worker_execution_loop") + for worker in self.workers + ] + return await asyncio.gather(*coros) From 6871d555a7c07a6f28240ec73154112e5939af0c Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 4 Jun 2024 00:18:23 +0800 Subject: [PATCH 55/67] fix due to code rebase --- vllm/_ipex_ops.py | 10 ++++++++++ vllm/attention/ops/paged_attn.py | 1 - vllm/worker/xpu_model_runner.py | 12 ++++++------ 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 997009c751e28..dc63496b3ffa5 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -57,6 +57,11 @@ def paged_attention_v1( alibi_slopes: Optional[torch.Tensor], kv_cache_dtype: str, kv_scale: float, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, ) -> None: assert kv_cache_dtype == "auto" num_heads = out.size(1) @@ -92,6 +97,11 @@ def paged_attention_v2( alibi_slopes: Optional[torch.Tensor], kv_cache_dtype: str, kv_scale: float, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, ) -> None: assert kv_cache_dtype == "auto" num_heads = out.size(1) diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index c30e0d78a4935..170e9ec38aca6 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -119,7 +119,6 @@ def forward_decode( # For context len > 8192, use V2 kernel to avoid shared memory shortage. use_v1 = (max_seq_len <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512)) - use_v1 = use_v1 or is_xpu() # ipex page_attn v2 is not ready yet. if use_v1: # Run PagedAttention V1. 
ops.paged_attention_v1( diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 606eee31c1db9..926c81364648f 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -11,10 +11,9 @@ from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.sampling_params import SamplingParams -from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata from vllm.utils import CudaMemoryProfiler, make_tensor_with_pad -from vllm.worker.model_runner import (AttentionMetadata, SamplingMetadata, - _prepare_fake_inputs) +from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata logger = init_logger(__name__) @@ -114,8 +113,9 @@ def profile_run(self) -> None: for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) - seq_data, fake_multi_modal_input = _prepare_fake_inputs( - seq_len, None) + + seq_data = SequenceData([0] * seq_len) + dummy_multi_modal_data = None seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, @@ -123,7 +123,7 @@ def profile_run(self) -> None: sampling_params=sampling_params, block_tables=None, lora_request=None, - multi_modal_data=fake_multi_modal_input, + multi_modal_data=dummy_multi_modal_data, ) seqs.append(seq) From bef2c786e8f9e8b15e6241064bd721c18c02c519 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 4 Jun 2024 00:23:30 +0800 Subject: [PATCH 56/67] setuptools version --- requirements-xpu.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements-xpu.txt b/requirements-xpu.txt index 3eec032234b5a..48d899ec70eda 100644 --- a/requirements-xpu.txt +++ b/requirements-xpu.txt @@ -1,6 +1,8 @@ # Common dependencies -r requirements-common.txt +setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed. + torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl From f037737026b8a9508450a4e9cab5e12bd8398258 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Wed, 5 Jun 2024 00:11:12 +0800 Subject: [PATCH 57/67] update docker file, due to public key expired. 
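The APT signing keys bundled in the oneAPI base image had expired, so the
Dockerfile now re-downloads the Intel oneAPI and Intel graphics GPG keys and
rewrites the matching APT source lists (see the diff below). As an optional
sanity check, assuming GnuPG 2.2+ is available inside the image, the refreshed
keyrings and their expiration dates can be listed with:

    $ gpg --show-keys /usr/share/keyrings/intel-oneapi-archive-keyring.gpg
    $ gpg --show-keys /usr/share/keyrings/intel-graphics.gpg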
--- Dockerfile.xpu | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Dockerfile.xpu b/Dockerfile.xpu index 15c4a50116d68..c39e551672d20 100644 --- a/Dockerfile.xpu +++ b/Dockerfile.xpu @@ -1,5 +1,13 @@ FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 - + +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \ + echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \ + chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \ + rm /etc/apt/sources.list.d/intel-graphics.list && \ + wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \ + echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \ + chmod 644 /usr/share/keyrings/intel-graphics.gpg + RUN apt-get update -y \ && apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip From 84f6b3a55cb47893b6bfdb291f8e5c4df037fa28 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 7 Jun 2024 01:51:55 +0800 Subject: [PATCH 58/67] add RayXPUExecutorAsync for serving --- vllm/engine/async_llm_engine.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 7f0f381f51cc7..e068eba478a02 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -384,10 +384,16 @@ def from_engine_args( from vllm.executor.cpu_executor import CPUExecutorAsync executor_class = CPUExecutorAsync elif engine_config.device_config.device_type == "xpu": - assert distributed_executor_backend is None, ( - "Distributed execution is not supported with the XPU backend.") - from vllm.executor.xpu_executor import XPUExecutorAsync - executor_class = XPUExecutorAsync + if distributed_executor_backend is None: + from vllm.executor.xpu_executor import XPUExecutorAsync + executor_class = XPUExecutorAsync + elif distributed_executor_backend == "ray": + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_xpu_executor import RayXPUExecutorAsync + executor_class = RayXPUExecutorAsync + else: + raise RuntimeError( + "Not supported distributed execution model on XPU device.") elif distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync From a1f2970d7e6475ad99662a5019794706eff733bd Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 7 Jun 2024 21:05:16 +0800 Subject: [PATCH 59/67] update _custom_ops.py --- vllm/_custom_ops.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 20eef1a893ca4..93574c828f9c7 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -13,8 +13,6 @@ except ImportError as e: logger.warning("Failed to import from vllm._C with %r", e) -from vllm.utils import is_xpu - if is_xpu(): from vllm._ipex_ops import ipex_cache_ops as vllm_cache_ops from vllm._ipex_ops import ipex_ops as vllm_ops From ca882701edb9b4c91de11d13f2c2e4dfc1c9d0fa Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 13 Jun 2024 19:18:14 +0800 Subject: [PATCH 60/67] add ipex_attn 
backend --- vllm/_custom_ops.py | 37 +- vllm/_ipex_ops.py | 5 - vllm/attention/backends/ipex_attn.py | 393 ++++++++++++++++++ vllm/attention/selector.py | 15 +- vllm/config.py | 2 +- vllm/engine/arg_utils.py | 12 +- vllm/model_executor/custom_op.py | 8 +- vllm/model_executor/layers/activation.py | 35 ++ vllm/model_executor/layers/layernorm.py | 24 ++ .../model_executor/layers/rotary_embedding.py | 23 + 10 files changed, 500 insertions(+), 54 deletions(-) create mode 100644 vllm/attention/backends/ipex_attn.py diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 93574c828f9c7..ab2a67950bfea 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -13,41 +13,6 @@ except ImportError as e: logger.warning("Failed to import from vllm._C with %r", e) -if is_xpu(): - from vllm._ipex_ops import ipex_cache_ops as vllm_cache_ops - from vllm._ipex_ops import ipex_ops as vllm_ops - -with contextlib.suppress(ImportError): - import vllm._moe_C - -with contextlib.suppress(ImportError): - # ruff: noqa: F401 - import vllm._punica_C - - -def is_custom_op_supported(op_name: str) -> bool: - op, overloads = torch._C._jit_get_operation(op_name) - return op is not None - - -def hint_on_error(fn): - - @functools.wraps(fn) - def wrapper(*args, **kwargs): - try: - return fn(*args, **kwargs) - except AttributeError as e: - msg = ( - "Error in calling custom op %s: %s\n" - "Possibly you have built or installed an obsolete version of vllm.\n" - "Please try a clean build and install of vllm," - "or remove old built files such as vllm/*cpython*.so and build/ ." - ) - logger.error(msg, fn.__name__, e) - raise e - - return wrapper - with contextlib.suppress(ImportError): import vllm._moe_C @@ -411,7 +376,7 @@ def reshape_and_cache_flash( def copy_blocks(key_caches: List[torch.Tensor], value_caches: List[torch.Tensor], block_mapping: torch.Tensor) -> None: - vllm_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) + torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) def swap_blocks(src: torch.Tensor, dst: torch.Tensor, diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index dc63496b3ffa5..1e60e0848673b 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -217,10 +217,6 @@ def varlen_attention( softmax_scale, zero_tensors, is_causal, return_softmax, gen_) - -class ipex_cache_ops: - - @staticmethod def reshape_and_cache( key: torch.Tensor, value: torch.Tensor, @@ -240,7 +236,6 @@ def copy_blocks(key_caches: List[torch.Tensor], block_mapping: torch.Tensor) -> None: torch.xpu.copy_blocks(key_caches, value_caches, block_mapping) - @staticmethod def swap_blocks(src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor) -> None: torch.xpu.swap_blocks(src, dst, block_mapping) diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py new file mode 100644 index 0000000000000..78eac48ef6657 --- /dev/null +++ b/vllm/attention/backends/ipex_attn.py @@ -0,0 +1,393 @@ +""" Attention layer with torch scaled_dot_product_attention + and PagedAttention.""" +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Type + +import torch +from torch.nn.functional import scaled_dot_product_attention + +from vllm._ipex_ops import ipex_ops +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata) +from vllm.attention.ops.paged_attn import (PagedAttention, + PagedAttentionMetadata) +from vllm.utils import is_xpu + +_PARTITION_SIZE = 512 + + +class IpexAttnBackend(AttentionBackend): + + 
@staticmethod + def get_name() -> str: + return "ipex-attn" + + @staticmethod + def get_impl_cls() -> Type["IpexAttnBackendImpl"]: + return IpexAttnBackendImpl + + @staticmethod + def make_metadata(*args, **kwargs) -> "IpexAttnMetadata": + return IpexAttnMetadata(*args, **kwargs) + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return PagedAttention.get_kv_cache_shape(num_blocks, block_size, + num_kv_heads, head_size) + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: torch.Tensor, + ) -> None: + PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: torch.Tensor, + ) -> None: + PagedAttention.copy_blocks(kv_caches, src_to_dists) + + +@dataclass +class IpexAttnMetadata(AttentionMetadata, PagedAttentionMetadata): + """Metadata for TorchSDPABackend. + """ + # Currently, input sequences can only contain all prompts + # or all decoding. True if all sequences are prompts. + is_prompt: bool + slot_mapping: torch.Tensor + seq_lens: Optional[List[int]] + + def __post_init__(self): + # Set during the execution of the first attention op. + # It is a list because it is needed to set per prompt + # when alibi slopes is used. It is because of the limitation + # from xformer API. + # will not appear in the __repr__ and __init__ + self.attn_bias: Optional[List[torch.Tensor]] = None + + @property + def prefill_metadata(self) -> Optional["IpexAttnMetadata"]: + # Currently chunked prefill is not supported + if self.num_decode_tokens == 0: + assert self.num_prefills > 0 + return self + + return None + + @property + def decode_metadata(self) -> Optional["IpexAttnMetadata"]: + # Currently chunked prefill is not supported + if self.num_prefills > 0: + assert self.num_decode_tokens == 0 + return None + + return self + + +class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, + ) -> None: + assert blocksparse_params is None, ValueError( + "Torch SPDA does not support block-sparse attention.") + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_kv_heads + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + self.sliding_window = sliding_window + self.kv_cache_dtype = kv_cache_dtype + self.fuse_batch = False #is_xpu() + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.need_mask = (self.alibi_slopes is not None + or self.sliding_window is not None) + + supported_head_sizes = PagedAttention.get_supported_head_sizes() + if head_size not in supported_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {supported_head_sizes}.") + if kv_cache_dtype != "auto": + raise NotImplementedError( + "Torch SDPA backend does not support FP8 KV cache. 
" + "Please use xFormers backend instead.") + + def split_kv_cache( + self, + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = 1 if is_xpu() else (16 // kv_cache.element_size()) + num_blocks = kv_cache.shape[1] + + key_cache = kv_cache[0] + key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, + -1, x) + value_cache = kv_cache[1] + value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) + return key_cache, value_cache + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: Optional[torch.Tensor], + attn_metadata: IpexAttnMetadata, # type: ignore + kv_scale: float = 1.0, + ) -> torch.Tensor: + """Forward pass with torch SDPA and PagedAttention. + + Args: + query: shape = [num_tokens, num_heads * head_size] + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + attn_metadata: Metadata for attention. + Returns: + shape = [num_tokens, num_heads * head_size] + """ + assert kv_scale == 1.0 + num_tokens, hidden_size = query.shape + # Reshape the query, key, and value tensors. + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + + if kv_cache is not None: + key_cache, value_cache = self.split_kv_cache( + kv_cache, self.num_kv_heads, self.head_size) + ipex_ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping.flatten(), + self.kv_cache_dtype, + kv_scale, + ) + + if attn_metadata.is_prompt: + assert attn_metadata.seq_lens is not None + if (kv_cache is None or attn_metadata.block_tables.numel() == 0): + if self.num_kv_heads != self.num_heads: + key = key.repeat_interleave(self.num_queries_per_kv, dim=1) + value = value.repeat_interleave(self.num_queries_per_kv, + dim=1) + + if attn_metadata.attn_bias is None: + if self.alibi_slopes is not None: + att_masks = _make_alibi_bias( + self.alibi_slopes, query.dtype, + attn_metadata.seq_lens) # type: ignore + elif self.sliding_window is not None: + att_masks = _make_sliding_window_bias( + attn_metadata.seq_lens, self.sliding_window, + query.dtype) # type: ignore + else: + if self.fuse_batch: + att_masks = _make_sliding_window_bias( + attn_metadata.seq_lens, + None, + dtype=query.dtype) + else: + att_masks = [None] * len(attn_metadata.seq_lens) + attn_metadata.attn_bias = att_masks + + if self.fuse_batch: + out = torch.empty( + (num_tokens, self.num_heads, self.head_size), + dtype=query.dtype, + device=query.device) + tmp = [0] + tmp.extend(attn_metadata.seq_lens) + seqlen = torch.tensor(tmp) + max_seqlen = max(attn_metadata.seq_lens) + seqlen_q = torch.cumsum(seqlen, + dim=0).to(device=query.device) + ipex_ops.varlen_attention(query, + key, + value, + out, + seqlen_q, + seqlen_q, + max_seqlen, + max_seqlen, + pdropout=0.0, + softmax_scale=self.scale, + zero_tensors=False, + is_causal=True, + return_softmax=False, + gen_=None) + else: + query = query.movedim(0, query.dim() - 2) + key = key.movedim(0, key.dim() - 2) + value = value.movedim(0, value.dim() - 2) + + start = 0 + out = torch.empty( + (num_tokens, self.num_heads, self.head_size), + dtype=query.dtype, + device=query.device) + for seq_len, mask in zip(attn_metadata.seq_lens, + attn_metadata.attn_bias): + end = start + seq_len + sub_out = scaled_dot_product_attention( + query[:, 
start:end, :], + key[:, start:end, :], + value[:, start:end, :], + attn_mask=mask, + dropout_p=0.0, + is_causal=not self.need_mask, + scale=self.scale).movedim(query.dim() - 2, 0) + out[start:end, :, :] = sub_out + start = end + + output = out.to(query.dtype) + else: + # prefix-enabled attention + raise RuntimeError( + "Torch SDPA backend doesn't support prefix decoding.") + + else: + # Decoding run. + max_seq_len = attn_metadata.max_decode_seq_len + output = torch.empty_like(query) + block_size = value_cache.shape[3] + num_seqs, num_heads, head_size = query.shape + max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) // + _PARTITION_SIZE) + # NOTE(woosuk): We use a simple heuristic to decide whether to use + # PagedAttention V1 or V2. If the number of partitions is 1, we use + # V1 to avoid the overhead of reduction. Also, if the number of + # sequences or heads is large, we use V1 since there is enough work + # to parallelize. + # TODO(woosuk): Tune this heuristic. + # For context len > 8192, use V2 kernel to avoid shared memory + # shortage. + use_v1 = (max_seq_len <= 8192 and + (max_num_partitions == 1 or num_seqs * num_heads > 512)) + if use_v1: + # Run PagedAttention V1. + ipex_ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + self.num_kv_heads, + self.scale, + attn_metadata.block_tables, + attn_metadata.seq_lens_tensor, + block_size, + max_seq_len, + self.alibi_slopes, + self.kv_cache_dtype, + kv_scale, + ) + else: + # Run PagedAttention V2. + assert _PARTITION_SIZE % block_size == 0 + tmp_output = torch.empty( + size=(num_seqs, num_heads, max_num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, max_num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + ipex_ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + self.num_kv_heads, + self.scale, + attn_metadata.block_tables, + attn_metadata.seq_lens, + block_size, + max_seq_len, + self.alibi_slopes, + self.kv_cache_dtype, + kv_scale, + ) + + # Reshape the output tensor. + return output.view(-1, self.num_heads * self.head_size) + + +def _make_alibi_bias( + alibi_slopes: torch.Tensor, + dtype: torch.dtype, + seq_lens: List[int], +) -> List[torch.Tensor]: + attn_biases = [] + for seq_len in seq_lens: + bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(seq_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. 
+ bias = bias[None, :] - bias[:, None] + + num_heads = alibi_slopes.shape[0] + bias = bias[None, :].repeat((num_heads, 1, 1)) + bias.mul_(alibi_slopes[:, None, None]) + inf_mask = torch.empty( + (1, seq_len, seq_len), + dtype=bias.dtype, + device=alibi_slopes.device).fill_(-torch.inf).triu_(diagonal=1) + attn_biases.append((bias + inf_mask).to(dtype)) + + return attn_biases + + +def _make_sliding_window_bias( + seq_lens: List[int], + window_size: Optional[int], + dtype: torch.dtype, +) -> List[torch.Tensor]: + attn_biases = [] + for seq_len in seq_lens: + tensor = torch.full( + (1, seq_len, seq_len), + dtype=dtype, + fill_value=1, + ) + shift = 0 + mask = torch.tril(tensor, diagonal=shift).to(dtype) # type: ignore + if window_size is not None: + mask = torch.triu(mask, diagonal=shift - window_size + 1) + mask = torch.log(mask) + attn_biases.append(mask.to(dtype)) + + return attn_biases diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index abc4c26ffe94b..1d56d87ccd119 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -19,6 +19,7 @@ class _Backend(enum.Enum): TORCH_SDPA = enum.auto() FLASHINFER = enum.auto() PALLAS = enum.auto() + IPEX = enum.auto() @lru_cache(maxsize=None) @@ -58,12 +59,17 @@ def get_attn_backend( ROCmFlashAttentionBackend) return ROCmFlashAttentionBackend elif backend == _Backend.TORCH_SDPA: - # TODO: make XPU backend available here. assert is_cpu(), RuntimeError( "Torch SDPA backend is only used for the CPU device.") logger.info("Using Torch SDPA backend.") from vllm.attention.backends.torch_sdpa import TorchSDPABackend return TorchSDPABackend + elif backend == _Backend.IPEX: + assert is_xpu(), RuntimeError( + "IPEX attention backend is only used for the XPU device.") + logger.info("Using IPEX attention backend.") + from vllm.attention.backends.ipex_attn import IpexAttnBackend + return IpexAttnBackend elif backend == _Backend.FLASHINFER: logger.info("Using Flashinfer backend.") logger.warning("Eager mode is required for the Flashinfer backend. " @@ -102,11 +108,16 @@ def which_attn_to_use( "(case-sensitive).") selected_backend = _Backend[backend_by_env_var] - if is_cpu() or is_xpu(): + if is_cpu(): if selected_backend != _Backend.TORCH_SDPA: logger.info("Cannot use %s backend on CPU.", selected_backend) return _Backend.TORCH_SDPA + if is_xpu(): + if selected_backend != _Backend.IPEX: + logger.info("Cannot use %s backend on XPU.", selected_backend) + return _Backend.IPEX + if is_tpu(): if selected_backend != _Backend.PALLAS: logger.info("Cannot use %s backend on TPU.", selected_backend) diff --git a/vllm/config.py b/vllm/config.py index 6232d733538de..7fae6a1614837 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -12,7 +12,7 @@ from vllm.model_executor.models import ModelRegistry from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu, - is_hip, is_neuron, is_tpu, is_xpu) + is_hip, is_neuron, is_tpu, is_xpu)) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 16bf7e5d66e3f..58a76abf6239d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -501,12 +501,12 @@ def add_cli_args( 'Enabling this will use the fully sharded layers. 
' 'At high sequence length, max rank or ' 'tensor parallel size, this is likely faster.')) - parser.add_argument("--device", - type=str, - default=EngineArgs.device, - choices=["auto", "cuda", "neuron", "cpu", "tpu", - "xpu"], - help='Device type for vLLM execution.') + parser.add_argument( + "--device", + type=str, + default=EngineArgs.device, + choices=["auto", "cuda", "neuron", "cpu", "tpu", "xpu"], + help='Device type for vLLM execution.') # Related to Vision-language models such as llava parser = EngineArgs.add_cli_args_for_vlm(parser) diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 56aa629ae3455..0db72d8d95f24 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -1,6 +1,6 @@ import torch.nn as nn -from vllm.utils import is_cpu, is_hip, is_tpu +from vllm.utils import is_cpu, is_hip, is_tpu, is_xpu class CustomOp(nn.Module): @@ -29,9 +29,7 @@ def forward_hip(self, *args, **kwargs): return self.forward_cuda(*args, **kwargs) def forward_xpu(self, *args, **kwargs): - # By default, we assume that XPU ops are compatible with CUDA ops. - # NOTE(woosuk): This is a placeholder for future extensions. - return self.forward_cuda(*args, **kwargs) + raise NotImplementedError def forward_cpu(self, *args, **kwargs): # By default, we assume that CPU ops are compatible with CUDA ops. @@ -58,5 +56,7 @@ def dispatch_forward(self): return self.forward_cpu elif is_tpu(): return self.forward_tpu + elif is_xpu(): + return self.forward_xpu else: return self.forward_cuda diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 4d076421f9d2a..eb0606948686d 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -37,6 +37,15 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: ops.silu_and_mul(out, x) return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + from vllm._ipex_ops import ipex_ops as ops + + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + class GeluAndMul(CustomOp): """An activation function for GeGLU. @@ -71,6 +80,18 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: ops.gelu_tanh_and_mul(out, x) return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + from vllm._ipex_ops import ipex_ops as ops + + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + if self.approximate == "none": + ops.gelu_and_mul(out, x) + elif self.approximate == "tanh": + ops.gelu_tanh_and_mul(out, x) + return out + def extra_repr(self) -> str: return f'approximate={repr(self.approximate)}' @@ -90,6 +111,13 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: ops.gelu_new(out, x) return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + from vllm._ipex_ops import ipex_ops as ops + + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + class FastGELU(CustomOp): @@ -105,6 +133,13 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: ops.gelu_fast(out, x) return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + from vllm._ipex_ops import ipex_ops as ops + + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + class ScaledActivation(nn.Module): """An activation function with post-scale parameters. 
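A minimal usage sketch of the dispatch change above (not part of this patch; it assumes an
XPU build where intel_extension_for_pytorch imports cleanly, so is_xpu() returns True and
CustomOp.dispatch_forward() selects forward_xpu): calling the activation layer then routes
into the ipex_ops kernel added in this commit.

    # Hypothetical smoke test, not part of this series: exercises the new
    # forward_xpu path of SiluAndMul added above.
    import torch
    import intel_extension_for_pytorch  # noqa: F401  registers the "xpu" device

    from vllm.model_executor.layers.activation import SiluAndMul

    layer = SiluAndMul()
    # The input carries the gate and up projections concatenated on the last dim.
    x = torch.randn(8, 2 * 4096, dtype=torch.float16, device="xpu")
    out = layer(x)  # dispatch_forward() -> forward_xpu -> ipex_ops.silu_and_mul
    assert out.shape == (8, 4096)
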
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 4533adf8f83aa..14f5e2378a421 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -67,6 +67,30 @@ def forward_cuda( ) return out + def forward_xpu( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + from vllm._ipex_ops import ipex_ops as ops + + if residual is not None: + ops.fused_add_rms_norm( + x, + residual, + self.weight.data, + self.variance_epsilon, + ) + return x, residual + out = torch.empty_like(x) + ops.rms_norm( + out, + x, + self.weight.data, + self.variance_epsilon, + ) + return out + def extra_repr(self) -> str: s = f"hidden_size={self.weight.data.size(0)}" s += f", eps={self.variance_epsilon}" diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 792c4729355a7..b3642d43fb674 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -221,6 +221,29 @@ def forward_cuda( self.cos_sin_cache, self.is_neox_style) return query, key + def forward_xpu( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + from vllm._ipex_ops import ipex_ops as ops + + self.cos_sin_cache = self.cos_sin_cache.to(positions.device, + dtype=query.dtype) + # ops.rotary_embedding()/batched_rotary_embedding() + # are in-place operations that update the query and key tensors. + if offsets is not None: + ops.batched_rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, + self.is_neox_style, self.rotary_dim, + offsets) + else: + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) + return query, key + def forward_tpu( self, positions: torch.Tensor, From f1ebe9f74b7cc7078f3f0c56021172f2219c8ff0 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 13 Jun 2024 21:51:01 +0800 Subject: [PATCH 61/67] revert torch sdpa backend --- vllm/attention/backends/ipex_attn.py | 4 ++-- vllm/attention/backends/torch_sdpa.py | 32 ++++----------------------- vllm/attention/ops/paged_attn.py | 1 + 3 files changed, 7 insertions(+), 30 deletions(-) diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 78eac48ef6657..a1d3a26d6f10c 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -117,7 +117,7 @@ def __init__( self.alibi_slopes = alibi_slopes self.sliding_window = sliding_window self.kv_cache_dtype = kv_cache_dtype - self.fuse_batch = False #is_xpu() + self.fuse_batch = is_xpu() assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -140,7 +140,7 @@ def split_kv_cache( num_kv_heads: int, head_size: int, ) -> Tuple[torch.Tensor, torch.Tensor]: - x = 1 if is_xpu() else (16 // kv_cache.element_size()) + x = 1 num_blocks = kv_cache.shape[1] key_cache = kv_cache[0] diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 0fec1a466629b..ecd61add4141b 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -121,7 +121,6 @@ def __init__( self.alibi_slopes = alibi_slopes self.sliding_window = sliding_window self.kv_cache_dtype = kv_cache_dtype - self.fuse_batch = is_xpu() assert self.num_heads 
% self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -138,22 +137,6 @@ def __init__( "Torch SDPA backend does not support FP8 KV cache. " "Please use xFormers backend instead.") - def split_kv_cache( - self, - kv_cache: torch.Tensor, - num_kv_heads: int, - head_size: int, - ) -> Tuple[torch.Tensor, torch.Tensor]: - x = 1 if is_xpu() else (16 // kv_cache.element_size()) - num_blocks = kv_cache.shape[1] - - key_cache = kv_cache[0] - key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, - -1, x) - value_cache = kv_cache[1] - value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) - return key_cache, value_cache - def forward( self, query: torch.Tensor, @@ -182,7 +165,7 @@ def forward( value = value.view(-1, self.num_kv_heads, self.head_size) if kv_cache is not None: - key_cache, value_cache = self.split_kv_cache( + key_cache, value_cache = PagedAttention.split_kv_cache( kv_cache, self.num_kv_heads, self.head_size) PagedAttention.write_to_paged_cache(key, value, key_cache, value_cache, @@ -207,13 +190,7 @@ def forward( attn_metadata.seq_lens, self.sliding_window, query.dtype) # type: ignore else: - if self.fuse_batch: - att_masks = _make_sliding_window_bias( - attn_metadata.seq_lens, - None, - dtype=query.dtype) - else: - att_masks = [None] * len(attn_metadata.seq_lens) + att_masks = [None] * len(attn_metadata.seq_lens) attn_metadata.attn_bias = att_masks # query = query.unsqueeze(0) # [batch_size, num_tokens, num_heads, head_size] @@ -270,7 +247,7 @@ def _make_alibi_bias( ) -> List[torch.Tensor]: attn_biases = [] for seq_len in seq_lens: - bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device) + bias = torch.arange(seq_len, dtype=dtype) # NOTE(zhuohan): HF uses # `bias = bias[None, :].repeat(seq_len, 1)` # here. We find that both biases give the same results, but @@ -283,8 +260,7 @@ def _make_alibi_bias( bias.mul_(alibi_slopes[:, None, None]).unsqueeze_(0) inf_mask = torch.empty( (1, seq_len, seq_len), - dtype=bias.dtype, - device=alibi_slopes.device).fill_(-torch.inf).triu_(diagonal=1) + dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1) attn_biases.append((bias + inf_mask).to(dtype)) return attn_biases diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 170e9ec38aca6..a214f40d16514 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -119,6 +119,7 @@ def forward_decode( # For context len > 8192, use V2 kernel to avoid shared memory shortage. use_v1 = (max_seq_len <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512)) + if use_v1: # Run PagedAttention V1. ops.paged_attention_v1( From 9046315f1071eecb341e63de2f4b28b1fc8a2712 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 13 Jun 2024 22:06:30 +0800 Subject: [PATCH 62/67] update document --- docs/source/getting_started/xpu-installation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst index 48932582c68ed..10a91e229bf23 100644 --- a/docs/source/getting_started/xpu-installation.rst +++ b/docs/source/getting_started/xpu-installation.rst @@ -47,7 +47,6 @@ Build from source .. code-block:: console $ pip install --upgrade pip - $ pip install -v -r requirements-dev.txt $ pip install -v -r requirements-xpu.txt - Finally, build and install vLLM XPU backend: @@ -57,5 +56,6 @@ Build from source $ VLLM_TARGET_DEVICE=xpu python setup.py install .. 
note:: - - FP16 is the default data type in the current XPU backend. + - FP16 is the default data type in the current XPU backend. The BF16 data + type will be supported in the future. From ebbc13e9e95db8b87559af75d71d815eb06f7a1c Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 13 Jun 2024 22:09:36 +0800 Subject: [PATCH 63/67] format --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 17e5ae9ec137f..b2ae6def8cdc6 100644 --- a/setup.py +++ b/setup.py @@ -240,6 +240,7 @@ def _is_xpu() -> bool: def _build_custom_ops() -> bool: return _is_cuda() or _is_hip() or _is_cpu() + def _install_punica() -> bool: return envs.VLLM_INSTALL_PUNICA_KERNELS From 5d823d9ce8637324cc2a978d2b88ed7d2c506640 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 13 Jun 2024 22:48:01 +0800 Subject: [PATCH 64/67] more fix --- vllm/attention/backends/ipex_attn.py | 2 +- vllm/distributed/parallel_state.py | 2 +- vllm/model_executor/layers/vocab_parallel_embedding.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index a1d3a26d6f10c..b9dbf133aa9f9 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -332,7 +332,7 @@ def forward( self.num_kv_heads, self.scale, attn_metadata.block_tables, - attn_metadata.seq_lens, + attn_metadata.seq_lens_tensor, block_size, max_seq_len, self.alibi_slopes, diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index f6a2fc9b05a84..d608ef920de58 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -57,7 +57,7 @@ def _split_tensor_dict( # because it contains not only the device type but also the device # index (e.g. "cuda:0"). We only need the device type. # receiving side will set the device index. - device = "cpu" if value.is_cpu else "cuda" + device = value.device.type metadata_list.append( (key, TensorMetadata(device, value.dtype, value.size()))) tensor_list.append(value) diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 60eb5b404e2ca..1a26c5c63fedc 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -307,7 +307,7 @@ def forward(self, input_): else: masked_input = input_ # Get the embeddings. - output_parallel = F.embedding(masked_input, self.weight) + output_parallel = F.embedding(masked_input.long(), self.weight) # Mask the output embedding. if self.tp_size > 1: output_parallel.masked_fill_(input_mask.unsqueeze(1), 0) From bdb6ca555533837452c49f4a872245023681a2a7 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 14 Jun 2024 16:02:47 +0800 Subject: [PATCH 65/67] revert torch sdpa, fix doc --- docs/source/getting_started/xpu-installation.rst | 2 +- vllm/attention/backends/torch_sdpa.py | 6 +++--- vllm/config.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst index 10a91e229bf23..4f0d2da25b8e8 100644 --- a/docs/source/getting_started/xpu-installation.rst +++ b/docs/source/getting_started/xpu-installation.rst @@ -57,5 +57,5 @@ Build from source .. note:: - FP16 is the default data type in the current XPU backend. The BF16 data - type will be supported in the future. + type will be supported in the future. 
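As a complement to the documentation tweak above, a hedged end-to-end sketch of what an
installed XPU build enables (illustrative only, not part of this series; the model name and
prompt are arbitrary): with device="xpu" the engine picks the XPU executor and the IPEX
attention backend added earlier, and FP16 is used by default as the note states.

    # Hypothetical offline-inference check on a single XPU card.
    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m", device="xpu", dtype="float16")
    params = SamplingParams(temperature=0.8, max_tokens=32)
    outputs = llm.generate(["The Intel Data Center GPU Max is"], params)
    print(outputs[0].outputs[0].text)
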
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index ecd61add4141b..4b08cce99afb0 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -193,9 +193,9 @@ def forward( att_masks = [None] * len(attn_metadata.seq_lens) attn_metadata.attn_bias = att_masks - # query = query.unsqueeze(0) # [batch_size, num_tokens, num_heads, head_size] - # key = key.unsqueeze(0) - # value = value.unsqueeze(0) + query = query.movedim(0, query.dim() - 2) + key = key.movedim(0, key.dim() - 2) + value = value.movedim(0, value.dim() - 2) start = 0 output = torch.empty( diff --git a/vllm/config.py b/vllm/config.py index 7fae6a1614837..6232d733538de 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -12,7 +12,7 @@ from vllm.model_executor.models import ModelRegistry from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu, - is_hip, is_neuron, is_tpu, is_xpu)) + is_hip, is_neuron, is_tpu, is_xpu) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup From bcdf65a79b9731c7e1488bb2e0ca7e16b7cccaf8 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 17 Jun 2024 19:21:19 +0800 Subject: [PATCH 66/67] address comments --- vllm/attention/backends/ipex_attn.py | 24 ++++++++++-------------- vllm/utils.py | 13 ++++++------- vllm/worker/xpu_model_runner.py | 12 +++++++++++- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index b9dbf133aa9f9..c926e6bb69479 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -58,13 +58,15 @@ def copy_blocks( @dataclass class IpexAttnMetadata(AttentionMetadata, PagedAttentionMetadata): - """Metadata for TorchSDPABackend. + """Metadata for IpexAttnBackend. """ # Currently, input sequences can only contain all prompts # or all decoding. True if all sequences are prompts. is_prompt: bool slot_mapping: torch.Tensor seq_lens: Optional[List[int]] + seqlen_q: Optional[torch.Tensor] + max_seqlen: Optional[int] def __post_init__(self): # Set during the execution of the first attention op. @@ -131,7 +133,7 @@ def __init__( f"Supported head sizes are: {supported_head_sizes}.") if kv_cache_dtype != "auto": raise NotImplementedError( - "Torch SDPA backend does not support FP8 KV cache. " + "IPEX backend does not support FP8 KV cache. " "Please use xFormers backend instead.") def split_kv_cache( @@ -159,7 +161,7 @@ def forward( attn_metadata: IpexAttnMetadata, # type: ignore kv_scale: float = 1.0, ) -> torch.Tensor: - """Forward pass with torch SDPA and PagedAttention. + """Forward pass with IPEX varlen_attention and PagedAttention. 
Args: query: shape = [num_tokens, num_heads * head_size] @@ -222,20 +224,14 @@ def forward( (num_tokens, self.num_heads, self.head_size), dtype=query.dtype, device=query.device) - tmp = [0] - tmp.extend(attn_metadata.seq_lens) - seqlen = torch.tensor(tmp) - max_seqlen = max(attn_metadata.seq_lens) - seqlen_q = torch.cumsum(seqlen, - dim=0).to(device=query.device) ipex_ops.varlen_attention(query, key, value, out, - seqlen_q, - seqlen_q, - max_seqlen, - max_seqlen, + attn_metadata.seqlen_q, + attn_metadata.seqlen_q, + attn_metadata.max_seqlen, + attn_metadata.max_seqlen, pdropout=0.0, softmax_scale=self.scale, zero_tensors=False, @@ -270,7 +266,7 @@ def forward( else: # prefix-enabled attention raise RuntimeError( - "Torch SDPA backend doesn't support prefix decoding.") + "IPEX backend doesn't support prefix decoding.") else: # Decoding run. diff --git a/vllm/utils.py b/vllm/utils.py index e2ff019634926..56cdbb62ebec1 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -28,13 +28,6 @@ T = TypeVar("T") logger = init_logger(__name__) -try: - import intel_extension_for_pytorch as ipex # noqa: F401 - _import_ipex = True -except ImportError as e: - logger.warning("Import Error for IPEX: %s", e.msg) - _import_ipex = False - STR_DTYPE_TO_TORCH_DTYPE = { "half": torch.half, "bfloat16": torch.bfloat16, @@ -169,6 +162,12 @@ def is_xpu() -> bool: # vllm is not build with xpu if not is_xpu_flag: return False + try: + import intel_extension_for_pytorch as ipex # noqa: F401 + _import_ipex = True + except ImportError as e: + logger.warning("Import Error for IPEX: %s", e.msg) + _import_ipex = False # ipex dependency is not ready if not _import_ipex: logger.warning("not found ipex lib") diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 926c81364648f..f30de703e805d 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -259,6 +259,8 @@ def _prepare_decode( is_prompt=False, slot_mapping=slot_mapping, seq_lens=seq_lens, + seqlen_q=None, + max_seqlen=None, seq_lens_tensor=seq_lens_tensor, max_decode_seq_len=max_decode_seq_len, num_prefill_tokens=0, @@ -392,16 +394,24 @@ def _prepare_prompt( dtype=torch.long, device=self.device) # type: ignore + max_seqlen = max(seq_lens) + tmp = [0] + tmp.extend(seq_lens) + seqlen = torch.tensor(tmp) + seqlen_q = torch.cumsum(seqlen, dim=0).to(device=self.device) + attn_metadata = self.attn_backend.make_metadata( is_prompt=True, + slot_mapping=slot_mapping, seq_lens=seq_lens, + seqlen_q=seqlen_q, + max_seqlen=max_seqlen, seq_lens_tensor=None, max_decode_seq_len=None, num_prefills=len(seq_lens), num_prefill_tokens=num_prompt_tokens, num_decode_tokens=0, block_tables=torch.tensor([], device=self.device, dtype=torch.int), - slot_mapping=slot_mapping, ) return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_input) From 10ec2d23ca417ceaf640fc8f96e4bd21029f33a0 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 17 Jun 2024 22:16:08 +0800 Subject: [PATCH 67/67] remove fuse in ipex_attn backend --- vllm/attention/backends/ipex_attn.py | 74 ++++++++-------------------- 1 file changed, 20 insertions(+), 54 deletions(-) diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index c926e6bb69479..f09b24f2a0304 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -4,14 +4,12 @@ from typing import Any, Dict, List, Optional, Tuple, Type import torch -from torch.nn.functional import scaled_dot_product_attention from vllm._ipex_ops 
import ipex_ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata) from vllm.attention.ops.paged_attn import (PagedAttention, PagedAttentionMetadata) -from vllm.utils import is_xpu _PARTITION_SIZE = 512 @@ -119,7 +117,6 @@ def __init__( self.alibi_slopes = alibi_slopes self.sliding_window = sliding_window self.kv_cache_dtype = kv_cache_dtype - self.fuse_batch = is_xpu() assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -210,59 +207,28 @@ def forward( attn_metadata.seq_lens, self.sliding_window, query.dtype) # type: ignore else: - if self.fuse_batch: - att_masks = _make_sliding_window_bias( - attn_metadata.seq_lens, - None, - dtype=query.dtype) - else: - att_masks = [None] * len(attn_metadata.seq_lens) + att_masks = _make_sliding_window_bias( + attn_metadata.seq_lens, None, dtype=query.dtype) attn_metadata.attn_bias = att_masks - if self.fuse_batch: - out = torch.empty( - (num_tokens, self.num_heads, self.head_size), - dtype=query.dtype, - device=query.device) - ipex_ops.varlen_attention(query, - key, - value, - out, - attn_metadata.seqlen_q, - attn_metadata.seqlen_q, - attn_metadata.max_seqlen, - attn_metadata.max_seqlen, - pdropout=0.0, - softmax_scale=self.scale, - zero_tensors=False, - is_causal=True, - return_softmax=False, - gen_=None) - else: - query = query.movedim(0, query.dim() - 2) - key = key.movedim(0, key.dim() - 2) - value = value.movedim(0, value.dim() - 2) - - start = 0 - out = torch.empty( - (num_tokens, self.num_heads, self.head_size), - dtype=query.dtype, - device=query.device) - for seq_len, mask in zip(attn_metadata.seq_lens, - attn_metadata.attn_bias): - end = start + seq_len - sub_out = scaled_dot_product_attention( - query[:, start:end, :], - key[:, start:end, :], - value[:, start:end, :], - attn_mask=mask, - dropout_p=0.0, - is_causal=not self.need_mask, - scale=self.scale).movedim(query.dim() - 2, 0) - out[start:end, :, :] = sub_out - start = end - - output = out.to(query.dtype) + output = torch.empty( + (num_tokens, self.num_heads, self.head_size), + dtype=query.dtype, + device=query.device) + ipex_ops.varlen_attention(query, + key, + value, + output, + attn_metadata.seqlen_q, + attn_metadata.seqlen_q, + attn_metadata.max_seqlen, + attn_metadata.max_seqlen, + pdropout=0.0, + softmax_scale=self.scale, + zero_tensors=False, + is_causal=True, + return_softmax=False, + gen_=None) else: # prefix-enabled attention raise RuntimeError(