d6f3030047
* ggml: backend-agnostic tensor parallelism * support for GPT-OSS, Qwen 3 MoE * partial Vulkan fix * add support for 4/8 GPUs * unconditional peer access * re-use buffers + ggml contexts * fix output pattern * NCCL support * GGML: HIP: add RCCL support * Remove shfl and AllReduce from backend interface * move allocation workaround out of ggml-alloc.c * 2d tensor set/get support * Fix the seg fault without NCCL * Apply suggestion from JohannesGaessler * support for tensor dims % n_devs != 0 * fix view_offs scaling * arbitrary num. of GPUs/tensor split * fix compilation * better granularity estimate * Support device-specific host buffer types if all underlying backends expose the same type. This allows using pinned memory instead of pageable memory for CUDA. Fix compilation errors. * partial Qwen 3 Next support * Fix qwen3 30b (#8) * Fix crash with Qwen-30B-A3B Q4_0 Qwen-30B-A3B Q4_0 has an intermediate dimension of 768. Using a granularity of 256 forces an uneven split between GPUs, which is not supported by the current implementation. * Decide block size based on tensor quantization type * Fix crashes due to KV cache serialization (#9) KV cache serialization requires non-zero offsets on the tensor. Add support in the meta backend to set/get a tensor with a non-zero offset. * metal : fix build (#7) * static memory allocations, fix usage count * fix tensor granularity * more even memory distribution * use BF16 for allreduce * rebase fixup * better error message for unsupported architectures * Fix device mismatch during scatter of allReduce. (#11) There is a mismatch between the dst buffer device and the backend device, causing the use of sync copies * Enable the previous allreduce implementation. 
It is better in both perf and stability (#12) * delay AllReduce for Moe for less I/O * build : clean-up compile warnings * backend : move most of the meta backend API to ggml-backend-impl.h * cont : hide unused public API in the implementation * llama : use llama_device + remove ggml_backend_dev_is_meta() * ggml-backend : remove unused alloc include * minor : remove regex include * ggml : introduce ggml-ext.h for staging new APIs * rebase fixup * fix tests * llama : more robust logic for determining Meta devices (#16) * llama : more robust logic for determining Meta devices * cont : fix devs size check Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * cont : fix log type Co-authored-by: Johannes Gäßler <johannesg@5d6.de> --------- Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * disable roundtrip for meta backend * fix arch selection * Qwen 3.5 support * fix Gemma 4 MoE * fix OpenVino, SYCL * fix test-llama-archs for CPU-only builds * Fix Qwen 3.5 MoE * disable meta backend tests for WebGPU * tests : filter CPU-based devices from the Meta backend tests (#17) * meta : formatting, naming, indentation (#18) * formatting : llama-model.cpp * formatting : ggml-ext.h * formatting : ggml-backend-meta.cpp * meta : add TODO * add documentation * better error messages * fix GPT-OSS --------- Co-authored-by: Carl Philipp Klemm <carl@uvos.xyz> Co-authored-by: Gaurav Garg <gaugarg@nvidia.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
158 lines
4.7 KiB
CMake
158 lines
4.7 KiB
CMake
# Locate the ROCm installation.
# Preference order: ROCM_PATH environment variable > /opt/rocm > /usr.
# NOTE: "$ENV{ROCM_PATH}" is quoted: an unset environment variable would
# otherwise expand to zero arguments and leave a malformed `if(NOT EXISTS)`.
if (NOT EXISTS "$ENV{ROCM_PATH}")
    if (NOT EXISTS /opt/rocm)
        set(ROCM_PATH /usr)
    else()
        set(ROCM_PATH /opt/rocm)
    endif()
else()
    set(ROCM_PATH $ENV{ROCM_PATH})
endif()
# Make the ROCm CMake packages discoverable by the find_package() calls below.
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} "${ROCM_PATH}/lib64/cmake")

# Default the HIP Debug flags to "-g -O2" unless the user provided their own.
if (NOT DEFINED CMAKE_HIP_FLAGS_DEBUG)
    set(CMAKE_HIP_FLAGS_DEBUG "-g -O2")
endif()
# CMake on Windows doesn't support the HIP language yet, so hipcc is always
# treated as the C++ compiler there. Elsewhere, detect whether the user
# pointed CMAKE_CXX_COMPILER at hipcc (legacy behavior, warned about below).
if (WIN32)
    set(CXX_IS_HIPCC TRUE)
else()
    # "\\." produces a literal dot in the regex; a bare "\." is a CMake
    # argument escape that leaves an unescaped "." matching any character.
    string(REGEX MATCH "hipcc(\\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
endif()
if (CXX_IS_HIPCC)
    if (LINUX)
        # Referencing CMAKE_CXX_COMPILER_ID by name (no ${}) keeps the check
        # well-formed even if the variable is empty or contains spaces.
        if (NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang")
            message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
        endif()

        message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
            " Prefer setting the HIP compiler directly. See README for details.")
    endif()
else()
    # Forward (AMD)GPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
    if (AMDGPU_TARGETS AND NOT GPU_TARGETS)
        set(GPU_TARGETS ${AMDGPU_TARGETS})
    endif()
    if (GPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
        set(CMAKE_HIP_ARCHITECTURES ${GPU_TARGETS})
    endif()
    # The HIP language is only available with CMake >= 3.21.
    cmake_minimum_required(VERSION 3.21)
    enable_language(HIP)
endif()
find_package(hip     REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)

if (GGML_HIP_RCCL)
    find_package(rccl REQUIRED)
endif()

# hip_VERSION is referenced by name (no ${}) so the comparison stays
# well-formed even if the variable were empty.
if (hip_VERSION VERSION_LESS 6.1)
    message(FATAL_ERROR "At least ROCM/HIP V6.1 is required")
endif()

message(STATUS "HIP and hipBLAS found")
# The HIP backend reuses the ggml-cuda sources: gather the shared headers and
# kernel sources, then the pre-generated template instances by pattern.
file(GLOB GGML_HEADERS_ROCM "../ggml-cuda/*.cuh")
list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h")

file(GLOB GGML_SOURCES_ROCM "../ggml-cuda/*.cu")
foreach(inst_pattern IN ITEMS "fattn-tile*.cu" "fattn-mma*.cu" "mmq*.cu" "mmf*.cu")
    file(GLOB SRCS "../ggml-cuda/template-instances/${inst_pattern}")
    list(APPEND GGML_SOURCES_ROCM ${SRCS})
endforeach()
# Flash-attention vector-kernel instances: either build every quantization
# combination or only the commonly used subset.
if (GGML_CUDA_FA_ALL_QUANTS)
    file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
    list(APPEND GGML_SOURCES_ROCM ${SRCS})
    add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
else()
    foreach(inst IN ITEMS f16-f16 q4_0-q4_0 q8_0-q8_0 bf16-bf16)
        list(APPEND GGML_SOURCES_ROCM
            "../ggml-cuda/template-instances/fattn-vec-instance-${inst}.cu")
    endforeach()
endif()

ggml_add_backend_library(ggml-hip ${GGML_HEADERS_ROCM} ${GGML_SOURCES_ROCM})
# TODO: do not use CUDA definitions for HIP
if (NOT GGML_BACKEND_DL)
    target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
endif()

add_compile_definitions(GGML_USE_HIP)

# Options that map 1:1 onto a compile definition of the same name.
foreach(opt IN ITEMS
        GGML_CUDA_FORCE_MMQ
        GGML_CUDA_FORCE_CUBLAS
        GGML_CUDA_NO_PEER_COPY
        GGML_HIP_GRAPHS
        GGML_HIP_NO_VMM
        GGML_HIP_ROCWMMA_FATTN)
    if (${opt})
        add_compile_definitions(${opt})
    endif()
endforeach()

# Inverted option: the definition is added when the feature is disabled.
if (NOT GGML_HIP_MMQ_MFMA)
    add_compile_definitions(GGML_HIP_NO_MMQ_MFMA)
endif()

if (GGML_HIP_RCCL)
    add_compile_definitions(GGML_USE_NCCL) # RCCL has the same interface as NCCL.
endif()

if (GGML_HIP_EXPORT_METRICS)
    set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Rpass-analysis=kernel-resource-usage --save-temps")
endif()

if (NOT GGML_CUDA_FA)
    add_compile_definitions(GGML_CUDA_NO_FA)
endif()
if (CXX_IS_HIPCC)
    # hipcc is driven as a C++ compiler, so compile the .cu sources as CXX.
    set_property(SOURCE ${GGML_SOURCES_ROCM} PROPERTY LANGUAGE CXX)
    if (WIN32 AND CMAKE_BUILD_TYPE STREQUAL "Debug")
        # CMake on Windows doesn't support the HIP language yet.
        # Therefore we workaround debug build's failure on HIP backend this way.
        set_property(SOURCE ${GGML_SOURCES_ROCM} PROPERTY COMPILE_FLAGS "-O2 -g")
    endif()
    target_link_libraries(ggml-hip PRIVATE hip::device)
else()
    set_property(SOURCE ${GGML_SOURCES_ROCM} PROPERTY LANGUAGE HIP)
endif()
if (GGML_STATIC)
    message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()

# Collect the link dependencies, then link them in a single call.
set(GGML_HIP_LINK_LIBS ggml-base hip::host roc::rocblas roc::hipblas)
if (GGML_HIP_RCCL)
    list(APPEND GGML_HIP_LINK_LIBS roc::rccl)
endif()
target_link_libraries(ggml-hip PRIVATE ${GGML_HIP_LINK_LIBS})