hexagon: optimization for HMX mat_mul (#21554)
* hexagon: add async HMX worker Introduce hmx-worker (dedicated thread for HMX compute) to overlap HMX matmul with HVX dequant/DMA stages in the pipeline path, replacing the previous synchronous HMX calls that blocked the main thread. * hexagon: cost-based VTCM chunk search for out-stationary matmul * hexagon: fix futex race in hmx_worker_drain Store the boolean to local variable avoid atomic load twice * hex-mm: hmx optimize scatter/transpose and use HMX intrinsics * hex-vmem: drop vmem limit a touch under 3GB on v73 * hexagon: add fwd declaration of htp_context * hex-hmx: replace hmx-worker with hmx-queue that mimics dma-queue interface Simplifies the overall implemantion, reduces thread wakeup roundtrips. * hex-mm: add debug log to hmx work func called from hmx-queue * Update hmx-queue.h Co-authored-by: Max Krasnyansky <max.krasnyansky@gmail.com> --------- Co-authored-by: Kim-Chyan Gan <kgan@qti.qualcomm.com> Co-authored-by: Max Krasnyansky <maxk@qti.qualcomm.com> Co-authored-by: Max Krasnyansky <max.krasnyansky@gmail.com>
This commit is contained in:
@@ -31,6 +31,14 @@ static inline uint64_t hex_get_pktcnt() {
|
||||
return pktcnt;
|
||||
}
|
||||
|
||||
static inline uint32_t hex_ceil_pow2(uint32_t x) {
|
||||
if (x <= 1) { return 1; }
|
||||
int p = 2;
|
||||
x--;
|
||||
while (x >>= 1) { p <<= 1; }
|
||||
return p;
|
||||
}
|
||||
|
||||
static inline size_t hmx_ceil_div(size_t num, size_t den) {
|
||||
return (num + den - 1) / den;
|
||||
}
|
||||
@@ -73,8 +81,7 @@ static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride,
|
||||
#define HEX_L2_LINE_SIZE 64
|
||||
#define HEX_L2_FLUSH_SIZE (128 * 1024)
|
||||
|
||||
static inline void hex_l2flush(void * addr, size_t size)
|
||||
{
|
||||
static inline void hex_l2flush(void * addr, size_t size) {
|
||||
if (size > HEX_L2_FLUSH_SIZE) {
|
||||
qurt_mem_cache_clean((qurt_addr_t) 0, 0, QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, QURT_MEM_DCACHE);
|
||||
} else {
|
||||
@@ -89,4 +96,8 @@ static inline void hex_l2flush(void * addr, size_t size)
|
||||
}
|
||||
}
|
||||
|
||||
static inline void hex_pause() {
|
||||
asm volatile(" pause(#255)\n");
|
||||
}
|
||||
|
||||
#endif /* HEX_UTILS_H */
|
||||
|
||||
Reference in New Issue
Block a user