5d2b52d80d
* hexagon: restore HTP_OPMASK_QUEUE * hexagon: honor OPMASK_SKIP_COMPUTE in hmx-matmul * hex-prof: restore op profiling * hex-prof: enable PMU * hexagon: simplify and improve op-queuing with full profiling support Add separate profile descriptors. * hexagon: remove opsync and rename opmask into opstage opsync is no longer needed since the profiler is fully async now. opmask name was confusing and opstage is more accurate. * hexagon: refactor opbatch queue handling * hexagon: add iface hooks for enabling profiler from the host Also move all the PMU setup stuff out of the hex-utils since it's not inteded for normal use. * hexagon: make profiler mode configurable On older devices getting PMU counters is expensive so it's now optional. * hexagon: add support for setting profiler pmu events from env * hexagon: simplify profiler output (no need to print buffs, etc) * hexagon: simplify pmu counter formating * hexagon: add a simple profile post-proc tool * hex-prof: add support for reading logs from stdin * hexagon: document GGML_HEXAGON_PROFILE * hex-prof: update default width for dims field * hex-prof: fix linter warnings and errors * Update ggml/src/ggml-hexagon/htp/htp-ops.h Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update scripts/snapdragon/ggml-hexagon-profile.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Trivikram Reddy <tamarnat@qti.qualcomm.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
132 lines
3.6 KiB
C
132 lines
3.6 KiB
C
#ifndef HEX_UTILS_H
|
|
#define HEX_UTILS_H
|
|
|
|
#include <stdbool.h>
|
|
#include <stdint.h>
|
|
#include <qurt_memory.h>
|
|
#include <qurt.h>
|
|
|
|
#include "hexagon_types.h"
|
|
#include "hexagon_protos.h"
|
|
|
|
#include "hex-fastdiv.h"
|
|
#include "hex-dump.h"
|
|
|
|
// Generic max/min macros.
// NOTE: each argument is evaluated twice — do not pass expressions with
// side effects (e.g. MAX(i++, j)).
#ifndef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif

#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
|
|
|
|
// Read the Hexagon 64-bit cycle counter from control register pair c15:14.
// Cheap user-mode read; suitable for profiling timestamps.
static inline uint64_t hex_get_cycles() {
    uint64_t cycles = 0;
    asm volatile(" %0 = c15:14\n" : "=r"(cycles));
    return cycles;
}
|
|
|
|
// Read the Hexagon 64-bit executed-packet counter from control register
// pair c19:18.
static inline uint64_t hex_get_pktcnt() {
    uint64_t pktcnt;
    asm volatile(" %0 = c19:18\n" : "=r"(pktcnt));
    return pktcnt;
}
|
|
|
|
// Round x up to the nearest power of two; returns 1 for x <= 1.
// For x > 2^31 the true result (2^32) is not representable and the
// returned value wraps to 0 (unsigned shift, well-defined).
static inline uint32_t hex_ceil_pow2(uint32_t x) {
    if (x <= 1) {
        return 1;
    }
    // p must be unsigned: with the original 'int p', reaching 2^31 via
    // '2 << 30' is signed-overflow UB. uint32_t shifts are well-defined.
    uint32_t p = 2;
    x--;
    while (x >>= 1) {
        p <<= 1;
    }
    return p;
}
|
|
|
|
// Integer ceiling division: smallest q with q * den >= num.
// den must be non-zero.
// Implemented as div + remainder check rather than (num + den - 1) / den,
// which wraps (and returns a too-small result) when num is near SIZE_MAX.
static inline size_t hmx_ceil_div(size_t num, size_t den) {
    return num / den + (num % den != 0);
}
|
|
|
|
// Return non-zero when addr is aligned to 'align' bytes.
// align must be a power of two.
static inline int32_t hex_is_aligned(const void * addr, uint32_t align) {
    const uint32_t mask = align - 1;
    const size_t a = (size_t) addr;
    return (a & mask) == 0;
}
|
|
|
|
// Round v up to the next multiple of 'align' (v unchanged if already a
// multiple). align must be non-zero.
static inline size_t hex_align_up(size_t v, size_t align) {
    const size_t blocks = (v + align - 1) / align;
    return blocks * align;
}
|
|
|
|
// Round v down to the previous multiple of 'align' (v unchanged if
// already a multiple). align must be non-zero.
static inline size_t hex_align_down(size_t v, size_t align) {
    const size_t rem = v % align;
    return v - rem;
}
|
|
|
|
// Return non-zero when the n-byte region starting at addr lies entirely
// within a single chunk_size-aligned chunk (no chunk-boundary crossing).
// chunk_size must be a power of two.
static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
    const uint32_t start = (size_t) addr & (chunk_size - 1);
    const uint32_t end = start + n;
    return end <= chunk_size;
}
|
|
|
|
// Round n up to the next multiple of m (n unchanged if already a
// multiple). m must be non-zero; m need not be a power of two.
static inline uint32_t hex_round_up(uint32_t n, uint32_t m) {
    const uint32_t blocks = (n + m - 1) / m;
    return blocks * m;
}
|
|
|
|
// Return the smaller of two size_t values.
static inline size_t hex_smin(size_t a, size_t b) {
    if (b < a) {
        return b;
    }
    return a;
}
|
|
|
|
// Return the larger of two size_t values.
static inline size_t hex_smax(size_t a, size_t b) {
    if (b > a) {
        return b;
    }
    return a;
}
|
|
|
|
// Issue a Hexagon l2fetch prefetch for a 2-D region starting at p:
// 'height' rows of 'width' bytes, consecutive rows 'stride' bytes apart.
// The 64-bit control word packs stride in the upper word and the
// (width, height) halfword pair in the lower word, as the l2fetch
// instruction expects. Non-blocking hint; safe on any address.
static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride, uint32_t height) {
    const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height));
    Q6_l2fetch_AP((void *) p, control);
}
|
|
|
|
// L2/data-cache line size in bytes.
#define HEX_L2_LINE_SIZE 64
// Ranges larger than this are flushed via a whole-dcache clean instead of
// walking individual lines (see hex_l2flush below).
#define HEX_L2_FLUSH_SIZE (128 * 1024)
|
// Clean + invalidate [addr, addr + size) in the data cache.
// Large ranges (> HEX_L2_FLUSH_SIZE) flush the entire dcache through QURT,
// which is cheaper than walking many thousands of lines; smaller ranges are
// walked explicitly, 4 cache lines per loop iteration.
// NOTE(review): the unrolled walk may clean up to 3 lines past the end of
// the range when size is not a multiple of 4 lines — extra work, but
// harmless for a clean+invalidate.
static inline void hex_l2flush(void * addr, size_t size) {
    if (size > HEX_L2_FLUSH_SIZE) {
        // addr 0 / size 0 with FLUSH_INVALIDATE_ALL means "whole dcache".
        qurt_mem_cache_clean((qurt_addr_t) 0, 0, QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, QURT_MEM_DCACHE);
    } else {
        const uint32_t s = (uint32_t) addr;
        const uint32_t e = s + size;
        for (uint32_t i = s; i < e; i += HEX_L2_LINE_SIZE * 4) {
            // (void *) i + N is byte-granular void* arithmetic (GCC extension).
            Q6_dccleaninva_A((void *) i + HEX_L2_LINE_SIZE * 0);
            Q6_dccleaninva_A((void *) i + HEX_L2_LINE_SIZE * 1);
            Q6_dccleaninva_A((void *) i + HEX_L2_LINE_SIZE * 2);
            Q6_dccleaninva_A((void *) i + HEX_L2_LINE_SIZE * 3);
        }
    }
}
|
|
|
|
// Spin-wait hint: the Hexagon pause instruction stalls this hardware
// thread for up to the given cycle count (255, the maximum immediate),
// reducing contention in busy-wait loops.
static inline void hex_pause() {
    asm volatile(" pause(#255)\n");
}
|
|
|
|
// Number of PMU counters read by hex_get_pmu(); overridable at build time.
#ifndef HEX_NUM_PMU_COUNTERS
#define HEX_NUM_PMU_COUNTERS 8
#endif
|
|
|
|
// Snapshot the 8 PMU counters into counters[0..7].
// counters must hold at least HEX_NUM_PMU_COUNTERS (8) elements.
// On HVX v79+ the user-mode upmucnt0..7 registers can be read directly
// with one asm read each; older architectures fall back to per-counter
// QURT reads (more expensive — see the "profiler mode configurable" note
// in this module's history).
static inline void hex_get_pmu(uint32_t counters[]) {
#if __HVX_ARCH__ >= 79
    asm volatile("%0 = upmucnt0" : "=r"(counters[0]));
    asm volatile("%0 = upmucnt1" : "=r"(counters[1]));
    asm volatile("%0 = upmucnt2" : "=r"(counters[2]));
    asm volatile("%0 = upmucnt3" : "=r"(counters[3]));
    asm volatile("%0 = upmucnt4" : "=r"(counters[4]));
    asm volatile("%0 = upmucnt5" : "=r"(counters[5]));
    asm volatile("%0 = upmucnt6" : "=r"(counters[6]));
    asm volatile("%0 = upmucnt7" : "=r"(counters[7]));
#else
    counters[0] = qurt_pmu_get(QURT_PMUCNT0);
    counters[1] = qurt_pmu_get(QURT_PMUCNT1);
    counters[2] = qurt_pmu_get(QURT_PMUCNT2);
    counters[3] = qurt_pmu_get(QURT_PMUCNT3);
    counters[4] = qurt_pmu_get(QURT_PMUCNT4);
    counters[5] = qurt_pmu_get(QURT_PMUCNT5);
    counters[6] = qurt_pmu_get(QURT_PMUCNT6);
    counters[7] = qurt_pmu_get(QURT_PMUCNT7);
    // Bulk read alternative, kept for reference:
    // qurt_pmu_get_pmucnt(counters);
#endif
}
|
|
|
|
#endif /* HEX_UTILS_H */
|