ggml webgpu: add support for emscripten builds (#17184)
* Faster tensors (#8) Add fast matrix and matrix/vector multiplication. * Use map for shader replacements instead of pair of strings * Wasm (#9) * webgpu : fix build on emscripten * more debugging stuff * test-backend-ops: force single thread on wasm * fix single-thread case for init_tensor_uniform * use jspi * add pthread * test: remember to set n_thread for cpu backend * Add buffer label and enable dawn-specific toggles to turn off some checks * Intermediate state * Fast working f16/f32 vec4 * Working float fast mul mat * Clean up naming of mul_mat to match logical model, start work on q mul_mat * Setup for subgroup matrix mat mul * Basic working subgroup matrix * Working subgroup matrix tiling * Handle weirder sg matrix sizes (but still % sg matrix size) * Working start to gemv * working f16 accumulation with shared memory staging * Print out available subgroup matrix configurations * Vectorize dst stores for sg matrix shader * Gemv working scalar * Minor set_rows optimization (#4) * updated optimization, fixed errors * non vectorized version now dispatches one thread per element * Simplify * Change logic for set_rows pipelines --------- Co-authored-by: Neha Abbas <nehaabbas@macbookpro.lan> Co-authored-by: Neha Abbas <nehaabbas@ReeseLevines-MacBook-Pro.local> Co-authored-by: Reese Levine <reeselevine1@gmail.com> * Comment on dawn toggles * Working subgroup matrix code for (semi)generic sizes * Remove some comments * Cleanup code * Update dawn version and move to portable subgroup size * Try to fix new dawn release * Update subgroup size comment * Only check for subgroup matrix configs if they are supported * Add toggles for subgroup matrix/f16 support on nvidia+vulkan * Make row/col naming consistent * Refactor shared memory loading * Move sg matrix stores to correct file * Working q4_0 * Formatting * Work with emscripten builds * Fix test-backend-ops emscripten for f16/quantized types * Use emscripten memory64 to support get_memory * Add build flags and try ci --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co> * Remove extra whitespace * Move wasm single-thread logic out of test-backend-ops for cpu backend * Disable multiple threads for emscripten single-thread builds in ggml_graph_plan * Fix .gitignore * Add memory64 option and remove unneeded macros for setting threads to 1 --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
This commit is contained in:
+38
-22
@@ -41,12 +41,18 @@
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
#ifdef __EMSCRIPTEN__
|
||||
# define N_THREADS 1
|
||||
#else
|
||||
# define N_THREADS std::thread::hardware_concurrency()
|
||||
#endif
|
||||
|
||||
static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
|
||||
size_t nels = ggml_nelements(tensor);
|
||||
std::vector<float> data(nels);
|
||||
{
|
||||
// parallel initialization
|
||||
static const size_t n_threads = std::thread::hardware_concurrency();
|
||||
static const size_t n_threads = N_THREADS;
|
||||
// static RNG initialization (revisit if n_threads stops being constant)
|
||||
static std::vector<std::default_random_engine> generators = []() {
|
||||
std::random_device rd;
|
||||
@@ -65,15 +71,19 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<std::future<void>> tasks;
|
||||
tasks.reserve(n_threads);
|
||||
for (size_t i = 0; i < n_threads; i++) {
|
||||
size_t start = i*nels/n_threads;
|
||||
size_t end = (i+1)*nels/n_threads;
|
||||
tasks.push_back(std::async(std::launch::async, init_thread, i, start, end));
|
||||
}
|
||||
for (auto & t : tasks) {
|
||||
t.get();
|
||||
if (n_threads == 1) {
|
||||
init_thread(0, 0, nels);
|
||||
} else {
|
||||
std::vector<std::future<void>> tasks;
|
||||
tasks.reserve(n_threads);
|
||||
for (size_t i = 0; i < n_threads; i++) {
|
||||
size_t start = i*nels/n_threads;
|
||||
size_t end = (i+1)*nels/n_threads;
|
||||
tasks.push_back(std::async(std::launch::async, init_thread, i, start, end));
|
||||
}
|
||||
for (auto & t : tasks) {
|
||||
t.get();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -105,17 +115,23 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
||||
};
|
||||
|
||||
const size_t min_blocks_per_thread = 1;
|
||||
const size_t n_threads = std::min<size_t>(std::thread::hardware_concurrency()/2,
|
||||
std::max<size_t>(1, n_blocks / min_blocks_per_thread));
|
||||
std::vector<std::future<void>> tasks;
|
||||
tasks.reserve(n_threads);
|
||||
for (size_t i = 0; i < n_threads; i++) {
|
||||
size_t start = i*n_blocks/n_threads;
|
||||
size_t end = (i+1)*n_blocks/n_threads;
|
||||
tasks.push_back(std::async(std::launch::async, quantize_thread, start, end));
|
||||
}
|
||||
for (auto & t : tasks) {
|
||||
t.get();
|
||||
const size_t n_quant_threads = std::min<size_t>(std::max<size_t>(N_THREADS/2, 1),
|
||||
std::max<size_t>(1, n_blocks / min_blocks_per_thread));
|
||||
|
||||
if (n_quant_threads == 1) {
|
||||
// single-threaded quantization: do all blocks in the current thread
|
||||
quantize_thread(0, n_blocks);
|
||||
} else {
|
||||
std::vector<std::future<void>> tasks;
|
||||
tasks.reserve(n_quant_threads);
|
||||
for (size_t i = 0; i < n_quant_threads; i++) {
|
||||
size_t start = i*n_blocks/n_quant_threads;
|
||||
size_t end = (i+1)*n_blocks/n_quant_threads;
|
||||
tasks.push_back(std::async(std::launch::async, quantize_thread, start, end));
|
||||
}
|
||||
for (auto & t : tasks) {
|
||||
t.get();
|
||||
}
|
||||
}
|
||||
}
|
||||
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
|
||||
@@ -8363,7 +8379,7 @@ int main(int argc, char ** argv) {
|
||||
auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
|
||||
if (ggml_backend_set_n_threads_fn) {
|
||||
// TODO: better value for n_threads
|
||||
ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency());
|
||||
ggml_backend_set_n_threads_fn(backend, N_THREADS);
|
||||
}
|
||||
|
||||
size_t free, total; // NOLINT
|
||||
|
||||
Reference in New Issue
Block a user