Fix recurrent state serialization for partial reads and writes (#22362)
The previous code handled only full-tensor reads and writes, and hit the `GGML_ASSERT(size == ggml_nbytes(tensor));` assertion on partial accesses when tested with llama-server.
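The fix maps the requested byte range onto whole rows along the split axis instead of assuming the full tensor. A minimal standalone sketch of that arithmetic (the helper below is illustrative, not part of the patch):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Map a byte range [offset, offset + size) onto whole rows, given the byte
    // stride between consecutive rows along the split axis. This mirrors the
    // r_start/r_count computation introduced by this commit.
    struct row_range { int64_t start; int64_t count; };

    static row_range byte_range_to_rows(size_t offset, size_t size, size_t row_stride, int64_t n_rows) {
        assert(offset % row_stride == 0); // partial I/O must start on a row boundary
        assert(size % row_stride == 0);   // ... and span a whole number of rows
        const int64_t r_start = (int64_t) (offset / row_stride);
        const int64_t r_count = (int64_t) (size / row_stride);
        assert(r_start + r_count <= n_rows);
        return { r_start, r_count };
    }

For example, with 8 rows of 4096 bytes each, a request with offset == 8192 and size == 4096 maps to r_start == 2 and r_count == 1, i.e. a single-row access.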
@@ -1205,40 +1205,57 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
 
     if (split_state.n_segments != 1) {
         GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
-        GGML_ASSERT(offset == 0);
-        GGML_ASSERT(size == ggml_nbytes(tensor));
         GGML_ASSERT(tensor->ne[3] == 1);
 
         size_t offset_data = 0;
         std::vector<size_t> simple_offsets(n_bufs, 0);
         if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
             GGML_ASSERT(tensor->ne[2] == 1);
 
+            const size_t row_stride = tensor->nb[1];
+            GGML_ASSERT(offset % row_stride == 0);
+            GGML_ASSERT(size % row_stride == 0);
+            const int64_t r_start = offset / row_stride;
+            const int64_t r_count = size / row_stride;
+            GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
+
             const int64_t blck_size = ggml_blck_size(tensor->type);
             for (size_t s = 0; s < split_state.n_segments; s++) {
                 for (size_t j = 0; j < n_bufs; j++) {
                     ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                     GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
                     const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
-                    ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, simple_offsets[j], nbytes,
-                        tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
+                    ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
+                        simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
+                        r_count, simple_tensor->nb[1], tensor->nb[1]);
                     offset_data += nbytes;
                     simple_offsets[j] += nbytes;
                 }
             }
-            GGML_ASSERT(offset_data*tensor->ne[1] == size);
+            GGML_ASSERT(offset_data*r_count == size);
             return;
         }
         GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
 
+        const size_t row_stride = tensor->nb[2];
+        GGML_ASSERT(offset % row_stride == 0);
+        GGML_ASSERT(size % row_stride == 0);
+        const int64_t r_start = offset / row_stride;
+        const int64_t r_count = size / row_stride;
+        GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
+
         for (size_t s = 0; s < split_state.n_segments; s++) {
             for (size_t j = 0; j < n_bufs; j++) {
                 ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                 const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
-                ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, simple_offsets[j], nbytes,
-                    tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
+                ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
+                    simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
+                    r_count, simple_tensor->nb[2], tensor->nb[2]);
                 offset_data += nbytes;
                 simple_offsets[j] += nbytes;
             }
         }
-        GGML_ASSERT(offset_data*tensor->ne[2] == size);
+        GGML_ASSERT(offset_data*r_count == size);
         return;
     }
 
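Note the axis-0 branch converts segment widths from elements to bytes through the quantization block size before computing nbytes. A worked example of that conversion (the concrete values assume GGML_TYPE_Q8_0, i.e. 32-element blocks stored in 34 bytes; for an F32 state blck_size == 1 and nb[0] == 4):

    // Per-row byte width of one segment on one buffer, axis 0.
    const int64_t blck_size = 32;   // ggml_blck_size(GGML_TYPE_Q8_0)
    const size_t  nb0       = 34;   // tensor->nb[0]: bytes per block
    const int64_t ne_seg    = 4096; // split_state.ne[s*n_bufs + j]: elements in this segment
    const size_t  nbytes    = ne_seg/blck_size * nb0; // 4096/32 * 34 = 4352 bytes per row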
@@ -1295,40 +1312,57 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
 
     if (split_state.n_segments != 1) {
         GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
-        GGML_ASSERT(offset == 0);
-        GGML_ASSERT(size == ggml_nbytes(tensor));
         GGML_ASSERT(tensor->ne[3] == 1);
 
         size_t offset_data = 0;
         std::vector<size_t> simple_offsets(n_bufs, 0);
         if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
             GGML_ASSERT(tensor->ne[2] == 1);
 
+            const size_t row_stride = tensor->nb[1];
+            GGML_ASSERT(offset % row_stride == 0);
+            GGML_ASSERT(size % row_stride == 0);
+            const int64_t r_start = offset / row_stride;
+            const int64_t r_count = size / row_stride;
+            GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
+
             const int64_t blck_size = ggml_blck_size(tensor->type);
             for (size_t s = 0; s < split_state.n_segments; s++) {
                 for (size_t j = 0; j < n_bufs; j++) {
                     const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                     GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
                     const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
-                    ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
-                        tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
+                    ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
+                        simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
+                        r_count, simple_tensor->nb[1], tensor->nb[1]);
                     offset_data += nbytes;
                     simple_offsets[j] += nbytes;
                 }
             }
-            GGML_ASSERT(offset_data*tensor->ne[1] == size);
+            GGML_ASSERT(offset_data*r_count == size);
             return;
         }
         GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
 
+        const size_t row_stride = tensor->nb[2];
+        GGML_ASSERT(offset % row_stride == 0);
+        GGML_ASSERT(size % row_stride == 0);
+        const int64_t r_start = offset / row_stride;
+        const int64_t r_count = size / row_stride;
+        GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
+
         for (size_t s = 0; s < split_state.n_segments; s++) {
             for (size_t j = 0; j < n_bufs; j++) {
                 const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                 const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
-                ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
-                    tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
+                ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
+                    simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
+                    r_count, simple_tensor->nb[2], tensor->nb[2]);
                 offset_data += nbytes;
                 simple_offsets[j] += nbytes;
             }
         }
-        GGML_ASSERT(offset_data*tensor->ne[2] == size);
+        GGML_ASSERT(offset_data*r_count == size);
         return;
     }
 
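For context, this is the call pattern that previously tripped the assertion: saving and restoring per-slot state touches only one sequence's slice of each recurrent state tensor. A hedged sketch of such a partial round-trip through the public backend API (r_tensor, seq_id, and buf are illustrative names, not taken from the patch):

    // Assumes each slot along dim 1 of r_tensor holds one sequence's recurrent state.
    const size_t row_size = r_tensor->nb[1]; // bytes per sequence slot
    std::vector<uint8_t> buf(row_size);

    // Partial read: offset != 0 and size != ggml_nbytes(r_tensor), which the old
    // code rejected with GGML_ASSERT(size == ggml_nbytes(tensor)).
    ggml_backend_tensor_get(r_tensor, buf.data(), seq_id * row_size, row_size);

    // ... later, restore the same slice with a partial write.
    ggml_backend_tensor_set(r_tensor, buf.data(), seq_id * row_size, row_size);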