From 3b579882903c577daa1af286a5e0bf5bc122a34d Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 23 Jan 2013 17:05:10 -0800 Subject: [PATCH] i965: Massively simplify the intel_upload implementation. The implementation kept a page-sized area for uploading data, and uploaded chunks from that to a 64kb-sized streamed buffer. This wasted cache footprint (and extra state tracking to do so) when we want to just write our data into the buffer immediately. Instead, build it around an interface like brw_state_batch() that just gets you a pointer to BO memory to upload your stuff immediately. Improves OpenArena on HSW by 1.62209% +/- 0.355299% (n=61) and on BYT by 1.7916% +/- 0.415743% (n=31). v2: Rebase on Mesa master, drop old prototypes. Re-do performance comparison on a kernel that doesn't punish CPU efficiency improvements. Reviewed-by: Kenneth Graunke --- src/mesa/drivers/dri/i965/brw_context.h | 5 +- src/mesa/drivers/dri/i965/brw_draw_upload.c | 10 +- src/mesa/drivers/dri/i965/intel_buffer_objects.h | 21 +-- src/mesa/drivers/dri/i965/intel_upload.c | 167 +++++++++-------------- 4 files changed, 77 insertions(+), 126 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 04af5d04d3a..e1196824571 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -1041,10 +1041,7 @@ struct brw_context struct { drm_intel_bo *bo; - GLuint offset; - uint32_t buffer_len; - uint32_t buffer_offset; - char buffer[4096]; + uint32_t next_offset; } upload; /** diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c index e261163e765..a579025508c 100644 --- a/src/mesa/drivers/dri/i965/brw_draw_upload.c +++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c @@ -381,21 +381,17 @@ copy_array_to_vbo_array(struct brw_context *brw, const unsigned char *src = element->glarray->Ptr + min * src_stride; int count = max - min + 1; GLuint size = count * dst_stride; + uint8_t *dst = intel_upload_space(brw, size, dst_stride, + &buffer->bo, &buffer->offset); if (dst_stride == src_stride) { - intel_upload_data(brw, src, size, dst_stride, - &buffer->bo, &buffer->offset); + memcpy(dst, src, size); } else { - char * const map = intel_upload_map(brw, size, dst_stride); - char *dst = map; - while (count--) { memcpy(dst, src, dst_stride); src += src_stride; dst += dst_stride; } - intel_upload_unmap(brw, map, size, dst_stride, - &buffer->bo, &buffer->offset); } buffer->stride = dst_stride; } diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.h b/src/mesa/drivers/dri/i965/intel_buffer_objects.h index b27d25fbebf..5eaf9dce29f 100644 --- a/src/mesa/drivers/dri/i965/intel_buffer_objects.h +++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.h @@ -90,16 +90,17 @@ drm_intel_bo *intel_bufferobj_buffer(struct brw_context *brw, uint32_t size); void intel_upload_data(struct brw_context *brw, - const void *ptr, GLuint size, GLuint align, - drm_intel_bo **return_bo, - GLuint *return_offset); - -void *intel_upload_map(struct brw_context *brw, - GLuint size, GLuint align); -void intel_upload_unmap(struct brw_context *brw, - const void *ptr, GLuint size, GLuint align, - drm_intel_bo **return_bo, - GLuint *return_offset); + const void *data, + uint32_t size, + uint32_t alignment, + drm_intel_bo **out_bo, + uint32_t *out_offset); + +void *intel_upload_space(struct brw_context *brw, + uint32_t size, + uint32_t alignment, + drm_intel_bo **out_bo, + uint32_t *out_offset); void intel_upload_finish(struct brw_context *brw); diff --git a/src/mesa/drivers/dri/i965/intel_upload.c b/src/mesa/drivers/dri/i965/intel_upload.c index ec3109bd441..bb3f615d987 100644 --- a/src/mesa/drivers/dri/i965/intel_upload.c +++ b/src/mesa/drivers/dri/i965/intel_upload.c @@ -57,127 +57,84 @@ intel_upload_finish(struct brw_context *brw) if (!brw->upload.bo) return; - if (brw->upload.buffer_len) { - drm_intel_bo_subdata(brw->upload.bo, - brw->upload.buffer_offset, - brw->upload.buffer_len, - brw->upload.buffer); - brw->upload.buffer_len = 0; - } - + drm_intel_bo_unmap(brw->upload.bo); drm_intel_bo_unreference(brw->upload.bo); brw->upload.bo = NULL; + brw->upload.next_offset = 0; } -static void -wrap_buffers(struct brw_context *brw, GLuint size) -{ - intel_upload_finish(brw); - - if (size < INTEL_UPLOAD_SIZE) - size = INTEL_UPLOAD_SIZE; - - brw->upload.bo = drm_intel_bo_alloc(brw->bufmgr, "upload", size, 0); - brw->upload.offset = 0; -} - -void -intel_upload_data(struct brw_context *brw, - const void *ptr, GLuint size, GLuint align, - drm_intel_bo **return_bo, - GLuint *return_offset) -{ - GLuint base, delta; - - base = ALIGN_NPOT(brw->upload.offset, align); - if (brw->upload.bo == NULL || base + size > brw->upload.bo->size) { - wrap_buffers(brw, size); - base = 0; - } - - drm_intel_bo_reference(brw->upload.bo); - *return_bo = brw->upload.bo; - *return_offset = base; - - delta = base - brw->upload.offset; - if (brw->upload.buffer_len && - brw->upload.buffer_len + delta + size > sizeof(brw->upload.buffer)) { - drm_intel_bo_subdata(brw->upload.bo, - brw->upload.buffer_offset, - brw->upload.buffer_len, - brw->upload.buffer); - brw->upload.buffer_len = 0; - } - - if (size < sizeof(brw->upload.buffer)) { - if (brw->upload.buffer_len == 0) - brw->upload.buffer_offset = base; - else - brw->upload.buffer_len += delta; - - memcpy(brw->upload.buffer + brw->upload.buffer_len, ptr, size); - brw->upload.buffer_len += size; - } else { - drm_intel_bo_subdata(brw->upload.bo, base, size, ptr); - } - - brw->upload.offset = base + size; -} - +/** + * Interface for getting memory for uploading streamed data to the GPU + * + * In most cases, streamed data (for GPU state structures, for example) is + * uploaded through brw_state_batch(), since that interface allows relocations + * from the streamed space returned to other BOs. However, that interface has + * the restriction that the amount of space allocated has to be "small" (see + * estimated_max_prim_size in brw_draw.c). + * + * This interface, on the other hand, is able to handle arbitrary sized + * allocation requests, though it will batch small allocations into the same + * BO for efficiency and reduced memory footprint. + * + * \note The returned pointer is valid only until intel_upload_finish(), which + * will happen at batch flush or the next + * intel_upload_space()/intel_upload_data(). + * + * \param out_bo Pointer to a BO, which must point to a valid BO or NULL on + * entry, and will have a reference to the new BO containing the state on + * return. + * + * \param out_offset Offset within the buffer object that the data will land. + */ void * -intel_upload_map(struct brw_context *brw, GLuint size, GLuint align) +intel_upload_space(struct brw_context *brw, + uint32_t size, + uint32_t alignment, + drm_intel_bo **out_bo, + uint32_t *out_offset) { - GLuint base, delta; - char *ptr; + uint32_t offset; - base = ALIGN_NPOT(brw->upload.offset, align); - if (brw->upload.bo == NULL || base + size > brw->upload.bo->size) { - wrap_buffers(brw, size); - base = 0; + offset = ALIGN_NPOT(brw->upload.next_offset, alignment); + if (brw->upload.bo && offset + size > brw->upload.bo->size) { + intel_upload_finish(brw); + offset = 0; } - delta = base - brw->upload.offset; - if (brw->upload.buffer_len && - brw->upload.buffer_len + delta + size > sizeof(brw->upload.buffer)) { - drm_intel_bo_subdata(brw->upload.bo, - brw->upload.buffer_offset, - brw->upload.buffer_len, - brw->upload.buffer); - brw->upload.buffer_len = 0; + if (!brw->upload.bo) { + brw->upload.bo = drm_intel_bo_alloc(brw->bufmgr, "streamed data", + MAX2(INTEL_UPLOAD_SIZE, size), 4096); + if (brw->has_llc) + drm_intel_bo_map(brw->upload.bo, true); + else + drm_intel_gem_bo_map_gtt(brw->upload.bo); } - if (size <= sizeof(brw->upload.buffer)) { - if (brw->upload.buffer_len == 0) - brw->upload.buffer_offset = base; - else - brw->upload.buffer_len += delta; + brw->upload.next_offset = offset + size; - ptr = brw->upload.buffer + brw->upload.buffer_len; - brw->upload.buffer_len += size; - } else { - ptr = malloc(size); + *out_offset = offset; + if (*out_bo != brw->upload.bo) { + drm_intel_bo_unreference(*out_bo); + *out_bo = brw->upload.bo; + drm_intel_bo_reference(brw->upload.bo); } - return ptr; + return brw->upload.bo->virtual + offset; } +/** + * Handy interface to upload some data to temporary GPU memory quickly. + * + * References to this memory should not be retained across batch flushes. + */ void -intel_upload_unmap(struct brw_context *brw, - const void *ptr, GLuint size, GLuint align, - drm_intel_bo **return_bo, - GLuint *return_offset) +intel_upload_data(struct brw_context *brw, + const void *data, + uint32_t size, + uint32_t alignment, + drm_intel_bo **out_bo, + uint32_t *out_offset) { - GLuint base; - - base = ALIGN_NPOT(brw->upload.offset, align); - if (size > sizeof(brw->upload.buffer)) { - drm_intel_bo_subdata(brw->upload.bo, base, size, ptr); - free((void*)ptr); - } - - drm_intel_bo_reference(brw->upload.bo); - *return_bo = brw->upload.bo; - *return_offset = base; - - brw->upload.offset = base + size; + void *dst = intel_upload_space(brw, size, alignment, out_bo, out_offset); + memcpy(dst, data, size); } -- 2.11.0