/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "util/mesa-sha1.h"
#include "util/debug.h"
#include "anv_private.h"
/* Remaining work:
 *
 * - Compact binding table layout so it's tight and not dependent on
 *   descriptor set layout.
 *
 * - Review the prog_data struct for size and cacheability: struct
 *   brw_stage_prog_data has a binding_table that uses uint32_t for many
 *   8-bit quantities; param, pull_param, and image_params are pointers and
 *   we only need the compaction map; use bit fields for all bools, e.g.
 *   dual_src_blend.
 */
void
anv_pipeline_cache_init(struct anv_pipeline_cache *cache,
                        struct anv_device *device)
{
   cache->device = device;
   anv_state_stream_init(&cache->program_stream,
                         &device->instruction_block_pool);
   pthread_mutex_init(&cache->mutex, NULL);

   cache->kernel_count = 0;
   cache->total_size = 0;
   cache->table_size = 1024;
   const size_t byte_size = cache->table_size * sizeof(cache->hash_table[0]);
   cache->hash_table = malloc(byte_size);

   /* We don't consider allocation failure fatal, we just start with a 0-sized
    * cache.  Disabling the cache via ANV_ENABLE_PIPELINE_CACHE=false takes
    * the same path: with a table size of 0, every lookup misses.
    */
   if (cache->hash_table == NULL ||
       !env_var_as_boolean("ANV_ENABLE_PIPELINE_CACHE", true))
      cache->table_size = 0;
   else
      memset(cache->hash_table, 0xff, byte_size);
}
void
anv_pipeline_cache_finish(struct anv_pipeline_cache *cache)
{
   anv_state_stream_finish(&cache->program_stream);
   pthread_mutex_destroy(&cache->mutex);
   free(cache->hash_table);
}
struct cache_entry {
   unsigned char sha1[20];
   uint32_t prog_data_size;
   uint32_t kernel_size;
   uint32_t surface_count;
   uint32_t sampler_count;
   uint32_t image_count;

   char prog_data[0];

   /* kernel follows prog_data at the next 64-byte aligned address */
};
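/* For illustration, one serialized entry in the program stream is laid out
 * as below (the 40-byte fixed header is the 20-byte SHA-1 plus the five
 * uint32_t fields above; exact padding depends on the sizes involved):
 *
 *    entry_offset:                        struct cache_entry (40 bytes)
 *    + sizeof(*entry):                    prog_data (prog_data_size bytes)
 *    + prog_data_size:                    surface bindings, sampler bindings
 *    + align_u32(entry_size(entry), 64):  kernel (kernel_size bytes)
 */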
static uint32_t
entry_size(struct cache_entry *entry)
{
   /* This returns the number of bytes needed to serialize an entry, which
    * doesn't include the alignment padding bytes.
    */

   const uint32_t map_size =
      entry->surface_count * sizeof(struct anv_pipeline_binding) +
      entry->sampler_count * sizeof(struct anv_pipeline_binding);

   return sizeof(*entry) + entry->prog_data_size + map_size;
}
void
anv_hash_shader(unsigned char *hash, const void *key, size_t key_size,
                struct anv_shader_module *module,
                const char *entrypoint,
                const VkSpecializationInfo *spec_info)
{
   struct mesa_sha1 *ctx;

   ctx = _mesa_sha1_init();
   _mesa_sha1_update(ctx, key, key_size);
   _mesa_sha1_update(ctx, module->sha1, sizeof(module->sha1));
   _mesa_sha1_update(ctx, entrypoint, strlen(entrypoint));
   /* TODO: hash in the shader stage and pipeline layout? */
   if (spec_info) {
      _mesa_sha1_update(ctx, spec_info->pMapEntries,
                        spec_info->mapEntryCount * sizeof spec_info->pMapEntries[0]);
      _mesa_sha1_update(ctx, spec_info->pData, spec_info->dataSize);
   }
   _mesa_sha1_final(ctx, hash);
}
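/* A minimal usage sketch (hypothetical caller; the real callers are the
 * pipeline compile paths): hash the compile inputs, then probe the cache
 * before running the compiler.
 *
 *    unsigned char sha1[20];
 *    anv_hash_shader(sha1, &key, sizeof(key), module, "main", spec_info);
 *    uint32_t kernel =
 *       anv_pipeline_cache_search(cache, sha1, &stage_prog_data, &map);
 *    if (kernel == NO_KERNEL)
 *       ... compile, then call anv_pipeline_cache_upload_kernel() ...
 */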
static uint32_t
anv_pipeline_cache_search_unlocked(struct anv_pipeline_cache *cache,
                                   const unsigned char *sha1,
                                   const struct brw_stage_prog_data **prog_data,
                                   struct anv_pipeline_bind_map *map)
{
   const uint32_t mask = cache->table_size - 1;
   const uint32_t start = (*(uint32_t *) sha1);

   /* Open addressing with linear probing: start at the slot picked by the
    * first 32 bits of the SHA-1 and scan forward until we hit the entry or
    * an empty slot.
    */
   for (uint32_t i = 0; i < cache->table_size; i++) {
      const uint32_t index = (start + i) & mask;
      const uint32_t offset = cache->hash_table[index];

      if (offset == ~0)
         return NO_KERNEL;

      struct cache_entry *entry =
         cache->program_stream.block_pool->map + offset;
      if (memcmp(entry->sha1, sha1, sizeof(entry->sha1)) == 0) {
         if (prog_data) {
            assert(map);
            void *p = entry->prog_data;
            *prog_data = p;
            p += entry->prog_data_size;
            map->surface_count = entry->surface_count;
            map->sampler_count = entry->sampler_count;
            map->image_count = entry->image_count;
            map->surface_to_descriptor = p;
            p += map->surface_count * sizeof(struct anv_pipeline_binding);
            map->sampler_to_descriptor = p;
         }

         return offset + align_u32(entry_size(entry), 64);
      }
   }

   /* This can happen if the pipeline cache is disabled via
    * ANV_ENABLE_PIPELINE_CACHE=false, in which case table_size is 0 and the
    * loop above never runs.
    */
   return NO_KERNEL;
}
uint32_t
anv_pipeline_cache_search(struct anv_pipeline_cache *cache,
                          const unsigned char *sha1,
                          const struct brw_stage_prog_data **prog_data,
                          struct anv_pipeline_bind_map *map)
{
   uint32_t kernel;

   pthread_mutex_lock(&cache->mutex);

   kernel = anv_pipeline_cache_search_unlocked(cache, sha1, prog_data, map);

   pthread_mutex_unlock(&cache->mutex);

   return kernel;
}
static void
anv_pipeline_cache_set_entry(struct anv_pipeline_cache *cache,
                             struct cache_entry *entry, uint32_t entry_offset)
{
   const uint32_t mask = cache->table_size - 1;
   const uint32_t start = (*(uint32_t *) entry->sha1);

   /* We'll always be able to insert when we get here: the table is kept at
    * most half full, so linear probing must reach an empty slot.
    */
   assert(cache->kernel_count < cache->table_size / 2);

   for (uint32_t i = 0; i < cache->table_size; i++) {
      const uint32_t index = (start + i) & mask;
      if (cache->hash_table[index] == ~0) {
         cache->hash_table[index] = entry_offset;
         break;
      }
   }

   cache->total_size += entry_size(entry) + entry->kernel_size;
   cache->kernel_count++;
}
static VkResult
anv_pipeline_cache_grow(struct anv_pipeline_cache *cache)
{
   const uint32_t table_size = cache->table_size * 2;
   const uint32_t old_table_size = cache->table_size;
   const size_t byte_size = table_size * sizeof(cache->hash_table[0]);
   uint32_t *table;
   uint32_t *old_table = cache->hash_table;

   table = malloc(byte_size);
   if (table == NULL)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   cache->hash_table = table;
   cache->table_size = table_size;
   cache->kernel_count = 0;
   cache->total_size = 0;

   /* Rehash: re-inserting every live entry from the old table also rebuilds
    * kernel_count and total_size.
    */
   memset(cache->hash_table, 0xff, byte_size);
   for (uint32_t i = 0; i < old_table_size; i++) {
      const uint32_t offset = old_table[i];
      if (offset == ~0)
         continue;

      struct cache_entry *entry =
         cache->program_stream.block_pool->map + offset;
      anv_pipeline_cache_set_entry(cache, entry, offset);
   }

   free(old_table);

   return VK_SUCCESS;
}
static void
anv_pipeline_cache_add_entry(struct anv_pipeline_cache *cache,
                             struct cache_entry *entry, uint32_t entry_offset)
{
   if (cache->kernel_count == cache->table_size / 2)
      anv_pipeline_cache_grow(cache);

   /* Failing to grow the hash table isn't fatal, but may mean we don't
    * have enough space to add this new kernel.  Only add it if there's room.
    */
   if (cache->kernel_count < cache->table_size / 2)
      anv_pipeline_cache_set_entry(cache, entry, entry_offset);
}
uint32_t
anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
                                 const unsigned char *sha1,
                                 const void *kernel, size_t kernel_size,
                                 const struct brw_stage_prog_data **prog_data,
                                 size_t prog_data_size,
                                 struct anv_pipeline_bind_map *map)
{
   pthread_mutex_lock(&cache->mutex);

   /* Before uploading, check again that another thread didn't upload this
    * shader while we were compiling it.
    */
   if (sha1) {
      uint32_t cached_kernel =
         anv_pipeline_cache_search_unlocked(cache, sha1, prog_data, map);
      if (cached_kernel != NO_KERNEL) {
         pthread_mutex_unlock(&cache->mutex);
         return cached_kernel;
      }
   }

   struct cache_entry *entry;

   const uint32_t map_size =
      map->surface_count * sizeof(struct anv_pipeline_binding) +
      map->sampler_count * sizeof(struct anv_pipeline_binding);

   const uint32_t preamble_size =
      align_u32(sizeof(*entry) + prog_data_size + map_size, 64);

   const uint32_t size = preamble_size + kernel_size;

   assert(size < cache->program_stream.block_pool->block_size);
   const struct anv_state state =
      anv_state_stream_alloc(&cache->program_stream, size, 64);

   entry = state.map;
   entry->prog_data_size = prog_data_size;
   entry->surface_count = map->surface_count;
   entry->sampler_count = map->sampler_count;
   entry->image_count = map->image_count;
   entry->kernel_size = kernel_size;

   /* Serialize prog_data and the two binding maps right after the entry
    * header, then point the caller's map at the cached copies.
    */
   void *p = entry->prog_data;
   memcpy(p, *prog_data, prog_data_size);
   p += prog_data_size;

   memcpy(p, map->surface_to_descriptor,
          map->surface_count * sizeof(struct anv_pipeline_binding));
   map->surface_to_descriptor = p;
   p += map->surface_count * sizeof(struct anv_pipeline_binding);

   memcpy(p, map->sampler_to_descriptor,
          map->sampler_count * sizeof(struct anv_pipeline_binding));
   map->sampler_to_descriptor = p;

   if (sha1) {
      assert(anv_pipeline_cache_search_unlocked(cache, sha1,
                                                NULL, NULL) == NO_KERNEL);

      memcpy(entry->sha1, sha1, sizeof(entry->sha1));
      anv_pipeline_cache_add_entry(cache, entry, state.offset);
   }

   pthread_mutex_unlock(&cache->mutex);

   memcpy(state.map + preamble_size, kernel, kernel_size);

   if (!cache->device->info.has_llc)
      anv_state_clflush(state);

   *prog_data = (const struct brw_stage_prog_data *) entry->prog_data;

   return state.offset + preamble_size;
}
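/* Sketch of the cache-miss path (`code`, `code_size` and `prog_data_size`
 * are placeholders for the compiler's outputs, not this file's locals).
 * The return value is the offset of the kernel itself, just past the
 * 64-byte-aligned preamble, i.e. the same value a cache hit returns:
 *
 *    if (kernel == NO_KERNEL) {
 *       kernel = anv_pipeline_cache_upload_kernel(cache, sha1,
 *                                                 code, code_size,
 *                                                 &stage_prog_data,
 *                                                 prog_data_size, &map);
 *    }
 */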
struct cache_header {
   uint32_t header_size;
   uint32_t header_version;
   uint32_t vendor_id;
   uint32_t device_id;
   uint8_t  uuid[VK_UUID_SIZE];
};
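/* Note: this matches the layout the Vulkan spec requires for the first
 * bytes returned by vkGetPipelineCacheData: header length, header version,
 * vendor ID, device ID (each a little-endian uint32_t), then the
 * VK_UUID_SIZE-byte pipeline cache UUID.
 */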
static void
anv_pipeline_cache_load(struct anv_pipeline_cache *cache,
                        const void *data, size_t size)
{
   struct anv_device *device = cache->device;
   struct cache_header header;
   uint8_t uuid[VK_UUID_SIZE];

   if (size < sizeof(header))
      return;

   memcpy(&header, data, sizeof(header));
   if (header.header_size < sizeof(header))
      return;
   if (header.header_version != VK_PIPELINE_CACHE_HEADER_VERSION_ONE)
      return;
   if (header.vendor_id != 0x8086)
      return;
   if (header.device_id != device->chipset_id)
      return;
   anv_device_get_cache_uuid(uuid);
   if (memcmp(header.uuid, uuid, VK_UUID_SIZE) != 0)
      return;

   void *end = (void *) data + size;
   void *p = (void *) data + header.header_size;

   /* Walk the serialized entries and re-upload each one; upload copies the
    * kernel into the program stream and hashes the entry back in.
    */
   while (p < end) {
      struct cache_entry *entry = p;

      void *data = entry->prog_data;
      const struct brw_stage_prog_data *prog_data = data;
      data += entry->prog_data_size;

      struct anv_pipeline_binding *surface_to_descriptor = data;
      data += entry->surface_count * sizeof(struct anv_pipeline_binding);
      struct anv_pipeline_binding *sampler_to_descriptor = data;
      data += entry->sampler_count * sizeof(struct anv_pipeline_binding);
      void *kernel = data;

      struct anv_pipeline_bind_map map = {
         .surface_count = entry->surface_count,
         .sampler_count = entry->sampler_count,
         .image_count = entry->image_count,
         .surface_to_descriptor = surface_to_descriptor,
         .sampler_to_descriptor = sampler_to_descriptor
      };

      anv_pipeline_cache_upload_kernel(cache, entry->sha1,
                                       kernel, entry->kernel_size,
                                       &prog_data,
                                       entry->prog_data_size, &map);
      p = kernel + entry->kernel_size;
   }
}
VkResult anv_CreatePipelineCache(
    VkDevice                                    _device,
    const VkPipelineCacheCreateInfo*            pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkPipelineCache*                            pPipelineCache)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_pipeline_cache *cache;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO);
   assert(pCreateInfo->flags == 0);

   cache = anv_alloc2(&device->alloc, pAllocator,
                      sizeof(*cache), 8,
                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cache == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   anv_pipeline_cache_init(cache, device);

   if (pCreateInfo->initialDataSize > 0)
      anv_pipeline_cache_load(cache,
                              pCreateInfo->pInitialData,
                              pCreateInfo->initialDataSize);

   *pPipelineCache = anv_pipeline_cache_to_handle(cache);

   return VK_SUCCESS;
}
void anv_DestroyPipelineCache(
    VkDevice                                    _device,
    VkPipelineCache                             _cache,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);

   anv_pipeline_cache_finish(cache);

   anv_free2(&device->alloc, pAllocator, cache);
}
VkResult anv_GetPipelineCacheData(
    VkDevice                                    _device,
    VkPipelineCache                             _cache,
    size_t*                                     pDataSize,
    void*                                       pData)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);
   struct cache_header *header;

   const size_t size = sizeof(*header) + cache->total_size;

   if (pData == NULL) {
      *pDataSize = size;
      return VK_SUCCESS;
   }

   if (*pDataSize < sizeof(*header)) {
      *pDataSize = 0;
      return VK_INCOMPLETE;
   }

   void *p = pData, *end = pData + *pDataSize;
   header = p;
   header->header_size = sizeof(*header);
   header->header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE;
   header->vendor_id = 0x8086;
   header->device_id = device->chipset_id;
   anv_device_get_cache_uuid(header->uuid);
   p += header->header_size;

   struct cache_entry *entry;
   for (uint32_t i = 0; i < cache->table_size; i++) {
      if (cache->hash_table[i] == ~0)
         continue;

      entry = cache->program_stream.block_pool->map + cache->hash_table[i];
      const uint32_t size = entry_size(entry);
      if (end < p + size + entry->kernel_size)
         break;

      memcpy(p, entry, size);
      p += size;

      /* The kernel lives at the next 64-byte boundary after the entry. */
      void *kernel = (void *) entry + align_u32(size, 64);

      memcpy(p, kernel, entry->kernel_size);
      p += entry->kernel_size;
   }

   *pDataSize = p - pData;

   return VK_SUCCESS;
}
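/* From the application side, this supports the usual two-call idiom
 * (sketch, not driver code):
 *
 *    size_t size;
 *    vkGetPipelineCacheData(device, cache, &size, NULL);
 *    void *data = malloc(size);
 *    vkGetPipelineCacheData(device, cache, &size, data);
 *    // persist `data`, later feed it back via
 *    // VkPipelineCacheCreateInfo::pInitialData / initialDataSize
 */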
static void
anv_pipeline_cache_merge(struct anv_pipeline_cache *dst,
                         struct anv_pipeline_cache *src)
{
   /* Both caches allocate from the device's instruction_block_pool, so an
    * entry offset in src is equally valid in dst and we only need to copy
    * hash table slots, not the entries themselves.
    */
   for (uint32_t i = 0; i < src->table_size; i++) {
      const uint32_t offset = src->hash_table[i];
      if (offset == ~0)
         continue;

      struct cache_entry *entry =
         src->program_stream.block_pool->map + offset;

      if (anv_pipeline_cache_search(dst, entry->sha1, NULL, NULL) != NO_KERNEL)
         continue;

      anv_pipeline_cache_add_entry(dst, entry, offset);
   }
}
VkResult anv_MergePipelineCaches(
    VkDevice                                    _device,
    VkPipelineCache                             destCache,
    uint32_t                                    srcCacheCount,
    const VkPipelineCache*                      pSrcCaches)
{
   ANV_FROM_HANDLE(anv_pipeline_cache, dst, destCache);

   for (uint32_t i = 0; i < srcCacheCount; i++) {
      ANV_FROM_HANDLE(anv_pipeline_cache, src, pSrcCaches[i]);

      anv_pipeline_cache_merge(dst, src);
   }

   return VK_SUCCESS;
}
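/* API-side usage sketch (hypothetical handles): an application can fill
 * per-thread caches in parallel, then fold them into one cache before
 * serializing it with vkGetPipelineCacheData():
 *
 *    VkPipelineCache srcs[2] = { worker_cache_a, worker_cache_b };
 *    vkMergePipelineCaches(device, main_cache, 2, srcs);
 */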