//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#include "device.h"
#include "private.h"
#include "rtl.h"

#include <cassert>
#include <vector>

/* All begin addresses for partially mapped structs must be 8-aligned in order
 * to ensure proper alignment of members. E.g.
 *
 * struct S {
 *   int a;   // 4-aligned
 *   int b;   // 4-aligned
 *   int *p;  // 8-aligned
 * } s1;
 * ...
 * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
 * {
 *   s1.b = 5;
 *   for (int i...) s1.p[i] = ...;
 * }
 * ...
 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
 * requirements for its type. Now, when we allocate memory on the device, in
 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
 * This means that the chunk of the struct on the device will start at a
 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
 * address of p will be a misaligned 0x204 (on the host there was no need to add
 * padding between b and p, so p comes exactly 4 bytes after b). If the device
 * kernel tries to access s1.p, a misaligned address error occurs (as reported
 * by the CUDA plugin). By rounding the begin address down to a multiple of 8
 * and extending the size of the allocated chunk accordingly, the chunk on the
 * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
 * &s1.p=0x208, as they should be to satisfy the alignment requirements.
 */
static const int64_t Alignment = 8;
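
// For instance, with Alignment = 8 a begin address of 0x104 gives
// padding = 0x104 % 8 = 4, so the mapped region is moved back to start at
// 0x100 and its size grows by those 4 bytes (see targetDataBegin below).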

/// Map global data and execute pending ctors
static int InitLibrary(DeviceTy &Device) {
  /*
   * Map global data
   */
  int32_t device_id = Device.DeviceID;
  int rc = OFFLOAD_SUCCESS;

  Device.PendingGlobalsMtx.lock();
  PM->TrlTblMtx.lock();
  for (HostEntriesBeginToTransTableTy::iterator entry_it =
           PM->HostEntriesBeginToTransTable.begin();
       entry_it != PM->HostEntriesBeginToTransTable.end(); ++entry_it) {
    TranslationTable *TransTable = &entry_it->second;
    if (TransTable->HostTable.EntriesBegin ==
        TransTable->HostTable.EntriesEnd) {
      // No host entry so no need to proceed.
      continue;
    }
    if (TransTable->TargetsTable[device_id] != 0) {
      // Library entries have already been processed.
      continue;
    }

    // 1) get image.
    assert(TransTable->TargetsImages.size() > (size_t)device_id &&
           "Not expecting a device ID outside the table's bounds!");
    __tgt_device_image *img = TransTable->TargetsImages[device_id];
    if (!img) {
      REPORT("No image loaded for device id %d.\n", device_id);
      rc = OFFLOAD_FAIL;
      break;
    }

    // 2) load image into the target table.
    __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
        Device.load_binary(img);
    // Unable to get table for this image: invalidate image and fail.
    if (!TargetTable) {
      REPORT("Unable to generate entries table for device id %d.\n",
             device_id);
      TransTable->TargetsImages[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // Verify whether the two table sizes match.
    size_t hsize =
        TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
    size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;

    // Invalid image for these host entries!
    if (hsize != tsize) {
      REPORT("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
             device_id, hsize, tsize);
      TransTable->TargetsImages[device_id] = 0;
      TransTable->TargetsTable[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // Process global data that needs to be mapped.
    Device.DataMapMtx.lock();
    __tgt_target_table *HostTable = &TransTable->HostTable;
    for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
                             *CurrHostEntry = HostTable->EntriesBegin,
                             *EntryDeviceEnd = TargetTable->EntriesEnd;
         CurrDeviceEntry != EntryDeviceEnd;
         CurrDeviceEntry++, CurrHostEntry++) {
      if (CurrDeviceEntry->size != 0) {
        // Has data.
        assert(CurrDeviceEntry->size == CurrHostEntry->size &&
               "data size mismatch");

        // Fortran may use multiple weak declarations for the same symbol,
        // therefore we must allow for multiple weak symbols to be loaded from
        // the fat binary. Treat these mappings as any other "regular"
        // mapping. Add entry to map.
        if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
          continue;
        DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
           "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
           CurrDeviceEntry->size);
        Device.HostDataToTargetMap.emplace(
            (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
            (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
            (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
            (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, nullptr,
            true /*IsRefCountINF*/);
      }
    }
    Device.DataMapMtx.unlock();
  }
  PM->TrlTblMtx.unlock();

  if (rc != OFFLOAD_SUCCESS) {
    Device.PendingGlobalsMtx.unlock();
    return rc;
  }

  /*
   * Run ctors for static objects
   */
  if (!Device.PendingCtorsDtors.empty()) {
    // Call all ctors for all libraries registered so far.
    for (auto &lib : Device.PendingCtorsDtors) {
      if (!lib.second.PendingCtors.empty()) {
        DP("Has pending ctors... call now\n");
        for (auto &entry : lib.second.PendingCtors) {
          void *ctor = entry;
          int rc = target(device_id, ctor, 0, nullptr, nullptr, nullptr,
                          nullptr, nullptr, nullptr, 1, 1, true /*team*/);
          if (rc != OFFLOAD_SUCCESS) {
            REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
            Device.PendingGlobalsMtx.unlock();
            return OFFLOAD_FAIL;
          }
        }
        // Clear the list to indicate that this device has been used.
        lib.second.PendingCtors.clear();
        DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
      }
    }
  }
  Device.HasPendingGlobals = false;
  Device.PendingGlobalsMtx.unlock();

  return OFFLOAD_SUCCESS;
}

// Check whether a device has been initialized, global ctors have been
// executed, and global data has been mapped; do so if not already done.
int CheckDeviceAndCtors(int64_t device_id) {
  // Is device ready?
  if (!device_is_ready(device_id)) {
    REPORT("Device %" PRId64 " is not ready.\n", device_id);
    return OFFLOAD_FAIL;
  }

  // Get device info.
  DeviceTy &Device = PM->Devices[device_id];

  // Check whether global data has been mapped for this device.
  Device.PendingGlobalsMtx.lock();
  bool hasPendingGlobals = Device.HasPendingGlobals;
  Device.PendingGlobalsMtx.unlock();
  if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
    REPORT("Failed to init globals on device %" PRId64 "\n", device_id);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
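
/// Decode the parent index encoded in the MEMBER_OF bits (the upper 16 bits
/// of the map type). The encoding is biased by 1, so a result of -1 means
/// the entry has no parent.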
static int32_t getParentIndex(int64_t type) {
  return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}

/// Call the user-defined mapper function followed by the appropriate
/// target_data_* function (target_data_{begin,end,update}).
int targetDataMapper(DeviceTy &Device, void *arg_base, void *arg,
                     int64_t arg_size, int64_t arg_type,
                     map_var_info_t arg_names, void *arg_mapper,
                     TargetDataFuncPtrTy target_data_function) {
  DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));

  // The mapper function fills up Components.
  MapperComponentsTy MapperComponents;
  MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper);
  (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, arg_type,
                   arg_names);

  // Construct new arrays for args_base, args, arg_sizes and arg_types
  // using the information in MapperComponents and call the corresponding
  // target_data_* function using these new arrays.
  std::vector<void *> MapperArgsBase(MapperComponents.Components.size());
  std::vector<void *> MapperArgs(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size());
  std::vector<void *> MapperArgNames(MapperComponents.Components.size());

  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
    auto &C =
        MapperComponents
            .Components[target_data_function == targetDataEnd ? I : E - I - 1];
    MapperArgsBase[I] = C.Base;
    MapperArgs[I] = C.Begin;
    MapperArgSizes[I] = C.Size;
    MapperArgTypes[I] = C.Type;
    MapperArgNames[I] = C.Name;
  }

  int rc = target_data_function(Device, MapperComponents.Components.size(),
                                MapperArgsBase.data(), MapperArgs.data(),
                                MapperArgSizes.data(), MapperArgTypes.data(),
                                MapperArgNames.data(), /*arg_mappers*/ nullptr,
                                /*__tgt_async_info*/ nullptr);

  return rc;
}
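
// A user-defined mapper originates from an "omp declare mapper" directive,
// e.g.:
//   #pragma omp declare mapper(struct S s) map(s.len, s.data[0:s.len])
// for which Clang emits a mapper function that expands the mapped variable
// into the component list consumed above.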

/// Internal function to do the mapping and transfer the data to the device
int targetDataBegin(DeviceTy &Device, int32_t arg_num, void **args_base,
                    void **args, int64_t *arg_sizes, int64_t *arg_types,
                    map_var_info_t *arg_names, void **arg_mappers,
                    __tgt_async_info *async_info_ptr) {
  // Process each input.
  for (int32_t i = 0; i < arg_num; ++i) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
        (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (arg_mappers && arg_mappers[i]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", i);

      map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
      int rc = targetDataMapper(Device, args_base[i], args[i], arg_sizes[i],
                                arg_types[i], arg_name, arg_mappers[i],
                                targetDataBegin);

      if (rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = args[i];
    void *HstPtrBase = args_base[i];
    int64_t data_size = arg_sizes[i];
    map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    int64_t padding = 0;
    const int next_i = i + 1;
    if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num &&
        getParentIndex(arg_types[next_i]) == i) {
      padding = (int64_t)HstPtrBegin % Alignment;
      if (padding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n", padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - padding;
        data_size += padding;
      }
    }

    // Address of pointer on the host and device, respectively.
    void *Pointer_HstPtrBegin, *PointerTgtPtrBegin;
    bool IsNew, Pointer_IsNew;
    bool IsHostPtr = false;
    bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device-side copy of the data when a close map
    // modifier was associated with a map that contained a to.
    bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters).
    // This may be considered a hack, we could revise the scheme in the future.
    bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF);
    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry:\n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time. For example:
      //
      //   #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null. However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time the
      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
      // when HasPresentModifier.
      PointerTgtPtrBegin = Device.getOrAllocTgtPtr(
          HstPtrBase, HstPtrBase, sizeof(void *), nullptr, Pointer_IsNew,
          IsHostPtr, IsImplicit, UpdateRef, HasCloseModifier,
          HasPresentModifier);
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getOrAllocTgtPtr returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
         "\n", sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (Pointer_IsNew ? "" : " not"));
      Pointer_HstPtrBegin = HstPtrBase;
      // Modify current entry.
      HstPtrBase = *(void **)HstPtrBase;
      UpdateRef = true; // subsequently update ref count of pointee
    }

    void *TgtPtrBegin = Device.getOrAllocTgtPtr(
        HstPtrBegin, HstPtrBase, data_size, HstPtrName, IsNew, IsHostPtr,
        IsImplicit, UpdateRef, HasCloseModifier, HasPresentModifier);
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getOrAlloc() returning NULL is not an error.
    if (!TgtPtrBegin && (data_size || HasPresentModifier)) {
      REPORT("Call to getOrAllocTgtPtr returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n", data_size, DPxPTR(TgtPtrBegin),
       (IsNew ? "" : " not"));

    if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
      args_base[i] = TgtPtrBase;
    }

    if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
      bool copy = false;
      if (!(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
          HasCloseModifier) {
        if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
          copy = true;
        } else if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
                   !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
          // Copy data only if the "parent" struct has RefCount==1.
          // If this is a PTR_AND_OBJ entry, the OBJ is not part of the struct,
          // so exclude it from this check.
          int32_t parent_idx = getParentIndex(arg_types[i]);
          uint64_t parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
          assert(parent_rc > 0 && "parent struct not found");
          if (parent_rc == 1)
            copy = true;
        }
      }

      if (copy && !IsHostPtr) {
        DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
           data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
        int rt = Device.submitData(TgtPtrBegin, HstPtrBegin, data_size,
                                   async_info_ptr);
        if (rt != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
    }
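
    // ShadowPtrMap remembers, for each host address that holds a pointer we
    // have translated on the device, the original host pointer value and the
    // device value, so that targetDataEnd can restore the host copy after a
    // later copy-back.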
    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
      DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
         DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
      int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
                                 sizeof(void *), async_info_ptr);
      if (rt != OFFLOAD_SUCCESS) {
        REPORT("Copying data to device failed.\n");
        return OFFLOAD_FAIL;
      }
      // Create shadow pointers for this entry.
      Device.ShadowMtx.lock();
      Device.ShadowPtrMap[Pointer_HstPtrBegin] = {
          HstPtrBase, PointerTgtPtrBegin, TgtPtrBase};
      Device.ShadowMtx.unlock();
    }
  }

  return OFFLOAD_SUCCESS;
}

/// This structure contains information to deallocate a target pointer, i.e.
/// the arguments of \p DeviceTy::deallocTgtPtr.
struct DeallocTgtPtrInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;
  /// Size of the data
  int64_t DataSize;
  /// Whether it is forced to be removed from the map table
  bool ForceDelete;
  /// Whether it has \p close modifier
  bool HasCloseModifier;

  DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool ForceDelete,
                    bool HasCloseModifier)
      : HstPtrBegin(HstPtr), DataSize(Size), ForceDelete(ForceDelete),
        HasCloseModifier(HasCloseModifier) {}
};
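
// Deallocation is deferred: targetDataEnd collects DeallocTgtPtrInfo records
// and frees them only after synchronizing, so an in-flight asynchronous
// transfer can never touch device memory that has already been released.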

/// Internal function to undo the mapping and retrieve the data from the
/// device.
int targetDataEnd(DeviceTy &Device, int32_t ArgNum, void **ArgBases,
                  void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
                  map_var_info_t *ArgNames, void **ArgMappers,
                  __tgt_async_info *AsyncInfo) {
  int Ret;
  std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
  // Process each input.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive, it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I],
                             targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      int64_t Padding = (int64_t)HstPtrBegin % Alignment;
      if (Padding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           Padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - Padding;
        DataSize += Padding;
      }
    }

    bool IsLast, IsHostPtr;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    bool UpdateRef = !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                     (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;

    // If PTR_AND_OBJ, HstPtrBegin is address of pointee.
    void *TgtPtrBegin = Device.getTgtPtrBegin(
        HstPtrBegin, DataSize, IsLast, UpdateRef, IsHostPtr, !IsImplicit);
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n",
         (HasPresentModifier ? "'present' map type modifier" : "ignored"));
      if (HasPresentModifier) {
        // This should be an error upon entering an "omp target exit data". It
        // should not be an error upon exiting an "omp target data" or "omp
        // target". For "omp target data", Clang thus doesn't include present
        // modifiers for end calls. For "omp target", we have not found a valid
        // OpenMP program for which the error matters: it appears that, if a
        // program can guarantee that data is present at the beginning of an
        // "omp target" region so that there's no error there, that data is also
        // guaranteed to be present at the end.
        MESSAGE("device mapping required by 'present' map type modifier does "
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
                DPxPTR(HstPtrBegin), DataSize);
        return OFFLOAD_FAIL;
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n",
         DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not"));
    }
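
    // The map entry is removed only when its reference count has dropped to
    // zero (IsLast) or an explicit "delete" map-type modifier forces removal.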
    bool DelEntry = IsLast || ForceDelete;

    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // Protect parent struct from being deallocated.
    }

    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
      // Move data back to the host.
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
        bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
        bool CopyMember = false;
        if (!(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
            HasCloseModifier) {
          if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
              !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
            // Copy data only if the "parent" struct has RefCount==1.
            int32_t ParentIdx = getParentIndex(ArgTypes[I]);
            uint64_t ParentRC = Device.getMapEntryRefCnt(Args[ParentIdx]);
            assert(ParentRC > 0 && "parent struct not found");
            if (ParentRC == 1)
              CopyMember = true;
          }
        }

        if ((DelEntry || Always || CopyMember) &&
            !(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
              TgtPtrBegin == HstPtrBegin)) {
          DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
             DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
          Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
                                    AsyncInfo);
          if (Ret != OFFLOAD_SUCCESS) {
            REPORT("Copying data from device failed.\n");
            return OFFLOAD_FAIL;
          }
        }
      }

      // If we copied back to the host a struct/array containing pointers, we
      // need to restore the original host pointer values from their shadow
      // copies. If the struct is going to be deallocated, remove any remaining
      // shadow pointer entries for this struct.
      uintptr_t LB = (uintptr_t)HstPtrBegin;
      uintptr_t UB = (uintptr_t)HstPtrBegin + DataSize;
      Device.ShadowMtx.lock();
      for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
           Itr != Device.ShadowPtrMap.end();) {
        void **ShadowHstPtrAddr = (void **)Itr->first;

        // An STL map is sorted on its keys; use this property
        // to quickly determine when to break out of the loop.
        if ((uintptr_t)ShadowHstPtrAddr < LB) {
          ++Itr;
          continue;
        }
        if ((uintptr_t)ShadowHstPtrAddr >= UB)
          break;

        // If we copied the struct to the host, we need to restore the pointer.
        if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
          DP("Restoring original host pointer value " DPxMOD " for host "
             "pointer " DPxMOD "\n",
             DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
          *ShadowHstPtrAddr = Itr->second.HstPtrVal;
        }
        // If the struct is to be deallocated, remove the shadow entry.
        if (DelEntry) {
          DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
          Itr = Device.ShadowPtrMap.erase(Itr);
        } else {
          ++Itr;
        }
      }
      Device.ShadowMtx.unlock();

      // Add pointer to the buffer for later deallocation.
      if (DelEntry)
        DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, ForceDelete,
                                    HasCloseModifier);
    }
  }

  // We need to synchronize before deallocating data.
  // If AsyncInfo is nullptr, any previous data transfer was synchronous, so
  // there is no need to synchronize again. If AsyncInfo->Queue is nullptr, no
  // data transfer has happened, because as soon as one does,
  // AsyncInfo->Queue becomes non-null; so again, there is no need to
  // synchronize.
  if (AsyncInfo && AsyncInfo->Queue) {
    Ret = Device.synchronize(AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to synchronize device.\n");
      return OFFLOAD_FAIL;
    }
  }

  // Deallocate target pointers.
  for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
    Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize,
                               Info.ForceDelete, Info.HasCloseModifier);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}

static int targetDataContiguous(DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType) {
  bool IsLast, IsHostPtr;
  void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSize, IsLast, false,
                                            IsHostPtr, /*MustContain=*/true);
  if (!TgtPtrBegin) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
      TgtPtrBegin == HstPtrBegin) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, nullptr);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    uintptr_t LB = (uintptr_t)HstPtrBegin;
    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
    Device.ShadowMtx.lock();
    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
         IT != Device.ShadowPtrMap.end(); ++IT) {
      void **ShadowHstPtrAddr = (void **)IT->first;
      if ((uintptr_t)ShadowHstPtrAddr < LB)
        continue;
      if ((uintptr_t)ShadowHstPtrAddr >= UB)
        break;
      DP("Restoring original host pointer value " DPxMOD
         " for host pointer " DPxMOD "\n",
         DPxPTR(IT->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
      *ShadowHstPtrAddr = IT->second.HstPtrVal;
    }
    Device.ShadowMtx.unlock();
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, nullptr);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }

    uintptr_t LB = (uintptr_t)HstPtrBegin;
    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
    Device.ShadowMtx.lock();
    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
         IT != Device.ShadowPtrMap.end(); ++IT) {
      void **ShadowHstPtrAddr = (void **)IT->first;
      if ((uintptr_t)ShadowHstPtrAddr < LB)
        continue;
      if ((uintptr_t)ShadowHstPtrAddr >= UB)
        break;
      DP("Restoring original target pointer value " DPxMOD " for target "
         "pointer " DPxMOD "\n",
         DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr));
      Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal,
                              sizeof(void *), nullptr);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Copying data to device failed.\n");
        Device.ShadowMtx.unlock();
        return OFFLOAD_FAIL;
      }
    }
    Device.ShadowMtx.unlock();
  }
  return OFFLOAD_SUCCESS;
}
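
// A strided motion clause, e.g.
//   #pragma omp target update to(a[0:4:2])
// arrives as OMP_TGT_MAPTYPE_NON_CONTIG with one descriptor (offset, count,
// stride) per dimension and is decomposed below into contiguous pieces.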
static int targetDataNonContiguous(DeviceTy &Device, void *ArgsBase,
                                   __tgt_target_non_contig *NonContig,
                                   uint64_t Size, int64_t ArgType,
                                   int CurrentDim, int DimSize,
                                   uint64_t Offset) {
  int Ret = OFFLOAD_SUCCESS;
  if (CurrentDim < DimSize) {
    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
      uint64_t CurOffset =
          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
      // We only need to transfer the first element for the last dimension
      // since we've already got a contiguous piece.
      if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset);
        // Stop the whole process if any contiguous piece returns anything
        // other than OFFLOAD_SUCCESS.
        if (Ret != OFFLOAD_SUCCESS)
          return Ret;
      }
    }
  } else {
    char *Ptr = (char *)ArgsBase + Offset;
    DP("Transfer of non-contiguous: host ptr %lx offset %ld len %ld\n",
       (uint64_t)Ptr, Offset, Size);
    Ret = targetDataContiguous(Device, ArgsBase, Ptr, Size, ArgType);
  }
  return Ret;
}
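
/// Count how many inner dimensions can be folded into the contiguous unit:
/// dimension I collapses into dimension I-1 when Count[I] * Stride[I] equals
/// Stride[I-1], i.e. its elements are adjacent within the outer dimension.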
static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
                                       int32_t DimSize) {
  int RemovedDim = 0;
  for (int I = DimSize - 1; I > 0; --I)
    if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
      ++RemovedDim;
  return RemovedDim;
}

/// Internal function to pass data to/from the target.
/// AsyncInfoPtr is currently unused, added here so targetDataUpdate has the
/// same signature as targetDataBegin and targetDataEnd.
int targetDataUpdate(DeviceTy &Device, int32_t ArgNum, void **ArgsBase,
                     void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
                     map_var_info_t *ArgNames, void **ArgMappers,
                     __tgt_async_info *AsyncInfoPtr) {
  // Process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataUpdate, call the
      // targetDataMapper variant which will call targetDataUpdate again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Ret = targetDataMapper(Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], ArgName, ArgMappers[I],
                                 targetDataUpdate);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    int Ret = OFFLOAD_SUCCESS;

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
      int32_t DimSize = ArgSizes[I];
      uint64_t Size =
          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0);
    } else {
      Ret = targetDataContiguous(Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I]);
    }
    if (Ret == OFFLOAD_FAIL)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
                                      OMP_TGT_MAPTYPE_LITERAL |
                                      OMP_TGT_MAPTYPE_IMPLICIT;
static bool isLambdaMapping(int64_t Mapping) {
  return (Mapping & LambdaMapping) == LambdaMapping;
}
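
// Lambdas reach the runtime as PTR_AND_OBJ | LITERAL | IMPLICIT entries: the
// capture struct itself is passed by value, and the device addresses of the
// captured pointers are patched into it by processDataBefore() below.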

/// Find the table information in the map or look it up in the translation
/// tables.
TableMap *getTableMap(void *HostPtr) {
  std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
  HostPtrToTableMapTy::iterator TableMapIt =
      PM->HostPtrToTableMap.find(HostPtr);

  if (TableMapIt != PM->HostPtrToTableMap.end())
    return &TableMapIt->second;

  // We don't have a map. So search all the registered libraries.
  TableMap *TM = nullptr;
  std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
  for (HostEntriesBeginToTransTableTy::iterator Itr =
           PM->HostEntriesBeginToTransTable.begin();
       Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
    // Get the translation table (which contains all the good info).
    TranslationTable *TransTable = &Itr->second;
    // Iterate over all the host table entries to see if we can locate the
    // HostPtr.
    __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
      if (Cur->addr != HostPtr)
        continue;
      // We got a match, now fill the HostPtrToTableMap so that we
      // may avoid this search next time.
      TM = &(PM->HostPtrToTableMap)[HostPtr];
      TM->Table = TransTable;
      TM->Index = I;
      break;
    }
  }

  return TM;
}

/// Get loop trip count.
/// FIXME: This function will not work right if calling
/// __kmpc_push_target_tripcount in one thread but doing offloading in another
/// thread, which might occur when we call task yield.
uint64_t getLoopTripCount(int64_t DeviceId) {
  DeviceTy &Device = PM->Devices[DeviceId];
  uint64_t LoopTripCount = 0;

  {
    std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
    auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
    if (I != Device.LoopTripCnt.end()) {
      LoopTripCount = I->second;
      Device.LoopTripCnt.erase(I);
      DP("loop trip count is %lu.\n", LoopTripCount);
    }
  }

  return LoopTripCount;
}
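
// The trip count is recorded per host thread by __kmpc_push_target_tripcount
// right before the target call and is consumed (erased) exactly once here,
// which is why a task yield between the two calls can mismatch them.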

/// A class that manages private arguments in a target region.
class PrivateArgumentManagerTy {
  /// A data structure for the information of first-private arguments. We can
  /// use this information to optimize data transfer by packing all
  /// first-private arguments and transferring them all at once.
  struct FirstPrivateArgInfoTy {
    /// The index of the element in \p TgtArgs corresponding to the argument
    int Index;
    /// Host pointer begin
    const char *HstPtrBegin;
    /// Host pointer end
    const char *HstPtrEnd;
    /// Aligned size
    const int64_t AlignedSize;
    /// Host pointer name
    const map_var_info_t HstPtrName = nullptr;

    FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size,
                          const map_var_info_t HstPtrName = nullptr)
        : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)),
          HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment),
          HstPtrName(HstPtrName) {}
  };

  /// A vector of target pointers for all private arguments
  std::vector<void *> TgtPtrs;

  /// A vector of information of all first-private arguments to be packed
  std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
  /// Host buffer for all arguments to be packed
  std::vector<char> FirstPrivateArgBuffer;
  /// The total size of all arguments to be packed
  int64_t FirstPrivateArgSize = 0;

  /// A reference to the \p DeviceTy object
  DeviceTy &Device;
  /// A pointer to a \p __tgt_async_info object
  __tgt_async_info *AsyncInfo;

  // TODO: What would be the best value here? Should we make it configurable?
  // If the size is larger than this threshold, we will allocate and transfer
  // it immediately instead of packing it.
  static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;

public:
  /// Constructor
  PrivateArgumentManagerTy(DeviceTy &Dev, __tgt_async_info *AsyncInfo)
      : Device(Dev), AsyncInfo(AsyncInfo) {}

  /// Add a private argument
  int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
             bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
             const map_var_info_t HstPtrName = nullptr) {
    // If the argument is not first-private, or its size is greater than a
    // predefined threshold, we will allocate memory and issue the transfer
    // immediately.
    if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate) {
      TgtPtr = Device.allocData(ArgSize, HstPtr);
      if (!TgtPtr) {
        DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
           (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
        return OFFLOAD_FAIL;
      }
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
         " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
         "\n",
         ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
         DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
#endif
      // If first-private, copy data from host.
      if (IsFirstPrivate) {
        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      TgtPtrs.push_back(TgtPtr);
    } else {
      DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
         DPxPTR(HstPtr), ArgSize);
      // When we reach this point, the argument must meet all of the following
      // requirements:
      // 1. Its size does not exceed the threshold (see the comment for
      //    FirstPrivateArgSizeThreshold);
      // 2. It must be first-private (i.e. it needs to be mapped to the target
      //    device).
      // We pack all such arguments and transfer them together to reduce the
      // number of data transfers. Non-first-private arguments, i.e. private
      // arguments that do not need to be mapped to the target device, are not
      // packed, because plain data allocation for them is already efficient
      // with the memory manager.
      // Placeholder value.
      TgtPtr = nullptr;
      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
                                       HstPtrName);
      FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Pack first-private arguments, replace placeholder pointers in \p TgtArgs,
  /// and start the transfer.
  int packAndTransfer(std::vector<void *> &TgtArgs) {
    if (!FirstPrivateArgInfo.empty()) {
      assert(FirstPrivateArgSize != 0 &&
             "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is not empty");
      FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
      auto Itr = FirstPrivateArgBuffer.begin();
      // Copy all host data to this buffer.
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
        Itr = std::next(Itr, Info.AlignedSize);
      }
      // Allocate target memory.
      void *TgtPtr =
          Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
      if (TgtPtr == nullptr) {
        DP("Failed to allocate target memory for private arguments.\n");
        return OFFLOAD_FAIL;
      }
      TgtPtrs.push_back(TgtPtr);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
         FirstPrivateArgSize, DPxPTR(TgtPtr));
      // Transfer data to target device.
      int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
                                  FirstPrivateArgSize, AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Failed to submit data of private arguments.\n");
        return OFFLOAD_FAIL;
      }
      // Fill in all placeholder pointers.
      auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        void *&Ptr = TgtArgs[Info.Index];
        assert(Ptr == nullptr && "Target pointer is already set by mistake");
        Ptr = reinterpret_cast<void *>(TP);
        TP += Info.AlignedSize;
        DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
           "\n",
           DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
           DPxPTR(Ptr));
      }
    }

    return OFFLOAD_SUCCESS;
  }

  /// Free all target memory allocated for private arguments.
  int free() {
    for (void *P : TgtPtrs) {
      int Ret = Device.deleteData(P);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Deallocation of (first-)private arrays failed.\n");
        return OFFLOAD_FAIL;
      }
    }

    TgtPtrs.clear();

    return OFFLOAD_SUCCESS;
  }
};

/// Process data before launching the kernel, including calling targetDataBegin
/// to map and transfer data to the target device, and transferring
/// (first-)private variables.
int processDataBefore(int64_t DeviceId, void *HostPtr, int32_t ArgNum,
                      void **ArgBases, void **Args, int64_t *ArgSizes,
                      int64_t *ArgTypes, map_var_info_t *ArgNames,
                      void **ArgMappers, std::vector<void *> &TgtArgs,
                      std::vector<ptrdiff_t> &TgtOffsets,
                      PrivateArgumentManagerTy &PrivateArgumentManager,
                      __tgt_async_info *AsyncInfo) {
  DeviceTy &Device = PM->Devices[DeviceId];
  int Ret = targetDataBegin(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes,
                            ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataBegin failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // List of (first-)private arrays allocated for this target region.
  std::vector<int> TgtArgsPositions(ArgNum, -1);

  for (int32_t I = 0; I < ArgNum; ++I) {
    if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
      // This is not a target parameter, do not push it into TgtArgs.
      // Check for lambda mapping.
      if (isLambdaMapping(ArgTypes[I])) {
        assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
               "PTR_AND_OBJ must be also MEMBER_OF.");
        unsigned Idx = getParentIndex(ArgTypes[I]);
        int TgtIdx = TgtArgsPositions[Idx];
        assert(TgtIdx != -1 && "Base address must be translated already.");
        // The parent lambda must be processed already and it must be the last
        // in TgtArgs and TgtOffsets arrays.
        void *HstPtrVal = Args[I];
        void *HstPtrBegin = ArgBases[I];
        void *HstPtrBase = Args[Idx];
        bool IsLast, IsHostPtr; // Unused.
        void *TgtPtrBase =
            (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
        DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
        uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
        void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
        void *PointerTgtPtrBegin = Device.getTgtPtrBegin(
            HstPtrVal, ArgSizes[I], IsLast, false, IsHostPtr);
        if (!PointerTgtPtrBegin) {
          DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
            TgtPtrBegin == HstPtrBegin) {
          DP("Unified memory is active, no need to map lambda captured "
             "variable (" DPxMOD ")\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
        Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
                                sizeof(void *), AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      continue;
    }
    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgBases[I];
    void *TgtPtrBegin;
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
    ptrdiff_t TgtBaseOffset;
    bool IsLast, IsHostPtr; // Unused.
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
         DPxPTR(HstPtrBase));
      TgtPtrBegin = HstPtrBase;
      TgtBaseOffset = 0;
    } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
      // Can be marked for optimization if the next argument(s) do(es) not
      // depend on this one.
      const bool IsFirstPrivate =
          (I >= ArgNum - 1 || !(ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
      Ret = PrivateArgumentManager.addArg(
          HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
          TgtArgs.size(), HstPtrName);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Failed to process %sprivate argument " DPxMOD "\n",
               (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
        return OFFLOAD_FAIL;
      }
    } else {
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
        HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
      TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
                                          false, IsHostPtr);
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
#endif
    }
    TgtArgsPositions[I] = TgtArgs.size();
    TgtArgs.push_back(TgtPtrBegin);
    TgtOffsets.push_back(TgtBaseOffset);
  }

  assert(TgtArgs.size() == TgtOffsets.size() &&
         "Size mismatch in arguments and offsets");

  // Pack and transfer first-private arguments.
  Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
  if (Ret != OFFLOAD_SUCCESS) {
    DP("Failed to pack and transfer first private arguments\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

/// Process data after launching the kernel, including transferring data back
/// to the host if needed and deallocating target memory of (first-)private
/// variables.
int processDataAfter(int64_t DeviceId, void *HostPtr, int32_t ArgNum,
                     void **ArgBases, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers,
                     PrivateArgumentManagerTy &PrivateArgumentManager,
                     __tgt_async_info *AsyncInfo) {
  DeviceTy &Device = PM->Devices[DeviceId];

  // Move data from device.
  int Ret = targetDataEnd(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes,
                          ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataEnd failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // Free target memory for private arguments.
  Ret = PrivateArgumentManager.free();
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Failed to deallocate target memory for private args\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

/// Performs the same actions as data_begin in case arg_num is
/// non-zero and initiates run of the offloaded region on the target platform;
/// if arg_num is non-zero after the region execution is done it also
/// performs the same action as data_update and data_end above. This function
/// returns 0 if it was able to transfer the execution to a target and an
/// integer different from zero otherwise.
int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgBases,
           void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
           map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
           int32_t ThreadLimit, int IsTeamConstruct) {
  DeviceTy &Device = PM->Devices[DeviceId];

  TableMap *TM = getTableMap(HostPtr);
  // No map for this host pointer found!
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // Get target table.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  __tgt_async_info AsyncInfo;

  std::vector<void *> TgtArgs;
  std::vector<ptrdiff_t> TgtOffsets;

  PrivateArgumentManagerTy PrivateArgumentManager(Device, &AsyncInfo);

  // Process data, such as data mapping, before launching the kernel.
  int Ret = processDataBefore(DeviceId, HostPtr, ArgNum, ArgBases, Args,
                              ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
                              TgtOffsets, PrivateArgumentManager, &AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Failed to process data before launching the kernel.\n");
    return OFFLOAD_FAIL;
  }

  // Get loop trip count.
  uint64_t LoopTripCount = getLoopTripCount(DeviceId);

  // Launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);

  if (IsTeamConstruct)
    Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                               TgtArgs.size(), TeamNum, ThreadLimit,
                               LoopTripCount, &AsyncInfo);
  else
    Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                           TgtArgs.size(), &AsyncInfo);

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // Transfer data back and deallocate target memory for (first-)private
  // variables.
  Ret = processDataAfter(DeviceId, HostPtr, ArgNum, ArgBases, Args, ArgSizes,
                         ArgTypes, ArgNames, ArgMappers, PrivateArgumentManager,
                         &AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Failed to process data after launching the kernel.\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}