2 * Linux UFFD-WP support
4 * Copyright Virtuozzo GmbH, 2020
7 * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
9 * This work is licensed under the terms of the GNU GPL, version 2 or
10 * later. See the COPYING file in the top-level directory.
13 #include "qemu/osdep.h"
14 #include "qemu/bitops.h"
15 #include "qemu/error-report.h"
16 #include "qemu/userfaultfd.h"
19 #include <sys/syscall.h>
20 #include <sys/ioctl.h>
/*
 * Zero-valued enumerator. uffd_open() compares its open_mode variable against
 * this value to recognise the first call.
 * NOTE(review): the enclosing enum declaration and the remaining enumerators
 * are not visible in this listing.
 */
24 UFFD_UNINITIALIZED = 0,
/*
 * uffd_open: obtain a new userfaultfd file descriptor.
 *
 * @flags: passed unchanged either to the USERFAULTFD_IOC_NEW ioctl on the
 *         /dev/userfaultfd descriptor or to the userfaultfd system call.
 *
 * Returns the result of whichever of those two calls is used.
 *
 * NOTE(review): this listing omits lines of the function (braces, the
 * declaration of uffd_dev, the conditions selecting between the two
 * open_mode assignments, and whatever follows the matching preprocessor
 * branch). Confirm the details below against the complete file.
 */
29 int uffd_open(int flags)
/* The visible body is compiled only when the syscall number is defined. */
31 #if defined(__NR_userfaultfd)
/* Static storage: the detected mode is kept between calls. */
32 static uffd_open_mode open_mode;
35 /* Detect how to generate uffd desc when run the 1st time */
36 if (open_mode == UFFD_UNINITIALIZED) {
/*
 * NOTE(review): the Linux capability mentioned below is normally spelled
 * CAP_SYS_PTRACE.
 */
38 * Make /dev/userfaultfd the default approach because it has better
39 * permission controls, meanwhile allows kernel faults without any
40 * privilege requirement (e.g. SYS_CAP_PTRACE).
/*
 * Try the device node first. The resulting descriptor is reused by the ioctl
 * further down; presumably uffd_dev persists across calls (declaration not
 * visible, TODO confirm).
 */
42 uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
/* Presumably taken when the open above succeeded (condition not visible). */
44 open_mode = UFFD_USE_DEV_PATH;
46 /* Fallback to the system call */
47 open_mode = UFFD_USE_SYSCALL;
49 trace_uffd_detect_open_mode(open_mode);
52 if (open_mode == UFFD_USE_DEV_PATH) {
53 assert(uffd_dev >= 0);
/* Ask the device descriptor for a new userfaultfd descriptor. */
54 return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags);
/* Reached when the device path is not in use: raw system call. */
57 return syscall(__NR_userfaultfd, flags);
64 * uffd_query_features: query UFFD features
66 * Returns: 0 on success, negative value in case of an error
68 * @features: parameter to receive 'uffdio_api.features'
/*
 * Opens a descriptor with uffd_open(O_CLOEXEC), performs the UFFDIO_API
 * handshake requesting no features, and stores the feature mask left in
 * api_struct.features by the ioctl into *features.
 *
 * NOTE(review): the declaration of uffd_fd, the failure checks guarding the
 * trace calls, the return statements and any close of the descriptor are not
 * visible in this listing.
 */
70 int uffd_query_features(uint64_t *features)
73 struct uffdio_api api_struct = { 0 };
76 uffd_fd = uffd_open(O_CLOEXEC);
/* Presumably reached only when uffd_open() failed (condition not visible). */
78 trace_uffd_query_features_nosys(errno);
82 api_struct.api = UFFD_API;
/* Request nothing; the field is read back after the ioctl. */
83 api_struct.features = 0;
/* A non-zero ioctl result is treated as failure. */
85 if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
86 trace_uffd_query_features_api_failed(errno);
89 *features = api_struct.features;
98 * uffd_create_fd: create UFFD file descriptor
100 * Returns non-negative file descriptor or negative value in case of an error
102 * @features: UFFD features to request
103 * @non_blocking: create UFFD file descriptor for non-blocking operation
/*
 * Opens a userfaultfd descriptor with O_CLOEXEC (plus O_NONBLOCK when
 * non_blocking is set), performs the UFFDIO_API handshake requesting the
 * given features, and checks that the reported ioctl mask contains both the
 * register and the unregister bits.
 *
 * NOTE(review): the declarations of flags and uffd_fd, the failure check
 * after uffd_open(), every return statement and any cleanup on the error
 * paths are not visible in this listing.
 */
105 int uffd_create_fd(uint64_t features, bool non_blocking)
109 struct uffdio_api api_struct = { 0 };
/* Both ioctls must be advertised by the kernel; see the check below. */
110 uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
112 flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
113 uffd_fd = uffd_open(flags);
/* Presumably reached only when uffd_open() failed (condition not visible). */
115 trace_uffd_create_fd_nosys(errno);
119 api_struct.api = UFFD_API;
120 api_struct.features = features;
/* A non-zero ioctl result is treated as failure. */
121 if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
122 trace_uffd_create_fd_api_failed(errno);
/* True when at least one of the two required ioctl bits is missing. */
125 if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
126 trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls);
138 * uffd_close_fd: close UFFD file descriptor
140 * @uffd_fd: UFFD file descriptor
/*
 * Asserts that the descriptor is non-negative.
 * NOTE(review): the rest of the body (presumably the actual close) is not
 * visible in this listing.
 */
142 void uffd_close_fd(int uffd_fd)
144 assert(uffd_fd >= 0);
149 * uffd_register_memory: register memory range via UFFD-IO
151 * Returns 0 in case of success, negative value in case of an error
153 * @uffd_fd: UFFD file descriptor
154 * @addr: base address of memory range
155 * @length: length of memory range
156 * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
157 * @ioctls: optional pointer to receive supported IOCTL mask
/*
 * Fills a struct uffdio_register with the range [addr, addr + length) and the
 * requested mode, then issues UFFDIO_REGISTER on uffd_fd. A failing ioctl is
 * traced together with errno. The ioctl mask filled in by the kernel is
 * copied to *ioctls.
 *
 * NOTE(review): the return statements and any NULL check guarding the write
 * through ioctls are not visible in this listing.
 */
159 int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
160 uint64_t mode, uint64_t *ioctls)
/* Only start, len and mode are set before the call; ioctls is an output. */
162 struct uffdio_register uffd_register;
164 uffd_register.range.start = (uintptr_t) addr;
165 uffd_register.range.len = length;
166 uffd_register.mode = mode;
168 if (ioctl(uffd_fd, UFFDIO_REGISTER, &uffd_register)) {
169 trace_uffd_register_memory_failed(addr, length, mode, errno);
173 *ioctls = uffd_register.ioctls;
180 * uffd_unregister_memory: un-register memory range with UFFD-IO
182 * Returns 0 in case of success, negative value in case of an error
184 * @uffd_fd: UFFD file descriptor
185 * @addr: base address of memory range
186 * @length: length of memory range
/*
 * Describes the range [addr, addr + length) in a struct uffdio_range and
 * issues UFFDIO_UNREGISTER on uffd_fd. A failing ioctl is traced together
 * with errno.
 *
 * NOTE(review): the return statements are not visible in this listing.
 */
188 int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
190 struct uffdio_range uffd_range;
192 uffd_range.start = (uintptr_t) addr;
193 uffd_range.len = length;
195 if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &uffd_range)) {
196 trace_uffd_unregister_memory_failed(addr, length, errno);
204 * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO
206 * Returns 0 on success, negative value in case of error
208 * @uffd_fd: UFFD file descriptor
209 * @addr: base address of memory range
210 * @length: length of memory range
211 * @wp: write-protect/unprotect
212 * @dont_wake: do not wake threads waiting on wr-protected page
/*
 * Builds a struct uffdio_writeprotect for the range [addr, addr + length)
 * and issues UFFDIO_WRITEPROTECT on uffd_fd. A failing ioctl is reported
 * through error_report() with the address, length, mode and errno.
 *
 * NOTE(review): the line separating the two mode assignments (presumably an
 * else) and the return statements are not visible in this listing.
 */
214 int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
215 bool wp, bool dont_wake)
217 struct uffdio_writeprotect uffd_writeprotect;
219 uffd_writeprotect.range.start = (uintptr_t) addr;
220 uffd_writeprotect.range.len = length;
/* dont_wake is honoured only when protection is being removed. */
221 if (!wp && dont_wake) {
222 /* DONTWAKE is meaningful only on protection release */
223 uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
/* Otherwise: the WP flag when protecting, no flags when unprotecting. */
225 uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0);
228 if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
229 error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
230 " mode=%" PRIx64 " errno=%i", addr, length,
231 (uint64_t) uffd_writeprotect.mode, errno);
239 * uffd_copy_page: copy range of pages to destination via UFFD-IO
241 * Copy a range of source pages to the destination to resolve
242 * a missing page fault somewhere in the destination range.
244 * Returns 0 on success, negative value in case of an error
246 * @uffd_fd: UFFD file descriptor
247 * @dst_addr: destination base address
248 * @src_addr: source base address
249 * @length: length of the range to copy
250 * @dont_wake: do not wake threads waiting on missing page
/*
 * Builds a struct uffdio_copy from dst_addr, src_addr and length, sets
 * UFFDIO_COPY_MODE_DONTWAKE when dont_wake is true (no mode flags otherwise),
 * and issues UFFDIO_COPY on uffd_fd. A failing ioctl is reported through
 * error_report() with both addresses, the length, the mode and errno.
 *
 * NOTE(review): the return statements are not visible in this listing.
 */
252 int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
253 uint64_t length, bool dont_wake)
255 struct uffdio_copy uffd_copy;
257 uffd_copy.dst = (uintptr_t) dst_addr;
258 uffd_copy.src = (uintptr_t) src_addr;
259 uffd_copy.len = length;
260 uffd_copy.mode = dont_wake ? UFFDIO_COPY_MODE_DONTWAKE : 0;
262 if (ioctl(uffd_fd, UFFDIO_COPY, &uffd_copy)) {
263 error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
264 " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
265 length, (uint64_t) uffd_copy.mode, errno);
273 * uffd_zero_page: fill range of pages with zeroes via UFFD-IO
275 * Fill a range of pages with zeroes to resolve a missing page fault within the range.
277 * Returns 0 on success, negative value in case of an error
279 * @uffd_fd: UFFD file descriptor
280 * @addr: base address
281 * @length: length of the range to fill with zeroes
282 * @dont_wake: do not wake threads waiting on missing page
/*
 * Builds a struct uffdio_zeropage for the range [addr, addr + length), sets
 * UFFDIO_ZEROPAGE_MODE_DONTWAKE when dont_wake is true (no mode flags
 * otherwise), and issues UFFDIO_ZEROPAGE on uffd_fd. A failing ioctl is
 * reported through error_report() with the address, length, mode and errno.
 *
 * NOTE(review): the return statements are not visible in this listing.
 */
284 int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
286 struct uffdio_zeropage uffd_zeropage;
288 uffd_zeropage.range.start = (uintptr_t) addr;
289 uffd_zeropage.range.len = length;
290 uffd_zeropage.mode = dont_wake ? UFFDIO_ZEROPAGE_MODE_DONTWAKE : 0;
292 if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &uffd_zeropage)) {
293 error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
294 " mode=%" PRIx64 " errno=%i", addr, length,
295 (uint64_t) uffd_zeropage.mode, errno);
303 * uffd_wakeup: wake up threads waiting on UFFD-managed page fault resolution
305 * Wake up threads waiting on any page/pages from the designated range.
306 * The main use case is when during some period, page faults are resolved
307 * via UFFD-IO IOCTLs with MODE_DONTWAKE flag set, then after that all waits
308 * for the whole memory range are satisfied in a single call to uffd_wakeup().
310 * Returns 0 on success, negative value in case of an error
312 * @uffd_fd: UFFD file descriptor
313 * @addr: base address
314 * @length: length of the range
/*
 * Describes the range [addr, addr + length) in a struct uffdio_range and
 * issues UFFDIO_WAKE on uffd_fd. A failing ioctl is reported through
 * error_report() with the address, length and errno.
 *
 * NOTE(review): the return statements are not visible in this listing.
 */
316 int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
318 struct uffdio_range uffd_range;
320 uffd_range.start = (uintptr_t) addr;
321 uffd_range.len = length;
323 if (ioctl(uffd_fd, UFFDIO_WAKE, &uffd_range)) {
324 error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
325 addr, length, errno);
333 * uffd_read_events: read pending UFFD events
335 * Returns number of fetched messages, 0 if none is available or
336 * negative value in case of an error
338 * @uffd_fd: UFFD file descriptor
339 * @msgs: pointer to message buffer
340 * @count: number of messages that can fit in the buffer
/*
 * Reads up to count * sizeof(struct uffd_msg) bytes from uffd_fd into msgs,
 * repeating the read while it fails with EINTR, and returns the number of
 * bytes read divided by the size of one message.
 *
 * NOTE(review): the declaration of res, the start of the retry loop, the
 * statements executed for the EAGAIN case and the condition guarding the
 * error report are not visible in this listing.
 */
342 int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
346 res = read(uffd_fd, msgs, count * sizeof(struct uffd_msg));
/* Retry when the read was interrupted by a signal. */
347 } while (res < 0 && errno == EINTR);
/*
 * EAGAIN is singled out from other read errors; presumably this yields the
 * documented result of zero messages (body not visible, TODO confirm).
 */
349 if ((res < 0 && errno == EAGAIN)) {
353 error_report("uffd_read_events() failed: errno=%i", errno);
/* Convert the byte count into a count of whole uffd_msg records. */
357 return (int) (res / sizeof(struct uffd_msg));
361 * uffd_poll_events: poll UFFD file descriptor for read
363 * Returns true if events are available for read, false otherwise
365 * @uffd_fd: UFFD file descriptor
366 * @tmo: timeout value
/*
 * Polls uffd_fd for POLLIN, repeating the poll while it fails with EINTR,
 * and returns whether POLLIN is set in the returned events. tmo is passed
 * directly as the timeout argument of poll().
 *
 * NOTE(review): the declaration of res, the start of the retry loop, the
 * conditions guarding the error report and any early returns are not visible
 * in this listing.
 */
368 bool uffd_poll_events(int uffd_fd, int tmo)
/* Single-entry poll set; revents starts cleared. */
371 struct pollfd poll_fd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };
374 res = poll(&poll_fd, 1, tmo);
/* Retry when the poll was interrupted by a signal. */
375 } while (res < 0 && errno == EINTR);
381 error_report("uffd_poll_events() failed: errno=%i", errno);
385 return (poll_fd.revents & POLLIN) != 0;