#include <unistd.h>
#include <time.h>
#include <pthread.h>
+#include <semaphore.h>
+#include <sys/types.h>
+#include <signal.h>
+#include <errno.h>
#include <linux/bitmap.h>
#include <linux/bitops.h>
+#include <asm/barrier.h>
#include "test_util.h"
#include "kvm_util.h"
# define test_and_clear_bit_le test_and_clear_bit
#endif
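+
+/*
+ * Number of entries in the dirty ring. Per the KVM dirty ring ABI the
+ * ring size in bytes (entries * sizeof(struct kvm_dirty_gfn), 16 bytes
+ * per entry) must be a power of two; 1024 entries gives a 16KiB ring.
+ */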
+#define TEST_DIRTY_RING_COUNT 1024
+
/*
* Guest/Host shared variables. Ensure addr_gva2hva() and/or
* sync_global_to/from_guest() are used when accessing from
static uint64_t host_clear_count;
static uint64_t host_track_next_count;
+/* Whether dirty ring reset is requested, or finished */
+static sem_t dirty_ring_vcpu_stop;
+static sem_t dirty_ring_vcpu_cont;
+/*
+ * This is only used for verifying the dirty pages. The dirty ring has a
+ * very tricky case when the ring just got full: KVM will do a userspace
+ * exit due to the ring being full. When that happens, the very last PFN
+ * is set but the data is not actually changed (the guest WRITE is not
+ * really applied yet), because KVM found that the dirty ring is full,
+ * refused to continue the vcpu, and recorded the dirty gfn with the old
+ * contents.
+ *
+ * For this specific case it's safe to skip checking this pfn, because
+ * the bit is redundant: when the write is applied later, the bit will
+ * be set again. We use this variable to always keep track of the latest
+ * dirty gfn we've collected, so that if a data mismatch is found later
+ * in the verifying process, we let it pass.
+ */
+static uint64_t dirty_ring_last_page;
+
enum log_mode_t {
/* Only use KVM_GET_DIRTY_LOG for logging */
LOG_MODE_DIRTY_LOG = 0,
/* Use both KVM_[GET|CLEAR]_DIRTY_LOG for logging */
LOG_MODE_CLEAR_LOG = 1,
+ /* Use dirty ring for logging */
+ LOG_MODE_DIRTY_RING = 2,
+
LOG_MODE_NUM,
/* Run all supported modes */
static enum log_mode_t host_log_mode_option = LOG_MODE_ALL;
/* Logging mode for current run */
static enum log_mode_t host_log_mode;
+static pthread_t vcpu_thread;
+
+/*
+ * The test plays signal tricks with the vcpu thread, so use a variant
+ * of sem_wait() that retries when interrupted by a signal (EINTR).
+ */
+static void sem_wait_until(sem_t *sem)
+{
+ int ret;
+
+ do
+ ret = sem_wait(sem);
+ while (ret == -1 && errno == EINTR);
+}
static bool clear_log_supported(void)
{
kvm_vm_clear_dirty_log(vm, slot, bitmap, 0, num_pages);
}
-static void default_after_vcpu_run(struct kvm_vm *vm)
+static void default_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
{
struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+ TEST_ASSERT(ret == 0 || (ret == -1 && err == EINTR),
+ "vcpu run failed: errno=%d", err);
+
TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC,
"Invalid guest sync status: exit_reason=%s\n",
exit_reason_str(run->exit_reason));
}
+static bool dirty_ring_supported(void)
+{
+ return kvm_check_cap(KVM_CAP_DIRTY_LOG_RING);
+}
+
+static void dirty_ring_create_vm_done(struct kvm_vm *vm)
+{
+ /*
+ * Switch to dirty ring mode after VM creation but before any
+ * of the vcpu creation.
+ */
+ vm_enable_dirty_ring(vm, TEST_DIRTY_RING_COUNT *
+ sizeof(struct kvm_dirty_gfn));
+}
+
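+/*
+ * Entry life cycle, as documented in the dirty ring section of
+ * Documentation/virt/kvm/api.rst: KVM publishes an entry by filling in
+ * slot/offset and setting flags to KVM_DIRTY_GFN_F_DIRTY; userspace
+ * collects it and marks it KVM_DIRTY_GFN_F_RESET, and a later
+ * KVM_RESET_DIRTY_RINGS recycles such entries for reuse.
+ */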
+static inline bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
+{
+ /*
+ * Load-acquire (<asm/barrier.h>): don't let the reads of slot and
+ * offset pass the check that KVM has published the entry as dirty.
+ */
+ return smp_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY;
+}
+
+static inline void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
+{
+ /* Store-release: order the reads of slot/offset before the reset */
+ smp_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET);
+}
+
+static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns,
+ int slot, void *bitmap,
+ uint32_t num_pages, uint32_t *fetch_index)
+{
+ struct kvm_dirty_gfn *cur;
+ uint32_t count = 0;
+
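+ /*
+ * fetch_index free-runs; only its low bits index the ring (see
+ * the modulo below), so it never needs an explicit wrap-around.
+ */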
+ while (true) {
+ cur = &dirty_gfns[*fetch_index % TEST_DIRTY_RING_COUNT];
+ if (!dirty_gfn_is_dirtied(cur))
+ break;
+ TEST_ASSERT(cur->slot == slot, "Slot number didn't match: "
+ "%u != %u", cur->slot, slot);
+ TEST_ASSERT(cur->offset < num_pages, "Offset overflow: "
+ "0x%llx >= 0x%x", cur->offset, num_pages);
+ set_bit_le(cur->offset, bitmap);
+ dirty_ring_last_page = cur->offset;
+ dirty_gfn_set_collected(cur);
+ (*fetch_index)++;
+ count++;
+ }
+
+ return count;
+}
+
+static void dirty_ring_wait_vcpu(void)
+{
+ sem_wait_until(&dirty_ring_vcpu_stop);
+}
+
+static void dirty_ring_continue_vcpu(void)
+{
+ pr_info("Notifying vcpu to continue\n");
+ sem_post(&dirty_ring_vcpu_cont);
+}
+
+static void dirty_ring_collect_dirty_pages(struct kvm_vm *vm, int slot,
+ void *bitmap, uint32_t num_pages)
+{
+ /* A single static fetch index suffices since we only have one vcpu */
+ static uint32_t fetch_index = 0;
+ uint32_t count = 0, cleared;
+
+ dirty_ring_wait_vcpu();
+
+ count = dirty_ring_collect_one(vcpu_map_dirty_ring(vm, VCPU_ID),
+ slot, bitmap, num_pages, &fetch_index);
+
+ cleared = kvm_vm_reset_dirty_ring(vm);
+
+ /* Cleared pages should be the same as collected */
+ TEST_ASSERT(cleared == count, "Reset dirty pages (%u) mismatch "
+ "with collected (%u)", cleared, count);
+
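+ /*
+ * The vcpu may be blocked on a ring-full exit; resume it only
+ * after the ring has been reset so there is room for new dirty
+ * gfns.
+ */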
+ dirty_ring_continue_vcpu();
+
+ pr_info("Iteration %ld collected %u pages\n", iteration, count);
+}
+
+static void dirty_ring_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
+{
+ struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+
+ /* A ucall-sync or ring-full event is allowed */
+ if (get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC) {
+ /* We should allow this to continue */
+ ;
+ } else if (run->exit_reason == KVM_EXIT_DIRTY_RING_FULL) {
+ /* Notify the main thread before pausing, so it can start collecting */
+ sem_post(&dirty_ring_vcpu_stop);
+ pr_info("vcpu stops because dirty ring is full...\n");
+ sem_wait_until(&dirty_ring_vcpu_cont);
+ pr_info("vcpu continues now.\n");
+ } else {
+ TEST_ASSERT(false, "Invalid guest sync status: "
+ "exit_reason=%s\n",
+ exit_reason_str(run->exit_reason));
+ }
+}
+
+static void dirty_ring_before_vcpu_join(void)
+{
+ /*
+ * Post once more so a vcpu blocked on a ring-full exit wakes up
+ * and can observe host_quit.
+ */
+ sem_post(&dirty_ring_vcpu_cont);
+}
+
struct log_mode {
const char *name;
/* Return true if this mode is supported, otherwise false */
void (*collect_dirty_pages) (struct kvm_vm *vm, int slot,
void *bitmap, uint32_t num_pages);
/* Hook to call after each vcpu run */
- void (*after_vcpu_run)(struct kvm_vm *vm);
+ void (*after_vcpu_run)(struct kvm_vm *vm, int ret, int err);
+ /* Hook to call right before waiting for the vcpu thread to exit */
+ void (*before_vcpu_join)(void);
} log_modes[LOG_MODE_NUM] = {
{
.name = "dirty-log",
.collect_dirty_pages = clear_log_collect_dirty_pages,
.after_vcpu_run = default_after_vcpu_run,
},
+ {
+ .name = "dirty-ring",
+ .supported = dirty_ring_supported,
+ .create_vm_done = dirty_ring_create_vm_done,
+ .collect_dirty_pages = dirty_ring_collect_dirty_pages,
+ .before_vcpu_join = dirty_ring_before_vcpu_join,
+ .after_vcpu_run = dirty_ring_after_vcpu_run,
+ },
};
/*
mode->collect_dirty_pages(vm, slot, bitmap, num_pages);
}
-static void log_mode_after_vcpu_run(struct kvm_vm *vm)
+static void log_mode_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
{
struct log_mode *mode = &log_modes[host_log_mode];
if (mode->after_vcpu_run)
- mode->after_vcpu_run(vm);
+ mode->after_vcpu_run(vm, ret, err);
+}
+
+static void log_mode_before_vcpu_join(void)
+{
+ struct log_mode *mode = &log_modes[host_log_mode];
+
+ if (mode->before_vcpu_join)
+ mode->before_vcpu_join();
}
static void generate_random_array(uint64_t *guest_array, uint64_t size)
static void *vcpu_worker(void *data)
{
- int ret;
+ int ret, vcpu_fd;
struct kvm_vm *vm = data;
uint64_t *guest_array;
uint64_t pages_count = 0;
+ vcpu_fd = vcpu_get_fd(vm, VCPU_ID);
+
guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array);
while (!READ_ONCE(host_quit)) {
generate_random_array(guest_array, TEST_PAGES_PER_LOOP);
pages_count += TEST_PAGES_PER_LOOP;
/* Let the guest dirty the random pages */
- ret = _vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
- log_mode_after_vcpu_run(vm);
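+ /*
+ * Invoke KVM_RUN directly rather than via _vcpu_run(), so that
+ * ret and errno reach the per-mode handler, which may treat
+ * ret == -1 && errno == EINTR as legitimate (see
+ * default_after_vcpu_run()).
+ */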
+ ret = ioctl(vcpu_fd, KVM_RUN, NULL);
+ log_mode_after_vcpu_run(vm, ret, errno);
}
pr_info("Dirtied %"PRIu64" pages\n", pages_count);
uint64_t step = vm_num_host_pages(mode, 1);
uint64_t page;
uint64_t *value_ptr;
+ uint64_t min_iter = 0;
for (page = 0; page < host_num_pages; page += step) {
value_ptr = host_test_mem + page * host_page_size;
}
if (test_and_clear_bit_le(page, bmap)) {
+ bool matched;
+
host_dirty_count++;
+
/*
* If the bit is set, the value written onto
* the corresponding page should be either the
* previous iteration number or the current one.
*/
- TEST_ASSERT(*value_ptr == iteration ||
- *value_ptr == iteration - 1,
+ matched = (*value_ptr == iteration ||
+ *value_ptr == iteration - 1);
+
+ if (host_log_mode == LOG_MODE_DIRTY_RING && !matched) {
+ if (*value_ptr == iteration - 2 && min_iter <= iteration - 2) {
+ /*
+ * Short answer: this case is special only for the dirty
+ * ring test, where the page is the last page written
+ * before a kvm dirty ring full in iteration N-2.
+ *
+ * Long answer: assuming ring size R, one possible
+ * condition is:
+ *
+ *     main thr              vcpu thr
+ *     --------              --------
+ *   iter=1
+ *                       write 1 to page 0~(R-1)
+ *                       full, vmexit
+ *   collect 0~(R-1)
+ *   kick vcpu
+ *                       write 1 to (R-1)~(2R-2)
+ *                       full, vmexit
+ *   iter=2
+ *   collect (R-1)~(2R-2)
+ *   kick vcpu
+ *                       write 1 to (2R-2)
+ *                       (NOTE!!! "1" cached in cpu reg)
+ *                       write 2 to (2R-1)~(3R-3)
+ *                       full, vmexit
+ *   iter=3
+ *   collect (2R-2)~(3R-3)
+ *   (here the value read on page (2R-2) is 1,
+ *    while iter=3!!!)
+ *
+ * This however can only happen once per iteration.
+ */
+ min_iter = iteration - 1;
+ continue;
+ } else if (page == dirty_ring_last_page) {
+ /*
+ * Please refer to comments in
+ * dirty_ring_last_page.
+ */
+ continue;
+ }
+ }
+
+ TEST_ASSERT(matched,
"Set page %"PRIu64" value %"PRIu64
" incorrect (iteration=%"PRIu64")",
page, *value_ptr, iteration);
static void run_test(enum vm_guest_mode mode, unsigned long iterations,
unsigned long interval, uint64_t phys_offset)
{
- pthread_t vcpu_thread;
struct kvm_vm *vm;
unsigned long *bmap;
/* Tell the vcpu thread to quit */
host_quit = true;
+ log_mode_before_vcpu_join();
pthread_join(vcpu_thread, NULL);
pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), "
unsigned int mode;
int opt, i, j;
+ sem_init(&dirty_ring_vcpu_stop, 0, 0);
+ sem_init(&dirty_ring_vcpu_cont, 0, 0);
+
#ifdef __x86_64__
guest_mode_init(VM_MODE_PXXV48_4K, true, true);
#endif
return r;
}
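+/*
+ * Enable the dirty ring for @vm. As noted at the call site, this must
+ * happen after VM creation but before any vcpu is created; @ring_size
+ * is the size of each vcpu's ring, in bytes.
+ */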
+void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size)
+{
+ struct kvm_enable_cap cap = { 0 };
+
+ cap.cap = KVM_CAP_DIRTY_LOG_RING;
+ cap.args[0] = ring_size;
+ vm_enable_cap(vm, &cap);
+ vm->dirty_ring_size = ring_size;
+}
+
static void vm_open(struct kvm_vm *vm, int perm)
{
vm->kvm_fd = open(KVM_DEV_PATH, perm);
__func__, strerror(-ret));
}
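+/*
+ * Reset all dirty rings of the VM. Per the KVM API,
+ * KVM_RESET_DIRTY_RINGS returns the number of dirty gfn entries that
+ * were reset, which the test compares against the collected count.
+ */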
+uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm)
+{
+ return ioctl(vm->fd, KVM_RESET_DIRTY_RINGS);
+}
+
/*
* Userspace Memory Region Find
*
*
* Removes a vCPU from a VM and frees its resources.
*/
-static void vm_vcpu_rm(struct vcpu *vcpu)
+static void vm_vcpu_rm(struct kvm_vm *vm, struct vcpu *vcpu)
{
int ret;
+ if (vcpu->dirty_gfns) {
+ ret = munmap(vcpu->dirty_gfns, vm->dirty_ring_size);
+ TEST_ASSERT(ret == 0, "munmap of VCPU dirty ring failed, "
+ "rc: %i errno: %i", ret, errno);
+ vcpu->dirty_gfns = NULL;
+ }
+
ret = munmap(vcpu->state, sizeof(*vcpu->state));
TEST_ASSERT(ret == 0, "munmap of VCPU fd failed, rc: %i "
"errno: %i", ret, errno);
int ret;
list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list)
- vm_vcpu_rm(vcpu);
+ vm_vcpu_rm(vmp, vcpu);
ret = close(vmp->fd);
TEST_ASSERT(ret == 0, "Close of vm fd failed,\n"
return rc;
}
+int vcpu_get_fd(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ return vcpu->fd;
+}
+
void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid)
{
struct vcpu *vcpu = vcpu_find(vm, vcpuid);
return ret;
}
+void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct vcpu *vcpu;
+ uint32_t size = vm->dirty_ring_size;
+
+ TEST_ASSERT(size > 0, "Should enable dirty ring first");
+
+ vcpu = vcpu_find(vm, vcpuid);
+
+ TEST_ASSERT(vcpu, "Cannot find vcpu %u", vcpuid);
+
+ if (!vcpu->dirty_gfns) {
+ void *addr;
+
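+ /*
+ * The ring must be mapped shared and non-executable: check
+ * that KVM refuses private and executable mappings before
+ * creating the real PROT_READ|PROT_WRITE, MAP_SHARED mapping.
+ */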
+ addr = mmap(NULL, size, PROT_READ,
+ MAP_PRIVATE, vcpu->fd,
+ vm->page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
+ TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped private");
+
+ addr = mmap(NULL, size, PROT_READ | PROT_EXEC,
+ MAP_PRIVATE, vcpu->fd,
+ vm->page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
+ TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped exec");
+
+ addr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, vcpu->fd,
+ vm->page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
+ TEST_ASSERT(addr != MAP_FAILED, "Dirty ring map failed");
+
+ vcpu->dirty_gfns = addr;
+ vcpu->dirty_gfns_count = size / sizeof(struct kvm_dirty_gfn);
+ }
+
+ return vcpu->dirty_gfns;
+}
+
/*
* VM Ioctl
*
{KVM_EXIT_INTERNAL_ERROR, "INTERNAL_ERROR"},
{KVM_EXIT_OSI, "OSI"},
{KVM_EXIT_PAPR_HCALL, "PAPR_HCALL"},
+ {KVM_EXIT_DIRTY_RING_FULL, "DIRTY_RING_FULL"},
#ifdef KVM_EXIT_MEMORY_NOT_PRESENT
{KVM_EXIT_MEMORY_NOT_PRESENT, "MEMORY_NOT_PRESENT"},
#endif