#include "kvm/kvm.h" #include "kvm/vfio.h" #include "kvm/ioport.h" #include #define VFIO_DEV_DIR "/dev/vfio" #define VFIO_DEV_NODE VFIO_DEV_DIR "/vfio" #define IOMMU_GROUP_DIR "/sys/kernel/iommu_groups" static int vfio_container; static LIST_HEAD(vfio_groups); static struct vfio_device *vfio_devices; static int vfio_device_pci_parser(const struct option *opt, char *arg, struct vfio_device_params *dev) { unsigned int domain, bus, devnr, fn; int nr = sscanf(arg, "%4x:%2x:%2x.%1x", &domain, &bus, &devnr, &fn); if (nr < 4) { domain = 0; nr = sscanf(arg, "%2x:%2x.%1x", &bus, &devnr, &fn); if (nr < 3) { pr_err("Invalid device identifier %s", arg); return -EINVAL; } } dev->type = VFIO_DEVICE_PCI; dev->bus = "pci"; dev->name = malloc(13); if (!dev->name) return -ENOMEM; snprintf(dev->name, 13, "%04x:%02x:%02x.%x", domain, bus, devnr, fn); return 0; } int vfio_device_parser(const struct option *opt, const char *arg, int unset) { int ret = -EINVAL; static int idx = 0; struct kvm *kvm = opt->ptr; struct vfio_device_params *dev, *devs; char *cur, *buf = strdup(arg); if (!buf) return -ENOMEM; if (idx >= MAX_VFIO_DEVICES) { pr_warning("Too many VFIO devices"); goto out_free_buf; } devs = realloc(kvm->cfg.vfio_devices, sizeof(*dev) * (idx + 1)); if (!devs) { ret = -ENOMEM; goto out_free_buf; } kvm->cfg.vfio_devices = devs; dev = &devs[idx]; cur = strtok(buf, ","); if (!cur) goto out_free_buf; if (!strcmp(opt->long_name, "vfio-pci")) ret = vfio_device_pci_parser(opt, cur, dev); else ret = -EINVAL; if (!ret) kvm->cfg.num_vfio_devices = ++idx; out_free_buf: free(buf); return ret; } static bool vfio_ioport_in(struct vfio_region *region, u32 offset, void *data, int len) { struct vfio_device *vdev = region->vdev; ssize_t nr; u32 val; if (!(region->info.flags & VFIO_REGION_INFO_FLAG_READ)) return false; nr = pread(vdev->fd, &val, len, region->info.offset + offset); if (nr != len) { vfio_dev_err(vdev, "could not read %d bytes from I/O port 0x%x\n", len, offset + region->port_base); return false; } switch (len) { case 1: ioport__write8(data, val); break; case 2: ioport__write16(data, val); break; case 4: ioport__write32(data, val); break; default: return false; } return true; } static bool vfio_ioport_out(struct vfio_region *region, u32 offset, void *data, int len) { struct vfio_device *vdev = region->vdev; ssize_t nr; u32 val; if (!(region->info.flags & VFIO_REGION_INFO_FLAG_WRITE)) return false; switch (len) { case 1: val = ioport__read8(data); break; case 2: val = ioport__read16(data); break; case 4: val = ioport__read32(data); break; default: return false; } nr = pwrite(vdev->fd, &val, len, region->info.offset + offset); if (nr != len) vfio_dev_err(vdev, "could not write %d bytes to I/O port 0x%x", len, offset + region->port_base); return nr == len; } static void vfio_ioport_mmio(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len, u8 is_write, void *ptr) { struct vfio_region *region = ptr; u32 offset = addr - region->port_base; if (is_write) vfio_ioport_out(region, offset, data, len); else vfio_ioport_in(region, offset, data, len); } static void vfio_mmio_access(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len, u8 is_write, void *ptr) { u64 val; ssize_t nr; struct vfio_region *region = ptr; struct vfio_device *vdev = region->vdev; u32 offset = addr - region->guest_phys_addr; if (len < 1 || len > 8) goto err_report; if (is_write) { if (!(region->info.flags & VFIO_REGION_INFO_FLAG_WRITE)) goto err_report; memcpy(&val, data, len); nr = pwrite(vdev->fd, &val, len, region->info.offset + offset); if ((u32)nr != len) goto err_report; } else { if (!(region->info.flags & VFIO_REGION_INFO_FLAG_READ)) goto err_report; nr = pread(vdev->fd, &val, len, region->info.offset + offset); if ((u32)nr != len) goto err_report; memcpy(data, &val, len); } return; err_report: vfio_dev_err(vdev, "could not %s %u bytes at 0x%x (0x%llx)", is_write ? "write" : "read", len, offset, addr); } static int vfio_setup_trap_region(struct kvm *kvm, struct vfio_device *vdev, struct vfio_region *region) { if (region->is_ioport) { int port; port = kvm__register_pio(kvm, region->port_base, region->info.size, vfio_ioport_mmio, region); if (port < 0) return port; return 0; } return kvm__register_mmio(kvm, region->guest_phys_addr, region->info.size, false, vfio_mmio_access, region); } int vfio_map_region(struct kvm *kvm, struct vfio_device *vdev, struct vfio_region *region) { void *base; int ret, prot = 0; /* KVM needs page-aligned regions */ u64 map_size = ALIGN(region->info.size, PAGE_SIZE); if (!(region->info.flags & VFIO_REGION_INFO_FLAG_MMAP)) return vfio_setup_trap_region(kvm, vdev, region); /* * KVM_SET_USER_MEMORY_REGION will fail because the guest physical * address isn't page aligned, let's emulate the region ourselves. */ if (region->guest_phys_addr & (PAGE_SIZE - 1)) return kvm__register_mmio(kvm, region->guest_phys_addr, region->info.size, false, vfio_mmio_access, region); if (region->info.flags & VFIO_REGION_INFO_FLAG_READ) prot |= PROT_READ; if (region->info.flags & VFIO_REGION_INFO_FLAG_WRITE) prot |= PROT_WRITE; base = mmap(NULL, region->info.size, prot, MAP_SHARED, vdev->fd, region->info.offset); if (base == MAP_FAILED) { /* TODO: support sparse mmap */ vfio_dev_warn(vdev, "failed to mmap region %u (0x%llx bytes), falling back to trapping", region->info.index, region->info.size); return vfio_setup_trap_region(kvm, vdev, region); } region->host_addr = base; ret = kvm__register_dev_mem(kvm, region->guest_phys_addr, map_size, region->host_addr); if (ret) { vfio_dev_err(vdev, "failed to register region with KVM"); return ret; } return 0; } void vfio_unmap_region(struct kvm *kvm, struct vfio_region *region) { u64 map_size; if (region->host_addr) { map_size = ALIGN(region->info.size, PAGE_SIZE); kvm__destroy_mem(kvm, region->guest_phys_addr, map_size, region->host_addr); munmap(region->host_addr, region->info.size); region->host_addr = NULL; } else if (region->is_ioport) { kvm__deregister_pio(kvm, region->port_base); } else { kvm__deregister_mmio(kvm, region->guest_phys_addr); } } static int vfio_configure_device(struct kvm *kvm, struct vfio_device *vdev) { int ret; struct vfio_group *group = vdev->group; vdev->fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, vdev->params->name); if (vdev->fd < 0) { vfio_dev_warn(vdev, "failed to get fd"); /* The device might be a bridge without an fd */ return 0; } vdev->info.argsz = sizeof(vdev->info); if (ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &vdev->info)) { ret = -errno; vfio_dev_err(vdev, "failed to get info"); goto err_close_device; } if (vdev->info.flags & VFIO_DEVICE_FLAGS_RESET && ioctl(vdev->fd, VFIO_DEVICE_RESET) < 0) vfio_dev_warn(vdev, "failed to reset device"); vdev->regions = calloc(vdev->info.num_regions, sizeof(*vdev->regions)); if (!vdev->regions) { ret = -ENOMEM; goto err_close_device; } /* Now for the bus-specific initialization... */ switch (vdev->params->type) { case VFIO_DEVICE_PCI: BUG_ON(!(vdev->info.flags & VFIO_DEVICE_FLAGS_PCI)); ret = vfio_pci_setup_device(kvm, vdev); break; default: BUG_ON(1); ret = -EINVAL; } if (ret) goto err_free_regions; vfio_dev_info(vdev, "assigned to device number 0x%x in group %lu", vdev->dev_hdr.dev_num, group->id); return 0; err_free_regions: free(vdev->regions); err_close_device: close(vdev->fd); return ret; } static int vfio_configure_devices(struct kvm *kvm) { int i, ret; for (i = 0; i < kvm->cfg.num_vfio_devices; ++i) { ret = vfio_configure_device(kvm, &vfio_devices[i]); if (ret) return ret; } return 0; } static int vfio_get_iommu_type(void) { if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)) return VFIO_TYPE1v2_IOMMU; if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) return VFIO_TYPE1_IOMMU; return -ENODEV; } static int vfio_map_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data) { int ret = 0; struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map), .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, .vaddr = (unsigned long)bank->host_addr, .iova = (u64)bank->guest_phys_addr, .size = bank->size, }; /* Map the guest memory for DMA (i.e. provide isolation) */ if (ioctl(vfio_container, VFIO_IOMMU_MAP_DMA, &dma_map)) { ret = -errno; pr_err("Failed to map 0x%llx -> 0x%llx (%llu) for DMA", dma_map.iova, dma_map.vaddr, dma_map.size); } return ret; } static int vfio_unmap_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data) { struct vfio_iommu_type1_dma_unmap dma_unmap = { .argsz = sizeof(dma_unmap), .size = bank->size, .iova = bank->guest_phys_addr, }; ioctl(vfio_container, VFIO_IOMMU_UNMAP_DMA, &dma_unmap); return 0; } static int vfio_configure_reserved_regions(struct kvm *kvm, struct vfio_group *group) { FILE *file; int ret = 0; char type[9]; char filename[PATH_MAX]; unsigned long long start, end; snprintf(filename, PATH_MAX, IOMMU_GROUP_DIR "/%lu/reserved_regions", group->id); /* reserved_regions might not be present on older systems */ if (access(filename, F_OK)) return 0; file = fopen(filename, "r"); if (!file) return -errno; while (fscanf(file, "0x%llx 0x%llx %8s\n", &start, &end, type) == 3) { ret = kvm__reserve_mem(kvm, start, end - start + 1); if (ret) break; } fclose(file); return ret; } static int vfio_configure_groups(struct kvm *kvm) { int ret; struct vfio_group *group; list_for_each_entry(group, &vfio_groups, list) { ret = vfio_configure_reserved_regions(kvm, group); if (ret) return ret; } return 0; } static struct vfio_group *vfio_group_create(struct kvm *kvm, unsigned long id) { int ret; struct vfio_group *group; char group_node[PATH_MAX]; struct vfio_group_status group_status = { .argsz = sizeof(group_status), }; group = calloc(1, sizeof(*group)); if (!group) return NULL; group->id = id; group->refs = 1; ret = snprintf(group_node, PATH_MAX, VFIO_DEV_DIR "/%lu", id); if (ret < 0 || ret == PATH_MAX) return NULL; group->fd = open(group_node, O_RDWR); if (group->fd < 0) { pr_err("Failed to open IOMMU group %s", group_node); goto err_free_group; } if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &group_status)) { pr_err("Failed to determine status of IOMMU group %lu", id); goto err_close_group; } if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { pr_err("IOMMU group %lu is not viable", id); goto err_close_group; } if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &vfio_container)) { pr_err("Failed to add IOMMU group %lu to VFIO container", id); goto err_close_group; } list_add(&group->list, &vfio_groups); return group; err_close_group: close(group->fd); err_free_group: free(group); return NULL; } static void vfio_group_exit(struct kvm *kvm, struct vfio_group *group) { if (--group->refs != 0) return; ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER); list_del(&group->list); close(group->fd); free(group); } static struct vfio_group * vfio_group_get_for_dev(struct kvm *kvm, struct vfio_device *vdev) { int dirfd; ssize_t ret; char *group_name; unsigned long group_id; char group_path[PATH_MAX]; struct vfio_group *group = NULL; /* Find IOMMU group for this device */ dirfd = open(vdev->sysfs_path, O_DIRECTORY | O_PATH | O_RDONLY); if (dirfd < 0) { vfio_dev_err(vdev, "failed to open '%s'", vdev->sysfs_path); return NULL; } ret = readlinkat(dirfd, "iommu_group", group_path, PATH_MAX); if (ret < 0) { vfio_dev_err(vdev, "no iommu_group"); goto out_close; } if (ret == PATH_MAX) goto out_close; group_path[ret] = '\0'; group_name = basename(group_path); errno = 0; group_id = strtoul(group_name, NULL, 10); if (errno) goto out_close; list_for_each_entry(group, &vfio_groups, list) { if (group->id == group_id) { group->refs++; return group; } } group = vfio_group_create(kvm, group_id); out_close: close(dirfd); return group; } static int vfio_device_init(struct kvm *kvm, struct vfio_device *vdev) { int ret; char dev_path[PATH_MAX]; struct vfio_group *group; ret = snprintf(dev_path, PATH_MAX, "/sys/bus/%s/devices/%s", vdev->params->bus, vdev->params->name); if (ret < 0 || ret == PATH_MAX) return -EINVAL; vdev->sysfs_path = strndup(dev_path, PATH_MAX); if (!vdev->sysfs_path) return -errno; group = vfio_group_get_for_dev(kvm, vdev); if (!group) { free(vdev->sysfs_path); return -EINVAL; } vdev->group = group; return 0; } static void vfio_device_exit(struct kvm *kvm, struct vfio_device *vdev) { vfio_group_exit(kvm, vdev->group); switch (vdev->params->type) { case VFIO_DEVICE_PCI: vfio_pci_teardown_device(kvm, vdev); break; default: vfio_dev_warn(vdev, "no teardown function for device"); } close(vdev->fd); free(vdev->regions); free(vdev->sysfs_path); } static int vfio_container_init(struct kvm *kvm) { int api, i, ret, iommu_type;; /* Create a container for our IOMMU groups */ vfio_container = open(VFIO_DEV_NODE, O_RDWR); if (vfio_container == -1) { ret = errno; pr_err("Failed to open %s", VFIO_DEV_NODE); return ret; } api = ioctl(vfio_container, VFIO_GET_API_VERSION); if (api != VFIO_API_VERSION) { pr_err("Unknown VFIO API version %d", api); return -ENODEV; } iommu_type = vfio_get_iommu_type(); if (iommu_type < 0) { pr_err("VFIO type-1 IOMMU not supported on this platform"); return iommu_type; } /* Create groups for our devices and add them to the container */ for (i = 0; i < kvm->cfg.num_vfio_devices; ++i) { vfio_devices[i].params = &kvm->cfg.vfio_devices[i]; ret = vfio_device_init(kvm, &vfio_devices[i]); if (ret) return ret; } /* Finalise the container */ if (ioctl(vfio_container, VFIO_SET_IOMMU, iommu_type)) { ret = -errno; pr_err("Failed to set IOMMU type %d for VFIO container", iommu_type); return ret; } else { pr_info("Using IOMMU type %d for VFIO container", iommu_type); } return kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_map_mem_bank, NULL); } static int vfio__init(struct kvm *kvm) { int ret; if (!kvm->cfg.num_vfio_devices) return 0; vfio_devices = calloc(kvm->cfg.num_vfio_devices, sizeof(*vfio_devices)); if (!vfio_devices) return -ENOMEM; ret = vfio_container_init(kvm); if (ret) return ret; ret = vfio_configure_groups(kvm); if (ret) return ret; ret = vfio_configure_devices(kvm); if (ret) return ret; return 0; } dev_base_init(vfio__init); static int vfio__exit(struct kvm *kvm) { int i; if (!kvm->cfg.num_vfio_devices) return 0; for (i = 0; i < kvm->cfg.num_vfio_devices; i++) vfio_device_exit(kvm, &vfio_devices[i]); free(vfio_devices); kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_unmap_mem_bank, NULL); close(vfio_container); free(kvm->cfg.vfio_devices); return 0; } dev_base_exit(vfio__exit);