lkl: add PCI device interface and a vfio backend driver

LKL is now able to manage PCI devices by itself like DPDK!
This commit implements a PCI bus driver and PCI device interface.
The commit also includes a vfio-pci backend driver as a reference
implementation, so no extra kernel module is needed if
the PCI device is assigned to VFIO.

I believe the vfio backend driver fulfills the need of most cases,
but users can inject wrapper functions for another userspace PCI
framework such as uio-pci-generic or a handmade kernel module.
In either case, the framework should provide physically contiguous memory
for DMA, because the kernel and some drivers (e.g. NVMe) assume that their
memory is physically contiguous.

Signed-off-by: Shinichi Awamoto <shinichi.awamoto@gmail.com>
This commit is contained in:
Shinichi Awamoto
2020-08-20 10:49:55 +00:00
parent 5398299620
commit 96de6a9f88
23 changed files with 1065 additions and 25 deletions

View File

@@ -56,8 +56,8 @@ do_steps: &do_steps
command: | command: |
if [[ $CROSS_COMPILE == *android* ]]; then if [[ $CROSS_COMPILE == *android* ]]; then
emulator -avd Nexus5_API24 -no-window -no-audio -no-boot-anim; emulator -avd Nexus5_API24 -no-window -no-audio -no-boot-anim;
elif [[ $CROSS_COMPILE == *freebsd* ]]; then elif [[ $CROSS_COMPILE == *freebsd* ]] || [[ -n "$LKL_QEMU_TEST" ]]; then
cd /home/ubuntu && $QEMU cd /home/ubuntu && eval $QEMU
fi fi
background: true background: true
- run: cd tools/lkl && make -j8 ${MKARG} - run: cd tools/lkl && make -j8 ${MKARG}
@@ -71,7 +71,7 @@ do_steps: &do_steps
command: | command: |
if [[ $CROSS_COMPILE == *android* ]]; then if [[ $CROSS_COMPILE == *android* ]]; then
/home/ubuntu/circle-android.sh wait-for-boot; /home/ubuntu/circle-android.sh wait-for-boot;
elif [[ $CROSS_COMPILE == *freebsd* ]]; then elif [[ $CROSS_COMPILE == *freebsd* ]] || [[ -n "$LKL_QEMU_TEST" ]]; then
while ! $MYSSH -o ConnectTimeout=1 exit 2> /dev/null while ! $MYSSH -o ConnectTimeout=1 exit 2> /dev/null
do do
sleep 5 sleep 5
@@ -147,6 +147,15 @@ jobs:
VALGRIND: 1 VALGRIND: 1
<<: *do_steps <<: *do_steps
x86_64_qemu:
docker:
- image: lkldocker/circleci-qemu-x86_64:v1.1
environment:
CROSS_COMPILE: ""
MKARG: "dpdk=no"
LKL_QEMU_TEST: 1
<<: *do_steps
checkpatch: checkpatch:
docker: docker:
- image: lkldocker/circleci:0.5 - image: lkldocker/circleci:0.5
@@ -167,6 +176,7 @@ workflows:
build: build:
jobs: jobs:
- x86_64 - x86_64
- x86_64_qemu
- mingw32 - mingw32
- android-arm32 - android-arm32
- android-aarch64 - android-aarch64

View File

@@ -36,6 +36,7 @@ config LKL
select ARCH_NO_COHERENT_DMA_MMAP select ARCH_NO_COHERENT_DMA_MMAP
select HAVE_MEMBLOCK select HAVE_MEMBLOCK
select NO_BOOTMEM select NO_BOOTMEM
select BLK_DEV_NVME
config OUTPUT_FORMAT config OUTPUT_FORMAT
string "Output format" string "Output format"
@@ -93,4 +94,8 @@ config CONSOLE_LOGLEVEL_QUIET
will be used as the loglevel. IOW passing "quiet" will be the will be used as the loglevel. IOW passing "quiet" will be the
equivalent of passing "loglevel=<CONSOLE_LOGLEVEL_QUIET>" equivalent of passing "loglevel=<CONSOLE_LOGLEVEL_QUIET>"
config PCI
bool "PCI support"
select NO_GENERIC_PCI_IOPORT_MAP
select GENERIC_PCI_IOMAP
default y

View File

@@ -36,6 +36,7 @@ endif
core-y += arch/lkl/kernel/ core-y += arch/lkl/kernel/
core-y += arch/lkl/mm/ core-y += arch/lkl/mm/
core-y += arch/lkl/drivers/
all: lkl.o all: lkl.o

View File

@@ -0,0 +1,2 @@
obj-y = pci.o

271
arch/lkl/drivers/pci.c Normal file
View File

@@ -0,0 +1,271 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/types.h>
#include <linux/io.h>
#include <linux/platform_device.h>
#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>
#include <linux/mm.h>
#include <asm/host_ops.h>
/*
 * Config-space read hook for the LKL root bus.
 *
 * Only devfn 0 is forwarded to the host: the access is passed to the host's
 * pci_ops->read() with the bus's sysdata as the device handle, and counts as
 * successful only when the host reports exactly @size bytes.
 */
static int lkl_pci_generic_read(struct pci_bus *bus, unsigned int devfn,
				int where, int size, u32 *val)
{
	if (devfn != 0)
		return PCIBIOS_FUNC_NOT_SUPPORTED;

	if (lkl_ops->pci_ops->read(bus->sysdata, where, size, val) != size)
		return PCIBIOS_FUNC_NOT_SUPPORTED;

	return PCIBIOS_SUCCESSFUL;
}
/*
 * Config-space write hook for the LKL root bus.
 *
 * Only devfn 0 is forwarded to the host: @val is passed by address to the
 * host's pci_ops->write(), and the access counts as successful only when the
 * host reports exactly @size bytes.
 */
static int lkl_pci_generic_write(struct pci_bus *bus, unsigned int devfn,
				 int where, int size, u32 val)
{
	if (devfn != 0)
		return PCIBIOS_FUNC_NOT_SUPPORTED;

	if (lkl_ops->pci_ops->write(bus->sysdata, where, size, &val) != size)
		return PCIBIOS_FUNC_NOT_SUPPORTED;

	return PCIBIOS_SUCCESSFUL;
}
/*
 * I/O port mapping is not supported: reaching this hook panics with the
 * function name. The trailing return only satisfies the pointer return type.
 */
void __iomem *__pci_ioport_map(struct pci_dev *dev, unsigned long port,
unsigned int nr)
{
panic("%s is not supported\n", __func__);
return NULL;
}
/*
 * pci_walk_bus() callback: claim the resources of the host-backed device
 * (devfn 0 only) and replace their addresses with host-provided mappings.
 *
 * For every populated, not-yet-claimed resource:
 *  - memory resources are handed to the host's resource_alloc(); on success
 *    the resource's start/end are overridden with the returned address;
 *  - everything the host could not map (including non-memory resources) is
 *    flagged IORESOURCE_DISABLED so drivers do not touch it.
 *
 * Finally an IRQ is allocated and the host's irq_init() is invoked.
 *
 * Returns 0 on success (or for devices other than devfn 0), -ENOMEM when the
 * host fails to set up interrupts.
 */
static int lkl_pci_override_resource(struct pci_dev *dev, void *data)
{
	int i;
	struct resource *r;
	resource_size_t start, size;
	void *remapped_start;

	if (dev->devfn != 0)
		return 0;

	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
		r = &dev->resource[i];

		if (r->parent || !r->start || !r->flags)
			continue;

		dev_info(&dev->dev, "claiming resource %s/%d\n",
			 pci_name(dev), i);
		if (pci_claim_resource(dev, i)) {
			dev_err(&dev->dev,
				"Could not claim resource %s/%d!\n",
				pci_name(dev), i);
		}

		size = pci_resource_len(dev, i);

		/*
		 * Must be reset for every resource: otherwise a resource the
		 * host is never asked about would inherit the mapping of the
		 * previously handled BAR instead of being disabled.
		 */
		remapped_start = NULL;
		if (pci_resource_flags(dev, i) & IORESOURCE_MEM) {
			remapped_start = lkl_ops->pci_ops->resource_alloc(
				dev->sysdata, size, i);
		}

		if (remapped_start) {
			/* override values */
			start = (resource_size_t)remapped_start;
			pci_resource_start(dev, i) = start;
			pci_resource_end(dev, i) = start + size - 1;
		} else {
			/*
			 * A host library or the application could
			 * not handle the resource. Disable it
			 * not to be touched by drivers.
			 */
			pci_resource_flags(dev, i) |= IORESOURCE_DISABLED;
		}
	}

	dev->irq = lkl_get_free_irq("pci");

	if (lkl_ops->pci_ops->irq_init(dev->sysdata, dev->irq) < 0)
		return -ENOMEM;

	return 0;
}
/*
 * pci_walk_bus() callback used at shutdown: hands the device's host handle
 * (its sysdata) to the host's remove() hook. Always returns 0.
 */
static int lkl_pci_remove_devices(struct pci_dev *dev, void *data)
{
	void *host_handle = dev->sysdata;

	lkl_ops->pci_ops->remove(host_handle);

	return 0;
}
/*
 * Config-space accessors for the root bus; lkl_pci_probe() passes this table
 * to pci_scan_bus().
 */
static struct pci_ops lkl_pci_root_ops = {
.read = lkl_pci_generic_read,
.write = lkl_pci_generic_write,
};
/*
 * dma_map_ops.alloc: allocate 2^get_order(@size) pages and ask the host for
 * the DMA address of the resulting buffer.
 *
 * @dma_handle receives the value returned by the host's map_page().
 * Returns the kernel virtual address of the buffer, or NULL when the page
 * allocation fails (in which case @dma_handle is left untouched).
 */
static void *lkl_dma_alloc(struct device *dev, size_t size,
			   dma_addr_t *dma_handle, gfp_t gfp,
			   unsigned long attrs)
{
	struct page *page = alloc_pages(gfp, get_order(size));
	void *vaddr;

	/* page_to_virt(NULL) would yield a bogus, non-NULL address. */
	if (!page)
		return NULL;

	vaddr = page_to_virt(page);
	*dma_handle = (dma_addr_t)lkl_ops->pci_ops->map_page(
		to_pci_dev(dev)->sysdata, vaddr, size);
	return vaddr;
}
/*
 * dma_map_ops.free: undo lkl_dma_alloc().
 *
 * @cpu_addr is the kernel virtual address returned by lkl_dma_alloc(), so it
 * is released with free_pages(), which takes an address. __free_pages()
 * expects a struct page pointer and must not be given a virtual address.
 */
static void lkl_dma_free(struct device *dev, size_t size, void *cpu_addr,
			 dma_addr_t dma_addr, unsigned long attrs)
{
	lkl_ops->pci_ops->unmap_page(to_pci_dev(dev)->sysdata, dma_addr, size);
	free_pages((unsigned long)cpu_addr, get_order(size));
}
/*
 * dma_map_ops.map_page: translate (@page, @offset) to a DMA address through
 * the host's map_page() hook. A zero handle from the host is reported as
 * DMA_MAPPING_ERROR.
 */
static dma_addr_t lkl_dma_map_page(struct device *dev, struct page *page,
				   unsigned long offset, size_t size,
				   enum dma_data_direction dir,
				   unsigned long attrs)
{
	void *vaddr = page_to_virt(page) + offset;
	dma_addr_t handle;

	handle = (dma_addr_t)lkl_ops->pci_ops->map_page(
		to_pci_dev(dev)->sysdata, vaddr, size);

	return handle ? handle : DMA_MAPPING_ERROR;
}
/*
 * dma_map_ops.unmap_page: forward the unmap request to the host's
 * unmap_page() hook; @dir and @attrs are not used.
 */
static void lkl_dma_unmap_page(struct device *dev, dma_addr_t dma_addr,
			       size_t size, enum dma_data_direction dir,
			       unsigned long attrs)
{
	struct pci_dev *pdev = to_pci_dev(dev);

	lkl_ops->pci_ops->unmap_page(pdev->sysdata, dma_addr, size);
}
static int lkl_dma_map_sg(struct device *dev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir,
unsigned long attrs)
{
int i;
struct scatterlist *sg;
for_each_sg(sgl, sg, nents, i) {
void *va;
WARN_ON(!sg_page(sg));
va = sg_virt(sg);
sg_dma_address(sg) = (dma_addr_t)lkl_dma_map_page(
dev, sg_page(sg), sg->offset, sg->length, dir, attrs);
sg_dma_len(sg) = sg->length;
}
return nents;
}
static void lkl_dma_unmap_sg(struct device *dev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir,
unsigned long attrs)
{
int i;
struct scatterlist *sg;
for_each_sg(sgl, sg, nents, i)
lkl_dma_unmap_page(dev, sg_dma_address(sg), sg_dma_len(sg), dir,
attrs);
}
/* dma_map_ops.dma_supported: every mask is accepted (always returns 1). */
static int lkl_dma_supported(struct device *dev, u64 mask)
{
return 1;
}
/* Device name taken from the "lkl_pci" early parameter; NULL until set. */
static char *pcidev_name;

/*
 * Early-parameter handler for "lkl_pci".
 *
 * The first value is remembered in pcidev_name and 0 is returned. Any later
 * occurrence is reported, discarded, and answered with -1.
 */
static int __init setup_pci_device(char *str)
{
	if (!pcidev_name) {
		pcidev_name = str;
		return 0;
	}

	pr_info("The PCI driver supports only one PCI device.");
	pr_info("'%s' will be discarded.", str);
	return -1;
}
early_param("lkl_pci", setup_pci_device);
/*
 * DMA operations table built from the lkl_dma_* helpers in this file.
 * Non-static: it is the definition of the exported symbol lkl_dma_ops.
 */
const struct dma_map_ops lkl_dma_ops = {
.alloc = lkl_dma_alloc,
.free = lkl_dma_free,
.map_sg = lkl_dma_map_sg,
.unmap_sg = lkl_dma_unmap_sg,
.map_page = lkl_dma_map_page,
.unmap_page = lkl_dma_unmap_page,
.dma_supported = lkl_dma_supported,
};
static int lkl_pci_probe(struct platform_device *pdev)
{
struct lkl_pci_dev *dev;
struct pci_bus *bus;
if (!lkl_ops->pci_ops || !pcidev_name)
return -1;
dev = lkl_ops->pci_ops->add(pcidev_name, (void *)memory_start,
memory_end - memory_start);
if (!dev)
return -1;
bus = pci_scan_bus(0, &lkl_pci_root_ops, (void *)dev);
if (!bus) {
lkl_ops->pci_ops->remove(dev);
return -1;
}
pci_walk_bus(bus, lkl_pci_override_resource, NULL);
pci_bus_add_devices(bus);
dev_set_drvdata(&pdev->dev, bus);
return 0;
}
/*
 * Platform-driver shutdown: if probe stored a bus as driver data, walk it
 * and let the host release every device on it.
 */
static void lkl_pci_shutdown(struct platform_device *pdev)
{
	struct pci_bus *bus = (struct pci_bus *)dev_get_drvdata(&pdev->dev);

	if (!bus)
		return;

	pci_walk_bus(bus, lkl_pci_remove_devices, NULL);
}
/*
 * Platform driver named "lkl_pci"; lkl_pci_init() registers it together with
 * a platform device of the same name.
 */
static struct platform_driver lkl_pci_driver = {
.driver = {
.name = "lkl_pci",
},
.probe = lkl_pci_probe,
.shutdown = lkl_pci_shutdown,
};
/*
 * Subsystem initcall: register the "lkl_pci" platform driver and create the
 * matching platform device.
 *
 * Returns 0 on success or a negative errno. On failure everything set up so
 * far is torn down again, including the driver registration.
 */
static int __init lkl_pci_init(void)
{
	int ret;
	struct platform_device *dev;

	/* register a platform driver */
	ret = platform_driver_register(&lkl_pci_driver);
	if (ret != 0)
		return ret;

	dev = platform_device_alloc("lkl_pci", -1);
	if (!dev) {
		ret = -ENOMEM;
		goto err_unregister;
	}

	ret = platform_device_add(dev);
	if (ret != 0)
		goto err_put;

	return 0;

err_put:
	platform_device_put(dev);
err_unregister:
	/* Do not leave a driver registered without its device. */
	platform_driver_unregister(&lkl_pci_driver);
	return ret;
}
subsys_initcall(lkl_pci_init);

View File

@@ -14,8 +14,6 @@ generic-y += current.h
generic-y += delay.h generic-y += delay.h
generic-y += device.h generic-y += device.h
generic-y += div64.h generic-y += div64.h
generic-y += dma.h
generic-y += dma-mapping.h
generic-y += emergency-restart.h generic-y += emergency-restart.h
generic-y += errno.h generic-y += errno.h
generic-y += extable.h generic-y += extable.h
@@ -42,7 +40,7 @@ generic-y += module.h
generic-y += msgbuf.h generic-y += msgbuf.h
generic-y += param.h generic-y += param.h
generic-y += parport.h generic-y += parport.h
generic-y += pci.h generic-y += pci_iomap.h
generic-y += percpu.h generic-y += percpu.h
generic-y += pgalloc.h generic-y += pgalloc.h
generic-y += poll.h generic-y += poll.h
@@ -75,5 +73,6 @@ generic-y += topology.h
generic-y += trace_clock.h generic-y += trace_clock.h
generic-y += unaligned.h generic-y += unaligned.h
generic-y += user.h generic-y += user.h
generic-y += vga.h
generic-y += word-at-a-time.h generic-y += word-at-a-time.h
generic-y += kprobes.h generic-y += kprobes.h

View File

@@ -0,0 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_LKL_DMA_MAPPING_H
#define _ASM_LKL_DMA_MAPPING_H
/* The single DMA operations table used by LKL. */
extern const struct dma_map_ops lkl_dma_ops;
/* Returns &lkl_dma_ops regardless of @bus. */
static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
{
return &lkl_dma_ops;
}
#endif

View File

@@ -0,0 +1,13 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_LKL_DMA_H
#define _ASM_LKL_DMA_H
#include <asm-generic/dma.h>
/*
 * With CONFIG_PCI the flag is a real variable defined elsewhere; without it
 * the name expands to the constant 0.
 */
#ifdef CONFIG_PCI
extern int isa_dma_bridge_buggy;
#else
#define isa_dma_bridge_buggy (0)
#endif
#endif /* _ASM_LKL_DMA_H */

View File

@@ -0,0 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_LKL_PCI_H
#define _ASM_LKL_PCI_H
#include <asm-generic/pci.h>
/* Evaluates to 0 on LKL. */
#define pcibios_assign_all_busses() 0
/* NOTE(review): presumably the lowest I/O and memory addresses the PCI core may assign - confirm against the generic PCI code. */
#define PCIBIOS_MIN_IO 0x1000
#define PCIBIOS_MIN_MEM 0x10000000
#endif /* _ASM_LKL_PCI_H */

View File

@@ -9,6 +9,40 @@ typedef unsigned long lkl_thread_t;
struct lkl_jmp_buf { struct lkl_jmp_buf {
unsigned long buf[128]; unsigned long buf[128];
}; };
struct lkl_pci_dev;
/**
 * lkl_dev_pci_ops - PCI host operations
 *
 * These operations wrap a userspace PCI driver and
 * must be provided by a host library or by the application.
 *
 * @add - add a new PCI device; returns a handle or NULL on failure
 * @remove - release resources
 * @irq_init - allocate resources for interrupts
 * @read - read the PCI Configuration Space
 * @write - write the PCI Configuration Space
 * @resource_alloc - map BARx and return the mapped address. x is resource_index
 *
 * @map_page - return the DMA address of pages; vaddr might not be page-aligned
 * @unmap_page - cleanup DMA region if needed
 *
 */
struct lkl_dev_pci_ops {
struct lkl_pci_dev *(*add)(const char *name, void *kernel_ram,
unsigned long ram_size);
void (*remove)(struct lkl_pci_dev *dev);
int (*irq_init)(struct lkl_pci_dev *dev, int irq);
int (*read)(struct lkl_pci_dev *dev, int where, int size, void *val);
int (*write)(struct lkl_pci_dev *dev, int where, int size, void *val);
void *(*resource_alloc)(struct lkl_pci_dev *dev,
unsigned long resource_size,
int resource_index);
unsigned long long (*map_page)(struct lkl_pci_dev *dev, void *vaddr,
unsigned long size);
void (*unmap_page)(struct lkl_pci_dev *dev,
unsigned long long dma_handle, unsigned long size);
};
/** /**
* lkl_host_operations - host operations used by the Linux kernel * lkl_host_operations - host operations used by the Linux kernel
@@ -54,6 +88,8 @@ struct lkl_jmp_buf {
* *
* @mem_alloc - allocate memory * @mem_alloc - allocate memory
* @mem_free - free memory * @mem_free - free memory
* @page_alloc - allocate page aligned memory
* @page_free - free memory allocated by page_alloc
* *
* @timer_create - allocate a host timer that runs fn(arg) when the timer * @timer_create - allocate a host timer that runs fn(arg) when the timer
* fires. * fires.
@@ -83,6 +119,7 @@ struct lkl_jmp_buf {
* @jmp_buf_longjmp - perform a jump back to the saved jump buffer * @jmp_buf_longjmp - perform a jump back to the saved jump buffer
* *
* @memcpy - copy memory * @memcpy - copy memory
* @pci_ops - pointer to PCI host operations
*/ */
struct lkl_host_operations { struct lkl_host_operations {
const char *virtio_devices; const char *virtio_devices;
@@ -114,6 +151,8 @@ struct lkl_host_operations {
void* (*mem_alloc)(unsigned long); void* (*mem_alloc)(unsigned long);
void (*mem_free)(void *); void (*mem_free)(void *);
void* (*page_alloc)(unsigned long size);
void (*page_free)(void *addr, unsigned long size);
unsigned long long (*time)(void); unsigned long long (*time)(void);
@@ -131,6 +170,7 @@ struct lkl_host_operations {
void (*jmp_buf_longjmp)(struct lkl_jmp_buf *jmpb, int val); void (*jmp_buf_longjmp)(struct lkl_jmp_buf *jmpb, int val);
void* (*memcpy)(void *dest, const void *src, unsigned long count); void* (*memcpy)(void *dest, const void *src, unsigned long count);
struct lkl_dev_pci_ops *pci_ops;
}; };
/** /**

View File

@@ -12,7 +12,13 @@ void __init bootmem_init(unsigned long mem_sz)
{ {
mem_size = mem_sz; mem_size = mem_sz;
_memory_start = (unsigned long)lkl_ops->mem_alloc(mem_size); if (lkl_ops->page_alloc) {
mem_size = PAGE_ALIGN(mem_size);
_memory_start = (unsigned long)lkl_ops->page_alloc(mem_size);
} else {
_memory_start = (unsigned long)lkl_ops->mem_alloc(mem_size);
}
memory_start = _memory_start; memory_start = _memory_start;
BUG_ON(!memory_start); BUG_ON(!memory_start);
memory_end = memory_start + mem_size; memory_end = memory_start + mem_size;
@@ -62,5 +68,8 @@ void free_initmem(void)
void free_mem(void) void free_mem(void)
{ {
lkl_ops->mem_free((void *)_memory_start); if (lkl_ops->page_free)
lkl_ops->page_free((void *)_memory_start, mem_size);
else
lkl_ops->mem_free((void *)_memory_start);
} }

View File

@@ -8,6 +8,7 @@ tests/valgrind*.xml
*.dll *.dll
tests/net-test tests/net-test
tests/disk tests/disk
tests/vfio-pci
Makefile.conf Makefile.conf
include/lkl_autoconf.h include/lkl_autoconf.h
tests/autoconf.sh tests/autoconf.sh

View File

@@ -55,6 +55,7 @@ endef
define posix_host define posix_host
$(call set_autoconf_var,POSIX,y) $(call set_autoconf_var,POSIX,y)
$(call set_autoconf_var,VIRTIO_NET,y) $(call set_autoconf_var,VIRTIO_NET,y)
$(if $(strip $(call find_include,linux/vfio.h)),$(call set_autoconf_var,VFIO_PCI,y))
LDFLAGS += -pie LDFLAGS += -pie
CFLAGS += -fPIC -pthread CFLAGS += -fPIC -pthread
SOSUF := .so SOSUF := .so

View File

@@ -23,5 +23,6 @@ LDLIBS_cptofs-$(LKL_HOST_CONFIG_NEEDS_LARGP) += -largp
progs-y += tests/boot progs-y += tests/boot
progs-y += tests/disk progs-y += tests/disk
progs-y += tests/disk-vfio-pci
progs-y += tests/net-test progs-y += tests/net-test

View File

@@ -434,6 +434,25 @@ long lkl_mount_dev(unsigned int disk_id, unsigned int part, const char *fs_type,
int flags, const char *opts, int flags, const char *opts,
char *mnt_str, unsigned int mnt_str_len); char *mnt_str, unsigned int mnt_str_len);
/**
* lkl_mount_blkdev - mount a block device
*
* Like lkl_mount_dev, but mounts the device specified by dev.
*
* @dev - the device id (can be generated by LKL_MKDEV()) identifying the device
* to be mounted
* @fs_type - filesystem type
* @flags - mount flags
* @opts - additional filesystem specific mount options
* @mnt_str - a string that will be filled by this function with the path where
* the filesystem has been mounted
* @mnt_str_len - size of mnt_str
* @returns - 0 on success, a negative value on error
*/
long lkl_mount_blkdev(unsigned int dev, const char *fs_type, int flags,
const char *opts, char *mnt_str,
unsigned int mnt_str_len);
/** /**
* lkl_umount_dev - umount a disk * lkl_umount_dev - umount a disk
* *
@@ -450,6 +469,19 @@ long lkl_mount_dev(unsigned int disk_id, unsigned int part, const char *fs_type,
long lkl_umount_dev(unsigned int disk_id, unsigned int part, int flags, long lkl_umount_dev(unsigned int disk_id, unsigned int part, int flags,
long timeout_ms); long timeout_ms);
/**
* lkl_umount_blkdev - umount a block device
*
* Like lkl_umount_dev, but unmounts the device specified by dev.
*
* @dev - the device id identifying the device to be unmounted
* @flags - umount flags
* @timeout_ms - timeout to wait for the kernel to flush closed files so that
* umount can succeed
* @returns - 0 on success, a negative value on error
*/
long lkl_umount_blkdev(unsigned int dev, int flags, long timeout_ms);
/** /**
* lkl_umount_timeout - umount filesystem with timeout * lkl_umount_timeout - umount filesystem with timeout
* *

View File

@@ -21,5 +21,6 @@ liblkl-$(LKL_HOST_CONFIG_VIRTIO_NET_MACVTAP) += virtio_net_macvtap.o
liblkl-$(LKL_HOST_CONFIG_VIRTIO_NET_DPDK) += virtio_net_dpdk.o liblkl-$(LKL_HOST_CONFIG_VIRTIO_NET_DPDK) += virtio_net_dpdk.o
liblkl-$(LKL_HOST_CONFIG_VIRTIO_NET_VDE) += virtio_net_vde.o liblkl-$(LKL_HOST_CONFIG_VIRTIO_NET_VDE) += virtio_net_vde.o
liblkl-$(LKL_HOST_CONFIG_VIRTIO_NET) += virtio_net_pipe.o liblkl-$(LKL_HOST_CONFIG_VIRTIO_NET) += virtio_net_pipe.o
liblkl-$(LKL_HOST_CONFIG_VFIO_PCI) += vfio_pci.o
liblkl-y += ../../perf/pmu-events/jsmn.o liblkl-y += ../../perf/pmu-events/jsmn.o
liblkl-y += config.o liblkl-y += config.o

View File

@@ -200,22 +200,16 @@ int lkl_get_virtio_blkdev(int disk_id, unsigned int part, uint32_t *pdevid)
return lkl_encode_dev_from_sysfs(sysfs_path, pdevid); return lkl_encode_dev_from_sysfs(sysfs_path, pdevid);
} }
long lkl_mount_dev(unsigned int disk_id, unsigned int part, long lkl_mount_blkdev(unsigned int dev, const char *fs_type, int flags,
const char *fs_type, int flags, const char *data, char *mnt_str, unsigned int mnt_str_len)
const char *data, char *mnt_str, unsigned int mnt_str_len)
{ {
char dev_str[] = { "/dev/xxxxxxxx" }; char dev_str[] = { "/dev/xxxxxxxx" };
unsigned int dev;
int err; int err;
char _data[4096]; /* FIXME: PAGE_SIZE is not exported by LKL */ char _data[4096]; /* FIXME: PAGE_SIZE is not exported by LKL */
if (mnt_str_len < sizeof(dev_str)) if (mnt_str_len < sizeof(dev_str))
return -LKL_ENOMEM; return -LKL_ENOMEM;
err = lkl_get_virtio_blkdev(disk_id, part, &dev);
if (err < 0)
return err;
snprintf(dev_str, sizeof(dev_str), "/dev/%08x", dev); snprintf(dev_str, sizeof(dev_str), "/dev/%08x", dev);
snprintf(mnt_str, mnt_str_len, "/mnt/%08x", dev); snprintf(mnt_str, mnt_str_len, "/mnt/%08x", dev);
@@ -263,6 +257,21 @@ long lkl_mount_dev(unsigned int disk_id, unsigned int part,
return 0; return 0;
} }
/*
 * Mount partition @part of virtio disk @disk_id: the pair is resolved to a
 * device id with lkl_get_virtio_blkdev() and the mount itself is delegated
 * to lkl_mount_blkdev(). A negative lookup result is returned as is.
 */
long lkl_mount_dev(unsigned int disk_id, unsigned int part,
		   const char *fs_type, int flags,
		   const char *data, char *mnt_str, unsigned int mnt_str_len)
{
	unsigned int dev;
	int lookup = lkl_get_virtio_blkdev(disk_id, part, &dev);

	if (lookup >= 0)
		return lkl_mount_blkdev(dev, fs_type, flags, data, mnt_str,
					mnt_str_len);

	return lookup;
}
long lkl_umount_timeout(char *path, int flags, long timeout_ms) long lkl_umount_timeout(char *path, int flags, long timeout_ms)
{ {
long incr = 10000000; /* 10 ms */ long incr = 10000000; /* 10 ms */
@@ -284,18 +293,12 @@ long lkl_umount_timeout(char *path, int flags, long timeout_ms)
return err; return err;
} }
long lkl_umount_dev(unsigned int disk_id, unsigned int part, int flags, long lkl_umount_blkdev(unsigned int dev, int flags, long timeout_ms)
long timeout_ms)
{ {
char dev_str[] = { "/dev/xxxxxxxx" }; char dev_str[] = { "/dev/xxxxxxxx" };
char mnt_str[] = { "/mnt/xxxxxxxx" }; char mnt_str[] = { "/mnt/xxxxxxxx" };
unsigned int dev;
int err; int err;
err = lkl_get_virtio_blkdev(disk_id, part, &dev);
if (err < 0)
return err;
snprintf(dev_str, sizeof(dev_str), "/dev/%08x", dev); snprintf(dev_str, sizeof(dev_str), "/dev/%08x", dev);
snprintf(mnt_str, sizeof(mnt_str), "/mnt/%08x", dev); snprintf(mnt_str, sizeof(mnt_str), "/mnt/%08x", dev);
@@ -310,6 +313,19 @@ long lkl_umount_dev(unsigned int disk_id, unsigned int part, int flags,
return lkl_sys_rmdir(mnt_str); return lkl_sys_rmdir(mnt_str);
} }
/*
 * Unmount partition @part of virtio disk @disk_id: the pair is resolved to a
 * device id with lkl_get_virtio_blkdev() and the unmount is delegated to
 * lkl_umount_blkdev(). A negative lookup result is returned as is.
 */
long lkl_umount_dev(unsigned int disk_id, unsigned int part, int flags,
		    long timeout_ms)
{
	unsigned int dev;
	int lookup = lkl_get_virtio_blkdev(disk_id, part, &dev);

	if (lookup >= 0)
		return lkl_umount_blkdev(dev, flags, timeout_ms);

	return lookup;
}
struct lkl_dir { struct lkl_dir {
int fd; int fd;
char buf[1024]; char buf[1024];

View File

@@ -13,6 +13,7 @@
#include <sys/types.h> #include <sys/types.h>
#include <sys/socket.h> #include <sys/socket.h>
#include <sys/syscall.h> #include <sys/syscall.h>
#include <sys/mman.h>
#include <poll.h> #include <poll.h>
#include <lkl_host.h> #include <lkl_host.h>
#include "iomem.h" #include "iomem.h"
@@ -311,6 +312,28 @@ static long _gettid(void)
#endif #endif
} }
/*
 * Allocate @size bytes as a private anonymous read/write mapping.
 * Returns the mapping address, or NULL when mmap() fails.
 */
static void *page_alloc(unsigned long size)
{
	const int prot = PROT_READ | PROT_WRITE;
	const int flags = MAP_PRIVATE | MAP_ANONYMOUS;
	void *mapping = mmap(NULL, size, prot, flags, -1, 0);

	return mapping == MAP_FAILED ? NULL : mapping;
}
/*
 * Release a mapping obtained from page_alloc(); @size is the length that was
 * mapped. The munmap() result is ignored.
 */
static void page_free(void *addr, unsigned long size)
{
	void *mapping = addr;

	munmap(mapping, size);
}
#ifdef LKL_HOST_CONFIG_VFIO_PCI
extern struct lkl_dev_pci_ops vfio_pci_ops;
#endif
struct lkl_host_operations lkl_host_ops = { struct lkl_host_operations lkl_host_ops = {
.panic = panic, .panic = panic,
.thread_create = thread_create, .thread_create = thread_create,
@@ -338,6 +361,8 @@ struct lkl_host_operations lkl_host_ops = {
.print = print, .print = print,
.mem_alloc = malloc, .mem_alloc = malloc,
.mem_free = free, .mem_free = free,
.page_alloc = page_alloc,
.page_free = page_free,
.ioremap = lkl_ioremap, .ioremap = lkl_ioremap,
.iomem_access = lkl_iomem_access, .iomem_access = lkl_iomem_access,
.virtio_devices = lkl_virtio_devs, .virtio_devices = lkl_virtio_devs,
@@ -345,6 +370,9 @@ struct lkl_host_operations lkl_host_ops = {
.jmp_buf_set = jmp_buf_set, .jmp_buf_set = jmp_buf_set,
.jmp_buf_longjmp = jmp_buf_longjmp, .jmp_buf_longjmp = jmp_buf_longjmp,
.memcpy = memcpy, .memcpy = memcpy,
#ifdef LKL_HOST_CONFIG_VFIO_PCI
.pci_ops = &vfio_pci_ops,
#endif
}; };
static int fd_get_capacity(struct lkl_disk disk, unsigned long long *res) static int fd_get_capacity(struct lkl_disk disk, unsigned long long *res)

401
tools/lkl/lib/vfio_pci.c Normal file
View File

@@ -0,0 +1,401 @@
// SPDX-License-Identifier: GPL-2.0
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <pthread.h>
#include <lkl_host.h>
#include <stdio.h>
#include <sys/types.h>
#include <stdint.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <linux/vfio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>
#include <sys/eventfd.h>
#include "iomem.h"
/* Per-device state of the vfio backend; allocated and zeroed by vfio_pci_add(). */
struct lkl_pci_dev {
/* semaphore on which vfio_pci_irq_init() waits for the interrupt thread */
struct lkl_sem *thread_init_sem;
/* LKL irq number passed to lkl_trigger_irq() */
int irq;
/* interrupt thread created by vfio_pci_irq_init() */
lkl_thread_t int_thread;
/* set by vfio_pci_remove() to ask the interrupt thread to exit */
int quit;
/* VFIO device fd (from VFIO_GROUP_GET_DEVICE_FD) */
int fd;
/* eventfd registered as the INTx trigger */
int irq_fd;
struct vfio_device_info device_info;
/* region info of the PCI configuration space */
struct vfio_region_info config_reg;
/* DMA mapping of kernel memory; its vaddr is the base used by vfio_map_page() */
struct vfio_iommu_type1_dma_map dma_map;
};
/**
 * vfio_pci_add - Create a new pci device
 *
 * The device should be assigned to VFIO by the host in advance.
 *
 * @name - PCI device name: the literal prefix "vfio" followed by
 *         segment:bus:device.function in hex ("vfio%x:%x:%x.%x")
 * @kernel_ram - the start address of kernel memory needed to be mapped for DMA.
 *               The address must be aligned to the page size.
 * @ram_size - the size of kernel memory, should be page-aligned as well.
 *
 * Returns the new device handle, or NULL on failure. On failure every file
 * descriptor opened here is closed again.
 *
 * NOTE(review): on success the container and group descriptors are neither
 * stored nor closed, so vfio_pci_remove() cannot release them - confirm
 * whether they are meant to live for the whole process.
 */
static struct lkl_pci_dev *vfio_pci_add(const char *name, void *kernel_ram,
					unsigned long ram_size)
{
	struct lkl_pci_dev *dev;
	char path[128];
	char link[128];
	const char *group_name;
	unsigned int segn, busn, devn, funcn;
	ssize_t len;
	int container_fd = -1, group_fd = -1;
	struct vfio_group_status group_status = { .argsz = sizeof(
		group_status) };
	struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(
		iommu_info) };

	dev = malloc(sizeof(*dev));
	if (!dev)
		return NULL;

	memset(dev, 0, sizeof(*dev));
	/* 0 is a valid descriptor; mark the device fd as "not opened". */
	dev->fd = -1;

	dev->device_info.argsz = sizeof(struct vfio_device_info);
	dev->config_reg.argsz = sizeof(struct vfio_region_info);
	dev->dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);

	container_fd = open("/dev/vfio/vfio", O_RDWR);
	if (container_fd < 0)
		goto error;

	if (ioctl(container_fd, VFIO_GET_API_VERSION) != VFIO_API_VERSION ||
	    ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) == 0)
		goto error;

	if (sscanf(name, "vfio%x:%x:%x.%x", &segn, &busn, &devn, &funcn) != 4)
		goto error;

	snprintf(path, sizeof(path),
		 "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/iommu_group", segn,
		 busn, devn, funcn);

	/*
	 * Read the link into its own buffer (input and output must not
	 * alias) and keep one byte for the terminator, which readlink()
	 * does not write.
	 */
	len = readlink(path, link, sizeof(link) - 1);
	if (len < 0)
		goto error;
	link[len] = '\0';

	/* The last path component of the link target names the group. */
	group_name = strrchr(link, '/');
	if (!group_name)
		goto error;

	snprintf(path, sizeof(path), "/dev/vfio%s", group_name);

	group_fd = open(path, O_RDWR);
	if (group_fd < 0)
		goto error;

	if (ioctl(group_fd, VFIO_GROUP_GET_STATUS, &group_status) < 0)
		goto error;

	if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE))
		goto error;

	if (ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container_fd) < 0)
		goto error;

	if (ioctl(container_fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU) < 0)
		goto error;

	if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, &iommu_info) < 0)
		goto error;

	/* if kernel_ram is null, assume the memory is already initialized
	 * by another device, and skip this step.
	 */
	if (kernel_ram) {
		dev->dma_map.vaddr = (uint64_t)(uintptr_t)kernel_ram;
		dev->dma_map.size = ram_size;
		dev->dma_map.iova = 0;
		dev->dma_map.flags =
			VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
		if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &dev->dma_map) < 0)
			goto error;
	}

	snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x", segn, busn, devn,
		 funcn);
	dev->fd = ioctl(group_fd, VFIO_GROUP_GET_DEVICE_FD, path);

	if (dev->fd < 0)
		goto error;

	if (ioctl(dev->fd, VFIO_DEVICE_GET_INFO, &dev->device_info) < 0)
		goto error;

	if (dev->device_info.num_regions <= VFIO_PCI_CONFIG_REGION_INDEX)
		goto error;

	dev->config_reg.index = VFIO_PCI_CONFIG_REGION_INDEX;

	if (ioctl(dev->fd, VFIO_DEVICE_GET_REGION_INFO, &dev->config_reg) < 0)
		goto error;

	return dev;

error:
	lkl_printf("lkl_vfio_pci: failed to create a PCI device for %s\n",
		   name);
	if (dev->fd >= 0)
		close(dev->fd);
	if (group_fd >= 0)
		close(group_fd);
	if (container_fd >= 0)
		close(container_fd);
	free(dev);
	return NULL;
}
/*
 * Release a device created by vfio_pci_add().
 *
 * The interrupt thread is asked to quit and joined only when one was
 * actually created: the device may be removed before vfio_pci_irq_init()
 * ever ran, in which case int_thread is still zero from the initial memset.
 * The eventfd, if one was opened, is closed once the thread no longer uses it.
 */
static void vfio_pci_remove(struct lkl_pci_dev *dev)
{
	dev->quit = 1;

	if (dev->int_thread)
		lkl_host_ops.thread_join(dev->int_thread);

	/* irq_fd is 0 until the interrupt thread creates the eventfd. */
	if (dev->irq_fd > 0)
		close(dev->irq_fd);

	close(dev->fd);
	free(dev);
}
/*
 * Read the 16-bit word at offset 6 of the device's configuration space and
 * report bit 3 of it, the bit the interrupt thread polls as the PCI interrupt
 * status. Returns 1 when the bit is set, 0 when it is clear or the read
 * did not deliver both bytes.
 */
static int check_irq_status(struct lkl_pci_dev *dev)
{
	enum { STATUS_WORD_OFFSET = 6, INT_STATUS_SHIFT = 3 };
	unsigned short status;
	ssize_t got;

	got = pread(dev->fd, &status, 2,
		    dev->config_reg.offset + STATUS_WORD_OFFSET);
	if (got != 2)
		return 0;

	return (status >> INT_STATUS_SHIFT) & 1;
}
/* Currently, we only support INTx. */
static void vfio_int_thread(void *_dev)
{
eventfd_t icount;
struct lkl_pci_dev *dev = (struct lkl_pci_dev *)_dev;
struct timespec req = { 0, 1000 * 1000 };
struct vfio_irq_info irq = { .argsz = sizeof(irq) };
struct vfio_irq_set *irq_set;
char irq_set_buf[sizeof(struct vfio_irq_set) + sizeof(int)];
fd_set rfds;
if (dev->device_info.num_irqs <= VFIO_PCI_INTX_IRQ_INDEX)
goto init_error;
irq.index = VFIO_PCI_INTX_IRQ_INDEX;
if (ioctl(dev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq))
goto init_error;
if (irq.count != 1)
goto init_error;
irq_set = (struct vfio_irq_set *)irq_set_buf;
irq_set->argsz = sizeof(irq_set_buf);
irq_set->count = 1;
irq_set->flags =
VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
irq_set->start = 0;
dev->irq_fd = eventfd(0, EFD_CLOEXEC);
if (dev->irq_fd < 0)
goto init_error;
*(int *)&irq_set->data = dev->irq_fd;
if (ioctl(dev->fd, VFIO_DEVICE_SET_IRQS, irq_set))
goto init_error;
lkl_host_ops.sem_up(dev->thread_init_sem);
while (1) {
/* We should wait until the driver actually handles
* an interrupt by monitoring the PCI interrupt status bit.
*/
while (check_irq_status(dev) && !dev->quit) {
lkl_trigger_irq(dev->irq);
nanosleep(&req, NULL);
}
if (dev->quit)
return;
/* unmask interrupts */
irq_set->argsz = sizeof(*irq_set);
irq_set->count = 1;
irq_set->flags =
VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
irq_set->start = 0;
if (ioctl(dev->fd, VFIO_DEVICE_SET_IRQS, irq_set))
goto handling_error;
/* Wait for next interrupt. */
while (1) {
struct timeval tv;
int rc;
FD_ZERO(&rfds);
FD_SET(dev->irq_fd, &rfds);
tv.tv_sec = 0;
tv.tv_usec = 100 * 1000;
rc = select(dev->irq_fd + 1, &rfds, NULL, NULL, &tv);
if (rc == -1)
goto handling_error;
else if (rc)
if (read(dev->irq_fd, &icount, sizeof(icount)) <
0)
goto handling_error;
else
break;
else if (dev->quit)
return;
}
}
init_error:
lkl_printf("lkl_vfio_pci: failed to setup INTx for a device\n");
return;
handling_error:
lkl_printf("lkl_vfio_pci: unknown error in the interrupt handler\n");
}
/*
 * Start interrupt handling for @dev: remember @irq, spawn vfio_int_thread()
 * and wait on a temporary semaphore until the thread signals it.
 *
 * Returns 0 once the thread has signalled, -1 when the semaphore or the
 * thread cannot be created.
 */
static int vfio_pci_irq_init(struct lkl_pci_dev *dev, int irq)
{
	int rc = -1;

	dev->thread_init_sem = lkl_host_ops.sem_alloc(0);
	if (!dev->thread_init_sem)
		return rc;

	dev->irq = irq;
	dev->int_thread =
		lkl_host_ops.thread_create(vfio_int_thread, (void *)dev);

	if (dev->int_thread) {
		/* wait until the interrupt handler thread is ready */
		lkl_host_ops.sem_down(dev->thread_init_sem);
		rc = 0;
	}

	lkl_host_ops.sem_free(dev->thread_init_sem);
	return rc;
}
/*
 * DMA address of @vaddr: its distance from the start of the memory range
 * recorded in dev->dma_map.vaddr. @size is not used.
 */
static unsigned long long vfio_map_page(struct lkl_pci_dev *dev, void *vaddr,
					unsigned long size)
{
	unsigned long long cpu_addr = (unsigned long long)vaddr;

	return cpu_addr - dev->dma_map.vaddr;
}
/* Nothing to undo: vfio_map_page() only computes an offset. */
static void vfio_unmap_page(struct lkl_pci_dev *dev,
			    unsigned long long dma_handle, unsigned long size)
{
	(void)dev;
	(void)dma_handle;
	(void)size;
}
/*
 * Read @size bytes of configuration space at offset @where into @val.
 * Returns the pread() result: the number of bytes read, or a negative value.
 */
static int vfio_pci_read(struct lkl_pci_dev *dev, int where, int size,
			 void *val)
{
	ssize_t nread = pread(dev->fd, val, size,
			      dev->config_reg.offset + where);

	return nread;
}
/*
 * Write @size bytes from @val to configuration space at offset @where.
 * Returns the pwrite() result: the number of bytes written, or a negative
 * value.
 */
static int vfio_pci_write(struct lkl_pci_dev *dev, int where, int size,
			  void *val)
{
	ssize_t nwritten = pwrite(dev->fd, val, size,
				  dev->config_reg.offset + where);

	return nwritten;
}
/*
 * iomem read hook: load @size bytes (1, 2, 4 or 8) found @offset bytes into
 * the region @data and store them in @res.
 *
 * The region side is accessed through a volatile pointer so that exactly one
 * load of the requested width is performed; the region is registered for
 * mmap'ed BAR memory, where accesses must not be merged or elided.
 *
 * Returns 0 on success, -LKL_EOPNOTSUPP for any other @size.
 */
static int pci_resource_read(void *data, int offset, void *res, int size)
{
	const volatile void *addr = (const volatile char *)data + offset;

	switch (size) {
	case 8:
		*(uint64_t *)res = *(const volatile uint64_t *)addr;
		break;
	case 4:
		*(uint32_t *)res = *(const volatile uint32_t *)addr;
		break;
	case 2:
		*(uint16_t *)res = *(const volatile uint16_t *)addr;
		break;
	case 1:
		*(uint8_t *)res = *(const volatile uint8_t *)addr;
		break;
	default:
		return -LKL_EOPNOTSUPP;
	}
	return 0;
}
/*
 * iomem write hook: store @size bytes (1, 2, 4 or 8) taken from @res at
 * @offset bytes into the region @data.
 *
 * The region side is accessed through a volatile pointer so that exactly one
 * store of the requested width is performed; the region is registered for
 * mmap'ed BAR memory, where accesses must not be merged or elided.
 *
 * Returns 0 on success, -LKL_EOPNOTSUPP for any other @size.
 */
static int pci_resource_write(void *data, int offset, void *res, int size)
{
	volatile void *addr = (volatile char *)data + offset;

	switch (size) {
	case 8:
		*(volatile uint64_t *)addr = *(uint64_t *)res;
		break;
	case 4:
		*(volatile uint32_t *)addr = *(uint32_t *)res;
		break;
	case 2:
		*(volatile uint16_t *)addr = *(uint16_t *)res;
		break;
	case 1:
		*(volatile uint8_t *)addr = *(uint8_t *)res;
		break;
	default:
		return -LKL_EOPNOTSUPP;
	}
	return 0;
}
/* Accessors that vfio_resource_alloc() registers for each mapped BAR. */
static const struct lkl_iomem_ops pci_resource_ops = {
.read = pci_resource_read,
.write = pci_resource_write,
};
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

/*
 * Map BAR @resource_index of @dev and register the mapping as an iomem
 * region served by pci_resource_ops.
 *
 * Returns the value produced by register_iomem(), or NULL when the index is
 * out of range, the device has no such region, the region is smaller than
 * @resource_size, or mapping/registration fails. When registration yields
 * NULL the BAR mapping is released again instead of being leaked.
 */
static void *vfio_resource_alloc(struct lkl_pci_dev *dev,
				 unsigned long resource_size,
				 int resource_index)
{
	static const unsigned int region_index_list[] = {
		VFIO_PCI_BAR0_REGION_INDEX, VFIO_PCI_BAR1_REGION_INDEX,
		VFIO_PCI_BAR2_REGION_INDEX, VFIO_PCI_BAR3_REGION_INDEX,
		VFIO_PCI_BAR4_REGION_INDEX, VFIO_PCI_BAR5_REGION_INDEX,
	};
	struct vfio_region_info reg = { .argsz = sizeof(reg) };
	void *mmio_addr;
	void *handle;

	/* The unsigned comparison also rejects negative indices. */
	if ((unsigned int)resource_index >= ARRAY_SIZE(region_index_list))
		return NULL;

	reg.index = region_index_list[resource_index];

	if (dev->device_info.num_regions <= reg.index)
		return NULL;

	/* We assume the resource is a memory space. */

	if (ioctl(dev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg) < 0)
		return NULL;

	if (reg.size < resource_size)
		return NULL;

	mmio_addr = mmap(NULL, resource_size, PROT_READ | PROT_WRITE,
			 MAP_SHARED, dev->fd, reg.offset);

	if (mmio_addr == MAP_FAILED)
		return NULL;

	handle = register_iomem(mmio_addr, resource_size, &pci_resource_ops);
	if (!handle)
		munmap(mmio_addr, resource_size);

	return handle;
}
/* PCI device operations implemented on top of VFIO. */
struct lkl_dev_pci_ops vfio_pci_ops = {
	/* device setup, teardown and interrupts */
	.add = vfio_pci_add,
	.remove = vfio_pci_remove,
	.irq_init = vfio_pci_irq_init,

	/* BAR mapping */
	.resource_alloc = vfio_resource_alloc,

	/* access to the device's config region */
	.read = vfio_pci_read,
	.write = vfio_pci_write,

	/* DMA handle translation */
	.map_page = vfio_map_page,
	.unmap_page = vfio_unmap_page,
};

View File

@@ -1,3 +1,4 @@
boot-y += boot.o test.o
disk-y += disk.o cla.o test.o
disk-vfio-pci-y += disk-vfio-pci.o cla.o test.o
net-test-y += net-test.o cla.o test.o

View File

@@ -0,0 +1,112 @@
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>
#include <stdint.h>
#include <lkl.h>
#include <lkl_host.h>
#include <sys/stat.h>
#include <fcntl.h>
#include "test.h"
#include "cla.h"
/* Buffer passed to lkl_mount_blkdev() to receive the mount point path. */
static char mnt_point[32];

/* Kernel command line assembled in main(). */
static char bootparams[128];

/* Values filled in from the command line through the args[] table below. */
static struct {
	int printk;
	const char *fstype;	/* -t / --type */
	const char *pciname;	/* -n / --pciname */
} cla;

/* Command-line option table handed to parse_args(); ends with a zero entry. */
struct cl_arg args[] = {
	{
		"type", 't', "filesystem type", 1,
		CL_ARG_STR, &cla.fstype
	},
	{
		"pciname", 'n', "PCI device name (as %x:%x:%x.%x format)", 1,
		CL_ARG_STR, &cla.pciname
	},
	{ 0 },
};
/*
 * Leave the mounted filesystem by changing to "/" and unmount the block
 * device LKL_MKDEV(259, 0), the same device the mount_dev test mounts.
 *
 * Both return codes are logged; the test passes only when both are zero.
 */
static int lkl_test_umount_dev(void)
{
	long chdir_ret = lkl_sys_chdir("/");
	long umount_ret = lkl_umount_blkdev(LKL_MKDEV(259, 0), 0, 1000);

	lkl_test_logf("%ld %ld", chdir_ret, umount_ret);

	return (chdir_ret || umount_ret) ? TEST_FAILURE : TEST_SUCCESS;
}
/* Directory handle shared by the opendir, readdir and closedir tests. */
struct lkl_dir *dir;

/*
 * Open the mount point as a directory and keep the handle in the global dir.
 *
 * Logs the path together with the error code and its description. Passes
 * when the reported error code is 0.
 */
static int lkl_test_opendir(void)
{
	/*
	 * Start from "no error": lkl_opendir() is not visible here, and if it
	 * only stores to err on failure the check below would otherwise read
	 * an indeterminate value.
	 */
	int err = 0;

	dir = lkl_opendir(mnt_point, &err);

	lkl_test_logf("lkl_opendir(%s) = %d %s\n", mnt_point, err,
		      lkl_strerror(err));

	if (err == 0)
		return TEST_SUCCESS;

	return TEST_FAILURE;
}
/*
 * List entries of the directory opened by the opendir test.
 *
 * Each entry name is logged followed by a space. Once the accumulated
 * lkl_test_logf() return values reach 70, a newline is logged and the
 * listing stops. The test passes when lkl_errdir() reports 0 afterwards.
 */
static int lkl_test_readdir(void)
{
	struct lkl_linux_dirent64 *entry;
	int logged = 0;

	for (entry = lkl_readdir(dir); entry; entry = lkl_readdir(dir)) {
		logged += lkl_test_logf("%s ", entry->d_name);
		if (logged >= 70) {
			lkl_test_logf("\n");
			break;
		}
	}

	return lkl_errdir(dir) == 0 ? TEST_SUCCESS : TEST_FAILURE;
}
/*
 * Tests generated by LKL_TEST_CALL, which is defined outside this file
 * (presumably in test.h -- confirm). Judging by the invocations, the
 * arguments are: test name, function to call, expected return value, then
 * the arguments forwarded to that function. The names match the LKL_TEST()
 * entries in tests[].
 */
/*
 * Mount block device LKL_MKDEV(259, 0) with the requested filesystem type;
 * mnt_point receives the resulting path.
 * NOTE(review): unlike the invocations below, this one has no trailing ';'
 * -- confirm the macro expansion makes that harmless.
 */
LKL_TEST_CALL(mount_dev, lkl_mount_blkdev, 0, LKL_MKDEV(259, 0),
cla.fstype, 0, NULL, mnt_point, sizeof(mnt_point))
/* Close the directory handle opened by the opendir test. */
LKL_TEST_CALL(closedir, lkl_closedir, 0, dir);
/* Change the working directory to the mount point. */
LKL_TEST_CALL(chdir_mnt_point, lkl_sys_chdir, 0, mnt_point);
/* Start the kernel with the boot parameters assembled in main(). */
LKL_TEST_CALL(start_kernel, lkl_start_kernel, 0, &lkl_host_ops, bootparams);
/* Halt the kernel. */
LKL_TEST_CALL(stop_kernel, lkl_sys_halt, 0);
/*
 * Test table handed to lkl_test_run() in main(): start the kernel, mount the
 * device, walk the mount point, unmount, stop the kernel.
 */
struct lkl_test tests[] = {
	LKL_TEST(start_kernel),
	LKL_TEST(mount_dev),
	LKL_TEST(chdir_mnt_point),
	LKL_TEST(opendir),
	LKL_TEST(readdir),
	LKL_TEST(closedir),
	LKL_TEST(umount_dev),
	LKL_TEST(stop_kernel),
};
/*
 * Entry point: parse the command line, build the kernel boot parameters for
 * the requested VFIO PCI device and run the test table.
 *
 * Returns -1 when argument parsing fails, when the filesystem type or PCI
 * device name is missing, or when the boot parameters do not fit; otherwise
 * returns the result of lkl_test_run().
 */
int main(int argc, const char **argv)
{
	int len;

	if (parse_args(argc, argv, args) < 0)
		return -1;

	/*
	 * cla has static storage, so both pointers are NULL unless the parser
	 * stored something. parse_args() is not visible here; guard so that
	 * NULL never reaches a "%s" conversion below.
	 */
	if (!cla.fstype || !cla.pciname) {
		fprintf(stderr,
			"disk-vfio-pci: both -t <fstype> and -n <pciname> are required\n");
		return -1;
	}

	len = snprintf(bootparams, sizeof(bootparams),
		       "mem=16M loglevel=8 lkl_pci=vfio%s", cla.pciname);
	if (len < 0 || (size_t)len >= sizeof(bootparams)) {
		/* A truncated device name would select the wrong device. */
		fprintf(stderr, "disk-vfio-pci: PCI device name too long: %s\n",
			cla.pciname);
		return -1;
	}

	lkl_host_ops.print = lkl_test_log;

	return lkl_test_run(tests, sizeof(tests) / sizeof(struct lkl_test),
			    "disk-vfio-pci %s", cla.fstype);
}

View File

@@ -0,0 +1,69 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: GPL-2.0
# Directory containing this script; test.sh and the test binary live next to
# it. Expansions are quoted so a path containing whitespace still works.
script_dir=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
source "$script_dir/test.sh"

# PCI address of the device handed to vfio-pci and passed to the test binary.
pciname="0000:00:03.0"
# Vendor and device ID pair written to vfio-pci's new_id in init().
nvme_id="8086 5845"
# Test binary copied to the remote side and executed there.
bin_name="disk-vfio-pci"
# Poll once per second, up to 300 times, until "$MYSSH exit" succeeds.
# Returns 0 as soon as the remote side answers and 1 if it never does, so
# callers can tell a timeout from success.
function wait_guest()
{
    local attempt

    for attempt in $(seq 300); do
        if $MYSSH exit 2> /dev/null; then
            return 0
        fi
        sleep 1
    done

    echo "wait_guest: remote side did not become reachable" >&2
    return 1
}
# Prepare the disk image locally, rebind the PCI device from the nvme driver
# to vfio-pci on the machine reached through $MYSSH, and copy the test binary
# there. The return status is that of the final copy; earlier failures are
# not checked.
function init()
{
# Create a 100 MiB zero-filled image and format it with $fstype.
dd if=/dev/zero of=/home/ubuntu/nvme.img bs=1024 count=102400
yes | sudo mkfs.$fstype /home/ubuntu/nvme.img
# Load vfio-pci and register the vendor/device ID pair with it.
$MYSSH sudo modprobe vfio-pci
$MYSSH "sh -c 'echo $nvme_id |
sudo tee /sys/bus/pci/drivers/vfio-pci/new_id'"
# Detach the device from the nvme driver, then attach it to vfio-pci.
$MYSSH "sh -c 'echo $pciname |
sudo tee /sys/bus/pci/drivers/nvme/unbind'"
$MYSSH "sh -c 'echo $pciname |
sudo tee /sys/bus/pci/drivers/vfio-pci/bind'"
# Give the lkl user access to the VFIO group node.
# NOTE(review): group number 3 is hard-coded; presumably the IOMMU group of
# $pciname in the test environment -- confirm.
$MYSSH sudo chown lkl:lkl /dev/vfio/3
# Copy the test binary to the lkl user's home directory.
$MYSCP $script_dir/$bin_name lkl@localhost:
}
# Undo the rebinding done by init(): detach the device from vfio-pci and
# attach it to the nvme driver again. The return status is that of the final
# bind command.
function cleanup()
{
$MYSSH "sh -c 'echo $pciname |
sudo tee /sys/bus/pci/drivers/vfio-pci/unbind'"
$MYSSH "sh -c 'echo $pciname |
sudo tee /sys/bus/pci/drivers/nvme/bind'"
}
# Run the disk-vfio-pci test for $fstype.
#
# Without LKL_QEMU_TEST set, an empty plan is announced and the test is
# skipped. Otherwise the device is prepared with init, the test binary is
# executed through $MYSSH, and the device is handed back with cleanup.
function run()
{
    if [ -z "$LKL_QEMU_TEST" ]; then
        lkl_test_plan 0 "disk-vfio-pci $fstype"
        echo "vfio not supported"
    else
        lkl_test_plan 1 "disk-vfio-pci $fstype"
        lkl_test_run 1 init
        # Pass the same address that init and cleanup rebind, instead of a
        # second hard-coded copy of it.
        lkl_test_exec $MYSSH ./$bin_name -n "$pciname" -t $fstype
        lkl_test_plan 1 "disk-vfio-pci $fstype"
        lkl_test_run 1 cleanup
    fi
}
# Optional leading "-t <fstype>" selects the filesystem type.
case "$1" in
-t)
    shift
    fstype=$1
    shift
    ;;
esac

# Fall back to ext4 when no type was given (or it is empty).
: "${fstype:=ext4}"

# The remaining arguments name the function to run, e.g. "run".
"$@"

View File

@@ -62,7 +62,11 @@ tests = [
'lklfuse.sh -t btrfs',
'lklfuse.sh -t vfat',
'lklfuse.sh -t xfs',
'hijack-test.sh',
'disk-vfio-pci.sh -t ext4 run',
'disk-vfio-pci.sh -t btrfs run',
'disk-vfio-pci.sh -t vfat run',
'disk-vfio-pci.sh -t xfs run'
]
parser = argparse.ArgumentParser(description='LKL test runner')