From 6e7907468fd05b8a641a715ebb110fc1903a604e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 5 Feb 2013 12:42:31 +0100 Subject: hw: move virtio devices to hw/ subdirectories Signed-off-by: Paolo Bonzini --- hw/Makefile.objs | 5 - hw/block/Makefile.objs | 3 + hw/block/dataplane/Makefile.objs | 1 + hw/block/dataplane/ioq.c | 117 ++++ hw/block/dataplane/ioq.h | 57 ++ hw/block/dataplane/virtio-blk.c | 540 +++++++++++++++ hw/block/dataplane/virtio-blk.h | 29 + hw/block/virtio-blk.c | 732 ++++++++++++++++++++ hw/char/Makefile.objs | 2 + hw/char/virtio-serial-bus.c | 1018 +++++++++++++++++++++++++++ hw/dataplane/Makefile.objs | 1 - hw/dataplane/hostmem.c | 176 ----- hw/dataplane/ioq.c | 117 ---- hw/dataplane/ioq.h | 57 -- hw/dataplane/virtio-blk.c | 540 --------------- hw/dataplane/virtio-blk.h | 29 - hw/dataplane/vring.c | 363 ---------- hw/net/Makefile.objs | 3 + hw/net/vhost_net.c | 328 +++++++++ hw/net/virtio-net.c | 1370 +++++++++++++++++++++++++++++++++++++ hw/scsi/Makefile.objs | 1 + hw/scsi/virtio-scsi.c | 774 +++++++++++++++++++++ hw/vhost.c | 1042 ---------------------------- hw/vhost_net.c | 328 --------- hw/virtio-balloon.c | 416 ----------- hw/virtio-blk.c | 732 -------------------- hw/virtio-net.c | 1370 ------------------------------------- hw/virtio-scsi.c | 774 --------------------- hw/virtio-serial-bus.c | 1018 --------------------------- hw/virtio.c | 1121 ------------------------------ hw/virtio/Makefile.objs | 3 + hw/virtio/dataplane/Makefile.objs | 1 + hw/virtio/dataplane/hostmem.c | 176 +++++ hw/virtio/dataplane/vring.c | 363 ++++++++++ hw/virtio/vhost.c | 1042 ++++++++++++++++++++++++++++ hw/virtio/virtio-balloon.c | 416 +++++++++++ hw/virtio/virtio.c | 1121 ++++++++++++++++++++++++++++++ 37 files changed, 8097 insertions(+), 8089 deletions(-) create mode 100644 hw/block/dataplane/Makefile.objs create mode 100644 hw/block/dataplane/ioq.c create mode 100644 hw/block/dataplane/ioq.h create mode 100644 hw/block/dataplane/virtio-blk.c create mode 100644 hw/block/dataplane/virtio-blk.h create mode 100644 hw/block/virtio-blk.c create mode 100644 hw/char/virtio-serial-bus.c delete mode 100644 hw/dataplane/Makefile.objs delete mode 100644 hw/dataplane/hostmem.c delete mode 100644 hw/dataplane/ioq.c delete mode 100644 hw/dataplane/ioq.h delete mode 100644 hw/dataplane/virtio-blk.c delete mode 100644 hw/dataplane/virtio-blk.h delete mode 100644 hw/dataplane/vring.c create mode 100644 hw/net/vhost_net.c create mode 100644 hw/net/virtio-net.c create mode 100644 hw/scsi/virtio-scsi.c delete mode 100644 hw/vhost.c delete mode 100644 hw/vhost_net.c delete mode 100644 hw/virtio-balloon.c delete mode 100644 hw/virtio-blk.c delete mode 100644 hw/virtio-net.c delete mode 100644 hw/virtio-scsi.c delete mode 100644 hw/virtio-serial-bus.c delete mode 100644 hw/virtio.c create mode 100644 hw/virtio/dataplane/Makefile.objs create mode 100644 hw/virtio/dataplane/hostmem.c create mode 100644 hw/virtio/dataplane/vring.c create mode 100644 hw/virtio/vhost.c create mode 100644 hw/virtio/virtio-balloon.c create mode 100644 hw/virtio/virtio.c (limited to 'hw') diff --git a/hw/Makefile.objs b/hw/Makefile.objs index 1d28ce28d7..83a6bf2b18 100644 --- a/hw/Makefile.objs +++ b/hw/Makefile.objs @@ -34,11 +34,6 @@ ifeq ($(CONFIG_SOFTMMU),y) # Per-target files # virtio has to be here due to weird dependency between PCI and virtio-net. 
# need to fix this properly -obj-$(CONFIG_VIRTIO) += dataplane/ -obj-$(CONFIG_VIRTIO) += virtio.o virtio-blk.o virtio-balloon.o virtio-net.o -obj-$(CONFIG_VIRTIO) += virtio-serial-bus.o virtio-scsi.o -obj-$(CONFIG_SOFTMMU) += vhost_net.o -obj-$(CONFIG_VHOST_NET) += vhost.o obj-$(CONFIG_VGA) += vga.o # Inter-VM PCI shared memory & VFIO PCI device assignment diff --git a/hw/block/Makefile.objs b/hw/block/Makefile.objs index 5fa5101386..856915eb6a 100644 --- a/hw/block/Makefile.objs +++ b/hw/block/Makefile.objs @@ -6,3 +6,6 @@ common-obj-$(CONFIG_PFLASH_CFI01) += pflash_cfi01.o common-obj-$(CONFIG_PFLASH_CFI02) += pflash_cfi02.o common-obj-$(CONFIG_XEN_BACKEND) += xen_disk.o common-obj-$(CONFIG_ECC) += ecc.o + +obj-$(CONFIG_VIRTIO) += virtio-blk.o +obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += dataplane/ diff --git a/hw/block/dataplane/Makefile.objs b/hw/block/dataplane/Makefile.objs new file mode 100644 index 0000000000..9da2eb82ba --- /dev/null +++ b/hw/block/dataplane/Makefile.objs @@ -0,0 +1 @@ +obj-y += ioq.o virtio-blk.o diff --git a/hw/block/dataplane/ioq.c b/hw/block/dataplane/ioq.c new file mode 100644 index 0000000000..f709f87ed6 --- /dev/null +++ b/hw/block/dataplane/ioq.c @@ -0,0 +1,117 @@ +/* + * Linux AIO request queue + * + * Copyright 2012 IBM, Corp. + * Copyright 2012 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "ioq.h" + +void ioq_init(IOQueue *ioq, int fd, unsigned int max_reqs) +{ + int rc; + + ioq->fd = fd; + ioq->max_reqs = max_reqs; + + memset(&ioq->io_ctx, 0, sizeof ioq->io_ctx); + rc = io_setup(max_reqs, &ioq->io_ctx); + if (rc != 0) { + fprintf(stderr, "ioq io_setup failed %d\n", rc); + exit(1); + } + + rc = event_notifier_init(&ioq->io_notifier, 0); + if (rc != 0) { + fprintf(stderr, "ioq io event notifier creation failed %d\n", rc); + exit(1); + } + + ioq->freelist = g_malloc0(sizeof ioq->freelist[0] * max_reqs); + ioq->freelist_idx = 0; + + ioq->queue = g_malloc0(sizeof ioq->queue[0] * max_reqs); + ioq->queue_idx = 0; +} + +void ioq_cleanup(IOQueue *ioq) +{ + g_free(ioq->freelist); + g_free(ioq->queue); + + event_notifier_cleanup(&ioq->io_notifier); + io_destroy(ioq->io_ctx); +} + +EventNotifier *ioq_get_notifier(IOQueue *ioq) +{ + return &ioq->io_notifier; +} + +struct iocb *ioq_get_iocb(IOQueue *ioq) +{ + /* Underflow cannot happen since ioq is sized for max_reqs */ + assert(ioq->freelist_idx != 0); + + struct iocb *iocb = ioq->freelist[--ioq->freelist_idx]; + ioq->queue[ioq->queue_idx++] = iocb; + return iocb; +} + +void ioq_put_iocb(IOQueue *ioq, struct iocb *iocb) +{ + /* Overflow cannot happen since ioq is sized for max_reqs */ + assert(ioq->freelist_idx != ioq->max_reqs); + + ioq->freelist[ioq->freelist_idx++] = iocb; +} + +struct iocb *ioq_rdwr(IOQueue *ioq, bool read, struct iovec *iov, + unsigned int count, long long offset) +{ + struct iocb *iocb = ioq_get_iocb(ioq); + + if (read) { + io_prep_preadv(iocb, ioq->fd, iov, count, offset); + } else { + io_prep_pwritev(iocb, ioq->fd, iov, count, offset); + } + io_set_eventfd(iocb, event_notifier_get_fd(&ioq->io_notifier)); + return iocb; +} + +int ioq_submit(IOQueue *ioq) +{ + int rc = io_submit(ioq->io_ctx, ioq->queue_idx, ioq->queue); + ioq->queue_idx = 0; /* reset */ + return rc; +} + +int ioq_run_completion(IOQueue *ioq, IOQueueCompletion *completion, + void *opaque) +{ + struct io_event events[ioq->max_reqs]; + int nevents, i; + + 
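+    /* Reap whatever has completed without blocking: min_nr=0 together
+     * with a NULL timeout makes io_getevents() return immediately, even
+     * with zero events ready; this handler only runs once the Linux AIO
+     * eventfd has been signalled anyway.
+     */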
do { + nevents = io_getevents(ioq->io_ctx, 0, ioq->max_reqs, events, NULL); + } while (nevents < 0 && errno == EINTR); + if (nevents < 0) { + return nevents; + } + + for (i = 0; i < nevents; i++) { + ssize_t ret = ((uint64_t)events[i].res2 << 32) | events[i].res; + + completion(events[i].obj, ret, opaque); + ioq_put_iocb(ioq, events[i].obj); + } + return nevents; +} diff --git a/hw/block/dataplane/ioq.h b/hw/block/dataplane/ioq.h new file mode 100644 index 0000000000..b49b5de7f4 --- /dev/null +++ b/hw/block/dataplane/ioq.h @@ -0,0 +1,57 @@ +/* + * Linux AIO request queue + * + * Copyright 2012 IBM, Corp. + * Copyright 2012 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef IOQ_H +#define IOQ_H + +#include +#include "qemu/event_notifier.h" + +typedef struct { + int fd; /* file descriptor */ + unsigned int max_reqs; /* max length of freelist and queue */ + + io_context_t io_ctx; /* Linux AIO context */ + EventNotifier io_notifier; /* Linux AIO eventfd */ + + /* Requests can complete in any order so a free list is necessary to manage + * available iocbs. + */ + struct iocb **freelist; /* free iocbs */ + unsigned int freelist_idx; + + /* Multiple requests are queued up before submitting them all in one go */ + struct iocb **queue; /* queued iocbs */ + unsigned int queue_idx; +} IOQueue; + +void ioq_init(IOQueue *ioq, int fd, unsigned int max_reqs); +void ioq_cleanup(IOQueue *ioq); +EventNotifier *ioq_get_notifier(IOQueue *ioq); +struct iocb *ioq_get_iocb(IOQueue *ioq); +void ioq_put_iocb(IOQueue *ioq, struct iocb *iocb); +struct iocb *ioq_rdwr(IOQueue *ioq, bool read, struct iovec *iov, + unsigned int count, long long offset); +int ioq_submit(IOQueue *ioq); + +static inline unsigned int ioq_num_queued(IOQueue *ioq) +{ + return ioq->queue_idx; +} + +typedef void IOQueueCompletion(struct iocb *iocb, ssize_t ret, void *opaque); +int ioq_run_completion(IOQueue *ioq, IOQueueCompletion *completion, + void *opaque); + +#endif /* IOQ_H */ diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c new file mode 100644 index 0000000000..5baef2391a --- /dev/null +++ b/hw/block/dataplane/virtio-blk.c @@ -0,0 +1,540 @@ +/* + * Dedicated thread for virtio-blk I/O processing + * + * Copyright 2012 IBM, Corp. + * Copyright 2012 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#include "trace.h" +#include "qemu/iov.h" +#include "qemu/thread.h" +#include "qemu/error-report.h" +#include "hw/virtio/dataplane/vring.h" +#include "ioq.h" +#include "migration/migration.h" +#include "block/block.h" +#include "hw/virtio/virtio-blk.h" +#include "virtio-blk.h" +#include "block/aio.h" + +enum { + SEG_MAX = 126, /* maximum number of I/O segments */ + VRING_MAX = SEG_MAX + 2, /* maximum number of vring descriptors */ + REQ_MAX = VRING_MAX, /* maximum number of requests in the vring, + * is VRING_MAX / 2 with traditional and + * VRING_MAX with indirect descriptors */ +}; + +typedef struct { + struct iocb iocb; /* Linux AIO control block */ + QEMUIOVector *inhdr; /* iovecs for virtio_blk_inhdr */ + unsigned int head; /* vring descriptor index */ + struct iovec *bounce_iov; /* used if guest buffers are unaligned */ + QEMUIOVector *read_qiov; /* for read completion /w bounce buffer */ +} VirtIOBlockRequest; + +struct VirtIOBlockDataPlane { + bool started; + bool stopping; + QEMUBH *start_bh; + QemuThread thread; + + VirtIOBlkConf *blk; + int fd; /* image file descriptor */ + + VirtIODevice *vdev; + Vring vring; /* virtqueue vring */ + EventNotifier *guest_notifier; /* irq */ + + /* Note that these EventNotifiers are assigned by value. This is + * fine as long as you do not call event_notifier_cleanup on them + * (because you don't own the file descriptor or handle; you just + * use it). + */ + AioContext *ctx; + EventNotifier io_notifier; /* Linux AIO completion */ + EventNotifier host_notifier; /* doorbell */ + + IOQueue ioqueue; /* Linux AIO queue (should really be per + dataplane thread) */ + VirtIOBlockRequest requests[REQ_MAX]; /* pool of requests, managed by the + queue */ + + unsigned int num_reqs; + + Error *migration_blocker; +}; + +/* Raise an interrupt to signal guest, if necessary */ +static void notify_guest(VirtIOBlockDataPlane *s) +{ + if (!vring_should_notify(s->vdev, &s->vring)) { + return; + } + + event_notifier_set(s->guest_notifier); +} + +static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque) +{ + VirtIOBlockDataPlane *s = opaque; + VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb); + struct virtio_blk_inhdr hdr; + int len; + + if (likely(ret >= 0)) { + hdr.status = VIRTIO_BLK_S_OK; + len = ret; + } else { + hdr.status = VIRTIO_BLK_S_IOERR; + len = 0; + } + + trace_virtio_blk_data_plane_complete_request(s, req->head, ret); + + if (req->read_qiov) { + assert(req->bounce_iov); + qemu_iovec_from_buf(req->read_qiov, 0, req->bounce_iov->iov_base, len); + qemu_iovec_destroy(req->read_qiov); + g_slice_free(QEMUIOVector, req->read_qiov); + } + + if (req->bounce_iov) { + qemu_vfree(req->bounce_iov->iov_base); + g_slice_free(struct iovec, req->bounce_iov); + } + + qemu_iovec_from_buf(req->inhdr, 0, &hdr, sizeof(hdr)); + qemu_iovec_destroy(req->inhdr); + g_slice_free(QEMUIOVector, req->inhdr); + + /* According to the virtio specification len should be the number of bytes + * written to, but for virtio-blk it seems to be the number of bytes + * transferred plus the status bytes. 
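+     * (Worked example: a successful 4 KiB read is pushed with
+     * len = 4096 + sizeof(hdr), where struct virtio_blk_inhdr is just
+     * the single status byte.)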
+ */ + vring_push(&s->vring, req->head, len + sizeof(hdr)); + + s->num_reqs--; +} + +static void complete_request_early(VirtIOBlockDataPlane *s, unsigned int head, + QEMUIOVector *inhdr, unsigned char status) +{ + struct virtio_blk_inhdr hdr = { + .status = status, + }; + + qemu_iovec_from_buf(inhdr, 0, &hdr, sizeof(hdr)); + qemu_iovec_destroy(inhdr); + g_slice_free(QEMUIOVector, inhdr); + + vring_push(&s->vring, head, sizeof(hdr)); + notify_guest(s); +} + +/* Get disk serial number */ +static void do_get_id_cmd(VirtIOBlockDataPlane *s, + struct iovec *iov, unsigned int iov_cnt, + unsigned int head, QEMUIOVector *inhdr) +{ + char id[VIRTIO_BLK_ID_BYTES]; + + /* Serial number not NUL-terminated when shorter than buffer */ + strncpy(id, s->blk->serial ? s->blk->serial : "", sizeof(id)); + iov_from_buf(iov, iov_cnt, 0, id, sizeof(id)); + complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK); +} + +static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read, + struct iovec *iov, unsigned int iov_cnt, + long long offset, unsigned int head, + QEMUIOVector *inhdr) +{ + struct iocb *iocb; + QEMUIOVector qiov; + struct iovec *bounce_iov = NULL; + QEMUIOVector *read_qiov = NULL; + + qemu_iovec_init_external(&qiov, iov, iov_cnt); + if (!bdrv_qiov_is_aligned(s->blk->conf.bs, &qiov)) { + void *bounce_buffer = qemu_blockalign(s->blk->conf.bs, qiov.size); + + if (read) { + /* Need to copy back from bounce buffer on completion */ + read_qiov = g_slice_new(QEMUIOVector); + qemu_iovec_init(read_qiov, iov_cnt); + qemu_iovec_concat_iov(read_qiov, iov, iov_cnt, 0, qiov.size); + } else { + qemu_iovec_to_buf(&qiov, 0, bounce_buffer, qiov.size); + } + + /* Redirect I/O to aligned bounce buffer */ + bounce_iov = g_slice_new(struct iovec); + bounce_iov->iov_base = bounce_buffer; + bounce_iov->iov_len = qiov.size; + iov = bounce_iov; + iov_cnt = 1; + } + + iocb = ioq_rdwr(&s->ioqueue, read, iov, iov_cnt, offset); + + /* Fill in virtio block metadata needed for completion */ + VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb); + req->head = head; + req->inhdr = inhdr; + req->bounce_iov = bounce_iov; + req->read_qiov = read_qiov; + return 0; +} + +static int process_request(IOQueue *ioq, struct iovec iov[], + unsigned int out_num, unsigned int in_num, + unsigned int head) +{ + VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane, ioqueue); + struct iovec *in_iov = &iov[out_num]; + struct virtio_blk_outhdr outhdr; + QEMUIOVector *inhdr; + size_t in_size; + + /* Copy in outhdr */ + if (unlikely(iov_to_buf(iov, out_num, 0, &outhdr, + sizeof(outhdr)) != sizeof(outhdr))) { + error_report("virtio-blk request outhdr too short"); + return -EFAULT; + } + iov_discard_front(&iov, &out_num, sizeof(outhdr)); + + /* Grab inhdr for later */ + in_size = iov_size(in_iov, in_num); + if (in_size < sizeof(struct virtio_blk_inhdr)) { + error_report("virtio_blk request inhdr too short"); + return -EFAULT; + } + inhdr = g_slice_new(QEMUIOVector); + qemu_iovec_init(inhdr, 1); + qemu_iovec_concat_iov(inhdr, in_iov, in_num, + in_size - sizeof(struct virtio_blk_inhdr), + sizeof(struct virtio_blk_inhdr)); + iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr)); + + /* TODO Linux sets the barrier bit even when not advertised! 
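+     * Mask it out here so the switch below sees the plain request type;
+     * a barrier-flagged write would otherwise miss the VIRTIO_BLK_T_OUT
+     * case and be rejected as unsupported.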
*/ + outhdr.type &= ~VIRTIO_BLK_T_BARRIER; + + switch (outhdr.type) { + case VIRTIO_BLK_T_IN: + do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, head, inhdr); + return 0; + + case VIRTIO_BLK_T_OUT: + do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, head, inhdr); + return 0; + + case VIRTIO_BLK_T_SCSI_CMD: + /* TODO support SCSI commands */ + complete_request_early(s, head, inhdr, VIRTIO_BLK_S_UNSUPP); + return 0; + + case VIRTIO_BLK_T_FLUSH: + /* TODO fdsync not supported by Linux AIO, do it synchronously here! */ + if (qemu_fdatasync(s->fd) < 0) { + complete_request_early(s, head, inhdr, VIRTIO_BLK_S_IOERR); + } else { + complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK); + } + return 0; + + case VIRTIO_BLK_T_GET_ID: + do_get_id_cmd(s, in_iov, in_num, head, inhdr); + return 0; + + default: + error_report("virtio-blk unsupported request type %#x", outhdr.type); + qemu_iovec_destroy(inhdr); + g_slice_free(QEMUIOVector, inhdr); + return -EFAULT; + } +} + +static int flush_true(EventNotifier *e) +{ + return true; +} + +static void handle_notify(EventNotifier *e) +{ + VirtIOBlockDataPlane *s = container_of(e, VirtIOBlockDataPlane, + host_notifier); + + /* There is one array of iovecs into which all new requests are extracted + * from the vring. Requests are read from the vring and the translated + * descriptors are written to the iovecs array. The iovecs do not have to + * persist across handle_notify() calls because the kernel copies the + * iovecs on io_submit(). + * + * Handling io_submit() EAGAIN may require storing the requests across + * handle_notify() calls until the kernel has sufficient resources to + * accept more I/O. This is not implemented yet. + */ + struct iovec iovec[VRING_MAX]; + struct iovec *end = &iovec[VRING_MAX]; + struct iovec *iov = iovec; + + /* When a request is read from the vring, the index of the first descriptor + * (aka head) is returned so that the completed request can be pushed onto + * the vring later. + * + * The number of hypervisor read-only iovecs is out_num. The number of + * hypervisor write-only iovecs is in_num. + */ + int head; + unsigned int out_num = 0, in_num = 0; + unsigned int num_queued; + + event_notifier_test_and_clear(&s->host_notifier); + for (;;) { + /* Disable guest->host notifies to avoid unnecessary vmexits */ + vring_disable_notification(s->vdev, &s->vring); + + for (;;) { + head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num); + if (head < 0) { + break; /* no more requests */ + } + + trace_virtio_blk_data_plane_process_request(s, out_num, in_num, + head); + + if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) { + vring_set_broken(&s->vring); + break; + } + iov += out_num + in_num; + } + + if (likely(head == -EAGAIN)) { /* vring emptied */ + /* Re-enable guest->host notifies and stop processing the vring. + * But if the guest has snuck in more descriptors, keep processing. + */ + if (vring_enable_notification(s->vdev, &s->vring)) { + break; + } + } else { /* head == -ENOBUFS or fatal error, iovecs[] is depleted */ + /* Since there are no iovecs[] left, stop processing for now. Do + * not re-enable guest->host notifies since the I/O completion + * handler knows to check for more vring descriptors anyway. 
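+             * (handle_io() re-runs handle_notify() via the
+             * vring_more_avail() check once completions have freed up
+             * iocbs and iovec space.)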
+ */ + break; + } + } + + num_queued = ioq_num_queued(&s->ioqueue); + if (num_queued > 0) { + s->num_reqs += num_queued; + + int rc = ioq_submit(&s->ioqueue); + if (unlikely(rc < 0)) { + fprintf(stderr, "ioq_submit failed %d\n", rc); + exit(1); + } + } +} + +static int flush_io(EventNotifier *e) +{ + VirtIOBlockDataPlane *s = container_of(e, VirtIOBlockDataPlane, + io_notifier); + + return s->num_reqs > 0; +} + +static void handle_io(EventNotifier *e) +{ + VirtIOBlockDataPlane *s = container_of(e, VirtIOBlockDataPlane, + io_notifier); + + event_notifier_test_and_clear(&s->io_notifier); + if (ioq_run_completion(&s->ioqueue, complete_request, s) > 0) { + notify_guest(s); + } + + /* If there were more requests than iovecs, the vring will not be empty yet + * so check again. There should now be enough resources to process more + * requests. + */ + if (unlikely(vring_more_avail(&s->vring))) { + handle_notify(&s->host_notifier); + } +} + +static void *data_plane_thread(void *opaque) +{ + VirtIOBlockDataPlane *s = opaque; + + do { + aio_poll(s->ctx, true); + } while (!s->stopping || s->num_reqs > 0); + return NULL; +} + +static void start_data_plane_bh(void *opaque) +{ + VirtIOBlockDataPlane *s = opaque; + + qemu_bh_delete(s->start_bh); + s->start_bh = NULL; + qemu_thread_create(&s->thread, data_plane_thread, + s, QEMU_THREAD_JOINABLE); +} + +bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk, + VirtIOBlockDataPlane **dataplane) +{ + VirtIOBlockDataPlane *s; + int fd; + + *dataplane = NULL; + + if (!blk->data_plane) { + return true; + } + + if (blk->scsi) { + error_report("device is incompatible with x-data-plane, use scsi=off"); + return false; + } + + if (blk->config_wce) { + error_report("device is incompatible with x-data-plane, " + "use config-wce=off"); + return false; + } + + fd = raw_get_aio_fd(blk->conf.bs); + if (fd < 0) { + error_report("drive is incompatible with x-data-plane, " + "use format=raw,cache=none,aio=native"); + return false; + } + + s = g_new0(VirtIOBlockDataPlane, 1); + s->vdev = vdev; + s->fd = fd; + s->blk = blk; + + /* Prevent block operations that conflict with data plane thread */ + bdrv_set_in_use(blk->conf.bs, 1); + + error_setg(&s->migration_blocker, + "x-data-plane does not support migration"); + migrate_add_blocker(s->migration_blocker); + + *dataplane = s; + return true; +} + +void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s) +{ + if (!s) { + return; + } + + virtio_blk_data_plane_stop(s); + migrate_del_blocker(s->migration_blocker); + error_free(s->migration_blocker); + bdrv_set_in_use(s->blk->conf.bs, 0); + g_free(s); +} + +void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s) +{ + VirtQueue *vq; + int i; + + if (s->started) { + return; + } + + vq = virtio_get_queue(s->vdev, 0); + if (!vring_setup(&s->vring, s->vdev, 0)) { + return; + } + + s->ctx = aio_context_new(); + + /* Set up guest notifier (irq) */ + if (s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque, 1, + true) != 0) { + fprintf(stderr, "virtio-blk failed to set guest notifier, " + "ensure -enable-kvm is set\n"); + exit(1); + } + s->guest_notifier = virtio_queue_get_guest_notifier(vq); + + /* Set up virtqueue notify */ + if (s->vdev->binding->set_host_notifier(s->vdev->binding_opaque, + 0, true) != 0) { + fprintf(stderr, "virtio-blk failed to set host notifier\n"); + exit(1); + } + s->host_notifier = *virtio_queue_get_host_notifier(vq); + aio_set_event_notifier(s->ctx, &s->host_notifier, handle_notify, flush_true); + + /* Set up ioqueue */ + 
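+    /* The ioqueue is sized for REQ_MAX in-flight requests: every
+     * preallocated iocb from s->requests[] is handed to its free list,
+     * and its completion eventfd is hooked into the AioContext below.
+     */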
ioq_init(&s->ioqueue, s->fd, REQ_MAX); + for (i = 0; i < ARRAY_SIZE(s->requests); i++) { + ioq_put_iocb(&s->ioqueue, &s->requests[i].iocb); + } + s->io_notifier = *ioq_get_notifier(&s->ioqueue); + aio_set_event_notifier(s->ctx, &s->io_notifier, handle_io, flush_io); + + s->started = true; + trace_virtio_blk_data_plane_start(s); + + /* Kick right away to begin processing requests already in vring */ + event_notifier_set(virtio_queue_get_host_notifier(vq)); + + /* Spawn thread in BH so it inherits iothread cpusets */ + s->start_bh = qemu_bh_new(start_data_plane_bh, s); + qemu_bh_schedule(s->start_bh); +} + +void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s) +{ + if (!s->started || s->stopping) { + return; + } + s->stopping = true; + trace_virtio_blk_data_plane_stop(s); + + /* Stop thread or cancel pending thread creation BH */ + if (s->start_bh) { + qemu_bh_delete(s->start_bh); + s->start_bh = NULL; + } else { + aio_notify(s->ctx); + qemu_thread_join(&s->thread); + } + + aio_set_event_notifier(s->ctx, &s->io_notifier, NULL, NULL); + ioq_cleanup(&s->ioqueue); + + aio_set_event_notifier(s->ctx, &s->host_notifier, NULL, NULL); + s->vdev->binding->set_host_notifier(s->vdev->binding_opaque, 0, false); + + aio_context_unref(s->ctx); + + /* Clean up guest notifier (irq) */ + s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque, 1, false); + + vring_teardown(&s->vring); + s->started = false; + s->stopping = false; +} diff --git a/hw/block/dataplane/virtio-blk.h b/hw/block/dataplane/virtio-blk.h new file mode 100644 index 0000000000..c90e99f48f --- /dev/null +++ b/hw/block/dataplane/virtio-blk.h @@ -0,0 +1,29 @@ +/* + * Dedicated thread for virtio-blk I/O processing + * + * Copyright 2012 IBM, Corp. + * Copyright 2012 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef HW_DATAPLANE_VIRTIO_BLK_H +#define HW_DATAPLANE_VIRTIO_BLK_H + +#include "hw/virtio/virtio.h" + +typedef struct VirtIOBlockDataPlane VirtIOBlockDataPlane; + +bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk, + VirtIOBlockDataPlane **dataplane); +void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s); +void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s); +void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s); +void virtio_blk_data_plane_drain(VirtIOBlockDataPlane *s); + +#endif /* HW_DATAPLANE_VIRTIO_BLK_H */ diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c new file mode 100644 index 0000000000..6efb2f063d --- /dev/null +++ b/hw/block/virtio-blk.c @@ -0,0 +1,732 @@ +/* + * Virtio Block Device + * + * Copyright IBM, Corp. 2007 + * + * Authors: + * Anthony Liguori + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu-common.h" +#include "qemu/error-report.h" +#include "trace.h" +#include "hw/block/block.h" +#include "sysemu/blockdev.h" +#include "hw/virtio/virtio-blk.h" +#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE +# include "dataplane/virtio-blk.h" +#endif +#include "block/scsi.h" +#ifdef __linux__ +# include +#endif +#include "hw/virtio/virtio-bus.h" + +typedef struct VirtIOBlockReq +{ + VirtIOBlock *dev; + VirtQueueElement elem; + struct virtio_blk_inhdr *in; + struct virtio_blk_outhdr *out; + struct virtio_scsi_inhdr *scsi; + QEMUIOVector qiov; + struct VirtIOBlockReq *next; + BlockAcctCookie acct; +} VirtIOBlockReq; + +static void virtio_blk_req_complete(VirtIOBlockReq *req, int status) +{ + VirtIOBlock *s = req->dev; + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + trace_virtio_blk_req_complete(req, status); + + stb_p(&req->in->status, status); + virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in)); + virtio_notify(vdev, s->vq); +} + +static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error, + bool is_read) +{ + BlockErrorAction action = bdrv_get_error_action(req->dev->bs, is_read, error); + VirtIOBlock *s = req->dev; + + if (action == BDRV_ACTION_STOP) { + req->next = s->rq; + s->rq = req; + } else if (action == BDRV_ACTION_REPORT) { + virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR); + bdrv_acct_done(s->bs, &req->acct); + g_free(req); + } + + bdrv_error_action(s->bs, action, is_read, error); + return action != BDRV_ACTION_IGNORE; +} + +static void virtio_blk_rw_complete(void *opaque, int ret) +{ + VirtIOBlockReq *req = opaque; + + trace_virtio_blk_rw_complete(req, ret); + + if (ret) { + bool is_read = !(ldl_p(&req->out->type) & VIRTIO_BLK_T_OUT); + if (virtio_blk_handle_rw_error(req, -ret, is_read)) + return; + } + + virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); + bdrv_acct_done(req->dev->bs, &req->acct); + g_free(req); +} + +static void virtio_blk_flush_complete(void *opaque, int ret) +{ + VirtIOBlockReq *req = opaque; + + if (ret) { + if (virtio_blk_handle_rw_error(req, -ret, 0)) { + return; + } + } + + virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); + bdrv_acct_done(req->dev->bs, &req->acct); + g_free(req); +} + +static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s) +{ + VirtIOBlockReq *req = g_malloc(sizeof(*req)); + req->dev = s; + req->qiov.size = 0; + req->next = NULL; + return req; +} + +static VirtIOBlockReq *virtio_blk_get_request(VirtIOBlock *s) +{ + VirtIOBlockReq *req = virtio_blk_alloc_request(s); + + if (req != NULL) { + if (!virtqueue_pop(s->vq, &req->elem)) { + g_free(req); + return NULL; + } + } + + return req; +} + +static void virtio_blk_handle_scsi(VirtIOBlockReq *req) +{ +#ifdef __linux__ + int ret; + int i; +#endif + int status = VIRTIO_BLK_S_OK; + + /* + * We require at least one output segment each for the virtio_blk_outhdr + * and the SCSI command block. + * + * We also at least require the virtio_blk_inhdr, the virtio_scsi_inhdr + * and the sense buffer pointer in the input segments. + */ + if (req->elem.out_num < 2 || req->elem.in_num < 3) { + virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR); + g_free(req); + return; + } + + /* + * The scsi inhdr is placed in the second-to-last input segment, just + * before the regular inhdr. + */ + req->scsi = (void *)req->elem.in_sg[req->elem.in_num - 2].iov_base; + + if (!req->dev->blk.scsi) { + status = VIRTIO_BLK_S_UNSUPP; + goto fail; + } + + /* + * No support for bidirection commands yet. 
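+     * (i.e. a request that carries both extra write payload in the out
+     * segments and extra read buffers in the in segments.)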
+ */ + if (req->elem.out_num > 2 && req->elem.in_num > 3) { + status = VIRTIO_BLK_S_UNSUPP; + goto fail; + } + +#ifdef __linux__ + struct sg_io_hdr hdr; + memset(&hdr, 0, sizeof(struct sg_io_hdr)); + hdr.interface_id = 'S'; + hdr.cmd_len = req->elem.out_sg[1].iov_len; + hdr.cmdp = req->elem.out_sg[1].iov_base; + hdr.dxfer_len = 0; + + if (req->elem.out_num > 2) { + /* + * If there are more than the minimally required 2 output segments + * there is write payload starting from the third iovec. + */ + hdr.dxfer_direction = SG_DXFER_TO_DEV; + hdr.iovec_count = req->elem.out_num - 2; + + for (i = 0; i < hdr.iovec_count; i++) + hdr.dxfer_len += req->elem.out_sg[i + 2].iov_len; + + hdr.dxferp = req->elem.out_sg + 2; + + } else if (req->elem.in_num > 3) { + /* + * If we have more than 3 input segments the guest wants to actually + * read data. + */ + hdr.dxfer_direction = SG_DXFER_FROM_DEV; + hdr.iovec_count = req->elem.in_num - 3; + for (i = 0; i < hdr.iovec_count; i++) + hdr.dxfer_len += req->elem.in_sg[i].iov_len; + + hdr.dxferp = req->elem.in_sg; + } else { + /* + * Some SCSI commands don't actually transfer any data. + */ + hdr.dxfer_direction = SG_DXFER_NONE; + } + + hdr.sbp = req->elem.in_sg[req->elem.in_num - 3].iov_base; + hdr.mx_sb_len = req->elem.in_sg[req->elem.in_num - 3].iov_len; + + ret = bdrv_ioctl(req->dev->bs, SG_IO, &hdr); + if (ret) { + status = VIRTIO_BLK_S_UNSUPP; + goto fail; + } + + /* + * From SCSI-Generic-HOWTO: "Some lower level drivers (e.g. ide-scsi) + * clear the masked_status field [hence status gets cleared too, see + * block/scsi_ioctl.c] even when a CHECK_CONDITION or COMMAND_TERMINATED + * status has occurred. However they do set DRIVER_SENSE in driver_status + * field. Also a (sb_len_wr > 0) indicates there is a sense buffer. + */ + if (hdr.status == 0 && hdr.sb_len_wr > 0) { + hdr.status = CHECK_CONDITION; + } + + stl_p(&req->scsi->errors, + hdr.status | (hdr.msg_status << 8) | + (hdr.host_status << 16) | (hdr.driver_status << 24)); + stl_p(&req->scsi->residual, hdr.resid); + stl_p(&req->scsi->sense_len, hdr.sb_len_wr); + stl_p(&req->scsi->data_len, hdr.dxfer_len); + + virtio_blk_req_complete(req, status); + g_free(req); + return; +#else + abort(); +#endif + +fail: + /* Just put anything nonzero so that the ioctl fails in the guest. */ + stl_p(&req->scsi->errors, 255); + virtio_blk_req_complete(req, status); + g_free(req); +} + +typedef struct MultiReqBuffer { + BlockRequest blkreq[32]; + unsigned int num_writes; +} MultiReqBuffer; + +static void virtio_submit_multiwrite(BlockDriverState *bs, MultiReqBuffer *mrb) +{ + int i, ret; + + if (!mrb->num_writes) { + return; + } + + ret = bdrv_aio_multiwrite(bs, mrb->blkreq, mrb->num_writes); + if (ret != 0) { + for (i = 0; i < mrb->num_writes; i++) { + if (mrb->blkreq[i].error) { + virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO); + } + } + } + + mrb->num_writes = 0; +} + +static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb) +{ + bdrv_acct_start(req->dev->bs, &req->acct, 0, BDRV_ACCT_FLUSH); + + /* + * Make sure all outstanding writes are posted to the backing device. 
+ */ + virtio_submit_multiwrite(req->dev->bs, mrb); + bdrv_aio_flush(req->dev->bs, virtio_blk_flush_complete, req); +} + +static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb) +{ + BlockRequest *blkreq; + uint64_t sector; + + sector = ldq_p(&req->out->sector); + + bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_WRITE); + + trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512); + + if (sector & req->dev->sector_mask) { + virtio_blk_rw_complete(req, -EIO); + return; + } + if (req->qiov.size % req->dev->conf->logical_block_size) { + virtio_blk_rw_complete(req, -EIO); + return; + } + + if (mrb->num_writes == 32) { + virtio_submit_multiwrite(req->dev->bs, mrb); + } + + blkreq = &mrb->blkreq[mrb->num_writes]; + blkreq->sector = sector; + blkreq->nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE; + blkreq->qiov = &req->qiov; + blkreq->cb = virtio_blk_rw_complete; + blkreq->opaque = req; + blkreq->error = 0; + + mrb->num_writes++; +} + +static void virtio_blk_handle_read(VirtIOBlockReq *req) +{ + uint64_t sector; + + sector = ldq_p(&req->out->sector); + + bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_READ); + + trace_virtio_blk_handle_read(req, sector, req->qiov.size / 512); + + if (sector & req->dev->sector_mask) { + virtio_blk_rw_complete(req, -EIO); + return; + } + if (req->qiov.size % req->dev->conf->logical_block_size) { + virtio_blk_rw_complete(req, -EIO); + return; + } + bdrv_aio_readv(req->dev->bs, sector, &req->qiov, + req->qiov.size / BDRV_SECTOR_SIZE, + virtio_blk_rw_complete, req); +} + +static void virtio_blk_handle_request(VirtIOBlockReq *req, + MultiReqBuffer *mrb) +{ + uint32_t type; + + if (req->elem.out_num < 1 || req->elem.in_num < 1) { + error_report("virtio-blk missing headers"); + exit(1); + } + + if (req->elem.out_sg[0].iov_len < sizeof(*req->out) || + req->elem.in_sg[req->elem.in_num - 1].iov_len < sizeof(*req->in)) { + error_report("virtio-blk header not in correct element"); + exit(1); + } + + req->out = (void *)req->elem.out_sg[0].iov_base; + req->in = (void *)req->elem.in_sg[req->elem.in_num - 1].iov_base; + + type = ldl_p(&req->out->type); + + if (type & VIRTIO_BLK_T_FLUSH) { + virtio_blk_handle_flush(req, mrb); + } else if (type & VIRTIO_BLK_T_SCSI_CMD) { + virtio_blk_handle_scsi(req); + } else if (type & VIRTIO_BLK_T_GET_ID) { + VirtIOBlock *s = req->dev; + + /* + * NB: per existing s/n string convention the string is + * terminated by '\0' only when shorter than buffer. + */ + strncpy(req->elem.in_sg[0].iov_base, + s->blk.serial ? s->blk.serial : "", + MIN(req->elem.in_sg[0].iov_len, VIRTIO_BLK_ID_BYTES)); + virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); + g_free(req); + } else if (type & VIRTIO_BLK_T_OUT) { + qemu_iovec_init_external(&req->qiov, &req->elem.out_sg[1], + req->elem.out_num - 1); + virtio_blk_handle_write(req, mrb); + } else if (type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) { + /* VIRTIO_BLK_T_IN is 0, so we can't just & it. 
*/ + qemu_iovec_init_external(&req->qiov, &req->elem.in_sg[0], + req->elem.in_num - 1); + virtio_blk_handle_read(req); + } else { + virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP); + g_free(req); + } +} + +static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOBlock *s = VIRTIO_BLK(vdev); + VirtIOBlockReq *req; + MultiReqBuffer mrb = { + .num_writes = 0, + }; + +#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE + /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start + * dataplane here instead of waiting for .set_status(). + */ + if (s->dataplane) { + virtio_blk_data_plane_start(s->dataplane); + return; + } +#endif + + while ((req = virtio_blk_get_request(s))) { + virtio_blk_handle_request(req, &mrb); + } + + virtio_submit_multiwrite(s->bs, &mrb); + + /* + * FIXME: Want to check for completions before returning to guest mode, + * so cached reads and writes are reported as quickly as possible. But + * that should be done in the generic block layer. + */ +} + +static void virtio_blk_dma_restart_bh(void *opaque) +{ + VirtIOBlock *s = opaque; + VirtIOBlockReq *req = s->rq; + MultiReqBuffer mrb = { + .num_writes = 0, + }; + + qemu_bh_delete(s->bh); + s->bh = NULL; + + s->rq = NULL; + + while (req) { + virtio_blk_handle_request(req, &mrb); + req = req->next; + } + + virtio_submit_multiwrite(s->bs, &mrb); +} + +static void virtio_blk_dma_restart_cb(void *opaque, int running, + RunState state) +{ + VirtIOBlock *s = opaque; + + if (!running) { + return; + } + + if (!s->bh) { + s->bh = qemu_bh_new(virtio_blk_dma_restart_bh, s); + qemu_bh_schedule(s->bh); + } +} + +static void virtio_blk_reset(VirtIODevice *vdev) +{ +#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE + VirtIOBlock *s = VIRTIO_BLK(vdev); + + if (s->dataplane) { + virtio_blk_data_plane_stop(s->dataplane); + } +#endif + + /* + * This should cancel pending requests, but can't do nicely until there + * are per-device request lists. + */ + bdrv_drain_all(); +} + +/* coalesce internal state, copy to pci i/o region 0 + */ +static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config) +{ + VirtIOBlock *s = VIRTIO_BLK(vdev); + struct virtio_blk_config blkcfg; + uint64_t capacity; + int blk_size = s->conf->logical_block_size; + + bdrv_get_geometry(s->bs, &capacity); + memset(&blkcfg, 0, sizeof(blkcfg)); + stq_raw(&blkcfg.capacity, capacity); + stl_raw(&blkcfg.seg_max, 128 - 2); + stw_raw(&blkcfg.cylinders, s->conf->cyls); + stl_raw(&blkcfg.blk_size, blk_size); + stw_raw(&blkcfg.min_io_size, s->conf->min_io_size / blk_size); + stw_raw(&blkcfg.opt_io_size, s->conf->opt_io_size / blk_size); + blkcfg.heads = s->conf->heads; + /* + * We must ensure that the block device capacity is a multiple of + * the logical block size. If that is not the case, lets use + * sector_mask to adopt the geometry to have a correct picture. + * For those devices where the capacity is ok for the given geometry + * we dont touch the sector value of the geometry, since some devices + * (like s390 dasd) need a specific value. Here the capacity is already + * cyls*heads*secs*blk_size and the sector value is not block size + * divided by 512 - instead it is the amount of blk_size blocks + * per track (cylinder). 
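+     * (For instance, a DASD formatted with 4096-byte blocks reports its
+     * sectors-per-track count in 4 KiB blocks, not in 512-byte units.)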
+ */ + if (bdrv_getlength(s->bs) / s->conf->heads / s->conf->secs % blk_size) { + blkcfg.sectors = s->conf->secs & ~s->sector_mask; + } else { + blkcfg.sectors = s->conf->secs; + } + blkcfg.size_max = 0; + blkcfg.physical_block_exp = get_physical_block_exp(s->conf); + blkcfg.alignment_offset = 0; + blkcfg.wce = bdrv_enable_write_cache(s->bs); + memcpy(config, &blkcfg, sizeof(struct virtio_blk_config)); +} + +static void virtio_blk_set_config(VirtIODevice *vdev, const uint8_t *config) +{ + VirtIOBlock *s = VIRTIO_BLK(vdev); + struct virtio_blk_config blkcfg; + + memcpy(&blkcfg, config, sizeof(blkcfg)); + bdrv_set_enable_write_cache(s->bs, blkcfg.wce != 0); +} + +static uint32_t virtio_blk_get_features(VirtIODevice *vdev, uint32_t features) +{ + VirtIOBlock *s = VIRTIO_BLK(vdev); + + features |= (1 << VIRTIO_BLK_F_SEG_MAX); + features |= (1 << VIRTIO_BLK_F_GEOMETRY); + features |= (1 << VIRTIO_BLK_F_TOPOLOGY); + features |= (1 << VIRTIO_BLK_F_BLK_SIZE); + features |= (1 << VIRTIO_BLK_F_SCSI); + + if (s->blk.config_wce) { + features |= (1 << VIRTIO_BLK_F_CONFIG_WCE); + } + if (bdrv_enable_write_cache(s->bs)) + features |= (1 << VIRTIO_BLK_F_WCE); + + if (bdrv_is_read_only(s->bs)) + features |= 1 << VIRTIO_BLK_F_RO; + + return features; +} + +static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status) +{ + VirtIOBlock *s = VIRTIO_BLK(vdev); + uint32_t features; + +#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE + if (s->dataplane && !(status & (VIRTIO_CONFIG_S_DRIVER | + VIRTIO_CONFIG_S_DRIVER_OK))) { + virtio_blk_data_plane_stop(s->dataplane); + } +#endif + + if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) { + return; + } + + features = vdev->guest_features; + bdrv_set_enable_write_cache(s->bs, !!(features & (1 << VIRTIO_BLK_F_WCE))); +} + +static void virtio_blk_save(QEMUFile *f, void *opaque) +{ + VirtIOBlock *s = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(s); + VirtIOBlockReq *req = s->rq; + + virtio_save(vdev, f); + + while (req) { + qemu_put_sbyte(f, 1); + qemu_put_buffer(f, (unsigned char*)&req->elem, sizeof(req->elem)); + req = req->next; + } + qemu_put_sbyte(f, 0); +} + +static int virtio_blk_load(QEMUFile *f, void *opaque, int version_id) +{ + VirtIOBlock *s = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(s); + int ret; + + if (version_id != 2) + return -EINVAL; + + ret = virtio_load(vdev, f); + if (ret) { + return ret; + } + + while (qemu_get_sbyte(f)) { + VirtIOBlockReq *req = virtio_blk_alloc_request(s); + qemu_get_buffer(f, (unsigned char*)&req->elem, sizeof(req->elem)); + req->next = s->rq; + s->rq = req; + + virtqueue_map_sg(req->elem.in_sg, req->elem.in_addr, + req->elem.in_num, 1); + virtqueue_map_sg(req->elem.out_sg, req->elem.out_addr, + req->elem.out_num, 0); + } + + return 0; +} + +static void virtio_blk_resize(void *opaque) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(opaque); + + virtio_notify_config(vdev); +} + +static const BlockDevOps virtio_block_ops = { + .resize_cb = virtio_blk_resize, +}; + +void virtio_blk_set_conf(DeviceState *dev, VirtIOBlkConf *blk) +{ + VirtIOBlock *s = VIRTIO_BLK(dev); + memcpy(&(s->blk), blk, sizeof(struct VirtIOBlkConf)); +} + +static int virtio_blk_device_init(VirtIODevice *vdev) +{ + DeviceState *qdev = DEVICE(vdev); + VirtIOBlock *s = VIRTIO_BLK(vdev); + VirtIOBlkConf *blk = &(s->blk); + static int virtio_blk_id; + + if (!blk->conf.bs) { + error_report("drive property not set"); + return -1; + } + if (!bdrv_is_inserted(blk->conf.bs)) { + error_report("Device needs media, but drive is empty"); + return -1; + } + + blkconf_serial(&blk->conf, 
&blk->serial); + if (blkconf_geometry(&blk->conf, NULL, 65535, 255, 255) < 0) { + return -1; + } + + virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, + sizeof(struct virtio_blk_config)); + + vdev->get_config = virtio_blk_update_config; + vdev->set_config = virtio_blk_set_config; + vdev->get_features = virtio_blk_get_features; + vdev->set_status = virtio_blk_set_status; + vdev->reset = virtio_blk_reset; + s->bs = blk->conf.bs; + s->conf = &blk->conf; + memcpy(&(s->blk), blk, sizeof(struct VirtIOBlkConf)); + s->rq = NULL; + s->sector_mask = (s->conf->logical_block_size / BDRV_SECTOR_SIZE) - 1; + + s->vq = virtio_add_queue(vdev, 128, virtio_blk_handle_output); +#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE + if (!virtio_blk_data_plane_create(vdev, blk, &s->dataplane)) { + virtio_common_cleanup(vdev); + return -1; + } +#endif + + s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s); + register_savevm(qdev, "virtio-blk", virtio_blk_id++, 2, + virtio_blk_save, virtio_blk_load, s); + bdrv_set_dev_ops(s->bs, &virtio_block_ops, s); + bdrv_set_buffer_alignment(s->bs, s->conf->logical_block_size); + + bdrv_iostatus_enable(s->bs); + + add_boot_device_path(s->conf->bootindex, qdev, "/disk@0,0"); + return 0; +} + +static int virtio_blk_device_exit(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtIOBlock *s = VIRTIO_BLK(dev); +#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE + virtio_blk_data_plane_destroy(s->dataplane); + s->dataplane = NULL; +#endif + qemu_del_vm_change_state_handler(s->change); + unregister_savevm(dev, "virtio-blk", s); + blockdev_mark_auto_del(s->bs); + virtio_common_cleanup(vdev); + return 0; +} + +static Property virtio_blk_properties[] = { + DEFINE_VIRTIO_BLK_PROPERTIES(VirtIOBlock, blk), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_blk_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + dc->exit = virtio_blk_device_exit; + dc->props = virtio_blk_properties; + vdc->init = virtio_blk_device_init; + vdc->get_config = virtio_blk_update_config; + vdc->set_config = virtio_blk_set_config; + vdc->get_features = virtio_blk_get_features; + vdc->set_status = virtio_blk_set_status; + vdc->reset = virtio_blk_reset; +} + +static const TypeInfo virtio_device_info = { + .name = TYPE_VIRTIO_BLK, + .parent = TYPE_VIRTIO_DEVICE, + .instance_size = sizeof(VirtIOBlock), + .class_init = virtio_blk_class_init, +}; + +static void virtio_register_types(void) +{ + type_register_static(&virtio_device_info); +} + +type_init(virtio_register_types) diff --git a/hw/char/Makefile.objs b/hw/char/Makefile.objs index eee23ff637..ddfd3ec9cb 100644 --- a/hw/char/Makefile.objs +++ b/hw/char/Makefile.objs @@ -8,3 +8,5 @@ common-obj-$(CONFIG_VIRTIO) += virtio-console.o common-obj-$(CONFIG_XILINX) += xilinx_uartlite.o common-obj-$(CONFIG_XEN_BACKEND) += xen_console.o common-obj-$(CONFIG_CADENCE) += cadence_uart.o + +obj-$(CONFIG_VIRTIO) += virtio-serial-bus.o diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c new file mode 100644 index 0000000000..1dba8ab2c6 --- /dev/null +++ b/hw/char/virtio-serial-bus.c @@ -0,0 +1,1018 @@ +/* + * A bus for connecting virtio serial and console ports + * + * Copyright (C) 2009, 2010 Red Hat, Inc. + * + * Author(s): + * Amit Shah + * + * Some earlier parts are: + * Copyright IBM, Corp. 2008 + * authored by + * Christian Ehrhardt + * + * This work is licensed under the terms of the GNU GPL, version 2. 
See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu/iov.h" +#include "monitor/monitor.h" +#include "qemu/queue.h" +#include "hw/sysbus.h" +#include "trace.h" +#include "hw/virtio/virtio-serial.h" + +static VirtIOSerialPort *find_port_by_id(VirtIOSerial *vser, uint32_t id) +{ + VirtIOSerialPort *port; + + if (id == VIRTIO_CONSOLE_BAD_ID) { + return NULL; + } + + QTAILQ_FOREACH(port, &vser->ports, next) { + if (port->id == id) + return port; + } + return NULL; +} + +static VirtIOSerialPort *find_port_by_vq(VirtIOSerial *vser, VirtQueue *vq) +{ + VirtIOSerialPort *port; + + QTAILQ_FOREACH(port, &vser->ports, next) { + if (port->ivq == vq || port->ovq == vq) + return port; + } + return NULL; +} + +static bool use_multiport(VirtIOSerial *vser) +{ + return vser->vdev.guest_features & (1 << VIRTIO_CONSOLE_F_MULTIPORT); +} + +static size_t write_to_port(VirtIOSerialPort *port, + const uint8_t *buf, size_t size) +{ + VirtQueueElement elem; + VirtQueue *vq; + size_t offset; + + vq = port->ivq; + if (!virtio_queue_ready(vq)) { + return 0; + } + + offset = 0; + while (offset < size) { + size_t len; + + if (!virtqueue_pop(vq, &elem)) { + break; + } + + len = iov_from_buf(elem.in_sg, elem.in_num, 0, + buf + offset, size - offset); + offset += len; + + virtqueue_push(vq, &elem, len); + } + + virtio_notify(&port->vser->vdev, vq); + return offset; +} + +static void discard_vq_data(VirtQueue *vq, VirtIODevice *vdev) +{ + VirtQueueElement elem; + + if (!virtio_queue_ready(vq)) { + return; + } + while (virtqueue_pop(vq, &elem)) { + virtqueue_push(vq, &elem, 0); + } + virtio_notify(vdev, vq); +} + +static void do_flush_queued_data(VirtIOSerialPort *port, VirtQueue *vq, + VirtIODevice *vdev) +{ + VirtIOSerialPortClass *vsc; + + assert(port); + assert(virtio_queue_ready(vq)); + + vsc = VIRTIO_SERIAL_PORT_GET_CLASS(port); + + while (!port->throttled) { + unsigned int i; + + /* Pop an elem only if we haven't left off a previous one mid-way */ + if (!port->elem.out_num) { + if (!virtqueue_pop(vq, &port->elem)) { + break; + } + port->iov_idx = 0; + port->iov_offset = 0; + } + + for (i = port->iov_idx; i < port->elem.out_num; i++) { + size_t buf_size; + ssize_t ret; + + buf_size = port->elem.out_sg[i].iov_len - port->iov_offset; + ret = vsc->have_data(port, + port->elem.out_sg[i].iov_base + + port->iov_offset, + buf_size); + if (port->throttled) { + port->iov_idx = i; + if (ret > 0) { + port->iov_offset += ret; + } + break; + } + port->iov_offset = 0; + } + if (port->throttled) { + break; + } + virtqueue_push(vq, &port->elem, 0); + port->elem.out_num = 0; + } + virtio_notify(vdev, vq); +} + +static void flush_queued_data(VirtIOSerialPort *port) +{ + assert(port); + + if (!virtio_queue_ready(port->ovq)) { + return; + } + do_flush_queued_data(port, port->ovq, &port->vser->vdev); +} + +static size_t send_control_msg(VirtIOSerial *vser, void *buf, size_t len) +{ + VirtQueueElement elem; + VirtQueue *vq; + + vq = vser->c_ivq; + if (!virtio_queue_ready(vq)) { + return 0; + } + if (!virtqueue_pop(vq, &elem)) { + return 0; + } + + memcpy(elem.in_sg[0].iov_base, buf, len); + + virtqueue_push(vq, &elem, len); + virtio_notify(&vser->vdev, vq); + return len; +} + +static size_t send_control_event(VirtIOSerial *vser, uint32_t port_id, + uint16_t event, uint16_t value) +{ + struct virtio_console_control cpkt; + + stl_p(&cpkt.id, port_id); + stw_p(&cpkt.event, event); + 
stw_p(&cpkt.value, value); + + trace_virtio_serial_send_control_event(port_id, event, value); + return send_control_msg(vser, &cpkt, sizeof(cpkt)); +} + +/* Functions for use inside qemu to open and read from/write to ports */ +int virtio_serial_open(VirtIOSerialPort *port) +{ + /* Don't allow opening an already-open port */ + if (port->host_connected) { + return 0; + } + /* Send port open notification to the guest */ + port->host_connected = true; + send_control_event(port->vser, port->id, VIRTIO_CONSOLE_PORT_OPEN, 1); + + return 0; +} + +int virtio_serial_close(VirtIOSerialPort *port) +{ + port->host_connected = false; + /* + * If there's any data the guest sent which the app didn't + * consume, reset the throttling flag and discard the data. + */ + port->throttled = false; + discard_vq_data(port->ovq, &port->vser->vdev); + + send_control_event(port->vser, port->id, VIRTIO_CONSOLE_PORT_OPEN, 0); + + return 0; +} + +/* Individual ports/apps call this function to write to the guest. */ +ssize_t virtio_serial_write(VirtIOSerialPort *port, const uint8_t *buf, + size_t size) +{ + if (!port || !port->host_connected || !port->guest_connected) { + return 0; + } + return write_to_port(port, buf, size); +} + +/* + * Readiness of the guest to accept data on a port. + * Returns max. data the guest can receive + */ +size_t virtio_serial_guest_ready(VirtIOSerialPort *port) +{ + VirtQueue *vq = port->ivq; + unsigned int bytes; + + if (!virtio_queue_ready(vq) || + !(port->vser->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK) || + virtio_queue_empty(vq)) { + return 0; + } + if (use_multiport(port->vser) && !port->guest_connected) { + return 0; + } + virtqueue_get_avail_bytes(vq, &bytes, NULL, 4096, 0); + return bytes; +} + +static void flush_queued_data_bh(void *opaque) +{ + VirtIOSerialPort *port = opaque; + + flush_queued_data(port); +} + +void virtio_serial_throttle_port(VirtIOSerialPort *port, bool throttle) +{ + if (!port) { + return; + } + + trace_virtio_serial_throttle_port(port->id, throttle); + port->throttled = throttle; + if (throttle) { + return; + } + qemu_bh_schedule(port->bh); +} + +/* Guest wants to notify us of some event */ +static void handle_control_message(VirtIOSerial *vser, void *buf, size_t len) +{ + struct VirtIOSerialPort *port; + VirtIOSerialPortClass *vsc; + struct virtio_console_control cpkt, *gcpkt; + uint8_t *buffer; + size_t buffer_len; + + gcpkt = buf; + + if (len < sizeof(cpkt)) { + /* The guest sent an invalid control packet */ + return; + } + + cpkt.event = lduw_p(&gcpkt->event); + cpkt.value = lduw_p(&gcpkt->value); + + trace_virtio_serial_handle_control_message(cpkt.event, cpkt.value); + + if (cpkt.event == VIRTIO_CONSOLE_DEVICE_READY) { + if (!cpkt.value) { + error_report("virtio-serial-bus: Guest failure in adding device %s", + vser->bus.qbus.name); + return; + } + /* + * The device is up, we can now tell the device about all the + * ports we have here. 
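+         * Each existing port is announced with a VIRTIO_CONSOLE_PORT_ADD
+         * control event; the guest answers per port with PORT_READY,
+         * handled in the switch below.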
+ */ + QTAILQ_FOREACH(port, &vser->ports, next) { + send_control_event(vser, port->id, VIRTIO_CONSOLE_PORT_ADD, 1); + } + return; + } + + port = find_port_by_id(vser, ldl_p(&gcpkt->id)); + if (!port) { + error_report("virtio-serial-bus: Unexpected port id %u for device %s", + ldl_p(&gcpkt->id), vser->bus.qbus.name); + return; + } + + trace_virtio_serial_handle_control_message_port(port->id); + + vsc = VIRTIO_SERIAL_PORT_GET_CLASS(port); + + switch(cpkt.event) { + case VIRTIO_CONSOLE_PORT_READY: + if (!cpkt.value) { + error_report("virtio-serial-bus: Guest failure in adding port %u for device %s", + port->id, vser->bus.qbus.name); + break; + } + /* + * Now that we know the guest asked for the port name, we're + * sure the guest has initialised whatever state is necessary + * for this port. Now's a good time to let the guest know if + * this port is a console port so that the guest can hook it + * up to hvc. + */ + if (vsc->is_console) { + send_control_event(vser, port->id, VIRTIO_CONSOLE_CONSOLE_PORT, 1); + } + + if (port->name) { + stl_p(&cpkt.id, port->id); + stw_p(&cpkt.event, VIRTIO_CONSOLE_PORT_NAME); + stw_p(&cpkt.value, 1); + + buffer_len = sizeof(cpkt) + strlen(port->name) + 1; + buffer = g_malloc(buffer_len); + + memcpy(buffer, &cpkt, sizeof(cpkt)); + memcpy(buffer + sizeof(cpkt), port->name, strlen(port->name)); + buffer[buffer_len - 1] = 0; + + send_control_msg(vser, buffer, buffer_len); + g_free(buffer); + } + + if (port->host_connected) { + send_control_event(vser, port->id, VIRTIO_CONSOLE_PORT_OPEN, 1); + } + + /* + * When the guest has asked us for this information it means + * the guest is all setup and has its virtqueues + * initialised. If some app is interested in knowing about + * this event, let it know. + */ + if (vsc->guest_ready) { + vsc->guest_ready(port); + } + break; + + case VIRTIO_CONSOLE_PORT_OPEN: + port->guest_connected = cpkt.value; + if (vsc->set_guest_connected) { + /* Send the guest opened notification if an app is interested */ + vsc->set_guest_connected(port, cpkt.value); + } + break; + } +} + +static void control_in(VirtIODevice *vdev, VirtQueue *vq) +{ +} + +static void control_out(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtQueueElement elem; + VirtIOSerial *vser; + uint8_t *buf; + size_t len; + + vser = DO_UPCAST(VirtIOSerial, vdev, vdev); + + len = 0; + buf = NULL; + while (virtqueue_pop(vq, &elem)) { + size_t cur_len; + + cur_len = iov_size(elem.out_sg, elem.out_num); + /* + * Allocate a new buf only if we didn't have one previously or + * if the size of the buf differs + */ + if (cur_len > len) { + g_free(buf); + + buf = g_malloc(cur_len); + len = cur_len; + } + iov_to_buf(elem.out_sg, elem.out_num, 0, buf, cur_len); + + handle_control_message(vser, buf, cur_len); + virtqueue_push(vq, &elem, 0); + } + g_free(buf); + virtio_notify(vdev, vq); +} + +/* Guest wrote something to some port. 
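+ * Flush it out to the chardev backend; with no host side connected the
+ * data is discarded, while a throttled port leaves it queued in the
+ * virtqueue for later.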
*/ +static void handle_output(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOSerial *vser; + VirtIOSerialPort *port; + + vser = DO_UPCAST(VirtIOSerial, vdev, vdev); + port = find_port_by_vq(vser, vq); + + if (!port || !port->host_connected) { + discard_vq_data(vq, vdev); + return; + } + + if (!port->throttled) { + do_flush_queued_data(port, vq, vdev); + return; + } +} + +static void handle_input(VirtIODevice *vdev, VirtQueue *vq) +{ +} + +static uint32_t get_features(VirtIODevice *vdev, uint32_t features) +{ + VirtIOSerial *vser; + + vser = DO_UPCAST(VirtIOSerial, vdev, vdev); + + if (vser->bus.max_nr_ports > 1) { + features |= (1 << VIRTIO_CONSOLE_F_MULTIPORT); + } + return features; +} + +/* Guest requested config info */ +static void get_config(VirtIODevice *vdev, uint8_t *config_data) +{ + VirtIOSerial *vser; + + vser = DO_UPCAST(VirtIOSerial, vdev, vdev); + memcpy(config_data, &vser->config, sizeof(struct virtio_console_config)); +} + +static void set_config(VirtIODevice *vdev, const uint8_t *config_data) +{ + struct virtio_console_config config; + + memcpy(&config, config_data, sizeof(config)); +} + +static void guest_reset(VirtIOSerial *vser) +{ + VirtIOSerialPort *port; + VirtIOSerialPortClass *vsc; + + QTAILQ_FOREACH(port, &vser->ports, next) { + vsc = VIRTIO_SERIAL_PORT_GET_CLASS(port); + if (port->guest_connected) { + port->guest_connected = false; + if (vsc->set_guest_connected) { + vsc->set_guest_connected(port, false); + } + } + } +} + +static void set_status(VirtIODevice *vdev, uint8_t status) +{ + VirtIOSerial *vser; + VirtIOSerialPort *port; + + vser = DO_UPCAST(VirtIOSerial, vdev, vdev); + port = find_port_by_id(vser, 0); + + if (port && !use_multiport(port->vser) + && (status & VIRTIO_CONFIG_S_DRIVER_OK)) { + /* + * Non-multiport guests won't be able to tell us guest + * open/close status. Such guests can only have a port at id + * 0, so set guest_connected for such ports as soon as guest + * is up. + */ + port->guest_connected = true; + } + if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) { + guest_reset(vser); + } +} + +static void vser_reset(VirtIODevice *vdev) +{ + VirtIOSerial *vser; + + vser = DO_UPCAST(VirtIOSerial, vdev, vdev); + guest_reset(vser); +} + +static void virtio_serial_save(QEMUFile *f, void *opaque) +{ + VirtIOSerial *s = opaque; + VirtIOSerialPort *port; + uint32_t nr_active_ports; + unsigned int i, max_nr_ports; + + /* The virtio device */ + virtio_save(&s->vdev, f); + + /* The config space */ + qemu_put_be16s(f, &s->config.cols); + qemu_put_be16s(f, &s->config.rows); + + qemu_put_be32s(f, &s->config.max_nr_ports); + + /* The ports map */ + max_nr_ports = tswap32(s->config.max_nr_ports); + for (i = 0; i < (max_nr_ports + 31) / 32; i++) { + qemu_put_be32s(f, &s->ports_map[i]); + } + + /* Ports */ + + nr_active_ports = 0; + QTAILQ_FOREACH(port, &s->ports, next) { + nr_active_ports++; + } + + qemu_put_be32s(f, &nr_active_ports); + + /* + * Items in struct VirtIOSerialPort. 
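+     * Per active port we save the id, both connection flags and, if an
+     * element was popped but not fully written out, enough state
+     * (iov_idx, iov_offset and the element itself) to resume mid-buffer
+     * on the destination.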
+ */ + QTAILQ_FOREACH(port, &s->ports, next) { + uint32_t elem_popped; + + qemu_put_be32s(f, &port->id); + qemu_put_byte(f, port->guest_connected); + qemu_put_byte(f, port->host_connected); + + elem_popped = 0; + if (port->elem.out_num) { + elem_popped = 1; + } + qemu_put_be32s(f, &elem_popped); + if (elem_popped) { + qemu_put_be32s(f, &port->iov_idx); + qemu_put_be64s(f, &port->iov_offset); + + qemu_put_buffer(f, (unsigned char *)&port->elem, + sizeof(port->elem)); + } + } +} + +static void virtio_serial_post_load_timer_cb(void *opaque) +{ + uint32_t i; + VirtIOSerial *s = opaque; + VirtIOSerialPort *port; + uint8_t host_connected; + VirtIOSerialPortClass *vsc; + + if (!s->post_load) { + return; + } + for (i = 0 ; i < s->post_load->nr_active_ports; ++i) { + port = s->post_load->connected[i].port; + host_connected = s->post_load->connected[i].host_connected; + if (host_connected != port->host_connected) { + /* + * We have to let the guest know of the host connection + * status change + */ + send_control_event(s, port->id, VIRTIO_CONSOLE_PORT_OPEN, + port->host_connected); + } + vsc = VIRTIO_SERIAL_PORT_GET_CLASS(port); + if (vsc->set_guest_connected) { + vsc->set_guest_connected(port, port->guest_connected); + } + } + g_free(s->post_load->connected); + qemu_free_timer(s->post_load->timer); + g_free(s->post_load); + s->post_load = NULL; +} + +static int fetch_active_ports_list(QEMUFile *f, int version_id, + VirtIOSerial *s, uint32_t nr_active_ports) +{ + uint32_t i; + + s->post_load = g_malloc0(sizeof(*s->post_load)); + s->post_load->nr_active_ports = nr_active_ports; + s->post_load->connected = + g_malloc0(sizeof(*s->post_load->connected) * nr_active_ports); + + s->post_load->timer = qemu_new_timer_ns(vm_clock, + virtio_serial_post_load_timer_cb, + s); + + /* Items in struct VirtIOSerialPort */ + for (i = 0; i < nr_active_ports; i++) { + VirtIOSerialPort *port; + uint32_t id; + + id = qemu_get_be32(f); + port = find_port_by_id(s, id); + if (!port) { + return -EINVAL; + } + + port->guest_connected = qemu_get_byte(f); + s->post_load->connected[i].port = port; + s->post_load->connected[i].host_connected = qemu_get_byte(f); + + if (version_id > 2) { + uint32_t elem_popped; + + qemu_get_be32s(f, &elem_popped); + if (elem_popped) { + qemu_get_be32s(f, &port->iov_idx); + qemu_get_be64s(f, &port->iov_offset); + + qemu_get_buffer(f, (unsigned char *)&port->elem, + sizeof(port->elem)); + virtqueue_map_sg(port->elem.in_sg, port->elem.in_addr, + port->elem.in_num, 1); + virtqueue_map_sg(port->elem.out_sg, port->elem.out_addr, + port->elem.out_num, 1); + + /* + * Port was throttled on source machine. Let's + * unthrottle it here so data starts flowing again. + */ + virtio_serial_throttle_port(port, false); + } + } + } + qemu_mod_timer(s->post_load->timer, 1); + return 0; +} + +static int virtio_serial_load(QEMUFile *f, void *opaque, int version_id) +{ + VirtIOSerial *s = opaque; + uint32_t max_nr_ports, nr_active_ports, ports_map; + unsigned int i; + int ret; + + if (version_id > 3) { + return -EINVAL; + } + + /* The virtio device */ + ret = virtio_load(&s->vdev, f); + if (ret) { + return ret; + } + + if (version_id < 2) { + return 0; + } + + /* The config space */ + qemu_get_be16s(f, &s->config.cols); + qemu_get_be16s(f, &s->config.rows); + + qemu_get_be32s(f, &max_nr_ports); + tswap32s(&max_nr_ports); + if (max_nr_ports > tswap32(s->config.max_nr_ports)) { + /* Source could have had more ports than us. Fail migration. 
*/ + return -EINVAL; + } + + for (i = 0; i < (max_nr_ports + 31) / 32; i++) { + qemu_get_be32s(f, &ports_map); + + if (ports_map != s->ports_map[i]) { + /* + * Ports active on source and destination don't + * match. Fail migration. + */ + return -EINVAL; + } + } + + qemu_get_be32s(f, &nr_active_ports); + + if (nr_active_ports) { + ret = fetch_active_ports_list(f, version_id, s, nr_active_ports); + if (ret) { + return ret; + } + } + return 0; +} + +static void virtser_bus_dev_print(Monitor *mon, DeviceState *qdev, int indent); + +static Property virtser_props[] = { + DEFINE_PROP_UINT32("nr", VirtIOSerialPort, id, VIRTIO_CONSOLE_BAD_ID), + DEFINE_PROP_STRING("name", VirtIOSerialPort, name), + DEFINE_PROP_END_OF_LIST() +}; + +#define TYPE_VIRTIO_SERIAL_BUS "virtio-serial-bus" +#define VIRTIO_SERIAL_BUS(obj) \ + OBJECT_CHECK(VirtIOSerialBus, (obj), TYPE_VIRTIO_SERIAL_BUS) + +static void virtser_bus_class_init(ObjectClass *klass, void *data) +{ + BusClass *k = BUS_CLASS(klass); + k->print_dev = virtser_bus_dev_print; +} + +static const TypeInfo virtser_bus_info = { + .name = TYPE_VIRTIO_SERIAL_BUS, + .parent = TYPE_BUS, + .instance_size = sizeof(VirtIOSerialBus), + .class_init = virtser_bus_class_init, +}; + +static void virtser_bus_dev_print(Monitor *mon, DeviceState *qdev, int indent) +{ + VirtIOSerialPort *port = DO_UPCAST(VirtIOSerialPort, dev, qdev); + + monitor_printf(mon, "%*sport %d, guest %s, host %s, throttle %s\n", + indent, "", port->id, + port->guest_connected ? "on" : "off", + port->host_connected ? "on" : "off", + port->throttled ? "on" : "off"); +} + +/* This function is only used if a port id is not provided by the user */ +static uint32_t find_free_port_id(VirtIOSerial *vser) +{ + unsigned int i, max_nr_ports; + + max_nr_ports = tswap32(vser->config.max_nr_ports); + for (i = 0; i < (max_nr_ports + 31) / 32; i++) { + uint32_t map, bit; + + map = vser->ports_map[i]; + bit = ffs(~map); + if (bit) { + return (bit - 1) + i * 32; + } + } + return VIRTIO_CONSOLE_BAD_ID; +} + +static void mark_port_added(VirtIOSerial *vser, uint32_t port_id) +{ + unsigned int i; + + i = port_id / 32; + vser->ports_map[i] |= 1U << (port_id % 32); +} + +static void add_port(VirtIOSerial *vser, uint32_t port_id) +{ + mark_port_added(vser, port_id); + send_control_event(vser, port_id, VIRTIO_CONSOLE_PORT_ADD, 1); +} + +static void remove_port(VirtIOSerial *vser, uint32_t port_id) +{ + VirtIOSerialPort *port; + unsigned int i; + + i = port_id / 32; + vser->ports_map[i] &= ~(1U << (port_id % 32)); + + port = find_port_by_id(vser, port_id); + /* + * This function is only called from qdev's unplug callback; if we + * get a NULL port here, we're in trouble. + */ + assert(port); + + /* Flush out any unconsumed buffers first */ + discard_vq_data(port->ovq, &port->vser->vdev); + + send_control_event(vser, port->id, VIRTIO_CONSOLE_PORT_REMOVE, 1); +} + +static int virtser_port_qdev_init(DeviceState *qdev) +{ + VirtIOSerialPort *port = DO_UPCAST(VirtIOSerialPort, dev, qdev); + VirtIOSerialPortClass *vsc = VIRTIO_SERIAL_PORT_GET_CLASS(port); + VirtIOSerialBus *bus = DO_UPCAST(VirtIOSerialBus, qbus, qdev->parent_bus); + int ret, max_nr_ports; + bool plugging_port0; + + port->vser = bus->vser; + port->bh = qemu_bh_new(flush_queued_data_bh, port); + + assert(vsc->have_data); + + /* + * Is the first console port we're seeing? If so, put it up at + * location 0. This is done for backward compatibility (old + * kernel, new qemu). 
+ */ + plugging_port0 = vsc->is_console && !find_port_by_id(port->vser, 0); + + if (find_port_by_id(port->vser, port->id)) { + error_report("virtio-serial-bus: A port already exists at id %u", + port->id); + return -1; + } + + if (port->id == VIRTIO_CONSOLE_BAD_ID) { + if (plugging_port0) { + port->id = 0; + } else { + port->id = find_free_port_id(port->vser); + if (port->id == VIRTIO_CONSOLE_BAD_ID) { + error_report("virtio-serial-bus: Maximum port limit for this device reached"); + return -1; + } + } + } + + max_nr_ports = tswap32(port->vser->config.max_nr_ports); + if (port->id >= max_nr_ports) { + error_report("virtio-serial-bus: Out-of-range port id specified, max. allowed: %u", + max_nr_ports - 1); + return -1; + } + + ret = vsc->init(port); + if (ret) { + return ret; + } + + port->elem.out_num = 0; + + QTAILQ_INSERT_TAIL(&port->vser->ports, port, next); + port->ivq = port->vser->ivqs[port->id]; + port->ovq = port->vser->ovqs[port->id]; + + add_port(port->vser, port->id); + + /* Send an update to the guest about this new port added */ + virtio_notify_config(&port->vser->vdev); + + return ret; +} + +static int virtser_port_qdev_exit(DeviceState *qdev) +{ + VirtIOSerialPort *port = DO_UPCAST(VirtIOSerialPort, dev, qdev); + VirtIOSerialPortClass *vsc = VIRTIO_SERIAL_PORT_GET_CLASS(port); + VirtIOSerial *vser = port->vser; + + qemu_bh_delete(port->bh); + remove_port(port->vser, port->id); + + QTAILQ_REMOVE(&vser->ports, port, next); + + if (vsc->exit) { + vsc->exit(port); + } + return 0; +} + +VirtIODevice *virtio_serial_init(DeviceState *dev, virtio_serial_conf *conf) +{ + VirtIOSerial *vser; + VirtIODevice *vdev; + uint32_t i, max_supported_ports; + + if (!conf->max_virtserial_ports) + return NULL; + + /* Each port takes 2 queues, and one pair is for the control queue */ + max_supported_ports = VIRTIO_PCI_QUEUE_MAX / 2 - 1; + + if (conf->max_virtserial_ports > max_supported_ports) { + error_report("maximum ports supported: %u", max_supported_ports); + return NULL; + } + + vdev = virtio_common_init("virtio-serial", VIRTIO_ID_CONSOLE, + sizeof(struct virtio_console_config), + sizeof(VirtIOSerial)); + + vser = DO_UPCAST(VirtIOSerial, vdev, vdev); + + /* Spawn a new virtio-serial bus on which the ports will ride as devices */ + qbus_create_inplace(&vser->bus.qbus, TYPE_VIRTIO_SERIAL_BUS, dev, NULL); + vser->bus.qbus.allow_hotplug = 1; + vser->bus.vser = vser; + QTAILQ_INIT(&vser->ports); + + vser->bus.max_nr_ports = conf->max_virtserial_ports; + vser->ivqs = g_malloc(conf->max_virtserial_ports * sizeof(VirtQueue *)); + vser->ovqs = g_malloc(conf->max_virtserial_ports * sizeof(VirtQueue *)); + + /* Add a queue for host to guest transfers for port 0 (backward compat) */ + vser->ivqs[0] = virtio_add_queue(vdev, 128, handle_input); + /* Add a queue for guest to host transfers for port 0 (backward compat) */ + vser->ovqs[0] = virtio_add_queue(vdev, 128, handle_output); + + /* TODO: host to guest notifications can get dropped + * if the queue fills up. Implement queueing in host, + * this might also make it possible to reduce the control + * queue size: as guest preposts buffers there, + * this will save 4Kbyte of guest memory per entry. 
 */
+
+ /* control queue: host to guest */
+ vser->c_ivq = virtio_add_queue(vdev, 32, control_in);
+ /* control queue: guest to host */
+ vser->c_ovq = virtio_add_queue(vdev, 32, control_out);
+
+ for (i = 1; i < vser->bus.max_nr_ports; i++) {
+ /* Add a per-port queue for host to guest transfers */
+ vser->ivqs[i] = virtio_add_queue(vdev, 128, handle_input);
+ /* Add a per-port queue for guest to host transfers */
+ vser->ovqs[i] = virtio_add_queue(vdev, 128, handle_output);
+ }
+
+ vser->config.max_nr_ports = tswap32(conf->max_virtserial_ports);
+ vser->ports_map = g_malloc0(((conf->max_virtserial_ports + 31) / 32)
+ * sizeof(vser->ports_map[0]));
+ /*
+ * Reserve location 0 for a console port for backward compat
+ * (old kernel, new qemu)
+ */
+ mark_port_added(vser, 0);
+
+ vser->vdev.get_features = get_features;
+ vser->vdev.get_config = get_config;
+ vser->vdev.set_config = set_config;
+ vser->vdev.set_status = set_status;
+ vser->vdev.reset = vser_reset;
+
+ vser->qdev = dev;
+
+ vser->post_load = NULL;
+
+ /*
+ * Register for the savevm section with the virtio-console name
+ * to preserve backward compat
+ */
+ register_savevm(dev, "virtio-console", -1, 3, virtio_serial_save,
+ virtio_serial_load, vser);
+
+ return vdev;
+}
+
+void virtio_serial_exit(VirtIODevice *vdev)
+{
+ VirtIOSerial *vser = DO_UPCAST(VirtIOSerial, vdev, vdev);
+
+ unregister_savevm(vser->qdev, "virtio-console", vser);
+
+ g_free(vser->ivqs);
+ g_free(vser->ovqs);
+ g_free(vser->ports_map);
+ if (vser->post_load) {
+ g_free(vser->post_load->connected);
+ qemu_del_timer(vser->post_load->timer);
+ qemu_free_timer(vser->post_load->timer);
+ g_free(vser->post_load);
+ }
+ virtio_cleanup(vdev);
+}
+
+static void virtio_serial_port_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *k = DEVICE_CLASS(klass);
+ k->init = virtser_port_qdev_init;
+ k->bus_type = TYPE_VIRTIO_SERIAL_BUS;
+ k->exit = virtser_port_qdev_exit;
+ k->unplug = qdev_simple_unplug_cb;
+ k->props = virtser_props;
+}
+
+static const TypeInfo virtio_serial_port_type_info = {
+ .name = TYPE_VIRTIO_SERIAL_PORT,
+ .parent = TYPE_DEVICE,
+ .instance_size = sizeof(VirtIOSerialPort),
+ .abstract = true,
+ .class_size = sizeof(VirtIOSerialPortClass),
+ .class_init = virtio_serial_port_class_init,
+};
+
+static void virtio_serial_register_types(void)
+{
+ type_register_static(&virtser_bus_info);
+ type_register_static(&virtio_serial_port_type_info);
+}
+
+type_init(virtio_serial_register_types)
diff --git a/hw/dataplane/Makefile.objs b/hw/dataplane/Makefile.objs
deleted file mode 100644
index 701111ccb9..0000000000
--- a/hw/dataplane/Makefile.objs
+++ /dev/null
@@ -1 +0,0 @@
-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += hostmem.o vring.o ioq.o virtio-blk.o
diff --git a/hw/dataplane/hostmem.c b/hw/dataplane/hostmem.c
deleted file mode 100644
index 37292ffd00..0000000000
--- a/hw/dataplane/hostmem.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Thread-safe guest to host memory mapping
- *
- * Copyright 2012 Red Hat, Inc. and/or its affiliates
- *
- * Authors:
- * Stefan Hajnoczi
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- * - */ - -#include "exec/address-spaces.h" -#include "hw/virtio/dataplane/hostmem.h" - -static int hostmem_lookup_cmp(const void *phys_, const void *region_) -{ - hwaddr phys = *(const hwaddr *)phys_; - const HostMemRegion *region = region_; - - if (phys < region->guest_addr) { - return -1; - } else if (phys >= region->guest_addr + region->size) { - return 1; - } else { - return 0; - } -} - -/** - * Map guest physical address to host pointer - */ -void *hostmem_lookup(HostMem *hostmem, hwaddr phys, hwaddr len, bool is_write) -{ - HostMemRegion *region; - void *host_addr = NULL; - hwaddr offset_within_region; - - qemu_mutex_lock(&hostmem->current_regions_lock); - region = bsearch(&phys, hostmem->current_regions, - hostmem->num_current_regions, - sizeof(hostmem->current_regions[0]), - hostmem_lookup_cmp); - if (!region) { - goto out; - } - if (is_write && region->readonly) { - goto out; - } - offset_within_region = phys - region->guest_addr; - if (len <= region->size - offset_within_region) { - host_addr = region->host_addr + offset_within_region; - } -out: - qemu_mutex_unlock(&hostmem->current_regions_lock); - - return host_addr; -} - -/** - * Install new regions list - */ -static void hostmem_listener_commit(MemoryListener *listener) -{ - HostMem *hostmem = container_of(listener, HostMem, listener); - - qemu_mutex_lock(&hostmem->current_regions_lock); - g_free(hostmem->current_regions); - hostmem->current_regions = hostmem->new_regions; - hostmem->num_current_regions = hostmem->num_new_regions; - qemu_mutex_unlock(&hostmem->current_regions_lock); - - /* Reset new regions list */ - hostmem->new_regions = NULL; - hostmem->num_new_regions = 0; -} - -/** - * Add a MemoryRegionSection to the new regions list - */ -static void hostmem_append_new_region(HostMem *hostmem, - MemoryRegionSection *section) -{ - void *ram_ptr = memory_region_get_ram_ptr(section->mr); - size_t num = hostmem->num_new_regions; - size_t new_size = (num + 1) * sizeof(hostmem->new_regions[0]); - - hostmem->new_regions = g_realloc(hostmem->new_regions, new_size); - hostmem->new_regions[num] = (HostMemRegion){ - .host_addr = ram_ptr + section->offset_within_region, - .guest_addr = section->offset_within_address_space, - .size = section->size, - .readonly = section->readonly, - }; - hostmem->num_new_regions++; -} - -static void hostmem_listener_append_region(MemoryListener *listener, - MemoryRegionSection *section) -{ - HostMem *hostmem = container_of(listener, HostMem, listener); - - /* Ignore non-RAM regions, we may not be able to map them */ - if (!memory_region_is_ram(section->mr)) { - return; - } - - /* Ignore regions with dirty logging, we cannot mark them dirty */ - if (memory_region_is_logging(section->mr)) { - return; - } - - hostmem_append_new_region(hostmem, section); -} - -/* We don't implement most MemoryListener callbacks, use these nop stubs */ -static void hostmem_listener_dummy(MemoryListener *listener) -{ -} - -static void hostmem_listener_section_dummy(MemoryListener *listener, - MemoryRegionSection *section) -{ -} - -static void hostmem_listener_eventfd_dummy(MemoryListener *listener, - MemoryRegionSection *section, - bool match_data, uint64_t data, - EventNotifier *e) -{ -} - -static void hostmem_listener_coalesced_mmio_dummy(MemoryListener *listener, - MemoryRegionSection *section, - hwaddr addr, hwaddr len) -{ -} - -void hostmem_init(HostMem *hostmem) -{ - memset(hostmem, 0, sizeof(*hostmem)); - - qemu_mutex_init(&hostmem->current_regions_lock); - - hostmem->listener = (MemoryListener){ - .begin = 
hostmem_listener_dummy, - .commit = hostmem_listener_commit, - .region_add = hostmem_listener_append_region, - .region_del = hostmem_listener_section_dummy, - .region_nop = hostmem_listener_append_region, - .log_start = hostmem_listener_section_dummy, - .log_stop = hostmem_listener_section_dummy, - .log_sync = hostmem_listener_section_dummy, - .log_global_start = hostmem_listener_dummy, - .log_global_stop = hostmem_listener_dummy, - .eventfd_add = hostmem_listener_eventfd_dummy, - .eventfd_del = hostmem_listener_eventfd_dummy, - .coalesced_mmio_add = hostmem_listener_coalesced_mmio_dummy, - .coalesced_mmio_del = hostmem_listener_coalesced_mmio_dummy, - .priority = 10, - }; - - memory_listener_register(&hostmem->listener, &address_space_memory); - if (hostmem->num_new_regions > 0) { - hostmem_listener_commit(&hostmem->listener); - } -} - -void hostmem_finalize(HostMem *hostmem) -{ - memory_listener_unregister(&hostmem->listener); - g_free(hostmem->new_regions); - g_free(hostmem->current_regions); - qemu_mutex_destroy(&hostmem->current_regions_lock); -} diff --git a/hw/dataplane/ioq.c b/hw/dataplane/ioq.c deleted file mode 100644 index f709f87ed6..0000000000 --- a/hw/dataplane/ioq.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Linux AIO request queue - * - * Copyright 2012 IBM, Corp. - * Copyright 2012 Red Hat, Inc. and/or its affiliates - * - * Authors: - * Stefan Hajnoczi - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - * - */ - -#include "ioq.h" - -void ioq_init(IOQueue *ioq, int fd, unsigned int max_reqs) -{ - int rc; - - ioq->fd = fd; - ioq->max_reqs = max_reqs; - - memset(&ioq->io_ctx, 0, sizeof ioq->io_ctx); - rc = io_setup(max_reqs, &ioq->io_ctx); - if (rc != 0) { - fprintf(stderr, "ioq io_setup failed %d\n", rc); - exit(1); - } - - rc = event_notifier_init(&ioq->io_notifier, 0); - if (rc != 0) { - fprintf(stderr, "ioq io event notifier creation failed %d\n", rc); - exit(1); - } - - ioq->freelist = g_malloc0(sizeof ioq->freelist[0] * max_reqs); - ioq->freelist_idx = 0; - - ioq->queue = g_malloc0(sizeof ioq->queue[0] * max_reqs); - ioq->queue_idx = 0; -} - -void ioq_cleanup(IOQueue *ioq) -{ - g_free(ioq->freelist); - g_free(ioq->queue); - - event_notifier_cleanup(&ioq->io_notifier); - io_destroy(ioq->io_ctx); -} - -EventNotifier *ioq_get_notifier(IOQueue *ioq) -{ - return &ioq->io_notifier; -} - -struct iocb *ioq_get_iocb(IOQueue *ioq) -{ - /* Underflow cannot happen since ioq is sized for max_reqs */ - assert(ioq->freelist_idx != 0); - - struct iocb *iocb = ioq->freelist[--ioq->freelist_idx]; - ioq->queue[ioq->queue_idx++] = iocb; - return iocb; -} - -void ioq_put_iocb(IOQueue *ioq, struct iocb *iocb) -{ - /* Overflow cannot happen since ioq is sized for max_reqs */ - assert(ioq->freelist_idx != ioq->max_reqs); - - ioq->freelist[ioq->freelist_idx++] = iocb; -} - -struct iocb *ioq_rdwr(IOQueue *ioq, bool read, struct iovec *iov, - unsigned int count, long long offset) -{ - struct iocb *iocb = ioq_get_iocb(ioq); - - if (read) { - io_prep_preadv(iocb, ioq->fd, iov, count, offset); - } else { - io_prep_pwritev(iocb, ioq->fd, iov, count, offset); - } - io_set_eventfd(iocb, event_notifier_get_fd(&ioq->io_notifier)); - return iocb; -} - -int ioq_submit(IOQueue *ioq) -{ - int rc = io_submit(ioq->io_ctx, ioq->queue_idx, ioq->queue); - ioq->queue_idx = 0; /* reset */ - return rc; -} - -int ioq_run_completion(IOQueue *ioq, IOQueueCompletion *completion, - void *opaque) -{ - struct io_event events[ioq->max_reqs]; 
- int nevents, i;
-
- do {
- nevents = io_getevents(ioq->io_ctx, 0, ioq->max_reqs, events, NULL);
- } while (nevents < 0 && errno == EINTR);
- if (nevents < 0) {
- return nevents;
- }
-
- for (i = 0; i < nevents; i++) {
- ssize_t ret = ((uint64_t)events[i].res2 << 32) | events[i].res;
-
- completion(events[i].obj, ret, opaque);
- ioq_put_iocb(ioq, events[i].obj);
- }
- return nevents;
-}
diff --git a/hw/dataplane/ioq.h b/hw/dataplane/ioq.h
deleted file mode 100644
index b49b5de7f4..0000000000
--- a/hw/dataplane/ioq.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Linux AIO request queue
- *
- * Copyright 2012 IBM, Corp.
- * Copyright 2012 Red Hat, Inc. and/or its affiliates
- *
- * Authors:
- * Stefan Hajnoczi
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#ifndef IOQ_H
-#define IOQ_H
-
-#include <libaio.h>
-#include "qemu/event_notifier.h"
-
-typedef struct {
- int fd; /* file descriptor */
- unsigned int max_reqs; /* max length of freelist and queue */
-
- io_context_t io_ctx; /* Linux AIO context */
- EventNotifier io_notifier; /* Linux AIO eventfd */
-
- /* Requests can complete in any order so a free list is necessary to manage
- * available iocbs.
- */
- struct iocb **freelist; /* free iocbs */
- unsigned int freelist_idx;
-
- /* Multiple requests are queued up before submitting them all in one go */
- struct iocb **queue; /* queued iocbs */
- unsigned int queue_idx;
-} IOQueue;
-
-void ioq_init(IOQueue *ioq, int fd, unsigned int max_reqs);
-void ioq_cleanup(IOQueue *ioq);
-EventNotifier *ioq_get_notifier(IOQueue *ioq);
-struct iocb *ioq_get_iocb(IOQueue *ioq);
-void ioq_put_iocb(IOQueue *ioq, struct iocb *iocb);
-struct iocb *ioq_rdwr(IOQueue *ioq, bool read, struct iovec *iov,
- unsigned int count, long long offset);
-int ioq_submit(IOQueue *ioq);
-
-static inline unsigned int ioq_num_queued(IOQueue *ioq)
-{
- return ioq->queue_idx;
-}
-
-typedef void IOQueueCompletion(struct iocb *iocb, ssize_t ret, void *opaque);
-int ioq_run_completion(IOQueue *ioq, IOQueueCompletion *completion,
- void *opaque);
-
-#endif /* IOQ_H */
diff --git a/hw/dataplane/virtio-blk.c b/hw/dataplane/virtio-blk.c
deleted file mode 100644
index 5baef2391a..0000000000
--- a/hw/dataplane/virtio-blk.c
+++ /dev/null
@@ -1,540 +0,0 @@
-/*
- * Dedicated thread for virtio-blk I/O processing
- *
- * Copyright 2012 IBM, Corp.
- * Copyright 2012 Red Hat, Inc. and/or its affiliates
- *
- * Authors:
- * Stefan Hajnoczi
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- * - */ - -#include "trace.h" -#include "qemu/iov.h" -#include "qemu/thread.h" -#include "qemu/error-report.h" -#include "hw/virtio/dataplane/vring.h" -#include "ioq.h" -#include "migration/migration.h" -#include "block/block.h" -#include "hw/virtio/virtio-blk.h" -#include "virtio-blk.h" -#include "block/aio.h" - -enum { - SEG_MAX = 126, /* maximum number of I/O segments */ - VRING_MAX = SEG_MAX + 2, /* maximum number of vring descriptors */ - REQ_MAX = VRING_MAX, /* maximum number of requests in the vring, - * is VRING_MAX / 2 with traditional and - * VRING_MAX with indirect descriptors */ -}; - -typedef struct { - struct iocb iocb; /* Linux AIO control block */ - QEMUIOVector *inhdr; /* iovecs for virtio_blk_inhdr */ - unsigned int head; /* vring descriptor index */ - struct iovec *bounce_iov; /* used if guest buffers are unaligned */ - QEMUIOVector *read_qiov; /* for read completion /w bounce buffer */ -} VirtIOBlockRequest; - -struct VirtIOBlockDataPlane { - bool started; - bool stopping; - QEMUBH *start_bh; - QemuThread thread; - - VirtIOBlkConf *blk; - int fd; /* image file descriptor */ - - VirtIODevice *vdev; - Vring vring; /* virtqueue vring */ - EventNotifier *guest_notifier; /* irq */ - - /* Note that these EventNotifiers are assigned by value. This is - * fine as long as you do not call event_notifier_cleanup on them - * (because you don't own the file descriptor or handle; you just - * use it). - */ - AioContext *ctx; - EventNotifier io_notifier; /* Linux AIO completion */ - EventNotifier host_notifier; /* doorbell */ - - IOQueue ioqueue; /* Linux AIO queue (should really be per - dataplane thread) */ - VirtIOBlockRequest requests[REQ_MAX]; /* pool of requests, managed by the - queue */ - - unsigned int num_reqs; - - Error *migration_blocker; -}; - -/* Raise an interrupt to signal guest, if necessary */ -static void notify_guest(VirtIOBlockDataPlane *s) -{ - if (!vring_should_notify(s->vdev, &s->vring)) { - return; - } - - event_notifier_set(s->guest_notifier); -} - -static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque) -{ - VirtIOBlockDataPlane *s = opaque; - VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb); - struct virtio_blk_inhdr hdr; - int len; - - if (likely(ret >= 0)) { - hdr.status = VIRTIO_BLK_S_OK; - len = ret; - } else { - hdr.status = VIRTIO_BLK_S_IOERR; - len = 0; - } - - trace_virtio_blk_data_plane_complete_request(s, req->head, ret); - - if (req->read_qiov) { - assert(req->bounce_iov); - qemu_iovec_from_buf(req->read_qiov, 0, req->bounce_iov->iov_base, len); - qemu_iovec_destroy(req->read_qiov); - g_slice_free(QEMUIOVector, req->read_qiov); - } - - if (req->bounce_iov) { - qemu_vfree(req->bounce_iov->iov_base); - g_slice_free(struct iovec, req->bounce_iov); - } - - qemu_iovec_from_buf(req->inhdr, 0, &hdr, sizeof(hdr)); - qemu_iovec_destroy(req->inhdr); - g_slice_free(QEMUIOVector, req->inhdr); - - /* According to the virtio specification len should be the number of bytes - * written to, but for virtio-blk it seems to be the number of bytes - * transferred plus the status bytes. 
- */ - vring_push(&s->vring, req->head, len + sizeof(hdr)); - - s->num_reqs--; -} - -static void complete_request_early(VirtIOBlockDataPlane *s, unsigned int head, - QEMUIOVector *inhdr, unsigned char status) -{ - struct virtio_blk_inhdr hdr = { - .status = status, - }; - - qemu_iovec_from_buf(inhdr, 0, &hdr, sizeof(hdr)); - qemu_iovec_destroy(inhdr); - g_slice_free(QEMUIOVector, inhdr); - - vring_push(&s->vring, head, sizeof(hdr)); - notify_guest(s); -} - -/* Get disk serial number */ -static void do_get_id_cmd(VirtIOBlockDataPlane *s, - struct iovec *iov, unsigned int iov_cnt, - unsigned int head, QEMUIOVector *inhdr) -{ - char id[VIRTIO_BLK_ID_BYTES]; - - /* Serial number not NUL-terminated when shorter than buffer */ - strncpy(id, s->blk->serial ? s->blk->serial : "", sizeof(id)); - iov_from_buf(iov, iov_cnt, 0, id, sizeof(id)); - complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK); -} - -static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read, - struct iovec *iov, unsigned int iov_cnt, - long long offset, unsigned int head, - QEMUIOVector *inhdr) -{ - struct iocb *iocb; - QEMUIOVector qiov; - struct iovec *bounce_iov = NULL; - QEMUIOVector *read_qiov = NULL; - - qemu_iovec_init_external(&qiov, iov, iov_cnt); - if (!bdrv_qiov_is_aligned(s->blk->conf.bs, &qiov)) { - void *bounce_buffer = qemu_blockalign(s->blk->conf.bs, qiov.size); - - if (read) { - /* Need to copy back from bounce buffer on completion */ - read_qiov = g_slice_new(QEMUIOVector); - qemu_iovec_init(read_qiov, iov_cnt); - qemu_iovec_concat_iov(read_qiov, iov, iov_cnt, 0, qiov.size); - } else { - qemu_iovec_to_buf(&qiov, 0, bounce_buffer, qiov.size); - } - - /* Redirect I/O to aligned bounce buffer */ - bounce_iov = g_slice_new(struct iovec); - bounce_iov->iov_base = bounce_buffer; - bounce_iov->iov_len = qiov.size; - iov = bounce_iov; - iov_cnt = 1; - } - - iocb = ioq_rdwr(&s->ioqueue, read, iov, iov_cnt, offset); - - /* Fill in virtio block metadata needed for completion */ - VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb); - req->head = head; - req->inhdr = inhdr; - req->bounce_iov = bounce_iov; - req->read_qiov = read_qiov; - return 0; -} - -static int process_request(IOQueue *ioq, struct iovec iov[], - unsigned int out_num, unsigned int in_num, - unsigned int head) -{ - VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane, ioqueue); - struct iovec *in_iov = &iov[out_num]; - struct virtio_blk_outhdr outhdr; - QEMUIOVector *inhdr; - size_t in_size; - - /* Copy in outhdr */ - if (unlikely(iov_to_buf(iov, out_num, 0, &outhdr, - sizeof(outhdr)) != sizeof(outhdr))) { - error_report("virtio-blk request outhdr too short"); - return -EFAULT; - } - iov_discard_front(&iov, &out_num, sizeof(outhdr)); - - /* Grab inhdr for later */ - in_size = iov_size(in_iov, in_num); - if (in_size < sizeof(struct virtio_blk_inhdr)) { - error_report("virtio_blk request inhdr too short"); - return -EFAULT; - } - inhdr = g_slice_new(QEMUIOVector); - qemu_iovec_init(inhdr, 1); - qemu_iovec_concat_iov(inhdr, in_iov, in_num, - in_size - sizeof(struct virtio_blk_inhdr), - sizeof(struct virtio_blk_inhdr)); - iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr)); - - /* TODO Linux sets the barrier bit even when not advertised! 
*/ - outhdr.type &= ~VIRTIO_BLK_T_BARRIER; - - switch (outhdr.type) { - case VIRTIO_BLK_T_IN: - do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, head, inhdr); - return 0; - - case VIRTIO_BLK_T_OUT: - do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, head, inhdr); - return 0; - - case VIRTIO_BLK_T_SCSI_CMD: - /* TODO support SCSI commands */ - complete_request_early(s, head, inhdr, VIRTIO_BLK_S_UNSUPP); - return 0; - - case VIRTIO_BLK_T_FLUSH: - /* TODO fdsync not supported by Linux AIO, do it synchronously here! */ - if (qemu_fdatasync(s->fd) < 0) { - complete_request_early(s, head, inhdr, VIRTIO_BLK_S_IOERR); - } else { - complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK); - } - return 0; - - case VIRTIO_BLK_T_GET_ID: - do_get_id_cmd(s, in_iov, in_num, head, inhdr); - return 0; - - default: - error_report("virtio-blk unsupported request type %#x", outhdr.type); - qemu_iovec_destroy(inhdr); - g_slice_free(QEMUIOVector, inhdr); - return -EFAULT; - } -} - -static int flush_true(EventNotifier *e) -{ - return true; -} - -static void handle_notify(EventNotifier *e) -{ - VirtIOBlockDataPlane *s = container_of(e, VirtIOBlockDataPlane, - host_notifier); - - /* There is one array of iovecs into which all new requests are extracted - * from the vring. Requests are read from the vring and the translated - * descriptors are written to the iovecs array. The iovecs do not have to - * persist across handle_notify() calls because the kernel copies the - * iovecs on io_submit(). - * - * Handling io_submit() EAGAIN may require storing the requests across - * handle_notify() calls until the kernel has sufficient resources to - * accept more I/O. This is not implemented yet. - */ - struct iovec iovec[VRING_MAX]; - struct iovec *end = &iovec[VRING_MAX]; - struct iovec *iov = iovec; - - /* When a request is read from the vring, the index of the first descriptor - * (aka head) is returned so that the completed request can be pushed onto - * the vring later. - * - * The number of hypervisor read-only iovecs is out_num. The number of - * hypervisor write-only iovecs is in_num. - */ - int head; - unsigned int out_num = 0, in_num = 0; - unsigned int num_queued; - - event_notifier_test_and_clear(&s->host_notifier); - for (;;) { - /* Disable guest->host notifies to avoid unnecessary vmexits */ - vring_disable_notification(s->vdev, &s->vring); - - for (;;) { - head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num); - if (head < 0) { - break; /* no more requests */ - } - - trace_virtio_blk_data_plane_process_request(s, out_num, in_num, - head); - - if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) { - vring_set_broken(&s->vring); - break; - } - iov += out_num + in_num; - } - - if (likely(head == -EAGAIN)) { /* vring emptied */ - /* Re-enable guest->host notifies and stop processing the vring. - * But if the guest has snuck in more descriptors, keep processing. - */ - if (vring_enable_notification(s->vdev, &s->vring)) { - break; - } - } else { /* head == -ENOBUFS or fatal error, iovecs[] is depleted */ - /* Since there are no iovecs[] left, stop processing for now. Do - * not re-enable guest->host notifies since the I/O completion - * handler knows to check for more vring descriptors anyway. 
- */ - break; - } - } - - num_queued = ioq_num_queued(&s->ioqueue); - if (num_queued > 0) { - s->num_reqs += num_queued; - - int rc = ioq_submit(&s->ioqueue); - if (unlikely(rc < 0)) { - fprintf(stderr, "ioq_submit failed %d\n", rc); - exit(1); - } - } -} - -static int flush_io(EventNotifier *e) -{ - VirtIOBlockDataPlane *s = container_of(e, VirtIOBlockDataPlane, - io_notifier); - - return s->num_reqs > 0; -} - -static void handle_io(EventNotifier *e) -{ - VirtIOBlockDataPlane *s = container_of(e, VirtIOBlockDataPlane, - io_notifier); - - event_notifier_test_and_clear(&s->io_notifier); - if (ioq_run_completion(&s->ioqueue, complete_request, s) > 0) { - notify_guest(s); - } - - /* If there were more requests than iovecs, the vring will not be empty yet - * so check again. There should now be enough resources to process more - * requests. - */ - if (unlikely(vring_more_avail(&s->vring))) { - handle_notify(&s->host_notifier); - } -} - -static void *data_plane_thread(void *opaque) -{ - VirtIOBlockDataPlane *s = opaque; - - do { - aio_poll(s->ctx, true); - } while (!s->stopping || s->num_reqs > 0); - return NULL; -} - -static void start_data_plane_bh(void *opaque) -{ - VirtIOBlockDataPlane *s = opaque; - - qemu_bh_delete(s->start_bh); - s->start_bh = NULL; - qemu_thread_create(&s->thread, data_plane_thread, - s, QEMU_THREAD_JOINABLE); -} - -bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk, - VirtIOBlockDataPlane **dataplane) -{ - VirtIOBlockDataPlane *s; - int fd; - - *dataplane = NULL; - - if (!blk->data_plane) { - return true; - } - - if (blk->scsi) { - error_report("device is incompatible with x-data-plane, use scsi=off"); - return false; - } - - if (blk->config_wce) { - error_report("device is incompatible with x-data-plane, " - "use config-wce=off"); - return false; - } - - fd = raw_get_aio_fd(blk->conf.bs); - if (fd < 0) { - error_report("drive is incompatible with x-data-plane, " - "use format=raw,cache=none,aio=native"); - return false; - } - - s = g_new0(VirtIOBlockDataPlane, 1); - s->vdev = vdev; - s->fd = fd; - s->blk = blk; - - /* Prevent block operations that conflict with data plane thread */ - bdrv_set_in_use(blk->conf.bs, 1); - - error_setg(&s->migration_blocker, - "x-data-plane does not support migration"); - migrate_add_blocker(s->migration_blocker); - - *dataplane = s; - return true; -} - -void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s) -{ - if (!s) { - return; - } - - virtio_blk_data_plane_stop(s); - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); - bdrv_set_in_use(s->blk->conf.bs, 0); - g_free(s); -} - -void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s) -{ - VirtQueue *vq; - int i; - - if (s->started) { - return; - } - - vq = virtio_get_queue(s->vdev, 0); - if (!vring_setup(&s->vring, s->vdev, 0)) { - return; - } - - s->ctx = aio_context_new(); - - /* Set up guest notifier (irq) */ - if (s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque, 1, - true) != 0) { - fprintf(stderr, "virtio-blk failed to set guest notifier, " - "ensure -enable-kvm is set\n"); - exit(1); - } - s->guest_notifier = virtio_queue_get_guest_notifier(vq); - - /* Set up virtqueue notify */ - if (s->vdev->binding->set_host_notifier(s->vdev->binding_opaque, - 0, true) != 0) { - fprintf(stderr, "virtio-blk failed to set host notifier\n"); - exit(1); - } - s->host_notifier = *virtio_queue_get_host_notifier(vq); - aio_set_event_notifier(s->ctx, &s->host_notifier, handle_notify, flush_true); - - /* Set up ioqueue */ - 
ioq_init(&s->ioqueue, s->fd, REQ_MAX); - for (i = 0; i < ARRAY_SIZE(s->requests); i++) { - ioq_put_iocb(&s->ioqueue, &s->requests[i].iocb); - } - s->io_notifier = *ioq_get_notifier(&s->ioqueue); - aio_set_event_notifier(s->ctx, &s->io_notifier, handle_io, flush_io); - - s->started = true; - trace_virtio_blk_data_plane_start(s); - - /* Kick right away to begin processing requests already in vring */ - event_notifier_set(virtio_queue_get_host_notifier(vq)); - - /* Spawn thread in BH so it inherits iothread cpusets */ - s->start_bh = qemu_bh_new(start_data_plane_bh, s); - qemu_bh_schedule(s->start_bh); -} - -void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s) -{ - if (!s->started || s->stopping) { - return; - } - s->stopping = true; - trace_virtio_blk_data_plane_stop(s); - - /* Stop thread or cancel pending thread creation BH */ - if (s->start_bh) { - qemu_bh_delete(s->start_bh); - s->start_bh = NULL; - } else { - aio_notify(s->ctx); - qemu_thread_join(&s->thread); - } - - aio_set_event_notifier(s->ctx, &s->io_notifier, NULL, NULL); - ioq_cleanup(&s->ioqueue); - - aio_set_event_notifier(s->ctx, &s->host_notifier, NULL, NULL); - s->vdev->binding->set_host_notifier(s->vdev->binding_opaque, 0, false); - - aio_context_unref(s->ctx); - - /* Clean up guest notifier (irq) */ - s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque, 1, false); - - vring_teardown(&s->vring); - s->started = false; - s->stopping = false; -} diff --git a/hw/dataplane/virtio-blk.h b/hw/dataplane/virtio-blk.h deleted file mode 100644 index c90e99f48f..0000000000 --- a/hw/dataplane/virtio-blk.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Dedicated thread for virtio-blk I/O processing - * - * Copyright 2012 IBM, Corp. - * Copyright 2012 Red Hat, Inc. and/or its affiliates - * - * Authors: - * Stefan Hajnoczi - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - * - */ - -#ifndef HW_DATAPLANE_VIRTIO_BLK_H -#define HW_DATAPLANE_VIRTIO_BLK_H - -#include "hw/virtio/virtio.h" - -typedef struct VirtIOBlockDataPlane VirtIOBlockDataPlane; - -bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk, - VirtIOBlockDataPlane **dataplane); -void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s); -void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s); -void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s); -void virtio_blk_data_plane_drain(VirtIOBlockDataPlane *s); - -#endif /* HW_DATAPLANE_VIRTIO_BLK_H */ diff --git a/hw/dataplane/vring.c b/hw/dataplane/vring.c deleted file mode 100644 index e0d6e83625..0000000000 --- a/hw/dataplane/vring.c +++ /dev/null @@ -1,363 +0,0 @@ -/* Copyright 2012 Red Hat, Inc. - * Copyright IBM, Corp. 2012 - * - * Based on Linux 2.6.39 vhost code: - * Copyright (C) 2009 Red Hat, Inc. - * Copyright (C) 2006 Rusty Russell IBM Corporation - * - * Author: Michael S. Tsirkin - * Stefan Hajnoczi - * - * Inspiration, some code, and most witty comments come from - * Documentation/virtual/lguest/lguest.c, by Rusty Russell - * - * This work is licensed under the terms of the GNU GPL, version 2. 
- */ - -#include "trace.h" -#include "hw/virtio/dataplane/vring.h" -#include "qemu/error-report.h" - -/* Map the guest's vring to host memory */ -bool vring_setup(Vring *vring, VirtIODevice *vdev, int n) -{ - hwaddr vring_addr = virtio_queue_get_ring_addr(vdev, n); - hwaddr vring_size = virtio_queue_get_ring_size(vdev, n); - void *vring_ptr; - - vring->broken = false; - - hostmem_init(&vring->hostmem); - vring_ptr = hostmem_lookup(&vring->hostmem, vring_addr, vring_size, true); - if (!vring_ptr) { - error_report("Failed to map vring " - "addr %#" HWADDR_PRIx " size %" HWADDR_PRIu, - vring_addr, vring_size); - vring->broken = true; - return false; - } - - vring_init(&vring->vr, virtio_queue_get_num(vdev, n), vring_ptr, 4096); - - vring->last_avail_idx = 0; - vring->last_used_idx = 0; - vring->signalled_used = 0; - vring->signalled_used_valid = false; - - trace_vring_setup(virtio_queue_get_ring_addr(vdev, n), - vring->vr.desc, vring->vr.avail, vring->vr.used); - return true; -} - -void vring_teardown(Vring *vring) -{ - hostmem_finalize(&vring->hostmem); -} - -/* Disable guest->host notifies */ -void vring_disable_notification(VirtIODevice *vdev, Vring *vring) -{ - if (!(vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX))) { - vring->vr.used->flags |= VRING_USED_F_NO_NOTIFY; - } -} - -/* Enable guest->host notifies - * - * Return true if the vring is empty, false if there are more requests. - */ -bool vring_enable_notification(VirtIODevice *vdev, Vring *vring) -{ - if (vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) { - vring_avail_event(&vring->vr) = vring->vr.avail->idx; - } else { - vring->vr.used->flags &= ~VRING_USED_F_NO_NOTIFY; - } - smp_mb(); /* ensure update is seen before reading avail_idx */ - return !vring_more_avail(vring); -} - -/* This is stolen from linux/drivers/vhost/vhost.c:vhost_notify() */ -bool vring_should_notify(VirtIODevice *vdev, Vring *vring) -{ - uint16_t old, new; - bool v; - /* Flush out used index updates. This is paired - * with the barrier that the Guest executes when enabling - * interrupts. */ - smp_mb(); - - if ((vdev->guest_features & VIRTIO_F_NOTIFY_ON_EMPTY) && - unlikely(vring->vr.avail->idx == vring->last_avail_idx)) { - return true; - } - - if (!(vdev->guest_features & VIRTIO_RING_F_EVENT_IDX)) { - return !(vring->vr.avail->flags & VRING_AVAIL_F_NO_INTERRUPT); - } - old = vring->signalled_used; - v = vring->signalled_used_valid; - new = vring->signalled_used = vring->last_used_idx; - vring->signalled_used_valid = true; - - if (unlikely(!v)) { - return true; - } - - return vring_need_event(vring_used_event(&vring->vr), new, old); -} - -/* This is stolen from linux/drivers/vhost/vhost.c. */ -static int get_indirect(Vring *vring, - struct iovec iov[], struct iovec *iov_end, - unsigned int *out_num, unsigned int *in_num, - struct vring_desc *indirect) -{ - struct vring_desc desc; - unsigned int i = 0, count, found = 0; - - /* Sanity check */ - if (unlikely(indirect->len % sizeof(desc))) { - error_report("Invalid length in indirect descriptor: " - "len %#x not multiple of %#zx", - indirect->len, sizeof(desc)); - vring->broken = true; - return -EFAULT; - } - - count = indirect->len / sizeof(desc); - /* Buffers are chained via a 16 bit next field, so - * we can have at most 2^16 of these. 
*/ - if (unlikely(count > USHRT_MAX + 1)) { - error_report("Indirect buffer length too big: %d", indirect->len); - vring->broken = true; - return -EFAULT; - } - - do { - struct vring_desc *desc_ptr; - - /* Translate indirect descriptor */ - desc_ptr = hostmem_lookup(&vring->hostmem, - indirect->addr + found * sizeof(desc), - sizeof(desc), false); - if (!desc_ptr) { - error_report("Failed to map indirect descriptor " - "addr %#" PRIx64 " len %zu", - (uint64_t)indirect->addr + found * sizeof(desc), - sizeof(desc)); - vring->broken = true; - return -EFAULT; - } - desc = *desc_ptr; - - /* Ensure descriptor has been loaded before accessing fields */ - barrier(); /* read_barrier_depends(); */ - - if (unlikely(++found > count)) { - error_report("Loop detected: last one at %u " - "indirect size %u", i, count); - vring->broken = true; - return -EFAULT; - } - - if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) { - error_report("Nested indirect descriptor"); - vring->broken = true; - return -EFAULT; - } - - /* Stop for now if there are not enough iovecs available. */ - if (iov >= iov_end) { - return -ENOBUFS; - } - - iov->iov_base = hostmem_lookup(&vring->hostmem, desc.addr, desc.len, - desc.flags & VRING_DESC_F_WRITE); - if (!iov->iov_base) { - error_report("Failed to map indirect descriptor" - "addr %#" PRIx64 " len %u", - (uint64_t)desc.addr, desc.len); - vring->broken = true; - return -EFAULT; - } - iov->iov_len = desc.len; - iov++; - - /* If this is an input descriptor, increment that count. */ - if (desc.flags & VRING_DESC_F_WRITE) { - *in_num += 1; - } else { - /* If it's an output descriptor, they're all supposed - * to come before any input descriptors. */ - if (unlikely(*in_num)) { - error_report("Indirect descriptor " - "has out after in: idx %u", i); - vring->broken = true; - return -EFAULT; - } - *out_num += 1; - } - i = desc.next; - } while (desc.flags & VRING_DESC_F_NEXT); - return 0; -} - -/* This looks in the virtqueue and for the first available buffer, and converts - * it to an iovec for convenient access. Since descriptors consist of some - * number of output then some number of input descriptors, it's actually two - * iovecs, but we pack them into one and note how many of each there were. - * - * This function returns the descriptor number found, or vq->num (which is - * never a valid descriptor number) if none was found. A negative code is - * returned on error. - * - * Stolen from linux/drivers/vhost/vhost.c. - */ -int vring_pop(VirtIODevice *vdev, Vring *vring, - struct iovec iov[], struct iovec *iov_end, - unsigned int *out_num, unsigned int *in_num) -{ - struct vring_desc desc; - unsigned int i, head, found = 0, num = vring->vr.num; - uint16_t avail_idx, last_avail_idx; - - /* If there was a fatal error then refuse operation */ - if (vring->broken) { - return -EFAULT; - } - - /* Check it isn't doing very strange things with descriptor numbers. */ - last_avail_idx = vring->last_avail_idx; - avail_idx = vring->vr.avail->idx; - barrier(); /* load indices now and not again later */ - - if (unlikely((uint16_t)(avail_idx - last_avail_idx) > num)) { - error_report("Guest moved used index from %u to %u", - last_avail_idx, avail_idx); - vring->broken = true; - return -EFAULT; - } - - /* If there's nothing new since last we looked. */ - if (avail_idx == last_avail_idx) { - return -EAGAIN; - } - - /* Only get avail ring entries after they have been exposed by guest. */ - smp_rmb(); - - /* Grab the next descriptor number they're advertising, and increment - * the index we've seen. 
*/ - head = vring->vr.avail->ring[last_avail_idx % num]; - - /* If their number is silly, that's an error. */ - if (unlikely(head >= num)) { - error_report("Guest says index %u > %u is available", head, num); - vring->broken = true; - return -EFAULT; - } - - if (vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) { - vring_avail_event(&vring->vr) = vring->vr.avail->idx; - } - - /* When we start there are none of either input nor output. */ - *out_num = *in_num = 0; - - i = head; - do { - if (unlikely(i >= num)) { - error_report("Desc index is %u > %u, head = %u", i, num, head); - vring->broken = true; - return -EFAULT; - } - if (unlikely(++found > num)) { - error_report("Loop detected: last one at %u vq size %u head %u", - i, num, head); - vring->broken = true; - return -EFAULT; - } - desc = vring->vr.desc[i]; - - /* Ensure descriptor is loaded before accessing fields */ - barrier(); - - if (desc.flags & VRING_DESC_F_INDIRECT) { - int ret = get_indirect(vring, iov, iov_end, out_num, in_num, &desc); - if (ret < 0) { - return ret; - } - continue; - } - - /* If there are not enough iovecs left, stop for now. The caller - * should check if there are more descs available once they have dealt - * with the current set. - */ - if (iov >= iov_end) { - return -ENOBUFS; - } - - /* TODO handle non-contiguous memory across region boundaries */ - iov->iov_base = hostmem_lookup(&vring->hostmem, desc.addr, desc.len, - desc.flags & VRING_DESC_F_WRITE); - if (!iov->iov_base) { - error_report("Failed to map vring desc addr %#" PRIx64 " len %u", - (uint64_t)desc.addr, desc.len); - vring->broken = true; - return -EFAULT; - } - iov->iov_len = desc.len; - iov++; - - if (desc.flags & VRING_DESC_F_WRITE) { - /* If this is an input descriptor, - * increment that count. */ - *in_num += 1; - } else { - /* If it's an output descriptor, they're all supposed - * to come before any input descriptors. */ - if (unlikely(*in_num)) { - error_report("Descriptor has out after in: idx %d", i); - vring->broken = true; - return -EFAULT; - } - *out_num += 1; - } - i = desc.next; - } while (desc.flags & VRING_DESC_F_NEXT); - - /* On success, increment avail index. */ - vring->last_avail_idx++; - return head; -} - -/* After we've used one of their buffers, we tell them about it. - * - * Stolen from linux/drivers/vhost/vhost.c. - */ -void vring_push(Vring *vring, unsigned int head, int len) -{ - struct vring_used_elem *used; - uint16_t new; - - /* Don't touch vring if a fatal error occurred */ - if (vring->broken) { - return; - } - - /* The virtqueue contains a ring of used buffers. Get a pointer to the - * next entry in that used ring. */ - used = &vring->vr.used->ring[vring->last_used_idx % vring->vr.num]; - used->id = head; - used->len = len; - - /* Make sure buffer is written before we update index. 
 */
- smp_wmb();
-
- new = vring->vr.used->idx = ++vring->last_used_idx;
- if (unlikely((int16_t)(new - vring->signalled_used) < (uint16_t)1)) {
- vring->signalled_used_valid = false;
- }
-}
diff --git a/hw/net/Makefile.objs b/hw/net/Makefile.objs
index ad91293fe4..73217d80ae 100644
--- a/hw/net/Makefile.objs
+++ b/hw/net/Makefile.objs
@@ -20,3 +20,6 @@ common-obj-$(CONFIG_MIPSNET) += mipsnet.o
 common-obj-$(CONFIG_XILINX_AXI) += xilinx_axienet.o
 common-obj-$(CONFIG_CADENCE) += cadence_gem.o
+
+obj-$(CONFIG_VIRTIO) += virtio-net.o
+obj-y += vhost_net.o
diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
new file mode 100644
index 0000000000..8c5384cf76
--- /dev/null
+++ b/hw/net/vhost_net.c
@@ -0,0 +1,328 @@
+/*
+ * vhost-net support
+ *
+ * Copyright Red Hat, Inc. 2010
+ *
+ * Authors:
+ * Michael S. Tsirkin
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "net/net.h"
+#include "net/tap.h"
+
+#include "hw/virtio/virtio-net.h"
+#include "net/vhost_net.h"
+#include "qemu/error-report.h"
+
+#include "config.h"
+
+#ifdef CONFIG_VHOST_NET
+#include <linux/vhost.h>
+#include <sys/socket.h>
+#include <linux/kvm.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/virtio_ring.h>
+#include <netpacket/packet.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/in.h>
+
+#include <stdio.h>
+
+#include "hw/virtio/vhost.h"
+
+struct vhost_net {
+ struct vhost_dev dev;
+ struct vhost_virtqueue vqs[2];
+ int backend;
+ NetClientState *nc;
+};
+
+unsigned vhost_net_get_features(struct vhost_net *net, unsigned features)
+{
+ /* Clear features not supported by host kernel. */
+ if (!(net->dev.features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY))) {
+ features &= ~(1 << VIRTIO_F_NOTIFY_ON_EMPTY);
+ }
+ if (!(net->dev.features & (1 << VIRTIO_RING_F_INDIRECT_DESC))) {
+ features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC);
+ }
+ if (!(net->dev.features & (1 << VIRTIO_RING_F_EVENT_IDX))) {
+ features &= ~(1 << VIRTIO_RING_F_EVENT_IDX);
+ }
+ if (!(net->dev.features & (1 << VIRTIO_NET_F_MRG_RXBUF))) {
+ features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF);
+ }
+ return features;
+}
+
+void vhost_net_ack_features(struct vhost_net *net, unsigned features)
+{
+ net->dev.acked_features = net->dev.backend_features;
+ if (features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) {
+ net->dev.acked_features |= (1 << VIRTIO_F_NOTIFY_ON_EMPTY);
+ }
+ if (features & (1 << VIRTIO_RING_F_INDIRECT_DESC)) {
+ net->dev.acked_features |= (1 << VIRTIO_RING_F_INDIRECT_DESC);
+ }
+ if (features & (1 << VIRTIO_RING_F_EVENT_IDX)) {
+ net->dev.acked_features |= (1 << VIRTIO_RING_F_EVENT_IDX);
+ }
+ if (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) {
+ net->dev.acked_features |= (1 << VIRTIO_NET_F_MRG_RXBUF);
+ }
+}
+
+static int vhost_net_get_fd(NetClientState *backend)
+{
+ switch (backend->info->type) {
+ case NET_CLIENT_OPTIONS_KIND_TAP:
+ return tap_get_fd(backend);
+ default:
+ fprintf(stderr, "vhost-net requires tap backend\n");
+ return -EBADFD;
+ }
+}
+
+struct vhost_net *vhost_net_init(NetClientState *backend, int devfd,
+ bool force)
+{
+ int r;
+ struct vhost_net *net = g_malloc(sizeof *net);
+ if (!backend) {
+ fprintf(stderr, "vhost-net requires backend to be setup\n");
+ goto fail;
+ }
+ r = vhost_net_get_fd(backend);
+ if (r < 0) {
+ goto fail;
+ }
+ net->nc = backend;
+ net->dev.backend_features = tap_has_vnet_hdr(backend) ?
0 : + (1 << VHOST_NET_F_VIRTIO_NET_HDR); + net->backend = r; + + net->dev.nvqs = 2; + net->dev.vqs = net->vqs; + + r = vhost_dev_init(&net->dev, devfd, "/dev/vhost-net", force); + if (r < 0) { + goto fail; + } + if (!tap_has_vnet_hdr_len(backend, + sizeof(struct virtio_net_hdr_mrg_rxbuf))) { + net->dev.features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF); + } + if (~net->dev.features & net->dev.backend_features) { + fprintf(stderr, "vhost lacks feature mask %" PRIu64 " for backend\n", + (uint64_t)(~net->dev.features & net->dev.backend_features)); + vhost_dev_cleanup(&net->dev); + goto fail; + } + + /* Set sane init value. Override when guest acks. */ + vhost_net_ack_features(net, 0); + return net; +fail: + g_free(net); + return NULL; +} + +bool vhost_net_query(VHostNetState *net, VirtIODevice *dev) +{ + return vhost_dev_query(&net->dev, dev); +} + +static int vhost_net_start_one(struct vhost_net *net, + VirtIODevice *dev, + int vq_index) +{ + struct vhost_vring_file file = { }; + int r; + + if (net->dev.started) { + return 0; + } + + net->dev.nvqs = 2; + net->dev.vqs = net->vqs; + net->dev.vq_index = vq_index; + + r = vhost_dev_enable_notifiers(&net->dev, dev); + if (r < 0) { + goto fail_notifiers; + } + + r = vhost_dev_start(&net->dev, dev); + if (r < 0) { + goto fail_start; + } + + net->nc->info->poll(net->nc, false); + qemu_set_fd_handler(net->backend, NULL, NULL, NULL); + file.fd = net->backend; + for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { + r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); + if (r < 0) { + r = -errno; + goto fail; + } + } + return 0; +fail: + file.fd = -1; + while (file.index-- > 0) { + int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); + assert(r >= 0); + } + net->nc->info->poll(net->nc, true); + vhost_dev_stop(&net->dev, dev); +fail_start: + vhost_dev_disable_notifiers(&net->dev, dev); +fail_notifiers: + return r; +} + +static void vhost_net_stop_one(struct vhost_net *net, + VirtIODevice *dev) +{ + struct vhost_vring_file file = { .fd = -1 }; + + if (!net->dev.started) { + return; + } + + for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { + int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); + assert(r >= 0); + } + net->nc->info->poll(net->nc, true); + vhost_dev_stop(&net->dev, dev); + vhost_dev_disable_notifiers(&net->dev, dev); +} + +int vhost_net_start(VirtIODevice *dev, NetClientState *ncs, + int total_queues) +{ + int r, i = 0; + + if (!dev->binding->set_guest_notifiers) { + error_report("binding does not support guest notifiers"); + r = -ENOSYS; + goto err; + } + + for (i = 0; i < total_queues; i++) { + r = vhost_net_start_one(tap_get_vhost_net(ncs[i].peer), dev, i * 2); + + if (r < 0) { + goto err; + } + } + + r = dev->binding->set_guest_notifiers(dev->binding_opaque, + total_queues * 2, + true); + if (r < 0) { + error_report("Error binding guest notifier: %d", -r); + goto err; + } + + return 0; + +err: + while (--i >= 0) { + vhost_net_stop_one(tap_get_vhost_net(ncs[i].peer), dev); + } + return r; +} + +void vhost_net_stop(VirtIODevice *dev, NetClientState *ncs, + int total_queues) +{ + int i, r; + + r = dev->binding->set_guest_notifiers(dev->binding_opaque, + total_queues * 2, + false); + if (r < 0) { + fprintf(stderr, "vhost guest notifier cleanup failed: %d\n", r); + fflush(stderr); + } + assert(r >= 0); + + for (i = 0; i < total_queues; i++) { + vhost_net_stop_one(tap_get_vhost_net(ncs[i].peer), dev); + } +} + +void vhost_net_cleanup(struct vhost_net *net) +{ + vhost_dev_cleanup(&net->dev); + 
g_free(net); +} + +bool vhost_net_virtqueue_pending(VHostNetState *net, int idx) +{ + return vhost_virtqueue_pending(&net->dev, idx); +} + +void vhost_net_virtqueue_mask(VHostNetState *net, VirtIODevice *dev, + int idx, bool mask) +{ + vhost_virtqueue_mask(&net->dev, dev, idx, mask); +} +#else +struct vhost_net *vhost_net_init(NetClientState *backend, int devfd, + bool force) +{ + error_report("vhost-net support is not compiled in"); + return NULL; +} + +bool vhost_net_query(VHostNetState *net, VirtIODevice *dev) +{ + return false; +} + +int vhost_net_start(VirtIODevice *dev, + NetClientState *ncs, + int total_queues) +{ + return -ENOSYS; +} +void vhost_net_stop(VirtIODevice *dev, + NetClientState *ncs, + int total_queues) +{ +} + +void vhost_net_cleanup(struct vhost_net *net) +{ +} + +unsigned vhost_net_get_features(struct vhost_net *net, unsigned features) +{ + return features; +} +void vhost_net_ack_features(struct vhost_net *net, unsigned features) +{ +} + +bool vhost_net_virtqueue_pending(VHostNetState *net, int idx) +{ + return -ENOSYS; +} + +void vhost_net_virtqueue_mask(VHostNetState *net, VirtIODevice *dev, + int idx, bool mask) +{ +} +#endif diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c new file mode 100644 index 0000000000..bc8fd43b4b --- /dev/null +++ b/hw/net/virtio-net.c @@ -0,0 +1,1370 @@ +/* + * Virtio Network Device + * + * Copyright IBM, Corp. 2007 + * + * Authors: + * Anthony Liguori + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "qemu/iov.h" +#include "hw/virtio/virtio.h" +#include "net/net.h" +#include "net/checksum.h" +#include "net/tap.h" +#include "qemu/error-report.h" +#include "qemu/timer.h" +#include "hw/virtio/virtio-net.h" +#include "net/vhost_net.h" + +#define VIRTIO_NET_VM_VERSION 11 + +#define MAC_TABLE_ENTRIES 64 +#define MAX_VLAN (1 << 12) /* Per 802.1Q definition */ + +/* + * Calculate the number of bytes up to and including the given 'field' of + * 'container'. + */ +#define endof(container, field) \ + (offsetof(container, field) + sizeof(((container *)0)->field)) + +typedef struct VirtIOFeature { + uint32_t flags; + size_t end; +} VirtIOFeature; + +static VirtIOFeature feature_sizes[] = { + {.flags = 1 << VIRTIO_NET_F_MAC, + .end = endof(struct virtio_net_config, mac)}, + {.flags = 1 << VIRTIO_NET_F_STATUS, + .end = endof(struct virtio_net_config, status)}, + {.flags = 1 << VIRTIO_NET_F_MQ, + .end = endof(struct virtio_net_config, max_virtqueue_pairs)}, + {} +}; + +static VirtIONetQueue *virtio_net_get_subqueue(NetClientState *nc) +{ + VirtIONet *n = qemu_get_nic_opaque(nc); + + return &n->vqs[nc->queue_index]; +} + +static int vq2q(int queue_index) +{ + return queue_index / 2; +} + +/* TODO + * - we could suppress RX interrupt if we were so inclined. 
+ */ + +static VirtIONet *to_virtio_net(VirtIODevice *vdev) +{ + return (VirtIONet *)vdev; +} + +static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config) +{ + VirtIONet *n = to_virtio_net(vdev); + struct virtio_net_config netcfg; + + stw_p(&netcfg.status, n->status); + stw_p(&netcfg.max_virtqueue_pairs, n->max_queues); + memcpy(netcfg.mac, n->mac, ETH_ALEN); + memcpy(config, &netcfg, n->config_size); +} + +static void virtio_net_set_config(VirtIODevice *vdev, const uint8_t *config) +{ + VirtIONet *n = to_virtio_net(vdev); + struct virtio_net_config netcfg = {}; + + memcpy(&netcfg, config, n->config_size); + + if (!(n->vdev.guest_features >> VIRTIO_NET_F_CTRL_MAC_ADDR & 1) && + memcmp(netcfg.mac, n->mac, ETH_ALEN)) { + memcpy(n->mac, netcfg.mac, ETH_ALEN); + qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac); + } +} + +static bool virtio_net_started(VirtIONet *n, uint8_t status) +{ + return (status & VIRTIO_CONFIG_S_DRIVER_OK) && + (n->status & VIRTIO_NET_S_LINK_UP) && n->vdev.vm_running; +} + +static void virtio_net_vhost_status(VirtIONet *n, uint8_t status) +{ + NetClientState *nc = qemu_get_queue(n->nic); + int queues = n->multiqueue ? n->max_queues : 1; + + if (!nc->peer) { + return; + } + if (nc->peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) { + return; + } + + if (!tap_get_vhost_net(nc->peer)) { + return; + } + + if (!!n->vhost_started == virtio_net_started(n, status) && + !nc->peer->link_down) { + return; + } + if (!n->vhost_started) { + int r; + if (!vhost_net_query(tap_get_vhost_net(nc->peer), &n->vdev)) { + return; + } + n->vhost_started = 1; + r = vhost_net_start(&n->vdev, n->nic->ncs, queues); + if (r < 0) { + error_report("unable to start vhost net: %d: " + "falling back on userspace virtio", -r); + n->vhost_started = 0; + } + } else { + vhost_net_stop(&n->vdev, n->nic->ncs, queues); + n->vhost_started = 0; + } +} + +static void virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status) +{ + VirtIONet *n = to_virtio_net(vdev); + VirtIONetQueue *q; + int i; + uint8_t queue_status; + + virtio_net_vhost_status(n, status); + + for (i = 0; i < n->max_queues; i++) { + q = &n->vqs[i]; + + if ((!n->multiqueue && i != 0) || i >= n->curr_queues) { + queue_status = 0; + } else { + queue_status = status; + } + + if (!q->tx_waiting) { + continue; + } + + if (virtio_net_started(n, queue_status) && !n->vhost_started) { + if (q->tx_timer) { + qemu_mod_timer(q->tx_timer, + qemu_get_clock_ns(vm_clock) + n->tx_timeout); + } else { + qemu_bh_schedule(q->tx_bh); + } + } else { + if (q->tx_timer) { + qemu_del_timer(q->tx_timer); + } else { + qemu_bh_cancel(q->tx_bh); + } + } + } +} + +static void virtio_net_set_link_status(NetClientState *nc) +{ + VirtIONet *n = qemu_get_nic_opaque(nc); + uint16_t old_status = n->status; + + if (nc->link_down) + n->status &= ~VIRTIO_NET_S_LINK_UP; + else + n->status |= VIRTIO_NET_S_LINK_UP; + + if (n->status != old_status) + virtio_notify_config(&n->vdev); + + virtio_net_set_status(&n->vdev, n->vdev.status); +} + +static void virtio_net_reset(VirtIODevice *vdev) +{ + VirtIONet *n = to_virtio_net(vdev); + + /* Reset back to compatibility mode */ + n->promisc = 1; + n->allmulti = 0; + n->alluni = 0; + n->nomulti = 0; + n->nouni = 0; + n->nobcast = 0; + /* multiqueue is disabled by default */ + n->curr_queues = 1; + + /* Flush any MAC and VLAN filter table state */ + n->mac_table.in_use = 0; + n->mac_table.first_multi = 0; + n->mac_table.multi_overflow = 0; + n->mac_table.uni_overflow = 0; + memset(n->mac_table.macs, 0, MAC_TABLE_ENTRIES * 
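The per-queue status juggling in virtio_net_set_status() above boils down to one predicate: a queue sees the real device status only if it is queue 0 or multiqueue is negotiated, and only while it is inside the currently enabled range. Distilled (a sketch, not the QEMU API):

#include <stdint.h>
#include <stdbool.h>

static uint8_t effective_queue_status(bool multiqueue, int curr_queues,
                                      int i, uint8_t status)
{
    if ((!multiqueue && i != 0) || i >= curr_queues) {
        return 0;   /* disabled queue: behaves as if the driver were not ok */
    }
    return status;
}

Feeding 0 to a disabled queue makes virtio_net_started() false for it, which is what cancels its pending TX timer or bottom half.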
ETH_ALEN); + memcpy(&n->mac[0], &n->nic->conf->macaddr, sizeof(n->mac)); + memset(n->vlans, 0, MAX_VLAN >> 3); +} + +static void peer_test_vnet_hdr(VirtIONet *n) +{ + NetClientState *nc = qemu_get_queue(n->nic); + if (!nc->peer) { + return; + } + + if (nc->peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) { + return; + } + + n->has_vnet_hdr = tap_has_vnet_hdr(nc->peer); +} + +static int peer_has_vnet_hdr(VirtIONet *n) +{ + return n->has_vnet_hdr; +} + +static int peer_has_ufo(VirtIONet *n) +{ + if (!peer_has_vnet_hdr(n)) + return 0; + + n->has_ufo = tap_has_ufo(qemu_get_queue(n->nic)->peer); + + return n->has_ufo; +} + +static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs) +{ + int i; + NetClientState *nc; + + n->mergeable_rx_bufs = mergeable_rx_bufs; + + n->guest_hdr_len = n->mergeable_rx_bufs ? + sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr); + + for (i = 0; i < n->max_queues; i++) { + nc = qemu_get_subqueue(n->nic, i); + + if (peer_has_vnet_hdr(n) && + tap_has_vnet_hdr_len(nc->peer, n->guest_hdr_len)) { + tap_set_vnet_hdr_len(nc->peer, n->guest_hdr_len); + n->host_hdr_len = n->guest_hdr_len; + } + } +} + +static int peer_attach(VirtIONet *n, int index) +{ + NetClientState *nc = qemu_get_subqueue(n->nic, index); + + if (!nc->peer) { + return 0; + } + + if (nc->peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) { + return 0; + } + + return tap_enable(nc->peer); +} + +static int peer_detach(VirtIONet *n, int index) +{ + NetClientState *nc = qemu_get_subqueue(n->nic, index); + + if (!nc->peer) { + return 0; + } + + if (nc->peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) { + return 0; + } + + return tap_disable(nc->peer); +} + +static void virtio_net_set_queues(VirtIONet *n) +{ + int i; + + for (i = 0; i < n->max_queues; i++) { + if (i < n->curr_queues) { + assert(!peer_attach(n, i)); + } else { + assert(!peer_detach(n, i)); + } + } +} + +static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue, int ctrl); + +static uint32_t virtio_net_get_features(VirtIODevice *vdev, uint32_t features) +{ + VirtIONet *n = to_virtio_net(vdev); + NetClientState *nc = qemu_get_queue(n->nic); + + features |= (1 << VIRTIO_NET_F_MAC); + + if (!peer_has_vnet_hdr(n)) { + features &= ~(0x1 << VIRTIO_NET_F_CSUM); + features &= ~(0x1 << VIRTIO_NET_F_HOST_TSO4); + features &= ~(0x1 << VIRTIO_NET_F_HOST_TSO6); + features &= ~(0x1 << VIRTIO_NET_F_HOST_ECN); + + features &= ~(0x1 << VIRTIO_NET_F_GUEST_CSUM); + features &= ~(0x1 << VIRTIO_NET_F_GUEST_TSO4); + features &= ~(0x1 << VIRTIO_NET_F_GUEST_TSO6); + features &= ~(0x1 << VIRTIO_NET_F_GUEST_ECN); + } + + if (!peer_has_vnet_hdr(n) || !peer_has_ufo(n)) { + features &= ~(0x1 << VIRTIO_NET_F_GUEST_UFO); + features &= ~(0x1 << VIRTIO_NET_F_HOST_UFO); + } + + if (!nc->peer || nc->peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) { + return features; + } + if (!tap_get_vhost_net(nc->peer)) { + return features; + } + return vhost_net_get_features(tap_get_vhost_net(nc->peer), features); +} + +static uint32_t virtio_net_bad_features(VirtIODevice *vdev) +{ + uint32_t features = 0; + + /* Linux kernel 2.6.25. 
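The run of individual "features &= ~(1 << ...)" statements in virtio_net_get_features() above can be read as two mask clears: one for everything that requires a vnet-header-capable peer, one for UFO. An equivalent formulation (bit numbers restated from the legacy virtio-net ABI; verify against the header if you reuse this):

#include <stdint.h>

enum {
    F_CSUM = 0,  F_GUEST_CSUM = 1,
    F_GUEST_TSO4 = 7,  F_GUEST_TSO6 = 8,  F_GUEST_ECN = 9,  F_GUEST_UFO = 10,
    F_HOST_TSO4 = 11,  F_HOST_TSO6 = 12,  F_HOST_ECN = 13,  F_HOST_UFO = 14,
};

static uint32_t mask_offloads(uint32_t features, int has_vnet_hdr, int has_ufo)
{
    const uint32_t vnet_bits =
        (1u << F_CSUM)       | (1u << F_GUEST_CSUM) |
        (1u << F_GUEST_TSO4) | (1u << F_GUEST_TSO6) | (1u << F_GUEST_ECN) |
        (1u << F_HOST_TSO4)  | (1u << F_HOST_TSO6)  | (1u << F_HOST_ECN);
    const uint32_t ufo_bits = (1u << F_GUEST_UFO) | (1u << F_HOST_UFO);

    if (!has_vnet_hdr) {
        features &= ~vnet_bits;
    }
    if (!has_vnet_hdr || !has_ufo) {
        features &= ~ufo_bits;
    }
    return features;
}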
It understood MAC (as everyone must), + * but also these: */ + features |= (1 << VIRTIO_NET_F_MAC); + features |= (1 << VIRTIO_NET_F_CSUM); + features |= (1 << VIRTIO_NET_F_HOST_TSO4); + features |= (1 << VIRTIO_NET_F_HOST_TSO6); + features |= (1 << VIRTIO_NET_F_HOST_ECN); + + return features; +} + +static void virtio_net_set_features(VirtIODevice *vdev, uint32_t features) +{ + VirtIONet *n = to_virtio_net(vdev); + int i; + + virtio_net_set_multiqueue(n, !!(features & (1 << VIRTIO_NET_F_MQ)), + !!(features & (1 << VIRTIO_NET_F_CTRL_VQ))); + + virtio_net_set_mrg_rx_bufs(n, !!(features & (1 << VIRTIO_NET_F_MRG_RXBUF))); + + if (n->has_vnet_hdr) { + tap_set_offload(qemu_get_subqueue(n->nic, 0)->peer, + (features >> VIRTIO_NET_F_GUEST_CSUM) & 1, + (features >> VIRTIO_NET_F_GUEST_TSO4) & 1, + (features >> VIRTIO_NET_F_GUEST_TSO6) & 1, + (features >> VIRTIO_NET_F_GUEST_ECN) & 1, + (features >> VIRTIO_NET_F_GUEST_UFO) & 1); + } + + for (i = 0; i < n->max_queues; i++) { + NetClientState *nc = qemu_get_subqueue(n->nic, i); + + if (!nc->peer || nc->peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) { + continue; + } + if (!tap_get_vhost_net(nc->peer)) { + continue; + } + vhost_net_ack_features(tap_get_vhost_net(nc->peer), features); + } +} + +static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd, + struct iovec *iov, unsigned int iov_cnt) +{ + uint8_t on; + size_t s; + + s = iov_to_buf(iov, iov_cnt, 0, &on, sizeof(on)); + if (s != sizeof(on)) { + return VIRTIO_NET_ERR; + } + + if (cmd == VIRTIO_NET_CTRL_RX_PROMISC) { + n->promisc = on; + } else if (cmd == VIRTIO_NET_CTRL_RX_ALLMULTI) { + n->allmulti = on; + } else if (cmd == VIRTIO_NET_CTRL_RX_ALLUNI) { + n->alluni = on; + } else if (cmd == VIRTIO_NET_CTRL_RX_NOMULTI) { + n->nomulti = on; + } else if (cmd == VIRTIO_NET_CTRL_RX_NOUNI) { + n->nouni = on; + } else if (cmd == VIRTIO_NET_CTRL_RX_NOBCAST) { + n->nobcast = on; + } else { + return VIRTIO_NET_ERR; + } + + return VIRTIO_NET_OK; +} + +static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, + struct iovec *iov, unsigned int iov_cnt) +{ + struct virtio_net_ctrl_mac mac_data; + size_t s; + + if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET) { + if (iov_size(iov, iov_cnt) != sizeof(n->mac)) { + return VIRTIO_NET_ERR; + } + s = iov_to_buf(iov, iov_cnt, 0, &n->mac, sizeof(n->mac)); + assert(s == sizeof(n->mac)); + qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac); + return VIRTIO_NET_OK; + } + + if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET) { + return VIRTIO_NET_ERR; + } + + n->mac_table.in_use = 0; + n->mac_table.first_multi = 0; + n->mac_table.uni_overflow = 0; + n->mac_table.multi_overflow = 0; + memset(n->mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN); + + s = iov_to_buf(iov, iov_cnt, 0, &mac_data.entries, + sizeof(mac_data.entries)); + mac_data.entries = ldl_p(&mac_data.entries); + if (s != sizeof(mac_data.entries)) { + return VIRTIO_NET_ERR; + } + iov_discard_front(&iov, &iov_cnt, s); + + if (mac_data.entries * ETH_ALEN > iov_size(iov, iov_cnt)) { + return VIRTIO_NET_ERR; + } + + if (mac_data.entries <= MAC_TABLE_ENTRIES) { + s = iov_to_buf(iov, iov_cnt, 0, n->mac_table.macs, + mac_data.entries * ETH_ALEN); + if (s != mac_data.entries * ETH_ALEN) { + return VIRTIO_NET_ERR; + } + n->mac_table.in_use += mac_data.entries; + } else { + n->mac_table.uni_overflow = 1; + } + + iov_discard_front(&iov, &iov_cnt, mac_data.entries * ETH_ALEN); + + n->mac_table.first_multi = n->mac_table.in_use; + + s = iov_to_buf(iov, iov_cnt, 0, &mac_data.entries, + sizeof(mac_data.entries)); + mac_data.entries = 
ldl_p(&mac_data.entries); + if (s != sizeof(mac_data.entries)) { + return VIRTIO_NET_ERR; + } + + iov_discard_front(&iov, &iov_cnt, s); + + if (mac_data.entries * ETH_ALEN != iov_size(iov, iov_cnt)) { + return VIRTIO_NET_ERR; + } + + if (n->mac_table.in_use + mac_data.entries <= MAC_TABLE_ENTRIES) { + s = iov_to_buf(iov, iov_cnt, 0, n->mac_table.macs, + mac_data.entries * ETH_ALEN); + if (s != mac_data.entries * ETH_ALEN) { + return VIRTIO_NET_ERR; + } + n->mac_table.in_use += mac_data.entries; + } else { + n->mac_table.multi_overflow = 1; + } + + return VIRTIO_NET_OK; +} + +static int virtio_net_handle_vlan_table(VirtIONet *n, uint8_t cmd, + struct iovec *iov, unsigned int iov_cnt) +{ + uint16_t vid; + size_t s; + + s = iov_to_buf(iov, iov_cnt, 0, &vid, sizeof(vid)); + vid = lduw_p(&vid); + if (s != sizeof(vid)) { + return VIRTIO_NET_ERR; + } + + if (vid >= MAX_VLAN) + return VIRTIO_NET_ERR; + + if (cmd == VIRTIO_NET_CTRL_VLAN_ADD) + n->vlans[vid >> 5] |= (1U << (vid & 0x1f)); + else if (cmd == VIRTIO_NET_CTRL_VLAN_DEL) + n->vlans[vid >> 5] &= ~(1U << (vid & 0x1f)); + else + return VIRTIO_NET_ERR; + + return VIRTIO_NET_OK; +} + +static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd, + struct iovec *iov, unsigned int iov_cnt) +{ + struct virtio_net_ctrl_mq mq; + size_t s; + uint16_t queues; + + s = iov_to_buf(iov, iov_cnt, 0, &mq, sizeof(mq)); + if (s != sizeof(mq)) { + return VIRTIO_NET_ERR; + } + + if (cmd != VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET) { + return VIRTIO_NET_ERR; + } + + queues = lduw_p(&mq.virtqueue_pairs); + + if (queues < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN || + queues > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX || + queues > n->max_queues || + !n->multiqueue) { + return VIRTIO_NET_ERR; + } + + n->curr_queues = queues; + /* stop the backend before changing the number of queues to avoid handling a + * disabled queue */ + virtio_net_set_status(&n->vdev, n->vdev.status); + virtio_net_set_queues(n); + + return VIRTIO_NET_OK; +} +static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIONet *n = to_virtio_net(vdev); + struct virtio_net_ctrl_hdr ctrl; + virtio_net_ctrl_ack status = VIRTIO_NET_ERR; + VirtQueueElement elem; + size_t s; + struct iovec *iov; + unsigned int iov_cnt; + + while (virtqueue_pop(vq, &elem)) { + if (iov_size(elem.in_sg, elem.in_num) < sizeof(status) || + iov_size(elem.out_sg, elem.out_num) < sizeof(ctrl)) { + error_report("virtio-net ctrl missing headers"); + exit(1); + } + + iov = elem.out_sg; + iov_cnt = elem.out_num; + s = iov_to_buf(iov, iov_cnt, 0, &ctrl, sizeof(ctrl)); + iov_discard_front(&iov, &iov_cnt, sizeof(ctrl)); + if (s != sizeof(ctrl)) { + status = VIRTIO_NET_ERR; + } else if (ctrl.class == VIRTIO_NET_CTRL_RX) { + status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, iov_cnt); + } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) { + status = virtio_net_handle_mac(n, ctrl.cmd, iov, iov_cnt); + } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) { + status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, iov_cnt); + } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) { + status = virtio_net_handle_mq(n, ctrl.cmd, iov, iov_cnt); + } + + s = iov_from_buf(elem.in_sg, elem.in_num, 0, &status, sizeof(status)); + assert(s == sizeof(status)); + + virtqueue_push(vq, &elem, sizeof(status)); + virtio_notify(vdev, vq); + } +} + +/* RX */ + +static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIONet *n = to_virtio_net(vdev); + int queue_index = vq2q(virtio_get_queue_index(vq)); + + qemu_flush_queued_packets(qemu_get_subqueue(n->nic, 
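The VLAN filter handled above is nothing more than a 4096-bit bitmap (allocated as MAX_VLAN >> 3 bytes elsewhere in this file): vid >> 5 picks one of 128 32-bit words, vid & 0x1f the bit within it. A minimal standalone version:

#include <stdint.h>

#define MAX_VLAN (1 << 12)   /* per 802.1Q, 12-bit VLAN IDs */

static uint32_t vlans[MAX_VLAN >> 5];

static void vlan_add(uint16_t vid)  { vlans[vid >> 5] |=  1u << (vid & 0x1f); }
static void vlan_del(uint16_t vid)  { vlans[vid >> 5] &= ~(1u << (vid & 0x1f)); }
static int  vlan_test(uint16_t vid) { return (vlans[vid >> 5] >> (vid & 0x1f)) & 1; }

receive_filter() later consults the same bitmap with vlan_test()'s expression when it sees an 0x8100 tag in the frame.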
queue_index)); +} + +static int virtio_net_can_receive(NetClientState *nc) +{ + VirtIONet *n = qemu_get_nic_opaque(nc); + VirtIONetQueue *q = virtio_net_get_subqueue(nc); + + if (!n->vdev.vm_running) { + return 0; + } + + if (nc->queue_index >= n->curr_queues) { + return 0; + } + + if (!virtio_queue_ready(q->rx_vq) || + !(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) { + return 0; + } + + return 1; +} + +static int virtio_net_has_buffers(VirtIONetQueue *q, int bufsize) +{ + VirtIONet *n = q->n; + if (virtio_queue_empty(q->rx_vq) || + (n->mergeable_rx_bufs && + !virtqueue_avail_bytes(q->rx_vq, bufsize, 0))) { + virtio_queue_set_notification(q->rx_vq, 1); + + /* To avoid a race condition where the guest has made some buffers + * available after the above check but before notification was + * enabled, check for available buffers again. + */ + if (virtio_queue_empty(q->rx_vq) || + (n->mergeable_rx_bufs && + !virtqueue_avail_bytes(q->rx_vq, bufsize, 0))) { + return 0; + } + } + + virtio_queue_set_notification(q->rx_vq, 0); + return 1; +} + +/* dhclient uses AF_PACKET but doesn't pass auxdata to the kernel so + * it never finds out that the packets don't have valid checksums. This + * causes dhclient to get upset. Fedora's carried a patch for ages to + * fix this with Xen but it hasn't appeared in an upstream release of + * dhclient yet. + * + * To avoid breaking existing guests, we catch udp packets and add + * checksums. This is terrible but it's better than hacking the guest + * kernels. + * + * N.B. if we introduce a zero-copy API, this operation is no longer free so + * we should provide a mechanism to disable it to avoid polluting the host + * cache. + */ +static void work_around_broken_dhclient(struct virtio_net_hdr *hdr, + uint8_t *buf, size_t size) +{ + if ((hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && /* missing csum */ + (size > 27 && size < 1500) && /* normal sized MTU */ + (buf[12] == 0x08 && buf[13] == 0x00) && /* ethertype == IPv4 */ + (buf[23] == 17) && /* ip.protocol == UDP */ + (buf[34] == 0 && buf[35] == 67)) { /* udp.srcport == bootps */ + net_checksum_calculate(buf, size); + hdr->flags &= ~VIRTIO_NET_HDR_F_NEEDS_CSUM; + } +} + +static void receive_header(VirtIONet *n, const struct iovec *iov, int iov_cnt, + const void *buf, size_t size) +{ + if (n->has_vnet_hdr) { + /* FIXME this cast is evil */ + void *wbuf = (void *)buf; + work_around_broken_dhclient(wbuf, wbuf + n->host_hdr_len, + size - n->host_hdr_len); + iov_from_buf(iov, iov_cnt, 0, buf, sizeof(struct virtio_net_hdr)); + } else { + struct virtio_net_hdr hdr = { + .flags = 0, + .gso_type = VIRTIO_NET_HDR_GSO_NONE + }; + iov_from_buf(iov, iov_cnt, 0, &hdr, sizeof hdr); + } +} + +static int receive_filter(VirtIONet *n, const uint8_t *buf, int size) +{ + static const uint8_t bcast[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + static const uint8_t vlan[] = {0x81, 0x00}; + uint8_t *ptr = (uint8_t *)buf; + int i; + + if (n->promisc) + return 1; + + ptr += n->host_hdr_len; + + if (!memcmp(&ptr[12], vlan, sizeof(vlan))) { + int vid = be16_to_cpup((uint16_t *)(ptr + 14)) & 0xfff; + if (!(n->vlans[vid >> 5] & (1U << (vid & 0x1f)))) + return 0; + } + + if (ptr[0] & 1) { // multicast + if (!memcmp(ptr, bcast, sizeof(bcast))) { + return !n->nobcast; + } else if (n->nomulti) { + return 0; + } else if (n->allmulti || n->mac_table.multi_overflow) { + return 1; + } + + for (i = n->mac_table.first_multi; i < n->mac_table.in_use; i++) { + if (!memcmp(ptr, &n->mac_table.macs[i * ETH_ALEN], ETH_ALEN)) { + return 1; + } + } + } else { // 
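The magic byte offsets in work_around_broken_dhclient() above decode an untagged Ethernet/IPv4/UDP frame with a 20-byte IP header: bytes 12-13 are the ethertype, byte 23 the IP protocol, bytes 34-35 the UDP source port. A slightly stricter standalone predicate (it also pins IHL to 5, the assumption those offsets silently make; hypothetical helper, not QEMU code):

#include <stdint.h>
#include <stddef.h>

static int is_untagged_bootp_reply(const uint8_t *buf, size_t size)
{
    return size > 42 &&                            /* eth(14)+ip(20)+udp(8) */
           buf[12] == 0x08 && buf[13] == 0x00 &&   /* ethertype == IPv4 */
           (buf[14] & 0x0f) == 5 &&                /* IHL == 5: no IP options */
           buf[23] == 17 &&                        /* ip.protocol == UDP */
           buf[34] == 0 && buf[35] == 67;          /* udp.srcport == bootps */
}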
unicast + if (n->nouni) { + return 0; + } else if (n->alluni || n->mac_table.uni_overflow) { + return 1; + } else if (!memcmp(ptr, n->mac, ETH_ALEN)) { + return 1; + } + + for (i = 0; i < n->mac_table.first_multi; i++) { + if (!memcmp(ptr, &n->mac_table.macs[i * ETH_ALEN], ETH_ALEN)) { + return 1; + } + } + } + + return 0; +} + +static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t size) +{ + VirtIONet *n = qemu_get_nic_opaque(nc); + VirtIONetQueue *q = virtio_net_get_subqueue(nc); + struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE]; + struct virtio_net_hdr_mrg_rxbuf mhdr; + unsigned mhdr_cnt = 0; + size_t offset, i, guest_offset; + + if (!virtio_net_can_receive(nc)) { + return -1; + } + + /* hdr_len refers to the header we supply to the guest */ + if (!virtio_net_has_buffers(q, size + n->guest_hdr_len - n->host_hdr_len)) { + return 0; + } + + if (!receive_filter(n, buf, size)) + return size; + + offset = i = 0; + + while (offset < size) { + VirtQueueElement elem; + int len, total; + const struct iovec *sg = elem.in_sg; + + total = 0; + + if (virtqueue_pop(q->rx_vq, &elem) == 0) { + if (i == 0) + return -1; + error_report("virtio-net unexpected empty queue: " + "i %zd mergeable %d offset %zd, size %zd, " + "guest hdr len %zd, host hdr len %zd guest features 0x%x", + i, n->mergeable_rx_bufs, offset, size, + n->guest_hdr_len, n->host_hdr_len, n->vdev.guest_features); + exit(1); + } + + if (elem.in_num < 1) { + error_report("virtio-net receive queue contains no in buffers"); + exit(1); + } + + if (i == 0) { + assert(offset == 0); + if (n->mergeable_rx_bufs) { + mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg), + sg, elem.in_num, + offsetof(typeof(mhdr), num_buffers), + sizeof(mhdr.num_buffers)); + } + + receive_header(n, sg, elem.in_num, buf, size); + offset = n->host_hdr_len; + total += n->guest_hdr_len; + guest_offset = n->guest_hdr_len; + } else { + guest_offset = 0; + } + + /* copy in packet. ugh */ + len = iov_from_buf(sg, elem.in_num, guest_offset, + buf + offset, size - offset); + total += len; + offset += len; + /* If buffers can't be merged, at this point we + * must have consumed the complete packet. + * Otherwise, drop it. 
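A subtle point in virtio_net_receive() above: iov_copy() is used not to move bytes but to remember where mhdr.num_buffers lives inside the guest's scattered buffers, so the count can be stored after the loop once the number of merged buffers is known. A from-scratch version of that slicing step (iov_slice is a hypothetical name; QEMU's real helper is iov_copy() from qemu/iov.h):

#include <stddef.h>
#include <stdint.h>
#include <sys/uio.h>

/* Build a sub-iovec 'to' covering bytes [off, off+len) of 'from' without
 * copying any data; returns the number of entries used. */
static unsigned iov_slice(struct iovec *to, unsigned to_max,
                          const struct iovec *from, unsigned cnt,
                          size_t off, size_t len)
{
    unsigned n = 0;

    for (unsigned i = 0; i < cnt && len && n < to_max; i++) {
        if (off >= from[i].iov_len) {
            off -= from[i].iov_len;      /* still skipping the prefix */
            continue;
        }
        size_t take = from[i].iov_len - off;
        if (take > len) {
            take = len;
        }
        to[n].iov_base = (uint8_t *)from[i].iov_base + off;
        to[n].iov_len = take;
        n++;
        len -= take;
        off = 0;
    }
    return n;
}

Writing through the saved sub-iovec afterwards (as the stw_p() + iov_from_buf() pair above does) patches the header field in place inside guest memory.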
*/ + if (!n->mergeable_rx_bufs && offset < size) { +#if 0 + error_report("virtio-net truncated non-mergeable packet: " + "i %zd mergeable %d offset %zd, size %zd, " + "guest hdr len %zd, host hdr len %zd", + i, n->mergeable_rx_bufs, + offset, size, n->guest_hdr_len, n->host_hdr_len); +#endif + return size; + } + + /* signal other side */ + virtqueue_fill(q->rx_vq, &elem, total, i++); + } + + if (mhdr_cnt) { + stw_p(&mhdr.num_buffers, i); + iov_from_buf(mhdr_sg, mhdr_cnt, + 0, + &mhdr.num_buffers, sizeof mhdr.num_buffers); + } + + virtqueue_flush(q->rx_vq, i); + virtio_notify(&n->vdev, q->rx_vq); + + return size; +} + +static int32_t virtio_net_flush_tx(VirtIONetQueue *q); + +static void virtio_net_tx_complete(NetClientState *nc, ssize_t len) +{ + VirtIONet *n = qemu_get_nic_opaque(nc); + VirtIONetQueue *q = virtio_net_get_subqueue(nc); + + virtqueue_push(q->tx_vq, &q->async_tx.elem, 0); + virtio_notify(&n->vdev, q->tx_vq); + + q->async_tx.elem.out_num = q->async_tx.len = 0; + + virtio_queue_set_notification(q->tx_vq, 1); + virtio_net_flush_tx(q); +} + +/* TX */ +static int32_t virtio_net_flush_tx(VirtIONetQueue *q) +{ + VirtIONet *n = q->n; + VirtQueueElement elem; + int32_t num_packets = 0; + int queue_index = vq2q(virtio_get_queue_index(q->tx_vq)); + if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) { + return num_packets; + } + + assert(n->vdev.vm_running); + + if (q->async_tx.elem.out_num) { + virtio_queue_set_notification(q->tx_vq, 0); + return num_packets; + } + + while (virtqueue_pop(q->tx_vq, &elem)) { + ssize_t ret, len; + unsigned int out_num = elem.out_num; + struct iovec *out_sg = &elem.out_sg[0]; + struct iovec sg[VIRTQUEUE_MAX_SIZE]; + + if (out_num < 1) { + error_report("virtio-net header not in first element"); + exit(1); + } + + /* + * If host wants to see the guest header as is, we can + * pass it on unchanged. Otherwise, copy just the parts + * that host is interested in. + */ + assert(n->host_hdr_len <= n->guest_hdr_len); + if (n->host_hdr_len != n->guest_hdr_len) { + unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg), + out_sg, out_num, + 0, n->host_hdr_len); + sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num, + out_sg, out_num, + n->guest_hdr_len, -1); + out_num = sg_num; + out_sg = sg; + } + + len = n->guest_hdr_len; + + ret = qemu_sendv_packet_async(qemu_get_subqueue(n->nic, queue_index), + out_sg, out_num, virtio_net_tx_complete); + if (ret == 0) { + virtio_queue_set_notification(q->tx_vq, 0); + q->async_tx.elem = elem; + q->async_tx.len = len; + return -EBUSY; + } + + len += ret; + + virtqueue_push(q->tx_vq, &elem, 0); + virtio_notify(&n->vdev, q->tx_vq); + + if (++num_packets >= n->tx_burst) { + break; + } + } + return num_packets; +} + +static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIONet *n = to_virtio_net(vdev); + VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))]; + + /* This happens when device was stopped but VCPU wasn't. 
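When the host header is a strict prefix of the guest header, virtio_net_flush_tx() above keeps the first host_hdr_len bytes and splices the payload from guest_hdr_len onward, silently dropping the gap. The flat-buffer equivalent of that iov_copy() pair (a sketch; assumes len >= guest_hdr_len and host_hdr_len <= guest_hdr_len, which the code asserts):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static size_t reshape_tx_hdr(uint8_t *dst, const uint8_t *src, size_t len,
                             size_t host_hdr_len, size_t guest_hdr_len)
{
    size_t payload = len - guest_hdr_len;

    memcpy(dst, src, host_hdr_len);                           /* kept header */
    memcpy(dst + host_hdr_len, src + guest_hdr_len, payload); /* packet body */
    return host_hdr_len + payload;
}

This is what lets a guest using the 12-byte mergeable-rx-buf header interoperate with a backend that only takes the plain 10-byte virtio_net_hdr.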
*/ + if (!n->vdev.vm_running) { + q->tx_waiting = 1; + return; + } + + if (q->tx_waiting) { + virtio_queue_set_notification(vq, 1); + qemu_del_timer(q->tx_timer); + q->tx_waiting = 0; + virtio_net_flush_tx(q); + } else { + qemu_mod_timer(q->tx_timer, + qemu_get_clock_ns(vm_clock) + n->tx_timeout); + q->tx_waiting = 1; + virtio_queue_set_notification(vq, 0); + } +} + +static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIONet *n = to_virtio_net(vdev); + VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))]; + + if (unlikely(q->tx_waiting)) { + return; + } + q->tx_waiting = 1; + /* This happens when device was stopped but VCPU wasn't. */ + if (!n->vdev.vm_running) { + return; + } + virtio_queue_set_notification(vq, 0); + qemu_bh_schedule(q->tx_bh); +} + +static void virtio_net_tx_timer(void *opaque) +{ + VirtIONetQueue *q = opaque; + VirtIONet *n = q->n; + assert(n->vdev.vm_running); + + q->tx_waiting = 0; + + /* Just in case the driver is not ready on more */ + if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) + return; + + virtio_queue_set_notification(q->tx_vq, 1); + virtio_net_flush_tx(q); +} + +static void virtio_net_tx_bh(void *opaque) +{ + VirtIONetQueue *q = opaque; + VirtIONet *n = q->n; + int32_t ret; + + assert(n->vdev.vm_running); + + q->tx_waiting = 0; + + /* Just in case the driver is not ready on more */ + if (unlikely(!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))) + return; + + ret = virtio_net_flush_tx(q); + if (ret == -EBUSY) { + return; /* Notification re-enable handled by tx_complete */ + } + + /* If we flush a full burst of packets, assume there are + * more coming and immediately reschedule */ + if (ret >= n->tx_burst) { + qemu_bh_schedule(q->tx_bh); + q->tx_waiting = 1; + return; + } + + /* If less than a full burst, re-enable notification and flush + * anything that may have come in while we weren't looking. If + * we find something, assume the guest is still active and reschedule */ + virtio_queue_set_notification(q->tx_vq, 1); + if (virtio_net_flush_tx(q) > 0) { + virtio_queue_set_notification(q->tx_vq, 0); + qemu_bh_schedule(q->tx_bh); + q->tx_waiting = 1; + } +} + +static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue, int ctrl) +{ + VirtIODevice *vdev = &n->vdev; + int i, max = multiqueue ? n->max_queues : 1; + + n->multiqueue = multiqueue; + + for (i = 2; i <= n->max_queues * 2 + 1; i++) { + virtio_del_queue(vdev, i); + } + + for (i = 1; i < max; i++) { + n->vqs[i].rx_vq = virtio_add_queue(vdev, 256, virtio_net_handle_rx); + if (n->vqs[i].tx_timer) { + n->vqs[i].tx_vq = + virtio_add_queue(vdev, 256, virtio_net_handle_tx_timer); + n->vqs[i].tx_timer = qemu_new_timer_ns(vm_clock, + virtio_net_tx_timer, + &n->vqs[i]); + } else { + n->vqs[i].tx_vq = + virtio_add_queue(vdev, 256, virtio_net_handle_tx_bh); + n->vqs[i].tx_bh = qemu_bh_new(virtio_net_tx_bh, &n->vqs[i]); + } + + n->vqs[i].tx_waiting = 0; + n->vqs[i].n = n; + } + + if (ctrl) { + n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl); + } + + virtio_net_set_queues(n); +} + +static void virtio_net_save(QEMUFile *f, void *opaque) +{ + int i; + VirtIONet *n = opaque; + + /* At this point, backend must be stopped, otherwise + * it might keep writing to memory. 
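The tail of virtio_net_tx_bh() above is the standard lost-wakeup dance for edge-triggered queues: re-enable notifications first, then poll once more; if work slipped in between the final poll and the enable, disable again and reschedule. Skeleton form (all helpers here are illustrative stubs, not QEMU calls):

#include <stdbool.h>

struct q { int pending; };

static void notify_enable(struct q *vq, bool on) { (void)vq; (void)on; }
static int  poll_work(struct q *vq) { return vq->pending ? vq->pending-- : 0; }
static void reschedule(struct q *vq) { (void)vq; }

static void tx_bottom_half(struct q *vq)
{
    if (poll_work(vq) < 0) {
        return;                  /* backend busy; completion will re-arm us */
    }
    notify_enable(vq, true);     /* opens a race window... */
    if (poll_work(vq) > 0) {     /* ...closed by polling one more time */
        notify_enable(vq, false);
        reschedule(vq);
    }
}

virtio_net_has_buffers() earlier in this file plays exactly the same trick on the RX side, just with "check for buffers" in place of "flush packets".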
*/ + assert(!n->vhost_started); + virtio_save(&n->vdev, f); + + qemu_put_buffer(f, n->mac, ETH_ALEN); + qemu_put_be32(f, n->vqs[0].tx_waiting); + qemu_put_be32(f, n->mergeable_rx_bufs); + qemu_put_be16(f, n->status); + qemu_put_byte(f, n->promisc); + qemu_put_byte(f, n->allmulti); + qemu_put_be32(f, n->mac_table.in_use); + qemu_put_buffer(f, n->mac_table.macs, n->mac_table.in_use * ETH_ALEN); + qemu_put_buffer(f, (uint8_t *)n->vlans, MAX_VLAN >> 3); + qemu_put_be32(f, n->has_vnet_hdr); + qemu_put_byte(f, n->mac_table.multi_overflow); + qemu_put_byte(f, n->mac_table.uni_overflow); + qemu_put_byte(f, n->alluni); + qemu_put_byte(f, n->nomulti); + qemu_put_byte(f, n->nouni); + qemu_put_byte(f, n->nobcast); + qemu_put_byte(f, n->has_ufo); + if (n->max_queues > 1) { + qemu_put_be16(f, n->max_queues); + qemu_put_be16(f, n->curr_queues); + for (i = 1; i < n->curr_queues; i++) { + qemu_put_be32(f, n->vqs[i].tx_waiting); + } + } +} + +static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) +{ + VirtIONet *n = opaque; + int ret, i, link_down; + + if (version_id < 2 || version_id > VIRTIO_NET_VM_VERSION) + return -EINVAL; + + ret = virtio_load(&n->vdev, f); + if (ret) { + return ret; + } + + qemu_get_buffer(f, n->mac, ETH_ALEN); + n->vqs[0].tx_waiting = qemu_get_be32(f); + + virtio_net_set_mrg_rx_bufs(n, qemu_get_be32(f)); + + if (version_id >= 3) + n->status = qemu_get_be16(f); + + if (version_id >= 4) { + if (version_id < 8) { + n->promisc = qemu_get_be32(f); + n->allmulti = qemu_get_be32(f); + } else { + n->promisc = qemu_get_byte(f); + n->allmulti = qemu_get_byte(f); + } + } + + if (version_id >= 5) { + n->mac_table.in_use = qemu_get_be32(f); + /* MAC_TABLE_ENTRIES may be different from the saved image */ + if (n->mac_table.in_use <= MAC_TABLE_ENTRIES) { + qemu_get_buffer(f, n->mac_table.macs, + n->mac_table.in_use * ETH_ALEN); + } else if (n->mac_table.in_use) { + uint8_t *buf = g_malloc0(n->mac_table.in_use); + qemu_get_buffer(f, buf, n->mac_table.in_use * ETH_ALEN); + g_free(buf); + n->mac_table.multi_overflow = n->mac_table.uni_overflow = 1; + n->mac_table.in_use = 0; + } + } + + if (version_id >= 6) + qemu_get_buffer(f, (uint8_t *)n->vlans, MAX_VLAN >> 3); + + if (version_id >= 7) { + if (qemu_get_be32(f) && !peer_has_vnet_hdr(n)) { + error_report("virtio-net: saved image requires vnet_hdr=on"); + return -1; + } + + if (n->has_vnet_hdr) { + tap_set_offload(qemu_get_queue(n->nic)->peer, + (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_CSUM) & 1, + (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_TSO4) & 1, + (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_TSO6) & 1, + (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_ECN) & 1, + (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_UFO) & 1); + } + } + + if (version_id >= 9) { + n->mac_table.multi_overflow = qemu_get_byte(f); + n->mac_table.uni_overflow = qemu_get_byte(f); + } + + if (version_id >= 10) { + n->alluni = qemu_get_byte(f); + n->nomulti = qemu_get_byte(f); + n->nouni = qemu_get_byte(f); + n->nobcast = qemu_get_byte(f); + } + + if (version_id >= 11) { + if (qemu_get_byte(f) && !peer_has_ufo(n)) { + error_report("virtio-net: saved image requires TUN_F_UFO support"); + return -1; + } + } + + if (n->max_queues > 1) { + if (n->max_queues != qemu_get_be16(f)) { + error_report("virtio-net: different max_queues "); + return -1; + } + + n->curr_queues = qemu_get_be16(f); + for (i = 1; i < n->curr_queues; i++) { + n->vqs[i].tx_waiting = qemu_get_be32(f); + } + } + + virtio_net_set_queues(n); + + /* Find the first multicast entry in the saved MAC 
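virtio_net_save()/virtio_net_load() above implement a hand-rolled versioned stream: every bump of VIRTIO_NET_VM_VERSION appends fields at the end, and the loader guards each group with "if (version_id >= N)", so a newer QEMU still accepts streams from older ones. The pattern in miniature (raw fread/fwrite standing in for qemu_get_*/qemu_put_*, which additionally fix the endianness):

#include <stdint.h>
#include <stdio.h>

struct state { uint16_t status; uint8_t promisc; };

static void save_state(FILE *f, const struct state *s)
{
    fwrite(&s->status,  sizeof(s->status),  1, f);   /* added in version 3 */
    fwrite(&s->promisc, sizeof(s->promisc), 1, f);   /* added in version 4 */
}

static int load_state(FILE *f, struct state *s, int version_id)
{
    if (version_id >= 3 && fread(&s->status, sizeof(s->status), 1, f) != 1) {
        return -1;
    }
    if (version_id >= 4 && fread(&s->promisc, sizeof(s->promisc), 1, f) != 1) {
        return -1;
    }
    return 0;   /* older fields keep their reset-time defaults */
}

The promisc/allmulti width change at version 8 above shows the cost of the scheme: once a field is in the stream, its old encoding must be parsed forever.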
filter */ + for (i = 0; i < n->mac_table.in_use; i++) { + if (n->mac_table.macs[i * ETH_ALEN] & 1) { + break; + } + } + n->mac_table.first_multi = i; + + /* nc.link_down can't be migrated, so infer link_down according + * to link status bit in n->status */ + link_down = (n->status & VIRTIO_NET_S_LINK_UP) == 0; + for (i = 0; i < n->max_queues; i++) { + qemu_get_subqueue(n->nic, i)->link_down = link_down; + } + + return 0; +} + +static void virtio_net_cleanup(NetClientState *nc) +{ + VirtIONet *n = qemu_get_nic_opaque(nc); + + n->nic = NULL; +} + +static NetClientInfo net_virtio_info = { + .type = NET_CLIENT_OPTIONS_KIND_NIC, + .size = sizeof(NICState), + .can_receive = virtio_net_can_receive, + .receive = virtio_net_receive, + .cleanup = virtio_net_cleanup, + .link_status_changed = virtio_net_set_link_status, +}; + +static bool virtio_net_guest_notifier_pending(VirtIODevice *vdev, int idx) +{ + VirtIONet *n = to_virtio_net(vdev); + NetClientState *nc = qemu_get_subqueue(n->nic, vq2q(idx)); + assert(n->vhost_started); + return vhost_net_virtqueue_pending(tap_get_vhost_net(nc->peer), idx); +} + +static void virtio_net_guest_notifier_mask(VirtIODevice *vdev, int idx, + bool mask) +{ + VirtIONet *n = to_virtio_net(vdev); + NetClientState *nc = qemu_get_subqueue(n->nic, vq2q(idx)); + assert(n->vhost_started); + vhost_net_virtqueue_mask(tap_get_vhost_net(nc->peer), + vdev, idx, mask); +} + +VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf, + virtio_net_conf *net, uint32_t host_features) +{ + VirtIONet *n; + int i, config_size = 0; + + for (i = 0; feature_sizes[i].flags != 0; i++) { + if (host_features & feature_sizes[i].flags) { + config_size = MAX(feature_sizes[i].end, config_size); + } + } + + n = (VirtIONet *)virtio_common_init("virtio-net", VIRTIO_ID_NET, + config_size, sizeof(VirtIONet)); + + n->config_size = config_size; + n->vdev.get_config = virtio_net_get_config; + n->vdev.set_config = virtio_net_set_config; + n->vdev.get_features = virtio_net_get_features; + n->vdev.set_features = virtio_net_set_features; + n->vdev.bad_features = virtio_net_bad_features; + n->vdev.reset = virtio_net_reset; + n->vdev.set_status = virtio_net_set_status; + n->vdev.guest_notifier_mask = virtio_net_guest_notifier_mask; + n->vdev.guest_notifier_pending = virtio_net_guest_notifier_pending; + n->max_queues = MAX(conf->queues, 1); + n->vqs = g_malloc0(sizeof(VirtIONetQueue) * n->max_queues); + n->vqs[0].rx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_rx); + n->curr_queues = 1; + n->vqs[0].n = n; + n->tx_timeout = net->txtimer; + + if (net->tx && strcmp(net->tx, "timer") && strcmp(net->tx, "bh")) { + error_report("virtio-net: " + "Unknown option tx=%s, valid options: \"timer\" \"bh\"", + net->tx); + error_report("Defaulting to \"bh\""); + } + + if (net->tx && !strcmp(net->tx, "timer")) { + n->vqs[0].tx_vq = virtio_add_queue(&n->vdev, 256, + virtio_net_handle_tx_timer); + n->vqs[0].tx_timer = qemu_new_timer_ns(vm_clock, virtio_net_tx_timer, + &n->vqs[0]); + } else { + n->vqs[0].tx_vq = virtio_add_queue(&n->vdev, 256, + virtio_net_handle_tx_bh); + n->vqs[0].tx_bh = qemu_bh_new(virtio_net_tx_bh, &n->vqs[0]); + } + n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl); + qemu_macaddr_default_if_unset(&conf->macaddr); + memcpy(&n->mac[0], &conf->macaddr, sizeof(n->mac)); + n->status = VIRTIO_NET_S_LINK_UP; + + n->nic = qemu_new_nic(&net_virtio_info, conf, object_get_typename(OBJECT(dev)), dev->id, n); + peer_test_vnet_hdr(n); + if (peer_has_vnet_hdr(n)) { + for (i = 0; i < 
n->max_queues; i++) { + tap_using_vnet_hdr(qemu_get_subqueue(n->nic, i)->peer, true); + } + n->host_hdr_len = sizeof(struct virtio_net_hdr); + } else { + n->host_hdr_len = 0; + } + + qemu_format_nic_info_str(qemu_get_queue(n->nic), conf->macaddr.a); + + n->vqs[0].tx_waiting = 0; + n->tx_burst = net->txburst; + virtio_net_set_mrg_rx_bufs(n, 0); + n->promisc = 1; /* for compatibility */ + + n->mac_table.macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN); + + n->vlans = g_malloc0(MAX_VLAN >> 3); + + n->qdev = dev; + register_savevm(dev, "virtio-net", -1, VIRTIO_NET_VM_VERSION, + virtio_net_save, virtio_net_load, n); + + add_boot_device_path(conf->bootindex, dev, "/ethernet-phy@0"); + + return &n->vdev; +} + +void virtio_net_exit(VirtIODevice *vdev) +{ + VirtIONet *n = DO_UPCAST(VirtIONet, vdev, vdev); + int i; + + /* This will stop vhost backend if appropriate. */ + virtio_net_set_status(vdev, 0); + + unregister_savevm(n->qdev, "virtio-net", n); + + g_free(n->mac_table.macs); + g_free(n->vlans); + + for (i = 0; i < n->max_queues; i++) { + VirtIONetQueue *q = &n->vqs[i]; + NetClientState *nc = qemu_get_subqueue(n->nic, i); + + qemu_purge_queued_packets(nc); + + if (q->tx_timer) { + qemu_del_timer(q->tx_timer); + qemu_free_timer(q->tx_timer); + } else { + qemu_bh_delete(q->tx_bh); + } + } + + g_free(n->vqs); + qemu_del_nic(n->nic); + virtio_cleanup(&n->vdev); +} diff --git a/hw/scsi/Makefile.objs b/hw/scsi/Makefile.objs index 6a56504068..b76b9c3733 100644 --- a/hw/scsi/Makefile.objs +++ b/hw/scsi/Makefile.objs @@ -4,3 +4,4 @@ common-obj-$(CONFIG_LSI_SCSI_PCI) += lsi53c895a.o common-obj-$(CONFIG_MEGASAS_SCSI_PCI) += megasas.o common-obj-$(CONFIG_ESP) += esp.o common-obj-$(CONFIG_ESP_PCI) += esp-pci.o +obj-$(CONFIG_VIRTIO) += virtio-scsi.o diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c new file mode 100644 index 0000000000..ead7cda13d --- /dev/null +++ b/hw/scsi/virtio-scsi.c @@ -0,0 +1,774 @@ +/* + * Virtio SCSI HBA + * + * Copyright IBM, Corp. 2010 + * Copyright Red Hat, Inc. 2011 + * + * Authors: + * Stefan Hajnoczi + * Paolo Bonzini + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "hw/virtio/virtio-scsi.h" +#include "qemu/error-report.h" +#include +#include +#include + +#define VIRTIO_SCSI_VQ_SIZE 128 +#define VIRTIO_SCSI_CDB_SIZE 32 +#define VIRTIO_SCSI_SENSE_SIZE 96 +#define VIRTIO_SCSI_MAX_CHANNEL 0 +#define VIRTIO_SCSI_MAX_TARGET 255 +#define VIRTIO_SCSI_MAX_LUN 16383 + +/* Response codes */ +#define VIRTIO_SCSI_S_OK 0 +#define VIRTIO_SCSI_S_OVERRUN 1 +#define VIRTIO_SCSI_S_ABORTED 2 +#define VIRTIO_SCSI_S_BAD_TARGET 3 +#define VIRTIO_SCSI_S_RESET 4 +#define VIRTIO_SCSI_S_BUSY 5 +#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 +#define VIRTIO_SCSI_S_TARGET_FAILURE 7 +#define VIRTIO_SCSI_S_NEXUS_FAILURE 8 +#define VIRTIO_SCSI_S_FAILURE 9 +#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 +#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 +#define VIRTIO_SCSI_S_INCORRECT_LUN 12 + +/* Controlq type codes. */ +#define VIRTIO_SCSI_T_TMF 0 +#define VIRTIO_SCSI_T_AN_QUERY 1 +#define VIRTIO_SCSI_T_AN_SUBSCRIBE 2 + +/* Valid TMF subtypes. */ +#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 +#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 +#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 +#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 +#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 +#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 + +/* Events. 
*/ +#define VIRTIO_SCSI_T_EVENTS_MISSED 0x80000000 +#define VIRTIO_SCSI_T_NO_EVENT 0 +#define VIRTIO_SCSI_T_TRANSPORT_RESET 1 +#define VIRTIO_SCSI_T_ASYNC_NOTIFY 2 +#define VIRTIO_SCSI_T_PARAM_CHANGE 3 + +/* Reasons for transport reset event */ +#define VIRTIO_SCSI_EVT_RESET_HARD 0 +#define VIRTIO_SCSI_EVT_RESET_RESCAN 1 +#define VIRTIO_SCSI_EVT_RESET_REMOVED 2 + +/* SCSI command request, followed by data-out */ +typedef struct { + uint8_t lun[8]; /* Logical Unit Number */ + uint64_t tag; /* Command identifier */ + uint8_t task_attr; /* Task attribute */ + uint8_t prio; + uint8_t crn; + uint8_t cdb[]; +} QEMU_PACKED VirtIOSCSICmdReq; + +/* Response, followed by sense data and data-in */ +typedef struct { + uint32_t sense_len; /* Sense data length */ + uint32_t resid; /* Residual bytes in data buffer */ + uint16_t status_qualifier; /* Status qualifier */ + uint8_t status; /* Command completion status */ + uint8_t response; /* Response values */ + uint8_t sense[]; +} QEMU_PACKED VirtIOSCSICmdResp; + +/* Task Management Request */ +typedef struct { + uint32_t type; + uint32_t subtype; + uint8_t lun[8]; + uint64_t tag; +} QEMU_PACKED VirtIOSCSICtrlTMFReq; + +typedef struct { + uint8_t response; +} QEMU_PACKED VirtIOSCSICtrlTMFResp; + +/* Asynchronous notification query/subscription */ +typedef struct { + uint32_t type; + uint8_t lun[8]; + uint32_t event_requested; +} QEMU_PACKED VirtIOSCSICtrlANReq; + +typedef struct { + uint32_t event_actual; + uint8_t response; +} QEMU_PACKED VirtIOSCSICtrlANResp; + +typedef struct { + uint32_t event; + uint8_t lun[8]; + uint32_t reason; +} QEMU_PACKED VirtIOSCSIEvent; + +typedef struct { + uint32_t num_queues; + uint32_t seg_max; + uint32_t max_sectors; + uint32_t cmd_per_lun; + uint32_t event_info_size; + uint32_t sense_size; + uint32_t cdb_size; + uint16_t max_channel; + uint16_t max_target; + uint32_t max_lun; +} QEMU_PACKED VirtIOSCSIConfig; + +typedef struct VirtIOSCSIReq { + VirtIOSCSI *dev; + VirtQueue *vq; + VirtQueueElement elem; + QEMUSGList qsgl; + SCSIRequest *sreq; + union { + char *buf; + VirtIOSCSICmdReq *cmd; + VirtIOSCSICtrlTMFReq *tmf; + VirtIOSCSICtrlANReq *an; + } req; + union { + char *buf; + VirtIOSCSICmdResp *cmd; + VirtIOSCSICtrlTMFResp *tmf; + VirtIOSCSICtrlANResp *an; + VirtIOSCSIEvent *event; + } resp; +} VirtIOSCSIReq; + +static inline int virtio_scsi_get_lun(uint8_t *lun) +{ + return ((lun[2] << 8) | lun[3]) & 0x3FFF; +} + +static inline SCSIDevice *virtio_scsi_device_find(VirtIOSCSI *s, uint8_t *lun) +{ + if (lun[0] != 1) { + return NULL; + } + if (lun[2] != 0 && !(lun[2] >= 0x40 && lun[2] < 0x80)) { + return NULL; + } + return scsi_device_find(&s->bus, 0, lun[1], virtio_scsi_get_lun(lun)); +} + +static void virtio_scsi_complete_req(VirtIOSCSIReq *req) +{ + VirtIOSCSI *s = req->dev; + VirtQueue *vq = req->vq; + VirtIODevice *vdev = VIRTIO_DEVICE(s); + virtqueue_push(vq, &req->elem, req->qsgl.size + req->elem.in_sg[0].iov_len); + qemu_sglist_destroy(&req->qsgl); + if (req->sreq) { + req->sreq->hba_private = NULL; + scsi_req_unref(req->sreq); + } + g_free(req); + virtio_notify(vdev, vq); +} + +static void virtio_scsi_bad_req(void) +{ + error_report("wrong size for virtio-scsi headers"); + exit(1); +} + +static void qemu_sgl_init_external(QEMUSGList *qsgl, struct iovec *sg, + hwaddr *addr, int num) +{ + qemu_sglist_init(qsgl, num, &dma_context_memory); + while (num--) { + qemu_sglist_add(qsgl, *(addr++), (sg++)->iov_len); + } +} + +static void virtio_scsi_parse_req(VirtIOSCSI *s, VirtQueue *vq, + VirtIOSCSIReq *req) +{ + 
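The 8-byte lun[] fields in the request structures above use the addressing Linux expects from REPORT LUNS: byte 0 is 1 (the only bus), byte 1 the target id, and bytes 2-3 carry the LUN, with 0x40 set in byte 2 once the LUN needs more than 8 bits (which is why virtio_scsi_device_find() accepts lun[2] == 0 or 0x40..0x7f). A round-trip sketch:

#include <stdint.h>
#include <string.h>

static void encode_lun(uint8_t lun[8], uint8_t target, uint16_t l)
{
    memset(lun, 0, 8);
    lun[0] = 1;
    lun[1] = target;
    lun[2] = (l >> 8) | (l >= 256 ? 0x40 : 0);  /* flat addressing bit */
    lun[3] = l & 0xff;
}

static uint16_t decode_lun(const uint8_t lun[8])
{
    return ((lun[2] << 8) | lun[3]) & 0x3fff;   /* == virtio_scsi_get_lun() */
}

The 0x3fff mask matches VIRTIO_SCSI_MAX_LUN (16383) defined at the top of the file.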
assert(req->elem.in_num); + req->vq = vq; + req->dev = s; + req->sreq = NULL; + if (req->elem.out_num) { + req->req.buf = req->elem.out_sg[0].iov_base; + } + req->resp.buf = req->elem.in_sg[0].iov_base; + + if (req->elem.out_num > 1) { + qemu_sgl_init_external(&req->qsgl, &req->elem.out_sg[1], + &req->elem.out_addr[1], + req->elem.out_num - 1); + } else { + qemu_sgl_init_external(&req->qsgl, &req->elem.in_sg[1], + &req->elem.in_addr[1], + req->elem.in_num - 1); + } +} + +static VirtIOSCSIReq *virtio_scsi_pop_req(VirtIOSCSI *s, VirtQueue *vq) +{ + VirtIOSCSIReq *req; + req = g_malloc(sizeof(*req)); + if (!virtqueue_pop(vq, &req->elem)) { + g_free(req); + return NULL; + } + + virtio_scsi_parse_req(s, vq, req); + return req; +} + +static void virtio_scsi_save_request(QEMUFile *f, SCSIRequest *sreq) +{ + VirtIOSCSIReq *req = sreq->hba_private; + uint32_t n = virtio_queue_get_id(req->vq) - 2; + + assert(n < req->dev->conf.num_queues); + qemu_put_be32s(f, &n); + qemu_put_buffer(f, (unsigned char *)&req->elem, sizeof(req->elem)); +} + +static void *virtio_scsi_load_request(QEMUFile *f, SCSIRequest *sreq) +{ + SCSIBus *bus = sreq->bus; + VirtIOSCSI *s = container_of(bus, VirtIOSCSI, bus); + VirtIOSCSIReq *req; + uint32_t n; + + req = g_malloc(sizeof(*req)); + qemu_get_be32s(f, &n); + assert(n < s->conf.num_queues); + qemu_get_buffer(f, (unsigned char *)&req->elem, sizeof(req->elem)); + virtio_scsi_parse_req(s, s->cmd_vqs[n], req); + + scsi_req_ref(sreq); + req->sreq = sreq; + if (req->sreq->cmd.mode != SCSI_XFER_NONE) { + int req_mode = + (req->elem.in_num > 1 ? SCSI_XFER_FROM_DEV : SCSI_XFER_TO_DEV); + + assert(req->sreq->cmd.mode == req_mode); + } + return req; +} + +static void virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req) +{ + SCSIDevice *d = virtio_scsi_device_find(s, req->req.tmf->lun); + SCSIRequest *r, *next; + BusChild *kid; + int target; + + /* Here VIRTIO_SCSI_S_OK means "FUNCTION COMPLETE". */ + req->resp.tmf->response = VIRTIO_SCSI_S_OK; + + switch (req->req.tmf->subtype) { + case VIRTIO_SCSI_T_TMF_ABORT_TASK: + case VIRTIO_SCSI_T_TMF_QUERY_TASK: + if (!d) { + goto fail; + } + if (d->lun != virtio_scsi_get_lun(req->req.tmf->lun)) { + goto incorrect_lun; + } + QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) { + VirtIOSCSIReq *cmd_req = r->hba_private; + if (cmd_req && cmd_req->req.cmd->tag == req->req.tmf->tag) { + break; + } + } + if (r) { + /* + * Assert that the request has not been completed yet, we + * check for it in the loop above. + */ + assert(r->hba_private); + if (req->req.tmf->subtype == VIRTIO_SCSI_T_TMF_QUERY_TASK) { + /* "If the specified command is present in the task set, then + * return a service response set to FUNCTION SUCCEEDED". 
+ */ + req->resp.tmf->response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED; + } else { + scsi_req_cancel(r); + } + } + break; + + case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: + if (!d) { + goto fail; + } + if (d->lun != virtio_scsi_get_lun(req->req.tmf->lun)) { + goto incorrect_lun; + } + s->resetting++; + qdev_reset_all(&d->qdev); + s->resetting--; + break; + + case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: + case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: + case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET: + if (!d) { + goto fail; + } + if (d->lun != virtio_scsi_get_lun(req->req.tmf->lun)) { + goto incorrect_lun; + } + QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) { + if (r->hba_private) { + if (req->req.tmf->subtype == VIRTIO_SCSI_T_TMF_QUERY_TASK_SET) { + /* "If there is any command present in the task set, then + * return a service response set to FUNCTION SUCCEEDED". + */ + req->resp.tmf->response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED; + break; + } else { + scsi_req_cancel(r); + } + } + } + break; + + case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: + target = req->req.tmf->lun[1]; + s->resetting++; + QTAILQ_FOREACH(kid, &s->bus.qbus.children, sibling) { + d = DO_UPCAST(SCSIDevice, qdev, kid->child); + if (d->channel == 0 && d->id == target) { + qdev_reset_all(&d->qdev); + } + } + s->resetting--; + break; + + case VIRTIO_SCSI_T_TMF_CLEAR_ACA: + default: + req->resp.tmf->response = VIRTIO_SCSI_S_FUNCTION_REJECTED; + break; + } + + return; + +incorrect_lun: + req->resp.tmf->response = VIRTIO_SCSI_S_INCORRECT_LUN; + return; + +fail: + req->resp.tmf->response = VIRTIO_SCSI_S_BAD_TARGET; +} + +static void virtio_scsi_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOSCSI *s = (VirtIOSCSI *)vdev; + VirtIOSCSIReq *req; + + while ((req = virtio_scsi_pop_req(s, vq))) { + int out_size, in_size; + if (req->elem.out_num < 1 || req->elem.in_num < 1) { + virtio_scsi_bad_req(); + continue; + } + + out_size = req->elem.out_sg[0].iov_len; + in_size = req->elem.in_sg[0].iov_len; + if (req->req.tmf->type == VIRTIO_SCSI_T_TMF) { + if (out_size < sizeof(VirtIOSCSICtrlTMFReq) || + in_size < sizeof(VirtIOSCSICtrlTMFResp)) { + virtio_scsi_bad_req(); + } + virtio_scsi_do_tmf(s, req); + + } else if (req->req.tmf->type == VIRTIO_SCSI_T_AN_QUERY || + req->req.tmf->type == VIRTIO_SCSI_T_AN_SUBSCRIBE) { + if (out_size < sizeof(VirtIOSCSICtrlANReq) || + in_size < sizeof(VirtIOSCSICtrlANResp)) { + virtio_scsi_bad_req(); + } + req->resp.an->event_actual = 0; + req->resp.an->response = VIRTIO_SCSI_S_OK; + } + virtio_scsi_complete_req(req); + } +} + +static void virtio_scsi_command_complete(SCSIRequest *r, uint32_t status, + size_t resid) +{ + VirtIOSCSIReq *req = r->hba_private; + uint32_t sense_len; + + req->resp.cmd->response = VIRTIO_SCSI_S_OK; + req->resp.cmd->status = status; + if (req->resp.cmd->status == GOOD) { + req->resp.cmd->resid = tswap32(resid); + } else { + req->resp.cmd->resid = 0; + sense_len = scsi_req_get_sense(r, req->resp.cmd->sense, + VIRTIO_SCSI_SENSE_SIZE); + req->resp.cmd->sense_len = tswap32(sense_len); + } + virtio_scsi_complete_req(req); +} + +static QEMUSGList *virtio_scsi_get_sg_list(SCSIRequest *r) +{ + VirtIOSCSIReq *req = r->hba_private; + + return &req->qsgl; +} + +static void virtio_scsi_request_cancelled(SCSIRequest *r) +{ + VirtIOSCSIReq *req = r->hba_private; + + if (!req) { + return; + } + if (req->dev->resetting) { + req->resp.cmd->response = VIRTIO_SCSI_S_RESET; + } else { + req->resp.cmd->response = VIRTIO_SCSI_S_ABORTED; + } + virtio_scsi_complete_req(req); +} + +static void virtio_scsi_fail_cmd_req(VirtIOSCSIReq 
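virtio_scsi_command_complete() above encodes a simple completion contract: a GOOD status reports the residual byte count and no sense; any other status reports sense data and a zero residual. As a tiny helper over a simplified response struct (not the wire layout, which also carries the trailing sense bytes):

#include <stdint.h>
#include <string.h>

#define SCSI_GOOD 0

struct cmd_resp {
    uint32_t sense_len;
    uint32_t resid;
    uint8_t  status;
    uint8_t  response;   /* VIRTIO_SCSI_S_* transport result */
};

static void fill_cmd_resp(struct cmd_resp *r, uint8_t status, uint32_t resid,
                          uint32_t sense_len)
{
    memset(r, 0, sizeof(*r));
    r->response = 0;                /* VIRTIO_SCSI_S_OK: transport succeeded */
    r->status = status;
    if (status == SCSI_GOOD) {
        r->resid = resid;           /* written target-endian via tswap32() */
    } else {
        r->sense_len = sense_len;   /* sense is only meaningful on failure */
    }
}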
*req) +{ + req->resp.cmd->response = VIRTIO_SCSI_S_FAILURE; + virtio_scsi_complete_req(req); +} + +static void virtio_scsi_handle_cmd(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOSCSI *s = (VirtIOSCSI *)vdev; + VirtIOSCSIReq *req; + int n; + + while ((req = virtio_scsi_pop_req(s, vq))) { + SCSIDevice *d; + int out_size, in_size; + if (req->elem.out_num < 1 || req->elem.in_num < 1) { + virtio_scsi_bad_req(); + } + + out_size = req->elem.out_sg[0].iov_len; + in_size = req->elem.in_sg[0].iov_len; + if (out_size < sizeof(VirtIOSCSICmdReq) + s->cdb_size || + in_size < sizeof(VirtIOSCSICmdResp) + s->sense_size) { + virtio_scsi_bad_req(); + } + + if (req->elem.out_num > 1 && req->elem.in_num > 1) { + virtio_scsi_fail_cmd_req(req); + continue; + } + + d = virtio_scsi_device_find(s, req->req.cmd->lun); + if (!d) { + req->resp.cmd->response = VIRTIO_SCSI_S_BAD_TARGET; + virtio_scsi_complete_req(req); + continue; + } + req->sreq = scsi_req_new(d, req->req.cmd->tag, + virtio_scsi_get_lun(req->req.cmd->lun), + req->req.cmd->cdb, req); + + if (req->sreq->cmd.mode != SCSI_XFER_NONE) { + int req_mode = + (req->elem.in_num > 1 ? SCSI_XFER_FROM_DEV : SCSI_XFER_TO_DEV); + + if (req->sreq->cmd.mode != req_mode || + req->sreq->cmd.xfer > req->qsgl.size) { + req->resp.cmd->response = VIRTIO_SCSI_S_OVERRUN; + virtio_scsi_complete_req(req); + continue; + } + } + + n = scsi_req_enqueue(req->sreq); + if (n) { + scsi_req_continue(req->sreq); + } + } +} + +static void virtio_scsi_get_config(VirtIODevice *vdev, + uint8_t *config) +{ + VirtIOSCSIConfig *scsiconf = (VirtIOSCSIConfig *)config; + VirtIOSCSI *s = (VirtIOSCSI *)vdev; + + stl_raw(&scsiconf->num_queues, s->conf.num_queues); + stl_raw(&scsiconf->seg_max, 128 - 2); + stl_raw(&scsiconf->max_sectors, s->conf.max_sectors); + stl_raw(&scsiconf->cmd_per_lun, s->conf.cmd_per_lun); + stl_raw(&scsiconf->event_info_size, sizeof(VirtIOSCSIEvent)); + stl_raw(&scsiconf->sense_size, s->sense_size); + stl_raw(&scsiconf->cdb_size, s->cdb_size); + stw_raw(&scsiconf->max_channel, VIRTIO_SCSI_MAX_CHANNEL); + stw_raw(&scsiconf->max_target, VIRTIO_SCSI_MAX_TARGET); + stl_raw(&scsiconf->max_lun, VIRTIO_SCSI_MAX_LUN); +} + +static void virtio_scsi_set_config(VirtIODevice *vdev, + const uint8_t *config) +{ + VirtIOSCSIConfig *scsiconf = (VirtIOSCSIConfig *)config; + VirtIOSCSI *s = (VirtIOSCSI *)vdev; + + if ((uint32_t) ldl_raw(&scsiconf->sense_size) >= 65536 || + (uint32_t) ldl_raw(&scsiconf->cdb_size) >= 256) { + error_report("bad data written to virtio-scsi configuration space"); + exit(1); + } + + s->sense_size = ldl_raw(&scsiconf->sense_size); + s->cdb_size = ldl_raw(&scsiconf->cdb_size); +} + +static uint32_t virtio_scsi_get_features(VirtIODevice *vdev, + uint32_t requested_features) +{ + return requested_features; +} + +static void virtio_scsi_reset(VirtIODevice *vdev) +{ + VirtIOSCSI *s = (VirtIOSCSI *)vdev; + + s->resetting++; + qbus_reset_all(&s->bus.qbus); + s->resetting--; + + s->sense_size = VIRTIO_SCSI_SENSE_SIZE; + s->cdb_size = VIRTIO_SCSI_CDB_SIZE; + s->events_dropped = false; +} + +/* The device does not have anything to save beyond the virtio data. + * Request data is saved with callbacks from SCSI devices. 
+ */ +static void virtio_scsi_save(QEMUFile *f, void *opaque) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(opaque); + virtio_save(vdev, f); +} + +static int virtio_scsi_load(QEMUFile *f, void *opaque, int version_id) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(opaque); + int ret; + + ret = virtio_load(vdev, f); + if (ret) { + return ret; + } + return 0; +} + +static void virtio_scsi_push_event(VirtIOSCSI *s, SCSIDevice *dev, + uint32_t event, uint32_t reason) +{ + VirtIOSCSIReq *req = virtio_scsi_pop_req(s, s->event_vq); + VirtIOSCSIEvent *evt; + VirtIODevice *vdev = VIRTIO_DEVICE(s); + int in_size; + + if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) { + return; + } + + if (!req) { + s->events_dropped = true; + return; + } + + if (req->elem.out_num || req->elem.in_num != 1) { + virtio_scsi_bad_req(); + } + + if (s->events_dropped) { + event |= VIRTIO_SCSI_T_EVENTS_MISSED; + s->events_dropped = false; + } + + in_size = req->elem.in_sg[0].iov_len; + if (in_size < sizeof(VirtIOSCSIEvent)) { + virtio_scsi_bad_req(); + } + + evt = req->resp.event; + memset(evt, 0, sizeof(VirtIOSCSIEvent)); + evt->event = event; + evt->reason = reason; + if (!dev) { + assert(event == VIRTIO_SCSI_T_NO_EVENT); + } else { + evt->lun[0] = 1; + evt->lun[1] = dev->id; + + /* Linux wants us to keep the same encoding we use for REPORT LUNS. */ + if (dev->lun >= 256) { + evt->lun[2] = (dev->lun >> 8) | 0x40; + } + evt->lun[3] = dev->lun & 0xFF; + } + virtio_scsi_complete_req(req); +} + +static void virtio_scsi_handle_event(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOSCSI *s = VIRTIO_SCSI(vdev); + + if (s->events_dropped) { + virtio_scsi_push_event(s, NULL, VIRTIO_SCSI_T_NO_EVENT, 0); + } +} + +static void virtio_scsi_change(SCSIBus *bus, SCSIDevice *dev, SCSISense sense) +{ + VirtIOSCSI *s = container_of(bus, VirtIOSCSI, bus); + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + if (((vdev->guest_features >> VIRTIO_SCSI_F_CHANGE) & 1) && + dev->type != TYPE_ROM) { + virtio_scsi_push_event(s, dev, VIRTIO_SCSI_T_PARAM_CHANGE, + sense.asc | (sense.ascq << 8)); + } +} + +static void virtio_scsi_hotplug(SCSIBus *bus, SCSIDevice *dev) +{ + VirtIOSCSI *s = container_of(bus, VirtIOSCSI, bus); + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + if ((vdev->guest_features >> VIRTIO_SCSI_F_HOTPLUG) & 1) { + virtio_scsi_push_event(s, dev, VIRTIO_SCSI_T_TRANSPORT_RESET, + VIRTIO_SCSI_EVT_RESET_RESCAN); + } +} + +static void virtio_scsi_hot_unplug(SCSIBus *bus, SCSIDevice *dev) +{ + VirtIOSCSI *s = container_of(bus, VirtIOSCSI, bus); + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + if ((vdev->guest_features >> VIRTIO_SCSI_F_HOTPLUG) & 1) { + virtio_scsi_push_event(s, dev, VIRTIO_SCSI_T_TRANSPORT_RESET, + VIRTIO_SCSI_EVT_RESET_REMOVED); + } +} + +static struct SCSIBusInfo virtio_scsi_scsi_info = { + .tcq = true, + .max_channel = VIRTIO_SCSI_MAX_CHANNEL, + .max_target = VIRTIO_SCSI_MAX_TARGET, + .max_lun = VIRTIO_SCSI_MAX_LUN, + + .complete = virtio_scsi_command_complete, + .cancel = virtio_scsi_request_cancelled, + .change = virtio_scsi_change, + .hotplug = virtio_scsi_hotplug, + .hot_unplug = virtio_scsi_hot_unplug, + .get_sg_list = virtio_scsi_get_sg_list, + .save_request = virtio_scsi_save_request, + .load_request = virtio_scsi_load_request, +}; + +static int virtio_scsi_device_init(VirtIODevice *vdev) +{ + DeviceState *qdev = DEVICE(vdev); + VirtIOSCSI *s = VIRTIO_SCSI(vdev); + static int virtio_scsi_id; + int i; + + virtio_init(VIRTIO_DEVICE(s), "virtio-scsi", VIRTIO_ID_SCSI, + sizeof(VirtIOSCSIConfig)); + + s->cmd_vqs = g_malloc0(s->conf.num_queues * 
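virtio_scsi_push_event() above deals with an empty event queue by coalescing: the event itself is thrown away and only a sticky events_dropped flag survives; the next event that does go out (or a guest kick of the event queue, see virtio_scsi_handle_event()) carries VIRTIO_SCSI_T_EVENTS_MISSED so the driver knows to rescan the bus. The state machine, reduced:

#include <stdbool.h>
#include <stdint.h>

#define T_EVENTS_MISSED 0x80000000u

struct evq { bool have_buffer; bool dropped; };

/* returns false when the event is dropped (nothing is queued anywhere) */
static bool prepare_event(struct evq *q, uint32_t *event)
{
    if (!q->have_buffer) {
        q->dropped = true;            /* remember only *that* we lost one */
        return false;
    }
    if (q->dropped) {
        *event |= T_EVENTS_MISSED;    /* tell the driver to resynchronize */
        q->dropped = false;
    }
    return true;
}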
sizeof(VirtQueue *)); + + /* TODO set up vdev function pointers */ + vdev->get_config = virtio_scsi_get_config; + vdev->set_config = virtio_scsi_set_config; + vdev->get_features = virtio_scsi_get_features; + vdev->reset = virtio_scsi_reset; + + s->ctrl_vq = virtio_add_queue(vdev, VIRTIO_SCSI_VQ_SIZE, + virtio_scsi_handle_ctrl); + s->event_vq = virtio_add_queue(vdev, VIRTIO_SCSI_VQ_SIZE, + virtio_scsi_handle_event); + for (i = 0; i < s->conf.num_queues; i++) { + s->cmd_vqs[i] = virtio_add_queue(vdev, VIRTIO_SCSI_VQ_SIZE, + virtio_scsi_handle_cmd); + } + + scsi_bus_new(&s->bus, qdev, &virtio_scsi_scsi_info); + if (!qdev->hotplugged) { + scsi_bus_legacy_handle_cmdline(&s->bus); + } + + register_savevm(qdev, "virtio-scsi", virtio_scsi_id++, 1, + virtio_scsi_save, virtio_scsi_load, s); + + return 0; +} + +static int virtio_scsi_device_exit(DeviceState *qdev) +{ + VirtIOSCSI *s = VIRTIO_SCSI(qdev); + VirtIODevice *vdev = VIRTIO_DEVICE(qdev); + + unregister_savevm(qdev, "virtio-scsi", s); + g_free(s->cmd_vqs); + virtio_common_cleanup(vdev); + return 0; +} + +static Property virtio_scsi_properties[] = { + DEFINE_VIRTIO_SCSI_PROPERTIES(VirtIOSCSI, conf), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_scsi_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + dc->exit = virtio_scsi_device_exit; + dc->props = virtio_scsi_properties; + vdc->init = virtio_scsi_device_init; + vdc->get_config = virtio_scsi_get_config; + vdc->set_config = virtio_scsi_set_config; + vdc->get_features = virtio_scsi_get_features; + vdc->reset = virtio_scsi_reset; +} + +static const TypeInfo virtio_scsi_info = { + .name = TYPE_VIRTIO_SCSI, + .parent = TYPE_VIRTIO_DEVICE, + .instance_size = sizeof(VirtIOSCSI), + .class_init = virtio_scsi_class_init, +}; + +static void virtio_register_types(void) +{ + type_register_static(&virtio_scsi_info); +} + +type_init(virtio_register_types) diff --git a/hw/vhost.c b/hw/vhost.c deleted file mode 100644 index 636fad0f74..0000000000 --- a/hw/vhost.c +++ /dev/null @@ -1,1042 +0,0 @@ -/* - * vhost support - * - * Copyright Red Hat, Inc. 2010 - * - * Authors: - * Michael S. Tsirkin - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. - */ - -#include -#include "hw/virtio/vhost.h" -#include "hw/hw.h" -#include "qemu/range.h" -#include -#include "exec/address-spaces.h" - -static void vhost_dev_sync_region(struct vhost_dev *dev, - MemoryRegionSection *section, - uint64_t mfirst, uint64_t mlast, - uint64_t rfirst, uint64_t rlast) -{ - uint64_t start = MAX(mfirst, rfirst); - uint64_t end = MIN(mlast, rlast); - vhost_log_chunk_t *from = dev->log + start / VHOST_LOG_CHUNK; - vhost_log_chunk_t *to = dev->log + end / VHOST_LOG_CHUNK + 1; - uint64_t addr = (start / VHOST_LOG_CHUNK) * VHOST_LOG_CHUNK; - - if (end < start) { - return; - } - assert(end / VHOST_LOG_CHUNK < dev->log_size); - assert(start / VHOST_LOG_CHUNK < dev->log_size); - - for (;from < to; ++from) { - vhost_log_chunk_t log; - int bit; - /* We first check with non-atomic: much cheaper, - * and we expect non-dirty to be the common case. */ - if (!*from) { - addr += VHOST_LOG_CHUNK; - continue; - } - /* Data must be read atomically. 
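The dirty-log scan this comment introduces (continued just below) works word by word: atomically snatch-and-clear a chunk of the log, then walk its set bits, marking one guest page dirty per bit. A standalone rendering of the inner step (using the GCC/clang __builtin_ffsll instead of the ffsll/ffs pair, and assuming the 4 KiB VHOST_LOG_PAGE used in this tree):

#include <stdint.h>

#define LOG_PAGE 0x1000

static void sync_log_word(uint64_t *word, uint64_t base_addr,
                          void (*mark_dirty)(uint64_t page_addr))
{
    /* read-and-clear must be atomic: the kernel sets bits concurrently */
    uint64_t log = __sync_fetch_and_and(word, 0);
    int bit;

    while ((bit = __builtin_ffsll((long long)log))) {
        bit -= 1;                                 /* ffsll is 1-based */
        mark_dirty(base_addr + (uint64_t)bit * LOG_PAGE);
        log &= ~(1ull << bit);
    }
}

The non-atomic "if (!*from)" fast path in the real loop is worth keeping in any reimplementation: clean words vastly outnumber dirty ones, and it skips the expensive locked operation for them.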
We don't really - * need the barrier semantics of __sync - * builtins, but it's easier to use them than - * roll our own. */ - log = __sync_fetch_and_and(from, 0); - while ((bit = sizeof(log) > sizeof(int) ? - ffsll(log) : ffs(log))) { - hwaddr page_addr; - hwaddr section_offset; - hwaddr mr_offset; - bit -= 1; - page_addr = addr + bit * VHOST_LOG_PAGE; - section_offset = page_addr - section->offset_within_address_space; - mr_offset = section_offset + section->offset_within_region; - memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE); - log &= ~(0x1ull << bit); - } - addr += VHOST_LOG_CHUNK; - } -} - -static int vhost_sync_dirty_bitmap(struct vhost_dev *dev, - MemoryRegionSection *section, - hwaddr first, - hwaddr last) -{ - int i; - hwaddr start_addr; - hwaddr end_addr; - - if (!dev->log_enabled || !dev->started) { - return 0; - } - start_addr = section->offset_within_address_space; - end_addr = range_get_last(start_addr, section->size); - start_addr = MAX(first, start_addr); - end_addr = MIN(last, end_addr); - - for (i = 0; i < dev->mem->nregions; ++i) { - struct vhost_memory_region *reg = dev->mem->regions + i; - vhost_dev_sync_region(dev, section, start_addr, end_addr, - reg->guest_phys_addr, - range_get_last(reg->guest_phys_addr, - reg->memory_size)); - } - for (i = 0; i < dev->nvqs; ++i) { - struct vhost_virtqueue *vq = dev->vqs + i; - vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys, - range_get_last(vq->used_phys, vq->used_size)); - } - return 0; -} - -static void vhost_log_sync(MemoryListener *listener, - MemoryRegionSection *section) -{ - struct vhost_dev *dev = container_of(listener, struct vhost_dev, - memory_listener); - vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL); -} - -static void vhost_log_sync_range(struct vhost_dev *dev, - hwaddr first, hwaddr last) -{ - int i; - /* FIXME: this is N^2 in number of sections */ - for (i = 0; i < dev->n_mem_sections; ++i) { - MemoryRegionSection *section = &dev->mem_sections[i]; - vhost_sync_dirty_bitmap(dev, section, first, last); - } -} - -/* Assign/unassign. Keep an unsorted array of non-overlapping - * memory regions in dev->mem. */ -static void vhost_dev_unassign_memory(struct vhost_dev *dev, - uint64_t start_addr, - uint64_t size) -{ - int from, to, n = dev->mem->nregions; - /* Track overlapping/split regions for sanity checking. */ - int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0; - - for (from = 0, to = 0; from < n; ++from, ++to) { - struct vhost_memory_region *reg = dev->mem->regions + to; - uint64_t reglast; - uint64_t memlast; - uint64_t change; - - /* clone old region */ - if (to != from) { - memcpy(reg, dev->mem->regions + from, sizeof *reg); - } - - /* No overlap is simple */ - if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size, - start_addr, size)) { - continue; - } - - /* Split only happens if supplied region - * is in the middle of an existing one. Thus it can not - * overlap with any other existing region. 
*/ - assert(!split); - - reglast = range_get_last(reg->guest_phys_addr, reg->memory_size); - memlast = range_get_last(start_addr, size); - - /* Remove whole region */ - if (start_addr <= reg->guest_phys_addr && memlast >= reglast) { - --dev->mem->nregions; - --to; - ++overlap_middle; - continue; - } - - /* Shrink region */ - if (memlast >= reglast) { - reg->memory_size = start_addr - reg->guest_phys_addr; - assert(reg->memory_size); - assert(!overlap_end); - ++overlap_end; - continue; - } - - /* Shift region */ - if (start_addr <= reg->guest_phys_addr) { - change = memlast + 1 - reg->guest_phys_addr; - reg->memory_size -= change; - reg->guest_phys_addr += change; - reg->userspace_addr += change; - assert(reg->memory_size); - assert(!overlap_start); - ++overlap_start; - continue; - } - - /* This only happens if supplied region - * is in the middle of an existing one. Thus it can not - * overlap with any other existing region. */ - assert(!overlap_start); - assert(!overlap_end); - assert(!overlap_middle); - /* Split region: shrink first part, shift second part. */ - memcpy(dev->mem->regions + n, reg, sizeof *reg); - reg->memory_size = start_addr - reg->guest_phys_addr; - assert(reg->memory_size); - change = memlast + 1 - reg->guest_phys_addr; - reg = dev->mem->regions + n; - reg->memory_size -= change; - assert(reg->memory_size); - reg->guest_phys_addr += change; - reg->userspace_addr += change; - /* Never add more than 1 region */ - assert(dev->mem->nregions == n); - ++dev->mem->nregions; - ++split; - } -} - -/* Called after unassign, so no regions overlap the given range. */ -static void vhost_dev_assign_memory(struct vhost_dev *dev, - uint64_t start_addr, - uint64_t size, - uint64_t uaddr) -{ - int from, to; - struct vhost_memory_region *merged = NULL; - for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) { - struct vhost_memory_region *reg = dev->mem->regions + to; - uint64_t prlast, urlast; - uint64_t pmlast, umlast; - uint64_t s, e, u; - - /* clone old region */ - if (to != from) { - memcpy(reg, dev->mem->regions + from, sizeof *reg); - } - prlast = range_get_last(reg->guest_phys_addr, reg->memory_size); - pmlast = range_get_last(start_addr, size); - urlast = range_get_last(reg->userspace_addr, reg->memory_size); - umlast = range_get_last(uaddr, size); - - /* check for overlapping regions: should never happen. */ - assert(prlast < start_addr || pmlast < reg->guest_phys_addr); - /* Not an adjacent or overlapping region - do not merge. 
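- * Merging requires both the guest-physical and the userspace ranges
- * to be contiguous, so the result is still one linear mapping.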
*/ - if ((prlast + 1 != start_addr || urlast + 1 != uaddr) && - (pmlast + 1 != reg->guest_phys_addr || - umlast + 1 != reg->userspace_addr)) { - continue; - } - - if (merged) { - --to; - assert(to >= 0); - } else { - merged = reg; - } - u = MIN(uaddr, reg->userspace_addr); - s = MIN(start_addr, reg->guest_phys_addr); - e = MAX(pmlast, prlast); - uaddr = merged->userspace_addr = u; - start_addr = merged->guest_phys_addr = s; - size = merged->memory_size = e - s + 1; - assert(merged->memory_size); - } - - if (!merged) { - struct vhost_memory_region *reg = dev->mem->regions + to; - memset(reg, 0, sizeof *reg); - reg->memory_size = size; - assert(reg->memory_size); - reg->guest_phys_addr = start_addr; - reg->userspace_addr = uaddr; - ++to; - } - assert(to <= dev->mem->nregions + 1); - dev->mem->nregions = to; -} - -static uint64_t vhost_get_log_size(struct vhost_dev *dev) -{ - uint64_t log_size = 0; - int i; - for (i = 0; i < dev->mem->nregions; ++i) { - struct vhost_memory_region *reg = dev->mem->regions + i; - uint64_t last = range_get_last(reg->guest_phys_addr, - reg->memory_size); - log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1); - } - for (i = 0; i < dev->nvqs; ++i) { - struct vhost_virtqueue *vq = dev->vqs + i; - uint64_t last = vq->used_phys + vq->used_size - 1; - log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1); - } - return log_size; -} - -static inline void vhost_dev_log_resize(struct vhost_dev* dev, uint64_t size) -{ - vhost_log_chunk_t *log; - uint64_t log_base; - int r; - - log = g_malloc0(size * sizeof *log); - log_base = (uint64_t)(unsigned long)log; - r = ioctl(dev->control, VHOST_SET_LOG_BASE, &log_base); - assert(r >= 0); - /* Sync only the range covered by the old log */ - if (dev->log_size) { - vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1); - } - if (dev->log) { - g_free(dev->log); - } - dev->log = log; - dev->log_size = size; -} - -static int vhost_verify_ring_mappings(struct vhost_dev *dev, - uint64_t start_addr, - uint64_t size) -{ - int i; - for (i = 0; i < dev->nvqs; ++i) { - struct vhost_virtqueue *vq = dev->vqs + i; - hwaddr l; - void *p; - - if (!ranges_overlap(start_addr, size, vq->ring_phys, vq->ring_size)) { - continue; - } - l = vq->ring_size; - p = cpu_physical_memory_map(vq->ring_phys, &l, 1); - if (!p || l != vq->ring_size) { - fprintf(stderr, "Unable to map ring buffer for ring %d\n", i); - return -ENOMEM; - } - if (p != vq->ring) { - fprintf(stderr, "Ring buffer relocated for ring %d\n", i); - return -EBUSY; - } - cpu_physical_memory_unmap(p, l, 0, 0); - } - return 0; -} - -static struct vhost_memory_region *vhost_dev_find_reg(struct vhost_dev *dev, - uint64_t start_addr, - uint64_t size) -{ - int i, n = dev->mem->nregions; - for (i = 0; i < n; ++i) { - struct vhost_memory_region *reg = dev->mem->regions + i; - if (ranges_overlap(reg->guest_phys_addr, reg->memory_size, - start_addr, size)) { - return reg; - } - } - return NULL; -} - -static bool vhost_dev_cmp_memory(struct vhost_dev *dev, - uint64_t start_addr, - uint64_t size, - uint64_t uaddr) -{ - struct vhost_memory_region *reg = vhost_dev_find_reg(dev, start_addr, size); - uint64_t reglast; - uint64_t memlast; - - if (!reg) { - return true; - } - - reglast = range_get_last(reg->guest_phys_addr, reg->memory_size); - memlast = range_get_last(start_addr, size); - - /* Need to extend region? */ - if (start_addr < reg->guest_phys_addr || memlast > reglast) { - return true; - } - /* userspace_addr changed? 
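- * The HVA must keep the same offset from the region base as the GPA;
- * if not, the mapping moved and the table must be re-sent.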
*/ - return uaddr != reg->userspace_addr + start_addr - reg->guest_phys_addr; -} - -static void vhost_set_memory(MemoryListener *listener, - MemoryRegionSection *section, - bool add) -{ - struct vhost_dev *dev = container_of(listener, struct vhost_dev, - memory_listener); - hwaddr start_addr = section->offset_within_address_space; - ram_addr_t size = section->size; - bool log_dirty = memory_region_is_logging(section->mr); - int s = offsetof(struct vhost_memory, regions) + - (dev->mem->nregions + 1) * sizeof dev->mem->regions[0]; - uint64_t log_size; - int r; - void *ram; - - dev->mem = g_realloc(dev->mem, s); - - if (log_dirty) { - add = false; - } - - assert(size); - - /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */ - ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region; - if (add) { - if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) { - /* Region exists with same address. Nothing to do. */ - return; - } - } else { - if (!vhost_dev_find_reg(dev, start_addr, size)) { - /* Removing region that we don't access. Nothing to do. */ - return; - } - } - - vhost_dev_unassign_memory(dev, start_addr, size); - if (add) { - /* Add given mapping, merging adjacent regions if any */ - vhost_dev_assign_memory(dev, start_addr, size, (uintptr_t)ram); - } else { - /* Remove old mapping for this memory, if any. */ - vhost_dev_unassign_memory(dev, start_addr, size); - } - - if (!dev->started) { - return; - } - - if (dev->started) { - r = vhost_verify_ring_mappings(dev, start_addr, size); - assert(r >= 0); - } - - if (!dev->log_enabled) { - r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem); - assert(r >= 0); - return; - } - log_size = vhost_get_log_size(dev); - /* We allocate an extra 4K bytes to log, - * to reduce the * number of reallocations. */ -#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log) - /* To log more, must increase log size before table update. */ - if (dev->log_size < log_size) { - vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER); - } - r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem); - assert(r >= 0); - /* To log less, can only decrease log size after table update. 
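- * Growing before and shrinking after guarantees the kernel never
- * holds a log that is too small for the installed table.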
*/ - if (dev->log_size > log_size + VHOST_LOG_BUFFER) { - vhost_dev_log_resize(dev, log_size); - } -} - -static bool vhost_section(MemoryRegionSection *section) -{ - return memory_region_is_ram(section->mr); -} - -static void vhost_begin(MemoryListener *listener) -{ -} - -static void vhost_commit(MemoryListener *listener) -{ -} - -static void vhost_region_add(MemoryListener *listener, - MemoryRegionSection *section) -{ - struct vhost_dev *dev = container_of(listener, struct vhost_dev, - memory_listener); - - if (!vhost_section(section)) { - return; - } - - ++dev->n_mem_sections; - dev->mem_sections = g_renew(MemoryRegionSection, dev->mem_sections, - dev->n_mem_sections); - dev->mem_sections[dev->n_mem_sections - 1] = *section; - vhost_set_memory(listener, section, true); -} - -static void vhost_region_del(MemoryListener *listener, - MemoryRegionSection *section) -{ - struct vhost_dev *dev = container_of(listener, struct vhost_dev, - memory_listener); - int i; - - if (!vhost_section(section)) { - return; - } - - vhost_set_memory(listener, section, false); - for (i = 0; i < dev->n_mem_sections; ++i) { - if (dev->mem_sections[i].offset_within_address_space - == section->offset_within_address_space) { - --dev->n_mem_sections; - memmove(&dev->mem_sections[i], &dev->mem_sections[i+1], - (dev->n_mem_sections - i) * sizeof(*dev->mem_sections)); - break; - } - } -} - -static void vhost_region_nop(MemoryListener *listener, - MemoryRegionSection *section) -{ -} - -static int vhost_virtqueue_set_addr(struct vhost_dev *dev, - struct vhost_virtqueue *vq, - unsigned idx, bool enable_log) -{ - struct vhost_vring_addr addr = { - .index = idx, - .desc_user_addr = (uint64_t)(unsigned long)vq->desc, - .avail_user_addr = (uint64_t)(unsigned long)vq->avail, - .used_user_addr = (uint64_t)(unsigned long)vq->used, - .log_guest_addr = vq->used_phys, - .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0, - }; - int r = ioctl(dev->control, VHOST_SET_VRING_ADDR, &addr); - if (r < 0) { - return -errno; - } - return 0; -} - -static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log) -{ - uint64_t features = dev->acked_features; - int r; - if (enable_log) { - features |= 0x1 << VHOST_F_LOG_ALL; - } - r = ioctl(dev->control, VHOST_SET_FEATURES, &features); - return r < 0 ? 
-errno : 0; -} - -static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log) -{ - int r, t, i; - r = vhost_dev_set_features(dev, enable_log); - if (r < 0) { - goto err_features; - } - for (i = 0; i < dev->nvqs; ++i) { - r = vhost_virtqueue_set_addr(dev, dev->vqs + i, i, - enable_log); - if (r < 0) { - goto err_vq; - } - } - return 0; -err_vq: - for (; i >= 0; --i) { - t = vhost_virtqueue_set_addr(dev, dev->vqs + i, i, - dev->log_enabled); - assert(t >= 0); - } - t = vhost_dev_set_features(dev, dev->log_enabled); - assert(t >= 0); -err_features: - return r; -} - -static int vhost_migration_log(MemoryListener *listener, int enable) -{ - struct vhost_dev *dev = container_of(listener, struct vhost_dev, - memory_listener); - int r; - if (!!enable == dev->log_enabled) { - return 0; - } - if (!dev->started) { - dev->log_enabled = enable; - return 0; - } - if (!enable) { - r = vhost_dev_set_log(dev, false); - if (r < 0) { - return r; - } - if (dev->log) { - g_free(dev->log); - } - dev->log = NULL; - dev->log_size = 0; - } else { - vhost_dev_log_resize(dev, vhost_get_log_size(dev)); - r = vhost_dev_set_log(dev, true); - if (r < 0) { - return r; - } - } - dev->log_enabled = enable; - return 0; -} - -static void vhost_log_global_start(MemoryListener *listener) -{ - int r; - - r = vhost_migration_log(listener, true); - if (r < 0) { - abort(); - } -} - -static void vhost_log_global_stop(MemoryListener *listener) -{ - int r; - - r = vhost_migration_log(listener, false); - if (r < 0) { - abort(); - } -} - -static void vhost_log_start(MemoryListener *listener, - MemoryRegionSection *section) -{ - /* FIXME: implement */ -} - -static void vhost_log_stop(MemoryListener *listener, - MemoryRegionSection *section) -{ - /* FIXME: implement */ -} - -static int vhost_virtqueue_start(struct vhost_dev *dev, - struct VirtIODevice *vdev, - struct vhost_virtqueue *vq, - unsigned idx) -{ - hwaddr s, l, a; - int r; - int vhost_vq_index = idx - dev->vq_index; - struct vhost_vring_file file = { - .index = vhost_vq_index - }; - struct vhost_vring_state state = { - .index = vhost_vq_index - }; - struct VirtQueue *vvq = virtio_get_queue(vdev, idx); - - assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs); - - vq->num = state.num = virtio_queue_get_num(vdev, idx); - r = ioctl(dev->control, VHOST_SET_VRING_NUM, &state); - if (r) { - return -errno; - } - - state.num = virtio_queue_get_last_avail_idx(vdev, idx); - r = ioctl(dev->control, VHOST_SET_VRING_BASE, &state); - if (r) { - return -errno; - } - - s = l = virtio_queue_get_desc_size(vdev, idx); - a = virtio_queue_get_desc_addr(vdev, idx); - vq->desc = cpu_physical_memory_map(a, &l, 0); - if (!vq->desc || l != s) { - r = -ENOMEM; - goto fail_alloc_desc; - } - s = l = virtio_queue_get_avail_size(vdev, idx); - a = virtio_queue_get_avail_addr(vdev, idx); - vq->avail = cpu_physical_memory_map(a, &l, 0); - if (!vq->avail || l != s) { - r = -ENOMEM; - goto fail_alloc_avail; - } - vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx); - vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx); - vq->used = cpu_physical_memory_map(a, &l, 1); - if (!vq->used || l != s) { - r = -ENOMEM; - goto fail_alloc_used; - } - - vq->ring_size = s = l = virtio_queue_get_ring_size(vdev, idx); - vq->ring_phys = a = virtio_queue_get_ring_addr(vdev, idx); - vq->ring = cpu_physical_memory_map(a, &l, 1); - if (!vq->ring || l != s) { - r = -ENOMEM; - goto fail_alloc_ring; - } - - r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled); - if (r < 0) { - r = 
-errno; - goto fail_alloc; - } - - file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq)); - r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file); - if (r) { - r = -errno; - goto fail_kick; - } - - /* Clear and discard previous events if any. */ - event_notifier_test_and_clear(&vq->masked_notifier); - - return 0; - -fail_kick: -fail_alloc: - cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx), - 0, 0); -fail_alloc_ring: - cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx), - 0, 0); -fail_alloc_used: - cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx), - 0, 0); -fail_alloc_avail: - cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx), - 0, 0); -fail_alloc_desc: - return r; -} - -static void vhost_virtqueue_stop(struct vhost_dev *dev, - struct VirtIODevice *vdev, - struct vhost_virtqueue *vq, - unsigned idx) -{ - struct vhost_vring_state state = { - .index = idx - dev->vq_index - }; - int r; - assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs); - r = ioctl(dev->control, VHOST_GET_VRING_BASE, &state); - if (r < 0) { - fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r); - fflush(stderr); - } - virtio_queue_set_last_avail_idx(vdev, idx, state.num); - assert (r >= 0); - cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx), - 0, virtio_queue_get_ring_size(vdev, idx)); - cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx), - 1, virtio_queue_get_used_size(vdev, idx)); - cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx), - 0, virtio_queue_get_avail_size(vdev, idx)); - cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx), - 0, virtio_queue_get_desc_size(vdev, idx)); -} - -static void vhost_eventfd_add(MemoryListener *listener, - MemoryRegionSection *section, - bool match_data, uint64_t data, EventNotifier *e) -{ -} - -static void vhost_eventfd_del(MemoryListener *listener, - MemoryRegionSection *section, - bool match_data, uint64_t data, EventNotifier *e) -{ -} - -static int vhost_virtqueue_init(struct vhost_dev *dev, - struct vhost_virtqueue *vq, int n) -{ - struct vhost_vring_file file = { - .index = n, - }; - int r = event_notifier_init(&vq->masked_notifier, 0); - if (r < 0) { - return r; - } - - file.fd = event_notifier_get_fd(&vq->masked_notifier); - r = ioctl(dev->control, VHOST_SET_VRING_CALL, &file); - if (r) { - r = -errno; - goto fail_call; - } - return 0; -fail_call: - event_notifier_cleanup(&vq->masked_notifier); - return r; -} - -static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq) -{ - event_notifier_cleanup(&vq->masked_notifier); -} - -int vhost_dev_init(struct vhost_dev *hdev, int devfd, const char *devpath, - bool force) -{ - uint64_t features; - int i, r; - if (devfd >= 0) { - hdev->control = devfd; - } else { - hdev->control = open(devpath, O_RDWR); - if (hdev->control < 0) { - return -errno; - } - } - r = ioctl(hdev->control, VHOST_SET_OWNER, NULL); - if (r < 0) { - goto fail; - } - - r = ioctl(hdev->control, VHOST_GET_FEATURES, &features); - if (r < 0) { - goto fail; - } - - for (i = 0; i < hdev->nvqs; ++i) { - r = vhost_virtqueue_init(hdev, hdev->vqs + i, i); - if (r < 0) { - goto fail_vq; - } - } - hdev->features = features; - - hdev->memory_listener = (MemoryListener) { - .begin = vhost_begin, - .commit = vhost_commit, - .region_add = vhost_region_add, - .region_del = vhost_region_del, - .region_nop = vhost_region_nop, - .log_start = 
vhost_log_start, - .log_stop = vhost_log_stop, - .log_sync = vhost_log_sync, - .log_global_start = vhost_log_global_start, - .log_global_stop = vhost_log_global_stop, - .eventfd_add = vhost_eventfd_add, - .eventfd_del = vhost_eventfd_del, - .priority = 10 - }; - hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions)); - hdev->n_mem_sections = 0; - hdev->mem_sections = NULL; - hdev->log = NULL; - hdev->log_size = 0; - hdev->log_enabled = false; - hdev->started = false; - memory_listener_register(&hdev->memory_listener, &address_space_memory); - hdev->force = force; - return 0; -fail_vq: - while (--i >= 0) { - vhost_virtqueue_cleanup(hdev->vqs + i); - } -fail: - r = -errno; - close(hdev->control); - return r; -} - -void vhost_dev_cleanup(struct vhost_dev *hdev) -{ - int i; - for (i = 0; i < hdev->nvqs; ++i) { - vhost_virtqueue_cleanup(hdev->vqs + i); - } - memory_listener_unregister(&hdev->memory_listener); - g_free(hdev->mem); - g_free(hdev->mem_sections); - close(hdev->control); -} - -bool vhost_dev_query(struct vhost_dev *hdev, VirtIODevice *vdev) -{ - return !vdev->binding->query_guest_notifiers || - vdev->binding->query_guest_notifiers(vdev->binding_opaque) || - hdev->force; -} - -/* Stop processing guest IO notifications in qemu. - * Start processing them in vhost in kernel. - */ -int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) -{ - int i, r; - if (!vdev->binding->set_host_notifier) { - fprintf(stderr, "binding does not support host notifiers\n"); - r = -ENOSYS; - goto fail; - } - - for (i = 0; i < hdev->nvqs; ++i) { - r = vdev->binding->set_host_notifier(vdev->binding_opaque, - hdev->vq_index + i, - true); - if (r < 0) { - fprintf(stderr, "vhost VQ %d notifier binding failed: %d\n", i, -r); - goto fail_vq; - } - } - - return 0; -fail_vq: - while (--i >= 0) { - r = vdev->binding->set_host_notifier(vdev->binding_opaque, - hdev->vq_index + i, - false); - if (r < 0) { - fprintf(stderr, "vhost VQ %d notifier cleanup error: %d\n", i, -r); - fflush(stderr); - } - assert (r >= 0); - } -fail: - return r; -} - -/* Stop processing guest IO notifications in vhost. - * Start processing them in qemu. - * This might actually run the qemu handlers right away, - * so virtio in qemu must be completely setup when this is called. - */ -void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) -{ - int i, r; - - for (i = 0; i < hdev->nvqs; ++i) { - r = vdev->binding->set_host_notifier(vdev->binding_opaque, - hdev->vq_index + i, - false); - if (r < 0) { - fprintf(stderr, "vhost VQ %d notifier cleanup failed: %d\n", i, -r); - fflush(stderr); - } - assert (r >= 0); - } -} - -/* Test and clear event pending status. - * Should be called after unmask to avoid losing events. - */ -bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n) -{ - struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index; - assert(hdev->started); - assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs); - return event_notifier_test_and_clear(&vq->masked_notifier); -} - -/* Mask/unmask events from this vq. 
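- * Masking redirects VHOST_SET_VRING_CALL to an internal notifier that
- * qemu polls via vhost_virtqueue_pending(); unmasking points it back
- * at the guest notifier so the kernel signals the guest directly.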
*/ -void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n, - bool mask) -{ - struct VirtQueue *vvq = virtio_get_queue(vdev, n); - int r, index = n - hdev->vq_index; - - assert(hdev->started); - assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs); - - struct vhost_vring_file file = { - .index = index - }; - if (mask) { - file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier); - } else { - file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq)); - } - r = ioctl(hdev->control, VHOST_SET_VRING_CALL, &file); - assert(r >= 0); -} - -/* Host notifiers must be enabled at this point. */ -int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev) -{ - int i, r; - - hdev->started = true; - - r = vhost_dev_set_features(hdev, hdev->log_enabled); - if (r < 0) { - goto fail_features; - } - r = ioctl(hdev->control, VHOST_SET_MEM_TABLE, hdev->mem); - if (r < 0) { - r = -errno; - goto fail_mem; - } - for (i = 0; i < hdev->nvqs; ++i) { - r = vhost_virtqueue_start(hdev, - vdev, - hdev->vqs + i, - hdev->vq_index + i); - if (r < 0) { - goto fail_vq; - } - } - - if (hdev->log_enabled) { - hdev->log_size = vhost_get_log_size(hdev); - hdev->log = hdev->log_size ? - g_malloc0(hdev->log_size * sizeof *hdev->log) : NULL; - r = ioctl(hdev->control, VHOST_SET_LOG_BASE, - (uint64_t)(unsigned long)hdev->log); - if (r < 0) { - r = -errno; - goto fail_log; - } - } - - return 0; -fail_log: -fail_vq: - while (--i >= 0) { - vhost_virtqueue_stop(hdev, - vdev, - hdev->vqs + i, - hdev->vq_index + i); - } - i = hdev->nvqs; -fail_mem: -fail_features: - - hdev->started = false; - return r; -} - -/* Host notifiers must be enabled at this point. */ -void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev) -{ - int i; - - for (i = 0; i < hdev->nvqs; ++i) { - vhost_virtqueue_stop(hdev, - vdev, - hdev->vqs + i, - hdev->vq_index + i); - } - vhost_log_sync_range(hdev, 0, ~0x0ull); - - hdev->started = false; - g_free(hdev->log); - hdev->log = NULL; - hdev->log_size = 0; -} - diff --git a/hw/vhost_net.c b/hw/vhost_net.c deleted file mode 100644 index 8c5384cf76..0000000000 --- a/hw/vhost_net.c +++ /dev/null @@ -1,328 +0,0 @@ -/* - * vhost-net support - * - * Copyright Red Hat, Inc. 2010 - * - * Authors: - * Michael S. Tsirkin - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. - */ - -#include "net/net.h" -#include "net/tap.h" - -#include "hw/virtio/virtio-net.h" -#include "net/vhost_net.h" -#include "qemu/error-report.h" - -#include "config.h" - -#ifdef CONFIG_VHOST_NET -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "hw/virtio/vhost.h" - -struct vhost_net { - struct vhost_dev dev; - struct vhost_virtqueue vqs[2]; - int backend; - NetClientState *nc; -}; - -unsigned vhost_net_get_features(struct vhost_net *net, unsigned features) -{ - /* Clear features not supported by host kernel. 
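- * Only bits that both qemu and the vhost kernel module implement
- * may be offered to the guest.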
*/ - if (!(net->dev.features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY))) { - features &= ~(1 << VIRTIO_F_NOTIFY_ON_EMPTY); - } - if (!(net->dev.features & (1 << VIRTIO_RING_F_INDIRECT_DESC))) { - features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC); - } - if (!(net->dev.features & (1 << VIRTIO_RING_F_EVENT_IDX))) { - features &= ~(1 << VIRTIO_RING_F_EVENT_IDX); - } - if (!(net->dev.features & (1 << VIRTIO_NET_F_MRG_RXBUF))) { - features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF); - } - return features; -} - -void vhost_net_ack_features(struct vhost_net *net, unsigned features) -{ - net->dev.acked_features = net->dev.backend_features; - if (features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) { - net->dev.acked_features |= (1 << VIRTIO_F_NOTIFY_ON_EMPTY); - } - if (features & (1 << VIRTIO_RING_F_INDIRECT_DESC)) { - net->dev.acked_features |= (1 << VIRTIO_RING_F_INDIRECT_DESC); - } - if (features & (1 << VIRTIO_RING_F_EVENT_IDX)) { - net->dev.acked_features |= (1 << VIRTIO_RING_F_EVENT_IDX); - } - if (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { - net->dev.acked_features |= (1 << VIRTIO_NET_F_MRG_RXBUF); - } -} - -static int vhost_net_get_fd(NetClientState *backend) -{ - switch (backend->info->type) { - case NET_CLIENT_OPTIONS_KIND_TAP: - return tap_get_fd(backend); - default: - fprintf(stderr, "vhost-net requires tap backend\n"); - return -EBADFD; - } -} - -struct vhost_net *vhost_net_init(NetClientState *backend, int devfd, - bool force) -{ - int r; - struct vhost_net *net = g_malloc(sizeof *net); - if (!backend) { - fprintf(stderr, "vhost-net requires backend to be setup\n"); - goto fail; - } - r = vhost_net_get_fd(backend); - if (r < 0) { - goto fail; - } - net->nc = backend; - net->dev.backend_features = tap_has_vnet_hdr(backend) ? 0 : - (1 << VHOST_NET_F_VIRTIO_NET_HDR); - net->backend = r; - - net->dev.nvqs = 2; - net->dev.vqs = net->vqs; - - r = vhost_dev_init(&net->dev, devfd, "/dev/vhost-net", force); - if (r < 0) { - goto fail; - } - if (!tap_has_vnet_hdr_len(backend, - sizeof(struct virtio_net_hdr_mrg_rxbuf))) { - net->dev.features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF); - } - if (~net->dev.features & net->dev.backend_features) { - fprintf(stderr, "vhost lacks feature mask %" PRIu64 " for backend\n", - (uint64_t)(~net->dev.features & net->dev.backend_features)); - vhost_dev_cleanup(&net->dev); - goto fail; - } - - /* Set sane init value. Override when guest acks. 
*/ - vhost_net_ack_features(net, 0); - return net; -fail: - g_free(net); - return NULL; -} - -bool vhost_net_query(VHostNetState *net, VirtIODevice *dev) -{ - return vhost_dev_query(&net->dev, dev); -} - -static int vhost_net_start_one(struct vhost_net *net, - VirtIODevice *dev, - int vq_index) -{ - struct vhost_vring_file file = { }; - int r; - - if (net->dev.started) { - return 0; - } - - net->dev.nvqs = 2; - net->dev.vqs = net->vqs; - net->dev.vq_index = vq_index; - - r = vhost_dev_enable_notifiers(&net->dev, dev); - if (r < 0) { - goto fail_notifiers; - } - - r = vhost_dev_start(&net->dev, dev); - if (r < 0) { - goto fail_start; - } - - net->nc->info->poll(net->nc, false); - qemu_set_fd_handler(net->backend, NULL, NULL, NULL); - file.fd = net->backend; - for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { - r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); - if (r < 0) { - r = -errno; - goto fail; - } - } - return 0; -fail: - file.fd = -1; - while (file.index-- > 0) { - int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); - assert(r >= 0); - } - net->nc->info->poll(net->nc, true); - vhost_dev_stop(&net->dev, dev); -fail_start: - vhost_dev_disable_notifiers(&net->dev, dev); -fail_notifiers: - return r; -} - -static void vhost_net_stop_one(struct vhost_net *net, - VirtIODevice *dev) -{ - struct vhost_vring_file file = { .fd = -1 }; - - if (!net->dev.started) { - return; - } - - for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { - int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); - assert(r >= 0); - } - net->nc->info->poll(net->nc, true); - vhost_dev_stop(&net->dev, dev); - vhost_dev_disable_notifiers(&net->dev, dev); -} - -int vhost_net_start(VirtIODevice *dev, NetClientState *ncs, - int total_queues) -{ - int r, i = 0; - - if (!dev->binding->set_guest_notifiers) { - error_report("binding does not support guest notifiers"); - r = -ENOSYS; - goto err; - } - - for (i = 0; i < total_queues; i++) { - r = vhost_net_start_one(tap_get_vhost_net(ncs[i].peer), dev, i * 2); - - if (r < 0) { - goto err; - } - } - - r = dev->binding->set_guest_notifiers(dev->binding_opaque, - total_queues * 2, - true); - if (r < 0) { - error_report("Error binding guest notifier: %d", -r); - goto err; - } - - return 0; - -err: - while (--i >= 0) { - vhost_net_stop_one(tap_get_vhost_net(ncs[i].peer), dev); - } - return r; -} - -void vhost_net_stop(VirtIODevice *dev, NetClientState *ncs, - int total_queues) -{ - int i, r; - - r = dev->binding->set_guest_notifiers(dev->binding_opaque, - total_queues * 2, - false); - if (r < 0) { - fprintf(stderr, "vhost guest notifier cleanup failed: %d\n", r); - fflush(stderr); - } - assert(r >= 0); - - for (i = 0; i < total_queues; i++) { - vhost_net_stop_one(tap_get_vhost_net(ncs[i].peer), dev); - } -} - -void vhost_net_cleanup(struct vhost_net *net) -{ - vhost_dev_cleanup(&net->dev); - g_free(net); -} - -bool vhost_net_virtqueue_pending(VHostNetState *net, int idx) -{ - return vhost_virtqueue_pending(&net->dev, idx); -} - -void vhost_net_virtqueue_mask(VHostNetState *net, VirtIODevice *dev, - int idx, bool mask) -{ - vhost_virtqueue_mask(&net->dev, dev, idx, mask); -} -#else -struct vhost_net *vhost_net_init(NetClientState *backend, int devfd, - bool force) -{ - error_report("vhost-net support is not compiled in"); - return NULL; -} - -bool vhost_net_query(VHostNetState *net, VirtIODevice *dev) -{ - return false; -} - -int vhost_net_start(VirtIODevice *dev, - NetClientState *ncs, - int total_queues) -{ - return -ENOSYS; -} 
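The get_features/ack_features pair above is plain mask arithmetic. A minimal
stand-alone sketch of the same host-capability clamping, using the standard
legacy virtio bit positions rather than the QEMU headers:

#include <stdio.h>

#define F_NOTIFY_ON_EMPTY (1u << 24)   /* VIRTIO_F_NOTIFY_ON_EMPTY */
#define F_INDIRECT_DESC   (1u << 28)   /* VIRTIO_RING_F_INDIRECT_DESC */
#define F_MRG_RXBUF       (1u << 15)   /* VIRTIO_NET_F_MRG_RXBUF */

/* Drop feature bits the host kernel cannot service. */
static unsigned clamp_to_host(unsigned host, unsigned offered)
{
    return offered & host;
}

int main(void)
{
    unsigned host    = F_NOTIFY_ON_EMPTY | F_MRG_RXBUF;
    unsigned offered = F_NOTIFY_ON_EMPTY | F_INDIRECT_DESC | F_MRG_RXBUF;

    /* INDIRECT_DESC is dropped because the host never advertised it. */
    printf("offered to guest: %#x\n", clamp_to_host(host, offered));
    return 0;
}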
-void vhost_net_stop(VirtIODevice *dev, - NetClientState *ncs, - int total_queues) -{ -} - -void vhost_net_cleanup(struct vhost_net *net) -{ -} - -unsigned vhost_net_get_features(struct vhost_net *net, unsigned features) -{ - return features; -} -void vhost_net_ack_features(struct vhost_net *net, unsigned features) -{ -} - -bool vhost_net_virtqueue_pending(VHostNetState *net, int idx) -{ - return -ENOSYS; -} - -void vhost_net_virtqueue_mask(VHostNetState *net, VirtIODevice *dev, - int idx, bool mask) -{ -} -#endif diff --git a/hw/virtio-balloon.c b/hw/virtio-balloon.c deleted file mode 100644 index c2c446eb9b..0000000000 --- a/hw/virtio-balloon.c +++ /dev/null @@ -1,416 +0,0 @@ -/* - * Virtio Balloon Device - * - * Copyright IBM, Corp. 2008 - * Copyright (C) 2011 Red Hat, Inc. - * Copyright (C) 2011 Amit Shah - * - * Authors: - * Anthony Liguori - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include "qemu/iov.h" -#include "qemu/timer.h" -#include "qemu-common.h" -#include "hw/virtio/virtio.h" -#include "hw/i386/pc.h" -#include "cpu.h" -#include "sysemu/balloon.h" -#include "hw/virtio/virtio-balloon.h" -#include "sysemu/kvm.h" -#include "exec/address-spaces.h" -#include "qapi/visitor.h" - -#if defined(__linux__) -#include -#endif - -#include "hw/virtio/virtio-bus.h" - -static void balloon_page(void *addr, int deflate) -{ -#if defined(__linux__) - if (!kvm_enabled() || kvm_has_sync_mmu()) - qemu_madvise(addr, TARGET_PAGE_SIZE, - deflate ? QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED); -#endif -} - -static const char *balloon_stat_names[] = { - [VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in", - [VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out", - [VIRTIO_BALLOON_S_MAJFLT] = "stat-major-faults", - [VIRTIO_BALLOON_S_MINFLT] = "stat-minor-faults", - [VIRTIO_BALLOON_S_MEMFREE] = "stat-free-memory", - [VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory", - [VIRTIO_BALLOON_S_NR] = NULL -}; - -/* - * reset_stats - Mark all items in the stats array as unset - * - * This function needs to be called at device intialization and before - * before updating to a set of newly-generated stats. This will ensure that no - * stale values stick around in case the guest reports a subset of the supported - * statistics. 
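- * Entries the guest leaves untouched then read back as -1, which
- * consumers treat as "unset".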
- */ -static inline void reset_stats(VirtIOBalloon *dev) -{ - int i; - for (i = 0; i < VIRTIO_BALLOON_S_NR; dev->stats[i++] = -1); -} - -static bool balloon_stats_supported(const VirtIOBalloon *s) -{ - VirtIODevice *vdev = VIRTIO_DEVICE(s); - return vdev->guest_features & (1 << VIRTIO_BALLOON_F_STATS_VQ); -} - -static bool balloon_stats_enabled(const VirtIOBalloon *s) -{ - return s->stats_poll_interval > 0; -} - -static void balloon_stats_destroy_timer(VirtIOBalloon *s) -{ - if (balloon_stats_enabled(s)) { - qemu_del_timer(s->stats_timer); - qemu_free_timer(s->stats_timer); - s->stats_timer = NULL; - s->stats_poll_interval = 0; - } -} - -static void balloon_stats_change_timer(VirtIOBalloon *s, int secs) -{ - qemu_mod_timer(s->stats_timer, qemu_get_clock_ms(vm_clock) + secs * 1000); -} - -static void balloon_stats_poll_cb(void *opaque) -{ - VirtIOBalloon *s = opaque; - VirtIODevice *vdev = VIRTIO_DEVICE(s); - - if (!balloon_stats_supported(s)) { - /* re-schedule */ - balloon_stats_change_timer(s, s->stats_poll_interval); - return; - } - - virtqueue_push(s->svq, &s->stats_vq_elem, s->stats_vq_offset); - virtio_notify(vdev, s->svq); -} - -static void balloon_stats_get_all(Object *obj, struct Visitor *v, - void *opaque, const char *name, Error **errp) -{ - VirtIOBalloon *s = opaque; - int i; - - if (!s->stats_last_update) { - error_setg(errp, "guest hasn't updated any stats yet"); - return; - } - - visit_start_struct(v, NULL, "guest-stats", name, 0, errp); - visit_type_int(v, &s->stats_last_update, "last-update", errp); - - visit_start_struct(v, NULL, NULL, "stats", 0, errp); - for (i = 0; i < VIRTIO_BALLOON_S_NR; i++) { - visit_type_int64(v, (int64_t *) &s->stats[i], balloon_stat_names[i], - errp); - } - visit_end_struct(v, errp); - - visit_end_struct(v, errp); -} - -static void balloon_stats_get_poll_interval(Object *obj, struct Visitor *v, - void *opaque, const char *name, - Error **errp) -{ - VirtIOBalloon *s = opaque; - visit_type_int(v, &s->stats_poll_interval, name, errp); -} - -static void balloon_stats_set_poll_interval(Object *obj, struct Visitor *v, - void *opaque, const char *name, - Error **errp) -{ - VirtIOBalloon *s = opaque; - int64_t value; - - visit_type_int(v, &value, name, errp); - if (error_is_set(errp)) { - return; - } - - if (value < 0) { - error_setg(errp, "timer value must be greater than zero"); - return; - } - - if (value == s->stats_poll_interval) { - return; - } - - if (value == 0) { - /* timer=0 disables the timer */ - balloon_stats_destroy_timer(s); - return; - } - - if (balloon_stats_enabled(s)) { - /* timer interval change */ - s->stats_poll_interval = value; - balloon_stats_change_timer(s, value); - return; - } - - /* create a new timer */ - g_assert(s->stats_timer == NULL); - s->stats_timer = qemu_new_timer_ms(vm_clock, balloon_stats_poll_cb, s); - s->stats_poll_interval = value; - balloon_stats_change_timer(s, 0); -} - -static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) -{ - VirtIOBalloon *s = VIRTIO_BALLOON(vdev); - VirtQueueElement elem; - MemoryRegionSection section; - - while (virtqueue_pop(vq, &elem)) { - size_t offset = 0; - uint32_t pfn; - - while (iov_to_buf(elem.out_sg, elem.out_num, offset, &pfn, 4) == 4) { - ram_addr_t pa; - ram_addr_t addr; - - pa = (ram_addr_t)ldl_p(&pfn) << VIRTIO_BALLOON_PFN_SHIFT; - offset += 4; - - /* FIXME: remove get_system_memory(), but how? 
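- * The guest hands us a bare PFN, so some address space is needed
- * to translate it back into host RAM.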
*/ - section = memory_region_find(get_system_memory(), pa, 1); - if (!section.size || !memory_region_is_ram(section.mr)) - continue; - - /* Using memory_region_get_ram_ptr is bending the rules a bit, but - should be OK because we only want a single page. */ - addr = section.offset_within_region; - balloon_page(memory_region_get_ram_ptr(section.mr) + addr, - !!(vq == s->dvq)); - } - - virtqueue_push(vq, &elem, offset); - virtio_notify(vdev, vq); - } -} - -static void virtio_balloon_receive_stats(VirtIODevice *vdev, VirtQueue *vq) -{ - VirtIOBalloon *s = VIRTIO_BALLOON(vdev); - VirtQueueElement *elem = &s->stats_vq_elem; - VirtIOBalloonStat stat; - size_t offset = 0; - qemu_timeval tv; - - if (!virtqueue_pop(vq, elem)) { - goto out; - } - - /* Initialize the stats to get rid of any stale values. This is only - * needed to handle the case where a guest supports fewer stats than it - * used to (ie. it has booted into an old kernel). - */ - reset_stats(s); - - while (iov_to_buf(elem->out_sg, elem->out_num, offset, &stat, sizeof(stat)) - == sizeof(stat)) { - uint16_t tag = tswap16(stat.tag); - uint64_t val = tswap64(stat.val); - - offset += sizeof(stat); - if (tag < VIRTIO_BALLOON_S_NR) - s->stats[tag] = val; - } - s->stats_vq_offset = offset; - - if (qemu_gettimeofday(&tv) < 0) { - fprintf(stderr, "warning: %s: failed to get time of day\n", __func__); - goto out; - } - - s->stats_last_update = tv.tv_sec; - -out: - if (balloon_stats_enabled(s)) { - balloon_stats_change_timer(s, s->stats_poll_interval); - } -} - -static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) -{ - VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); - struct virtio_balloon_config config; - - config.num_pages = cpu_to_le32(dev->num_pages); - config.actual = cpu_to_le32(dev->actual); - - memcpy(config_data, &config, 8); -} - -static void virtio_balloon_set_config(VirtIODevice *vdev, - const uint8_t *config_data) -{ - VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); - struct virtio_balloon_config config; - uint32_t oldactual = dev->actual; - memcpy(&config, config_data, 8); - dev->actual = le32_to_cpu(config.actual); - if (dev->actual != oldactual) { - qemu_balloon_changed(ram_size - - (dev->actual << VIRTIO_BALLOON_PFN_SHIFT)); - } -} - -static uint32_t virtio_balloon_get_features(VirtIODevice *vdev, uint32_t f) -{ - f |= (1 << VIRTIO_BALLOON_F_STATS_VQ); - return f; -} - -static void virtio_balloon_stat(void *opaque, BalloonInfo *info) -{ - VirtIOBalloon *dev = opaque; - info->actual = ram_size - ((uint64_t) dev->actual << - VIRTIO_BALLOON_PFN_SHIFT); -} - -static void virtio_balloon_to_target(void *opaque, ram_addr_t target) -{ - VirtIOBalloon *dev = VIRTIO_BALLOON(opaque); - VirtIODevice *vdev = VIRTIO_DEVICE(dev); - - if (target > ram_size) { - target = ram_size; - } - if (target) { - dev->num_pages = (ram_size - target) >> VIRTIO_BALLOON_PFN_SHIFT; - virtio_notify_config(vdev); - } -} - -static void virtio_balloon_save(QEMUFile *f, void *opaque) -{ - VirtIOBalloon *s = VIRTIO_BALLOON(opaque); - VirtIODevice *vdev = VIRTIO_DEVICE(s); - - virtio_save(vdev, f); - - qemu_put_be32(f, s->num_pages); - qemu_put_be32(f, s->actual); -} - -static int virtio_balloon_load(QEMUFile *f, void *opaque, int version_id) -{ - VirtIOBalloon *s = VIRTIO_BALLOON(opaque); - VirtIODevice *vdev = VIRTIO_DEVICE(s); - int ret; - - if (version_id != 1) - return -EINVAL; - - ret = virtio_load(vdev, f); - if (ret) { - return ret; - } - - s->num_pages = qemu_get_be32(f); - s->actual = qemu_get_be32(f); - return 0; -} - -static int 
virtio_balloon_device_init(VirtIODevice *vdev) -{ - DeviceState *qdev = DEVICE(vdev); - VirtIOBalloon *s = VIRTIO_BALLOON(vdev); - int ret; - - virtio_init(vdev, "virtio-balloon", VIRTIO_ID_BALLOON, 8); - - vdev->get_config = virtio_balloon_get_config; - vdev->set_config = virtio_balloon_set_config; - vdev->get_features = virtio_balloon_get_features; - - ret = qemu_add_balloon_handler(virtio_balloon_to_target, - virtio_balloon_stat, s); - - if (ret < 0) { - virtio_common_cleanup(VIRTIO_DEVICE(s)); - return -1; - } - - s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); - s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); - s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats); - - register_savevm(qdev, "virtio-balloon", -1, 1, - virtio_balloon_save, virtio_balloon_load, s); - - object_property_add(OBJECT(qdev), "guest-stats", "guest statistics", - balloon_stats_get_all, NULL, NULL, s, NULL); - - object_property_add(OBJECT(qdev), "guest-stats-polling-interval", "int", - balloon_stats_get_poll_interval, - balloon_stats_set_poll_interval, - NULL, s, NULL); - return 0; -} - -static int virtio_balloon_device_exit(DeviceState *qdev) -{ - VirtIOBalloon *s = VIRTIO_BALLOON(qdev); - VirtIODevice *vdev = VIRTIO_DEVICE(qdev); - - balloon_stats_destroy_timer(s); - qemu_remove_balloon_handler(s); - unregister_savevm(qdev, "virtio-balloon", s); - virtio_common_cleanup(vdev); - return 0; -} - -static Property virtio_balloon_properties[] = { - DEFINE_PROP_END_OF_LIST(), -}; - -static void virtio_balloon_class_init(ObjectClass *klass, void *data) -{ - DeviceClass *dc = DEVICE_CLASS(klass); - VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); - dc->exit = virtio_balloon_device_exit; - dc->props = virtio_balloon_properties; - vdc->init = virtio_balloon_device_init; - vdc->get_config = virtio_balloon_get_config; - vdc->set_config = virtio_balloon_set_config; - vdc->get_features = virtio_balloon_get_features; -} - -static const TypeInfo virtio_balloon_info = { - .name = TYPE_VIRTIO_BALLOON, - .parent = TYPE_VIRTIO_DEVICE, - .instance_size = sizeof(VirtIOBalloon), - .class_init = virtio_balloon_class_init, -}; - -static void virtio_register_types(void) -{ - type_register_static(&virtio_balloon_info); -} - -type_init(virtio_register_types) diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c deleted file mode 100644 index 6efb2f063d..0000000000 --- a/hw/virtio-blk.c +++ /dev/null @@ -1,732 +0,0 @@ -/* - * Virtio Block Device - * - * Copyright IBM, Corp. 2007 - * - * Authors: - * Anthony Liguori - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. 
- * - */ - -#include "qemu-common.h" -#include "qemu/error-report.h" -#include "trace.h" -#include "hw/block/block.h" -#include "sysemu/blockdev.h" -#include "hw/virtio/virtio-blk.h" -#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE -# include "dataplane/virtio-blk.h" -#endif -#include "block/scsi.h" -#ifdef __linux__ -# include -#endif -#include "hw/virtio/virtio-bus.h" - -typedef struct VirtIOBlockReq -{ - VirtIOBlock *dev; - VirtQueueElement elem; - struct virtio_blk_inhdr *in; - struct virtio_blk_outhdr *out; - struct virtio_scsi_inhdr *scsi; - QEMUIOVector qiov; - struct VirtIOBlockReq *next; - BlockAcctCookie acct; -} VirtIOBlockReq; - -static void virtio_blk_req_complete(VirtIOBlockReq *req, int status) -{ - VirtIOBlock *s = req->dev; - VirtIODevice *vdev = VIRTIO_DEVICE(s); - - trace_virtio_blk_req_complete(req, status); - - stb_p(&req->in->status, status); - virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in)); - virtio_notify(vdev, s->vq); -} - -static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error, - bool is_read) -{ - BlockErrorAction action = bdrv_get_error_action(req->dev->bs, is_read, error); - VirtIOBlock *s = req->dev; - - if (action == BDRV_ACTION_STOP) { - req->next = s->rq; - s->rq = req; - } else if (action == BDRV_ACTION_REPORT) { - virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR); - bdrv_acct_done(s->bs, &req->acct); - g_free(req); - } - - bdrv_error_action(s->bs, action, is_read, error); - return action != BDRV_ACTION_IGNORE; -} - -static void virtio_blk_rw_complete(void *opaque, int ret) -{ - VirtIOBlockReq *req = opaque; - - trace_virtio_blk_rw_complete(req, ret); - - if (ret) { - bool is_read = !(ldl_p(&req->out->type) & VIRTIO_BLK_T_OUT); - if (virtio_blk_handle_rw_error(req, -ret, is_read)) - return; - } - - virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); - bdrv_acct_done(req->dev->bs, &req->acct); - g_free(req); -} - -static void virtio_blk_flush_complete(void *opaque, int ret) -{ - VirtIOBlockReq *req = opaque; - - if (ret) { - if (virtio_blk_handle_rw_error(req, -ret, 0)) { - return; - } - } - - virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); - bdrv_acct_done(req->dev->bs, &req->acct); - g_free(req); -} - -static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s) -{ - VirtIOBlockReq *req = g_malloc(sizeof(*req)); - req->dev = s; - req->qiov.size = 0; - req->next = NULL; - return req; -} - -static VirtIOBlockReq *virtio_blk_get_request(VirtIOBlock *s) -{ - VirtIOBlockReq *req = virtio_blk_alloc_request(s); - - if (req != NULL) { - if (!virtqueue_pop(s->vq, &req->elem)) { - g_free(req); - return NULL; - } - } - - return req; -} - -static void virtio_blk_handle_scsi(VirtIOBlockReq *req) -{ -#ifdef __linux__ - int ret; - int i; -#endif - int status = VIRTIO_BLK_S_OK; - - /* - * We require at least one output segment each for the virtio_blk_outhdr - * and the SCSI command block. - * - * We also at least require the virtio_blk_inhdr, the virtio_scsi_inhdr - * and the sense buffer pointer in the input segments. - */ - if (req->elem.out_num < 2 || req->elem.in_num < 3) { - virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR); - g_free(req); - return; - } - - /* - * The scsi inhdr is placed in the second-to-last input segment, just - * before the regular inhdr. - */ - req->scsi = (void *)req->elem.in_sg[req->elem.in_num - 2].iov_base; - - if (!req->dev->blk.scsi) { - status = VIRTIO_BLK_S_UNSUPP; - goto fail; - } - - /* - * No support for bidirection commands yet. 
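- * A request carrying both extra write payload and extra read
- * segments is rejected as VIRTIO_BLK_S_UNSUPP.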
- */ - if (req->elem.out_num > 2 && req->elem.in_num > 3) { - status = VIRTIO_BLK_S_UNSUPP; - goto fail; - } - -#ifdef __linux__ - struct sg_io_hdr hdr; - memset(&hdr, 0, sizeof(struct sg_io_hdr)); - hdr.interface_id = 'S'; - hdr.cmd_len = req->elem.out_sg[1].iov_len; - hdr.cmdp = req->elem.out_sg[1].iov_base; - hdr.dxfer_len = 0; - - if (req->elem.out_num > 2) { - /* - * If there are more than the minimally required 2 output segments - * there is write payload starting from the third iovec. - */ - hdr.dxfer_direction = SG_DXFER_TO_DEV; - hdr.iovec_count = req->elem.out_num - 2; - - for (i = 0; i < hdr.iovec_count; i++) - hdr.dxfer_len += req->elem.out_sg[i + 2].iov_len; - - hdr.dxferp = req->elem.out_sg + 2; - - } else if (req->elem.in_num > 3) { - /* - * If we have more than 3 input segments the guest wants to actually - * read data. - */ - hdr.dxfer_direction = SG_DXFER_FROM_DEV; - hdr.iovec_count = req->elem.in_num - 3; - for (i = 0; i < hdr.iovec_count; i++) - hdr.dxfer_len += req->elem.in_sg[i].iov_len; - - hdr.dxferp = req->elem.in_sg; - } else { - /* - * Some SCSI commands don't actually transfer any data. - */ - hdr.dxfer_direction = SG_DXFER_NONE; - } - - hdr.sbp = req->elem.in_sg[req->elem.in_num - 3].iov_base; - hdr.mx_sb_len = req->elem.in_sg[req->elem.in_num - 3].iov_len; - - ret = bdrv_ioctl(req->dev->bs, SG_IO, &hdr); - if (ret) { - status = VIRTIO_BLK_S_UNSUPP; - goto fail; - } - - /* - * From SCSI-Generic-HOWTO: "Some lower level drivers (e.g. ide-scsi) - * clear the masked_status field [hence status gets cleared too, see - * block/scsi_ioctl.c] even when a CHECK_CONDITION or COMMAND_TERMINATED - * status has occurred. However they do set DRIVER_SENSE in driver_status - * field. Also a (sb_len_wr > 0) indicates there is a sense buffer. - */ - if (hdr.status == 0 && hdr.sb_len_wr > 0) { - hdr.status = CHECK_CONDITION; - } - - stl_p(&req->scsi->errors, - hdr.status | (hdr.msg_status << 8) | - (hdr.host_status << 16) | (hdr.driver_status << 24)); - stl_p(&req->scsi->residual, hdr.resid); - stl_p(&req->scsi->sense_len, hdr.sb_len_wr); - stl_p(&req->scsi->data_len, hdr.dxfer_len); - - virtio_blk_req_complete(req, status); - g_free(req); - return; -#else - abort(); -#endif - -fail: - /* Just put anything nonzero so that the ioctl fails in the guest. */ - stl_p(&req->scsi->errors, 255); - virtio_blk_req_complete(req, status); - g_free(req); -} - -typedef struct MultiReqBuffer { - BlockRequest blkreq[32]; - unsigned int num_writes; -} MultiReqBuffer; - -static void virtio_submit_multiwrite(BlockDriverState *bs, MultiReqBuffer *mrb) -{ - int i, ret; - - if (!mrb->num_writes) { - return; - } - - ret = bdrv_aio_multiwrite(bs, mrb->blkreq, mrb->num_writes); - if (ret != 0) { - for (i = 0; i < mrb->num_writes; i++) { - if (mrb->blkreq[i].error) { - virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO); - } - } - } - - mrb->num_writes = 0; -} - -static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb) -{ - bdrv_acct_start(req->dev->bs, &req->acct, 0, BDRV_ACCT_FLUSH); - - /* - * Make sure all outstanding writes are posted to the backing device. 
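- * The queued multiwrite batch is submitted first so the flush
- * covers every write issued before it.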
- */ - virtio_submit_multiwrite(req->dev->bs, mrb); - bdrv_aio_flush(req->dev->bs, virtio_blk_flush_complete, req); -} - -static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb) -{ - BlockRequest *blkreq; - uint64_t sector; - - sector = ldq_p(&req->out->sector); - - bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_WRITE); - - trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512); - - if (sector & req->dev->sector_mask) { - virtio_blk_rw_complete(req, -EIO); - return; - } - if (req->qiov.size % req->dev->conf->logical_block_size) { - virtio_blk_rw_complete(req, -EIO); - return; - } - - if (mrb->num_writes == 32) { - virtio_submit_multiwrite(req->dev->bs, mrb); - } - - blkreq = &mrb->blkreq[mrb->num_writes]; - blkreq->sector = sector; - blkreq->nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE; - blkreq->qiov = &req->qiov; - blkreq->cb = virtio_blk_rw_complete; - blkreq->opaque = req; - blkreq->error = 0; - - mrb->num_writes++; -} - -static void virtio_blk_handle_read(VirtIOBlockReq *req) -{ - uint64_t sector; - - sector = ldq_p(&req->out->sector); - - bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_READ); - - trace_virtio_blk_handle_read(req, sector, req->qiov.size / 512); - - if (sector & req->dev->sector_mask) { - virtio_blk_rw_complete(req, -EIO); - return; - } - if (req->qiov.size % req->dev->conf->logical_block_size) { - virtio_blk_rw_complete(req, -EIO); - return; - } - bdrv_aio_readv(req->dev->bs, sector, &req->qiov, - req->qiov.size / BDRV_SECTOR_SIZE, - virtio_blk_rw_complete, req); -} - -static void virtio_blk_handle_request(VirtIOBlockReq *req, - MultiReqBuffer *mrb) -{ - uint32_t type; - - if (req->elem.out_num < 1 || req->elem.in_num < 1) { - error_report("virtio-blk missing headers"); - exit(1); - } - - if (req->elem.out_sg[0].iov_len < sizeof(*req->out) || - req->elem.in_sg[req->elem.in_num - 1].iov_len < sizeof(*req->in)) { - error_report("virtio-blk header not in correct element"); - exit(1); - } - - req->out = (void *)req->elem.out_sg[0].iov_base; - req->in = (void *)req->elem.in_sg[req->elem.in_num - 1].iov_base; - - type = ldl_p(&req->out->type); - - if (type & VIRTIO_BLK_T_FLUSH) { - virtio_blk_handle_flush(req, mrb); - } else if (type & VIRTIO_BLK_T_SCSI_CMD) { - virtio_blk_handle_scsi(req); - } else if (type & VIRTIO_BLK_T_GET_ID) { - VirtIOBlock *s = req->dev; - - /* - * NB: per existing s/n string convention the string is - * terminated by '\0' only when shorter than buffer. - */ - strncpy(req->elem.in_sg[0].iov_base, - s->blk.serial ? s->blk.serial : "", - MIN(req->elem.in_sg[0].iov_len, VIRTIO_BLK_ID_BYTES)); - virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); - g_free(req); - } else if (type & VIRTIO_BLK_T_OUT) { - qemu_iovec_init_external(&req->qiov, &req->elem.out_sg[1], - req->elem.out_num - 1); - virtio_blk_handle_write(req, mrb); - } else if (type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) { - /* VIRTIO_BLK_T_IN is 0, so we can't just & it. 
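- * A bitwise test against zero can never match, hence the equality
- * checks; BARRIER requests take the read path here as well.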
*/ - qemu_iovec_init_external(&req->qiov, &req->elem.in_sg[0], - req->elem.in_num - 1); - virtio_blk_handle_read(req); - } else { - virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP); - g_free(req); - } -} - -static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) -{ - VirtIOBlock *s = VIRTIO_BLK(vdev); - VirtIOBlockReq *req; - MultiReqBuffer mrb = { - .num_writes = 0, - }; - -#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE - /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start - * dataplane here instead of waiting for .set_status(). - */ - if (s->dataplane) { - virtio_blk_data_plane_start(s->dataplane); - return; - } -#endif - - while ((req = virtio_blk_get_request(s))) { - virtio_blk_handle_request(req, &mrb); - } - - virtio_submit_multiwrite(s->bs, &mrb); - - /* - * FIXME: Want to check for completions before returning to guest mode, - * so cached reads and writes are reported as quickly as possible. But - * that should be done in the generic block layer. - */ -} - -static void virtio_blk_dma_restart_bh(void *opaque) -{ - VirtIOBlock *s = opaque; - VirtIOBlockReq *req = s->rq; - MultiReqBuffer mrb = { - .num_writes = 0, - }; - - qemu_bh_delete(s->bh); - s->bh = NULL; - - s->rq = NULL; - - while (req) { - virtio_blk_handle_request(req, &mrb); - req = req->next; - } - - virtio_submit_multiwrite(s->bs, &mrb); -} - -static void virtio_blk_dma_restart_cb(void *opaque, int running, - RunState state) -{ - VirtIOBlock *s = opaque; - - if (!running) { - return; - } - - if (!s->bh) { - s->bh = qemu_bh_new(virtio_blk_dma_restart_bh, s); - qemu_bh_schedule(s->bh); - } -} - -static void virtio_blk_reset(VirtIODevice *vdev) -{ -#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE - VirtIOBlock *s = VIRTIO_BLK(vdev); - - if (s->dataplane) { - virtio_blk_data_plane_stop(s->dataplane); - } -#endif - - /* - * This should cancel pending requests, but can't do nicely until there - * are per-device request lists. - */ - bdrv_drain_all(); -} - -/* coalesce internal state, copy to pci i/o region 0 - */ -static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config) -{ - VirtIOBlock *s = VIRTIO_BLK(vdev); - struct virtio_blk_config blkcfg; - uint64_t capacity; - int blk_size = s->conf->logical_block_size; - - bdrv_get_geometry(s->bs, &capacity); - memset(&blkcfg, 0, sizeof(blkcfg)); - stq_raw(&blkcfg.capacity, capacity); - stl_raw(&blkcfg.seg_max, 128 - 2); - stw_raw(&blkcfg.cylinders, s->conf->cyls); - stl_raw(&blkcfg.blk_size, blk_size); - stw_raw(&blkcfg.min_io_size, s->conf->min_io_size / blk_size); - stw_raw(&blkcfg.opt_io_size, s->conf->opt_io_size / blk_size); - blkcfg.heads = s->conf->heads; - /* - * We must ensure that the block device capacity is a multiple of - * the logical block size. If that is not the case, lets use - * sector_mask to adopt the geometry to have a correct picture. - * For those devices where the capacity is ok for the given geometry - * we dont touch the sector value of the geometry, since some devices - * (like s390 dasd) need a specific value. Here the capacity is already - * cyls*heads*secs*blk_size and the sector value is not block size - * divided by 512 - instead it is the amount of blk_size blocks - * per track (cylinder). 
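- * The check below therefore only masks secs when capacity and
- * geometry actually disagree.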
- */ - if (bdrv_getlength(s->bs) / s->conf->heads / s->conf->secs % blk_size) { - blkcfg.sectors = s->conf->secs & ~s->sector_mask; - } else { - blkcfg.sectors = s->conf->secs; - } - blkcfg.size_max = 0; - blkcfg.physical_block_exp = get_physical_block_exp(s->conf); - blkcfg.alignment_offset = 0; - blkcfg.wce = bdrv_enable_write_cache(s->bs); - memcpy(config, &blkcfg, sizeof(struct virtio_blk_config)); -} - -static void virtio_blk_set_config(VirtIODevice *vdev, const uint8_t *config) -{ - VirtIOBlock *s = VIRTIO_BLK(vdev); - struct virtio_blk_config blkcfg; - - memcpy(&blkcfg, config, sizeof(blkcfg)); - bdrv_set_enable_write_cache(s->bs, blkcfg.wce != 0); -} - -static uint32_t virtio_blk_get_features(VirtIODevice *vdev, uint32_t features) -{ - VirtIOBlock *s = VIRTIO_BLK(vdev); - - features |= (1 << VIRTIO_BLK_F_SEG_MAX); - features |= (1 << VIRTIO_BLK_F_GEOMETRY); - features |= (1 << VIRTIO_BLK_F_TOPOLOGY); - features |= (1 << VIRTIO_BLK_F_BLK_SIZE); - features |= (1 << VIRTIO_BLK_F_SCSI); - - if (s->blk.config_wce) { - features |= (1 << VIRTIO_BLK_F_CONFIG_WCE); - } - if (bdrv_enable_write_cache(s->bs)) - features |= (1 << VIRTIO_BLK_F_WCE); - - if (bdrv_is_read_only(s->bs)) - features |= 1 << VIRTIO_BLK_F_RO; - - return features; -} - -static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status) -{ - VirtIOBlock *s = VIRTIO_BLK(vdev); - uint32_t features; - -#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE - if (s->dataplane && !(status & (VIRTIO_CONFIG_S_DRIVER | - VIRTIO_CONFIG_S_DRIVER_OK))) { - virtio_blk_data_plane_stop(s->dataplane); - } -#endif - - if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) { - return; - } - - features = vdev->guest_features; - bdrv_set_enable_write_cache(s->bs, !!(features & (1 << VIRTIO_BLK_F_WCE))); -} - -static void virtio_blk_save(QEMUFile *f, void *opaque) -{ - VirtIOBlock *s = opaque; - VirtIODevice *vdev = VIRTIO_DEVICE(s); - VirtIOBlockReq *req = s->rq; - - virtio_save(vdev, f); - - while (req) { - qemu_put_sbyte(f, 1); - qemu_put_buffer(f, (unsigned char*)&req->elem, sizeof(req->elem)); - req = req->next; - } - qemu_put_sbyte(f, 0); -} - -static int virtio_blk_load(QEMUFile *f, void *opaque, int version_id) -{ - VirtIOBlock *s = opaque; - VirtIODevice *vdev = VIRTIO_DEVICE(s); - int ret; - - if (version_id != 2) - return -EINVAL; - - ret = virtio_load(vdev, f); - if (ret) { - return ret; - } - - while (qemu_get_sbyte(f)) { - VirtIOBlockReq *req = virtio_blk_alloc_request(s); - qemu_get_buffer(f, (unsigned char*)&req->elem, sizeof(req->elem)); - req->next = s->rq; - s->rq = req; - - virtqueue_map_sg(req->elem.in_sg, req->elem.in_addr, - req->elem.in_num, 1); - virtqueue_map_sg(req->elem.out_sg, req->elem.out_addr, - req->elem.out_num, 0); - } - - return 0; -} - -static void virtio_blk_resize(void *opaque) -{ - VirtIODevice *vdev = VIRTIO_DEVICE(opaque); - - virtio_notify_config(vdev); -} - -static const BlockDevOps virtio_block_ops = { - .resize_cb = virtio_blk_resize, -}; - -void virtio_blk_set_conf(DeviceState *dev, VirtIOBlkConf *blk) -{ - VirtIOBlock *s = VIRTIO_BLK(dev); - memcpy(&(s->blk), blk, sizeof(struct VirtIOBlkConf)); -} - -static int virtio_blk_device_init(VirtIODevice *vdev) -{ - DeviceState *qdev = DEVICE(vdev); - VirtIOBlock *s = VIRTIO_BLK(vdev); - VirtIOBlkConf *blk = &(s->blk); - static int virtio_blk_id; - - if (!blk->conf.bs) { - error_report("drive property not set"); - return -1; - } - if (!bdrv_is_inserted(blk->conf.bs)) { - error_report("Device needs media, but drive is empty"); - return -1; - } - - blkconf_serial(&blk->conf, 
&blk->serial); - if (blkconf_geometry(&blk->conf, NULL, 65535, 255, 255) < 0) { - return -1; - } - - virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, - sizeof(struct virtio_blk_config)); - - vdev->get_config = virtio_blk_update_config; - vdev->set_config = virtio_blk_set_config; - vdev->get_features = virtio_blk_get_features; - vdev->set_status = virtio_blk_set_status; - vdev->reset = virtio_blk_reset; - s->bs = blk->conf.bs; - s->conf = &blk->conf; - memcpy(&(s->blk), blk, sizeof(struct VirtIOBlkConf)); - s->rq = NULL; - s->sector_mask = (s->conf->logical_block_size / BDRV_SECTOR_SIZE) - 1; - - s->vq = virtio_add_queue(vdev, 128, virtio_blk_handle_output); -#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE - if (!virtio_blk_data_plane_create(vdev, blk, &s->dataplane)) { - virtio_common_cleanup(vdev); - return -1; - } -#endif - - s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s); - register_savevm(qdev, "virtio-blk", virtio_blk_id++, 2, - virtio_blk_save, virtio_blk_load, s); - bdrv_set_dev_ops(s->bs, &virtio_block_ops, s); - bdrv_set_buffer_alignment(s->bs, s->conf->logical_block_size); - - bdrv_iostatus_enable(s->bs); - - add_boot_device_path(s->conf->bootindex, qdev, "/disk@0,0"); - return 0; -} - -static int virtio_blk_device_exit(DeviceState *dev) -{ - VirtIODevice *vdev = VIRTIO_DEVICE(dev); - VirtIOBlock *s = VIRTIO_BLK(dev); -#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE - virtio_blk_data_plane_destroy(s->dataplane); - s->dataplane = NULL; -#endif - qemu_del_vm_change_state_handler(s->change); - unregister_savevm(dev, "virtio-blk", s); - blockdev_mark_auto_del(s->bs); - virtio_common_cleanup(vdev); - return 0; -} - -static Property virtio_blk_properties[] = { - DEFINE_VIRTIO_BLK_PROPERTIES(VirtIOBlock, blk), - DEFINE_PROP_END_OF_LIST(), -}; - -static void virtio_blk_class_init(ObjectClass *klass, void *data) -{ - DeviceClass *dc = DEVICE_CLASS(klass); - VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); - dc->exit = virtio_blk_device_exit; - dc->props = virtio_blk_properties; - vdc->init = virtio_blk_device_init; - vdc->get_config = virtio_blk_update_config; - vdc->set_config = virtio_blk_set_config; - vdc->get_features = virtio_blk_get_features; - vdc->set_status = virtio_blk_set_status; - vdc->reset = virtio_blk_reset; -} - -static const TypeInfo virtio_device_info = { - .name = TYPE_VIRTIO_BLK, - .parent = TYPE_VIRTIO_DEVICE, - .instance_size = sizeof(VirtIOBlock), - .class_init = virtio_blk_class_init, -}; - -static void virtio_register_types(void) -{ - type_register_static(&virtio_device_info); -} - -type_init(virtio_register_types) diff --git a/hw/virtio-net.c b/hw/virtio-net.c deleted file mode 100644 index bc8fd43b4b..0000000000 --- a/hw/virtio-net.c +++ /dev/null @@ -1,1370 +0,0 @@ -/* - * Virtio Network Device - * - * Copyright IBM, Corp. 2007 - * - * Authors: - * Anthony Liguori - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include "qemu/iov.h" -#include "hw/virtio/virtio.h" -#include "net/net.h" -#include "net/checksum.h" -#include "net/tap.h" -#include "qemu/error-report.h" -#include "qemu/timer.h" -#include "hw/virtio/virtio-net.h" -#include "net/vhost_net.h" - -#define VIRTIO_NET_VM_VERSION 11 - -#define MAC_TABLE_ENTRIES 64 -#define MAX_VLAN (1 << 12) /* Per 802.1Q definition */ - -/* - * Calculate the number of bytes up to and including the given 'field' of - * 'container'. 
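[The endof() macro defined just below sizes the guest-visible config space up to the last negotiated field. A self-contained illustration with a simplified stand-in struct; the field types here are assumptions for the example, not the real virtio_net_config layout.]

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define endof(container, field) \
    (offsetof(container, field) + sizeof(((container *)0)->field))

struct cfg {                 /* simplified stand-in for virtio_net_config */
    uint8_t  mac[6];
    uint16_t status;
    uint16_t max_virtqueue_pairs;
};

int main(void)
{
    /* With only the MAC feature negotiated, the visible config space
     * would stop right after the mac field. */
    printf("endof(mac) = %zu, endof(status) = %zu\n",
           endof(struct cfg, mac), endof(struct cfg, status));
    return 0;
}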
- */ -#define endof(container, field) \ - (offsetof(container, field) + sizeof(((container *)0)->field)) - -typedef struct VirtIOFeature { - uint32_t flags; - size_t end; -} VirtIOFeature; - -static VirtIOFeature feature_sizes[] = { - {.flags = 1 << VIRTIO_NET_F_MAC, - .end = endof(struct virtio_net_config, mac)}, - {.flags = 1 << VIRTIO_NET_F_STATUS, - .end = endof(struct virtio_net_config, status)}, - {.flags = 1 << VIRTIO_NET_F_MQ, - .end = endof(struct virtio_net_config, max_virtqueue_pairs)}, - {} -}; - -static VirtIONetQueue *virtio_net_get_subqueue(NetClientState *nc) -{ - VirtIONet *n = qemu_get_nic_opaque(nc); - - return &n->vqs[nc->queue_index]; -} - -static int vq2q(int queue_index) -{ - return queue_index / 2; -} - -/* TODO - * - we could suppress RX interrupt if we were so inclined. - */ - -static VirtIONet *to_virtio_net(VirtIODevice *vdev) -{ - return (VirtIONet *)vdev; -} - -static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config) -{ - VirtIONet *n = to_virtio_net(vdev); - struct virtio_net_config netcfg; - - stw_p(&netcfg.status, n->status); - stw_p(&netcfg.max_virtqueue_pairs, n->max_queues); - memcpy(netcfg.mac, n->mac, ETH_ALEN); - memcpy(config, &netcfg, n->config_size); -} - -static void virtio_net_set_config(VirtIODevice *vdev, const uint8_t *config) -{ - VirtIONet *n = to_virtio_net(vdev); - struct virtio_net_config netcfg = {}; - - memcpy(&netcfg, config, n->config_size); - - if (!(n->vdev.guest_features >> VIRTIO_NET_F_CTRL_MAC_ADDR & 1) && - memcmp(netcfg.mac, n->mac, ETH_ALEN)) { - memcpy(n->mac, netcfg.mac, ETH_ALEN); - qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac); - } -} - -static bool virtio_net_started(VirtIONet *n, uint8_t status) -{ - return (status & VIRTIO_CONFIG_S_DRIVER_OK) && - (n->status & VIRTIO_NET_S_LINK_UP) && n->vdev.vm_running; -} - -static void virtio_net_vhost_status(VirtIONet *n, uint8_t status) -{ - NetClientState *nc = qemu_get_queue(n->nic); - int queues = n->multiqueue ? 
n->max_queues : 1; - - if (!nc->peer) { - return; - } - if (nc->peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) { - return; - } - - if (!tap_get_vhost_net(nc->peer)) { - return; - } - - if (!!n->vhost_started == virtio_net_started(n, status) && - !nc->peer->link_down) { - return; - } - if (!n->vhost_started) { - int r; - if (!vhost_net_query(tap_get_vhost_net(nc->peer), &n->vdev)) { - return; - } - n->vhost_started = 1; - r = vhost_net_start(&n->vdev, n->nic->ncs, queues); - if (r < 0) { - error_report("unable to start vhost net: %d: " - "falling back on userspace virtio", -r); - n->vhost_started = 0; - } - } else { - vhost_net_stop(&n->vdev, n->nic->ncs, queues); - n->vhost_started = 0; - } -} - -static void virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status) -{ - VirtIONet *n = to_virtio_net(vdev); - VirtIONetQueue *q; - int i; - uint8_t queue_status; - - virtio_net_vhost_status(n, status); - - for (i = 0; i < n->max_queues; i++) { - q = &n->vqs[i]; - - if ((!n->multiqueue && i != 0) || i >= n->curr_queues) { - queue_status = 0; - } else { - queue_status = status; - } - - if (!q->tx_waiting) { - continue; - } - - if (virtio_net_started(n, queue_status) && !n->vhost_started) { - if (q->tx_timer) { - qemu_mod_timer(q->tx_timer, - qemu_get_clock_ns(vm_clock) + n->tx_timeout); - } else { - qemu_bh_schedule(q->tx_bh); - } - } else { - if (q->tx_timer) { - qemu_del_timer(q->tx_timer); - } else { - qemu_bh_cancel(q->tx_bh); - } - } - } -} - -static void virtio_net_set_link_status(NetClientState *nc) -{ - VirtIONet *n = qemu_get_nic_opaque(nc); - uint16_t old_status = n->status; - - if (nc->link_down) - n->status &= ~VIRTIO_NET_S_LINK_UP; - else - n->status |= VIRTIO_NET_S_LINK_UP; - - if (n->status != old_status) - virtio_notify_config(&n->vdev); - - virtio_net_set_status(&n->vdev, n->vdev.status); -} - -static void virtio_net_reset(VirtIODevice *vdev) -{ - VirtIONet *n = to_virtio_net(vdev); - - /* Reset back to compatibility mode */ - n->promisc = 1; - n->allmulti = 0; - n->alluni = 0; - n->nomulti = 0; - n->nouni = 0; - n->nobcast = 0; - /* multiqueue is disabled by default */ - n->curr_queues = 1; - - /* Flush any MAC and VLAN filter table state */ - n->mac_table.in_use = 0; - n->mac_table.first_multi = 0; - n->mac_table.multi_overflow = 0; - n->mac_table.uni_overflow = 0; - memset(n->mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN); - memcpy(&n->mac[0], &n->nic->conf->macaddr, sizeof(n->mac)); - memset(n->vlans, 0, MAX_VLAN >> 3); -} - -static void peer_test_vnet_hdr(VirtIONet *n) -{ - NetClientState *nc = qemu_get_queue(n->nic); - if (!nc->peer) { - return; - } - - if (nc->peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) { - return; - } - - n->has_vnet_hdr = tap_has_vnet_hdr(nc->peer); -} - -static int peer_has_vnet_hdr(VirtIONet *n) -{ - return n->has_vnet_hdr; -} - -static int peer_has_ufo(VirtIONet *n) -{ - if (!peer_has_vnet_hdr(n)) - return 0; - - n->has_ufo = tap_has_ufo(qemu_get_queue(n->nic)->peer); - - return n->has_ufo; -} - -static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs) -{ - int i; - NetClientState *nc; - - n->mergeable_rx_bufs = mergeable_rx_bufs; - - n->guest_hdr_len = n->mergeable_rx_bufs ? 
- sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr); - - for (i = 0; i < n->max_queues; i++) { - nc = qemu_get_subqueue(n->nic, i); - - if (peer_has_vnet_hdr(n) && - tap_has_vnet_hdr_len(nc->peer, n->guest_hdr_len)) { - tap_set_vnet_hdr_len(nc->peer, n->guest_hdr_len); - n->host_hdr_len = n->guest_hdr_len; - } - } -} - -static int peer_attach(VirtIONet *n, int index) -{ - NetClientState *nc = qemu_get_subqueue(n->nic, index); - - if (!nc->peer) { - return 0; - } - - if (nc->peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) { - return 0; - } - - return tap_enable(nc->peer); -} - -static int peer_detach(VirtIONet *n, int index) -{ - NetClientState *nc = qemu_get_subqueue(n->nic, index); - - if (!nc->peer) { - return 0; - } - - if (nc->peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) { - return 0; - } - - return tap_disable(nc->peer); -} - -static void virtio_net_set_queues(VirtIONet *n) -{ - int i; - - for (i = 0; i < n->max_queues; i++) { - if (i < n->curr_queues) { - assert(!peer_attach(n, i)); - } else { - assert(!peer_detach(n, i)); - } - } -} - -static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue, int ctrl); - -static uint32_t virtio_net_get_features(VirtIODevice *vdev, uint32_t features) -{ - VirtIONet *n = to_virtio_net(vdev); - NetClientState *nc = qemu_get_queue(n->nic); - - features |= (1 << VIRTIO_NET_F_MAC); - - if (!peer_has_vnet_hdr(n)) { - features &= ~(0x1 << VIRTIO_NET_F_CSUM); - features &= ~(0x1 << VIRTIO_NET_F_HOST_TSO4); - features &= ~(0x1 << VIRTIO_NET_F_HOST_TSO6); - features &= ~(0x1 << VIRTIO_NET_F_HOST_ECN); - - features &= ~(0x1 << VIRTIO_NET_F_GUEST_CSUM); - features &= ~(0x1 << VIRTIO_NET_F_GUEST_TSO4); - features &= ~(0x1 << VIRTIO_NET_F_GUEST_TSO6); - features &= ~(0x1 << VIRTIO_NET_F_GUEST_ECN); - } - - if (!peer_has_vnet_hdr(n) || !peer_has_ufo(n)) { - features &= ~(0x1 << VIRTIO_NET_F_GUEST_UFO); - features &= ~(0x1 << VIRTIO_NET_F_HOST_UFO); - } - - if (!nc->peer || nc->peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) { - return features; - } - if (!tap_get_vhost_net(nc->peer)) { - return features; - } - return vhost_net_get_features(tap_get_vhost_net(nc->peer), features); -} - -static uint32_t virtio_net_bad_features(VirtIODevice *vdev) -{ - uint32_t features = 0; - - /* Linux kernel 2.6.25. 
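[The mask-and-forward pattern in virtio_net_get_features() above, reduced to a runnable sketch: bits the backend cannot support are cleared before the feature word is offered to the guest. The bit positions match the usual virtio-net ones but treat them as illustrative here.]

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

enum { F_CSUM = 0, F_MAC = 5, F_GUEST_TSO4 = 7, F_HOST_TSO4 = 11 };

static uint32_t filter_features(uint32_t features, bool peer_has_vnet_hdr)
{
    features |= 1u << F_MAC;                 /* always offered */
    if (!peer_has_vnet_hdr) {
        /* offload bits make no sense without a vnet header */
        features &= ~(1u << F_CSUM);
        features &= ~(1u << F_GUEST_TSO4);
        features &= ~(1u << F_HOST_TSO4);
    }
    return features;
}

int main(void)
{
    printf("offered: 0x%08x\n", filter_features(0xffffffffu, false));
    return 0;
}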
It understood MAC (as everyone must), - * but also these: */ - features |= (1 << VIRTIO_NET_F_MAC); - features |= (1 << VIRTIO_NET_F_CSUM); - features |= (1 << VIRTIO_NET_F_HOST_TSO4); - features |= (1 << VIRTIO_NET_F_HOST_TSO6); - features |= (1 << VIRTIO_NET_F_HOST_ECN); - - return features; -} - -static void virtio_net_set_features(VirtIODevice *vdev, uint32_t features) -{ - VirtIONet *n = to_virtio_net(vdev); - int i; - - virtio_net_set_multiqueue(n, !!(features & (1 << VIRTIO_NET_F_MQ)), - !!(features & (1 << VIRTIO_NET_F_CTRL_VQ))); - - virtio_net_set_mrg_rx_bufs(n, !!(features & (1 << VIRTIO_NET_F_MRG_RXBUF))); - - if (n->has_vnet_hdr) { - tap_set_offload(qemu_get_subqueue(n->nic, 0)->peer, - (features >> VIRTIO_NET_F_GUEST_CSUM) & 1, - (features >> VIRTIO_NET_F_GUEST_TSO4) & 1, - (features >> VIRTIO_NET_F_GUEST_TSO6) & 1, - (features >> VIRTIO_NET_F_GUEST_ECN) & 1, - (features >> VIRTIO_NET_F_GUEST_UFO) & 1); - } - - for (i = 0; i < n->max_queues; i++) { - NetClientState *nc = qemu_get_subqueue(n->nic, i); - - if (!nc->peer || nc->peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) { - continue; - } - if (!tap_get_vhost_net(nc->peer)) { - continue; - } - vhost_net_ack_features(tap_get_vhost_net(nc->peer), features); - } -} - -static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd, - struct iovec *iov, unsigned int iov_cnt) -{ - uint8_t on; - size_t s; - - s = iov_to_buf(iov, iov_cnt, 0, &on, sizeof(on)); - if (s != sizeof(on)) { - return VIRTIO_NET_ERR; - } - - if (cmd == VIRTIO_NET_CTRL_RX_PROMISC) { - n->promisc = on; - } else if (cmd == VIRTIO_NET_CTRL_RX_ALLMULTI) { - n->allmulti = on; - } else if (cmd == VIRTIO_NET_CTRL_RX_ALLUNI) { - n->alluni = on; - } else if (cmd == VIRTIO_NET_CTRL_RX_NOMULTI) { - n->nomulti = on; - } else if (cmd == VIRTIO_NET_CTRL_RX_NOUNI) { - n->nouni = on; - } else if (cmd == VIRTIO_NET_CTRL_RX_NOBCAST) { - n->nobcast = on; - } else { - return VIRTIO_NET_ERR; - } - - return VIRTIO_NET_OK; -} - -static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, - struct iovec *iov, unsigned int iov_cnt) -{ - struct virtio_net_ctrl_mac mac_data; - size_t s; - - if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET) { - if (iov_size(iov, iov_cnt) != sizeof(n->mac)) { - return VIRTIO_NET_ERR; - } - s = iov_to_buf(iov, iov_cnt, 0, &n->mac, sizeof(n->mac)); - assert(s == sizeof(n->mac)); - qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac); - return VIRTIO_NET_OK; - } - - if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET) { - return VIRTIO_NET_ERR; - } - - n->mac_table.in_use = 0; - n->mac_table.first_multi = 0; - n->mac_table.uni_overflow = 0; - n->mac_table.multi_overflow = 0; - memset(n->mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN); - - s = iov_to_buf(iov, iov_cnt, 0, &mac_data.entries, - sizeof(mac_data.entries)); - mac_data.entries = ldl_p(&mac_data.entries); - if (s != sizeof(mac_data.entries)) { - return VIRTIO_NET_ERR; - } - iov_discard_front(&iov, &iov_cnt, s); - - if (mac_data.entries * ETH_ALEN > iov_size(iov, iov_cnt)) { - return VIRTIO_NET_ERR; - } - - if (mac_data.entries <= MAC_TABLE_ENTRIES) { - s = iov_to_buf(iov, iov_cnt, 0, n->mac_table.macs, - mac_data.entries * ETH_ALEN); - if (s != mac_data.entries * ETH_ALEN) { - return VIRTIO_NET_ERR; - } - n->mac_table.in_use += mac_data.entries; - } else { - n->mac_table.uni_overflow = 1; - } - - iov_discard_front(&iov, &iov_cnt, mac_data.entries * ETH_ALEN); - - n->mac_table.first_multi = n->mac_table.in_use; - - s = iov_to_buf(iov, iov_cnt, 0, &mac_data.entries, - sizeof(mac_data.entries)); - mac_data.entries = 
ldl_p(&mac_data.entries); - if (s != sizeof(mac_data.entries)) { - return VIRTIO_NET_ERR; - } - - iov_discard_front(&iov, &iov_cnt, s); - - if (mac_data.entries * ETH_ALEN != iov_size(iov, iov_cnt)) { - return VIRTIO_NET_ERR; - } - - if (n->mac_table.in_use + mac_data.entries <= MAC_TABLE_ENTRIES) { - s = iov_to_buf(iov, iov_cnt, 0, n->mac_table.macs, - mac_data.entries * ETH_ALEN); - if (s != mac_data.entries * ETH_ALEN) { - return VIRTIO_NET_ERR; - } - n->mac_table.in_use += mac_data.entries; - } else { - n->mac_table.multi_overflow = 1; - } - - return VIRTIO_NET_OK; -} - -static int virtio_net_handle_vlan_table(VirtIONet *n, uint8_t cmd, - struct iovec *iov, unsigned int iov_cnt) -{ - uint16_t vid; - size_t s; - - s = iov_to_buf(iov, iov_cnt, 0, &vid, sizeof(vid)); - vid = lduw_p(&vid); - if (s != sizeof(vid)) { - return VIRTIO_NET_ERR; - } - - if (vid >= MAX_VLAN) - return VIRTIO_NET_ERR; - - if (cmd == VIRTIO_NET_CTRL_VLAN_ADD) - n->vlans[vid >> 5] |= (1U << (vid & 0x1f)); - else if (cmd == VIRTIO_NET_CTRL_VLAN_DEL) - n->vlans[vid >> 5] &= ~(1U << (vid & 0x1f)); - else - return VIRTIO_NET_ERR; - - return VIRTIO_NET_OK; -} - -static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd, - struct iovec *iov, unsigned int iov_cnt) -{ - struct virtio_net_ctrl_mq mq; - size_t s; - uint16_t queues; - - s = iov_to_buf(iov, iov_cnt, 0, &mq, sizeof(mq)); - if (s != sizeof(mq)) { - return VIRTIO_NET_ERR; - } - - if (cmd != VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET) { - return VIRTIO_NET_ERR; - } - - queues = lduw_p(&mq.virtqueue_pairs); - - if (queues < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN || - queues > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX || - queues > n->max_queues || - !n->multiqueue) { - return VIRTIO_NET_ERR; - } - - n->curr_queues = queues; - /* stop the backend before changing the number of queues to avoid handling a - * disabled queue */ - virtio_net_set_status(&n->vdev, n->vdev.status); - virtio_net_set_queues(n); - - return VIRTIO_NET_OK; -} -static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) -{ - VirtIONet *n = to_virtio_net(vdev); - struct virtio_net_ctrl_hdr ctrl; - virtio_net_ctrl_ack status = VIRTIO_NET_ERR; - VirtQueueElement elem; - size_t s; - struct iovec *iov; - unsigned int iov_cnt; - - while (virtqueue_pop(vq, &elem)) { - if (iov_size(elem.in_sg, elem.in_num) < sizeof(status) || - iov_size(elem.out_sg, elem.out_num) < sizeof(ctrl)) { - error_report("virtio-net ctrl missing headers"); - exit(1); - } - - iov = elem.out_sg; - iov_cnt = elem.out_num; - s = iov_to_buf(iov, iov_cnt, 0, &ctrl, sizeof(ctrl)); - iov_discard_front(&iov, &iov_cnt, sizeof(ctrl)); - if (s != sizeof(ctrl)) { - status = VIRTIO_NET_ERR; - } else if (ctrl.class == VIRTIO_NET_CTRL_RX) { - status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, iov_cnt); - } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) { - status = virtio_net_handle_mac(n, ctrl.cmd, iov, iov_cnt); - } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) { - status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, iov_cnt); - } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) { - status = virtio_net_handle_mq(n, ctrl.cmd, iov, iov_cnt); - } - - s = iov_from_buf(elem.in_sg, elem.in_num, 0, &status, sizeof(status)); - assert(s == sizeof(status)); - - virtqueue_push(vq, &elem, sizeof(status)); - virtio_notify(vdev, vq); - } -} - -/* RX */ - -static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq) -{ - VirtIONet *n = to_virtio_net(vdev); - int queue_index = vq2q(virtio_get_queue_index(vq)); - - qemu_flush_queued_packets(qemu_get_subqueue(n->nic, 
queue_index)); -} - -static int virtio_net_can_receive(NetClientState *nc) -{ - VirtIONet *n = qemu_get_nic_opaque(nc); - VirtIONetQueue *q = virtio_net_get_subqueue(nc); - - if (!n->vdev.vm_running) { - return 0; - } - - if (nc->queue_index >= n->curr_queues) { - return 0; - } - - if (!virtio_queue_ready(q->rx_vq) || - !(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) { - return 0; - } - - return 1; -} - -static int virtio_net_has_buffers(VirtIONetQueue *q, int bufsize) -{ - VirtIONet *n = q->n; - if (virtio_queue_empty(q->rx_vq) || - (n->mergeable_rx_bufs && - !virtqueue_avail_bytes(q->rx_vq, bufsize, 0))) { - virtio_queue_set_notification(q->rx_vq, 1); - - /* To avoid a race condition where the guest has made some buffers - * available after the above check but before notification was - * enabled, check for available buffers again. - */ - if (virtio_queue_empty(q->rx_vq) || - (n->mergeable_rx_bufs && - !virtqueue_avail_bytes(q->rx_vq, bufsize, 0))) { - return 0; - } - } - - virtio_queue_set_notification(q->rx_vq, 0); - return 1; -} - -/* dhclient uses AF_PACKET but doesn't pass auxdata to the kernel so - * it never finds out that the packets don't have valid checksums. This - * causes dhclient to get upset. Fedora's carried a patch for ages to - * fix this with Xen but it hasn't appeared in an upstream release of - * dhclient yet. - * - * To avoid breaking existing guests, we catch udp packets and add - * checksums. This is terrible but it's better than hacking the guest - * kernels. - * - * N.B. if we introduce a zero-copy API, this operation is no longer free so - * we should provide a mechanism to disable it to avoid polluting the host - * cache. - */ -static void work_around_broken_dhclient(struct virtio_net_hdr *hdr, - uint8_t *buf, size_t size) -{ - if ((hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && /* missing csum */ - (size > 27 && size < 1500) && /* normal sized MTU */ - (buf[12] == 0x08 && buf[13] == 0x00) && /* ethertype == IPv4 */ - (buf[23] == 17) && /* ip.protocol == UDP */ - (buf[34] == 0 && buf[35] == 67)) { /* udp.srcport == bootps */ - net_checksum_calculate(buf, size); - hdr->flags &= ~VIRTIO_NET_HDR_F_NEEDS_CSUM; - } -} - -static void receive_header(VirtIONet *n, const struct iovec *iov, int iov_cnt, - const void *buf, size_t size) -{ - if (n->has_vnet_hdr) { - /* FIXME this cast is evil */ - void *wbuf = (void *)buf; - work_around_broken_dhclient(wbuf, wbuf + n->host_hdr_len, - size - n->host_hdr_len); - iov_from_buf(iov, iov_cnt, 0, buf, sizeof(struct virtio_net_hdr)); - } else { - struct virtio_net_hdr hdr = { - .flags = 0, - .gso_type = VIRTIO_NET_HDR_GSO_NONE - }; - iov_from_buf(iov, iov_cnt, 0, &hdr, sizeof hdr); - } -} - -static int receive_filter(VirtIONet *n, const uint8_t *buf, int size) -{ - static const uint8_t bcast[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - static const uint8_t vlan[] = {0x81, 0x00}; - uint8_t *ptr = (uint8_t *)buf; - int i; - - if (n->promisc) - return 1; - - ptr += n->host_hdr_len; - - if (!memcmp(&ptr[12], vlan, sizeof(vlan))) { - int vid = be16_to_cpup((uint16_t *)(ptr + 14)) & 0xfff; - if (!(n->vlans[vid >> 5] & (1U << (vid & 0x1f)))) - return 0; - } - - if (ptr[0] & 1) { // multicast - if (!memcmp(ptr, bcast, sizeof(bcast))) { - return !n->nobcast; - } else if (n->nomulti) { - return 0; - } else if (n->allmulti || n->mac_table.multi_overflow) { - return 1; - } - - for (i = n->mac_table.first_multi; i < n->mac_table.in_use; i++) { - if (!memcmp(ptr, &n->mac_table.macs[i * ETH_ALEN], ETH_ALEN)) { - return 1; - } - } - } else { // 
unicast - if (n->nouni) { - return 0; - } else if (n->alluni || n->mac_table.uni_overflow) { - return 1; - } else if (!memcmp(ptr, n->mac, ETH_ALEN)) { - return 1; - } - - for (i = 0; i < n->mac_table.first_multi; i++) { - if (!memcmp(ptr, &n->mac_table.macs[i * ETH_ALEN], ETH_ALEN)) { - return 1; - } - } - } - - return 0; -} - -static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t size) -{ - VirtIONet *n = qemu_get_nic_opaque(nc); - VirtIONetQueue *q = virtio_net_get_subqueue(nc); - struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE]; - struct virtio_net_hdr_mrg_rxbuf mhdr; - unsigned mhdr_cnt = 0; - size_t offset, i, guest_offset; - - if (!virtio_net_can_receive(nc)) { - return -1; - } - - /* hdr_len refers to the header we supply to the guest */ - if (!virtio_net_has_buffers(q, size + n->guest_hdr_len - n->host_hdr_len)) { - return 0; - } - - if (!receive_filter(n, buf, size)) - return size; - - offset = i = 0; - - while (offset < size) { - VirtQueueElement elem; - int len, total; - const struct iovec *sg = elem.in_sg; - - total = 0; - - if (virtqueue_pop(q->rx_vq, &elem) == 0) { - if (i == 0) - return -1; - error_report("virtio-net unexpected empty queue: " - "i %zd mergeable %d offset %zd, size %zd, " - "guest hdr len %zd, host hdr len %zd guest features 0x%x", - i, n->mergeable_rx_bufs, offset, size, - n->guest_hdr_len, n->host_hdr_len, n->vdev.guest_features); - exit(1); - } - - if (elem.in_num < 1) { - error_report("virtio-net receive queue contains no in buffers"); - exit(1); - } - - if (i == 0) { - assert(offset == 0); - if (n->mergeable_rx_bufs) { - mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg), - sg, elem.in_num, - offsetof(typeof(mhdr), num_buffers), - sizeof(mhdr.num_buffers)); - } - - receive_header(n, sg, elem.in_num, buf, size); - offset = n->host_hdr_len; - total += n->guest_hdr_len; - guest_offset = n->guest_hdr_len; - } else { - guest_offset = 0; - } - - /* copy in packet. ugh */ - len = iov_from_buf(sg, elem.in_num, guest_offset, - buf + offset, size - offset); - total += len; - offset += len; - /* If buffers can't be merged, at this point we - * must have consumed the complete packet. - * Otherwise, drop it. 
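[The unicast arm of receive_filter() above, extracted into a runnable toy. It assumes the same table layout as the code: entries [0, first_multi) of the MAC table hold unicast addresses.]

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ETH_ALEN 6

static bool unicast_pass(const uint8_t *dst, const uint8_t *own_mac,
                         const uint8_t *macs, int first_multi,
                         bool nouni, bool alluni, bool uni_overflow)
{
    if (nouni) {
        return false;
    }
    if (alluni || uni_overflow || !memcmp(dst, own_mac, ETH_ALEN)) {
        return true;
    }
    for (int i = 0; i < first_multi; i++) {
        if (!memcmp(dst, &macs[i * ETH_ALEN], ETH_ALEN)) {
            return true;
        }
    }
    return false;
}

int main(void)
{
    uint8_t own[ETH_ALEN] = {0x52, 0x54, 0x00, 0x12, 0x34, 0x56};
    printf("own MAC passes: %d\n",
           unicast_pass(own, own, NULL, 0, false, false, false));
    return 0;
}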
*/ - if (!n->mergeable_rx_bufs && offset < size) { -#if 0 - error_report("virtio-net truncated non-mergeable packet: " - "i %zd mergeable %d offset %zd, size %zd, " - "guest hdr len %zd, host hdr len %zd", - i, n->mergeable_rx_bufs, - offset, size, n->guest_hdr_len, n->host_hdr_len); -#endif - return size; - } - - /* signal other side */ - virtqueue_fill(q->rx_vq, &elem, total, i++); - } - - if (mhdr_cnt) { - stw_p(&mhdr.num_buffers, i); - iov_from_buf(mhdr_sg, mhdr_cnt, - 0, - &mhdr.num_buffers, sizeof mhdr.num_buffers); - } - - virtqueue_flush(q->rx_vq, i); - virtio_notify(&n->vdev, q->rx_vq); - - return size; -} - -static int32_t virtio_net_flush_tx(VirtIONetQueue *q); - -static void virtio_net_tx_complete(NetClientState *nc, ssize_t len) -{ - VirtIONet *n = qemu_get_nic_opaque(nc); - VirtIONetQueue *q = virtio_net_get_subqueue(nc); - - virtqueue_push(q->tx_vq, &q->async_tx.elem, 0); - virtio_notify(&n->vdev, q->tx_vq); - - q->async_tx.elem.out_num = q->async_tx.len = 0; - - virtio_queue_set_notification(q->tx_vq, 1); - virtio_net_flush_tx(q); -} - -/* TX */ -static int32_t virtio_net_flush_tx(VirtIONetQueue *q) -{ - VirtIONet *n = q->n; - VirtQueueElement elem; - int32_t num_packets = 0; - int queue_index = vq2q(virtio_get_queue_index(q->tx_vq)); - if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) { - return num_packets; - } - - assert(n->vdev.vm_running); - - if (q->async_tx.elem.out_num) { - virtio_queue_set_notification(q->tx_vq, 0); - return num_packets; - } - - while (virtqueue_pop(q->tx_vq, &elem)) { - ssize_t ret, len; - unsigned int out_num = elem.out_num; - struct iovec *out_sg = &elem.out_sg[0]; - struct iovec sg[VIRTQUEUE_MAX_SIZE]; - - if (out_num < 1) { - error_report("virtio-net header not in first element"); - exit(1); - } - - /* - * If host wants to see the guest header as is, we can - * pass it on unchanged. Otherwise, copy just the parts - * that host is interested in. - */ - assert(n->host_hdr_len <= n->guest_hdr_len); - if (n->host_hdr_len != n->guest_hdr_len) { - unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg), - out_sg, out_num, - 0, n->host_hdr_len); - sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num, - out_sg, out_num, - n->guest_hdr_len, -1); - out_num = sg_num; - out_sg = sg; - } - - len = n->guest_hdr_len; - - ret = qemu_sendv_packet_async(qemu_get_subqueue(n->nic, queue_index), - out_sg, out_num, virtio_net_tx_complete); - if (ret == 0) { - virtio_queue_set_notification(q->tx_vq, 0); - q->async_tx.elem = elem; - q->async_tx.len = len; - return -EBUSY; - } - - len += ret; - - virtqueue_push(q->tx_vq, &elem, 0); - virtio_notify(&n->vdev, q->tx_vq); - - if (++num_packets >= n->tx_burst) { - break; - } - } - return num_packets; -} - -static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq) -{ - VirtIONet *n = to_virtio_net(vdev); - VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))]; - - /* This happens when device was stopped but VCPU wasn't. 
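[The two-part copy in virtio_net_flush_tx() above — keep the first host_hdr_len bytes, then skip the remainder of the guest header — is easiest to see on one flat buffer. A sketch with made-up header sizes, using memcpy() in place of iov_copy().]

#include <stdio.h>
#include <string.h>

int main(void)
{
    /* 2-byte "host" header, then a 4-byte "guest" header whose last
     * two bytes the host must not see; sizes are made up. */
    unsigned char pkt[16] = "HHxxPAYLOAD";
    size_t host_hdr_len = 2, guest_hdr_len = 4;
    unsigned char out[16] = {0};

    memcpy(out, pkt, host_hdr_len);                  /* keep host header */
    memcpy(out + host_hdr_len, pkt + guest_hdr_len,  /* skip the rest    */
           sizeof(pkt) - guest_hdr_len);
    printf("%s\n", out);                             /* prints HHPAYLOAD */
    return 0;
}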
*/ - if (!n->vdev.vm_running) { - q->tx_waiting = 1; - return; - } - - if (q->tx_waiting) { - virtio_queue_set_notification(vq, 1); - qemu_del_timer(q->tx_timer); - q->tx_waiting = 0; - virtio_net_flush_tx(q); - } else { - qemu_mod_timer(q->tx_timer, - qemu_get_clock_ns(vm_clock) + n->tx_timeout); - q->tx_waiting = 1; - virtio_queue_set_notification(vq, 0); - } -} - -static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq) -{ - VirtIONet *n = to_virtio_net(vdev); - VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))]; - - if (unlikely(q->tx_waiting)) { - return; - } - q->tx_waiting = 1; - /* This happens when device was stopped but VCPU wasn't. */ - if (!n->vdev.vm_running) { - return; - } - virtio_queue_set_notification(vq, 0); - qemu_bh_schedule(q->tx_bh); -} - -static void virtio_net_tx_timer(void *opaque) -{ - VirtIONetQueue *q = opaque; - VirtIONet *n = q->n; - assert(n->vdev.vm_running); - - q->tx_waiting = 0; - - /* Just in case the driver is not ready on more */ - if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) - return; - - virtio_queue_set_notification(q->tx_vq, 1); - virtio_net_flush_tx(q); -} - -static void virtio_net_tx_bh(void *opaque) -{ - VirtIONetQueue *q = opaque; - VirtIONet *n = q->n; - int32_t ret; - - assert(n->vdev.vm_running); - - q->tx_waiting = 0; - - /* Just in case the driver is not ready on more */ - if (unlikely(!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))) - return; - - ret = virtio_net_flush_tx(q); - if (ret == -EBUSY) { - return; /* Notification re-enable handled by tx_complete */ - } - - /* If we flush a full burst of packets, assume there are - * more coming and immediately reschedule */ - if (ret >= n->tx_burst) { - qemu_bh_schedule(q->tx_bh); - q->tx_waiting = 1; - return; - } - - /* If less than a full burst, re-enable notification and flush - * anything that may have come in while we weren't looking. If - * we find something, assume the guest is still active and reschedule */ - virtio_queue_set_notification(q->tx_vq, 1); - if (virtio_net_flush_tx(q) > 0) { - virtio_queue_set_notification(q->tx_vq, 0); - qemu_bh_schedule(q->tx_bh); - q->tx_waiting = 1; - } -} - -static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue, int ctrl) -{ - VirtIODevice *vdev = &n->vdev; - int i, max = multiqueue ? n->max_queues : 1; - - n->multiqueue = multiqueue; - - for (i = 2; i <= n->max_queues * 2 + 1; i++) { - virtio_del_queue(vdev, i); - } - - for (i = 1; i < max; i++) { - n->vqs[i].rx_vq = virtio_add_queue(vdev, 256, virtio_net_handle_rx); - if (n->vqs[i].tx_timer) { - n->vqs[i].tx_vq = - virtio_add_queue(vdev, 256, virtio_net_handle_tx_timer); - n->vqs[i].tx_timer = qemu_new_timer_ns(vm_clock, - virtio_net_tx_timer, - &n->vqs[i]); - } else { - n->vqs[i].tx_vq = - virtio_add_queue(vdev, 256, virtio_net_handle_tx_bh); - n->vqs[i].tx_bh = qemu_bh_new(virtio_net_tx_bh, &n->vqs[i]); - } - - n->vqs[i].tx_waiting = 0; - n->vqs[i].n = n; - } - - if (ctrl) { - n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl); - } - - virtio_net_set_queues(n); -} - -static void virtio_net_save(QEMUFile *f, void *opaque) -{ - int i; - VirtIONet *n = opaque; - - /* At this point, backend must be stopped, otherwise - * it might keep writing to memory. 
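[Skeleton of the reschedule heuristic in virtio_net_tx_bh() above: after a full burst, assume the guest kept the ring busy and run again rather than re-enabling notifications. flush_tx() and schedule_bh() are stand-ins for virtio_net_flush_tx() and qemu_bh_schedule().]

#include <stdbool.h>
#include <stdio.h>

#define TX_BURST 256                            /* n->tx_burst stand-in */

static bool rescheduled;
static int flush_tx(void) { return TX_BURST; }  /* pretend a full burst */
static void schedule_bh(void) { rescheduled = true; }

static void tx_bh(void)
{
    int ret = flush_tx();
    if (ret >= TX_BURST) {
        /* a full burst went out: assume more is queued, run again */
        schedule_bh();
        return;
    }
    /* otherwise: re-enable notification and do one catch-up flush */
}

int main(void)
{
    tx_bh();
    printf("rescheduled: %d\n", rescheduled);
    return 0;
}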
*/ - assert(!n->vhost_started); - virtio_save(&n->vdev, f); - - qemu_put_buffer(f, n->mac, ETH_ALEN); - qemu_put_be32(f, n->vqs[0].tx_waiting); - qemu_put_be32(f, n->mergeable_rx_bufs); - qemu_put_be16(f, n->status); - qemu_put_byte(f, n->promisc); - qemu_put_byte(f, n->allmulti); - qemu_put_be32(f, n->mac_table.in_use); - qemu_put_buffer(f, n->mac_table.macs, n->mac_table.in_use * ETH_ALEN); - qemu_put_buffer(f, (uint8_t *)n->vlans, MAX_VLAN >> 3); - qemu_put_be32(f, n->has_vnet_hdr); - qemu_put_byte(f, n->mac_table.multi_overflow); - qemu_put_byte(f, n->mac_table.uni_overflow); - qemu_put_byte(f, n->alluni); - qemu_put_byte(f, n->nomulti); - qemu_put_byte(f, n->nouni); - qemu_put_byte(f, n->nobcast); - qemu_put_byte(f, n->has_ufo); - if (n->max_queues > 1) { - qemu_put_be16(f, n->max_queues); - qemu_put_be16(f, n->curr_queues); - for (i = 1; i < n->curr_queues; i++) { - qemu_put_be32(f, n->vqs[i].tx_waiting); - } - } -} - -static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) -{ - VirtIONet *n = opaque; - int ret, i, link_down; - - if (version_id < 2 || version_id > VIRTIO_NET_VM_VERSION) - return -EINVAL; - - ret = virtio_load(&n->vdev, f); - if (ret) { - return ret; - } - - qemu_get_buffer(f, n->mac, ETH_ALEN); - n->vqs[0].tx_waiting = qemu_get_be32(f); - - virtio_net_set_mrg_rx_bufs(n, qemu_get_be32(f)); - - if (version_id >= 3) - n->status = qemu_get_be16(f); - - if (version_id >= 4) { - if (version_id < 8) { - n->promisc = qemu_get_be32(f); - n->allmulti = qemu_get_be32(f); - } else { - n->promisc = qemu_get_byte(f); - n->allmulti = qemu_get_byte(f); - } - } - - if (version_id >= 5) { - n->mac_table.in_use = qemu_get_be32(f); - /* MAC_TABLE_ENTRIES may be different from the saved image */ - if (n->mac_table.in_use <= MAC_TABLE_ENTRIES) { - qemu_get_buffer(f, n->mac_table.macs, - n->mac_table.in_use * ETH_ALEN); - } else if (n->mac_table.in_use) { - uint8_t *buf = g_malloc0(n->mac_table.in_use); - qemu_get_buffer(f, buf, n->mac_table.in_use * ETH_ALEN); - g_free(buf); - n->mac_table.multi_overflow = n->mac_table.uni_overflow = 1; - n->mac_table.in_use = 0; - } - } - - if (version_id >= 6) - qemu_get_buffer(f, (uint8_t *)n->vlans, MAX_VLAN >> 3); - - if (version_id >= 7) { - if (qemu_get_be32(f) && !peer_has_vnet_hdr(n)) { - error_report("virtio-net: saved image requires vnet_hdr=on"); - return -1; - } - - if (n->has_vnet_hdr) { - tap_set_offload(qemu_get_queue(n->nic)->peer, - (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_CSUM) & 1, - (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_TSO4) & 1, - (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_TSO6) & 1, - (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_ECN) & 1, - (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_UFO) & 1); - } - } - - if (version_id >= 9) { - n->mac_table.multi_overflow = qemu_get_byte(f); - n->mac_table.uni_overflow = qemu_get_byte(f); - } - - if (version_id >= 10) { - n->alluni = qemu_get_byte(f); - n->nomulti = qemu_get_byte(f); - n->nouni = qemu_get_byte(f); - n->nobcast = qemu_get_byte(f); - } - - if (version_id >= 11) { - if (qemu_get_byte(f) && !peer_has_ufo(n)) { - error_report("virtio-net: saved image requires TUN_F_UFO support"); - return -1; - } - } - - if (n->max_queues > 1) { - if (n->max_queues != qemu_get_be16(f)) { - error_report("virtio-net: different max_queues "); - return -1; - } - - n->curr_queues = qemu_get_be16(f); - for (i = 1; i < n->curr_queues; i++) { - n->vqs[i].tx_waiting = qemu_get_be32(f); - } - } - - virtio_net_set_queues(n); - - /* Find the first multicast entry in the saved MAC 
filter */ - for (i = 0; i < n->mac_table.in_use; i++) { - if (n->mac_table.macs[i * ETH_ALEN] & 1) { - break; - } - } - n->mac_table.first_multi = i; - - /* nc.link_down can't be migrated, so infer link_down according - * to link status bit in n->status */ - link_down = (n->status & VIRTIO_NET_S_LINK_UP) == 0; - for (i = 0; i < n->max_queues; i++) { - qemu_get_subqueue(n->nic, i)->link_down = link_down; - } - - return 0; -} - -static void virtio_net_cleanup(NetClientState *nc) -{ - VirtIONet *n = qemu_get_nic_opaque(nc); - - n->nic = NULL; -} - -static NetClientInfo net_virtio_info = { - .type = NET_CLIENT_OPTIONS_KIND_NIC, - .size = sizeof(NICState), - .can_receive = virtio_net_can_receive, - .receive = virtio_net_receive, - .cleanup = virtio_net_cleanup, - .link_status_changed = virtio_net_set_link_status, -}; - -static bool virtio_net_guest_notifier_pending(VirtIODevice *vdev, int idx) -{ - VirtIONet *n = to_virtio_net(vdev); - NetClientState *nc = qemu_get_subqueue(n->nic, vq2q(idx)); - assert(n->vhost_started); - return vhost_net_virtqueue_pending(tap_get_vhost_net(nc->peer), idx); -} - -static void virtio_net_guest_notifier_mask(VirtIODevice *vdev, int idx, - bool mask) -{ - VirtIONet *n = to_virtio_net(vdev); - NetClientState *nc = qemu_get_subqueue(n->nic, vq2q(idx)); - assert(n->vhost_started); - vhost_net_virtqueue_mask(tap_get_vhost_net(nc->peer), - vdev, idx, mask); -} - -VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf, - virtio_net_conf *net, uint32_t host_features) -{ - VirtIONet *n; - int i, config_size = 0; - - for (i = 0; feature_sizes[i].flags != 0; i++) { - if (host_features & feature_sizes[i].flags) { - config_size = MAX(feature_sizes[i].end, config_size); - } - } - - n = (VirtIONet *)virtio_common_init("virtio-net", VIRTIO_ID_NET, - config_size, sizeof(VirtIONet)); - - n->config_size = config_size; - n->vdev.get_config = virtio_net_get_config; - n->vdev.set_config = virtio_net_set_config; - n->vdev.get_features = virtio_net_get_features; - n->vdev.set_features = virtio_net_set_features; - n->vdev.bad_features = virtio_net_bad_features; - n->vdev.reset = virtio_net_reset; - n->vdev.set_status = virtio_net_set_status; - n->vdev.guest_notifier_mask = virtio_net_guest_notifier_mask; - n->vdev.guest_notifier_pending = virtio_net_guest_notifier_pending; - n->max_queues = MAX(conf->queues, 1); - n->vqs = g_malloc0(sizeof(VirtIONetQueue) * n->max_queues); - n->vqs[0].rx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_rx); - n->curr_queues = 1; - n->vqs[0].n = n; - n->tx_timeout = net->txtimer; - - if (net->tx && strcmp(net->tx, "timer") && strcmp(net->tx, "bh")) { - error_report("virtio-net: " - "Unknown option tx=%s, valid options: \"timer\" \"bh\"", - net->tx); - error_report("Defaulting to \"bh\""); - } - - if (net->tx && !strcmp(net->tx, "timer")) { - n->vqs[0].tx_vq = virtio_add_queue(&n->vdev, 256, - virtio_net_handle_tx_timer); - n->vqs[0].tx_timer = qemu_new_timer_ns(vm_clock, virtio_net_tx_timer, - &n->vqs[0]); - } else { - n->vqs[0].tx_vq = virtio_add_queue(&n->vdev, 256, - virtio_net_handle_tx_bh); - n->vqs[0].tx_bh = qemu_bh_new(virtio_net_tx_bh, &n->vqs[0]); - } - n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl); - qemu_macaddr_default_if_unset(&conf->macaddr); - memcpy(&n->mac[0], &conf->macaddr, sizeof(n->mac)); - n->status = VIRTIO_NET_S_LINK_UP; - - n->nic = qemu_new_nic(&net_virtio_info, conf, object_get_typename(OBJECT(dev)), dev->id, n); - peer_test_vnet_hdr(n); - if (peer_has_vnet_hdr(n)) { - for (i = 0; i < 
n->max_queues; i++) { - tap_using_vnet_hdr(qemu_get_subqueue(n->nic, i)->peer, true); - } - n->host_hdr_len = sizeof(struct virtio_net_hdr); - } else { - n->host_hdr_len = 0; - } - - qemu_format_nic_info_str(qemu_get_queue(n->nic), conf->macaddr.a); - - n->vqs[0].tx_waiting = 0; - n->tx_burst = net->txburst; - virtio_net_set_mrg_rx_bufs(n, 0); - n->promisc = 1; /* for compatibility */ - - n->mac_table.macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN); - - n->vlans = g_malloc0(MAX_VLAN >> 3); - - n->qdev = dev; - register_savevm(dev, "virtio-net", -1, VIRTIO_NET_VM_VERSION, - virtio_net_save, virtio_net_load, n); - - add_boot_device_path(conf->bootindex, dev, "/ethernet-phy@0"); - - return &n->vdev; -} - -void virtio_net_exit(VirtIODevice *vdev) -{ - VirtIONet *n = DO_UPCAST(VirtIONet, vdev, vdev); - int i; - - /* This will stop vhost backend if appropriate. */ - virtio_net_set_status(vdev, 0); - - unregister_savevm(n->qdev, "virtio-net", n); - - g_free(n->mac_table.macs); - g_free(n->vlans); - - for (i = 0; i < n->max_queues; i++) { - VirtIONetQueue *q = &n->vqs[i]; - NetClientState *nc = qemu_get_subqueue(n->nic, i); - - qemu_purge_queued_packets(nc); - - if (q->tx_timer) { - qemu_del_timer(q->tx_timer); - qemu_free_timer(q->tx_timer); - } else { - qemu_bh_delete(q->tx_bh); - } - } - - g_free(n->vqs); - qemu_del_nic(n->nic); - virtio_cleanup(&n->vdev); -} diff --git a/hw/virtio-scsi.c b/hw/virtio-scsi.c deleted file mode 100644 index ead7cda13d..0000000000 --- a/hw/virtio-scsi.c +++ /dev/null @@ -1,774 +0,0 @@ -/* - * Virtio SCSI HBA - * - * Copyright IBM, Corp. 2010 - * Copyright Red Hat, Inc. 2011 - * - * Authors: - * Stefan Hajnoczi - * Paolo Bonzini - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - * - */ - -#include "hw/virtio/virtio-scsi.h" -#include "qemu/error-report.h" -#include -#include -#include - -#define VIRTIO_SCSI_VQ_SIZE 128 -#define VIRTIO_SCSI_CDB_SIZE 32 -#define VIRTIO_SCSI_SENSE_SIZE 96 -#define VIRTIO_SCSI_MAX_CHANNEL 0 -#define VIRTIO_SCSI_MAX_TARGET 255 -#define VIRTIO_SCSI_MAX_LUN 16383 - -/* Response codes */ -#define VIRTIO_SCSI_S_OK 0 -#define VIRTIO_SCSI_S_OVERRUN 1 -#define VIRTIO_SCSI_S_ABORTED 2 -#define VIRTIO_SCSI_S_BAD_TARGET 3 -#define VIRTIO_SCSI_S_RESET 4 -#define VIRTIO_SCSI_S_BUSY 5 -#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 -#define VIRTIO_SCSI_S_TARGET_FAILURE 7 -#define VIRTIO_SCSI_S_NEXUS_FAILURE 8 -#define VIRTIO_SCSI_S_FAILURE 9 -#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 -#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 -#define VIRTIO_SCSI_S_INCORRECT_LUN 12 - -/* Controlq type codes. */ -#define VIRTIO_SCSI_T_TMF 0 -#define VIRTIO_SCSI_T_AN_QUERY 1 -#define VIRTIO_SCSI_T_AN_SUBSCRIBE 2 - -/* Valid TMF subtypes. */ -#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 -#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 -#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 -#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 -#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 -#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 -#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 -#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 - -/* Events. 
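[The 8-byte LUN field used throughout the structures below decodes as in virtio_scsi_get_lun() further down: byte 0 is 1, byte 1 the target, bytes 2-3 the LUN in flat encoding. A standalone version; the target and LUN values are arbitrary.]

#include <stdint.h>
#include <stdio.h>

static int get_lun(const uint8_t *lun)
{
    return ((lun[2] << 8) | lun[3]) & 0x3FFF;
}

int main(void)
{
    uint8_t lun[8] = {1, 3, 0x40 | 0x01, 0x2A};   /* target 3, LUN 298 */
    printf("target %u, lun %d\n", lun[1], get_lun(lun));
    return 0;
}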
*/ -#define VIRTIO_SCSI_T_EVENTS_MISSED 0x80000000 -#define VIRTIO_SCSI_T_NO_EVENT 0 -#define VIRTIO_SCSI_T_TRANSPORT_RESET 1 -#define VIRTIO_SCSI_T_ASYNC_NOTIFY 2 -#define VIRTIO_SCSI_T_PARAM_CHANGE 3 - -/* Reasons for transport reset event */ -#define VIRTIO_SCSI_EVT_RESET_HARD 0 -#define VIRTIO_SCSI_EVT_RESET_RESCAN 1 -#define VIRTIO_SCSI_EVT_RESET_REMOVED 2 - -/* SCSI command request, followed by data-out */ -typedef struct { - uint8_t lun[8]; /* Logical Unit Number */ - uint64_t tag; /* Command identifier */ - uint8_t task_attr; /* Task attribute */ - uint8_t prio; - uint8_t crn; - uint8_t cdb[]; -} QEMU_PACKED VirtIOSCSICmdReq; - -/* Response, followed by sense data and data-in */ -typedef struct { - uint32_t sense_len; /* Sense data length */ - uint32_t resid; /* Residual bytes in data buffer */ - uint16_t status_qualifier; /* Status qualifier */ - uint8_t status; /* Command completion status */ - uint8_t response; /* Response values */ - uint8_t sense[]; -} QEMU_PACKED VirtIOSCSICmdResp; - -/* Task Management Request */ -typedef struct { - uint32_t type; - uint32_t subtype; - uint8_t lun[8]; - uint64_t tag; -} QEMU_PACKED VirtIOSCSICtrlTMFReq; - -typedef struct { - uint8_t response; -} QEMU_PACKED VirtIOSCSICtrlTMFResp; - -/* Asynchronous notification query/subscription */ -typedef struct { - uint32_t type; - uint8_t lun[8]; - uint32_t event_requested; -} QEMU_PACKED VirtIOSCSICtrlANReq; - -typedef struct { - uint32_t event_actual; - uint8_t response; -} QEMU_PACKED VirtIOSCSICtrlANResp; - -typedef struct { - uint32_t event; - uint8_t lun[8]; - uint32_t reason; -} QEMU_PACKED VirtIOSCSIEvent; - -typedef struct { - uint32_t num_queues; - uint32_t seg_max; - uint32_t max_sectors; - uint32_t cmd_per_lun; - uint32_t event_info_size; - uint32_t sense_size; - uint32_t cdb_size; - uint16_t max_channel; - uint16_t max_target; - uint32_t max_lun; -} QEMU_PACKED VirtIOSCSIConfig; - -typedef struct VirtIOSCSIReq { - VirtIOSCSI *dev; - VirtQueue *vq; - VirtQueueElement elem; - QEMUSGList qsgl; - SCSIRequest *sreq; - union { - char *buf; - VirtIOSCSICmdReq *cmd; - VirtIOSCSICtrlTMFReq *tmf; - VirtIOSCSICtrlANReq *an; - } req; - union { - char *buf; - VirtIOSCSICmdResp *cmd; - VirtIOSCSICtrlTMFResp *tmf; - VirtIOSCSICtrlANResp *an; - VirtIOSCSIEvent *event; - } resp; -} VirtIOSCSIReq; - -static inline int virtio_scsi_get_lun(uint8_t *lun) -{ - return ((lun[2] << 8) | lun[3]) & 0x3FFF; -} - -static inline SCSIDevice *virtio_scsi_device_find(VirtIOSCSI *s, uint8_t *lun) -{ - if (lun[0] != 1) { - return NULL; - } - if (lun[2] != 0 && !(lun[2] >= 0x40 && lun[2] < 0x80)) { - return NULL; - } - return scsi_device_find(&s->bus, 0, lun[1], virtio_scsi_get_lun(lun)); -} - -static void virtio_scsi_complete_req(VirtIOSCSIReq *req) -{ - VirtIOSCSI *s = req->dev; - VirtQueue *vq = req->vq; - VirtIODevice *vdev = VIRTIO_DEVICE(s); - virtqueue_push(vq, &req->elem, req->qsgl.size + req->elem.in_sg[0].iov_len); - qemu_sglist_destroy(&req->qsgl); - if (req->sreq) { - req->sreq->hba_private = NULL; - scsi_req_unref(req->sreq); - } - g_free(req); - virtio_notify(vdev, vq); -} - -static void virtio_scsi_bad_req(void) -{ - error_report("wrong size for virtio-scsi headers"); - exit(1); -} - -static void qemu_sgl_init_external(QEMUSGList *qsgl, struct iovec *sg, - hwaddr *addr, int num) -{ - qemu_sglist_init(qsgl, num, &dma_context_memory); - while (num--) { - qemu_sglist_add(qsgl, *(addr++), (sg++)->iov_len); - } -} - -static void virtio_scsi_parse_req(VirtIOSCSI *s, VirtQueue *vq, - VirtIOSCSIReq *req) -{ - 
assert(req->elem.in_num); - req->vq = vq; - req->dev = s; - req->sreq = NULL; - if (req->elem.out_num) { - req->req.buf = req->elem.out_sg[0].iov_base; - } - req->resp.buf = req->elem.in_sg[0].iov_base; - - if (req->elem.out_num > 1) { - qemu_sgl_init_external(&req->qsgl, &req->elem.out_sg[1], - &req->elem.out_addr[1], - req->elem.out_num - 1); - } else { - qemu_sgl_init_external(&req->qsgl, &req->elem.in_sg[1], - &req->elem.in_addr[1], - req->elem.in_num - 1); - } -} - -static VirtIOSCSIReq *virtio_scsi_pop_req(VirtIOSCSI *s, VirtQueue *vq) -{ - VirtIOSCSIReq *req; - req = g_malloc(sizeof(*req)); - if (!virtqueue_pop(vq, &req->elem)) { - g_free(req); - return NULL; - } - - virtio_scsi_parse_req(s, vq, req); - return req; -} - -static void virtio_scsi_save_request(QEMUFile *f, SCSIRequest *sreq) -{ - VirtIOSCSIReq *req = sreq->hba_private; - uint32_t n = virtio_queue_get_id(req->vq) - 2; - - assert(n < req->dev->conf.num_queues); - qemu_put_be32s(f, &n); - qemu_put_buffer(f, (unsigned char *)&req->elem, sizeof(req->elem)); -} - -static void *virtio_scsi_load_request(QEMUFile *f, SCSIRequest *sreq) -{ - SCSIBus *bus = sreq->bus; - VirtIOSCSI *s = container_of(bus, VirtIOSCSI, bus); - VirtIOSCSIReq *req; - uint32_t n; - - req = g_malloc(sizeof(*req)); - qemu_get_be32s(f, &n); - assert(n < s->conf.num_queues); - qemu_get_buffer(f, (unsigned char *)&req->elem, sizeof(req->elem)); - virtio_scsi_parse_req(s, s->cmd_vqs[n], req); - - scsi_req_ref(sreq); - req->sreq = sreq; - if (req->sreq->cmd.mode != SCSI_XFER_NONE) { - int req_mode = - (req->elem.in_num > 1 ? SCSI_XFER_FROM_DEV : SCSI_XFER_TO_DEV); - - assert(req->sreq->cmd.mode == req_mode); - } - return req; -} - -static void virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req) -{ - SCSIDevice *d = virtio_scsi_device_find(s, req->req.tmf->lun); - SCSIRequest *r, *next; - BusChild *kid; - int target; - - /* Here VIRTIO_SCSI_S_OK means "FUNCTION COMPLETE". */ - req->resp.tmf->response = VIRTIO_SCSI_S_OK; - - switch (req->req.tmf->subtype) { - case VIRTIO_SCSI_T_TMF_ABORT_TASK: - case VIRTIO_SCSI_T_TMF_QUERY_TASK: - if (!d) { - goto fail; - } - if (d->lun != virtio_scsi_get_lun(req->req.tmf->lun)) { - goto incorrect_lun; - } - QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) { - VirtIOSCSIReq *cmd_req = r->hba_private; - if (cmd_req && cmd_req->req.cmd->tag == req->req.tmf->tag) { - break; - } - } - if (r) { - /* - * Assert that the request has not been completed yet, we - * check for it in the loop above. - */ - assert(r->hba_private); - if (req->req.tmf->subtype == VIRTIO_SCSI_T_TMF_QUERY_TASK) { - /* "If the specified command is present in the task set, then - * return a service response set to FUNCTION SUCCEEDED". 
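[virtio_scsi_parse_req() above relies on the convention that element 0 of each direction carries the fixed header and everything after it is bulk data. A toy split over a plain POSIX iovec array; the buffer sizes are arbitrary.]

#include <stdio.h>
#include <sys/uio.h>

int main(void)
{
    char hdr[32], data1[512], data2[512];
    struct iovec out_sg[3] = {
        { hdr,   sizeof(hdr)   },   /* fixed request header */
        { data1, sizeof(data1) },   /* data-out, fragment 1 */
        { data2, sizeof(data2) },   /* data-out, fragment 2 */
    };
    size_t payload = 0;

    for (int i = 1; i < 3; i++) {   /* everything after element 0 */
        payload += out_sg[i].iov_len;
    }
    printf("header %zu bytes, payload %zu bytes\n",
           out_sg[0].iov_len, payload);
    return 0;
}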
- */ - req->resp.tmf->response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED; - } else { - scsi_req_cancel(r); - } - } - break; - - case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: - if (!d) { - goto fail; - } - if (d->lun != virtio_scsi_get_lun(req->req.tmf->lun)) { - goto incorrect_lun; - } - s->resetting++; - qdev_reset_all(&d->qdev); - s->resetting--; - break; - - case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: - case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: - case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET: - if (!d) { - goto fail; - } - if (d->lun != virtio_scsi_get_lun(req->req.tmf->lun)) { - goto incorrect_lun; - } - QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) { - if (r->hba_private) { - if (req->req.tmf->subtype == VIRTIO_SCSI_T_TMF_QUERY_TASK_SET) { - /* "If there is any command present in the task set, then - * return a service response set to FUNCTION SUCCEEDED". - */ - req->resp.tmf->response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED; - break; - } else { - scsi_req_cancel(r); - } - } - } - break; - - case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: - target = req->req.tmf->lun[1]; - s->resetting++; - QTAILQ_FOREACH(kid, &s->bus.qbus.children, sibling) { - d = DO_UPCAST(SCSIDevice, qdev, kid->child); - if (d->channel == 0 && d->id == target) { - qdev_reset_all(&d->qdev); - } - } - s->resetting--; - break; - - case VIRTIO_SCSI_T_TMF_CLEAR_ACA: - default: - req->resp.tmf->response = VIRTIO_SCSI_S_FUNCTION_REJECTED; - break; - } - - return; - -incorrect_lun: - req->resp.tmf->response = VIRTIO_SCSI_S_INCORRECT_LUN; - return; - -fail: - req->resp.tmf->response = VIRTIO_SCSI_S_BAD_TARGET; -} - -static void virtio_scsi_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) -{ - VirtIOSCSI *s = (VirtIOSCSI *)vdev; - VirtIOSCSIReq *req; - - while ((req = virtio_scsi_pop_req(s, vq))) { - int out_size, in_size; - if (req->elem.out_num < 1 || req->elem.in_num < 1) { - virtio_scsi_bad_req(); - continue; - } - - out_size = req->elem.out_sg[0].iov_len; - in_size = req->elem.in_sg[0].iov_len; - if (req->req.tmf->type == VIRTIO_SCSI_T_TMF) { - if (out_size < sizeof(VirtIOSCSICtrlTMFReq) || - in_size < sizeof(VirtIOSCSICtrlTMFResp)) { - virtio_scsi_bad_req(); - } - virtio_scsi_do_tmf(s, req); - - } else if (req->req.tmf->type == VIRTIO_SCSI_T_AN_QUERY || - req->req.tmf->type == VIRTIO_SCSI_T_AN_SUBSCRIBE) { - if (out_size < sizeof(VirtIOSCSICtrlANReq) || - in_size < sizeof(VirtIOSCSICtrlANResp)) { - virtio_scsi_bad_req(); - } - req->resp.an->event_actual = 0; - req->resp.an->response = VIRTIO_SCSI_S_OK; - } - virtio_scsi_complete_req(req); - } -} - -static void virtio_scsi_command_complete(SCSIRequest *r, uint32_t status, - size_t resid) -{ - VirtIOSCSIReq *req = r->hba_private; - uint32_t sense_len; - - req->resp.cmd->response = VIRTIO_SCSI_S_OK; - req->resp.cmd->status = status; - if (req->resp.cmd->status == GOOD) { - req->resp.cmd->resid = tswap32(resid); - } else { - req->resp.cmd->resid = 0; - sense_len = scsi_req_get_sense(r, req->resp.cmd->sense, - VIRTIO_SCSI_SENSE_SIZE); - req->resp.cmd->sense_len = tswap32(sense_len); - } - virtio_scsi_complete_req(req); -} - -static QEMUSGList *virtio_scsi_get_sg_list(SCSIRequest *r) -{ - VirtIOSCSIReq *req = r->hba_private; - - return &req->qsgl; -} - -static void virtio_scsi_request_cancelled(SCSIRequest *r) -{ - VirtIOSCSIReq *req = r->hba_private; - - if (!req) { - return; - } - if (req->dev->resetting) { - req->resp.cmd->response = VIRTIO_SCSI_S_RESET; - } else { - req->resp.cmd->response = VIRTIO_SCSI_S_ABORTED; - } - virtio_scsi_complete_req(req); -} - -static void virtio_scsi_fail_cmd_req(VirtIOSCSIReq 
*req) -{ - req->resp.cmd->response = VIRTIO_SCSI_S_FAILURE; - virtio_scsi_complete_req(req); -} - -static void virtio_scsi_handle_cmd(VirtIODevice *vdev, VirtQueue *vq) -{ - VirtIOSCSI *s = (VirtIOSCSI *)vdev; - VirtIOSCSIReq *req; - int n; - - while ((req = virtio_scsi_pop_req(s, vq))) { - SCSIDevice *d; - int out_size, in_size; - if (req->elem.out_num < 1 || req->elem.in_num < 1) { - virtio_scsi_bad_req(); - } - - out_size = req->elem.out_sg[0].iov_len; - in_size = req->elem.in_sg[0].iov_len; - if (out_size < sizeof(VirtIOSCSICmdReq) + s->cdb_size || - in_size < sizeof(VirtIOSCSICmdResp) + s->sense_size) { - virtio_scsi_bad_req(); - } - - if (req->elem.out_num > 1 && req->elem.in_num > 1) { - virtio_scsi_fail_cmd_req(req); - continue; - } - - d = virtio_scsi_device_find(s, req->req.cmd->lun); - if (!d) { - req->resp.cmd->response = VIRTIO_SCSI_S_BAD_TARGET; - virtio_scsi_complete_req(req); - continue; - } - req->sreq = scsi_req_new(d, req->req.cmd->tag, - virtio_scsi_get_lun(req->req.cmd->lun), - req->req.cmd->cdb, req); - - if (req->sreq->cmd.mode != SCSI_XFER_NONE) { - int req_mode = - (req->elem.in_num > 1 ? SCSI_XFER_FROM_DEV : SCSI_XFER_TO_DEV); - - if (req->sreq->cmd.mode != req_mode || - req->sreq->cmd.xfer > req->qsgl.size) { - req->resp.cmd->response = VIRTIO_SCSI_S_OVERRUN; - virtio_scsi_complete_req(req); - continue; - } - } - - n = scsi_req_enqueue(req->sreq); - if (n) { - scsi_req_continue(req->sreq); - } - } -} - -static void virtio_scsi_get_config(VirtIODevice *vdev, - uint8_t *config) -{ - VirtIOSCSIConfig *scsiconf = (VirtIOSCSIConfig *)config; - VirtIOSCSI *s = (VirtIOSCSI *)vdev; - - stl_raw(&scsiconf->num_queues, s->conf.num_queues); - stl_raw(&scsiconf->seg_max, 128 - 2); - stl_raw(&scsiconf->max_sectors, s->conf.max_sectors); - stl_raw(&scsiconf->cmd_per_lun, s->conf.cmd_per_lun); - stl_raw(&scsiconf->event_info_size, sizeof(VirtIOSCSIEvent)); - stl_raw(&scsiconf->sense_size, s->sense_size); - stl_raw(&scsiconf->cdb_size, s->cdb_size); - stw_raw(&scsiconf->max_channel, VIRTIO_SCSI_MAX_CHANNEL); - stw_raw(&scsiconf->max_target, VIRTIO_SCSI_MAX_TARGET); - stl_raw(&scsiconf->max_lun, VIRTIO_SCSI_MAX_LUN); -} - -static void virtio_scsi_set_config(VirtIODevice *vdev, - const uint8_t *config) -{ - VirtIOSCSIConfig *scsiconf = (VirtIOSCSIConfig *)config; - VirtIOSCSI *s = (VirtIOSCSI *)vdev; - - if ((uint32_t) ldl_raw(&scsiconf->sense_size) >= 65536 || - (uint32_t) ldl_raw(&scsiconf->cdb_size) >= 256) { - error_report("bad data written to virtio-scsi configuration space"); - exit(1); - } - - s->sense_size = ldl_raw(&scsiconf->sense_size); - s->cdb_size = ldl_raw(&scsiconf->cdb_size); -} - -static uint32_t virtio_scsi_get_features(VirtIODevice *vdev, - uint32_t requested_features) -{ - return requested_features; -} - -static void virtio_scsi_reset(VirtIODevice *vdev) -{ - VirtIOSCSI *s = (VirtIOSCSI *)vdev; - - s->resetting++; - qbus_reset_all(&s->bus.qbus); - s->resetting--; - - s->sense_size = VIRTIO_SCSI_SENSE_SIZE; - s->cdb_size = VIRTIO_SCSI_CDB_SIZE; - s->events_dropped = false; -} - -/* The device does not have anything to save beyond the virtio data. - * Request data is saved with callbacks from SCSI devices. 
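[The bounds check in virtio_scsi_set_config() above, reduced to a standalone sketch: reject out-of-range values before they reach device state. ldl() is a little-endian load standing in for ldl_raw() and assumes a little-endian host.]

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static uint32_t ldl(const uint8_t *p)
{
    uint32_t v;
    memcpy(&v, p, sizeof(v));
    return v;
}

int main(void)
{
    uint8_t config[8] = {96, 0, 0, 0, 32, 0, 0, 0};  /* sense=96, cdb=32 */
    uint32_t sense_size = ldl(config);
    uint32_t cdb_size = ldl(config + 4);

    if (sense_size >= 65536 || cdb_size >= 256) {
        fprintf(stderr, "bad data written to configuration space\n");
        exit(1);
    }
    printf("sense_size %u, cdb_size %u\n", sense_size, cdb_size);
    return 0;
}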
- */ -static void virtio_scsi_save(QEMUFile *f, void *opaque) -{ - VirtIODevice *vdev = VIRTIO_DEVICE(opaque); - virtio_save(vdev, f); -} - -static int virtio_scsi_load(QEMUFile *f, void *opaque, int version_id) -{ - VirtIODevice *vdev = VIRTIO_DEVICE(opaque); - int ret; - - ret = virtio_load(vdev, f); - if (ret) { - return ret; - } - return 0; -} - -static void virtio_scsi_push_event(VirtIOSCSI *s, SCSIDevice *dev, - uint32_t event, uint32_t reason) -{ - VirtIOSCSIReq *req = virtio_scsi_pop_req(s, s->event_vq); - VirtIOSCSIEvent *evt; - VirtIODevice *vdev = VIRTIO_DEVICE(s); - int in_size; - - if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) { - return; - } - - if (!req) { - s->events_dropped = true; - return; - } - - if (req->elem.out_num || req->elem.in_num != 1) { - virtio_scsi_bad_req(); - } - - if (s->events_dropped) { - event |= VIRTIO_SCSI_T_EVENTS_MISSED; - s->events_dropped = false; - } - - in_size = req->elem.in_sg[0].iov_len; - if (in_size < sizeof(VirtIOSCSIEvent)) { - virtio_scsi_bad_req(); - } - - evt = req->resp.event; - memset(evt, 0, sizeof(VirtIOSCSIEvent)); - evt->event = event; - evt->reason = reason; - if (!dev) { - assert(event == VIRTIO_SCSI_T_NO_EVENT); - } else { - evt->lun[0] = 1; - evt->lun[1] = dev->id; - - /* Linux wants us to keep the same encoding we use for REPORT LUNS. */ - if (dev->lun >= 256) { - evt->lun[2] = (dev->lun >> 8) | 0x40; - } - evt->lun[3] = dev->lun & 0xFF; - } - virtio_scsi_complete_req(req); -} - -static void virtio_scsi_handle_event(VirtIODevice *vdev, VirtQueue *vq) -{ - VirtIOSCSI *s = VIRTIO_SCSI(vdev); - - if (s->events_dropped) { - virtio_scsi_push_event(s, NULL, VIRTIO_SCSI_T_NO_EVENT, 0); - } -} - -static void virtio_scsi_change(SCSIBus *bus, SCSIDevice *dev, SCSISense sense) -{ - VirtIOSCSI *s = container_of(bus, VirtIOSCSI, bus); - VirtIODevice *vdev = VIRTIO_DEVICE(s); - - if (((vdev->guest_features >> VIRTIO_SCSI_F_CHANGE) & 1) && - dev->type != TYPE_ROM) { - virtio_scsi_push_event(s, dev, VIRTIO_SCSI_T_PARAM_CHANGE, - sense.asc | (sense.ascq << 8)); - } -} - -static void virtio_scsi_hotplug(SCSIBus *bus, SCSIDevice *dev) -{ - VirtIOSCSI *s = container_of(bus, VirtIOSCSI, bus); - VirtIODevice *vdev = VIRTIO_DEVICE(s); - - if ((vdev->guest_features >> VIRTIO_SCSI_F_HOTPLUG) & 1) { - virtio_scsi_push_event(s, dev, VIRTIO_SCSI_T_TRANSPORT_RESET, - VIRTIO_SCSI_EVT_RESET_RESCAN); - } -} - -static void virtio_scsi_hot_unplug(SCSIBus *bus, SCSIDevice *dev) -{ - VirtIOSCSI *s = container_of(bus, VirtIOSCSI, bus); - VirtIODevice *vdev = VIRTIO_DEVICE(s); - - if ((vdev->guest_features >> VIRTIO_SCSI_F_HOTPLUG) & 1) { - virtio_scsi_push_event(s, dev, VIRTIO_SCSI_T_TRANSPORT_RESET, - VIRTIO_SCSI_EVT_RESET_REMOVED); - } -} - -static struct SCSIBusInfo virtio_scsi_scsi_info = { - .tcq = true, - .max_channel = VIRTIO_SCSI_MAX_CHANNEL, - .max_target = VIRTIO_SCSI_MAX_TARGET, - .max_lun = VIRTIO_SCSI_MAX_LUN, - - .complete = virtio_scsi_command_complete, - .cancel = virtio_scsi_request_cancelled, - .change = virtio_scsi_change, - .hotplug = virtio_scsi_hotplug, - .hot_unplug = virtio_scsi_hot_unplug, - .get_sg_list = virtio_scsi_get_sg_list, - .save_request = virtio_scsi_save_request, - .load_request = virtio_scsi_load_request, -}; - -static int virtio_scsi_device_init(VirtIODevice *vdev) -{ - DeviceState *qdev = DEVICE(vdev); - VirtIOSCSI *s = VIRTIO_SCSI(vdev); - static int virtio_scsi_id; - int i; - - virtio_init(VIRTIO_DEVICE(s), "virtio-scsi", VIRTIO_ID_SCSI, - sizeof(VirtIOSCSIConfig)); - - s->cmd_vqs = g_malloc0(s->conf.num_queues * 
sizeof(VirtQueue *)); - - /* TODO set up vdev function pointers */ - vdev->get_config = virtio_scsi_get_config; - vdev->set_config = virtio_scsi_set_config; - vdev->get_features = virtio_scsi_get_features; - vdev->reset = virtio_scsi_reset; - - s->ctrl_vq = virtio_add_queue(vdev, VIRTIO_SCSI_VQ_SIZE, - virtio_scsi_handle_ctrl); - s->event_vq = virtio_add_queue(vdev, VIRTIO_SCSI_VQ_SIZE, - virtio_scsi_handle_event); - for (i = 0; i < s->conf.num_queues; i++) { - s->cmd_vqs[i] = virtio_add_queue(vdev, VIRTIO_SCSI_VQ_SIZE, - virtio_scsi_handle_cmd); - } - - scsi_bus_new(&s->bus, qdev, &virtio_scsi_scsi_info); - if (!qdev->hotplugged) { - scsi_bus_legacy_handle_cmdline(&s->bus); - } - - register_savevm(qdev, "virtio-scsi", virtio_scsi_id++, 1, - virtio_scsi_save, virtio_scsi_load, s); - - return 0; -} - -static int virtio_scsi_device_exit(DeviceState *qdev) -{ - VirtIOSCSI *s = VIRTIO_SCSI(qdev); - VirtIODevice *vdev = VIRTIO_DEVICE(qdev); - - unregister_savevm(qdev, "virtio-scsi", s); - g_free(s->cmd_vqs); - virtio_common_cleanup(vdev); - return 0; -} - -static Property virtio_scsi_properties[] = { - DEFINE_VIRTIO_SCSI_PROPERTIES(VirtIOSCSI, conf), - DEFINE_PROP_END_OF_LIST(), -}; - -static void virtio_scsi_class_init(ObjectClass *klass, void *data) -{ - DeviceClass *dc = DEVICE_CLASS(klass); - VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); - dc->exit = virtio_scsi_device_exit; - dc->props = virtio_scsi_properties; - vdc->init = virtio_scsi_device_init; - vdc->get_config = virtio_scsi_get_config; - vdc->set_config = virtio_scsi_set_config; - vdc->get_features = virtio_scsi_get_features; - vdc->reset = virtio_scsi_reset; -} - -static const TypeInfo virtio_scsi_info = { - .name = TYPE_VIRTIO_SCSI, - .parent = TYPE_VIRTIO_DEVICE, - .instance_size = sizeof(VirtIOSCSI), - .class_init = virtio_scsi_class_init, -}; - -static void virtio_register_types(void) -{ - type_register_static(&virtio_scsi_info); -} - -type_init(virtio_register_types) diff --git a/hw/virtio-serial-bus.c b/hw/virtio-serial-bus.c deleted file mode 100644 index 1dba8ab2c6..0000000000 --- a/hw/virtio-serial-bus.c +++ /dev/null @@ -1,1018 +0,0 @@ -/* - * A bus for connecting virtio serial and console ports - * - * Copyright (C) 2009, 2010 Red Hat, Inc. - * - * Author(s): - * Amit Shah - * - * Some earlier parts are: - * Copyright IBM, Corp. 2008 - * authored by - * Christian Ehrhardt - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. 
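[For reference on virtio_scsi_push_event() earlier: the event LUN is encoded the same way REPORT LUNS encodes it, with the 0x40 flat-LUN marker in byte 2 once the LUN no longer fits in one byte. A runnable sketch with arbitrary target/LUN values.]

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void encode_event_lun(uint8_t out[8], uint8_t target, uint16_t lun)
{
    memset(out, 0, 8);
    out[0] = 1;
    out[1] = target;
    if (lun >= 256) {
        out[2] = (lun >> 8) | 0x40;   /* flat-LUN marker, as REPORT LUNS */
    }
    out[3] = lun & 0xFF;
}

int main(void)
{
    uint8_t lun[8];
    encode_event_lun(lun, 2, 300);
    printf("%02x %02x %02x %02x\n", lun[0], lun[1], lun[2], lun[3]);
    return 0;
}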
- */ - -#include "qemu/iov.h" -#include "monitor/monitor.h" -#include "qemu/queue.h" -#include "hw/sysbus.h" -#include "trace.h" -#include "hw/virtio/virtio-serial.h" - -static VirtIOSerialPort *find_port_by_id(VirtIOSerial *vser, uint32_t id) -{ - VirtIOSerialPort *port; - - if (id == VIRTIO_CONSOLE_BAD_ID) { - return NULL; - } - - QTAILQ_FOREACH(port, &vser->ports, next) { - if (port->id == id) - return port; - } - return NULL; -} - -static VirtIOSerialPort *find_port_by_vq(VirtIOSerial *vser, VirtQueue *vq) -{ - VirtIOSerialPort *port; - - QTAILQ_FOREACH(port, &vser->ports, next) { - if (port->ivq == vq || port->ovq == vq) - return port; - } - return NULL; -} - -static bool use_multiport(VirtIOSerial *vser) -{ - return vser->vdev.guest_features & (1 << VIRTIO_CONSOLE_F_MULTIPORT); -} - -static size_t write_to_port(VirtIOSerialPort *port, - const uint8_t *buf, size_t size) -{ - VirtQueueElement elem; - VirtQueue *vq; - size_t offset; - - vq = port->ivq; - if (!virtio_queue_ready(vq)) { - return 0; - } - - offset = 0; - while (offset < size) { - size_t len; - - if (!virtqueue_pop(vq, &elem)) { - break; - } - - len = iov_from_buf(elem.in_sg, elem.in_num, 0, - buf + offset, size - offset); - offset += len; - - virtqueue_push(vq, &elem, len); - } - - virtio_notify(&port->vser->vdev, vq); - return offset; -} - -static void discard_vq_data(VirtQueue *vq, VirtIODevice *vdev) -{ - VirtQueueElement elem; - - if (!virtio_queue_ready(vq)) { - return; - } - while (virtqueue_pop(vq, &elem)) { - virtqueue_push(vq, &elem, 0); - } - virtio_notify(vdev, vq); -} - -static void do_flush_queued_data(VirtIOSerialPort *port, VirtQueue *vq, - VirtIODevice *vdev) -{ - VirtIOSerialPortClass *vsc; - - assert(port); - assert(virtio_queue_ready(vq)); - - vsc = VIRTIO_SERIAL_PORT_GET_CLASS(port); - - while (!port->throttled) { - unsigned int i; - - /* Pop an elem only if we haven't left off a previous one mid-way */ - if (!port->elem.out_num) { - if (!virtqueue_pop(vq, &port->elem)) { - break; - } - port->iov_idx = 0; - port->iov_offset = 0; - } - - for (i = port->iov_idx; i < port->elem.out_num; i++) { - size_t buf_size; - ssize_t ret; - - buf_size = port->elem.out_sg[i].iov_len - port->iov_offset; - ret = vsc->have_data(port, - port->elem.out_sg[i].iov_base - + port->iov_offset, - buf_size); - if (port->throttled) { - port->iov_idx = i; - if (ret > 0) { - port->iov_offset += ret; - } - break; - } - port->iov_offset = 0; - } - if (port->throttled) { - break; - } - virtqueue_push(vq, &port->elem, 0); - port->elem.out_num = 0; - } - virtio_notify(vdev, vq); -} - -static void flush_queued_data(VirtIOSerialPort *port) -{ - assert(port); - - if (!virtio_queue_ready(port->ovq)) { - return; - } - do_flush_queued_data(port, port->ovq, &port->vser->vdev); -} - -static size_t send_control_msg(VirtIOSerial *vser, void *buf, size_t len) -{ - VirtQueueElement elem; - VirtQueue *vq; - - vq = vser->c_ivq; - if (!virtio_queue_ready(vq)) { - return 0; - } - if (!virtqueue_pop(vq, &elem)) { - return 0; - } - - memcpy(elem.in_sg[0].iov_base, buf, len); - - virtqueue_push(vq, &elem, len); - virtio_notify(&vser->vdev, vq); - return len; -} - -static size_t send_control_event(VirtIOSerial *vser, uint32_t port_id, - uint16_t event, uint16_t value) -{ - struct virtio_console_control cpkt; - - stl_p(&cpkt.id, port_id); - stw_p(&cpkt.event, event); - stw_p(&cpkt.value, value); - - trace_virtio_serial_send_control_event(port_id, event, value); - return send_control_msg(vser, &cpkt, sizeof(cpkt)); -} - -/* Functions for use inside qemu to 
open and read from/write to ports */ -int virtio_serial_open(VirtIOSerialPort *port) -{ - /* Don't allow opening an already-open port */ - if (port->host_connected) { - return 0; - } - /* Send port open notification to the guest */ - port->host_connected = true; - send_control_event(port->vser, port->id, VIRTIO_CONSOLE_PORT_OPEN, 1); - - return 0; -} - -int virtio_serial_close(VirtIOSerialPort *port) -{ - port->host_connected = false; - /* - * If there's any data the guest sent which the app didn't - * consume, reset the throttling flag and discard the data. - */ - port->throttled = false; - discard_vq_data(port->ovq, &port->vser->vdev); - - send_control_event(port->vser, port->id, VIRTIO_CONSOLE_PORT_OPEN, 0); - - return 0; -} - -/* Individual ports/apps call this function to write to the guest. */ -ssize_t virtio_serial_write(VirtIOSerialPort *port, const uint8_t *buf, - size_t size) -{ - if (!port || !port->host_connected || !port->guest_connected) { - return 0; - } - return write_to_port(port, buf, size); -} - -/* - * Readiness of the guest to accept data on a port. - * Returns max. data the guest can receive - */ -size_t virtio_serial_guest_ready(VirtIOSerialPort *port) -{ - VirtQueue *vq = port->ivq; - unsigned int bytes; - - if (!virtio_queue_ready(vq) || - !(port->vser->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK) || - virtio_queue_empty(vq)) { - return 0; - } - if (use_multiport(port->vser) && !port->guest_connected) { - return 0; - } - virtqueue_get_avail_bytes(vq, &bytes, NULL, 4096, 0); - return bytes; -} - -static void flush_queued_data_bh(void *opaque) -{ - VirtIOSerialPort *port = opaque; - - flush_queued_data(port); -} - -void virtio_serial_throttle_port(VirtIOSerialPort *port, bool throttle) -{ - if (!port) { - return; - } - - trace_virtio_serial_throttle_port(port->id, throttle); - port->throttled = throttle; - if (throttle) { - return; - } - qemu_bh_schedule(port->bh); -} - -/* Guest wants to notify us of some event */ -static void handle_control_message(VirtIOSerial *vser, void *buf, size_t len) -{ - struct VirtIOSerialPort *port; - VirtIOSerialPortClass *vsc; - struct virtio_console_control cpkt, *gcpkt; - uint8_t *buffer; - size_t buffer_len; - - gcpkt = buf; - - if (len < sizeof(cpkt)) { - /* The guest sent an invalid control packet */ - return; - } - - cpkt.event = lduw_p(&gcpkt->event); - cpkt.value = lduw_p(&gcpkt->value); - - trace_virtio_serial_handle_control_message(cpkt.event, cpkt.value); - - if (cpkt.event == VIRTIO_CONSOLE_DEVICE_READY) { - if (!cpkt.value) { - error_report("virtio-serial-bus: Guest failure in adding device %s", - vser->bus.qbus.name); - return; - } - /* - * The device is up, we can now tell the device about all the - * ports we have here. - */ - QTAILQ_FOREACH(port, &vser->ports, next) { - send_control_event(vser, port->id, VIRTIO_CONSOLE_PORT_ADD, 1); - } - return; - } - - port = find_port_by_id(vser, ldl_p(&gcpkt->id)); - if (!port) { - error_report("virtio-serial-bus: Unexpected port id %u for device %s", - ldl_p(&gcpkt->id), vser->bus.qbus.name); - return; - } - - trace_virtio_serial_handle_control_message_port(port->id); - - vsc = VIRTIO_SERIAL_PORT_GET_CLASS(port); - - switch(cpkt.event) { - case VIRTIO_CONSOLE_PORT_READY: - if (!cpkt.value) { - error_report("virtio-serial-bus: Guest failure in adding port %u for device %s", - port->id, vser->bus.qbus.name); - break; - } - /* - * Now that we know the guest asked for the port name, we're - * sure the guest has initialised whatever state is necessary - * for this port. 
Now's a good time to let the guest know if - * this port is a console port so that the guest can hook it - * up to hvc. - */ - if (vsc->is_console) { - send_control_event(vser, port->id, VIRTIO_CONSOLE_CONSOLE_PORT, 1); - } - - if (port->name) { - stl_p(&cpkt.id, port->id); - stw_p(&cpkt.event, VIRTIO_CONSOLE_PORT_NAME); - stw_p(&cpkt.value, 1); - - buffer_len = sizeof(cpkt) + strlen(port->name) + 1; - buffer = g_malloc(buffer_len); - - memcpy(buffer, &cpkt, sizeof(cpkt)); - memcpy(buffer + sizeof(cpkt), port->name, strlen(port->name)); - buffer[buffer_len - 1] = 0; - - send_control_msg(vser, buffer, buffer_len); - g_free(buffer); - } - - if (port->host_connected) { - send_control_event(vser, port->id, VIRTIO_CONSOLE_PORT_OPEN, 1); - } - - /* - * When the guest has asked us for this information it means - * the guest is all setup and has its virtqueues - * initialised. If some app is interested in knowing about - * this event, let it know. - */ - if (vsc->guest_ready) { - vsc->guest_ready(port); - } - break; - - case VIRTIO_CONSOLE_PORT_OPEN: - port->guest_connected = cpkt.value; - if (vsc->set_guest_connected) { - /* Send the guest opened notification if an app is interested */ - vsc->set_guest_connected(port, cpkt.value); - } - break; - } -} - -static void control_in(VirtIODevice *vdev, VirtQueue *vq) -{ -} - -static void control_out(VirtIODevice *vdev, VirtQueue *vq) -{ - VirtQueueElement elem; - VirtIOSerial *vser; - uint8_t *buf; - size_t len; - - vser = DO_UPCAST(VirtIOSerial, vdev, vdev); - - len = 0; - buf = NULL; - while (virtqueue_pop(vq, &elem)) { - size_t cur_len; - - cur_len = iov_size(elem.out_sg, elem.out_num); - /* - * Allocate a new buf only if we didn't have one previously or - * if the size of the buf differs - */ - if (cur_len > len) { - g_free(buf); - - buf = g_malloc(cur_len); - len = cur_len; - } - iov_to_buf(elem.out_sg, elem.out_num, 0, buf, cur_len); - - handle_control_message(vser, buf, cur_len); - virtqueue_push(vq, &elem, 0); - } - g_free(buf); - virtio_notify(vdev, vq); -} - -/* Guest wrote something to some port. 
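send_control_msg() and handle_control_message() above exchange struct virtio_console_control packets whose fields are stored with the guest-endian helpers stw_p()/stl_p(). A sketch of the PORT_NAME reply assembled in the PORT_READY branch (it assumes the QEMU tree for g_malloc() and the virtio-console definitions; build_port_name_pkt is an illustrative name):

    /* Fixed header in guest byte order, then the NUL-terminated name. */
    static size_t build_port_name_pkt(uint8_t **out, uint32_t id,
                                      const char *name)
    {
        struct virtio_console_control cpkt;
        size_t len = sizeof(cpkt) + strlen(name) + 1;
        uint8_t *buf = g_malloc(len);

        stl_p(&cpkt.id, id);
        stw_p(&cpkt.event, VIRTIO_CONSOLE_PORT_NAME);
        stw_p(&cpkt.value, 1);

        memcpy(buf, &cpkt, sizeof(cpkt));
        memcpy(buf + sizeof(cpkt), name, strlen(name) + 1);
        *out = buf;
        return len;    /* pass to send_control_msg(), then g_free() */
    }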
*/ -static void handle_output(VirtIODevice *vdev, VirtQueue *vq) -{ - VirtIOSerial *vser; - VirtIOSerialPort *port; - - vser = DO_UPCAST(VirtIOSerial, vdev, vdev); - port = find_port_by_vq(vser, vq); - - if (!port || !port->host_connected) { - discard_vq_data(vq, vdev); - return; - } - - if (!port->throttled) { - do_flush_queued_data(port, vq, vdev); - return; - } -} - -static void handle_input(VirtIODevice *vdev, VirtQueue *vq) -{ -} - -static uint32_t get_features(VirtIODevice *vdev, uint32_t features) -{ - VirtIOSerial *vser; - - vser = DO_UPCAST(VirtIOSerial, vdev, vdev); - - if (vser->bus.max_nr_ports > 1) { - features |= (1 << VIRTIO_CONSOLE_F_MULTIPORT); - } - return features; -} - -/* Guest requested config info */ -static void get_config(VirtIODevice *vdev, uint8_t *config_data) -{ - VirtIOSerial *vser; - - vser = DO_UPCAST(VirtIOSerial, vdev, vdev); - memcpy(config_data, &vser->config, sizeof(struct virtio_console_config)); -} - -static void set_config(VirtIODevice *vdev, const uint8_t *config_data) -{ - struct virtio_console_config config; - - memcpy(&config, config_data, sizeof(config)); -} - -static void guest_reset(VirtIOSerial *vser) -{ - VirtIOSerialPort *port; - VirtIOSerialPortClass *vsc; - - QTAILQ_FOREACH(port, &vser->ports, next) { - vsc = VIRTIO_SERIAL_PORT_GET_CLASS(port); - if (port->guest_connected) { - port->guest_connected = false; - if (vsc->set_guest_connected) { - vsc->set_guest_connected(port, false); - } - } - } -} - -static void set_status(VirtIODevice *vdev, uint8_t status) -{ - VirtIOSerial *vser; - VirtIOSerialPort *port; - - vser = DO_UPCAST(VirtIOSerial, vdev, vdev); - port = find_port_by_id(vser, 0); - - if (port && !use_multiport(port->vser) - && (status & VIRTIO_CONFIG_S_DRIVER_OK)) { - /* - * Non-multiport guests won't be able to tell us guest - * open/close status. Such guests can only have a port at id - * 0, so set guest_connected for such ports as soon as guest - * is up. - */ - port->guest_connected = true; - } - if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) { - guest_reset(vser); - } -} - -static void vser_reset(VirtIODevice *vdev) -{ - VirtIOSerial *vser; - - vser = DO_UPCAST(VirtIOSerial, vdev, vdev); - guest_reset(vser); -} - -static void virtio_serial_save(QEMUFile *f, void *opaque) -{ - VirtIOSerial *s = opaque; - VirtIOSerialPort *port; - uint32_t nr_active_ports; - unsigned int i, max_nr_ports; - - /* The virtio device */ - virtio_save(&s->vdev, f); - - /* The config space */ - qemu_put_be16s(f, &s->config.cols); - qemu_put_be16s(f, &s->config.rows); - - qemu_put_be32s(f, &s->config.max_nr_ports); - - /* The ports map */ - max_nr_ports = tswap32(s->config.max_nr_ports); - for (i = 0; i < (max_nr_ports + 31) / 32; i++) { - qemu_put_be32s(f, &s->ports_map[i]); - } - - /* Ports */ - - nr_active_ports = 0; - QTAILQ_FOREACH(port, &s->ports, next) { - nr_active_ports++; - } - - qemu_put_be32s(f, &nr_active_ports); - - /* - * Items in struct VirtIOSerialPort. 
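virtio_serial_save() above serializes the ports bitmap as (max_nr_ports + 31) / 32 big-endian 32-bit words, one bit per port id; virtio_serial_load() later compares the words one by one. A standalone sketch of that sizing and of the bit set/test operations used on the map elsewhere in this file:

    #include <stdint.h>
    #include <stdio.h>

    #define MAP_WORDS(nr) (((nr) + 31) / 32)

    static void map_set(uint32_t *map, uint32_t id)
    {
        map[id / 32] |= 1U << (id % 32);
    }

    static int map_test(const uint32_t *map, uint32_t id)
    {
        return (map[id / 32] >> (id % 32)) & 1;
    }

    int main(void)
    {
        uint32_t map[MAP_WORDS(40)] = { 0 };  /* 40 ports -> 2 words */
        map_set(map, 0);                      /* port 0: console */
        map_set(map, 33);                     /* lands in word 1, bit 1 */
        printf("words=%zu bit33=%d\n", sizeof(map) / 4, map_test(map, 33));
        return 0;                             /* words=2 bit33=1 */
    }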
- */ - QTAILQ_FOREACH(port, &s->ports, next) { - uint32_t elem_popped; - - qemu_put_be32s(f, &port->id); - qemu_put_byte(f, port->guest_connected); - qemu_put_byte(f, port->host_connected); - - elem_popped = 0; - if (port->elem.out_num) { - elem_popped = 1; - } - qemu_put_be32s(f, &elem_popped); - if (elem_popped) { - qemu_put_be32s(f, &port->iov_idx); - qemu_put_be64s(f, &port->iov_offset); - - qemu_put_buffer(f, (unsigned char *)&port->elem, - sizeof(port->elem)); - } - } -} - -static void virtio_serial_post_load_timer_cb(void *opaque) -{ - uint32_t i; - VirtIOSerial *s = opaque; - VirtIOSerialPort *port; - uint8_t host_connected; - VirtIOSerialPortClass *vsc; - - if (!s->post_load) { - return; - } - for (i = 0 ; i < s->post_load->nr_active_ports; ++i) { - port = s->post_load->connected[i].port; - host_connected = s->post_load->connected[i].host_connected; - if (host_connected != port->host_connected) { - /* - * We have to let the guest know of the host connection - * status change - */ - send_control_event(s, port->id, VIRTIO_CONSOLE_PORT_OPEN, - port->host_connected); - } - vsc = VIRTIO_SERIAL_PORT_GET_CLASS(port); - if (vsc->set_guest_connected) { - vsc->set_guest_connected(port, port->guest_connected); - } - } - g_free(s->post_load->connected); - qemu_free_timer(s->post_load->timer); - g_free(s->post_load); - s->post_load = NULL; -} - -static int fetch_active_ports_list(QEMUFile *f, int version_id, - VirtIOSerial *s, uint32_t nr_active_ports) -{ - uint32_t i; - - s->post_load = g_malloc0(sizeof(*s->post_load)); - s->post_load->nr_active_ports = nr_active_ports; - s->post_load->connected = - g_malloc0(sizeof(*s->post_load->connected) * nr_active_ports); - - s->post_load->timer = qemu_new_timer_ns(vm_clock, - virtio_serial_post_load_timer_cb, - s); - - /* Items in struct VirtIOSerialPort */ - for (i = 0; i < nr_active_ports; i++) { - VirtIOSerialPort *port; - uint32_t id; - - id = qemu_get_be32(f); - port = find_port_by_id(s, id); - if (!port) { - return -EINVAL; - } - - port->guest_connected = qemu_get_byte(f); - s->post_load->connected[i].port = port; - s->post_load->connected[i].host_connected = qemu_get_byte(f); - - if (version_id > 2) { - uint32_t elem_popped; - - qemu_get_be32s(f, &elem_popped); - if (elem_popped) { - qemu_get_be32s(f, &port->iov_idx); - qemu_get_be64s(f, &port->iov_offset); - - qemu_get_buffer(f, (unsigned char *)&port->elem, - sizeof(port->elem)); - virtqueue_map_sg(port->elem.in_sg, port->elem.in_addr, - port->elem.in_num, 1); - virtqueue_map_sg(port->elem.out_sg, port->elem.out_addr, - port->elem.out_num, 1); - - /* - * Port was throttled on source machine. Let's - * unthrottle it here so data starts flowing again. - */ - virtio_serial_throttle_port(port, false); - } - } - } - qemu_mod_timer(s->post_load->timer, 1); - return 0; -} - -static int virtio_serial_load(QEMUFile *f, void *opaque, int version_id) -{ - VirtIOSerial *s = opaque; - uint32_t max_nr_ports, nr_active_ports, ports_map; - unsigned int i; - int ret; - - if (version_id > 3) { - return -EINVAL; - } - - /* The virtio device */ - ret = virtio_load(&s->vdev, f); - if (ret) { - return ret; - } - - if (version_id < 2) { - return 0; - } - - /* The config space */ - qemu_get_be16s(f, &s->config.cols); - qemu_get_be16s(f, &s->config.rows); - - qemu_get_be32s(f, &max_nr_ports); - tswap32s(&max_nr_ports); - if (max_nr_ports > tswap32(s->config.max_nr_ports)) { - /* Source could have had more ports than us. Fail migration. 
*/ - return -EINVAL; - } - - for (i = 0; i < (max_nr_ports + 31) / 32; i++) { - qemu_get_be32s(f, &ports_map); - - if (ports_map != s->ports_map[i]) { - /* - * Ports active on source and destination don't - * match. Fail migration. - */ - return -EINVAL; - } - } - - qemu_get_be32s(f, &nr_active_ports); - - if (nr_active_ports) { - ret = fetch_active_ports_list(f, version_id, s, nr_active_ports); - if (ret) { - return ret; - } - } - return 0; -} - -static void virtser_bus_dev_print(Monitor *mon, DeviceState *qdev, int indent); - -static Property virtser_props[] = { - DEFINE_PROP_UINT32("nr", VirtIOSerialPort, id, VIRTIO_CONSOLE_BAD_ID), - DEFINE_PROP_STRING("name", VirtIOSerialPort, name), - DEFINE_PROP_END_OF_LIST() -}; - -#define TYPE_VIRTIO_SERIAL_BUS "virtio-serial-bus" -#define VIRTIO_SERIAL_BUS(obj) \ - OBJECT_CHECK(VirtIOSerialBus, (obj), TYPE_VIRTIO_SERIAL_BUS) - -static void virtser_bus_class_init(ObjectClass *klass, void *data) -{ - BusClass *k = BUS_CLASS(klass); - k->print_dev = virtser_bus_dev_print; -} - -static const TypeInfo virtser_bus_info = { - .name = TYPE_VIRTIO_SERIAL_BUS, - .parent = TYPE_BUS, - .instance_size = sizeof(VirtIOSerialBus), - .class_init = virtser_bus_class_init, -}; - -static void virtser_bus_dev_print(Monitor *mon, DeviceState *qdev, int indent) -{ - VirtIOSerialPort *port = DO_UPCAST(VirtIOSerialPort, dev, qdev); - - monitor_printf(mon, "%*sport %d, guest %s, host %s, throttle %s\n", - indent, "", port->id, - port->guest_connected ? "on" : "off", - port->host_connected ? "on" : "off", - port->throttled ? "on" : "off"); -} - -/* This function is only used if a port id is not provided by the user */ -static uint32_t find_free_port_id(VirtIOSerial *vser) -{ - unsigned int i, max_nr_ports; - - max_nr_ports = tswap32(vser->config.max_nr_ports); - for (i = 0; i < (max_nr_ports + 31) / 32; i++) { - uint32_t map, bit; - - map = vser->ports_map[i]; - bit = ffs(~map); - if (bit) { - return (bit - 1) + i * 32; - } - } - return VIRTIO_CONSOLE_BAD_ID; -} - -static void mark_port_added(VirtIOSerial *vser, uint32_t port_id) -{ - unsigned int i; - - i = port_id / 32; - vser->ports_map[i] |= 1U << (port_id % 32); -} - -static void add_port(VirtIOSerial *vser, uint32_t port_id) -{ - mark_port_added(vser, port_id); - send_control_event(vser, port_id, VIRTIO_CONSOLE_PORT_ADD, 1); -} - -static void remove_port(VirtIOSerial *vser, uint32_t port_id) -{ - VirtIOSerialPort *port; - unsigned int i; - - i = port_id / 32; - vser->ports_map[i] &= ~(1U << (port_id % 32)); - - port = find_port_by_id(vser, port_id); - /* - * This function is only called from qdev's unplug callback; if we - * get a NULL port here, we're in trouble. - */ - assert(port); - - /* Flush out any unconsumed buffers first */ - discard_vq_data(port->ovq, &port->vser->vdev); - - send_control_event(vser, port->id, VIRTIO_CONSOLE_PORT_REMOVE, 1); -} - -static int virtser_port_qdev_init(DeviceState *qdev) -{ - VirtIOSerialPort *port = DO_UPCAST(VirtIOSerialPort, dev, qdev); - VirtIOSerialPortClass *vsc = VIRTIO_SERIAL_PORT_GET_CLASS(port); - VirtIOSerialBus *bus = DO_UPCAST(VirtIOSerialBus, qbus, qdev->parent_bus); - int ret, max_nr_ports; - bool plugging_port0; - - port->vser = bus->vser; - port->bh = qemu_bh_new(flush_queued_data_bh, port); - - assert(vsc->have_data); - - /* - * Is the first console port we're seeing? If so, put it up at - * location 0. This is done for backward compatibility (old - * kernel, new qemu). 
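find_free_port_id() above scans each word with ffs(~map): inverting the word turns free ids into set bits, and ffs() returns the 1-based position of the lowest set bit, or 0 when the word is full. A standalone check of that arithmetic:

    #include <stdint.h>
    #include <stdio.h>
    #include <strings.h>            /* ffs() */

    int main(void)
    {
        uint32_t map = 0x0000FFFF;  /* ids 0..15 taken in this word */
        int bit = ffs(~map);

        if (bit) {
            printf("first free id: %d\n", bit - 1);   /* prints 16 */
        }

        map = 0xFFFFFFFF;           /* full word: ~map == 0 */
        printf("full word -> %d\n", ffs(~map));       /* prints 0 */
        return 0;
    }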
- */ - plugging_port0 = vsc->is_console && !find_port_by_id(port->vser, 0); - - if (find_port_by_id(port->vser, port->id)) { - error_report("virtio-serial-bus: A port already exists at id %u", - port->id); - return -1; - } - - if (port->id == VIRTIO_CONSOLE_BAD_ID) { - if (plugging_port0) { - port->id = 0; - } else { - port->id = find_free_port_id(port->vser); - if (port->id == VIRTIO_CONSOLE_BAD_ID) { - error_report("virtio-serial-bus: Maximum port limit for this device reached"); - return -1; - } - } - } - - max_nr_ports = tswap32(port->vser->config.max_nr_ports); - if (port->id >= max_nr_ports) { - error_report("virtio-serial-bus: Out-of-range port id specified, max. allowed: %u", - max_nr_ports - 1); - return -1; - } - - ret = vsc->init(port); - if (ret) { - return ret; - } - - port->elem.out_num = 0; - - QTAILQ_INSERT_TAIL(&port->vser->ports, port, next); - port->ivq = port->vser->ivqs[port->id]; - port->ovq = port->vser->ovqs[port->id]; - - add_port(port->vser, port->id); - - /* Send an update to the guest about this new port added */ - virtio_notify_config(&port->vser->vdev); - - return ret; -} - -static int virtser_port_qdev_exit(DeviceState *qdev) -{ - VirtIOSerialPort *port = DO_UPCAST(VirtIOSerialPort, dev, qdev); - VirtIOSerialPortClass *vsc = VIRTIO_SERIAL_PORT_GET_CLASS(port); - VirtIOSerial *vser = port->vser; - - qemu_bh_delete(port->bh); - remove_port(port->vser, port->id); - - QTAILQ_REMOVE(&vser->ports, port, next); - - if (vsc->exit) { - vsc->exit(port); - } - return 0; -} - -VirtIODevice *virtio_serial_init(DeviceState *dev, virtio_serial_conf *conf) -{ - VirtIOSerial *vser; - VirtIODevice *vdev; - uint32_t i, max_supported_ports; - - if (!conf->max_virtserial_ports) - return NULL; - - /* Each port takes 2 queues, and one pair is for the control queue */ - max_supported_ports = VIRTIO_PCI_QUEUE_MAX / 2 - 1; - - if (conf->max_virtserial_ports > max_supported_ports) { - error_report("maximum ports supported: %u", max_supported_ports); - return NULL; - } - - vdev = virtio_common_init("virtio-serial", VIRTIO_ID_CONSOLE, - sizeof(struct virtio_console_config), - sizeof(VirtIOSerial)); - - vser = DO_UPCAST(VirtIOSerial, vdev, vdev); - - /* Spawn a new virtio-serial bus on which the ports will ride as devices */ - qbus_create_inplace(&vser->bus.qbus, TYPE_VIRTIO_SERIAL_BUS, dev, NULL); - vser->bus.qbus.allow_hotplug = 1; - vser->bus.vser = vser; - QTAILQ_INIT(&vser->ports); - - vser->bus.max_nr_ports = conf->max_virtserial_ports; - vser->ivqs = g_malloc(conf->max_virtserial_ports * sizeof(VirtQueue *)); - vser->ovqs = g_malloc(conf->max_virtserial_ports * sizeof(VirtQueue *)); - - /* Add a queue for host to guest transfers for port 0 (backward compat) */ - vser->ivqs[0] = virtio_add_queue(vdev, 128, handle_input); - /* Add a queue for guest to host transfers for port 0 (backward compat) */ - vser->ovqs[0] = virtio_add_queue(vdev, 128, handle_output); - - /* TODO: host to guest notifications can get dropped - * if the queue fills up. Implement queueing in host, - * this might also make it possible to reduce the control - * queue size: as guest preposts buffers there, - * this will save 4Kbyte of guest memory per entry. 
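The max_supported_ports bound in virtio_serial_init() above is pure pairing arithmetic: every port consumes an (ivq, ovq) pair, and one extra pair goes to the control queues. A standalone sketch, assuming VIRTIO_PCI_QUEUE_MAX is 64 as it was around this time:

    #include <stdio.h>

    #define VIRTIO_PCI_QUEUE_MAX 64    /* assumed contemporary value */

    int main(void)
    {
        unsigned max_ports = VIRTIO_PCI_QUEUE_MAX / 2 - 1;

        /* 31 ports plus c_ivq/c_ovq fill all 64 queue slots */
        printf("max ports %u, queues used %u\n",
               max_ports, 2 * max_ports + 2);
        return 0;
    }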
*/ - - /* control queue: host to guest */ - vser->c_ivq = virtio_add_queue(vdev, 32, control_in); - /* control queue: guest to host */ - vser->c_ovq = virtio_add_queue(vdev, 32, control_out); - - for (i = 1; i < vser->bus.max_nr_ports; i++) { - /* Add a per-port queue for host to guest transfers */ - vser->ivqs[i] = virtio_add_queue(vdev, 128, handle_input); - /* Add a per-port queue for guest to host transfers */ - vser->ovqs[i] = virtio_add_queue(vdev, 128, handle_output); - } - - vser->config.max_nr_ports = tswap32(conf->max_virtserial_ports); - vser->ports_map = g_malloc0(((conf->max_virtserial_ports + 31) / 32) - * sizeof(vser->ports_map[0])); - /* - * Reserve location 0 for a console port for backward compat - * (old kernel, new qemu) - */ - mark_port_added(vser, 0); - - vser->vdev.get_features = get_features; - vser->vdev.get_config = get_config; - vser->vdev.set_config = set_config; - vser->vdev.set_status = set_status; - vser->vdev.reset = vser_reset; - - vser->qdev = dev; - - vser->post_load = NULL; - - /* - * Register for the savevm section with the virtio-console name - * to preserve backward compat - */ - register_savevm(dev, "virtio-console", -1, 3, virtio_serial_save, - virtio_serial_load, vser); - - return vdev; -} - -void virtio_serial_exit(VirtIODevice *vdev) -{ - VirtIOSerial *vser = DO_UPCAST(VirtIOSerial, vdev, vdev); - - unregister_savevm(vser->qdev, "virtio-console", vser); - - g_free(vser->ivqs); - g_free(vser->ovqs); - g_free(vser->ports_map); - if (vser->post_load) { - g_free(vser->post_load->connected); - qemu_del_timer(vser->post_load->timer); - qemu_free_timer(vser->post_load->timer); - g_free(vser->post_load); - } - virtio_cleanup(vdev); -} - -static void virtio_serial_port_class_init(ObjectClass *klass, void *data) -{ - DeviceClass *k = DEVICE_CLASS(klass); - k->init = virtser_port_qdev_init; - k->bus_type = TYPE_VIRTIO_SERIAL_BUS; - k->exit = virtser_port_qdev_exit; - k->unplug = qdev_simple_unplug_cb; - k->props = virtser_props; -} - -static const TypeInfo virtio_serial_port_type_info = { - .name = TYPE_VIRTIO_SERIAL_PORT, - .parent = TYPE_DEVICE, - .instance_size = sizeof(VirtIOSerialPort), - .abstract = true, - .class_size = sizeof(VirtIOSerialPortClass), - .class_init = virtio_serial_port_class_init, -}; - -static void virtio_serial_register_types(void) -{ - type_register_static(&virtser_bus_info); - type_register_static(&virtio_serial_port_type_info); -} - -type_init(virtio_serial_register_types) diff --git a/hw/virtio.c b/hw/virtio.c deleted file mode 100644 index 1c2282c54f..0000000000 --- a/hw/virtio.c +++ /dev/null @@ -1,1121 +0,0 @@ -/* - * Virtio Support - * - * Copyright IBM, Corp. 2007 - * - * Authors: - * Anthony Liguori - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include <inttypes.h> - -#include "trace.h" -#include "qemu/error-report.h" -#include "hw/virtio/virtio.h" -#include "qemu/atomic.h" -#include "hw/virtio/virtio-bus.h" - -/* The alignment to use between consumer and producer parts of vring. - * x86 pagesize again. 
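virtqueue_init() just below derives all three vring areas from a single base address: a 16-byte descriptor table entry per slot, the avail ring directly after it, and the used ring rounded up to the 4096-byte boundary this comment describes. A standalone sketch of the layout for a 128-entry queue:

    #include <stdint.h>
    #include <stdio.h>

    #define VRING_ALIGN 4096
    #define ALIGN_UP(x, a) (((x) + (a) - 1) & ~(uint64_t)((a) - 1))

    int main(void)
    {
        unsigned num = 128;
        uint64_t pa    = 0x100000;          /* example base address */
        uint64_t desc  = pa;
        uint64_t avail = desc + num * 16;   /* after the descriptor table */
        /* avail is flags + idx (4 bytes) plus ring[num] (2 bytes each) */
        uint64_t used  = ALIGN_UP(avail + 4 + num * 2, VRING_ALIGN);

        printf("desc=%#llx avail=%#llx used=%#llx\n",
               (unsigned long long)desc, (unsigned long long)avail,
               (unsigned long long)used);
        return 0;   /* desc=0x100000 avail=0x100800 used=0x101000 */
    }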
*/ -#define VIRTIO_PCI_VRING_ALIGN 4096 - -typedef struct VRingDesc -{ - uint64_t addr; - uint32_t len; - uint16_t flags; - uint16_t next; -} VRingDesc; - -typedef struct VRingAvail -{ - uint16_t flags; - uint16_t idx; - uint16_t ring[0]; -} VRingAvail; - -typedef struct VRingUsedElem -{ - uint32_t id; - uint32_t len; -} VRingUsedElem; - -typedef struct VRingUsed -{ - uint16_t flags; - uint16_t idx; - VRingUsedElem ring[0]; -} VRingUsed; - -typedef struct VRing -{ - unsigned int num; - hwaddr desc; - hwaddr avail; - hwaddr used; -} VRing; - -struct VirtQueue -{ - VRing vring; - hwaddr pa; - uint16_t last_avail_idx; - /* Last used index value we have signalled on */ - uint16_t signalled_used; - - /* Last used index value we have signalled on */ - bool signalled_used_valid; - - /* Notification enabled? */ - bool notification; - - uint16_t queue_index; - - int inuse; - - uint16_t vector; - void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq); - VirtIODevice *vdev; - EventNotifier guest_notifier; - EventNotifier host_notifier; -}; - -/* virt queue functions */ -static void virtqueue_init(VirtQueue *vq) -{ - hwaddr pa = vq->pa; - - vq->vring.desc = pa; - vq->vring.avail = pa + vq->vring.num * sizeof(VRingDesc); - vq->vring.used = vring_align(vq->vring.avail + - offsetof(VRingAvail, ring[vq->vring.num]), - VIRTIO_PCI_VRING_ALIGN); -} - -static inline uint64_t vring_desc_addr(hwaddr desc_pa, int i) -{ - hwaddr pa; - pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr); - return ldq_phys(pa); -} - -static inline uint32_t vring_desc_len(hwaddr desc_pa, int i) -{ - hwaddr pa; - pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len); - return ldl_phys(pa); -} - -static inline uint16_t vring_desc_flags(hwaddr desc_pa, int i) -{ - hwaddr pa; - pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags); - return lduw_phys(pa); -} - -static inline uint16_t vring_desc_next(hwaddr desc_pa, int i) -{ - hwaddr pa; - pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next); - return lduw_phys(pa); -} - -static inline uint16_t vring_avail_flags(VirtQueue *vq) -{ - hwaddr pa; - pa = vq->vring.avail + offsetof(VRingAvail, flags); - return lduw_phys(pa); -} - -static inline uint16_t vring_avail_idx(VirtQueue *vq) -{ - hwaddr pa; - pa = vq->vring.avail + offsetof(VRingAvail, idx); - return lduw_phys(pa); -} - -static inline uint16_t vring_avail_ring(VirtQueue *vq, int i) -{ - hwaddr pa; - pa = vq->vring.avail + offsetof(VRingAvail, ring[i]); - return lduw_phys(pa); -} - -static inline uint16_t vring_used_event(VirtQueue *vq) -{ - return vring_avail_ring(vq, vq->vring.num); -} - -static inline void vring_used_ring_id(VirtQueue *vq, int i, uint32_t val) -{ - hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, ring[i].id); - stl_phys(pa, val); -} - -static inline void vring_used_ring_len(VirtQueue *vq, int i, uint32_t val) -{ - hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, ring[i].len); - stl_phys(pa, val); -} - -static uint16_t vring_used_idx(VirtQueue *vq) -{ - hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, idx); - return lduw_phys(pa); -} - -static inline void vring_used_idx_set(VirtQueue *vq, uint16_t val) -{ - hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, idx); - stw_phys(pa, val); -} - -static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask) -{ - hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, flags); - stw_phys(pa, lduw_phys(pa) | mask); -} - -static inline void vring_used_flags_unset_bit(VirtQueue *vq, int mask) -{ - 
hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, flags); - stw_phys(pa, lduw_phys(pa) & ~mask); -} - -static inline void vring_avail_event(VirtQueue *vq, uint16_t val) -{ - hwaddr pa; - if (!vq->notification) { - return; - } - pa = vq->vring.used + offsetof(VRingUsed, ring[vq->vring.num]); - stw_phys(pa, val); -} - -void virtio_queue_set_notification(VirtQueue *vq, int enable) -{ - vq->notification = enable; - if (vq->vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) { - vring_avail_event(vq, vring_avail_idx(vq)); - } else if (enable) { - vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY); - } else { - vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY); - } - if (enable) { - /* Expose avail event/used flags before caller checks the avail idx. */ - smp_mb(); - } -} - -int virtio_queue_ready(VirtQueue *vq) -{ - return vq->vring.avail != 0; -} - -int virtio_queue_empty(VirtQueue *vq) -{ - return vring_avail_idx(vq) == vq->last_avail_idx; -} - -void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, - unsigned int len, unsigned int idx) -{ - unsigned int offset; - int i; - - trace_virtqueue_fill(vq, elem, len, idx); - - offset = 0; - for (i = 0; i < elem->in_num; i++) { - size_t size = MIN(len - offset, elem->in_sg[i].iov_len); - - cpu_physical_memory_unmap(elem->in_sg[i].iov_base, - elem->in_sg[i].iov_len, - 1, size); - - offset += size; - } - - for (i = 0; i < elem->out_num; i++) - cpu_physical_memory_unmap(elem->out_sg[i].iov_base, - elem->out_sg[i].iov_len, - 0, elem->out_sg[i].iov_len); - - idx = (idx + vring_used_idx(vq)) % vq->vring.num; - - /* Get a pointer to the next entry in the used ring. */ - vring_used_ring_id(vq, idx, elem->index); - vring_used_ring_len(vq, idx, len); -} - -void virtqueue_flush(VirtQueue *vq, unsigned int count) -{ - uint16_t old, new; - /* Make sure buffer is written before we update index. */ - smp_wmb(); - trace_virtqueue_flush(vq, count); - old = vring_used_idx(vq); - new = old + count; - vring_used_idx_set(vq, new); - vq->inuse -= count; - if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) - vq->signalled_used_valid = false; -} - -void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem, - unsigned int len) -{ - virtqueue_fill(vq, elem, len, 0); - virtqueue_flush(vq, 1); -} - -static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx) -{ - uint16_t num_heads = vring_avail_idx(vq) - idx; - - /* Check it isn't doing very strange things with descriptor numbers. */ - if (num_heads > vq->vring.num) { - error_report("Guest moved used index from %u to %u", - idx, vring_avail_idx(vq)); - exit(1); - } - /* On success, callers read a descriptor at vq->last_avail_idx. - * Make sure descriptor read does not bypass avail index read. */ - if (num_heads) { - smp_rmb(); - } - - return num_heads; -} - -static unsigned int virtqueue_get_head(VirtQueue *vq, unsigned int idx) -{ - unsigned int head; - - /* Grab the next descriptor number they're advertising, and increment - * the index we've seen. */ - head = vring_avail_ring(vq, idx % vq->vring.num); - - /* If their number is silly, that's a fatal mistake. */ - if (head >= vq->vring.num) { - error_report("Guest says index %u is available", head); - exit(1); - } - - return head; -} - -static unsigned virtqueue_next_desc(hwaddr desc_pa, - unsigned int i, unsigned int max) -{ - unsigned int next; - - /* If this descriptor says it doesn't chain, we're done. 
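virtqueue_push() above is simply virtqueue_fill() followed by virtqueue_flush(); the two-step API exists so a device can stage several completions and publish the used index once. A sketch of that batched pattern (it assumes the QEMU tree; complete_batch and the elems/lens arrays are illustrative):

    /* Stage n completions at consecutive used-ring slots, then publish
     * them with a single write barrier and one (conditional) interrupt. */
    static void complete_batch(VirtIODevice *vdev, VirtQueue *vq,
                               VirtQueueElement *elems, unsigned *lens, int n)
    {
        int i;

        for (i = 0; i < n; i++) {
            virtqueue_fill(vq, &elems[i], lens[i], i);
        }
        virtqueue_flush(vq, n);    /* smp_wmb(), used->idx += n */
        virtio_notify(vdev, vq);   /* raises the guest IRQ only if needed */
    }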
*/ - if (!(vring_desc_flags(desc_pa, i) & VRING_DESC_F_NEXT)) - return max; - - /* Check they're not leading us off end of descriptors. */ - next = vring_desc_next(desc_pa, i); - /* Make sure compiler knows to grab that: we don't want it changing! */ - smp_wmb(); - - if (next >= max) { - error_report("Desc next is %u", next); - exit(1); - } - - return next; -} - -void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, - unsigned int *out_bytes, - unsigned max_in_bytes, unsigned max_out_bytes) -{ - unsigned int idx; - unsigned int total_bufs, in_total, out_total; - - idx = vq->last_avail_idx; - - total_bufs = in_total = out_total = 0; - while (virtqueue_num_heads(vq, idx)) { - unsigned int max, num_bufs, indirect = 0; - hwaddr desc_pa; - int i; - - max = vq->vring.num; - num_bufs = total_bufs; - i = virtqueue_get_head(vq, idx++); - desc_pa = vq->vring.desc; - - if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_INDIRECT) { - if (vring_desc_len(desc_pa, i) % sizeof(VRingDesc)) { - error_report("Invalid size for indirect buffer table"); - exit(1); - } - - /* If we've got too many, that implies a descriptor loop. */ - if (num_bufs >= max) { - error_report("Looped descriptor"); - exit(1); - } - - /* loop over the indirect descriptor table */ - indirect = 1; - max = vring_desc_len(desc_pa, i) / sizeof(VRingDesc); - num_bufs = i = 0; - desc_pa = vring_desc_addr(desc_pa, i); - } - - do { - /* If we've got too many, that implies a descriptor loop. */ - if (++num_bufs > max) { - error_report("Looped descriptor"); - exit(1); - } - - if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_WRITE) { - in_total += vring_desc_len(desc_pa, i); - } else { - out_total += vring_desc_len(desc_pa, i); - } - if (in_total >= max_in_bytes && out_total >= max_out_bytes) { - goto done; - } - } while ((i = virtqueue_next_desc(desc_pa, i, max)) != max); - - if (!indirect) - total_bufs = num_bufs; - else - total_bufs++; - } -done: - if (in_bytes) { - *in_bytes = in_total; - } - if (out_bytes) { - *out_bytes = out_total; - } -} - -int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, - unsigned int out_bytes) -{ - unsigned int in_total, out_total; - - virtqueue_get_avail_bytes(vq, &in_total, &out_total, in_bytes, out_bytes); - return in_bytes <= in_total && out_bytes <= out_total; -} - -void virtqueue_map_sg(struct iovec *sg, hwaddr *addr, - size_t num_sg, int is_write) -{ - unsigned int i; - hwaddr len; - - for (i = 0; i < num_sg; i++) { - len = sg[i].iov_len; - sg[i].iov_base = cpu_physical_memory_map(addr[i], &len, is_write); - if (sg[i].iov_base == NULL || len != sg[i].iov_len) { - error_report("virtio: trying to map MMIO memory"); - exit(1); - } - } -} - -int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem) -{ - unsigned int i, head, max; - hwaddr desc_pa = vq->vring.desc; - - if (!virtqueue_num_heads(vq, vq->last_avail_idx)) - return 0; - - /* When we start there are none of either input nor output. 
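virtqueue_get_avail_bytes() above caps its walk at the requested byte counts, so callers can ask cheap readiness questions, as virtio_serial_guest_ready() does with its 4096-byte cap. A sketch of both guard styles (it assumes the QEMU tree; both function names are illustrative):

    /* True when the guest has posted at least 'need' writable bytes. */
    static bool can_send_record(VirtQueue *vq, unsigned int need)
    {
        return virtqueue_avail_bytes(vq, need, 0);
    }

    /* How much we could write right now, counting no further than 4K. */
    static size_t receivable_bytes(VirtQueue *vq)
    {
        unsigned int in_bytes;

        virtqueue_get_avail_bytes(vq, &in_bytes, NULL, 4096, 0);
        return in_bytes;
    }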
*/ - elem->out_num = elem->in_num = 0; - - max = vq->vring.num; - - i = head = virtqueue_get_head(vq, vq->last_avail_idx++); - if (vq->vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) { - vring_avail_event(vq, vring_avail_idx(vq)); - } - - if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_INDIRECT) { - if (vring_desc_len(desc_pa, i) % sizeof(VRingDesc)) { - error_report("Invalid size for indirect buffer table"); - exit(1); - } - - /* loop over the indirect descriptor table */ - max = vring_desc_len(desc_pa, i) / sizeof(VRingDesc); - desc_pa = vring_desc_addr(desc_pa, i); - i = 0; - } - - /* Collect all the descriptors */ - do { - struct iovec *sg; - - if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_WRITE) { - if (elem->in_num >= ARRAY_SIZE(elem->in_sg)) { - error_report("Too many write descriptors in indirect table"); - exit(1); - } - elem->in_addr[elem->in_num] = vring_desc_addr(desc_pa, i); - sg = &elem->in_sg[elem->in_num++]; - } else { - if (elem->out_num >= ARRAY_SIZE(elem->out_sg)) { - error_report("Too many read descriptors in indirect table"); - exit(1); - } - elem->out_addr[elem->out_num] = vring_desc_addr(desc_pa, i); - sg = &elem->out_sg[elem->out_num++]; - } - - sg->iov_len = vring_desc_len(desc_pa, i); - - /* If we've got too many, that implies a descriptor loop. */ - if ((elem->in_num + elem->out_num) > max) { - error_report("Looped descriptor"); - exit(1); - } - } while ((i = virtqueue_next_desc(desc_pa, i, max)) != max); - - /* Now map what we have collected */ - virtqueue_map_sg(elem->in_sg, elem->in_addr, elem->in_num, 1); - virtqueue_map_sg(elem->out_sg, elem->out_addr, elem->out_num, 0); - - elem->index = head; - - vq->inuse++; - - trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num); - return elem->in_num + elem->out_num; -} - -/* virtio device */ -static void virtio_notify_vector(VirtIODevice *vdev, uint16_t vector) -{ - if (vdev->binding->notify) { - vdev->binding->notify(vdev->binding_opaque, vector); - } -} - -void virtio_update_irq(VirtIODevice *vdev) -{ - virtio_notify_vector(vdev, VIRTIO_NO_VECTOR); -} - -void virtio_set_status(VirtIODevice *vdev, uint8_t val) -{ - trace_virtio_set_status(vdev, val); - - if (vdev->set_status) { - vdev->set_status(vdev, val); - } - vdev->status = val; -} - -void virtio_reset(void *opaque) -{ - VirtIODevice *vdev = opaque; - int i; - - virtio_set_status(vdev, 0); - - if (vdev->reset) - vdev->reset(vdev); - - vdev->guest_features = 0; - vdev->queue_sel = 0; - vdev->status = 0; - vdev->isr = 0; - vdev->config_vector = VIRTIO_NO_VECTOR; - virtio_notify_vector(vdev, vdev->config_vector); - - for(i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) { - vdev->vq[i].vring.desc = 0; - vdev->vq[i].vring.avail = 0; - vdev->vq[i].vring.used = 0; - vdev->vq[i].last_avail_idx = 0; - vdev->vq[i].pa = 0; - vdev->vq[i].vector = VIRTIO_NO_VECTOR; - vdev->vq[i].signalled_used = 0; - vdev->vq[i].signalled_used_valid = false; - vdev->vq[i].notification = true; - } -} - -uint32_t virtio_config_readb(VirtIODevice *vdev, uint32_t addr) -{ - uint8_t val; - - vdev->get_config(vdev, vdev->config); - - if (addr > (vdev->config_len - sizeof(val))) - return (uint32_t)-1; - - val = ldub_p(vdev->config + addr); - return val; -} - -uint32_t virtio_config_readw(VirtIODevice *vdev, uint32_t addr) -{ - uint16_t val; - - vdev->get_config(vdev, vdev->config); - - if (addr > (vdev->config_len - sizeof(val))) - return (uint32_t)-1; - - val = lduw_p(vdev->config + addr); - return val; -} - -uint32_t virtio_config_readl(VirtIODevice *vdev, uint32_t addr) -{ - uint32_t val; - 
- vdev->get_config(vdev, vdev->config); - - if (addr > (vdev->config_len - sizeof(val))) - return (uint32_t)-1; - - val = ldl_p(vdev->config + addr); - return val; -} - -void virtio_config_writeb(VirtIODevice *vdev, uint32_t addr, uint32_t data) -{ - uint8_t val = data; - - if (addr > (vdev->config_len - sizeof(val))) - return; - - stb_p(vdev->config + addr, val); - - if (vdev->set_config) - vdev->set_config(vdev, vdev->config); -} - -void virtio_config_writew(VirtIODevice *vdev, uint32_t addr, uint32_t data) -{ - uint16_t val = data; - - if (addr > (vdev->config_len - sizeof(val))) - return; - - stw_p(vdev->config + addr, val); - - if (vdev->set_config) - vdev->set_config(vdev, vdev->config); -} - -void virtio_config_writel(VirtIODevice *vdev, uint32_t addr, uint32_t data) -{ - uint32_t val = data; - - if (addr > (vdev->config_len - sizeof(val))) - return; - - stl_p(vdev->config + addr, val); - - if (vdev->set_config) - vdev->set_config(vdev, vdev->config); -} - -void virtio_queue_set_addr(VirtIODevice *vdev, int n, hwaddr addr) -{ - vdev->vq[n].pa = addr; - virtqueue_init(&vdev->vq[n]); -} - -hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n) -{ - return vdev->vq[n].pa; -} - -int virtio_queue_get_num(VirtIODevice *vdev, int n) -{ - return vdev->vq[n].vring.num; -} - -int virtio_queue_get_id(VirtQueue *vq) -{ - VirtIODevice *vdev = vq->vdev; - assert(vq >= &vdev->vq[0] && vq < &vdev->vq[VIRTIO_PCI_QUEUE_MAX]); - return vq - &vdev->vq[0]; -} - -void virtio_queue_notify_vq(VirtQueue *vq) -{ - if (vq->vring.desc) { - VirtIODevice *vdev = vq->vdev; - trace_virtio_queue_notify(vdev, vq - vdev->vq, vq); - vq->handle_output(vdev, vq); - } -} - -void virtio_queue_notify(VirtIODevice *vdev, int n) -{ - virtio_queue_notify_vq(&vdev->vq[n]); -} - -uint16_t virtio_queue_vector(VirtIODevice *vdev, int n) -{ - return n < VIRTIO_PCI_QUEUE_MAX ? vdev->vq[n].vector : - VIRTIO_NO_VECTOR; -} - -void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector) -{ - if (n < VIRTIO_PCI_QUEUE_MAX) - vdev->vq[n].vector = vector; -} - -VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, - void (*handle_output)(VirtIODevice *, VirtQueue *)) -{ - int i; - - for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) { - if (vdev->vq[i].vring.num == 0) - break; - } - - if (i == VIRTIO_PCI_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE) - abort(); - - vdev->vq[i].vring.num = queue_size; - vdev->vq[i].handle_output = handle_output; - - return &vdev->vq[i]; -} - -void virtio_del_queue(VirtIODevice *vdev, int n) -{ - if (n < 0 || n >= VIRTIO_PCI_QUEUE_MAX) { - abort(); - } - - vdev->vq[n].vring.num = 0; -} - -void virtio_irq(VirtQueue *vq) -{ - trace_virtio_irq(vq); - vq->vdev->isr |= 0x01; - virtio_notify_vector(vq->vdev, vq->vector); -} - -/* Assuming a given event_idx value from the other side, if - * we have just incremented index from old to new_idx, - * should we trigger an event? */ -static inline int vring_need_event(uint16_t event, uint16_t new, uint16_t old) -{ - /* Note: Xen has similar logic for notification hold-off - * in include/xen/interface/io/ring.h with req_event and req_prod - * corresponding to event_idx + 1 and new respectively. - * Note also that req_event and req_prod in Xen start at 1, - * event indexes in virtio start at 0. */ - return (uint16_t)(new - event - 1) < (uint16_t)(new - old); -} - -static bool vring_notify(VirtIODevice *vdev, VirtQueue *vq) -{ - uint16_t old, new; - bool v; - /* We need to expose used array entries before checking used event. 
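vring_need_event() above is the event-index rule from the virtio spec: notify only when the published index has just crossed the index the other side asked about, with all arithmetic done modulo 2^16 so wraparound falls out naturally. A standalone check of the interesting cases:

    #include <stdint.h>
    #include <stdio.h>

    static int need_event(uint16_t event, uint16_t new, uint16_t old)
    {
        return (uint16_t)(new - event - 1) < (uint16_t)(new - old);
    }

    int main(void)
    {
        /* moved 8 -> 12, guest asked about 10: 11 is in (8, 12], notify */
        printf("%d\n", need_event(10, 12, 8));          /* 1 */

        /* moved 12 -> 14, event 10 already passed: stay quiet */
        printf("%d\n", need_event(10, 14, 12));         /* 0 */

        /* wraparound: 0xFFFE -> 2 crosses event 0xFFFF */
        printf("%d\n", need_event(0xFFFF, 2, 0xFFFE));  /* 1 */
        return 0;
    }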
*/ - smp_mb(); - /* Always notify when queue is empty (when feature acknowledge) */ - if (((vdev->guest_features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) && - !vq->inuse && vring_avail_idx(vq) == vq->last_avail_idx)) { - return true; - } - - if (!(vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX))) { - return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); - } - - v = vq->signalled_used_valid; - vq->signalled_used_valid = true; - old = vq->signalled_used; - new = vq->signalled_used = vring_used_idx(vq); - return !v || vring_need_event(vring_used_event(vq), new, old); -} - -void virtio_notify(VirtIODevice *vdev, VirtQueue *vq) -{ - if (!vring_notify(vdev, vq)) { - return; - } - - trace_virtio_notify(vdev, vq); - vdev->isr |= 0x01; - virtio_notify_vector(vdev, vq->vector); -} - -void virtio_notify_config(VirtIODevice *vdev) -{ - if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) - return; - - vdev->isr |= 0x03; - virtio_notify_vector(vdev, vdev->config_vector); -} - -void virtio_save(VirtIODevice *vdev, QEMUFile *f) -{ - int i; - - if (vdev->binding->save_config) - vdev->binding->save_config(vdev->binding_opaque, f); - - qemu_put_8s(f, &vdev->status); - qemu_put_8s(f, &vdev->isr); - qemu_put_be16s(f, &vdev->queue_sel); - qemu_put_be32s(f, &vdev->guest_features); - qemu_put_be32(f, vdev->config_len); - qemu_put_buffer(f, vdev->config, vdev->config_len); - - for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) { - if (vdev->vq[i].vring.num == 0) - break; - } - - qemu_put_be32(f, i); - - for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) { - if (vdev->vq[i].vring.num == 0) - break; - - qemu_put_be32(f, vdev->vq[i].vring.num); - qemu_put_be64(f, vdev->vq[i].pa); - qemu_put_be16s(f, &vdev->vq[i].last_avail_idx); - if (vdev->binding->save_queue) - vdev->binding->save_queue(vdev->binding_opaque, i, f); - } -} - -int virtio_set_features(VirtIODevice *vdev, uint32_t val) -{ - uint32_t supported_features = - vdev->binding->get_features(vdev->binding_opaque); - bool bad = (val & ~supported_features) != 0; - - val &= supported_features; - if (vdev->set_features) { - vdev->set_features(vdev, val); - } - vdev->guest_features = val; - return bad ? -1 : 0; -} - -int virtio_load(VirtIODevice *vdev, QEMUFile *f) -{ - int num, i, ret; - uint32_t features; - uint32_t supported_features; - - if (vdev->binding->load_config) { - ret = vdev->binding->load_config(vdev->binding_opaque, f); - if (ret) - return ret; - } - - qemu_get_8s(f, &vdev->status); - qemu_get_8s(f, &vdev->isr); - qemu_get_be16s(f, &vdev->queue_sel); - qemu_get_be32s(f, &features); - - if (virtio_set_features(vdev, features) < 0) { - supported_features = vdev->binding->get_features(vdev->binding_opaque); - error_report("Features 0x%x unsupported. Allowed features: 0x%x", - features, supported_features); - return -1; - } - vdev->config_len = qemu_get_be32(f); - qemu_get_buffer(f, vdev->config, vdev->config_len); - - num = qemu_get_be32(f); - - for (i = 0; i < num; i++) { - vdev->vq[i].vring.num = qemu_get_be32(f); - vdev->vq[i].pa = qemu_get_be64(f); - qemu_get_be16s(f, &vdev->vq[i].last_avail_idx); - vdev->vq[i].signalled_used_valid = false; - vdev->vq[i].notification = true; - - if (vdev->vq[i].pa) { - uint16_t nheads; - virtqueue_init(&vdev->vq[i]); - nheads = vring_avail_idx(&vdev->vq[i]) - vdev->vq[i].last_avail_idx; - /* Check it isn't doing very strange things with descriptor numbers. 
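virtio_set_features() above masks the guest's acked bits down to what the binding offers and reports whether anything had to be dropped; virtio_load() turns that report into a migration failure. A standalone sketch of the masking:

    #include <stdint.h>
    #include <stdio.h>

    static int set_features(uint32_t *guest_features, uint32_t supported,
                            uint32_t val)
    {
        int bad = (val & ~supported) != 0;   /* unknown bits acked? */

        *guest_features = val & supported;
        return bad ? -1 : 0;
    }

    int main(void)
    {
        uint32_t gf;
        /* host offers bits 0 and 24; guest acks bits 0 and 3 */
        int r = set_features(&gf, (1u << 0) | (1u << 24),
                             (1u << 0) | (1u << 3));
        printf("r=%d guest_features=%#x\n", r, gf);   /* r=-1, 0x1 */
        return 0;
    }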
*/ - if (nheads > vdev->vq[i].vring.num) { - error_report("VQ %d size 0x%x Guest index 0x%x " - "inconsistent with Host index 0x%x: delta 0x%x", - i, vdev->vq[i].vring.num, - vring_avail_idx(&vdev->vq[i]), - vdev->vq[i].last_avail_idx, nheads); - return -1; - } - } else if (vdev->vq[i].last_avail_idx) { - error_report("VQ %d address 0x0 " - "inconsistent with Host index 0x%x", - i, vdev->vq[i].last_avail_idx); - return -1; - } - if (vdev->binding->load_queue) { - ret = vdev->binding->load_queue(vdev->binding_opaque, i, f); - if (ret) - return ret; - } - } - - virtio_notify_vector(vdev, VIRTIO_NO_VECTOR); - return 0; -} - -void virtio_common_cleanup(VirtIODevice *vdev) -{ - qemu_del_vm_change_state_handler(vdev->vmstate); - g_free(vdev->config); - g_free(vdev->vq); -} - -void virtio_cleanup(VirtIODevice *vdev) -{ - virtio_common_cleanup(vdev); - g_free(vdev); -} - -static void virtio_vmstate_change(void *opaque, int running, RunState state) -{ - VirtIODevice *vdev = opaque; - bool backend_run = running && (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK); - vdev->vm_running = running; - - if (backend_run) { - virtio_set_status(vdev, vdev->status); - } - - if (vdev->binding->vmstate_change) { - vdev->binding->vmstate_change(vdev->binding_opaque, backend_run); - } - - if (!backend_run) { - virtio_set_status(vdev, vdev->status); - } -} - -void virtio_init(VirtIODevice *vdev, const char *name, - uint16_t device_id, size_t config_size) -{ - int i; - vdev->device_id = device_id; - vdev->status = 0; - vdev->isr = 0; - vdev->queue_sel = 0; - vdev->config_vector = VIRTIO_NO_VECTOR; - vdev->vq = g_malloc0(sizeof(VirtQueue) * VIRTIO_PCI_QUEUE_MAX); - vdev->vm_running = runstate_is_running(); - for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) { - vdev->vq[i].vector = VIRTIO_NO_VECTOR; - vdev->vq[i].vdev = vdev; - vdev->vq[i].queue_index = i; - } - - vdev->name = name; - vdev->config_len = config_size; - if (vdev->config_len) { - vdev->config = g_malloc0(config_size); - } else { - vdev->config = NULL; - } - vdev->vmstate = qemu_add_vm_change_state_handler(virtio_vmstate_change, - vdev); -} - -VirtIODevice *virtio_common_init(const char *name, uint16_t device_id, - size_t config_size, size_t struct_size) -{ - VirtIODevice *vdev; - vdev = g_malloc0(struct_size); - virtio_init(vdev, name, device_id, config_size); - return vdev; -} - -void virtio_bind_device(VirtIODevice *vdev, const VirtIOBindings *binding, - DeviceState *opaque) -{ - vdev->binding = binding; - vdev->binding_opaque = opaque; -} - -hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n) -{ - return vdev->vq[n].vring.desc; -} - -hwaddr virtio_queue_get_avail_addr(VirtIODevice *vdev, int n) -{ - return vdev->vq[n].vring.avail; -} - -hwaddr virtio_queue_get_used_addr(VirtIODevice *vdev, int n) -{ - return vdev->vq[n].vring.used; -} - -hwaddr virtio_queue_get_ring_addr(VirtIODevice *vdev, int n) -{ - return vdev->vq[n].vring.desc; -} - -hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int n) -{ - return sizeof(VRingDesc) * vdev->vq[n].vring.num; -} - -hwaddr virtio_queue_get_avail_size(VirtIODevice *vdev, int n) -{ - return offsetof(VRingAvail, ring) + - sizeof(uint64_t) * vdev->vq[n].vring.num; -} - -hwaddr virtio_queue_get_used_size(VirtIODevice *vdev, int n) -{ - return offsetof(VRingUsed, ring) + - sizeof(VRingUsedElem) * vdev->vq[n].vring.num; -} - -hwaddr virtio_queue_get_ring_size(VirtIODevice *vdev, int n) -{ - return vdev->vq[n].vring.used - vdev->vq[n].vring.desc + - virtio_queue_get_used_size(vdev, n); -} - -uint16_t 
virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n) -{ - return vdev->vq[n].last_avail_idx; -} - -void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx) -{ - vdev->vq[n].last_avail_idx = idx; -} - -VirtQueue *virtio_get_queue(VirtIODevice *vdev, int n) -{ - return vdev->vq + n; -} - -uint16_t virtio_get_queue_index(VirtQueue *vq) -{ - return vq->queue_index; -} - -static void virtio_queue_guest_notifier_read(EventNotifier *n) -{ - VirtQueue *vq = container_of(n, VirtQueue, guest_notifier); - if (event_notifier_test_and_clear(n)) { - virtio_irq(vq); - } -} - -void virtio_queue_set_guest_notifier_fd_handler(VirtQueue *vq, bool assign, - bool with_irqfd) -{ - if (assign && !with_irqfd) { - event_notifier_set_handler(&vq->guest_notifier, - virtio_queue_guest_notifier_read); - } else { - event_notifier_set_handler(&vq->guest_notifier, NULL); - } - if (!assign) { - /* Test and clear notifier before closing it, - * in case poll callback didn't have time to run. */ - virtio_queue_guest_notifier_read(&vq->guest_notifier); - } -} - -EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq) -{ - return &vq->guest_notifier; -} - -static void virtio_queue_host_notifier_read(EventNotifier *n) -{ - VirtQueue *vq = container_of(n, VirtQueue, host_notifier); - if (event_notifier_test_and_clear(n)) { - virtio_queue_notify_vq(vq); - } -} - -void virtio_queue_set_host_notifier_fd_handler(VirtQueue *vq, bool assign, - bool set_handler) -{ - if (assign && set_handler) { - event_notifier_set_handler(&vq->host_notifier, - virtio_queue_host_notifier_read); - } else { - event_notifier_set_handler(&vq->host_notifier, NULL); - } - if (!assign) { - /* Test and clear notifier after disabling the event, - * in case poll callback didn't have time to run. */ - virtio_queue_host_notifier_read(&vq->host_notifier); - } -} - -EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq) -{ - return &vq->host_notifier; -} - -static int virtio_device_init(DeviceState *qdev) -{ - VirtIODevice *vdev = VIRTIO_DEVICE(qdev); - VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(qdev); - assert(k->init != NULL); - if (k->init(vdev) < 0) { - return -1; - } - virtio_bus_plug_device(vdev); - return 0; -} - -static void virtio_device_class_init(ObjectClass *klass, void *data) -{ - /* Set the default value here. 
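virtio_device_init() above is the common DeviceClass init for the converted devices: it runs the subclass's VirtioDeviceClass::init and then plugs the result into the virtio bus, the same shape seen in virtio_scsi_device_init() earlier in this patch. A hypothetical minimal subclass would wire itself up as below (all virtio_foo_* names and the device id are illustrative, not in the tree):

    static void virtio_foo_handle_vq(VirtIODevice *vdev, VirtQueue *vq)
    {
        /* pop and process requests here */
    }

    static int virtio_foo_device_init(VirtIODevice *vdev)
    {
        virtio_init(vdev, "virtio-foo", 0xffff /* made-up id */, 0);
        virtio_add_queue(vdev, 64, virtio_foo_handle_vq);
        return 0;
    }

    static void virtio_foo_class_init(ObjectClass *klass, void *data)
    {
        VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
        vdc->init = virtio_foo_device_init;  /* run by virtio_device_init */
    }

    static const TypeInfo virtio_foo_info = {
        .name          = "virtio-foo",
        .parent        = TYPE_VIRTIO_DEVICE,
        .instance_size = sizeof(VirtIODevice),
        .class_init    = virtio_foo_class_init,
    };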
*/ - DeviceClass *dc = DEVICE_CLASS(klass); - dc->init = virtio_device_init; - dc->bus_type = TYPE_VIRTIO_BUS; -} - -static const TypeInfo virtio_device_info = { - .name = TYPE_VIRTIO_DEVICE, - .parent = TYPE_DEVICE, - .instance_size = sizeof(VirtIODevice), - .class_init = virtio_device_class_init, - .abstract = true, - .class_size = sizeof(VirtioDeviceClass), -}; - -static void virtio_register_types(void) -{ - type_register_static(&virtio_device_info); -} - -type_init(virtio_register_types) diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs index ed63495a7f..c7e801344b 100644 --- a/hw/virtio/Makefile.objs +++ b/hw/virtio/Makefile.objs @@ -1,4 +1,7 @@ common-obj-$(CONFIG_VIRTIO) += virtio-rng.o common-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o common-obj-$(CONFIG_VIRTIO) += virtio-bus.o +common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += dataplane/ +obj-$(CONFIG_VIRTIO) += virtio.o virtio-balloon.o +obj-$(CONFIG_VHOST_NET) += vhost.o diff --git a/hw/virtio/dataplane/Makefile.objs b/hw/virtio/dataplane/Makefile.objs new file mode 100644 index 0000000000..a91bf33c8b --- /dev/null +++ b/hw/virtio/dataplane/Makefile.objs @@ -0,0 +1 @@ +common-obj-y += hostmem.o vring.o diff --git a/hw/virtio/dataplane/hostmem.c b/hw/virtio/dataplane/hostmem.c new file mode 100644 index 0000000000..37292ffd00 --- /dev/null +++ b/hw/virtio/dataplane/hostmem.c @@ -0,0 +1,176 @@ +/* + * Thread-safe guest to host memory mapping + * + * Copyright 2012 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "exec/address-spaces.h" +#include "hw/virtio/dataplane/hostmem.h" + +static int hostmem_lookup_cmp(const void *phys_, const void *region_) +{ + hwaddr phys = *(const hwaddr *)phys_; + const HostMemRegion *region = region_; + + if (phys < region->guest_addr) { + return -1; + } else if (phys >= region->guest_addr + region->size) { + return 1; + } else { + return 0; + } +} + +/** + * Map guest physical address to host pointer + */ +void *hostmem_lookup(HostMem *hostmem, hwaddr phys, hwaddr len, bool is_write) +{ + HostMemRegion *region; + void *host_addr = NULL; + hwaddr offset_within_region; + + qemu_mutex_lock(&hostmem->current_regions_lock); + region = bsearch(&phys, hostmem->current_regions, + hostmem->num_current_regions, + sizeof(hostmem->current_regions[0]), + hostmem_lookup_cmp); + if (!region) { + goto out; + } + if (is_write && region->readonly) { + goto out; + } + offset_within_region = phys - region->guest_addr; + if (len <= region->size - offset_within_region) { + host_addr = region->host_addr + offset_within_region; + } +out: + qemu_mutex_unlock(&hostmem->current_regions_lock); + + return host_addr; +} + +/** + * Install new regions list + */ +static void hostmem_listener_commit(MemoryListener *listener) +{ + HostMem *hostmem = container_of(listener, HostMem, listener); + + qemu_mutex_lock(&hostmem->current_regions_lock); + g_free(hostmem->current_regions); + hostmem->current_regions = hostmem->new_regions; + hostmem->num_current_regions = hostmem->num_new_regions; + qemu_mutex_unlock(&hostmem->current_regions_lock); + + /* Reset new regions list */ + hostmem->new_regions = NULL; + hostmem->num_new_regions = 0; +} + +/** + * Add a MemoryRegionSection to the new regions list + */ +static void hostmem_append_new_region(HostMem *hostmem, + MemoryRegionSection *section) +{ + void *ram_ptr = memory_region_get_ram_ptr(section->mr); + 
size_t num = hostmem->num_new_regions; + size_t new_size = (num + 1) * sizeof(hostmem->new_regions[0]); + + hostmem->new_regions = g_realloc(hostmem->new_regions, new_size); + hostmem->new_regions[num] = (HostMemRegion){ + .host_addr = ram_ptr + section->offset_within_region, + .guest_addr = section->offset_within_address_space, + .size = section->size, + .readonly = section->readonly, + }; + hostmem->num_new_regions++; +} + +static void hostmem_listener_append_region(MemoryListener *listener, + MemoryRegionSection *section) +{ + HostMem *hostmem = container_of(listener, HostMem, listener); + + /* Ignore non-RAM regions, we may not be able to map them */ + if (!memory_region_is_ram(section->mr)) { + return; + } + + /* Ignore regions with dirty logging, we cannot mark them dirty */ + if (memory_region_is_logging(section->mr)) { + return; + } + + hostmem_append_new_region(hostmem, section); +} + +/* We don't implement most MemoryListener callbacks, use these nop stubs */ +static void hostmem_listener_dummy(MemoryListener *listener) +{ +} + +static void hostmem_listener_section_dummy(MemoryListener *listener, + MemoryRegionSection *section) +{ +} + +static void hostmem_listener_eventfd_dummy(MemoryListener *listener, + MemoryRegionSection *section, + bool match_data, uint64_t data, + EventNotifier *e) +{ +} + +static void hostmem_listener_coalesced_mmio_dummy(MemoryListener *listener, + MemoryRegionSection *section, + hwaddr addr, hwaddr len) +{ +} + +void hostmem_init(HostMem *hostmem) +{ + memset(hostmem, 0, sizeof(*hostmem)); + + qemu_mutex_init(&hostmem->current_regions_lock); + + hostmem->listener = (MemoryListener){ + .begin = hostmem_listener_dummy, + .commit = hostmem_listener_commit, + .region_add = hostmem_listener_append_region, + .region_del = hostmem_listener_section_dummy, + .region_nop = hostmem_listener_append_region, + .log_start = hostmem_listener_section_dummy, + .log_stop = hostmem_listener_section_dummy, + .log_sync = hostmem_listener_section_dummy, + .log_global_start = hostmem_listener_dummy, + .log_global_stop = hostmem_listener_dummy, + .eventfd_add = hostmem_listener_eventfd_dummy, + .eventfd_del = hostmem_listener_eventfd_dummy, + .coalesced_mmio_add = hostmem_listener_coalesced_mmio_dummy, + .coalesced_mmio_del = hostmem_listener_coalesced_mmio_dummy, + .priority = 10, + }; + + memory_listener_register(&hostmem->listener, &address_space_memory); + if (hostmem->num_new_regions > 0) { + hostmem_listener_commit(&hostmem->listener); + } +} + +void hostmem_finalize(HostMem *hostmem) +{ + memory_listener_unregister(&hostmem->listener); + g_free(hostmem->new_regions); + g_free(hostmem->current_regions); + qemu_mutex_destroy(&hostmem->current_regions_lock); +} diff --git a/hw/virtio/dataplane/vring.c b/hw/virtio/dataplane/vring.c new file mode 100644 index 0000000000..e0d6e83625 --- /dev/null +++ b/hw/virtio/dataplane/vring.c @@ -0,0 +1,363 @@ +/* Copyright 2012 Red Hat, Inc. + * Copyright IBM, Corp. 2012 + * + * Based on Linux 2.6.39 vhost code: + * Copyright (C) 2009 Red Hat, Inc. + * Copyright (C) 2006 Rusty Russell IBM Corporation + * + * Author: Michael S. Tsirkin + * Stefan Hajnoczi + * + * Inspiration, some code, and most witty comments come from + * Documentation/virtual/lguest/lguest.c, by Rusty Russell + * + * This work is licensed under the terms of the GNU GPL, version 2. 
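hostmem above keeps an immutable, sorted snapshot of the RAM regions so lookups need only a short mutex hold; hostmem_lookup() returns NULL rather than crossing a region boundary or writing through a read-only region. A sketch of a bounded guest read built on it (it assumes the QEMU tree; guest_read is an illustrative name):

    /* Copy len bytes of guest memory out, refusing unmapped ranges,
     * ranges that straddle two regions, and access-type mismatches:
     * exactly the cases hostmem_lookup() rejects. */
    static bool guest_read(HostMem *hostmem, hwaddr addr, void *buf,
                           hwaddr len)
    {
        void *p = hostmem_lookup(hostmem, addr, len, false /* is_write */);

        if (!p) {
            return false;
        }
        memcpy(buf, p, len);
        return true;
    }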
+ */
+
+#include "trace.h"
+#include "hw/virtio/dataplane/vring.h"
+#include "qemu/error-report.h"
+
+/* Map the guest's vring to host memory */
+bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
+{
+    hwaddr vring_addr = virtio_queue_get_ring_addr(vdev, n);
+    hwaddr vring_size = virtio_queue_get_ring_size(vdev, n);
+    void *vring_ptr;
+
+    vring->broken = false;
+
+    hostmem_init(&vring->hostmem);
+    vring_ptr = hostmem_lookup(&vring->hostmem, vring_addr, vring_size, true);
+    if (!vring_ptr) {
+        error_report("Failed to map vring "
+                     "addr %#" HWADDR_PRIx " size %" HWADDR_PRIu,
+                     vring_addr, vring_size);
+        vring->broken = true;
+        return false;
+    }
+
+    vring_init(&vring->vr, virtio_queue_get_num(vdev, n), vring_ptr, 4096);
+
+    vring->last_avail_idx = 0;
+    vring->last_used_idx = 0;
+    vring->signalled_used = 0;
+    vring->signalled_used_valid = false;
+
+    trace_vring_setup(virtio_queue_get_ring_addr(vdev, n),
+                      vring->vr.desc, vring->vr.avail, vring->vr.used);
+    return true;
+}
+
+void vring_teardown(Vring *vring)
+{
+    hostmem_finalize(&vring->hostmem);
+}
+
+/* Disable guest->host notifies */
+void vring_disable_notification(VirtIODevice *vdev, Vring *vring)
+{
+    if (!(vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX))) {
+        vring->vr.used->flags |= VRING_USED_F_NO_NOTIFY;
+    }
+}
+
+/* Enable guest->host notifies
+ *
+ * Return true if the vring is empty, false if there are more requests.
+ */
+bool vring_enable_notification(VirtIODevice *vdev, Vring *vring)
+{
+    if (vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) {
+        vring_avail_event(&vring->vr) = vring->vr.avail->idx;
+    } else {
+        vring->vr.used->flags &= ~VRING_USED_F_NO_NOTIFY;
+    }
+    smp_mb(); /* ensure update is seen before reading avail_idx */
+    return !vring_more_avail(vring);
+}
+
+/* This is stolen from linux/drivers/vhost/vhost.c:vhost_notify() */
+bool vring_should_notify(VirtIODevice *vdev, Vring *vring)
+{
+    uint16_t old, new;
+    bool v;
+    /* Flush out used index updates. This is paired
+     * with the barrier that the Guest executes when enabling
+     * interrupts. */
+    smp_mb();
+
+    if ((vdev->guest_features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) &&
+        unlikely(vring->vr.avail->idx == vring->last_avail_idx)) {
+        return true;
+    }
+
+    if (!(vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX))) {
+        return !(vring->vr.avail->flags & VRING_AVAIL_F_NO_INTERRUPT);
+    }
+    old = vring->signalled_used;
+    v = vring->signalled_used_valid;
+    new = vring->signalled_used = vring->last_used_idx;
+    vring->signalled_used_valid = true;
+
+    if (unlikely(!v)) {
+        return true;
+    }
+
+    return vring_need_event(vring_used_event(&vring->vr), new, old);
+}
+
+/* This is stolen from linux/drivers/vhost/vhost.c. */
+static int get_indirect(Vring *vring,
+                        struct iovec iov[], struct iovec *iov_end,
+                        unsigned int *out_num, unsigned int *in_num,
+                        struct vring_desc *indirect)
+{
+    struct vring_desc desc;
+    unsigned int i = 0, count, found = 0;
+
+    /* Sanity check */
+    if (unlikely(indirect->len % sizeof(desc))) {
+        error_report("Invalid length in indirect descriptor: "
+                     "len %#x not multiple of %#zx",
+                     indirect->len, sizeof(desc));
+        vring->broken = true;
+        return -EFAULT;
+    }
+
+    count = indirect->len / sizeof(desc);
+    /* Buffers are chained via a 16 bit next field, so
+     * we can have at most 2^16 of these.
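+     * A count above USHRT_MAX + 1 (65536) therefore indicates a
+     * malformed descriptor and is rejected below.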
+     */
+    if (unlikely(count > USHRT_MAX + 1)) {
+        error_report("Indirect buffer length too big: %d", indirect->len);
+        vring->broken = true;
+        return -EFAULT;
+    }
+
+    do {
+        struct vring_desc *desc_ptr;
+
+        /* Translate indirect descriptor */
+        desc_ptr = hostmem_lookup(&vring->hostmem,
+                                  indirect->addr + found * sizeof(desc),
+                                  sizeof(desc), false);
+        if (!desc_ptr) {
+            error_report("Failed to map indirect descriptor "
+                         "addr %#" PRIx64 " len %zu",
+                         (uint64_t)indirect->addr + found * sizeof(desc),
+                         sizeof(desc));
+            vring->broken = true;
+            return -EFAULT;
+        }
+        desc = *desc_ptr;
+
+        /* Ensure descriptor has been loaded before accessing fields */
+        barrier(); /* read_barrier_depends(); */
+
+        if (unlikely(++found > count)) {
+            error_report("Loop detected: last one at %u "
+                         "indirect size %u", i, count);
+            vring->broken = true;
+            return -EFAULT;
+        }
+
+        if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
+            error_report("Nested indirect descriptor");
+            vring->broken = true;
+            return -EFAULT;
+        }
+
+        /* Stop for now if there are not enough iovecs available. */
+        if (iov >= iov_end) {
+            return -ENOBUFS;
+        }
+
+        iov->iov_base = hostmem_lookup(&vring->hostmem, desc.addr, desc.len,
+                                       desc.flags & VRING_DESC_F_WRITE);
+        if (!iov->iov_base) {
+            error_report("Failed to map indirect descriptor "
+                         "addr %#" PRIx64 " len %u",
+                         (uint64_t)desc.addr, desc.len);
+            vring->broken = true;
+            return -EFAULT;
+        }
+        iov->iov_len = desc.len;
+        iov++;
+
+        /* If this is an input descriptor, increment that count. */
+        if (desc.flags & VRING_DESC_F_WRITE) {
+            *in_num += 1;
+        } else {
+            /* If it's an output descriptor, they're all supposed
+             * to come before any input descriptors. */
+            if (unlikely(*in_num)) {
+                error_report("Indirect descriptor "
+                             "has out after in: idx %u", i);
+                vring->broken = true;
+                return -EFAULT;
+            }
+            *out_num += 1;
+        }
+        i = desc.next;
+    } while (desc.flags & VRING_DESC_F_NEXT);
+    return 0;
+}
+
+/* This looks in the virtqueue for the first available buffer, and converts
+ * it to an iovec for convenient access. Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which is
+ * never a valid descriptor number) if none was found. A negative code is
+ * returned on error.
+ *
+ * Stolen from linux/drivers/vhost/vhost.c.
+ */
+int vring_pop(VirtIODevice *vdev, Vring *vring,
+              struct iovec iov[], struct iovec *iov_end,
+              unsigned int *out_num, unsigned int *in_num)
+{
+    struct vring_desc desc;
+    unsigned int i, head, found = 0, num = vring->vr.num;
+    uint16_t avail_idx, last_avail_idx;
+
+    /* If there was a fatal error then refuse operation */
+    if (vring->broken) {
+        return -EFAULT;
+    }
+
+    /* Check it isn't doing very strange things with descriptor numbers. */
+    last_avail_idx = vring->last_avail_idx;
+    avail_idx = vring->vr.avail->idx;
+    barrier(); /* load indices now and not again later */
+
+    if (unlikely((uint16_t)(avail_idx - last_avail_idx) > num)) {
+        error_report("Guest moved used index from %u to %u",
+                     last_avail_idx, avail_idx);
+        vring->broken = true;
+        return -EFAULT;
+    }
+
+    /* If there's nothing new since last we looked. */
+    if (avail_idx == last_avail_idx) {
+        return -EAGAIN;
+    }
+
+    /* Only get avail ring entries after they have been exposed by guest. */
+    smp_rmb();
+
+    /* Grab the next descriptor number they're advertising, and increment
+     * the index we've seen.
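+     * The avail ring entry is read modulo the queue size, e.g. with
+     * num == 256 and last_avail_idx == 260 the head comes from ring[4].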
*/ + head = vring->vr.avail->ring[last_avail_idx % num]; + + /* If their number is silly, that's an error. */ + if (unlikely(head >= num)) { + error_report("Guest says index %u > %u is available", head, num); + vring->broken = true; + return -EFAULT; + } + + if (vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) { + vring_avail_event(&vring->vr) = vring->vr.avail->idx; + } + + /* When we start there are none of either input nor output. */ + *out_num = *in_num = 0; + + i = head; + do { + if (unlikely(i >= num)) { + error_report("Desc index is %u > %u, head = %u", i, num, head); + vring->broken = true; + return -EFAULT; + } + if (unlikely(++found > num)) { + error_report("Loop detected: last one at %u vq size %u head %u", + i, num, head); + vring->broken = true; + return -EFAULT; + } + desc = vring->vr.desc[i]; + + /* Ensure descriptor is loaded before accessing fields */ + barrier(); + + if (desc.flags & VRING_DESC_F_INDIRECT) { + int ret = get_indirect(vring, iov, iov_end, out_num, in_num, &desc); + if (ret < 0) { + return ret; + } + continue; + } + + /* If there are not enough iovecs left, stop for now. The caller + * should check if there are more descs available once they have dealt + * with the current set. + */ + if (iov >= iov_end) { + return -ENOBUFS; + } + + /* TODO handle non-contiguous memory across region boundaries */ + iov->iov_base = hostmem_lookup(&vring->hostmem, desc.addr, desc.len, + desc.flags & VRING_DESC_F_WRITE); + if (!iov->iov_base) { + error_report("Failed to map vring desc addr %#" PRIx64 " len %u", + (uint64_t)desc.addr, desc.len); + vring->broken = true; + return -EFAULT; + } + iov->iov_len = desc.len; + iov++; + + if (desc.flags & VRING_DESC_F_WRITE) { + /* If this is an input descriptor, + * increment that count. */ + *in_num += 1; + } else { + /* If it's an output descriptor, they're all supposed + * to come before any input descriptors. */ + if (unlikely(*in_num)) { + error_report("Descriptor has out after in: idx %d", i); + vring->broken = true; + return -EFAULT; + } + *out_num += 1; + } + i = desc.next; + } while (desc.flags & VRING_DESC_F_NEXT); + + /* On success, increment avail index. */ + vring->last_avail_idx++; + return head; +} + +/* After we've used one of their buffers, we tell them about it. + * + * Stolen from linux/drivers/vhost/vhost.c. + */ +void vring_push(Vring *vring, unsigned int head, int len) +{ + struct vring_used_elem *used; + uint16_t new; + + /* Don't touch vring if a fatal error occurred */ + if (vring->broken) { + return; + } + + /* The virtqueue contains a ring of used buffers. Get a pointer to the + * next entry in that used ring. */ + used = &vring->vr.used->ring[vring->last_used_idx % vring->vr.num]; + used->id = head; + used->len = len; + + /* Make sure buffer is written before we update index. */ + smp_wmb(); + + new = vring->vr.used->idx = ++vring->last_used_idx; + if (unlikely((int16_t)(new - vring->signalled_used) < (uint16_t)1)) { + vring->signalled_used_valid = false; + } +} diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c new file mode 100644 index 0000000000..636fad0f74 --- /dev/null +++ b/hw/virtio/vhost.c @@ -0,0 +1,1042 @@ +/* + * vhost support + * + * Copyright Red Hat, Inc. 2010 + * + * Authors: + * Michael S. Tsirkin + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. 
+ */
+
+#include <sys/ioctl.h>
+#include "hw/virtio/vhost.h"
+#include "hw/hw.h"
+#include "qemu/range.h"
+#include <linux/vhost.h>
+#include "exec/address-spaces.h"
+
+static void vhost_dev_sync_region(struct vhost_dev *dev,
+                                  MemoryRegionSection *section,
+                                  uint64_t mfirst, uint64_t mlast,
+                                  uint64_t rfirst, uint64_t rlast)
+{
+    uint64_t start = MAX(mfirst, rfirst);
+    uint64_t end = MIN(mlast, rlast);
+    vhost_log_chunk_t *from = dev->log + start / VHOST_LOG_CHUNK;
+    vhost_log_chunk_t *to = dev->log + end / VHOST_LOG_CHUNK + 1;
+    uint64_t addr = (start / VHOST_LOG_CHUNK) * VHOST_LOG_CHUNK;
+
+    if (end < start) {
+        return;
+    }
+    assert(end / VHOST_LOG_CHUNK < dev->log_size);
+    assert(start / VHOST_LOG_CHUNK < dev->log_size);
+
+    for (; from < to; ++from) {
+        vhost_log_chunk_t log;
+        int bit;
+        /* We first check with non-atomic: much cheaper,
+         * and we expect non-dirty to be the common case. */
+        if (!*from) {
+            addr += VHOST_LOG_CHUNK;
+            continue;
+        }
+        /* Data must be read atomically. We don't really
+         * need the barrier semantics of __sync
+         * builtins, but it's easier to use them than
+         * roll our own. */
+        log = __sync_fetch_and_and(from, 0);
+        while ((bit = sizeof(log) > sizeof(int) ?
+                ffsll(log) : ffs(log))) {
+            hwaddr page_addr;
+            hwaddr section_offset;
+            hwaddr mr_offset;
+            bit -= 1;
+            page_addr = addr + bit * VHOST_LOG_PAGE;
+            section_offset = page_addr - section->offset_within_address_space;
+            mr_offset = section_offset + section->offset_within_region;
+            memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
+            log &= ~(0x1ull << bit);
+        }
+        addr += VHOST_LOG_CHUNK;
+    }
+}
+
+static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
+                                   MemoryRegionSection *section,
+                                   hwaddr first,
+                                   hwaddr last)
+{
+    int i;
+    hwaddr start_addr;
+    hwaddr end_addr;
+
+    if (!dev->log_enabled || !dev->started) {
+        return 0;
+    }
+    start_addr = section->offset_within_address_space;
+    end_addr = range_get_last(start_addr, section->size);
+    start_addr = MAX(first, start_addr);
+    end_addr = MIN(last, end_addr);
+
+    for (i = 0; i < dev->mem->nregions; ++i) {
+        struct vhost_memory_region *reg = dev->mem->regions + i;
+        vhost_dev_sync_region(dev, section, start_addr, end_addr,
+                              reg->guest_phys_addr,
+                              range_get_last(reg->guest_phys_addr,
+                                             reg->memory_size));
+    }
+    for (i = 0; i < dev->nvqs; ++i) {
+        struct vhost_virtqueue *vq = dev->vqs + i;
+        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
+                              range_get_last(vq->used_phys, vq->used_size));
+    }
+    return 0;
+}
+
+static void vhost_log_sync(MemoryListener *listener,
+                           MemoryRegionSection *section)
+{
+    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
+                                         memory_listener);
+    vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
+}
+
+static void vhost_log_sync_range(struct vhost_dev *dev,
+                                 hwaddr first, hwaddr last)
+{
+    int i;
+    /* FIXME: this is N^2 in number of sections */
+    for (i = 0; i < dev->n_mem_sections; ++i) {
+        MemoryRegionSection *section = &dev->mem_sections[i];
+        vhost_sync_dirty_bitmap(dev, section, first, last);
+    }
+}
+
+/* Assign/unassign. Keep an unsorted array of non-overlapping
+ * memory regions in dev->mem. */
+static void vhost_dev_unassign_memory(struct vhost_dev *dev,
+                                      uint64_t start_addr,
+                                      uint64_t size)
+{
+    int from, to, n = dev->mem->nregions;
+    /* Track overlapping/split regions for sanity checking.
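+     * The asserts below ensure each case (shrink at the start, shrink at
+     * the end, full removal, split) can happen at most once per call.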
*/ + int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0; + + for (from = 0, to = 0; from < n; ++from, ++to) { + struct vhost_memory_region *reg = dev->mem->regions + to; + uint64_t reglast; + uint64_t memlast; + uint64_t change; + + /* clone old region */ + if (to != from) { + memcpy(reg, dev->mem->regions + from, sizeof *reg); + } + + /* No overlap is simple */ + if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size, + start_addr, size)) { + continue; + } + + /* Split only happens if supplied region + * is in the middle of an existing one. Thus it can not + * overlap with any other existing region. */ + assert(!split); + + reglast = range_get_last(reg->guest_phys_addr, reg->memory_size); + memlast = range_get_last(start_addr, size); + + /* Remove whole region */ + if (start_addr <= reg->guest_phys_addr && memlast >= reglast) { + --dev->mem->nregions; + --to; + ++overlap_middle; + continue; + } + + /* Shrink region */ + if (memlast >= reglast) { + reg->memory_size = start_addr - reg->guest_phys_addr; + assert(reg->memory_size); + assert(!overlap_end); + ++overlap_end; + continue; + } + + /* Shift region */ + if (start_addr <= reg->guest_phys_addr) { + change = memlast + 1 - reg->guest_phys_addr; + reg->memory_size -= change; + reg->guest_phys_addr += change; + reg->userspace_addr += change; + assert(reg->memory_size); + assert(!overlap_start); + ++overlap_start; + continue; + } + + /* This only happens if supplied region + * is in the middle of an existing one. Thus it can not + * overlap with any other existing region. */ + assert(!overlap_start); + assert(!overlap_end); + assert(!overlap_middle); + /* Split region: shrink first part, shift second part. */ + memcpy(dev->mem->regions + n, reg, sizeof *reg); + reg->memory_size = start_addr - reg->guest_phys_addr; + assert(reg->memory_size); + change = memlast + 1 - reg->guest_phys_addr; + reg = dev->mem->regions + n; + reg->memory_size -= change; + assert(reg->memory_size); + reg->guest_phys_addr += change; + reg->userspace_addr += change; + /* Never add more than 1 region */ + assert(dev->mem->nregions == n); + ++dev->mem->nregions; + ++split; + } +} + +/* Called after unassign, so no regions overlap the given range. */ +static void vhost_dev_assign_memory(struct vhost_dev *dev, + uint64_t start_addr, + uint64_t size, + uint64_t uaddr) +{ + int from, to; + struct vhost_memory_region *merged = NULL; + for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) { + struct vhost_memory_region *reg = dev->mem->regions + to; + uint64_t prlast, urlast; + uint64_t pmlast, umlast; + uint64_t s, e, u; + + /* clone old region */ + if (to != from) { + memcpy(reg, dev->mem->regions + from, sizeof *reg); + } + prlast = range_get_last(reg->guest_phys_addr, reg->memory_size); + pmlast = range_get_last(start_addr, size); + urlast = range_get_last(reg->userspace_addr, reg->memory_size); + umlast = range_get_last(uaddr, size); + + /* check for overlapping regions: should never happen. */ + assert(prlast < start_addr || pmlast < reg->guest_phys_addr); + /* Not an adjacent or overlapping region - do not merge. 
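+         * Merging requires contiguity in both the guest-physical and the
+         * userspace address ranges, in either order.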
*/ + if ((prlast + 1 != start_addr || urlast + 1 != uaddr) && + (pmlast + 1 != reg->guest_phys_addr || + umlast + 1 != reg->userspace_addr)) { + continue; + } + + if (merged) { + --to; + assert(to >= 0); + } else { + merged = reg; + } + u = MIN(uaddr, reg->userspace_addr); + s = MIN(start_addr, reg->guest_phys_addr); + e = MAX(pmlast, prlast); + uaddr = merged->userspace_addr = u; + start_addr = merged->guest_phys_addr = s; + size = merged->memory_size = e - s + 1; + assert(merged->memory_size); + } + + if (!merged) { + struct vhost_memory_region *reg = dev->mem->regions + to; + memset(reg, 0, sizeof *reg); + reg->memory_size = size; + assert(reg->memory_size); + reg->guest_phys_addr = start_addr; + reg->userspace_addr = uaddr; + ++to; + } + assert(to <= dev->mem->nregions + 1); + dev->mem->nregions = to; +} + +static uint64_t vhost_get_log_size(struct vhost_dev *dev) +{ + uint64_t log_size = 0; + int i; + for (i = 0; i < dev->mem->nregions; ++i) { + struct vhost_memory_region *reg = dev->mem->regions + i; + uint64_t last = range_get_last(reg->guest_phys_addr, + reg->memory_size); + log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1); + } + for (i = 0; i < dev->nvqs; ++i) { + struct vhost_virtqueue *vq = dev->vqs + i; + uint64_t last = vq->used_phys + vq->used_size - 1; + log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1); + } + return log_size; +} + +static inline void vhost_dev_log_resize(struct vhost_dev* dev, uint64_t size) +{ + vhost_log_chunk_t *log; + uint64_t log_base; + int r; + + log = g_malloc0(size * sizeof *log); + log_base = (uint64_t)(unsigned long)log; + r = ioctl(dev->control, VHOST_SET_LOG_BASE, &log_base); + assert(r >= 0); + /* Sync only the range covered by the old log */ + if (dev->log_size) { + vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1); + } + if (dev->log) { + g_free(dev->log); + } + dev->log = log; + dev->log_size = size; +} + +static int vhost_verify_ring_mappings(struct vhost_dev *dev, + uint64_t start_addr, + uint64_t size) +{ + int i; + for (i = 0; i < dev->nvqs; ++i) { + struct vhost_virtqueue *vq = dev->vqs + i; + hwaddr l; + void *p; + + if (!ranges_overlap(start_addr, size, vq->ring_phys, vq->ring_size)) { + continue; + } + l = vq->ring_size; + p = cpu_physical_memory_map(vq->ring_phys, &l, 1); + if (!p || l != vq->ring_size) { + fprintf(stderr, "Unable to map ring buffer for ring %d\n", i); + return -ENOMEM; + } + if (p != vq->ring) { + fprintf(stderr, "Ring buffer relocated for ring %d\n", i); + return -EBUSY; + } + cpu_physical_memory_unmap(p, l, 0, 0); + } + return 0; +} + +static struct vhost_memory_region *vhost_dev_find_reg(struct vhost_dev *dev, + uint64_t start_addr, + uint64_t size) +{ + int i, n = dev->mem->nregions; + for (i = 0; i < n; ++i) { + struct vhost_memory_region *reg = dev->mem->regions + i; + if (ranges_overlap(reg->guest_phys_addr, reg->memory_size, + start_addr, size)) { + return reg; + } + } + return NULL; +} + +static bool vhost_dev_cmp_memory(struct vhost_dev *dev, + uint64_t start_addr, + uint64_t size, + uint64_t uaddr) +{ + struct vhost_memory_region *reg = vhost_dev_find_reg(dev, start_addr, size); + uint64_t reglast; + uint64_t memlast; + + if (!reg) { + return true; + } + + reglast = range_get_last(reg->guest_phys_addr, reg->memory_size); + memlast = range_get_last(start_addr, size); + + /* Need to extend region? */ + if (start_addr < reg->guest_phys_addr || memlast > reglast) { + return true; + } + /* userspace_addr changed? 
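+     * The region already covers this guest-physical range, but it may
+     * have been remapped to a different host virtual address.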
+     */
+    return uaddr != reg->userspace_addr + start_addr - reg->guest_phys_addr;
+}
+
+static void vhost_set_memory(MemoryListener *listener,
+                             MemoryRegionSection *section,
+                             bool add)
+{
+    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
+                                         memory_listener);
+    hwaddr start_addr = section->offset_within_address_space;
+    ram_addr_t size = section->size;
+    bool log_dirty = memory_region_is_logging(section->mr);
+    int s = offsetof(struct vhost_memory, regions) +
+        (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
+    uint64_t log_size;
+    int r;
+    void *ram;
+
+    dev->mem = g_realloc(dev->mem, s);
+
+    if (log_dirty) {
+        add = false;
+    }
+
+    assert(size);
+
+    /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
+    ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
+    if (add) {
+        if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
+            /* Region exists with same address. Nothing to do. */
+            return;
+        }
+    } else {
+        if (!vhost_dev_find_reg(dev, start_addr, size)) {
+            /* Removing region that we don't access. Nothing to do. */
+            return;
+        }
+    }
+
+    vhost_dev_unassign_memory(dev, start_addr, size);
+    if (add) {
+        /* Add given mapping, merging adjacent regions if any */
+        vhost_dev_assign_memory(dev, start_addr, size, (uintptr_t)ram);
+    } else {
+        /* Remove old mapping for this memory, if any. */
+        vhost_dev_unassign_memory(dev, start_addr, size);
+    }
+
+    if (!dev->started) {
+        return;
+    }
+
+    if (dev->started) {
+        r = vhost_verify_ring_mappings(dev, start_addr, size);
+        assert(r >= 0);
+    }
+
+    if (!dev->log_enabled) {
+        r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem);
+        assert(r >= 0);
+        return;
+    }
+    log_size = vhost_get_log_size(dev);
+    /* We allocate an extra 4K bytes to log,
+     * to reduce the number of reallocations. */
+#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
+    /* To log more, must increase log size before table update. */
+    if (dev->log_size < log_size) {
+        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
+    }
+    r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem);
+    assert(r >= 0);
+    /* To log less, can only decrease log size after table update.
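+     * Shrinking the log first could let the kernel mark pages dirty
+     * past the end of the smaller buffer.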
*/ + if (dev->log_size > log_size + VHOST_LOG_BUFFER) { + vhost_dev_log_resize(dev, log_size); + } +} + +static bool vhost_section(MemoryRegionSection *section) +{ + return memory_region_is_ram(section->mr); +} + +static void vhost_begin(MemoryListener *listener) +{ +} + +static void vhost_commit(MemoryListener *listener) +{ +} + +static void vhost_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + struct vhost_dev *dev = container_of(listener, struct vhost_dev, + memory_listener); + + if (!vhost_section(section)) { + return; + } + + ++dev->n_mem_sections; + dev->mem_sections = g_renew(MemoryRegionSection, dev->mem_sections, + dev->n_mem_sections); + dev->mem_sections[dev->n_mem_sections - 1] = *section; + vhost_set_memory(listener, section, true); +} + +static void vhost_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + struct vhost_dev *dev = container_of(listener, struct vhost_dev, + memory_listener); + int i; + + if (!vhost_section(section)) { + return; + } + + vhost_set_memory(listener, section, false); + for (i = 0; i < dev->n_mem_sections; ++i) { + if (dev->mem_sections[i].offset_within_address_space + == section->offset_within_address_space) { + --dev->n_mem_sections; + memmove(&dev->mem_sections[i], &dev->mem_sections[i+1], + (dev->n_mem_sections - i) * sizeof(*dev->mem_sections)); + break; + } + } +} + +static void vhost_region_nop(MemoryListener *listener, + MemoryRegionSection *section) +{ +} + +static int vhost_virtqueue_set_addr(struct vhost_dev *dev, + struct vhost_virtqueue *vq, + unsigned idx, bool enable_log) +{ + struct vhost_vring_addr addr = { + .index = idx, + .desc_user_addr = (uint64_t)(unsigned long)vq->desc, + .avail_user_addr = (uint64_t)(unsigned long)vq->avail, + .used_user_addr = (uint64_t)(unsigned long)vq->used, + .log_guest_addr = vq->used_phys, + .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0, + }; + int r = ioctl(dev->control, VHOST_SET_VRING_ADDR, &addr); + if (r < 0) { + return -errno; + } + return 0; +} + +static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log) +{ + uint64_t features = dev->acked_features; + int r; + if (enable_log) { + features |= 0x1 << VHOST_F_LOG_ALL; + } + r = ioctl(dev->control, VHOST_SET_FEATURES, &features); + return r < 0 ? 
-errno : 0; +} + +static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log) +{ + int r, t, i; + r = vhost_dev_set_features(dev, enable_log); + if (r < 0) { + goto err_features; + } + for (i = 0; i < dev->nvqs; ++i) { + r = vhost_virtqueue_set_addr(dev, dev->vqs + i, i, + enable_log); + if (r < 0) { + goto err_vq; + } + } + return 0; +err_vq: + for (; i >= 0; --i) { + t = vhost_virtqueue_set_addr(dev, dev->vqs + i, i, + dev->log_enabled); + assert(t >= 0); + } + t = vhost_dev_set_features(dev, dev->log_enabled); + assert(t >= 0); +err_features: + return r; +} + +static int vhost_migration_log(MemoryListener *listener, int enable) +{ + struct vhost_dev *dev = container_of(listener, struct vhost_dev, + memory_listener); + int r; + if (!!enable == dev->log_enabled) { + return 0; + } + if (!dev->started) { + dev->log_enabled = enable; + return 0; + } + if (!enable) { + r = vhost_dev_set_log(dev, false); + if (r < 0) { + return r; + } + if (dev->log) { + g_free(dev->log); + } + dev->log = NULL; + dev->log_size = 0; + } else { + vhost_dev_log_resize(dev, vhost_get_log_size(dev)); + r = vhost_dev_set_log(dev, true); + if (r < 0) { + return r; + } + } + dev->log_enabled = enable; + return 0; +} + +static void vhost_log_global_start(MemoryListener *listener) +{ + int r; + + r = vhost_migration_log(listener, true); + if (r < 0) { + abort(); + } +} + +static void vhost_log_global_stop(MemoryListener *listener) +{ + int r; + + r = vhost_migration_log(listener, false); + if (r < 0) { + abort(); + } +} + +static void vhost_log_start(MemoryListener *listener, + MemoryRegionSection *section) +{ + /* FIXME: implement */ +} + +static void vhost_log_stop(MemoryListener *listener, + MemoryRegionSection *section) +{ + /* FIXME: implement */ +} + +static int vhost_virtqueue_start(struct vhost_dev *dev, + struct VirtIODevice *vdev, + struct vhost_virtqueue *vq, + unsigned idx) +{ + hwaddr s, l, a; + int r; + int vhost_vq_index = idx - dev->vq_index; + struct vhost_vring_file file = { + .index = vhost_vq_index + }; + struct vhost_vring_state state = { + .index = vhost_vq_index + }; + struct VirtQueue *vvq = virtio_get_queue(vdev, idx); + + assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs); + + vq->num = state.num = virtio_queue_get_num(vdev, idx); + r = ioctl(dev->control, VHOST_SET_VRING_NUM, &state); + if (r) { + return -errno; + } + + state.num = virtio_queue_get_last_avail_idx(vdev, idx); + r = ioctl(dev->control, VHOST_SET_VRING_BASE, &state); + if (r) { + return -errno; + } + + s = l = virtio_queue_get_desc_size(vdev, idx); + a = virtio_queue_get_desc_addr(vdev, idx); + vq->desc = cpu_physical_memory_map(a, &l, 0); + if (!vq->desc || l != s) { + r = -ENOMEM; + goto fail_alloc_desc; + } + s = l = virtio_queue_get_avail_size(vdev, idx); + a = virtio_queue_get_avail_addr(vdev, idx); + vq->avail = cpu_physical_memory_map(a, &l, 0); + if (!vq->avail || l != s) { + r = -ENOMEM; + goto fail_alloc_avail; + } + vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx); + vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx); + vq->used = cpu_physical_memory_map(a, &l, 1); + if (!vq->used || l != s) { + r = -ENOMEM; + goto fail_alloc_used; + } + + vq->ring_size = s = l = virtio_queue_get_ring_size(vdev, idx); + vq->ring_phys = a = virtio_queue_get_ring_addr(vdev, idx); + vq->ring = cpu_physical_memory_map(a, &l, 1); + if (!vq->ring || l != s) { + r = -ENOMEM; + goto fail_alloc_ring; + } + + r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled); + if (r < 0) { + r = 
-errno; + goto fail_alloc; + } + + file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq)); + r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file); + if (r) { + r = -errno; + goto fail_kick; + } + + /* Clear and discard previous events if any. */ + event_notifier_test_and_clear(&vq->masked_notifier); + + return 0; + +fail_kick: +fail_alloc: + cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx), + 0, 0); +fail_alloc_ring: + cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx), + 0, 0); +fail_alloc_used: + cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx), + 0, 0); +fail_alloc_avail: + cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx), + 0, 0); +fail_alloc_desc: + return r; +} + +static void vhost_virtqueue_stop(struct vhost_dev *dev, + struct VirtIODevice *vdev, + struct vhost_virtqueue *vq, + unsigned idx) +{ + struct vhost_vring_state state = { + .index = idx - dev->vq_index + }; + int r; + assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs); + r = ioctl(dev->control, VHOST_GET_VRING_BASE, &state); + if (r < 0) { + fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r); + fflush(stderr); + } + virtio_queue_set_last_avail_idx(vdev, idx, state.num); + assert (r >= 0); + cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx), + 0, virtio_queue_get_ring_size(vdev, idx)); + cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx), + 1, virtio_queue_get_used_size(vdev, idx)); + cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx), + 0, virtio_queue_get_avail_size(vdev, idx)); + cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx), + 0, virtio_queue_get_desc_size(vdev, idx)); +} + +static void vhost_eventfd_add(MemoryListener *listener, + MemoryRegionSection *section, + bool match_data, uint64_t data, EventNotifier *e) +{ +} + +static void vhost_eventfd_del(MemoryListener *listener, + MemoryRegionSection *section, + bool match_data, uint64_t data, EventNotifier *e) +{ +} + +static int vhost_virtqueue_init(struct vhost_dev *dev, + struct vhost_virtqueue *vq, int n) +{ + struct vhost_vring_file file = { + .index = n, + }; + int r = event_notifier_init(&vq->masked_notifier, 0); + if (r < 0) { + return r; + } + + file.fd = event_notifier_get_fd(&vq->masked_notifier); + r = ioctl(dev->control, VHOST_SET_VRING_CALL, &file); + if (r) { + r = -errno; + goto fail_call; + } + return 0; +fail_call: + event_notifier_cleanup(&vq->masked_notifier); + return r; +} + +static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq) +{ + event_notifier_cleanup(&vq->masked_notifier); +} + +int vhost_dev_init(struct vhost_dev *hdev, int devfd, const char *devpath, + bool force) +{ + uint64_t features; + int i, r; + if (devfd >= 0) { + hdev->control = devfd; + } else { + hdev->control = open(devpath, O_RDWR); + if (hdev->control < 0) { + return -errno; + } + } + r = ioctl(hdev->control, VHOST_SET_OWNER, NULL); + if (r < 0) { + goto fail; + } + + r = ioctl(hdev->control, VHOST_GET_FEATURES, &features); + if (r < 0) { + goto fail; + } + + for (i = 0; i < hdev->nvqs; ++i) { + r = vhost_virtqueue_init(hdev, hdev->vqs + i, i); + if (r < 0) { + goto fail_vq; + } + } + hdev->features = features; + + hdev->memory_listener = (MemoryListener) { + .begin = vhost_begin, + .commit = vhost_commit, + .region_add = vhost_region_add, + .region_del = vhost_region_del, + .region_nop = vhost_region_nop, + .log_start = 
vhost_log_start, + .log_stop = vhost_log_stop, + .log_sync = vhost_log_sync, + .log_global_start = vhost_log_global_start, + .log_global_stop = vhost_log_global_stop, + .eventfd_add = vhost_eventfd_add, + .eventfd_del = vhost_eventfd_del, + .priority = 10 + }; + hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions)); + hdev->n_mem_sections = 0; + hdev->mem_sections = NULL; + hdev->log = NULL; + hdev->log_size = 0; + hdev->log_enabled = false; + hdev->started = false; + memory_listener_register(&hdev->memory_listener, &address_space_memory); + hdev->force = force; + return 0; +fail_vq: + while (--i >= 0) { + vhost_virtqueue_cleanup(hdev->vqs + i); + } +fail: + r = -errno; + close(hdev->control); + return r; +} + +void vhost_dev_cleanup(struct vhost_dev *hdev) +{ + int i; + for (i = 0; i < hdev->nvqs; ++i) { + vhost_virtqueue_cleanup(hdev->vqs + i); + } + memory_listener_unregister(&hdev->memory_listener); + g_free(hdev->mem); + g_free(hdev->mem_sections); + close(hdev->control); +} + +bool vhost_dev_query(struct vhost_dev *hdev, VirtIODevice *vdev) +{ + return !vdev->binding->query_guest_notifiers || + vdev->binding->query_guest_notifiers(vdev->binding_opaque) || + hdev->force; +} + +/* Stop processing guest IO notifications in qemu. + * Start processing them in vhost in kernel. + */ +int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) +{ + int i, r; + if (!vdev->binding->set_host_notifier) { + fprintf(stderr, "binding does not support host notifiers\n"); + r = -ENOSYS; + goto fail; + } + + for (i = 0; i < hdev->nvqs; ++i) { + r = vdev->binding->set_host_notifier(vdev->binding_opaque, + hdev->vq_index + i, + true); + if (r < 0) { + fprintf(stderr, "vhost VQ %d notifier binding failed: %d\n", i, -r); + goto fail_vq; + } + } + + return 0; +fail_vq: + while (--i >= 0) { + r = vdev->binding->set_host_notifier(vdev->binding_opaque, + hdev->vq_index + i, + false); + if (r < 0) { + fprintf(stderr, "vhost VQ %d notifier cleanup error: %d\n", i, -r); + fflush(stderr); + } + assert (r >= 0); + } +fail: + return r; +} + +/* Stop processing guest IO notifications in vhost. + * Start processing them in qemu. + * This might actually run the qemu handlers right away, + * so virtio in qemu must be completely setup when this is called. + */ +void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) +{ + int i, r; + + for (i = 0; i < hdev->nvqs; ++i) { + r = vdev->binding->set_host_notifier(vdev->binding_opaque, + hdev->vq_index + i, + false); + if (r < 0) { + fprintf(stderr, "vhost VQ %d notifier cleanup failed: %d\n", i, -r); + fflush(stderr); + } + assert (r >= 0); + } +} + +/* Test and clear event pending status. + * Should be called after unmask to avoid losing events. + */ +bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n) +{ + struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index; + assert(hdev->started); + assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs); + return event_notifier_test_and_clear(&vq->masked_notifier); +} + +/* Mask/unmask events from this vq. 
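+ * While masked, the VHOST_SET_VRING_CALL ioctl points the vq at the
+ * internal masked_notifier instead of the guest notifier.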
+ */
+void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
+                          bool mask)
+{
+    struct VirtQueue *vvq = virtio_get_queue(vdev, n);
+    int r, index = n - hdev->vq_index;
+
+    assert(hdev->started);
+    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
+
+    struct vhost_vring_file file = {
+        .index = index
+    };
+    if (mask) {
+        file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
+    } else {
+        file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
+    }
+    r = ioctl(hdev->control, VHOST_SET_VRING_CALL, &file);
+    assert(r >= 0);
+}
+
+/* Host notifiers must be enabled at this point. */
+int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
+{
+    int i, r;
+
+    hdev->started = true;
+
+    r = vhost_dev_set_features(hdev, hdev->log_enabled);
+    if (r < 0) {
+        goto fail_features;
+    }
+    r = ioctl(hdev->control, VHOST_SET_MEM_TABLE, hdev->mem);
+    if (r < 0) {
+        r = -errno;
+        goto fail_mem;
+    }
+    for (i = 0; i < hdev->nvqs; ++i) {
+        r = vhost_virtqueue_start(hdev,
+                                  vdev,
+                                  hdev->vqs + i,
+                                  hdev->vq_index + i);
+        if (r < 0) {
+            goto fail_vq;
+        }
+    }
+
+    if (hdev->log_enabled) {
+        hdev->log_size = vhost_get_log_size(hdev);
+        hdev->log = hdev->log_size ?
+            g_malloc0(hdev->log_size * sizeof *hdev->log) : NULL;
+        r = ioctl(hdev->control, VHOST_SET_LOG_BASE,
+                  (uint64_t)(unsigned long)hdev->log);
+        if (r < 0) {
+            r = -errno;
+            goto fail_log;
+        }
+    }
+
+    return 0;
+fail_log:
+fail_vq:
+    while (--i >= 0) {
+        vhost_virtqueue_stop(hdev,
+                             vdev,
+                             hdev->vqs + i,
+                             hdev->vq_index + i);
+    }
+    i = hdev->nvqs;
+fail_mem:
+fail_features:
+
+    hdev->started = false;
+    return r;
+}
+
+/* Host notifiers must be enabled at this point. */
+void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
+{
+    int i;
+
+    for (i = 0; i < hdev->nvqs; ++i) {
+        vhost_virtqueue_stop(hdev,
+                             vdev,
+                             hdev->vqs + i,
+                             hdev->vq_index + i);
+    }
+    vhost_log_sync_range(hdev, 0, ~0x0ull);
+
+    hdev->started = false;
+    g_free(hdev->log);
+    hdev->log = NULL;
+    hdev->log_size = 0;
+}
+
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
new file mode 100644
index 0000000000..c2c446eb9b
--- /dev/null
+++ b/hw/virtio/virtio-balloon.c
@@ -0,0 +1,416 @@
+/*
+ * Virtio Balloon Device
+ *
+ * Copyright IBM, Corp. 2008
+ * Copyright (C) 2011 Red Hat, Inc.
+ * Copyright (C) 2011 Amit Shah
+ *
+ * Authors:
+ *  Anthony Liguori
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/iov.h"
+#include "qemu/timer.h"
+#include "qemu-common.h"
+#include "hw/virtio/virtio.h"
+#include "hw/i386/pc.h"
+#include "cpu.h"
+#include "sysemu/balloon.h"
+#include "hw/virtio/virtio-balloon.h"
+#include "sysemu/kvm.h"
+#include "exec/address-spaces.h"
+#include "qapi/visitor.h"
+
+#if defined(__linux__)
+#include <sys/mman.h>
+#endif
+
+#include "hw/virtio/virtio-bus.h"
+
+static void balloon_page(void *addr, int deflate)
+{
+#if defined(__linux__)
+    if (!kvm_enabled() || kvm_has_sync_mmu())
+        qemu_madvise(addr, TARGET_PAGE_SIZE,
+                     deflate ? QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED);
+#endif
+}
+
+static const char *balloon_stat_names[] = {
+    [VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in",
+    [VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out",
+    [VIRTIO_BALLOON_S_MAJFLT] = "stat-major-faults",
+    [VIRTIO_BALLOON_S_MINFLT] = "stat-minor-faults",
+    [VIRTIO_BALLOON_S_MEMFREE] = "stat-free-memory",
+    [VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory",
+    [VIRTIO_BALLOON_S_NR] = NULL
+};
+
+/*
+ * reset_stats - Mark all items in the stats array as unset
+ *
+ * This function needs to be called at device initialization and before
+ * updating to a set of newly-generated stats. This will ensure that no
+ * stale values stick around in case the guest reports a subset of the supported
+ * statistics.
+ */
+static inline void reset_stats(VirtIOBalloon *dev)
+{
+    int i;
+    for (i = 0; i < VIRTIO_BALLOON_S_NR; dev->stats[i++] = -1);
+}
+
+static bool balloon_stats_supported(const VirtIOBalloon *s)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    return vdev->guest_features & (1 << VIRTIO_BALLOON_F_STATS_VQ);
+}
+
+static bool balloon_stats_enabled(const VirtIOBalloon *s)
+{
+    return s->stats_poll_interval > 0;
+}
+
+static void balloon_stats_destroy_timer(VirtIOBalloon *s)
+{
+    if (balloon_stats_enabled(s)) {
+        qemu_del_timer(s->stats_timer);
+        qemu_free_timer(s->stats_timer);
+        s->stats_timer = NULL;
+        s->stats_poll_interval = 0;
+    }
+}
+
+static void balloon_stats_change_timer(VirtIOBalloon *s, int secs)
+{
+    qemu_mod_timer(s->stats_timer, qemu_get_clock_ms(vm_clock) + secs * 1000);
+}
+
+static void balloon_stats_poll_cb(void *opaque)
+{
+    VirtIOBalloon *s = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+
+    if (!balloon_stats_supported(s)) {
+        /* re-schedule */
+        balloon_stats_change_timer(s, s->stats_poll_interval);
+        return;
+    }
+
+    virtqueue_push(s->svq, &s->stats_vq_elem, s->stats_vq_offset);
+    virtio_notify(vdev, s->svq);
+}
+
+static void balloon_stats_get_all(Object *obj, struct Visitor *v,
+                                  void *opaque, const char *name, Error **errp)
+{
+    VirtIOBalloon *s = opaque;
+    int i;
+
+    if (!s->stats_last_update) {
+        error_setg(errp, "guest hasn't updated any stats yet");
+        return;
+    }
+
+    visit_start_struct(v, NULL, "guest-stats", name, 0, errp);
+    visit_type_int(v, &s->stats_last_update, "last-update", errp);
+
+    visit_start_struct(v, NULL, NULL, "stats", 0, errp);
+    for (i = 0; i < VIRTIO_BALLOON_S_NR; i++) {
+        visit_type_int64(v, (int64_t *) &s->stats[i], balloon_stat_names[i],
+                         errp);
+    }
+    visit_end_struct(v, errp);
+
+    visit_end_struct(v, errp);
+}
+
+static void balloon_stats_get_poll_interval(Object *obj, struct Visitor *v,
+                                            void *opaque, const char *name,
+                                            Error **errp)
+{
+    VirtIOBalloon *s = opaque;
+    visit_type_int(v, &s->stats_poll_interval, name, errp);
+}
+
+static void balloon_stats_set_poll_interval(Object *obj, struct Visitor *v,
+                                            void *opaque, const char *name,
+                                            Error **errp)
+{
+    VirtIOBalloon *s = opaque;
+    int64_t value;
+
+    visit_type_int(v, &value, name, errp);
+    if (error_is_set(errp)) {
+        return;
+    }
+
+    if (value < 0) {
+        error_setg(errp, "timer value must be non-negative");
+        return;
+    }
+
+    if (value == s->stats_poll_interval) {
+        return;
+    }
+
+    if (value == 0) {
+        /* timer=0 disables the timer */
+        balloon_stats_destroy_timer(s);
+        return;
+    }
+
+    if (balloon_stats_enabled(s)) {
+        /* timer interval change */
+        s->stats_poll_interval = value;
+        balloon_stats_change_timer(s, value);
+        return;
+    }
+
+    /* create a new timer */
+    g_assert(s->stats_timer == NULL);
+    s->stats_timer =
qemu_new_timer_ms(vm_clock, balloon_stats_poll_cb, s); + s->stats_poll_interval = value; + balloon_stats_change_timer(s, 0); +} + +static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(vdev); + VirtQueueElement elem; + MemoryRegionSection section; + + while (virtqueue_pop(vq, &elem)) { + size_t offset = 0; + uint32_t pfn; + + while (iov_to_buf(elem.out_sg, elem.out_num, offset, &pfn, 4) == 4) { + ram_addr_t pa; + ram_addr_t addr; + + pa = (ram_addr_t)ldl_p(&pfn) << VIRTIO_BALLOON_PFN_SHIFT; + offset += 4; + + /* FIXME: remove get_system_memory(), but how? */ + section = memory_region_find(get_system_memory(), pa, 1); + if (!section.size || !memory_region_is_ram(section.mr)) + continue; + + /* Using memory_region_get_ram_ptr is bending the rules a bit, but + should be OK because we only want a single page. */ + addr = section.offset_within_region; + balloon_page(memory_region_get_ram_ptr(section.mr) + addr, + !!(vq == s->dvq)); + } + + virtqueue_push(vq, &elem, offset); + virtio_notify(vdev, vq); + } +} + +static void virtio_balloon_receive_stats(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(vdev); + VirtQueueElement *elem = &s->stats_vq_elem; + VirtIOBalloonStat stat; + size_t offset = 0; + qemu_timeval tv; + + if (!virtqueue_pop(vq, elem)) { + goto out; + } + + /* Initialize the stats to get rid of any stale values. This is only + * needed to handle the case where a guest supports fewer stats than it + * used to (ie. it has booted into an old kernel). + */ + reset_stats(s); + + while (iov_to_buf(elem->out_sg, elem->out_num, offset, &stat, sizeof(stat)) + == sizeof(stat)) { + uint16_t tag = tswap16(stat.tag); + uint64_t val = tswap64(stat.val); + + offset += sizeof(stat); + if (tag < VIRTIO_BALLOON_S_NR) + s->stats[tag] = val; + } + s->stats_vq_offset = offset; + + if (qemu_gettimeofday(&tv) < 0) { + fprintf(stderr, "warning: %s: failed to get time of day\n", __func__); + goto out; + } + + s->stats_last_update = tv.tv_sec; + +out: + if (balloon_stats_enabled(s)) { + balloon_stats_change_timer(s, s->stats_poll_interval); + } +} + +static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) +{ + VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); + struct virtio_balloon_config config; + + config.num_pages = cpu_to_le32(dev->num_pages); + config.actual = cpu_to_le32(dev->actual); + + memcpy(config_data, &config, 8); +} + +static void virtio_balloon_set_config(VirtIODevice *vdev, + const uint8_t *config_data) +{ + VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); + struct virtio_balloon_config config; + uint32_t oldactual = dev->actual; + memcpy(&config, config_data, 8); + dev->actual = le32_to_cpu(config.actual); + if (dev->actual != oldactual) { + qemu_balloon_changed(ram_size - + (dev->actual << VIRTIO_BALLOON_PFN_SHIFT)); + } +} + +static uint32_t virtio_balloon_get_features(VirtIODevice *vdev, uint32_t f) +{ + f |= (1 << VIRTIO_BALLOON_F_STATS_VQ); + return f; +} + +static void virtio_balloon_stat(void *opaque, BalloonInfo *info) +{ + VirtIOBalloon *dev = opaque; + info->actual = ram_size - ((uint64_t) dev->actual << + VIRTIO_BALLOON_PFN_SHIFT); +} + +static void virtio_balloon_to_target(void *opaque, ram_addr_t target) +{ + VirtIOBalloon *dev = VIRTIO_BALLOON(opaque); + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + + if (target > ram_size) { + target = ram_size; + } + if (target) { + dev->num_pages = (ram_size - target) >> VIRTIO_BALLOON_PFN_SHIFT; + virtio_notify_config(vdev); + } +} + +static void 
virtio_balloon_save(QEMUFile *f, void *opaque)
+{
+    VirtIOBalloon *s = VIRTIO_BALLOON(opaque);
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+
+    virtio_save(vdev, f);
+
+    qemu_put_be32(f, s->num_pages);
+    qemu_put_be32(f, s->actual);
+}
+
+static int virtio_balloon_load(QEMUFile *f, void *opaque, int version_id)
+{
+    VirtIOBalloon *s = VIRTIO_BALLOON(opaque);
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    int ret;
+
+    if (version_id != 1)
+        return -EINVAL;
+
+    ret = virtio_load(vdev, f);
+    if (ret) {
+        return ret;
+    }
+
+    s->num_pages = qemu_get_be32(f);
+    s->actual = qemu_get_be32(f);
+    return 0;
+}
+
+static int virtio_balloon_device_init(VirtIODevice *vdev)
+{
+    DeviceState *qdev = DEVICE(vdev);
+    VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
+    int ret;
+
+    virtio_init(vdev, "virtio-balloon", VIRTIO_ID_BALLOON, 8);
+
+    vdev->get_config = virtio_balloon_get_config;
+    vdev->set_config = virtio_balloon_set_config;
+    vdev->get_features = virtio_balloon_get_features;
+
+    ret = qemu_add_balloon_handler(virtio_balloon_to_target,
+                                   virtio_balloon_stat, s);
+
+    if (ret < 0) {
+        virtio_common_cleanup(VIRTIO_DEVICE(s));
+        return -1;
+    }
+
+    s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
+    s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
+    s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats);
+
+    register_savevm(qdev, "virtio-balloon", -1, 1,
+                    virtio_balloon_save, virtio_balloon_load, s);
+
+    object_property_add(OBJECT(qdev), "guest-stats", "guest statistics",
+                        balloon_stats_get_all, NULL, NULL, s, NULL);
+
+    object_property_add(OBJECT(qdev), "guest-stats-polling-interval", "int",
+                        balloon_stats_get_poll_interval,
+                        balloon_stats_set_poll_interval,
+                        NULL, s, NULL);
+    return 0;
+}
+
+static int virtio_balloon_device_exit(DeviceState *qdev)
+{
+    VirtIOBalloon *s = VIRTIO_BALLOON(qdev);
+    VirtIODevice *vdev = VIRTIO_DEVICE(qdev);
+
+    balloon_stats_destroy_timer(s);
+    qemu_remove_balloon_handler(s);
+    unregister_savevm(qdev, "virtio-balloon", s);
+    virtio_common_cleanup(vdev);
+    return 0;
+}
+
+static Property virtio_balloon_properties[] = {
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_balloon_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+    dc->exit = virtio_balloon_device_exit;
+    dc->props = virtio_balloon_properties;
+    vdc->init = virtio_balloon_device_init;
+    vdc->get_config = virtio_balloon_get_config;
+    vdc->set_config = virtio_balloon_set_config;
+    vdc->get_features = virtio_balloon_get_features;
+}
+
+static const TypeInfo virtio_balloon_info = {
+    .name = TYPE_VIRTIO_BALLOON,
+    .parent = TYPE_VIRTIO_DEVICE,
+    .instance_size = sizeof(VirtIOBalloon),
+    .class_init = virtio_balloon_class_init,
+};
+
+static void virtio_register_types(void)
+{
+    type_register_static(&virtio_balloon_info);
+}
+
+type_init(virtio_register_types)
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
new file mode 100644
index 0000000000..1c2282c54f
--- /dev/null
+++ b/hw/virtio/virtio.c
@@ -0,0 +1,1121 @@
+/*
+ * Virtio Support
+ *
+ * Copyright IBM, Corp. 2007
+ *
+ * Authors:
+ *  Anthony Liguori
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <inttypes.h>
+
+#include "trace.h"
+#include "qemu/error-report.h"
+#include "hw/virtio/virtio.h"
+#include "qemu/atomic.h"
+#include "hw/virtio/virtio-bus.h"
+
+/* The alignment to use between consumer and producer parts of vring.
+ * x86 pagesize again. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+typedef struct VRingDesc
+{
+    uint64_t addr;
+    uint32_t len;
+    uint16_t flags;
+    uint16_t next;
+} VRingDesc;
+
+typedef struct VRingAvail
+{
+    uint16_t flags;
+    uint16_t idx;
+    uint16_t ring[0];
+} VRingAvail;
+
+typedef struct VRingUsedElem
+{
+    uint32_t id;
+    uint32_t len;
+} VRingUsedElem;
+
+typedef struct VRingUsed
+{
+    uint16_t flags;
+    uint16_t idx;
+    VRingUsedElem ring[0];
+} VRingUsed;
+
+typedef struct VRing
+{
+    unsigned int num;
+    hwaddr desc;
+    hwaddr avail;
+    hwaddr used;
+} VRing;
+
+struct VirtQueue
+{
+    VRing vring;
+    hwaddr pa;
+    uint16_t last_avail_idx;
+    /* Last used index value we have signalled on */
+    uint16_t signalled_used;
+
+    /* Is the value of signalled_used valid? */
+    bool signalled_used_valid;
+
+    /* Notification enabled? */
+    bool notification;
+
+    uint16_t queue_index;
+
+    int inuse;
+
+    uint16_t vector;
+    void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq);
+    VirtIODevice *vdev;
+    EventNotifier guest_notifier;
+    EventNotifier host_notifier;
+};
+
+/* virt queue functions */
+static void virtqueue_init(VirtQueue *vq)
+{
+    hwaddr pa = vq->pa;
+
+    vq->vring.desc = pa;
+    vq->vring.avail = pa + vq->vring.num * sizeof(VRingDesc);
+    vq->vring.used = vring_align(vq->vring.avail +
+                                 offsetof(VRingAvail, ring[vq->vring.num]),
+                                 VIRTIO_PCI_VRING_ALIGN);
+}
+
+static inline uint64_t vring_desc_addr(hwaddr desc_pa, int i)
+{
+    hwaddr pa;
+    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
+    return ldq_phys(pa);
+}
+
+static inline uint32_t vring_desc_len(hwaddr desc_pa, int i)
+{
+    hwaddr pa;
+    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
+    return ldl_phys(pa);
+}
+
+static inline uint16_t vring_desc_flags(hwaddr desc_pa, int i)
+{
+    hwaddr pa;
+    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
+    return lduw_phys(pa);
+}
+
+static inline uint16_t vring_desc_next(hwaddr desc_pa, int i)
+{
+    hwaddr pa;
+    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
+    return lduw_phys(pa);
+}
+
+static inline uint16_t vring_avail_flags(VirtQueue *vq)
+{
+    hwaddr pa;
+    pa = vq->vring.avail + offsetof(VRingAvail, flags);
+    return lduw_phys(pa);
+}
+
+static inline uint16_t vring_avail_idx(VirtQueue *vq)
+{
+    hwaddr pa;
+    pa = vq->vring.avail + offsetof(VRingAvail, idx);
+    return lduw_phys(pa);
+}
+
+static inline uint16_t vring_avail_ring(VirtQueue *vq, int i)
+{
+    hwaddr pa;
+    pa = vq->vring.avail + offsetof(VRingAvail, ring[i]);
+    return lduw_phys(pa);
+}
+
+static inline uint16_t vring_used_event(VirtQueue *vq)
+{
+    return vring_avail_ring(vq, vq->vring.num);
+}
+
+static inline void vring_used_ring_id(VirtQueue *vq, int i, uint32_t val)
+{
+    hwaddr pa;
+    pa = vq->vring.used + offsetof(VRingUsed, ring[i].id);
+    stl_phys(pa, val);
+}
+
+static inline void vring_used_ring_len(VirtQueue *vq, int i, uint32_t val)
+{
+    hwaddr pa;
+    pa = vq->vring.used + offsetof(VRingUsed, ring[i].len);
+    stl_phys(pa, val);
+}
+
+static uint16_t vring_used_idx(VirtQueue *vq)
+{
+    hwaddr pa;
+    pa = vq->vring.used + offsetof(VRingUsed, idx);
+    return lduw_phys(pa);
+}
+
+static inline void vring_used_idx_set(VirtQueue *vq, uint16_t val)
+{
+    hwaddr pa;
+    pa = vq->vring.used + offsetof(VRingUsed, idx);
+    stw_phys(pa, val);
+}
+
+static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask)
+{
+    hwaddr pa;
+    pa = vq->vring.used + offsetof(VRingUsed, flags);
+    stw_phys(pa, lduw_phys(pa) | mask);
+}
+
+static inline void vring_used_flags_unset_bit(VirtQueue
*vq, int mask) +{ + hwaddr pa; + pa = vq->vring.used + offsetof(VRingUsed, flags); + stw_phys(pa, lduw_phys(pa) & ~mask); +} + +static inline void vring_avail_event(VirtQueue *vq, uint16_t val) +{ + hwaddr pa; + if (!vq->notification) { + return; + } + pa = vq->vring.used + offsetof(VRingUsed, ring[vq->vring.num]); + stw_phys(pa, val); +} + +void virtio_queue_set_notification(VirtQueue *vq, int enable) +{ + vq->notification = enable; + if (vq->vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) { + vring_avail_event(vq, vring_avail_idx(vq)); + } else if (enable) { + vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY); + } else { + vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY); + } + if (enable) { + /* Expose avail event/used flags before caller checks the avail idx. */ + smp_mb(); + } +} + +int virtio_queue_ready(VirtQueue *vq) +{ + return vq->vring.avail != 0; +} + +int virtio_queue_empty(VirtQueue *vq) +{ + return vring_avail_idx(vq) == vq->last_avail_idx; +} + +void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len, unsigned int idx) +{ + unsigned int offset; + int i; + + trace_virtqueue_fill(vq, elem, len, idx); + + offset = 0; + for (i = 0; i < elem->in_num; i++) { + size_t size = MIN(len - offset, elem->in_sg[i].iov_len); + + cpu_physical_memory_unmap(elem->in_sg[i].iov_base, + elem->in_sg[i].iov_len, + 1, size); + + offset += size; + } + + for (i = 0; i < elem->out_num; i++) + cpu_physical_memory_unmap(elem->out_sg[i].iov_base, + elem->out_sg[i].iov_len, + 0, elem->out_sg[i].iov_len); + + idx = (idx + vring_used_idx(vq)) % vq->vring.num; + + /* Get a pointer to the next entry in the used ring. */ + vring_used_ring_id(vq, idx, elem->index); + vring_used_ring_len(vq, idx, len); +} + +void virtqueue_flush(VirtQueue *vq, unsigned int count) +{ + uint16_t old, new; + /* Make sure buffer is written before we update index. */ + smp_wmb(); + trace_virtqueue_flush(vq, count); + old = vring_used_idx(vq); + new = old + count; + vring_used_idx_set(vq, new); + vq->inuse -= count; + if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) + vq->signalled_used_valid = false; +} + +void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len) +{ + virtqueue_fill(vq, elem, len, 0); + virtqueue_flush(vq, 1); +} + +static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx) +{ + uint16_t num_heads = vring_avail_idx(vq) - idx; + + /* Check it isn't doing very strange things with descriptor numbers. */ + if (num_heads > vq->vring.num) { + error_report("Guest moved used index from %u to %u", + idx, vring_avail_idx(vq)); + exit(1); + } + /* On success, callers read a descriptor at vq->last_avail_idx. + * Make sure descriptor read does not bypass avail index read. */ + if (num_heads) { + smp_rmb(); + } + + return num_heads; +} + +static unsigned int virtqueue_get_head(VirtQueue *vq, unsigned int idx) +{ + unsigned int head; + + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. */ + head = vring_avail_ring(vq, idx % vq->vring.num); + + /* If their number is silly, that's a fatal mistake. */ + if (head >= vq->vring.num) { + error_report("Guest says index %u is available", head); + exit(1); + } + + return head; +} + +static unsigned virtqueue_next_desc(hwaddr desc_pa, + unsigned int i, unsigned int max) +{ + unsigned int next; + + /* If this descriptor says it doesn't chain, we're done. 
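+     * max is never a valid descriptor index, so callers use it as the
+     * end-of-chain marker.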
*/ + if (!(vring_desc_flags(desc_pa, i) & VRING_DESC_F_NEXT)) + return max; + + /* Check they're not leading us off end of descriptors. */ + next = vring_desc_next(desc_pa, i); + /* Make sure compiler knows to grab that: we don't want it changing! */ + smp_wmb(); + + if (next >= max) { + error_report("Desc next is %u", next); + exit(1); + } + + return next; +} + +void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, + unsigned int *out_bytes, + unsigned max_in_bytes, unsigned max_out_bytes) +{ + unsigned int idx; + unsigned int total_bufs, in_total, out_total; + + idx = vq->last_avail_idx; + + total_bufs = in_total = out_total = 0; + while (virtqueue_num_heads(vq, idx)) { + unsigned int max, num_bufs, indirect = 0; + hwaddr desc_pa; + int i; + + max = vq->vring.num; + num_bufs = total_bufs; + i = virtqueue_get_head(vq, idx++); + desc_pa = vq->vring.desc; + + if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_INDIRECT) { + if (vring_desc_len(desc_pa, i) % sizeof(VRingDesc)) { + error_report("Invalid size for indirect buffer table"); + exit(1); + } + + /* If we've got too many, that implies a descriptor loop. */ + if (num_bufs >= max) { + error_report("Looped descriptor"); + exit(1); + } + + /* loop over the indirect descriptor table */ + indirect = 1; + max = vring_desc_len(desc_pa, i) / sizeof(VRingDesc); + num_bufs = i = 0; + desc_pa = vring_desc_addr(desc_pa, i); + } + + do { + /* If we've got too many, that implies a descriptor loop. */ + if (++num_bufs > max) { + error_report("Looped descriptor"); + exit(1); + } + + if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_WRITE) { + in_total += vring_desc_len(desc_pa, i); + } else { + out_total += vring_desc_len(desc_pa, i); + } + if (in_total >= max_in_bytes && out_total >= max_out_bytes) { + goto done; + } + } while ((i = virtqueue_next_desc(desc_pa, i, max)) != max); + + if (!indirect) + total_bufs = num_bufs; + else + total_bufs++; + } +done: + if (in_bytes) { + *in_bytes = in_total; + } + if (out_bytes) { + *out_bytes = out_total; + } +} + +int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, + unsigned int out_bytes) +{ + unsigned int in_total, out_total; + + virtqueue_get_avail_bytes(vq, &in_total, &out_total, in_bytes, out_bytes); + return in_bytes <= in_total && out_bytes <= out_total; +} + +void virtqueue_map_sg(struct iovec *sg, hwaddr *addr, + size_t num_sg, int is_write) +{ + unsigned int i; + hwaddr len; + + for (i = 0; i < num_sg; i++) { + len = sg[i].iov_len; + sg[i].iov_base = cpu_physical_memory_map(addr[i], &len, is_write); + if (sg[i].iov_base == NULL || len != sg[i].iov_len) { + error_report("virtio: trying to map MMIO memory"); + exit(1); + } + } +} + +int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem) +{ + unsigned int i, head, max; + hwaddr desc_pa = vq->vring.desc; + + if (!virtqueue_num_heads(vq, vq->last_avail_idx)) + return 0; + + /* When we start there are none of either input nor output. 
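+     * in_num and out_num are filled in as the descriptor chain is
+     * walked below.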
+    elem->out_num = elem->in_num = 0;
+
+    max = vq->vring.num;
+
+    i = head = virtqueue_get_head(vq, vq->last_avail_idx++);
+    if (vq->vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) {
+        vring_avail_event(vq, vring_avail_idx(vq));
+    }
+
+    if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_INDIRECT) {
+        if (vring_desc_len(desc_pa, i) % sizeof(VRingDesc)) {
+            error_report("Invalid size for indirect buffer table");
+            exit(1);
+        }
+
+        /* loop over the indirect descriptor table */
+        max = vring_desc_len(desc_pa, i) / sizeof(VRingDesc);
+        desc_pa = vring_desc_addr(desc_pa, i);
+        i = 0;
+    }
+
+    /* Collect all the descriptors */
+    do {
+        struct iovec *sg;
+
+        if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_WRITE) {
+            if (elem->in_num >= ARRAY_SIZE(elem->in_sg)) {
+                error_report("Too many write descriptors in indirect table");
+                exit(1);
+            }
+            elem->in_addr[elem->in_num] = vring_desc_addr(desc_pa, i);
+            sg = &elem->in_sg[elem->in_num++];
+        } else {
+            if (elem->out_num >= ARRAY_SIZE(elem->out_sg)) {
+                error_report("Too many read descriptors in indirect table");
+                exit(1);
+            }
+            elem->out_addr[elem->out_num] = vring_desc_addr(desc_pa, i);
+            sg = &elem->out_sg[elem->out_num++];
+        }
+
+        sg->iov_len = vring_desc_len(desc_pa, i);
+
+        /* If we've got too many, that implies a descriptor loop. */
+        if ((elem->in_num + elem->out_num) > max) {
+            error_report("Looped descriptor");
+            exit(1);
+        }
+    } while ((i = virtqueue_next_desc(desc_pa, i, max)) != max);
+
+    /* Now map what we have collected */
+    virtqueue_map_sg(elem->in_sg, elem->in_addr, elem->in_num, 1);
+    virtqueue_map_sg(elem->out_sg, elem->out_addr, elem->out_num, 0);
+
+    elem->index = head;
+
+    vq->inuse++;
+
+    trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
+    return elem->in_num + elem->out_num;
+}
+
+/* virtio device */
+static void virtio_notify_vector(VirtIODevice *vdev, uint16_t vector)
+{
+    if (vdev->binding->notify) {
+        vdev->binding->notify(vdev->binding_opaque, vector);
+    }
+}
+
+void virtio_update_irq(VirtIODevice *vdev)
+{
+    virtio_notify_vector(vdev, VIRTIO_NO_VECTOR);
+}
+
+void virtio_set_status(VirtIODevice *vdev, uint8_t val)
+{
+    trace_virtio_set_status(vdev, val);
+
+    if (vdev->set_status) {
+        vdev->set_status(vdev, val);
+    }
+    vdev->status = val;
+}
+
+void virtio_reset(void *opaque)
+{
+    VirtIODevice *vdev = opaque;
+    int i;
+
+    virtio_set_status(vdev, 0);
+
+    if (vdev->reset)
+        vdev->reset(vdev);
+
+    vdev->guest_features = 0;
+    vdev->queue_sel = 0;
+    vdev->status = 0;
+    vdev->isr = 0;
+    vdev->config_vector = VIRTIO_NO_VECTOR;
+    virtio_notify_vector(vdev, vdev->config_vector);
+
+    for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
+        vdev->vq[i].vring.desc = 0;
+        vdev->vq[i].vring.avail = 0;
+        vdev->vq[i].vring.used = 0;
+        vdev->vq[i].last_avail_idx = 0;
+        vdev->vq[i].pa = 0;
+        vdev->vq[i].vector = VIRTIO_NO_VECTOR;
+        vdev->vq[i].signalled_used = 0;
+        vdev->vq[i].signalled_used_valid = false;
+        vdev->vq[i].notification = true;
+    }
+}
+
+uint32_t virtio_config_readb(VirtIODevice *vdev, uint32_t addr)
+{
+    uint8_t val;
+
+    vdev->get_config(vdev, vdev->config);
+
+    if (addr > (vdev->config_len - sizeof(val)))
+        return (uint32_t)-1;
+
+    val = ldub_p(vdev->config + addr);
+    return val;
+}
+
+uint32_t virtio_config_readw(VirtIODevice *vdev, uint32_t addr)
+{
+    uint16_t val;
+
+    vdev->get_config(vdev, vdev->config);
+
+    if (addr > (vdev->config_len - sizeof(val)))
+        return (uint32_t)-1;
+
+    val = lduw_p(vdev->config + addr);
+    return val;
+}
+
+uint32_t virtio_config_readl(VirtIODevice *vdev, uint32_t addr)
+{
+    uint32_t val;
+
+    vdev->get_config(vdev, vdev->config);
+
+    if (addr > (vdev->config_len - sizeof(val)))
+        return (uint32_t)-1;
+
+    val = ldl_p(vdev->config + addr);
+    return val;
+}
+
+void virtio_config_writeb(VirtIODevice *vdev, uint32_t addr, uint32_t data)
+{
+    uint8_t val = data;
+
+    if (addr > (vdev->config_len - sizeof(val)))
+        return;
+
+    stb_p(vdev->config + addr, val);
+
+    if (vdev->set_config)
+        vdev->set_config(vdev, vdev->config);
+}
+
+void virtio_config_writew(VirtIODevice *vdev, uint32_t addr, uint32_t data)
+{
+    uint16_t val = data;
+
+    if (addr > (vdev->config_len - sizeof(val)))
+        return;
+
+    stw_p(vdev->config + addr, val);
+
+    if (vdev->set_config)
+        vdev->set_config(vdev, vdev->config);
+}
+
+void virtio_config_writel(VirtIODevice *vdev, uint32_t addr, uint32_t data)
+{
+    uint32_t val = data;
+
+    if (addr > (vdev->config_len - sizeof(val)))
+        return;
+
+    stl_p(vdev->config + addr, val);
+
+    if (vdev->set_config)
+        vdev->set_config(vdev, vdev->config);
+}
+
+void virtio_queue_set_addr(VirtIODevice *vdev, int n, hwaddr addr)
+{
+    vdev->vq[n].pa = addr;
+    virtqueue_init(&vdev->vq[n]);
+}
+
+hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n)
+{
+    return vdev->vq[n].pa;
+}
+
+int virtio_queue_get_num(VirtIODevice *vdev, int n)
+{
+    return vdev->vq[n].vring.num;
+}
+
+int virtio_queue_get_id(VirtQueue *vq)
+{
+    VirtIODevice *vdev = vq->vdev;
+    assert(vq >= &vdev->vq[0] && vq < &vdev->vq[VIRTIO_PCI_QUEUE_MAX]);
+    return vq - &vdev->vq[0];
+}
+
+void virtio_queue_notify_vq(VirtQueue *vq)
+{
+    if (vq->vring.desc) {
+        VirtIODevice *vdev = vq->vdev;
+        trace_virtio_queue_notify(vdev, vq - vdev->vq, vq);
+        vq->handle_output(vdev, vq);
+    }
+}
+
+void virtio_queue_notify(VirtIODevice *vdev, int n)
+{
+    virtio_queue_notify_vq(&vdev->vq[n]);
+}
+
+uint16_t virtio_queue_vector(VirtIODevice *vdev, int n)
+{
+    return n < VIRTIO_PCI_QUEUE_MAX ? vdev->vq[n].vector :
+        VIRTIO_NO_VECTOR;
+}
+
+void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector)
+{
+    if (n < VIRTIO_PCI_QUEUE_MAX)
+        vdev->vq[n].vector = vector;
+}
+
+VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
+                            void (*handle_output)(VirtIODevice *, VirtQueue *))
+{
+    int i;
+
+    for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
+        if (vdev->vq[i].vring.num == 0)
+            break;
+    }
+
+    if (i == VIRTIO_PCI_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE)
+        abort();
+
+    vdev->vq[i].vring.num = queue_size;
+    vdev->vq[i].handle_output = handle_output;
+
+    return &vdev->vq[i];
+}
+
+void virtio_del_queue(VirtIODevice *vdev, int n)
+{
+    if (n < 0 || n >= VIRTIO_PCI_QUEUE_MAX) {
+        abort();
+    }
+
+    vdev->vq[n].vring.num = 0;
+}
+
+void virtio_irq(VirtQueue *vq)
+{
+    trace_virtio_irq(vq);
+    vq->vdev->isr |= 0x01;
+    virtio_notify_vector(vq->vdev, vq->vector);
+}
+
+/* Assuming a given event_idx value from the other side, if
+ * we have just incremented index from old to new, should we
+ * trigger an event?  (A compilable sketch of this check is
+ * appended after the patch.) */
+static inline int vring_need_event(uint16_t event, uint16_t new, uint16_t old)
+{
+    /* Note: Xen has similar logic for notification hold-off
+     * in include/xen/interface/io/ring.h with req_event and req_prod
+     * corresponding to event_idx + 1 and new respectively.
+     * Note also that req_event and req_prod in Xen start at 1,
+     * event indexes in virtio start at 0. */
+    return (uint16_t)(new - event - 1) < (uint16_t)(new - old);
+}
+
+static bool vring_notify(VirtIODevice *vdev, VirtQueue *vq)
+{
+    uint16_t old, new;
+    bool v;
+    /* We need to expose used array entries before checking used event. */
+    smp_mb();
+    /* Always notify when queue is empty (if the driver acknowledged
+     * VIRTIO_F_NOTIFY_ON_EMPTY) */
+    if (((vdev->guest_features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) &&
+         !vq->inuse && vring_avail_idx(vq) == vq->last_avail_idx)) {
+        return true;
+    }
+
+    if (!(vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX))) {
+        return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
+    }
+
+    v = vq->signalled_used_valid;
+    vq->signalled_used_valid = true;
+    old = vq->signalled_used;
+    new = vq->signalled_used = vring_used_idx(vq);
+    return !v || vring_need_event(vring_used_event(vq), new, old);
+}
+
+void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
+{
+    if (!vring_notify(vdev, vq)) {
+        return;
+    }
+
+    trace_virtio_notify(vdev, vq);
+    vdev->isr |= 0x01;
+    virtio_notify_vector(vdev, vq->vector);
+}
+
+void virtio_notify_config(VirtIODevice *vdev)
+{
+    if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
+        return;
+
+    vdev->isr |= 0x03;
+    virtio_notify_vector(vdev, vdev->config_vector);
+}
+
+void virtio_save(VirtIODevice *vdev, QEMUFile *f)
+{
+    int i;
+
+    if (vdev->binding->save_config)
+        vdev->binding->save_config(vdev->binding_opaque, f);
+
+    qemu_put_8s(f, &vdev->status);
+    qemu_put_8s(f, &vdev->isr);
+    qemu_put_be16s(f, &vdev->queue_sel);
+    qemu_put_be32s(f, &vdev->guest_features);
+    qemu_put_be32(f, vdev->config_len);
+    qemu_put_buffer(f, vdev->config, vdev->config_len);
+
+    for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
+        if (vdev->vq[i].vring.num == 0)
+            break;
+    }
+
+    qemu_put_be32(f, i);
+
+    for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
+        if (vdev->vq[i].vring.num == 0)
+            break;
+
+        qemu_put_be32(f, vdev->vq[i].vring.num);
+        qemu_put_be64(f, vdev->vq[i].pa);
+        qemu_put_be16s(f, &vdev->vq[i].last_avail_idx);
+        if (vdev->binding->save_queue)
+            vdev->binding->save_queue(vdev->binding_opaque, i, f);
+    }
+}
+
+int virtio_set_features(VirtIODevice *vdev, uint32_t val)
+{
+    uint32_t supported_features =
+        vdev->binding->get_features(vdev->binding_opaque);
+    bool bad = (val & ~supported_features) != 0;
+
+    val &= supported_features;
+    if (vdev->set_features) {
+        vdev->set_features(vdev, val);
+    }
+    vdev->guest_features = val;
+    return bad ? -1 : 0;
+}
+
+int virtio_load(VirtIODevice *vdev, QEMUFile *f)
+{
+    int num, i, ret;
+    uint32_t features;
+    uint32_t supported_features;
+
+    if (vdev->binding->load_config) {
+        ret = vdev->binding->load_config(vdev->binding_opaque, f);
+        if (ret)
+            return ret;
+    }
+
+    qemu_get_8s(f, &vdev->status);
+    qemu_get_8s(f, &vdev->isr);
+    qemu_get_be16s(f, &vdev->queue_sel);
+    qemu_get_be32s(f, &features);
+
+    if (virtio_set_features(vdev, features) < 0) {
+        supported_features = vdev->binding->get_features(vdev->binding_opaque);
+        error_report("Features 0x%x unsupported. Allowed features: 0x%x",
+                     features, supported_features);
+        return -1;
+    }
+    vdev->config_len = qemu_get_be32(f);
+    qemu_get_buffer(f, vdev->config, vdev->config_len);
+
+    num = qemu_get_be32(f);
+
+    for (i = 0; i < num; i++) {
+        vdev->vq[i].vring.num = qemu_get_be32(f);
+        vdev->vq[i].pa = qemu_get_be64(f);
+        qemu_get_be16s(f, &vdev->vq[i].last_avail_idx);
+        vdev->vq[i].signalled_used_valid = false;
+        vdev->vq[i].notification = true;
+
+        if (vdev->vq[i].pa) {
+            uint16_t nheads;
+            virtqueue_init(&vdev->vq[i]);
+            nheads = vring_avail_idx(&vdev->vq[i]) - vdev->vq[i].last_avail_idx;
+            /* Check it isn't doing very strange things with
+             * descriptor numbers. */
+            if (nheads > vdev->vq[i].vring.num) {
+                error_report("VQ %d size 0x%x Guest index 0x%x "
+                             "inconsistent with Host index 0x%x: delta 0x%x",
+                             i, vdev->vq[i].vring.num,
+                             vring_avail_idx(&vdev->vq[i]),
+                             vdev->vq[i].last_avail_idx, nheads);
+                return -1;
+            }
+        } else if (vdev->vq[i].last_avail_idx) {
+            error_report("VQ %d address 0x0 "
+                         "inconsistent with Host index 0x%x",
+                         i, vdev->vq[i].last_avail_idx);
+            return -1;
+        }
+        if (vdev->binding->load_queue) {
+            ret = vdev->binding->load_queue(vdev->binding_opaque, i, f);
+            if (ret)
+                return ret;
+        }
+    }
+
+    virtio_notify_vector(vdev, VIRTIO_NO_VECTOR);
+    return 0;
+}
+
+void virtio_common_cleanup(VirtIODevice *vdev)
+{
+    qemu_del_vm_change_state_handler(vdev->vmstate);
+    g_free(vdev->config);
+    g_free(vdev->vq);
+}
+
+void virtio_cleanup(VirtIODevice *vdev)
+{
+    virtio_common_cleanup(vdev);
+    g_free(vdev);
+}
+
+static void virtio_vmstate_change(void *opaque, int running, RunState state)
+{
+    VirtIODevice *vdev = opaque;
+    bool backend_run = running && (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK);
+    vdev->vm_running = running;
+
+    if (backend_run) {
+        virtio_set_status(vdev, vdev->status);
+    }
+
+    if (vdev->binding->vmstate_change) {
+        vdev->binding->vmstate_change(vdev->binding_opaque, backend_run);
+    }
+
+    if (!backend_run) {
+        virtio_set_status(vdev, vdev->status);
+    }
+}
+
+void virtio_init(VirtIODevice *vdev, const char *name,
+                 uint16_t device_id, size_t config_size)
+{
+    int i;
+    vdev->device_id = device_id;
+    vdev->status = 0;
+    vdev->isr = 0;
+    vdev->queue_sel = 0;
+    vdev->config_vector = VIRTIO_NO_VECTOR;
+    vdev->vq = g_malloc0(sizeof(VirtQueue) * VIRTIO_PCI_QUEUE_MAX);
+    vdev->vm_running = runstate_is_running();
+    for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
+        vdev->vq[i].vector = VIRTIO_NO_VECTOR;
+        vdev->vq[i].vdev = vdev;
+        vdev->vq[i].queue_index = i;
+    }
+
+    vdev->name = name;
+    vdev->config_len = config_size;
+    if (vdev->config_len) {
+        vdev->config = g_malloc0(config_size);
+    } else {
+        vdev->config = NULL;
+    }
+    vdev->vmstate = qemu_add_vm_change_state_handler(virtio_vmstate_change,
+                                                     vdev);
+}
+
+VirtIODevice *virtio_common_init(const char *name, uint16_t device_id,
+                                 size_t config_size, size_t struct_size)
+{
+    VirtIODevice *vdev;
+    vdev = g_malloc0(struct_size);
+    virtio_init(vdev, name, device_id, config_size);
+    return vdev;
+}
+
+void virtio_bind_device(VirtIODevice *vdev, const VirtIOBindings *binding,
+                        DeviceState *opaque)
+{
+    vdev->binding = binding;
+    vdev->binding_opaque = opaque;
+}
+
+hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n)
+{
+    return vdev->vq[n].vring.desc;
+}
+
+hwaddr virtio_queue_get_avail_addr(VirtIODevice *vdev, int n)
+{
+    return vdev->vq[n].vring.avail;
+}
+
+hwaddr virtio_queue_get_used_addr(VirtIODevice *vdev, int n)
+{
+    return vdev->vq[n].vring.used;
+}
+
+hwaddr virtio_queue_get_ring_addr(VirtIODevice *vdev, int n)
+{
+    return vdev->vq[n].vring.desc;
+}
+
+hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int n)
+{
+    return sizeof(VRingDesc) * vdev->vq[n].vring.num;
+}
+
+hwaddr virtio_queue_get_avail_size(VirtIODevice *vdev, int n)
+{
+    return offsetof(VRingAvail, ring) +
+           sizeof(uint16_t) * vdev->vq[n].vring.num;
+}
+
+hwaddr virtio_queue_get_used_size(VirtIODevice *vdev, int n)
+{
+    return offsetof(VRingUsed, ring) +
+           sizeof(VRingUsedElem) * vdev->vq[n].vring.num;
+}
+
+hwaddr virtio_queue_get_ring_size(VirtIODevice *vdev, int n)
+{
+    return vdev->vq[n].vring.used - vdev->vq[n].vring.desc +
+           virtio_queue_get_used_size(vdev, n);
+}
+
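+/* Aside on the size accessors above (layout facts from the legacy vring
+ * format, stated here for the reader rather than taken from the original
+ * commit): each descriptor is 16 bytes; the avail ring is two 16-bit
+ * fields followed by num 16-bit entries; the used ring is two 16-bit
+ * fields followed by num 8-byte entries.  virtio_queue_get_ring_size()
+ * spans from the start of desc to the end of used, so it also covers the
+ * alignment padding the guest leaves between the avail and used rings. */
+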
+uint16_t virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n)
+{
+    return vdev->vq[n].last_avail_idx;
+}
+
+void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx)
+{
+    vdev->vq[n].last_avail_idx = idx;
+}
+
+VirtQueue *virtio_get_queue(VirtIODevice *vdev, int n)
+{
+    return vdev->vq + n;
+}
+
+uint16_t virtio_get_queue_index(VirtQueue *vq)
+{
+    return vq->queue_index;
+}
+
+static void virtio_queue_guest_notifier_read(EventNotifier *n)
+{
+    VirtQueue *vq = container_of(n, VirtQueue, guest_notifier);
+    if (event_notifier_test_and_clear(n)) {
+        virtio_irq(vq);
+    }
+}
+
+void virtio_queue_set_guest_notifier_fd_handler(VirtQueue *vq, bool assign,
+                                                bool with_irqfd)
+{
+    if (assign && !with_irqfd) {
+        event_notifier_set_handler(&vq->guest_notifier,
+                                   virtio_queue_guest_notifier_read);
+    } else {
+        event_notifier_set_handler(&vq->guest_notifier, NULL);
+    }
+    if (!assign) {
+        /* Test and clear notifier before closing it,
+         * in case poll callback didn't have time to run. */
+        virtio_queue_guest_notifier_read(&vq->guest_notifier);
+    }
+}
+
+EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq)
+{
+    return &vq->guest_notifier;
+}
+
+static void virtio_queue_host_notifier_read(EventNotifier *n)
+{
+    VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
+    if (event_notifier_test_and_clear(n)) {
+        virtio_queue_notify_vq(vq);
+    }
+}
+
+void virtio_queue_set_host_notifier_fd_handler(VirtQueue *vq, bool assign,
+                                               bool set_handler)
+{
+    if (assign && set_handler) {
+        event_notifier_set_handler(&vq->host_notifier,
+                                   virtio_queue_host_notifier_read);
+    } else {
+        event_notifier_set_handler(&vq->host_notifier, NULL);
+    }
+    if (!assign) {
+        /* Test and clear notifier after disabling the event handler,
+         * in case poll callback didn't have time to run. */
+        virtio_queue_host_notifier_read(&vq->host_notifier);
+    }
+}
+
+EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq)
+{
+    return &vq->host_notifier;
+}
+
+static int virtio_device_init(DeviceState *qdev)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(qdev);
+    VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(qdev);
+    assert(k->init != NULL);
+    if (k->init(vdev) < 0) {
+        return -1;
+    }
+    virtio_bus_plug_device(vdev);
+    return 0;
+}
+
+static void virtio_device_class_init(ObjectClass *klass, void *data)
+{
+    /* Set the default value here. */
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    dc->init = virtio_device_init;
+    dc->bus_type = TYPE_VIRTIO_BUS;
+}
+
+static const TypeInfo virtio_device_info = {
+    .name = TYPE_VIRTIO_DEVICE,
+    .parent = TYPE_DEVICE,
+    .instance_size = sizeof(VirtIODevice),
+    .class_init = virtio_device_class_init,
+    .abstract = true,
+    .class_size = sizeof(VirtioDeviceClass),
+};
+
+static void virtio_register_types(void)
+{
+    type_register_static(&virtio_device_info);
+}
+
+type_init(virtio_register_types)
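
As an aside on the event-index logic in vring_need_event() above: the check
is pure mod-2^16 arithmetic, so it stays correct when the ring indexes wrap
around. The standalone program below is illustrative only; the main()
harness and the sample index values are invented for this sketch, while the
predicate itself mirrors the patch. It demonstrates the behaviour, including
at a wrap boundary:

/* demo_event_idx.c - standalone sketch of the vring_need_event() check.
 * Build and run: cc -Wall -o demo demo_event_idx.c && ./demo */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Notify iff `event` lies among the entries published while the index
 * moved from `old` to `new`; all subtractions are mod 2^16. */
static int vring_need_event(uint16_t event, uint16_t new, uint16_t old)
{
    return (uint16_t)(new - event - 1) < (uint16_t)(new - old);
}

int main(void)
{
    assert(vring_need_event(5, 6, 4));           /* 4 -> 6 crosses event 5 */
    assert(vring_need_event(5, 6, 5));           /* 5 -> 6 consumes entry 5 */
    assert(!vring_need_event(5, 5, 4));          /* 4 -> 5 stops short of 5 */
    assert(vring_need_event(0xffff, 1, 0xfffe)); /* still correct across wrap */
    assert(!vring_need_event(100, 6, 4));        /* event point far ahead */
    printf("vring_need_event checks passed\n");
    return 0;
}

The same wraparound-safe subtraction is why virtqueue_num_heads() and
virtqueue_flush() keep last_avail_idx and signalled_used as plain uint16_t.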