[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

[PATCH fio] engines: Add Network Block Device (NBD) support.



Signed-off-by: Richard W.M. Jones <rjones@redhat.com>
---
 Makefile         |   2 +-
 engines/nbd.c    | 570 +++++++++++++++++++++++++++++++++++++++++++++++
 examples/nbd.fio |   9 +
 optgroup.c       |   4 +
 optgroup.h       |   2 +
 options.c        |   3 +
 6 files changed, 589 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index fd138dd2..ef8db418 100644
--- a/Makefile
+++ b/Makefile
@@ -45,7 +45,7 @@ SOURCE :=	$(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \
 		pshared.c options.c \
 		smalloc.c filehash.c profile.c debug.c engines/cpu.c \
 		engines/mmap.c engines/sync.c engines/null.c engines/net.c \
-		engines/ftruncate.c engines/filecreate.c \
+		engines/ftruncate.c engines/filecreate.c engines/nbd.c \
 		server.c client.c iolog.c backend.c libfio.c flow.c cconv.c \
 		gettime-thread.c helpers.c json.c idletime.c td_error.c \
 		profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \
diff --git a/engines/nbd.c b/engines/nbd.c
new file mode 100644
index 00000000..86e394a1
--- /dev/null
+++ b/engines/nbd.c
@@ -0,0 +1,570 @@
+/*
+ * NBD engine
+ *
+ * IO engine that talks to a newstyle-fixed NBD server over a TCP or
+ * Unix domain socket.
+ *
+ * Copyright (C) 2019 Red Hat Inc.
+ * Written by Richard W.M. Jones <rjones@redhat.com>
+ * Distributed under the GNU GPL v2 license.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <errno.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "../fio.h"
+#include "../optgroup.h"
+
+/* New-style handshake. */
+struct nbd_new_handshake {
+        char nbdmagic[8];           /* "NBDMAGIC" */
+        uint64_t version;           /* NEW_VERSION */
+        uint16_t gflags;            /* global flags */
+} __attribute__((packed));
+
+#define NEW_VERSION UINT64_C(0x49484156454F5054)
+
+#define NBD_FLAG_FIXED_NEWSTYLE     1
+#define NBD_FLAG_NO_ZEROES          2
+
+struct nbd_new_option {
+        uint64_t version;           /* NEW_VERSION */
+        uint32_t option;            /* NBD_OPT_* */
+        uint32_t optlen;            /* option data length */
+        /* option data follows */
+} __attribute__((packed));
+
+#define NBD_OPT_EXPORT_NAME         UINT32_C(1)
+
+struct nbd_new_handshake_finish {
+        uint64_t exportsize;
+        uint16_t eflags;              /* per-export flags */
+} __attribute__((packed));
+
+/* Per-export flags. */
+#define NBD_FLAG_HAS_FLAGS         (1 << 0)
+#define NBD_FLAG_READ_ONLY         (1 << 1)
+#define NBD_FLAG_SEND_FLUSH        (1 << 2)
+#define NBD_FLAG_SEND_FUA          (1 << 3)
+#define NBD_FLAG_ROTATIONAL        (1 << 4)
+#define NBD_FLAG_SEND_TRIM         (1 << 5)
+#define NBD_FLAG_SEND_WRITE_ZEROES (1 << 6)
+#define NBD_FLAG_SEND_DF           (1 << 7)
+#define NBD_FLAG_CAN_MULTI_CONN    (1 << 8)
+
+struct nbd_request {
+        uint32_t magic;               /* NBD_REQUEST_MAGIC. */
+        uint16_t flags;               /* NBD_CMD_FLAG_* */
+        uint16_t type;                /* NBD_CMD_* */
+        uint64_t handle;              /* Opaque handle. */
+        uint64_t offset;              /* Request offset. */
+        uint32_t count;               /* Request length. */
+} __attribute__((packed));
+
+struct nbd_reply {
+        uint32_t magic;               /* NBD_SIMPLE_REPLY_MAGIC. */
+        uint32_t error;               /* NBD_SUCCESS or one of NBD_E*. */
+        uint64_t handle;              /* Opaque handle. */
+} __attribute__((packed));
+
+#define NBD_REQUEST_MAGIC           UINT32_C(0x25609513)
+#define NBD_SIMPLE_REPLY_MAGIC      UINT32_C(0x67446698)
+
+#define NBD_CMD_READ                0
+#define NBD_CMD_WRITE               1
+#define NBD_CMD_DISC                2
+#define NBD_CMD_FLUSH               3
+#define NBD_CMD_TRIM                4
+#define NBD_CMD_WRITE_ZEROES        6
+
+#define NBD_CMD_FLAG_FUA      (1<<0)
+#define NBD_CMD_FLAG_NO_HOLE  (1<<1)
+
+#define NBD_SUCCESS     0
+#define NBD_EPERM       1
+#define NBD_EIO         5
+#define NBD_ENOMEM     12
+#define NBD_EINVAL     22
+#define NBD_ENOSPC     28
+#define NBD_ESHUTDOWN 108
+
+/* Actually this differs across servers, but for nbdkit ... */
+#define NBD_MAX_REQUEST_SIZE (64 * 1024 * 1024)
+
+static uint64_t exportsize;
+static uint16_t eflags;
+
+struct nbd_data {
+        int fd;
+};
+
+struct nbd_options {
+        void *padding;
+        char *hostname;
+        char *port;
+        char *sockname;
+        char *exportname;
+        unsigned long long size;
+};
+
+static struct fio_option options[] = {
+	{
+		.name	= "hostname",
+		.lname	= "NBD remote hostname",
+		.help	= "Hostname of remote NBD server",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_NBD,
+		.type	= FIO_OPT_STR_STORE,
+                .off1   = offsetof(struct nbd_options, hostname),
+	},
+	{
+		.name	= "port",
+		.lname	= "NBD port",
+		.help	= "Port or service name to use for NBD connection",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_NBD,
+		.type	= FIO_OPT_STR_STORE,
+		.off1	= offsetof(struct nbd_options, port),
+	},
+	{
+		.name	= "sockname",
+		.lname	= "NBD Unix domain socket",
+		.help	= "Name of Unix domain socket",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_NBD,
+		.type	= FIO_OPT_STR_STORE,
+                .off1   = offsetof(struct nbd_options, sockname),
+	},
+	{
+		.name	= "exportname",
+		.lname	= "NBD export name",
+		.help	= "Name of NBD export",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_NBD,
+		.type	= FIO_OPT_STR_STORE,
+                .off1   = offsetof(struct nbd_options, exportname),
+	},
+	{
+		.name	= "nbdsize",
+		.lname	= "Size of block device",
+		.help	= "Size of NBD export to use",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_NBD,
+		.type	= FIO_OPT_ULL,
+                .off1   = offsetof(struct nbd_options, size),
+                .minval = 512,
+                .interval = 512,
+	},
+        {
+                .name   = NULL,
+        },
+};
+
+/* Allocates nbd_data (socket marked not-open) and the placeholder file. */
+static int nbd_setup(struct thread_data *td)
+{
+        struct nbd_data *nbd_data;
+        struct nbd_options *o = td->eo;
+
+        if (!td->io_ops_data) {
+                nbd_data = calloc(1, sizeof(*nbd_data));
+                if (!nbd_data) return 1;
+                /* calloc zeroed fd, but nbd_cleanup closes any fd >= 0 */
+                nbd_data->fd = -1;
+                td->io_ops_data = nbd_data;
+        }
+
+        /* Pretend to deal with files.  See engines/rbd.c */
+        if (!td->files_index) {
+                add_file(td, "nbd", 0, 0);
+                td->o.nr_files = td->o.nr_files ? : 1;
+                td->o.open_files++;
+        }
+        /* XXX We could read the size from NBD, but that's awkward as
+         * we haven't connected to the server.  In any case nothing
+         * bad happens (except an error) if the size is too large.
+         */
+        td->files[0]->real_file_size = o->size;
+
+        return 0;
+}
+
+/* Closes socket and frees nbd_data -- the opposite of nbd_setup. */
+static void nbd_cleanup(struct thread_data *td)
+{
+        struct nbd_data *nbd_data = td->io_ops_data;
+
+        if (nbd_data) {
+                if (nbd_data->fd >= 0)
+                        close(nbd_data->fd);
+                free(nbd_data);
+        }
+}
+
+/* Set TCP_NODELAY on fd (if CONFIG_TCP_NODELAY); returns 0 on success. */
+static int set_nodelay(int fd)
+{
+#ifdef CONFIG_TCP_NODELAY
+        const int optval = 1;
+
+        if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
+                       (void *)&optval, sizeof(int)) == -1) {
+                log_err("fio: nbd: cannot set TCP_NODELAY option: %s\n",
+                        strerror(errno));
+                return 1;
+        }
+#endif
+        return 0;
+}
+
+/* Connect to the NBD server over TCP.  Returns 0 on success, 1 on failure. */
+static int nbd_connect_inet(struct thread_data *td,
+                            const char *hostname, const char *port)
+{
+        struct nbd_data *nbd_data = td->io_ops_data;
+        struct addrinfo hints;
+        struct addrinfo *result, *rp;
+        int r;
+
+        memset(&hints, 0, sizeof hints);
+        hints.ai_family = AF_UNSPEC;
+        hints.ai_socktype = SOCK_STREAM;
+
+        r = getaddrinfo(hostname, port, &hints, &result);
+        if(r != 0) {
+                log_err("fio: nbd: getaddrinfo: %s\n", gai_strerror(r));
+                return 1;
+        }
+
+        for (rp = result; rp; rp = rp->ai_next) {
+                nbd_data->fd = socket(rp->ai_family, rp->ai_socktype,
+                                      rp->ai_protocol);
+                if(nbd_data->fd == -1)
+                        continue;
+                if(connect(nbd_data->fd, rp->ai_addr, rp->ai_addrlen) != -1)
+                        break;
+                close(nbd_data->fd);
+                nbd_data->fd = -1; /* don't leave a closed fd for nbd_cleanup */
+        }
+        freeaddrinfo(result);
+        if (rp == NULL) {
+                td_verror(td, errno, "connect");
+                return 1;
+        }
+
+        if (set_nodelay(nbd_data->fd)) {
+                close(nbd_data->fd);
+                nbd_data->fd = -1;
+                return 1;
+        }
+
+        return 0;
+}
+
+/* Connect to the NBD server over a Unix domain socket.
+ * Returns 0 on success, 1 on failure (with nbd_data->fd reset to -1). */
+static int nbd_connect_unix(struct thread_data *td,
+                            const char *sockname)
+{
+        struct nbd_data *nbd_data = td->io_ops_data;
+        struct sockaddr_un sun;
+        socklen_t len;
+
+        nbd_data->fd = socket(AF_UNIX, SOCK_STREAM, 0);
+        if(nbd_data->fd == -1) {
+                td_verror(td, errno, "socket");
+                return 1;
+        }
+
+        sun.sun_family = AF_UNIX;
+        memset(sun.sun_path, 0, sizeof(sun.sun_path));
+        strncpy(sun.sun_path, sockname, sizeof(sun.sun_path) - 1);
+
+        len = sizeof(sun.sun_family) + strlen(sun.sun_path) + 1;
+        if (connect(nbd_data->fd, (struct sockaddr *) &sun, len) == -1) {
+                td_verror(td, errno, "connect");
+                close(nbd_data->fd);
+                nbd_data->fd = -1; /* avoid a second close in nbd_cleanup */
+                return 1;
+        }
+
+        return 0;
+}
+
+/* Like read(2) and write(2) except they always read/write the full
+ * data or fail if that isn't possible.  Read fails on early EOF too
+ * (logged via log_err; this function does not assign errno then).
+ * Returns 0 for success or 1 for failure.
+ */
+static int full_read(struct thread_data *td,
+                     int fd, void *buf, size_t count)
+{
+        ssize_t r;
+
+        while (count > 0) {
+                r = read(fd, buf, count);
+                if (r == -1) {
+                        td_verror(td, errno, "read");
+                        return 1;
+                }
+                if (r == 0) {
+                        log_err("fio: nbd: unexpected end of file "
+                                "reading from server\n");
+                        return 1;
+                }
+                buf += r;
+                count -= r;
+        }
+
+        return 0;
+}
+
+static int full_write(struct thread_data *td,
+                      int fd, const void *buf, size_t count)
+{
+        ssize_t r;
+
+        while (count > 0) {
+                r = write(fd, buf, count);
+                if (r == -1) {
+                        td_verror(td, errno, "write");
+                        return 1;
+                }
+                buf += r;
+                count -= r;
+        }
+
+        return 0;
+}
+
+/* Fixed newstyle handshake selecting exportname via NBD_OPT_EXPORT_NAME.
+ * Records the export size/flags; returns 0 on success, 1 on failure.
+ */
+static int nbd_handshake(struct thread_data *td, const char *exportname)
+{
+        struct nbd_data *nbd_data = td->io_ops_data;
+        struct nbd_options *o = td->eo;
+        struct nbd_new_handshake handshake;
+        const int expected_gflags = NBD_FLAG_FIXED_NEWSTYLE|NBD_FLAG_NO_ZEROES;
+        uint32_t cflags;
+        struct nbd_new_option option;
+        const uint32_t exportname_len = strlen(exportname);
+        struct nbd_new_handshake_finish finish;
+
+        /* Expect the fixed newstyle handshake. */
+        if (full_read(td, nbd_data->fd, &handshake, sizeof handshake))
+                return 1;
+        handshake.version = be64_to_cpu(handshake.version);
+        handshake.gflags = be16_to_cpu(handshake.gflags);
+        if (memcmp(handshake.nbdmagic, "NBDMAGIC", 8) != 0 ||
+            handshake.version != NEW_VERSION ||
+            (handshake.gflags & expected_gflags) != expected_gflags) {
+                log_err("fio: nbd: handshake failed: server is not a "
+                        "fixed newstyle NBD server\n");
+                return 1;
+        }
+
+        /* Send 32 bit flags word back to the server containing only
+         * the global flags we understand.
+         */
+        cflags = handshake.gflags & expected_gflags;
+        cflags = cpu_to_be32(cflags);
+        if (full_write(td, nbd_data->fd, &cflags, sizeof cflags))
+                return 1;
+
+        /* Send NBD_OPT_EXPORT_NAME.  If we send any other options
+         * they must go before this one, since this option terminates
+         * the option phase.
+         */
+        option.version = cpu_to_be64(NEW_VERSION);
+        option.option = cpu_to_be32(NBD_OPT_EXPORT_NAME);
+        option.optlen = cpu_to_be32(exportname_len);
+        if (full_write(td, nbd_data->fd, &option, sizeof option) ||
+            full_write(td, nbd_data->fd, exportname, exportname_len))
+                return 1;
+        /* Note: the server sends no option reply to NBD_OPT_EXPORT_NAME. */
+
+        /* Server sends handshake finish. */
+        if (full_read(td, nbd_data->fd, &finish, sizeof finish))
+                return 1;
+        finish.exportsize = be64_to_cpu(finish.exportsize);
+        finish.eflags = be16_to_cpu(finish.eflags);
+
+        exportsize = finish.exportsize;
+        eflags = finish.eflags;
+
+        if (exportsize < o->size) {
+                log_err("fio: nbd: NBD export size is smaller than nbdsize: "
+                        "reduce the fio nbdsize= parameter to at most "
+                        "%" PRIu64 "\n",
+                        exportsize);
+                return 1;
+        }
+
+        return 0;
+}
+
+static int nbd_init(struct thread_data *td)
+{
+        struct nbd_options *o = td->eo;
+        int r;
+
+        if (o->sockname) {
+                /* Unix domain socket connection. */
+                if (o->hostname || o->port) {
+                        log_err("fio: nbd: if sockname is specified, "
+                                "hostname and port cannot be used\n");
+                        return 1;
+                }
+                r = nbd_connect_unix(td, o->sockname);
+        } else {
+                /* TCP connection to (usually) remote server. */
+                if (!o->hostname) {
+                        log_err("fio: nbd: you must specify either "
+                                "hostname or sockname\n");
+                        return 1;
+                }
+                r = nbd_connect_inet(td, o->hostname,
+                                     o->port ? o->port : "10809");
+        }
+        if (r)
+                return 1;
+
+        log_info("fio: connected to NBD server\n");
+        r = nbd_handshake(td, o->exportname ? o->exportname : "");
+        if (r)
+                return 1;
+
+        log_info("fio: handshake with NBD server completed\n");
+        return 0;
+}
+
+/* Read/write/trim/flush request (handled synchronously in this engine). */
+static enum fio_q_status nbd_queue(struct thread_data *td,
+                                   struct io_u *io_u)
+{
+        struct nbd_data *nbd_data = td->io_ops_data;
+        struct nbd_request rq;
+        uint16_t cmd;
+        struct nbd_reply rp;
+
+        fio_ro_check(td, io_u);
+
+        if (io_u->ddir == DDIR_WRITE || io_u->ddir == DDIR_READ)
+                assert(io_u->xfer_buflen <= NBD_MAX_REQUEST_SIZE);
+
+        rq.magic = cpu_to_be32(NBD_REQUEST_MAGIC);
+        rq.flags = 0;
+        rq.handle = 0; /* XXX If we do non-sync, need to set this. */
+        rq.offset = cpu_to_be64((uint64_t) io_u->offset);
+        rq.count = cpu_to_be32((uint32_t) io_u->xfer_buflen);
+
+        switch (io_u->ddir) {
+        case DDIR_WRITE: cmd = NBD_CMD_WRITE; break;
+        case DDIR_READ:  cmd = NBD_CMD_READ;  break;
+        case DDIR_TRIM:  cmd = NBD_CMD_TRIM;  break;
+                /* XXX We could probably also handle
+                 * DDIR_SYNC_FILE_RANGE with a bit of effort.
+                 */
+        case DDIR_SYNC:  cmd = NBD_CMD_FLUSH; break;
+        default:
+                io_u->error = EINVAL;
+                return FIO_Q_COMPLETED;
+        }
+        rq.type = cpu_to_be16(cmd);
+
+        /* Send the request + optional write payload. */
+        if (full_write(td, nbd_data->fd, &rq, sizeof rq)) {
+                io_u->error = errno ? errno : EIO;
+                return FIO_Q_COMPLETED;
+        }
+        if (io_u->ddir == DDIR_WRITE) {
+                if(full_write(td, nbd_data->fd,
+                              io_u->xfer_buf, io_u->xfer_buflen)) {
+                        io_u->error = errno ? errno : EIO;
+                        return FIO_Q_COMPLETED;
+                }
+        }
+
+        /* Read the reply + optional read payload. */
+        if (full_read(td, nbd_data->fd, &rp, sizeof rp)) {
+                io_u->error = errno ? errno : EIO; /* EOF leaves errno unset */
+                return FIO_Q_COMPLETED;
+        }
+        rp.magic = be32_to_cpu(rp.magic);
+        rp.error = be32_to_cpu(rp.error);
+        rp.handle = be64_to_cpu(rp.handle);
+        if (rp.magic != NBD_SIMPLE_REPLY_MAGIC) {
+                log_err("fio: nbd: invalid magic in reply message\n");
+                io_u->error = EINVAL;
+                return FIO_Q_COMPLETED;
+        }
+
+        if (rp.error != NBD_SUCCESS) {
+                switch (rp.error) {
+                case NBD_EPERM:     io_u->error = EPERM; break;
+                case NBD_EIO:       io_u->error = EIO; break;
+                case NBD_ENOMEM:    io_u->error = ENOMEM; break;
+                case NBD_ENOSPC:    io_u->error = ENOSPC; break;
+                case NBD_ESHUTDOWN: io_u->error = ESHUTDOWN; break;
+                case NBD_EINVAL: default: io_u->error = EINVAL; break;
+                }
+                return FIO_Q_COMPLETED;
+        }
+
+        if (io_u->ddir == DDIR_READ) {
+                if (full_read(td, nbd_data->fd,
+                              io_u->xfer_buf, io_u->xfer_buflen)) {
+                        io_u->error = errno ? errno : EIO; /* see above */
+                        return FIO_Q_COMPLETED;
+                }
+        }
+
+        io_u->error = 0;
+        return FIO_Q_COMPLETED;
+}
+
+
+static int nbd_open_file(struct thread_data *td, struct fio_file *f)
+{
+        return 0;
+}
+
+static int nbd_invalidate(struct thread_data *td, struct fio_file *f)
+{
+        return 0;
+}
+
+static struct ioengine_ops ioengine = {
+        .name			= "nbd",
+        .version                = FIO_IOOPS_VERSION,
+        .options                = options,
+        .option_struct_size     = sizeof(struct nbd_options),
+        .flags                  = FIO_SYNCIO | FIO_NOEXTEND,
+
+        .setup                  = nbd_setup,
+        .init                   = nbd_init,
+        .cleanup                = nbd_cleanup,
+        .queue                  = nbd_queue,
+        .open_file              = nbd_open_file,
+        .invalidate             = nbd_invalidate,
+};
+
+static void fio_init fio_nbd_register(void)
+{
+        register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_nbd_unregister(void)
+{
+        unregister_ioengine(&ioengine);
+}
diff --git a/examples/nbd.fio b/examples/nbd.fio
new file mode 100644
index 00000000..0e02f7f3
--- /dev/null
+++ b/examples/nbd.fio
@@ -0,0 +1,9 @@
+[test]
+ioengine=nbd
+sockname=/tmp/socket
+rw=randrw
+size=64m
+nbdsize=64m
+iodepth=4
+time_based
+runtime=120
diff --git a/optgroup.c b/optgroup.c
index 04ceec7e..c228ff29 100644
--- a/optgroup.c
+++ b/optgroup.c
@@ -169,6 +169,10 @@ static const struct opt_group fio_opt_cat_groups[] = {
 		.name	= "libhdfs I/O engine", /* libhdfs */
 		.mask	= FIO_OPT_G_HDFS,
 	},
+	{
+		.name	= "NBD I/O engine", /* NBD */
+		.mask	= FIO_OPT_G_NBD,
+	},
 	{
 		.name	= NULL,
 	},
diff --git a/optgroup.h b/optgroup.h
index adf4d09b..bf5c9735 100644
--- a/optgroup.h
+++ b/optgroup.h
@@ -61,6 +61,7 @@ enum opt_category_group {
 	__FIO_OPT_G_MTD,
 	__FIO_OPT_G_HDFS,
 	__FIO_OPT_G_SG,
+	__FIO_OPT_G_NBD,
 	__FIO_OPT_G_NR,
 
 	FIO_OPT_G_RATE		= (1ULL << __FIO_OPT_G_RATE),
@@ -97,6 +98,7 @@ enum opt_category_group {
 	FIO_OPT_G_MTD		= (1ULL << __FIO_OPT_G_MTD),
 	FIO_OPT_G_HDFS		= (1ULL << __FIO_OPT_G_HDFS),
 	FIO_OPT_G_SG		= (1ULL << __FIO_OPT_G_SG),
+	FIO_OPT_G_NBD		= (1ULL << __FIO_OPT_G_NBD),
 	FIO_OPT_G_INVALID	= (1ULL << __FIO_OPT_G_NR),
 };
 
diff --git a/options.c b/options.c
index 95086074..f4c9bedf 100644
--- a/options.c
+++ b/options.c
@@ -1899,6 +1899,9 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 			    .help = "HTTP (WebDAV/S3) IO engine",
 			  },
 #endif
+			  { .ival = "nbd",
+			    .help = "Network Block Device (NBD) IO engine"
+			  },
 		},
 	},
 	{
-- 
2.20.1


Reply to: