Re: [Nbd] [RFC] [PATCH] support FUA and FLUSH
Wouter,
--On 16 May 2011 15:28:32 +0200 Wouter Verhelst <w@...112...> wrote:
diff --git a/cliserv.h b/cliserv.h
index 51c8bd1..63c2743 100644
--- a/cliserv.h
+++ b/cliserv.h
...
@@ -135,6 +140,9 @@ u64 ntohll(u64 a) {
/* Flags used between the client and server */
# define NBD_FLAG_HAS_FLAGS (1 << 0) /* Flags are there */
# define NBD_FLAG_READ_ONLY (1 << 1) /* Device is read-only */
+#define NBD_FLAG_SEND_FLUSH (1 << 2) /* Send FLUSH */
+#define NBD_FLAG_SEND_FUA (1 << 3) /* Send FLUSH */
"Send FUA" rather than "Send FLUSH"? Also, please expand what 'FUA'
stands for in this comment.
Fixed
+#define NBD_FLAG_ROTATIONAL (1 << 3) /* Use elevator algorithm -
rotational media */
That's the same value as the _SEND_FUA thing here; should that not be
different?
Fixed
+/* For sync_file_range */
+#define _GNU_SOURCE
+
This should perhaps be inside an #ifdef HAVE_SYNC_FILE_RANGE, too.
This is (surprisingly) non-trivial. HAVE_SYNC_FILE_RANGE isn't defined
until config.h is included. config.h is supplied by lfs.h. However,
whether NBD_LFS is set or not, if _GNU_SOURCE is set, types.h will
(through features.h) set _LARGEFILE_SOURCE but (confusingly) not
_FILE_OFFSET_BITS.
Given sync_file_range() is 64 bit only, I think the best solution is
for lfs.h to define USE_SYNC_FILE_RANGE only if NBD_LFS is set *and*
HAVE_SYNC_FILE_RANGE is true. That can then set _GNU_SOURCE without
risking changing the file offset bits halfway through the headers.
This is pretty disgusting but I can't see any other way.
One thing that would make it cleaner is for cliserv.h to avoid
manipulating _LARGEFILE_SOURCE too. I don't think it needs to,
and it seems to me simply including "lfs.h" would do. That's unless
there's some distinction I've been missing between NBD_LFS merely
defined, and set to 1 (the checks are different). However, I am aware
that fiddling with LFS code can lead to subtle errors.
@@ -160,6 +165,9 @@ gboolean do_oldstyle=FALSE;
# define F_SPARSE 16 /**< flag to tell us copyronwrite should use a
sparse file */
# define F_SDP 32 /**< flag to tell us the export should be done
# using
the Socket Direct Protocol for RDMA */
# define F_SYNC 64 /**< Whether to fsync() after a write */
+#define F_FLUSH 128 /**< Whether server wants FLUSH to be sent by the
client */
+#define F_FUA 128 /**< Whether server wants FUA to be sent by the
client
These should probably not be the same values?
Fixed
+ } else if (fua) {
+ /* open question: would it be cheaper to do multiple sync_file_ranges?
+ as we iterate through the above?
+ */
Good question... it might be a good idea to test this.
I will have to write the client for that :-)
- if (request.type==NBD_CMD_WRITE) {
+ if ((request.type==NBD_CMD_WRITE) ||
(request.type==NBD_CMD_WRITE_FUA)) {
This makes me think that the _FUA thing should be something that can be
masked away; that way, we can do stuff like
int type = request.type & ~NBD_CMD_FLAG_FUA;
if (type == NBD_CMD_WRITE) {
...
}
or something similar, which seems cleaner. It would also allow it to be
applied to other request types without having to special-case too much;
e.g., there's been talk of a truncate command, which could sparsify the
backend. I guess that could also use this FUA thing.
That's true. I considered doing it that way but I was attempting to make
things as unintrusive as possible. I'm happy to recode this bit to
use a FUA bit from the top of the command 32bit int. However, if we do
that, we should probably reserve a fixed number of bits there (e.g.
16 bits - I can't imagine more than 65,536 different command types)
for per-command flags). We should then probably have a (separate) NBD_
flag that says "the server supports flags being passed in the the
command type". It seemed a bit much to change just to save one
command. What do you think?
Incidentally, I think that block could be better written as a switch
on request.type. Assuming it is a read if it's not anything else
is perhaps not the safest.
- }
+ }
Another gratuitous whitespace change.
Fixed
This file is taken from the kernel, so should probably be patched there
rather than here (though of course it would be hard to compile-test
without these bits).
Sure. Though I /think/ for proper compilation it needs to be fixed
in both places, yes?
Please also document the new config file options in nbd-server.5.in.sgml.
Fixed
Other than that, seems fine to me.
Thanks. New version attached, and at:
http://www.alex.org.uk/nbd-flush-fua-02.patch
Again, untested beyond compilation.
I'll have a look at doing the client and some testing.
--
Alex Bligh
Signed-Off-By: Alex Bligh <alex@...872...>
diff --git a/cliserv.h b/cliserv.h
index 51c8bd1..42e0a17 100644
--- a/cliserv.h
+++ b/cliserv.h
@@ -40,7 +40,12 @@ typedef unsigned long long u64;
#include "nbd.h"
#if NBD_LFS==1
+/* /usr/include/features.h (included from /usr/include/sys/types.h)
+ defines this when _GNU_SOURCE is defined
+ */
+#ifndef _LARGEFILE_SOURCE
#define _LARGEFILE_SOURCE
+#endif
#define _FILE_OFFSET_BITS 64
#endif
@@ -135,6 +140,9 @@ u64 ntohll(u64 a) {
/* Flags used between the client and server */
#define NBD_FLAG_HAS_FLAGS (1 << 0) /* Flags are there */
#define NBD_FLAG_READ_ONLY (1 << 1) /* Device is read-only */
+#define NBD_FLAG_SEND_FLUSH (1 << 2) /* Send FLUSH */
+#define NBD_FLAG_SEND_FUA (1 << 3) /* Send FUA (Force Unit Access) */
+#define NBD_FLAG_ROTATIONAL (1 << 4) /* Use elevator algorithm -
rotational media */
#define NBD_DEFAULT_PORT "10809" /* Port on which named exports are
* served */
diff --git a/configure.ac b/configure.ac
index f50193f..c78177b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -94,6 +94,9 @@ AC_CHECK_SIZEOF(unsigned int)
AC_CHECK_SIZEOF(unsigned long int)
AC_CHECK_SIZEOF(unsigned long long int)
AC_CHECK_FUNCS([llseek alarm gethostbyname inet_ntoa memset socket
strerror strstr])
+AC_CHECK_FUNC([sync_file_range],
+ [AC_DEFINE([HAVE_SYNC_FILE_RANGE], [sync_file_range(2) is not supported],
[sync_file_range(2) is supported])],
+ [])
AC_FUNC_FORK
AC_FUNC_SETVBUF_REVERSED
AC_MSG_CHECKING(whether client should be built)
diff --git a/doc/proto.txt b/doc/proto.txt
index fe5e819..d9a269b 100644
--- a/doc/proto.txt
+++ b/doc/proto.txt
@@ -26,8 +26,9 @@ server during the handshake.
There are two message types in the data pushing phase: the request, and
the response.
-There are three request types in the data pushing phase: NBD_CMD_READ,
-NBD_CMD_WRITE, and NBD_CMD_DISC (disconnect).
+There are four request types in the data pushing phase: NBD_CMD_READ,
+NBD_CMD_WRITE, NBD_CMD_WRITE_FUA, NBD_CMD_DISC (disconnect), and
+NBD_CMD_FLUSH.
The request is sent by the client; the response by the server. A request
header consists a 32 bit magic number (magic), a 32 bit field denoting
@@ -50,6 +51,15 @@ we change that to asynchronous handling, handling the
disconnect request
will probably be postponed until there are no other outstanding
requests.
+NBD_CMD_WRITE_FUA is the equivalent of NBD_WRITE, with an implied
+FUA (force unit access). This is implementing using sync_file_range
+if present, else by fdatasync() of that file (note not all files
+in a multifile environment). NBD_CMD_WRITE_FUA will not be sent
+unless NBD_FLAG_SEND_FUA is set.
+
+A flush request will not be sent unless NBD_FLAG_SEND_FLUSH is set,
+and indicates the backing file should be fdatasync()'d to disk.
+
There are two versions of the negotiation: the 'old' style (nbd <=
2.9.16) and the 'new' style (nbd >= 2.9.17, though due to a bug it does
not work with anything below 2.9.18). What follows is a description of
diff --git a/lfs.h b/lfs.h
index 929ce08..480d6bf 100644
--- a/lfs.h
+++ b/lfs.h
@@ -4,7 +4,13 @@
#include "config.h"
#if NBD_LFS
#define _FILE_OFFSET_BITS 64
+#ifndef _LARGEFILE_SOURCE
#define _LARGEFILE_SOURCE
+#endif
+#ifdef HAVE_SYNC_FILE_RANGE
+#define USE_SYNC_FILE_RANGE
+#define _GNU_SOURCE
+#endif /* HAVE_SYNC_FILE_RANGE */
#endif /* NBD_LFS */
#endif /* LFS_H */
diff --git a/man/nbd-server.5.in.sgml b/man/nbd-server.5.in.sgml
index 9fb2eff..07ed9fd 100644
--- a/man/nbd-server.5.in.sgml
+++ b/man/nbd-server.5.in.sgml
@@ -410,6 +410,64 @@ manpage.1: manpage.sgml
</listitem>
</varlistentry>
<varlistentry>
+ <term><option>flush</option></term>
+ <listitem>
+ <para>Optional; boolean.</para>
+ <para>When this option is enabled,
+ <command>nbd-server</command> will inform the client that it
+ supports and desires to be sent flush requests when the
+ elevator layer receives them. Receipt of a flush request
+ will cause an fdatasync() (or, if the sync option is set,
+ an fsync()) on the backend storage. This increases
+ reliability in the case of an unclean shutdown at
+ the expense of a degradation of performance. The default
+ state is disabled. This option will have no effect unless
+ supported by the client.
+ </para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><option>fua</option></term>
+ <listitem>
+ <para>Optional; boolean.</para>
+ <para>When this option is enabled,
+ <command>nbd-server</command> will inform the client that it
+ supports and desires to be sent fua (force unit access) commands
+ when the elevator layer receives them. Receipt of a force unit
+ access command will cause the specified command to be synced
+ to backend storage using sync_file_range() if supported, or
+ fdatasync() otherwise. This increases
+ reliability in the case of an unclean shutdown at
+ the expense of a degradation of performance. The default
+ state is disabled. This option will have no effect unless
+ supported by the client.
+ </para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><option>rotational</option></term>
+ <listitem>
+ <para>Optional; boolean.</para>
+ <para>When this option is enabled,
+ <command>nbd-server</command> will inform the client that it
+ it would prefer it to send requests in elevator order, perhaps
+ because it has a backing store and no local elevator. By
+ default, the client uses QUEUE_FLAG_NONROT, which effectively
+ restricts the function of the elevator to block merges. By
+ specifying this flag on the server, the client will not use
+ QUEUE_FLAG_NONROT, meaning the client elevator will perform
+ normal elevator ordering of I/O requests. Note that even when
+ the backing store is on rotating media, it is not normally
+ necessary to specify this flag, as the server's elevator
+ algorithm will be used. This flag is only required where
+ the server will not be using an elevator algorithm or where
+ the elevator algorithm is effectively neutered (e.g. with
+ the sync option set). This option will have no effect unless
+ supported by the client.
+ </para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
<term><option>sparse_cow</option></term>
<listitem>
<para>Optional; boolean.</para>
diff --git a/nbd-server.c b/nbd-server.c
index 4b9a083..cac8e3f 100644
--- a/nbd-server.c
+++ b/nbd-server.c
@@ -134,11 +134,13 @@ gboolean do_oldstyle=FALSE;
#define DEBUG2( a,b ) printf( a,b )
#define DEBUG3( a,b,c ) printf( a,b,c )
#define DEBUG4( a,b,c,d ) printf( a,b,c,d )
+#define DEBUG5( a,b,c,d,e ) printf( a,b,c,d,e )
#else
#define DEBUG( a )
#define DEBUG2( a,b )
#define DEBUG3( a,b,c )
#define DEBUG4( a,b,c,d )
+#define DEBUG5( a,b,c,d,e )
#endif
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION ""
@@ -160,6 +162,9 @@ gboolean do_oldstyle=FALSE;
#define F_SPARSE 16 /**< flag to tell us copyronwrite should use a
sparse file */
#define F_SDP 32 /**< flag to tell us the export should be done using
the Socket Direct Protocol for RDMA */
#define F_SYNC 64 /**< Whether to fsync() after a write */
+#define F_FLUSH 128 /**< Whether server wants FLUSH to be sent by the
client */
+#define F_FUA 256 /**< Whether server wants FUA to be sent by the client
*/
+#define F_ROTATIONAL 512 /**< Whether server wants the client to
implement the elevator algorithm */
GHashTable *children;
char pidfname[256]; /**< name of our PID file */
char pidftemplate[256]; /**< template to be used for the filename of the
PID file */
@@ -713,6 +718,9 @@ GArray* parse_cfile(gchar* f, GError** e) {
{ "sparse_cow", FALSE, PARAM_BOOL, NULL, F_SPARSE },
{ "sdp", FALSE, PARAM_BOOL, NULL, F_SDP },
{ "sync", FALSE, PARAM_BOOL, NULL, F_SYNC },
+ { "flush", FALSE, PARAM_BOOL, NULL, F_FLUSH },
+ { "fua", FALSE, PARAM_BOOL, NULL, F_FUA },
+ { "rotational", FALSE, PARAM_BOOL, NULL, F_ROTATIONAL },
{ "listenaddr", FALSE, PARAM_STRING, NULL, 0 },
{ "maxconnections", FALSE, PARAM_INT, NULL, 0 },
};
@@ -763,9 +771,10 @@ GArray* parse_cfile(gchar* f, GError** e) {
lp[6].target=&(s.postrun);
lp[7].target=lp[8].target=lp[9].target=
lp[10].target=lp[11].target=
- lp[12].target=&(s.flags);
- lp[13].target=&(s.listenaddr);
- lp[14].target=&(s.max_connections);
+ lp[12].target=lp[13].target=
+ lp[14].target=lp[15].target=&(s.flags);
+ lp[16].target=&(s.listenaddr);
+ lp[17].target=&(s.max_connections);
/* After the [generic] group, start parsing exports */
if(i==1) {
@@ -1060,7 +1069,7 @@ void myseek(int handle,off_t a) {
* @param client The client we're serving for
* @return The number of bytes actually written, or -1 in case of an error
**/
-ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client) {
+ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int
fua) {
int fhandle;
off_t foffset;
size_t maxbytes;
@@ -1071,12 +1080,20 @@ ssize_t rawexpwrite(off_t a, char *buf, size_t len,
CLIENT *client) {
if(maxbytes && len > maxbytes)
len = maxbytes;
- DEBUG4("(WRITE to fd %d offset %llu len %u), ", fhandle, foffset, len);
+ DEBUG5("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, foffset,
len, fua);
myseek(fhandle, foffset);
retval = write(fhandle, buf, len);
if(client->server->flags & F_SYNC) {
fsync(fhandle);
+ } else if (fua) {
+#ifdef USE_SYNC_FILE_RANGE
+ sync_file_range(fhandle, foffset, len,
+ SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
+ SYNC_FILE_RANGE_WAIT_AFTER);
+#else
+ fdatasync(fhandle);
+#endif
}
return retval;
}
@@ -1085,10 +1102,10 @@ ssize_t rawexpwrite(off_t a, char *buf, size_t len,
CLIENT *client) {
* Call rawexpwrite repeatedly until all data has been written.
* @return 0 on success, nonzero on failure
**/
-int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client) {
+int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int
fua) {
ssize_t ret=0;
- while(len > 0 && (ret=rawexpwrite(a, buf, len, client)) > 0 ) {
+ while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) {
a += ret;
buf += ret;
len -= ret;
@@ -1189,7 +1206,7 @@ int expread(off_t a, char *buf, size_t len, CLIENT
*client) {
* @param client The client we're going to write for.
* @return 0 on success, nonzero on failure
**/
-int expwrite(off_t a, char *buf, size_t len, CLIENT *client) {
+int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
char pagebuf[DIFFPAGESIZE];
off_t mapcnt,mapl,maph;
off_t wrlen,rdlen;
@@ -1197,7 +1214,7 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT
*client) {
off_t offset;
if (!(client->server->flags & F_COPYONWRITE))
- return(rawexpwrite_fully(a, buf, len, client));
+ return(rawexpwrite_fully(a, buf, len, client, fua));
DEBUG3("Asked to write %d bytes at %llu.\n", len, (unsigned long long)a);
mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ;
@@ -1230,6 +1247,33 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT
*client) {
}
len-=wrlen ; a+=wrlen ; buf+=wrlen ;
}
+ if (client->server->flags & F_SYNC) {
+ fsync(client->difffile);
+ } else if (fua) {
+ /* open question: would it be cheaper to do multiple sync_file_ranges?
+ as we iterate through the above?
+ */
+ fdatasync(client->difffile);
+ }
+ return 0;
+}
+
+int expflush(CLIENT *client) {
+ int fhandle;
+ off_t foffset;
+ size_t maxbytes;
+ gint i;
+
+ if (!(client->server->flags & F_COPYONWRITE)) {
+ return fsync(client->difffile);
+ }
+
+ for (i = 0; i < client->export->len; i++) {
+ FILE_INFO fi = g_array_index(client->export, FILE_INFO, i);
+ if (fsync(fi.fhandle) < 0)
+ return -1;
+ }
+
return 0;
}
@@ -1322,6 +1366,12 @@ CLIENT* negotiate(int net, CLIENT *client, GArray*
servers) {
err("Negotiation failed: %m");
if (client->server->flags & F_READONLY)
flags |= NBD_FLAG_READ_ONLY;
+ if (client->server->flags & F_FLUSH)
+ flags |= NBD_FLAG_SEND_FLUSH;
+ if (client->server->flags & F_FUA)
+ flags |= NBD_FLAG_SEND_FUA;
+ if (client->server->flags & F_ROTATIONAL)
+ flags |= NBD_FLAG_ROTATIONAL;
if (!client->modern) {
/* oldstyle */
flags = htonl(flags);
@@ -1419,7 +1469,7 @@ int mainloop(CLIENT *client) {
continue;
}
- if (request.type==NBD_CMD_WRITE) {
+ if ((request.type==NBD_CMD_WRITE) || (request.type==NBD_CMD_WRITE_FUA)) {
DEBUG("wr: net->buf, ");
while(len > 0) {
readit(client->net, buf, currlen);
@@ -1430,7 +1480,8 @@ int mainloop(CLIENT *client) {
ERROR(client, reply, EPERM);
continue;
}
- if (expwrite(request.from, buf, len, client)) {
+ if (expwrite(request.from, buf, len, client,
+ (request.type==NBD_CMD_WRITE_FUA)?1:0)) {
DEBUG("Write failed: %m" );
ERROR(client, reply, errno);
continue;
@@ -1442,6 +1493,19 @@ int mainloop(CLIENT *client) {
}
continue;
}
+
+ if (request.type==NBD_CMD_FLUSH) {
+ DEBUG("fl: ");
+ if (expflush(client)) {
+ DEBUG("Flush failed: %m");
+ ERROR(client, reply, errno);
+ continue;
+ }
+ SEND(client->net, reply);
+ DEBUG("OK!\n");
+ continue;
+ }
+
/* READ */
DEBUG("exp->buf, ");
diff --git a/nbd.h b/nbd.h
index 451f50c..05488b0 100644
--- a/nbd.h
+++ b/nbd.h
@@ -31,7 +31,9 @@
enum {
NBD_CMD_READ = 0,
NBD_CMD_WRITE = 1,
- NBD_CMD_DISC = 2
+ NBD_CMD_DISC = 2,
+ NBD_CMD_FLUSH = 3,
+ NBD_CMD_WRITE_FUA = 4
};
#define nbd_cmd(req) ((req)->cmd[0])
Reply to: