1 files changed, 412 insertions, 108 deletions
diff --git a/zebra/kernel_netlink.c b/zebra/kernel_netlink.c
index a4d22c12a4..ad0d4bf56b 100644
--- a/zebra/kernel_netlink.c
+++ b/zebra/kernel_netlink.c
@@ -20,12 +20,6 @@
 
 #include <zebra.h>
 
-#if defined(HANDLE_NETLINK_FUZZING)
-#include <stdio.h>
-#include <string.h>
-#include "libfrr.h"
-#endif /* HANDLE_NETLINK_FUZZING */
-
 #ifdef HAVE_NETLINK
 
 #include "linklist.h"
@@ -84,6 +78,20 @@
 #define RTPROT_MROUTED 17
 #endif
 
+#define NL_DEFAULT_BATCH_BUFSIZE (16 * NL_PKT_BUF_SIZE)
+
+/*
+ * We limit the batch's size to a number smaller than the length of the
+ * underlying buffer since the last message that wouldn't fit the batch would go
+ * over the upper boundary and then it would have to be encoded again into a new
+ * buffer. If the difference between the limit and the length of the buffer is
+ * big enough (bigger than the biggest Netlink message) then this situation
+ * won't occur.
+ */
+#define NL_DEFAULT_BATCH_SEND_THRESHOLD (15 * NL_PKT_BUF_SIZE)
+
+#define NL_BATCH_RX_BUFSIZE NL_RCV_PKT_BUF_SIZE
+
 static const struct message nlmsg_str[] = {{RTM_NEWROUTE, "RTM_NEWROUTE"},
 					   {RTM_DELROUTE, "RTM_DELROUTE"},
 					   {RTM_GETROUTE, "RTM_GETROUTE"},
@@ -151,6 +159,62 @@ extern uint32_t nl_rcvbufsize;
 
 extern struct zebra_privs_t zserv_privs;
 
+DEFINE_MTYPE_STATIC(ZEBRA, NL_BUF, "Zebra Netlink buffers")
+
+size_t nl_batch_tx_bufsize; /* Allocated size of nl_batch_tx_buf. */
+char *nl_batch_tx_buf;      /* TX buffer, (re)allocated in nl_batch_init(). */
+
+char nl_batch_rx_buf[NL_BATCH_RX_BUFSIZE]; /* Used by nl_batch_read_resp(). */
+
+_Atomic uint32_t nl_batch_bufsize = NL_DEFAULT_BATCH_BUFSIZE;
+_Atomic uint32_t nl_batch_send_threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD;
+
+struct nl_batch {
+	void *buf;     /* Start of the transmit buffer. */
+	size_t bufsiz; /* Size of 'buf' in bytes. */
+	size_t limit;  /* The batch is sent once 'curlen' exceeds this. */
+
+	void *buf_head; /* Position in 'buf' where the next message goes. */
+	size_t curlen;  /* Bytes of encoded messages currently in 'buf'. */
+	size_t msgcnt;  /* Number of messages currently in 'buf'. */
+
+	const struct zebra_dplane_info *zns; /* NS of batch; NULL if empty */
+
+	struct dplane_ctx_q ctx_list; /* Contexts queued since last send. */
+
+	/*
+	 * Pointer to the queue of completed contexts outbound back
+	 * towards the dataplane module.
+	 */
+	struct dplane_ctx_q *ctx_out_q;
+};
+
+int netlink_config_write_helper(struct vty *vty)
+{
+	uint32_t size =
+		atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed);
+	uint32_t threshold = atomic_load_explicit(&nl_batch_send_threshold,
+						  memory_order_relaxed);
+
+	if (size != NL_DEFAULT_BATCH_BUFSIZE
+	    || threshold != NL_DEFAULT_BATCH_SEND_THRESHOLD)
+		vty_out(vty, "zebra kernel netlink batch-tx-buf %u %u\n", size,
+			threshold);
+
+	return 0;
+}
+
+void netlink_set_batch_buffer_size(uint32_t size, uint32_t threshold, bool set)
+{
+	if (!set) {
+		size = NL_DEFAULT_BATCH_BUFSIZE;
+		threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD;
+	}
+
+	atomic_store_explicit(&nl_batch_bufsize, size, memory_order_relaxed);
+	atomic_store_explicit(&nl_batch_send_threshold, threshold,
+			      memory_order_relaxed);
+}
 
 int netlink_talk_filter(struct nlmsghdr *h, ns_id_t ns_id, int startup)
 {
@@ -327,86 +391,6 @@ static int netlink_information_fetch(struct nlmsghdr *h, ns_id_t ns_id,
 	return 0;
 }
 
-#if defined(HANDLE_NETLINK_FUZZING)
-/* Using globals here to avoid adding function parameters */
-
-/* Keep distinct filenames for netlink fuzzy collection */
-static unsigned int netlink_file_counter = 1;
-
-/* File name to read fuzzed netlink from */
-static char netlink_fuzz_file[MAXPATHLEN] = "";
-
-/* Flag for whether to read from file or not */
-bool netlink_read;
-
-/**
- * netlink_read_init() - Starts the message parser
- * @fname:      Filename to read.
- */
-void netlink_read_init(const char *fname)
-{
-	struct zebra_dplane_info dp_info;
-
-	snprintf(netlink_fuzz_file, MAXPATHLEN, "%s", fname);
-	/* Creating this fake socket for testing purposes */
-	struct zebra_ns *zns = zebra_ns_lookup(NS_DEFAULT);
-
-	/* Capture key info from zns struct */
-	zebra_dplane_info_from_zns(&dp_info, zns, false);
-
-	netlink_parse_info(netlink_information_fetch, &zns->netlink,
-			   &dp_info, 1, 0);
-}
-
-/**
- * netlink_write_incoming() - Writes all data received from netlink to a file
- * @buf:        Data from netlink.
- * @size:       Size of data.
- * @counter:    Counter for keeping filenames distinct.
- */
-static void netlink_write_incoming(const char *buf, const unsigned int size,
-				   unsigned int counter)
-{
-	char fname[MAXPATHLEN];
-	FILE *f;
-
-	snprintf(fname, MAXPATHLEN, "%s/%s_%u", frr_vtydir, "netlink", counter);
-	frr_with_privs(&zserv_privs) {
-		f = fopen(fname, "w");
-	}
-	if (f) {
-		fwrite(buf, 1, size, f);
-		fclose(f);
-	}
-}
-
-/**
- * netlink_read_file() - Reads netlink data from file
- * @buf:        Netlink buffer being overwritten.
- * @fname:      File name to read from.
- *
- * Return:      Size of file.
- */
-static long netlink_read_file(char *buf, const char *fname)
-{
-	FILE *f;
-	long file_bytes = -1;
-
-	frr_with_privs(&zserv_privs) {
-		f = fopen(fname, "r");
-	}
-	if (f) {
-		fseek(f, 0, SEEK_END);
-		file_bytes = ftell(f);
-		rewind(f);
-		fread(buf, NL_RCV_PKT_BUF_SIZE, 1, f);
-		fclose(f);
-	}
-	return file_bytes;
-}
-
-#endif /* HANDLE_NETLINK_FUZZING */
-
 static int kernel_read(struct thread *thread)
 {
 	struct zebra_ns *zns = (struct zebra_ns *)THREAD_ARG(thread);
@@ -757,18 +741,7 @@ static int netlink_recv_msg(const struct nlsock *nl, struct msghdr msg,
 	msg.msg_iovlen = 1;
 
 	do {
-#if defined(HANDLE_NETLINK_FUZZING)
-		/* Check if reading and filename is set */
-		if (netlink_read && '\0' != netlink_fuzz_file[0]) {
-			zlog_debug("Reading netlink fuzz file");
-			status = netlink_read_file(buf, netlink_fuzz_file);
-			((struct sockaddr_nl *)msg.msg_name)->nl_pid = 0;
-		} else {
-			status = recvmsg(nl->sock, &msg, 0);
-		}
-#else
 		status = recvmsg(nl->sock, &msg, 0);
-#endif /* HANDLE_NETLINK_FUZZING */
 	} while (status == -1 && errno == EINTR);
 
 	if (status == -1) {
@@ -800,13 +773,6 @@ static int netlink_recv_msg(const struct nlsock *nl, struct msghdr msg,
 		zlog_hexdump(buf, status);
 	}
 
-#if defined(HANDLE_NETLINK_FUZZING)
-	if (!netlink_read) {
-		zlog_debug("Writing incoming netlink message");
-		netlink_write_incoming(buf, status, netlink_file_counter++);
-	}
-#endif /* HANDLE_NETLINK_FUZZING */
-
 	return status;
 }
 
@@ -1008,9 +974,10 @@ int netlink_parse_info(int (*filter)(struct nlmsghdr *, ns_id_t, int),
  * startup  -> Are we reading in under startup conditions
  *             This is passed through eventually to filter.
  */
-int netlink_talk_info(int (*filter)(struct nlmsghdr *, ns_id_t, int startup),
-		      struct nlmsghdr *n,
-		      const struct zebra_dplane_info *dp_info, int startup)
+static int
+netlink_talk_info(int (*filter)(struct nlmsghdr *, ns_id_t, int startup),
+		  struct nlmsghdr *n, const struct zebra_dplane_info *dp_info,
+		  int startup)
 {
 	const struct nlsock *nl;
 
@@ -1080,6 +1047,331 @@ int netlink_request(struct nlsock *nl, void *req)
 	return 0;
 }
 
+/*
+ * Read the responses to a sent batch, matching them by sequence number to the
+ * queued contexts, which move to the outbound queue as they are consumed.
+ * Returns the netlink_recv_msg() status (0 or -1) that ended the read loop.
+ */
+static int nl_batch_read_resp(struct nl_batch *bth)
+{
+	struct nlmsghdr *h;
+	struct sockaddr_nl snl;
+	/* Zero the rest: recvmsg() must not see garbage in msg_control*. */
+	struct msghdr msg = {.msg_name = &snl, .msg_namelen = sizeof(snl)};
+	int status, seq;
+	const struct nlsock *nl = &(bth->zns->nls);
+	struct zebra_dplane_ctx *ctx;
+	bool ignore_msg;
+
+	/*
+	 * The responses are not batched, so we need to read and process one
+	 * message at a time.
+	 */
+	while (true) {
+		status = netlink_recv_msg(nl, msg, nl_batch_rx_buf,
+					  sizeof(nl_batch_rx_buf));
+		if (status == -1 || status == 0)
+			return status;
+
+		h = (struct nlmsghdr *)nl_batch_rx_buf;
+		ignore_msg = false;
+		seq = h->nlmsg_seq;
+		/*
+		 * Find the corresponding context object. Received responses are
+		 * in the same order as requests we sent, so we can simply
+		 * iterate over the context list and match responses with
+		 * requests at same time.
+		 */
+		while (true) {
+			ctx = dplane_ctx_dequeue(&(bth->ctx_list));
+			if (ctx == NULL)
+				break;
+
+			dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
+
+			/* We have found corresponding context object. */
+			if (dplane_ctx_get_ns(ctx)->nls.seq == seq)
+				break;
+
+			/*
+			 * 'update' context objects take two consecutive
+			 * sequence numbers.
+			 */
+			if (dplane_ctx_is_update(ctx)
+			    && dplane_ctx_get_ns(ctx)->nls.seq + 1 == seq) {
+				/*
+				 * This is the situation where we get a response
+				 * to a message that should be ignored.
+				 */
+				ignore_msg = true;
+				break;
+			}
+		}
+
+		if (ignore_msg)
+			continue;
+
+		/*
+		 * We received a message with the sequence number that isn't
+		 * associated with any dplane context object.
+		 */
+		if (ctx == NULL) {
+			zlog_debug(
+				"%s: skipping unassociated response, seq number %d NS %u",
+				__func__, h->nlmsg_seq, bth->zns->ns_id);
+			continue;
+		}
+
+		if (h->nlmsg_type == NLMSG_ERROR) {
+			int err = netlink_parse_error(nl, h, bth->zns, 0);
+
+			if (err == -1)
+				dplane_ctx_set_status(
+					ctx, ZEBRA_DPLANE_REQUEST_FAILURE);
+
+			zlog_debug("%s: netlink error message seq=%d ",
+				   __func__, h->nlmsg_seq);
+			continue;
+		}
+
+		/*
+		 * If we get here then we received neither the ack nor the
+		 * error, but some other message in an unexpected way.
+		 */
+		zlog_debug("%s: ignoring message type 0x%04x(%s) NS %u",
+			   __func__, h->nlmsg_type,
+			   nl_msg_type_to_str(h->nlmsg_type), bth->zns->ns_id);
+	}
+
+	return 0;
+}
+
+static void nl_batch_reset(struct nl_batch *bth)
+{
+	bth->buf_head = bth->buf;
+	bth->curlen = 0;
+	bth->msgcnt = 0;
+	bth->zns = NULL;
+
+	TAILQ_INIT(&(bth->ctx_list));
+}
+
+static void nl_batch_init(struct nl_batch *bth, struct dplane_ctx_q *ctx_out_q)
+{
+	/*
+	 * If the size of the buffer has changed, free and then allocate a new
+	 * one.
+	 */
+	size_t bufsize =
+		atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed);
+	if (bufsize != nl_batch_tx_bufsize) {
+		if (nl_batch_tx_buf)
+			XFREE(MTYPE_NL_BUF, nl_batch_tx_buf);
+
+		nl_batch_tx_buf = XCALLOC(MTYPE_NL_BUF, bufsize);
+		nl_batch_tx_bufsize = bufsize;
+	}
+
+	bth->buf = nl_batch_tx_buf;
+	bth->bufsiz = bufsize;
+	bth->limit = atomic_load_explicit(&nl_batch_send_threshold,
+					  memory_order_relaxed);
+
+	bth->ctx_out_q = ctx_out_q;
+
+	nl_batch_reset(bth);
+}
+
+static void nl_batch_send(struct nl_batch *bth)
+{
+	struct zebra_dplane_ctx *ctx;
+	bool err = false;
+
+	if (bth->curlen != 0 && bth->zns != NULL) {
+		if (IS_ZEBRA_DEBUG_KERNEL)
+			zlog_debug("%s: %s, batch size=%zu, msg cnt=%zu",
+				   __func__, bth->zns->nls.name, bth->curlen,
+				   bth->msgcnt);
+
+		if (netlink_send_msg(&(bth->zns->nls), bth->buf, bth->curlen)
+		    == -1)
+			err = true;
+
+		if (!err) {
+			if (nl_batch_read_resp(bth) == -1)
+				err = true;
+		}
+	}
+
+	/* Move remaining contexts to the outbound queue. */
+	while (true) {
+		ctx = dplane_ctx_dequeue(&(bth->ctx_list));
+		if (ctx == NULL)
+			break;
+
+		if (err)
+			dplane_ctx_set_status(ctx,
+					      ZEBRA_DPLANE_REQUEST_FAILURE);
+
+		dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
+	}
+
+	nl_batch_reset(bth);
+}
+
+enum netlink_msg_status netlink_batch_add_msg(
+	struct nl_batch *bth, struct zebra_dplane_ctx *ctx,
+	ssize_t (*msg_encoder)(struct zebra_dplane_ctx *, void *, size_t),
+	bool ignore_res)
+{
+	int seq;
+	ssize_t size;
+	struct nlmsghdr *msgh;
+
+	size = (*msg_encoder)(ctx, bth->buf_head, bth->bufsiz - bth->curlen);
+
+	/*
+	 * If there was an error while encoding the message (other than buffer
+	 * overflow) then return an error.
+	 */
+	if (size < 0)
+		return FRR_NETLINK_ERROR;
+
+	/*
+	 * If the message doesn't fit entirely in the buffer then send the batch
+	 * and retry.
+	 */
+	if (size == 0) {
+		nl_batch_send(bth);
+		size = (*msg_encoder)(ctx, bth->buf_head,
+				      bth->bufsiz - bth->curlen);
+		/*
+		 * If the message doesn't fit in the empty buffer then just
+		 * return an error.
+		 */
+		if (size <= 0)
+			return FRR_NETLINK_ERROR;
+	}
+
+	seq = dplane_ctx_get_ns(ctx)->nls.seq;
+	if (ignore_res)
+		seq++;
+
+	msgh = (struct nlmsghdr *)bth->buf_head;
+	msgh->nlmsg_seq = seq;
+	msgh->nlmsg_pid = dplane_ctx_get_ns(ctx)->nls.snl.nl_pid;
+
+	bth->zns = dplane_ctx_get_ns(ctx);
+	bth->buf_head = ((char *)bth->buf_head) + size;
+	bth->curlen += size;
+	bth->msgcnt++;
+
+	return FRR_NETLINK_QUEUED;
+}
+
+static enum netlink_msg_status nl_put_msg(struct nl_batch *bth,
+					  struct zebra_dplane_ctx *ctx)
+{
+	if (dplane_ctx_is_skip_kernel(ctx))
+		return FRR_NETLINK_SUCCESS;
+
+	switch (dplane_ctx_get_op(ctx)) {
+
+	case DPLANE_OP_ROUTE_INSTALL:
+	case DPLANE_OP_ROUTE_UPDATE:
+	case DPLANE_OP_ROUTE_DELETE:
+		return netlink_put_route_update_msg(bth, ctx);
+
+	case DPLANE_OP_NH_INSTALL:
+	case DPLANE_OP_NH_UPDATE:
+	case DPLANE_OP_NH_DELETE:
+		return netlink_put_nexthop_update_msg(bth, ctx);
+
+	case DPLANE_OP_LSP_INSTALL:
+	case DPLANE_OP_LSP_UPDATE:
+	case DPLANE_OP_LSP_DELETE:
+		return netlink_put_lsp_update_msg(bth, ctx);
+
+	case DPLANE_OP_PW_INSTALL:
+	case DPLANE_OP_PW_UNINSTALL:
+		return netlink_put_pw_update_msg(bth, ctx);
+
+	case DPLANE_OP_ADDR_INSTALL:
+	case DPLANE_OP_ADDR_UNINSTALL:
+		return netlink_put_address_update_msg(bth, ctx);
+
+	case DPLANE_OP_MAC_INSTALL:
+	case DPLANE_OP_MAC_DELETE:
+		return netlink_put_mac_update_msg(bth, ctx);
+
+	case DPLANE_OP_NEIGH_INSTALL:
+	case DPLANE_OP_NEIGH_UPDATE:
+	case DPLANE_OP_NEIGH_DELETE:
+	case DPLANE_OP_VTEP_ADD:
+	case DPLANE_OP_VTEP_DELETE:
+	case DPLANE_OP_NEIGH_DISCOVER:
+		return netlink_put_neigh_update_msg(bth, ctx);
+
+	case DPLANE_OP_RULE_ADD:
+	case DPLANE_OP_RULE_DELETE:
+	case DPLANE_OP_RULE_UPDATE:
+		return netlink_put_rule_update_msg(bth, ctx);
+
+	case DPLANE_OP_SYS_ROUTE_ADD:
+	case DPLANE_OP_SYS_ROUTE_DELETE:
+	case DPLANE_OP_ROUTE_NOTIFY:
+	case DPLANE_OP_LSP_NOTIFY:
+		return FRR_NETLINK_SUCCESS;
+
+	case DPLANE_OP_NONE:
+		return FRR_NETLINK_ERROR;
+	}
+
+	return FRR_NETLINK_ERROR;
+}
+
+void kernel_update_multi(struct dplane_ctx_q *ctx_list)
+{
+	struct nl_batch batch;
+	struct zebra_dplane_ctx *ctx;
+	struct dplane_ctx_q handled_list;
+	enum netlink_msg_status res;
+
+	TAILQ_INIT(&handled_list);
+	nl_batch_init(&batch, &handled_list);
+
+	while (true) {
+		ctx = dplane_ctx_dequeue(ctx_list);
+		if (ctx == NULL)
+			break;
+
+		if (batch.zns != NULL
+		    && batch.zns->ns_id != dplane_ctx_get_ns(ctx)->ns_id)
+			nl_batch_send(&batch);
+
+		/*
+		 * Assume all messages will succeed and then mark only the ones
+		 * that failed.
+		 */
+		dplane_ctx_set_status(ctx, ZEBRA_DPLANE_REQUEST_SUCCESS);
+
+		res = nl_put_msg(&batch, ctx);
+
+		dplane_ctx_enqueue_tail(&(batch.ctx_list), ctx);
+		if (res == FRR_NETLINK_ERROR)
+			dplane_ctx_set_status(ctx,
+					      ZEBRA_DPLANE_REQUEST_FAILURE);
+
+		if (batch.curlen > batch.limit)
+			nl_batch_send(&batch);
+	}
+
+	nl_batch_send(&batch);
+
+	TAILQ_INIT(ctx_list);
+	dplane_ctx_list_append(ctx_list, &handled_list);
+}
+
 /* Exported interface function.  This function simply calls
    netlink_socket (). */
 void kernel_init(struct zebra_ns *zns)
@@ -1161,6 +1453,15 @@ void kernel_init(struct zebra_ns *zns)
 	if (ret < 0)
 		zlog_notice("Registration for extended dp ACK failed : %d %s",
 			    errno, safe_strerror(errno));
+
+	/*
+	 * Trim off the payload of the original netlink message in the
+	 * acknowledgment. This option is available since Linux 4.2, so if
+	 * setsockopt fails, ignore the error.
+	 */
+	one = 1;
+	ret = setsockopt(zns->netlink_dplane.sock, SOL_NETLINK, NETLINK_CAP_ACK,
+			 &one, sizeof(one));
 #endif
 
 	/* Register kernel socket. */
@@ -1177,8 +1478,11 @@ void kernel_init(struct zebra_ns *zns)
 			 zns->netlink_dplane.name, safe_strerror(errno), errno);
 
 	/* Set receive buffer size if it's set from command line */
-	if (nl_rcvbufsize)
+	if (nl_rcvbufsize) {
 		netlink_recvbuf(&zns->netlink, nl_rcvbufsize);
+		netlink_recvbuf(&zns->netlink_cmd, nl_rcvbufsize);
+		netlink_recvbuf(&zns->netlink_dplane, nl_rcvbufsize);
+	}
 
 	netlink_install_filter(zns->netlink.sock,
 			       zns->netlink_cmd.snl.nl_pid,