summaryrefslogtreecommitdiff
path: root/zebra/kernel_netlink.c
diff options
context:
space:
mode:
Diffstat (limited to 'zebra/kernel_netlink.c')
-rw-r--r--zebra/kernel_netlink.c406
1 files changed, 403 insertions, 3 deletions
diff --git a/zebra/kernel_netlink.c b/zebra/kernel_netlink.c
index a4d22c12a4..d0c1bc812d 100644
--- a/zebra/kernel_netlink.c
+++ b/zebra/kernel_netlink.c
@@ -84,6 +84,27 @@
#define RTPROT_MROUTED 17
#endif
+#define NL_DEFAULT_BATCH_BUFSIZE (16 * NL_PKT_BUF_SIZE)
+
+/*
+ * We limit the batch's size to a number smaller than the length of the
+ * underlying buffer since the last message that wouldn't fit the batch would go
+ * over the upper boundary and then it would have to be encoded again into a new
+ * buffer. If the difference between the limit and the length of the buffer is
+ * big enough (bigger than the biggest Netlink message) then this situation
+ * won't occur.
+ */
+#define NL_DEFAULT_BATCH_SEND_THRESHOLD (15 * NL_PKT_BUF_SIZE)
+
+/*
+ * For every request sent to the kernel that has failed we get an error message,
+ * which contains a standard netlink message header and the payload consisting
+ * of an error code and the original netlink mesage. So the receiving buffer
+ * must be at least as big as the transmitting buffer increased by some space
+ * for headers.
+ */
+#define NL_BATCH_RX_BUFSIZE (NL_DEFAULT_BATCH_BUFSIZE + NL_PKT_BUF_SIZE)
+
static const struct message nlmsg_str[] = {{RTM_NEWROUTE, "RTM_NEWROUTE"},
{RTM_DELROUTE, "RTM_DELROUTE"},
{RTM_GETROUTE, "RTM_GETROUTE"},
@@ -151,6 +172,62 @@ extern uint32_t nl_rcvbufsize;
extern struct zebra_privs_t zserv_privs;
+DEFINE_MTYPE_STATIC(ZEBRA, NL_BUF, "Zebra Netlink buffers")
+
+size_t nl_batch_tx_bufsize;
+char *nl_batch_tx_buf;
+
+char nl_batch_rx_buf[NL_BATCH_RX_BUFSIZE];
+
+_Atomic uint32_t nl_batch_bufsize = NL_DEFAULT_BATCH_BUFSIZE;
+_Atomic uint32_t nl_batch_send_threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD;
+
+struct nl_batch {
+ void *buf;
+ size_t bufsiz;
+ size_t limit;
+
+ void *buf_head;
+ size_t curlen;
+ size_t msgcnt;
+
+ const struct zebra_dplane_info *zns;
+
+ struct dplane_ctx_q ctx_list;
+
+ /*
+ * Pointer to the queue of completed contexts outbound back
+ * towards the dataplane module.
+ */
+ struct dplane_ctx_q *ctx_out_q;
+};
+
+int netlink_config_write_helper(struct vty *vty)
+{
+ uint32_t size =
+ atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed);
+ uint32_t threshold = atomic_load_explicit(&nl_batch_send_threshold,
+ memory_order_relaxed);
+
+ if (size != NL_DEFAULT_BATCH_BUFSIZE
+ || threshold != NL_DEFAULT_BATCH_SEND_THRESHOLD)
+ vty_out(vty, "zebra kernel netlink batch-tx-buf %u %u\n", size,
+ threshold);
+
+ return 0;
+}
+
+void netlink_set_batch_buffer_size(uint32_t size, uint32_t threshold, bool set)
+{
+ if (!set) {
+ size = NL_DEFAULT_BATCH_BUFSIZE;
+ threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD;
+ }
+
+ atomic_store_explicit(&nl_batch_bufsize, size, memory_order_relaxed);
+ atomic_store_explicit(&nl_batch_send_threshold, threshold,
+ memory_order_relaxed);
+}
int netlink_talk_filter(struct nlmsghdr *h, ns_id_t ns_id, int startup)
{
@@ -1008,9 +1085,10 @@ int netlink_parse_info(int (*filter)(struct nlmsghdr *, ns_id_t, int),
* startup -> Are we reading in under startup conditions
* This is passed through eventually to filter.
*/
-int netlink_talk_info(int (*filter)(struct nlmsghdr *, ns_id_t, int startup),
- struct nlmsghdr *n,
- const struct zebra_dplane_info *dp_info, int startup)
+static int
+netlink_talk_info(int (*filter)(struct nlmsghdr *, ns_id_t, int startup),
+ struct nlmsghdr *n, const struct zebra_dplane_info *dp_info,
+ int startup)
{
const struct nlsock *nl;
@@ -1080,6 +1158,328 @@ int netlink_request(struct nlsock *nl, void *req)
return 0;
}
+static int nl_batch_read_resp(struct nl_batch *bth)
+{
+ struct nlmsghdr *h;
+ struct sockaddr_nl snl;
+ struct msghdr msg;
+ int status, seq;
+ const struct nlsock *nl;
+ struct zebra_dplane_ctx *ctx;
+ bool ignore_msg;
+
+ nl = &(bth->zns->nls);
+
+ msg.msg_name = (void *)&snl;
+ msg.msg_namelen = sizeof(snl);
+
+ status = netlink_recv_msg(nl, msg, nl_batch_rx_buf,
+ sizeof(nl_batch_rx_buf));
+ if (status == -1 || status == 0)
+ return status;
+
+ for (h = (struct nlmsghdr *)nl_batch_rx_buf;
+ (status >= 0 && NLMSG_OK(h, (unsigned int)status));
+ h = NLMSG_NEXT(h, status)) {
+ ignore_msg = false;
+ seq = h->nlmsg_seq;
+ /*
+ * Find the corresponding context object. Received responses are
+ * in the same order as requests we sent, so we can simply
+ * iterate over the context list and match responses with
+ * requests at same time.
+ */
+ while (true) {
+ ctx = dplane_ctx_dequeue(&(bth->ctx_list));
+ if (ctx == NULL)
+ break;
+
+ dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
+
+ /* We have found corresponding context object. */
+ if (dplane_ctx_get_ns(ctx)->nls.seq == seq)
+ break;
+
+ /*
+ * 'update' context objects take two consecutive
+ * sequence numbers.
+ */
+ if (dplane_ctx_is_update(ctx)
+ && dplane_ctx_get_ns(ctx)->nls.seq + 1 == seq) {
+ /*
+ * This is the situation where we get a response
+ * to a message that should be ignored.
+ */
+ ignore_msg = true;
+ break;
+ }
+ }
+
+ if (ignore_msg)
+ continue;
+
+ /*
+ * We received a message with the sequence number that isn't
+ * associated with any dplane context object.
+ */
+ if (ctx == NULL) {
+ zlog_debug(
+ "%s: skipping unassociated response, seq number %d NS %u",
+ __func__, h->nlmsg_seq, bth->zns->ns_id);
+ continue;
+ }
+
+ if (h->nlmsg_type == NLMSG_ERROR) {
+ int err = netlink_parse_error(nl, h, bth->zns, 0);
+
+ if (err == -1)
+ dplane_ctx_set_status(
+ ctx, ZEBRA_DPLANE_REQUEST_FAILURE);
+
+ zlog_debug("%s: netlink error message seq=%d ",
+ __func__, h->nlmsg_seq);
+ continue;
+ }
+
+ /*
+ * If we get here then we did not receive neither the ack nor
+ * the error and instead received some other message in an
+ * unexpected way.
+ */
+ zlog_debug("%s: ignoring message type 0x%04x(%s) NS %u",
+ __func__, h->nlmsg_type,
+ nl_msg_type_to_str(h->nlmsg_type), bth->zns->ns_id);
+ }
+
+ return 0;
+}
+
+static void nl_batch_reset(struct nl_batch *bth)
+{
+ bth->buf_head = bth->buf;
+ bth->curlen = 0;
+ bth->msgcnt = 0;
+ bth->zns = NULL;
+
+ TAILQ_INIT(&(bth->ctx_list));
+}
+
+static void nl_batch_init(struct nl_batch *bth, struct dplane_ctx_q *ctx_out_q)
+{
+ /*
+ * If the size of the buffer has changed, free and then allocate a new
+ * one.
+ */
+ size_t bufsize =
+ atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed);
+ if (bufsize != nl_batch_tx_bufsize) {
+ if (nl_batch_tx_buf)
+ XFREE(MTYPE_NL_BUF, nl_batch_tx_buf);
+
+ nl_batch_tx_buf = XCALLOC(MTYPE_NL_BUF, bufsize);
+ nl_batch_tx_bufsize = bufsize;
+ }
+
+ bth->buf = nl_batch_tx_buf;
+ bth->bufsiz = bufsize;
+ bth->limit = atomic_load_explicit(&nl_batch_send_threshold,
+ memory_order_relaxed);
+
+ bth->ctx_out_q = ctx_out_q;
+
+ nl_batch_reset(bth);
+}
+
+static void nl_batch_send(struct nl_batch *bth)
+{
+ struct zebra_dplane_ctx *ctx;
+ bool err = false;
+
+ if (bth->curlen != 0 && bth->zns != NULL) {
+ if (IS_ZEBRA_DEBUG_KERNEL)
+ zlog_debug("%s: %s, batch size=%zu, msg cnt=%zu",
+ __func__, bth->zns->nls.name, bth->curlen,
+ bth->msgcnt);
+
+ if (netlink_send_msg(&(bth->zns->nls), bth->buf, bth->curlen)
+ == -1)
+ err = true;
+
+ if (!err) {
+ if (nl_batch_read_resp(bth) == -1)
+ err = true;
+ }
+ }
+
+ /* Move remaining contexts to the outbound queue. */
+ while (true) {
+ ctx = dplane_ctx_dequeue(&(bth->ctx_list));
+ if (ctx == NULL)
+ break;
+
+ if (err)
+ dplane_ctx_set_status(ctx,
+ ZEBRA_DPLANE_REQUEST_FAILURE);
+
+ dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
+ }
+
+ nl_batch_reset(bth);
+}
+
+enum netlink_msg_status netlink_batch_add_msg(
+ struct nl_batch *bth, struct zebra_dplane_ctx *ctx,
+ ssize_t (*msg_encoder)(struct zebra_dplane_ctx *, void *, size_t),
+ bool ignore_res)
+{
+ int seq;
+ ssize_t size;
+ struct nlmsghdr *msgh;
+
+ size = (*msg_encoder)(ctx, bth->buf_head, bth->bufsiz - bth->curlen);
+
+ /*
+ * If there was an error while encoding the message (other than buffer
+ * overflow) then return an error.
+ */
+ if (size < 0)
+ return FRR_NETLINK_ERROR;
+
+ /*
+ * If the message doesn't fit entirely in the buffer then send the batch
+ * and retry.
+ */
+ if (size == 0) {
+ nl_batch_send(bth);
+ size = (*msg_encoder)(ctx, bth->buf_head,
+ bth->bufsiz - bth->curlen);
+ /*
+ * If the message doesn't fit in the empty buffer then just
+ * return an error.
+ */
+ if (size <= 0)
+ return FRR_NETLINK_ERROR;
+ }
+
+ seq = dplane_ctx_get_ns(ctx)->nls.seq;
+ if (ignore_res)
+ seq++;
+
+ msgh = (struct nlmsghdr *)bth->buf_head;
+ msgh->nlmsg_seq = seq;
+ msgh->nlmsg_pid = dplane_ctx_get_ns(ctx)->nls.snl.nl_pid;
+
+ bth->zns = dplane_ctx_get_ns(ctx);
+ bth->buf_head = ((char *)bth->buf_head) + size;
+ bth->curlen += size;
+ bth->msgcnt++;
+
+ return FRR_NETLINK_QUEUED;
+}
+
+static enum netlink_msg_status nl_put_msg(struct nl_batch *bth,
+ struct zebra_dplane_ctx *ctx)
+{
+ if (dplane_ctx_is_skip_kernel(ctx))
+ return FRR_NETLINK_SUCCESS;
+
+ switch (dplane_ctx_get_op(ctx)) {
+
+ case DPLANE_OP_ROUTE_INSTALL:
+ case DPLANE_OP_ROUTE_UPDATE:
+ case DPLANE_OP_ROUTE_DELETE:
+ return netlink_put_route_update_msg(bth, ctx);
+
+ case DPLANE_OP_NH_INSTALL:
+ case DPLANE_OP_NH_UPDATE:
+ case DPLANE_OP_NH_DELETE:
+ return netlink_put_nexthop_update_msg(bth, ctx);
+
+ case DPLANE_OP_LSP_INSTALL:
+ case DPLANE_OP_LSP_UPDATE:
+ case DPLANE_OP_LSP_DELETE:
+ return netlink_put_lsp_update_msg(bth, ctx);
+
+ case DPLANE_OP_PW_INSTALL:
+ case DPLANE_OP_PW_UNINSTALL:
+ return netlink_put_pw_update_msg(bth, ctx);
+
+ case DPLANE_OP_ADDR_INSTALL:
+ case DPLANE_OP_ADDR_UNINSTALL:
+ return netlink_put_address_update_msg(bth, ctx);
+
+ case DPLANE_OP_MAC_INSTALL:
+ case DPLANE_OP_MAC_DELETE:
+ return netlink_put_mac_update_msg(bth, ctx);
+
+ case DPLANE_OP_NEIGH_INSTALL:
+ case DPLANE_OP_NEIGH_UPDATE:
+ case DPLANE_OP_NEIGH_DELETE:
+ case DPLANE_OP_VTEP_ADD:
+ case DPLANE_OP_VTEP_DELETE:
+ case DPLANE_OP_NEIGH_DISCOVER:
+ return netlink_put_neigh_update_msg(bth, ctx);
+
+ case DPLANE_OP_RULE_ADD:
+ case DPLANE_OP_RULE_DELETE:
+ case DPLANE_OP_RULE_UPDATE:
+ return netlink_put_rule_update_msg(bth, ctx);
+
+ case DPLANE_OP_SYS_ROUTE_ADD:
+ case DPLANE_OP_SYS_ROUTE_DELETE:
+ case DPLANE_OP_ROUTE_NOTIFY:
+ case DPLANE_OP_LSP_NOTIFY:
+ return FRR_NETLINK_SUCCESS;
+
+ case DPLANE_OP_NONE:
+ return FRR_NETLINK_ERROR;
+ }
+
+ return FRR_NETLINK_ERROR;
+}
+
+void kernel_update_multi(struct dplane_ctx_q *ctx_list)
+{
+ struct nl_batch batch;
+ struct zebra_dplane_ctx *ctx;
+ struct dplane_ctx_q handled_list;
+ enum netlink_msg_status res;
+
+ TAILQ_INIT(&handled_list);
+ nl_batch_init(&batch, &handled_list);
+
+ while (true) {
+ ctx = dplane_ctx_dequeue(ctx_list);
+ if (ctx == NULL)
+ break;
+
+ if (batch.zns != NULL
+ && batch.zns->ns_id != dplane_ctx_get_ns(ctx)->ns_id)
+ nl_batch_send(&batch);
+
+ /*
+ * Assume all messages will succeed and then mark only the ones
+ * that failed.
+ */
+ dplane_ctx_set_status(ctx, ZEBRA_DPLANE_REQUEST_SUCCESS);
+
+ res = nl_put_msg(&batch, ctx);
+
+ dplane_ctx_enqueue_tail(&(batch.ctx_list), ctx);
+ if (res == FRR_NETLINK_ERROR)
+ dplane_ctx_set_status(ctx,
+ ZEBRA_DPLANE_REQUEST_FAILURE);
+
+ if (batch.curlen > batch.limit)
+ nl_batch_send(&batch);
+ }
+
+ nl_batch_send(&batch);
+
+ TAILQ_INIT(ctx_list);
+ dplane_ctx_list_append(ctx_list, &handled_list);
+}
+
/* Exported interface function. This function simply calls
netlink_socket (). */
void kernel_init(struct zebra_ns *zns)