diff options
Diffstat (limited to 'zebra/kernel_netlink.c')
| -rw-r--r-- | zebra/kernel_netlink.c | 520 |
1 files changed, 412 insertions, 108 deletions
diff --git a/zebra/kernel_netlink.c b/zebra/kernel_netlink.c index a4d22c12a4..ad0d4bf56b 100644 --- a/zebra/kernel_netlink.c +++ b/zebra/kernel_netlink.c @@ -20,12 +20,6 @@ #include <zebra.h> -#if defined(HANDLE_NETLINK_FUZZING) -#include <stdio.h> -#include <string.h> -#include "libfrr.h" -#endif /* HANDLE_NETLINK_FUZZING */ - #ifdef HAVE_NETLINK #include "linklist.h" @@ -84,6 +78,20 @@ #define RTPROT_MROUTED 17 #endif +#define NL_DEFAULT_BATCH_BUFSIZE (16 * NL_PKT_BUF_SIZE) + +/* + * We limit the batch's size to a number smaller than the length of the + * underlying buffer since the last message that wouldn't fit the batch would go + * over the upper boundary and then it would have to be encoded again into a new + * buffer. If the difference between the limit and the length of the buffer is + * big enough (bigger than the biggest Netlink message) then this situation + * won't occur. + */ +#define NL_DEFAULT_BATCH_SEND_THRESHOLD (15 * NL_PKT_BUF_SIZE) + +#define NL_BATCH_RX_BUFSIZE NL_RCV_PKT_BUF_SIZE + static const struct message nlmsg_str[] = {{RTM_NEWROUTE, "RTM_NEWROUTE"}, {RTM_DELROUTE, "RTM_DELROUTE"}, {RTM_GETROUTE, "RTM_GETROUTE"}, @@ -151,6 +159,62 @@ extern uint32_t nl_rcvbufsize; extern struct zebra_privs_t zserv_privs; +DEFINE_MTYPE_STATIC(ZEBRA, NL_BUF, "Zebra Netlink buffers") + +size_t nl_batch_tx_bufsize; +char *nl_batch_tx_buf; + +char nl_batch_rx_buf[NL_BATCH_RX_BUFSIZE]; + +_Atomic uint32_t nl_batch_bufsize = NL_DEFAULT_BATCH_BUFSIZE; +_Atomic uint32_t nl_batch_send_threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD; + +struct nl_batch { + void *buf; + size_t bufsiz; + size_t limit; + + void *buf_head; + size_t curlen; + size_t msgcnt; + + const struct zebra_dplane_info *zns; + + struct dplane_ctx_q ctx_list; + + /* + * Pointer to the queue of completed contexts outbound back + * towards the dataplane module. + */ + struct dplane_ctx_q *ctx_out_q; +}; + +int netlink_config_write_helper(struct vty *vty) +{ + uint32_t size = + atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed); + uint32_t threshold = atomic_load_explicit(&nl_batch_send_threshold, + memory_order_relaxed); + + if (size != NL_DEFAULT_BATCH_BUFSIZE + || threshold != NL_DEFAULT_BATCH_SEND_THRESHOLD) + vty_out(vty, "zebra kernel netlink batch-tx-buf %u %u\n", size, + threshold); + + return 0; +} + +void netlink_set_batch_buffer_size(uint32_t size, uint32_t threshold, bool set) +{ + if (!set) { + size = NL_DEFAULT_BATCH_BUFSIZE; + threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD; + } + + atomic_store_explicit(&nl_batch_bufsize, size, memory_order_relaxed); + atomic_store_explicit(&nl_batch_send_threshold, threshold, + memory_order_relaxed); +} int netlink_talk_filter(struct nlmsghdr *h, ns_id_t ns_id, int startup) { @@ -327,86 +391,6 @@ static int netlink_information_fetch(struct nlmsghdr *h, ns_id_t ns_id, return 0; } -#if defined(HANDLE_NETLINK_FUZZING) -/* Using globals here to avoid adding function parameters */ - -/* Keep distinct filenames for netlink fuzzy collection */ -static unsigned int netlink_file_counter = 1; - -/* File name to read fuzzed netlink from */ -static char netlink_fuzz_file[MAXPATHLEN] = ""; - -/* Flag for whether to read from file or not */ -bool netlink_read; - -/** - * netlink_read_init() - Starts the message parser - * @fname: Filename to read. - */ -void netlink_read_init(const char *fname) -{ - struct zebra_dplane_info dp_info; - - snprintf(netlink_fuzz_file, MAXPATHLEN, "%s", fname); - /* Creating this fake socket for testing purposes */ - struct zebra_ns *zns = zebra_ns_lookup(NS_DEFAULT); - - /* Capture key info from zns struct */ - zebra_dplane_info_from_zns(&dp_info, zns, false); - - netlink_parse_info(netlink_information_fetch, &zns->netlink, - &dp_info, 1, 0); -} - -/** - * netlink_write_incoming() - Writes all data received from netlink to a file - * @buf: Data from netlink. - * @size: Size of data. - * @counter: Counter for keeping filenames distinct. - */ -static void netlink_write_incoming(const char *buf, const unsigned int size, - unsigned int counter) -{ - char fname[MAXPATHLEN]; - FILE *f; - - snprintf(fname, MAXPATHLEN, "%s/%s_%u", frr_vtydir, "netlink", counter); - frr_with_privs(&zserv_privs) { - f = fopen(fname, "w"); - } - if (f) { - fwrite(buf, 1, size, f); - fclose(f); - } -} - -/** - * netlink_read_file() - Reads netlink data from file - * @buf: Netlink buffer being overwritten. - * @fname: File name to read from. - * - * Return: Size of file. - */ -static long netlink_read_file(char *buf, const char *fname) -{ - FILE *f; - long file_bytes = -1; - - frr_with_privs(&zserv_privs) { - f = fopen(fname, "r"); - } - if (f) { - fseek(f, 0, SEEK_END); - file_bytes = ftell(f); - rewind(f); - fread(buf, NL_RCV_PKT_BUF_SIZE, 1, f); - fclose(f); - } - return file_bytes; -} - -#endif /* HANDLE_NETLINK_FUZZING */ - static int kernel_read(struct thread *thread) { struct zebra_ns *zns = (struct zebra_ns *)THREAD_ARG(thread); @@ -757,18 +741,7 @@ static int netlink_recv_msg(const struct nlsock *nl, struct msghdr msg, msg.msg_iovlen = 1; do { -#if defined(HANDLE_NETLINK_FUZZING) - /* Check if reading and filename is set */ - if (netlink_read && '\0' != netlink_fuzz_file[0]) { - zlog_debug("Reading netlink fuzz file"); - status = netlink_read_file(buf, netlink_fuzz_file); - ((struct sockaddr_nl *)msg.msg_name)->nl_pid = 0; - } else { - status = recvmsg(nl->sock, &msg, 0); - } -#else status = recvmsg(nl->sock, &msg, 0); -#endif /* HANDLE_NETLINK_FUZZING */ } while (status == -1 && errno == EINTR); if (status == -1) { @@ -800,13 +773,6 @@ static int netlink_recv_msg(const struct nlsock *nl, struct msghdr msg, zlog_hexdump(buf, status); } -#if defined(HANDLE_NETLINK_FUZZING) - if (!netlink_read) { - zlog_debug("Writing incoming netlink message"); - netlink_write_incoming(buf, status, netlink_file_counter++); - } -#endif /* HANDLE_NETLINK_FUZZING */ - return status; } @@ -1008,9 +974,10 @@ int netlink_parse_info(int (*filter)(struct nlmsghdr *, ns_id_t, int), * startup -> Are we reading in under startup conditions * This is passed through eventually to filter. */ -int netlink_talk_info(int (*filter)(struct nlmsghdr *, ns_id_t, int startup), - struct nlmsghdr *n, - const struct zebra_dplane_info *dp_info, int startup) +static int +netlink_talk_info(int (*filter)(struct nlmsghdr *, ns_id_t, int startup), + struct nlmsghdr *n, const struct zebra_dplane_info *dp_info, + int startup) { const struct nlsock *nl; @@ -1080,6 +1047,331 @@ int netlink_request(struct nlsock *nl, void *req) return 0; } +static int nl_batch_read_resp(struct nl_batch *bth) +{ + struct nlmsghdr *h; + struct sockaddr_nl snl; + struct msghdr msg; + int status, seq; + const struct nlsock *nl; + struct zebra_dplane_ctx *ctx; + bool ignore_msg; + + nl = &(bth->zns->nls); + + msg.msg_name = (void *)&snl; + msg.msg_namelen = sizeof(snl); + + /* + * The responses are not batched, so we need to read and process one + * message at a time. + */ + while (true) { + status = netlink_recv_msg(nl, msg, nl_batch_rx_buf, + sizeof(nl_batch_rx_buf)); + if (status == -1 || status == 0) + return status; + + h = (struct nlmsghdr *)nl_batch_rx_buf; + ignore_msg = false; + seq = h->nlmsg_seq; + /* + * Find the corresponding context object. Received responses are + * in the same order as requests we sent, so we can simply + * iterate over the context list and match responses with + * requests at same time. + */ + while (true) { + ctx = dplane_ctx_dequeue(&(bth->ctx_list)); + if (ctx == NULL) + break; + + dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx); + + /* We have found corresponding context object. */ + if (dplane_ctx_get_ns(ctx)->nls.seq == seq) + break; + + /* + * 'update' context objects take two consecutive + * sequence numbers. + */ + if (dplane_ctx_is_update(ctx) + && dplane_ctx_get_ns(ctx)->nls.seq + 1 == seq) { + /* + * This is the situation where we get a response + * to a message that should be ignored. + */ + ignore_msg = true; + break; + } + } + + if (ignore_msg) + continue; + + /* + * We received a message with the sequence number that isn't + * associated with any dplane context object. + */ + if (ctx == NULL) { + zlog_debug( + "%s: skipping unassociated response, seq number %d NS %u", + __func__, h->nlmsg_seq, bth->zns->ns_id); + continue; + } + + if (h->nlmsg_type == NLMSG_ERROR) { + int err = netlink_parse_error(nl, h, bth->zns, 0); + + if (err == -1) + dplane_ctx_set_status( + ctx, ZEBRA_DPLANE_REQUEST_FAILURE); + + zlog_debug("%s: netlink error message seq=%d ", + __func__, h->nlmsg_seq); + continue; + } + + /* + * If we get here then we did not receive neither the ack nor + * the error and instead received some other message in an + * unexpected way. + */ + zlog_debug("%s: ignoring message type 0x%04x(%s) NS %u", + __func__, h->nlmsg_type, + nl_msg_type_to_str(h->nlmsg_type), bth->zns->ns_id); + } + + return 0; +} + +static void nl_batch_reset(struct nl_batch *bth) +{ + bth->buf_head = bth->buf; + bth->curlen = 0; + bth->msgcnt = 0; + bth->zns = NULL; + + TAILQ_INIT(&(bth->ctx_list)); +} + +static void nl_batch_init(struct nl_batch *bth, struct dplane_ctx_q *ctx_out_q) +{ + /* + * If the size of the buffer has changed, free and then allocate a new + * one. + */ + size_t bufsize = + atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed); + if (bufsize != nl_batch_tx_bufsize) { + if (nl_batch_tx_buf) + XFREE(MTYPE_NL_BUF, nl_batch_tx_buf); + + nl_batch_tx_buf = XCALLOC(MTYPE_NL_BUF, bufsize); + nl_batch_tx_bufsize = bufsize; + } + + bth->buf = nl_batch_tx_buf; + bth->bufsiz = bufsize; + bth->limit = atomic_load_explicit(&nl_batch_send_threshold, + memory_order_relaxed); + + bth->ctx_out_q = ctx_out_q; + + nl_batch_reset(bth); +} + +static void nl_batch_send(struct nl_batch *bth) +{ + struct zebra_dplane_ctx *ctx; + bool err = false; + + if (bth->curlen != 0 && bth->zns != NULL) { + if (IS_ZEBRA_DEBUG_KERNEL) + zlog_debug("%s: %s, batch size=%zu, msg cnt=%zu", + __func__, bth->zns->nls.name, bth->curlen, + bth->msgcnt); + + if (netlink_send_msg(&(bth->zns->nls), bth->buf, bth->curlen) + == -1) + err = true; + + if (!err) { + if (nl_batch_read_resp(bth) == -1) + err = true; + } + } + + /* Move remaining contexts to the outbound queue. */ + while (true) { + ctx = dplane_ctx_dequeue(&(bth->ctx_list)); + if (ctx == NULL) + break; + + if (err) + dplane_ctx_set_status(ctx, + ZEBRA_DPLANE_REQUEST_FAILURE); + + dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx); + } + + nl_batch_reset(bth); +} + +enum netlink_msg_status netlink_batch_add_msg( + struct nl_batch *bth, struct zebra_dplane_ctx *ctx, + ssize_t (*msg_encoder)(struct zebra_dplane_ctx *, void *, size_t), + bool ignore_res) +{ + int seq; + ssize_t size; + struct nlmsghdr *msgh; + + size = (*msg_encoder)(ctx, bth->buf_head, bth->bufsiz - bth->curlen); + + /* + * If there was an error while encoding the message (other than buffer + * overflow) then return an error. + */ + if (size < 0) + return FRR_NETLINK_ERROR; + + /* + * If the message doesn't fit entirely in the buffer then send the batch + * and retry. + */ + if (size == 0) { + nl_batch_send(bth); + size = (*msg_encoder)(ctx, bth->buf_head, + bth->bufsiz - bth->curlen); + /* + * If the message doesn't fit in the empty buffer then just + * return an error. + */ + if (size <= 0) + return FRR_NETLINK_ERROR; + } + + seq = dplane_ctx_get_ns(ctx)->nls.seq; + if (ignore_res) + seq++; + + msgh = (struct nlmsghdr *)bth->buf_head; + msgh->nlmsg_seq = seq; + msgh->nlmsg_pid = dplane_ctx_get_ns(ctx)->nls.snl.nl_pid; + + bth->zns = dplane_ctx_get_ns(ctx); + bth->buf_head = ((char *)bth->buf_head) + size; + bth->curlen += size; + bth->msgcnt++; + + return FRR_NETLINK_QUEUED; +} + +static enum netlink_msg_status nl_put_msg(struct nl_batch *bth, + struct zebra_dplane_ctx *ctx) +{ + if (dplane_ctx_is_skip_kernel(ctx)) + return FRR_NETLINK_SUCCESS; + + switch (dplane_ctx_get_op(ctx)) { + + case DPLANE_OP_ROUTE_INSTALL: + case DPLANE_OP_ROUTE_UPDATE: + case DPLANE_OP_ROUTE_DELETE: + return netlink_put_route_update_msg(bth, ctx); + + case DPLANE_OP_NH_INSTALL: + case DPLANE_OP_NH_UPDATE: + case DPLANE_OP_NH_DELETE: + return netlink_put_nexthop_update_msg(bth, ctx); + + case DPLANE_OP_LSP_INSTALL: + case DPLANE_OP_LSP_UPDATE: + case DPLANE_OP_LSP_DELETE: + return netlink_put_lsp_update_msg(bth, ctx); + + case DPLANE_OP_PW_INSTALL: + case DPLANE_OP_PW_UNINSTALL: + return netlink_put_pw_update_msg(bth, ctx); + + case DPLANE_OP_ADDR_INSTALL: + case DPLANE_OP_ADDR_UNINSTALL: + return netlink_put_address_update_msg(bth, ctx); + + case DPLANE_OP_MAC_INSTALL: + case DPLANE_OP_MAC_DELETE: + return netlink_put_mac_update_msg(bth, ctx); + + case DPLANE_OP_NEIGH_INSTALL: + case DPLANE_OP_NEIGH_UPDATE: + case DPLANE_OP_NEIGH_DELETE: + case DPLANE_OP_VTEP_ADD: + case DPLANE_OP_VTEP_DELETE: + case DPLANE_OP_NEIGH_DISCOVER: + return netlink_put_neigh_update_msg(bth, ctx); + + case DPLANE_OP_RULE_ADD: + case DPLANE_OP_RULE_DELETE: + case DPLANE_OP_RULE_UPDATE: + return netlink_put_rule_update_msg(bth, ctx); + + case DPLANE_OP_SYS_ROUTE_ADD: + case DPLANE_OP_SYS_ROUTE_DELETE: + case DPLANE_OP_ROUTE_NOTIFY: + case DPLANE_OP_LSP_NOTIFY: + return FRR_NETLINK_SUCCESS; + + case DPLANE_OP_NONE: + return FRR_NETLINK_ERROR; + } + + return FRR_NETLINK_ERROR; +} + +void kernel_update_multi(struct dplane_ctx_q *ctx_list) +{ + struct nl_batch batch; + struct zebra_dplane_ctx *ctx; + struct dplane_ctx_q handled_list; + enum netlink_msg_status res; + + TAILQ_INIT(&handled_list); + nl_batch_init(&batch, &handled_list); + + while (true) { + ctx = dplane_ctx_dequeue(ctx_list); + if (ctx == NULL) + break; + + if (batch.zns != NULL + && batch.zns->ns_id != dplane_ctx_get_ns(ctx)->ns_id) + nl_batch_send(&batch); + + /* + * Assume all messages will succeed and then mark only the ones + * that failed. + */ + dplane_ctx_set_status(ctx, ZEBRA_DPLANE_REQUEST_SUCCESS); + + res = nl_put_msg(&batch, ctx); + + dplane_ctx_enqueue_tail(&(batch.ctx_list), ctx); + if (res == FRR_NETLINK_ERROR) + dplane_ctx_set_status(ctx, + ZEBRA_DPLANE_REQUEST_FAILURE); + + if (batch.curlen > batch.limit) + nl_batch_send(&batch); + } + + nl_batch_send(&batch); + + TAILQ_INIT(ctx_list); + dplane_ctx_list_append(ctx_list, &handled_list); +} + /* Exported interface function. This function simply calls netlink_socket (). */ void kernel_init(struct zebra_ns *zns) @@ -1161,6 +1453,15 @@ void kernel_init(struct zebra_ns *zns) if (ret < 0) zlog_notice("Registration for extended dp ACK failed : %d %s", errno, safe_strerror(errno)); + + /* + * Trim off the payload of the original netlink message in the + * acknowledgment. This option is available since Linux 4.2, so if + * setsockopt fails, ignore the error. + */ + one = 1; + ret = setsockopt(zns->netlink_dplane.sock, SOL_NETLINK, NETLINK_CAP_ACK, + &one, sizeof(one)); #endif /* Register kernel socket. */ @@ -1177,8 +1478,11 @@ void kernel_init(struct zebra_ns *zns) zns->netlink_dplane.name, safe_strerror(errno), errno); /* Set receive buffer size if it's set from command line */ - if (nl_rcvbufsize) + if (nl_rcvbufsize) { netlink_recvbuf(&zns->netlink, nl_rcvbufsize); + netlink_recvbuf(&zns->netlink_cmd, nl_rcvbufsize); + netlink_recvbuf(&zns->netlink_dplane, nl_rcvbufsize); + } netlink_install_filter(zns->netlink.sock, zns->netlink_cmd.snl.nl_pid, |
