--- /dev/null
+/*
+ * Main implementation file for interface to Forwarding Plane Manager.
+ *
+ * Copyright (C) 2012 by Open Source Routing.
+ * Copyright (C) 2012 by Internet Systems Consortium, Inc. ("ISC")
+ *
+ * This file is part of GNU Zebra.
+ *
+ * GNU Zebra is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * GNU Zebra is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Zebra; see the file COPYING. If not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <zebra.h>
+
+#include "log.h"
+#include "stream.h"
+#include "thread.h"
+#include "network.h"
+#include "command.h"
+
+#include "zebra/rib.h"
+
+#include "fpm/fpm.h"
+#include "zebra_fpm.h"
+#include "zebra_fpm_private.h"
+
+/*
+ * Interval, in seconds, at which we attempt to connect to the FPM.
+ */
+#define ZFPM_CONNECT_RETRY_IVL 5
+
+/*
+ * Sizes of outgoing and incoming stream buffers for writing/reading
+ * FPM messages.
+ */
+#define ZFPM_OBUF_SIZE (2 * FPM_MAX_MSG_LEN)
+#define ZFPM_IBUF_SIZE (FPM_MAX_MSG_LEN)
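+
+/*
+ * Note that the outbound buffer can hold more than one maximum-length
+ * message, which allows zfpm_build_updates() to batch several updates
+ * into a single write on the socket.
+ */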
+
+/*
+ * The maximum number of times the FPM socket write callback can call
+ * 'write' before it yields.
+ */
+#define ZFPM_MAX_WRITES_PER_RUN 10
+
+/*
+ * Interval over which we collect statistics.
+ */
+#define ZFPM_STATS_IVL_SECS 10
+
+/*
+ * Structure that holds state for iterating over all route_node
+ * structures that are candidates for being communicated to the FPM.
+ */
+typedef struct zfpm_rnodes_iter_t_
+{
+ rib_tables_iter_t tables_iter;
+ route_table_iter_t iter;
+} zfpm_rnodes_iter_t;
+
+/*
+ * Statistics.
+ */
+typedef struct zfpm_stats_t_ {
+ unsigned long connect_calls;
+ unsigned long connect_no_sock;
+
+ unsigned long read_cb_calls;
+
+ unsigned long write_cb_calls;
+ unsigned long write_calls;
+ unsigned long partial_writes;
+ unsigned long max_writes_hit;
+ unsigned long t_write_yields;
+
+ unsigned long nop_deletes_skipped;
+ unsigned long route_adds;
+ unsigned long route_dels;
+
+ unsigned long updates_triggered;
+ unsigned long redundant_triggers;
+ unsigned long non_fpm_table_triggers;
+
+ unsigned long dests_del_after_update;
+
+ unsigned long t_conn_down_starts;
+ unsigned long t_conn_down_dests_processed;
+ unsigned long t_conn_down_yields;
+ unsigned long t_conn_down_finishes;
+
+ unsigned long t_conn_up_starts;
+ unsigned long t_conn_up_dests_processed;
+ unsigned long t_conn_up_yields;
+ unsigned long t_conn_up_aborts;
+ unsigned long t_conn_up_finishes;
+
+} zfpm_stats_t;
+
+/*
+ * States for the FPM state machine.
+ */
+typedef enum {
+
+ /*
+ * In this state we are not yet ready to connect to the FPM. This
+ * can happen when this module is disabled, or if we're cleaning up
+ * after a connection has gone down.
+ */
+ ZFPM_STATE_IDLE,
+
+ /*
+ * Ready to talk to the FPM and periodically trying to connect to
+ * it.
+ */
+ ZFPM_STATE_ACTIVE,
+
+ /*
+ * In the middle of bringing up a TCP connection. Specifically,
+ * waiting for a connect() call to complete asynchronously.
+ */
+ ZFPM_STATE_CONNECTING,
+
+ /*
+ * TCP connection to the FPM is up.
+ */
+ ZFPM_STATE_ESTABLISHED
+
+} zfpm_state_t;
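+
+/*
+ * The legal state transitions, which the assertions in
+ * zfpm_set_state() also enforce, are:
+ *
+ *   IDLE        -> ACTIVE
+ *   ACTIVE      -> CONNECTING or ESTABLISHED
+ *   CONNECTING  -> ACTIVE or ESTABLISHED
+ *   ESTABLISHED -> IDLE
+ */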
+
+/*
+ * Globals.
+ */
+typedef struct zfpm_glob_t_
+{
+
+ /*
+ * True if the FPM module has been enabled.
+ */
+ int enabled;
+
+ struct thread_master *master;
+
+ zfpm_state_t state;
+
+ /*
+ * Port on which the FPM is running.
+ */
+ int fpm_port;
+
+ /*
+   * Queue of rib_dest_t structures that have updates pending to be
+   * sent to the FPM.
+ */
+ TAILQ_HEAD (zfpm_dest_q, rib_dest_t_) dest_q;
+
+ /*
+ * Stream socket to the FPM.
+ */
+ int sock;
+
+ /*
+ * Buffers for messages to/from the FPM.
+ */
+ struct stream *obuf;
+ struct stream *ibuf;
+
+ /*
+ * Threads for I/O.
+ */
+ struct thread *t_connect;
+ struct thread *t_write;
+ struct thread *t_read;
+
+ /*
+   * Thread to clean up after the TCP connection to the FPM goes
+   * down, along with the state that the thread uses.
+ */
+ struct thread *t_conn_down;
+
+ struct {
+ zfpm_rnodes_iter_t iter;
+ } t_conn_down_state;
+
+ /*
+   * Thread to take actions once the TCP connection to the FPM comes
+   * up, along with the state that the thread uses.
+ */
+ struct thread *t_conn_up;
+
+ struct {
+ zfpm_rnodes_iter_t iter;
+ } t_conn_up_state;
+
+ unsigned long connect_calls;
+ time_t last_connect_call_time;
+
+ /*
+ * Stats from the start of the current statistics interval up to
+ * now. These are the counters we typically update in the code.
+ */
+ zfpm_stats_t stats;
+
+ /*
+ * Statistics that were gathered in the last collection interval.
+ */
+ zfpm_stats_t last_ivl_stats;
+
+ /*
+ * Cumulative stats from the last clear to the start of the current
+ * statistics interval.
+ */
+ zfpm_stats_t cumulative_stats;
+
+ /*
+ * Stats interval timer.
+ */
+ struct thread *t_stats;
+
+ /*
+ * If non-zero, the last time when statistics were cleared.
+ */
+ time_t last_stats_clear_time;
+
+} zfpm_glob_t;
+
+static zfpm_glob_t zfpm_glob_space;
+static zfpm_glob_t *zfpm_g = &zfpm_glob_space;
+
+static int zfpm_read_cb (struct thread *thread);
+static int zfpm_write_cb (struct thread *thread);
+
+static void zfpm_set_state (zfpm_state_t state, const char *reason);
+static void zfpm_start_connect_timer (const char *reason);
+static void zfpm_start_stats_timer (void);
+
+/*
+ * zfpm_thread_should_yield
+ */
+static inline int
+zfpm_thread_should_yield (struct thread *t)
+{
+ return thread_should_yield (t);
+}
+
+/*
+ * zfpm_state_to_str
+ */
+static const char *
+zfpm_state_to_str (zfpm_state_t state)
+{
+ switch (state)
+ {
+
+ case ZFPM_STATE_IDLE:
+ return "idle";
+
+ case ZFPM_STATE_ACTIVE:
+ return "active";
+
+ case ZFPM_STATE_CONNECTING:
+ return "connecting";
+
+ case ZFPM_STATE_ESTABLISHED:
+ return "established";
+
+ default:
+ return "unknown";
+ }
+}
+
+/*
+ * zfpm_get_time
+ */
+static time_t
+zfpm_get_time (void)
+{
+ struct timeval tv;
+
+ if (quagga_gettime (QUAGGA_CLK_MONOTONIC, &tv) < 0)
+ zlog_warn ("FPM: quagga_gettime failed!!");
+
+ return tv.tv_sec;
+}
+
+/*
+ * zfpm_get_elapsed_time
+ *
+ * Returns the time elapsed (in seconds) since the given time.
+ */
+static time_t
+zfpm_get_elapsed_time (time_t reference)
+{
+ time_t now;
+
+ now = zfpm_get_time ();
+
+ if (now < reference)
+ {
+ assert (0);
+ return 0;
+ }
+
+ return now - reference;
+}
+
+/*
+ * zfpm_is_table_for_fpm
+ *
+ * Returns TRUE if the given table is to be communicated to the
+ * FPM.
+ */
+static inline int
+zfpm_is_table_for_fpm (struct route_table *table)
+{
+ rib_table_info_t *info;
+
+ info = rib_table_info (table);
+
+ /*
+ * We only send the unicast tables in the main instance to the FPM
+ * at this point.
+ */
+ if (info->vrf->id != 0)
+ return 0;
+
+ if (info->safi != SAFI_UNICAST)
+ return 0;
+
+ return 1;
+}
+
+/*
+ * zfpm_rnodes_iter_init
+ */
+static inline void
+zfpm_rnodes_iter_init (zfpm_rnodes_iter_t *iter)
+{
+ memset (iter, 0, sizeof (*iter));
+ rib_tables_iter_init (&iter->tables_iter);
+
+ /*
+ * This is a hack, but it makes implementing 'next' easier by
+ * ensuring that route_table_iter_next() will return NULL the first
+ * time we call it.
+ */
+ route_table_iter_init (&iter->iter, NULL);
+ route_table_iter_cleanup (&iter->iter);
+}
+
+/*
+ * zfpm_rnodes_iter_next
+ */
+static inline struct route_node *
+zfpm_rnodes_iter_next (zfpm_rnodes_iter_t *iter)
+{
+ struct route_node *rn;
+ struct route_table *table;
+
+ while (1)
+ {
+ rn = route_table_iter_next (&iter->iter);
+ if (rn)
+ return rn;
+
+ /*
+ * We've made our way through this table, go to the next one.
+ */
+ route_table_iter_cleanup (&iter->iter);
+
+ while ((table = rib_tables_iter_next (&iter->tables_iter)))
+ {
+ if (zfpm_is_table_for_fpm (table))
+ break;
+ }
+
+ if (!table)
+ return NULL;
+
+ route_table_iter_init (&iter->iter, table);
+ }
+
+ return NULL;
+}
+
+/*
+ * zfpm_rnodes_iter_pause
+ */
+static inline void
+zfpm_rnodes_iter_pause (zfpm_rnodes_iter_t *iter)
+{
+ route_table_iter_pause (&iter->iter);
+}
+
+/*
+ * zfpm_rnodes_iter_cleanup
+ */
+static inline void
+zfpm_rnodes_iter_cleanup (zfpm_rnodes_iter_t *iter)
+{
+ route_table_iter_cleanup (&iter->iter);
+ rib_tables_iter_cleanup (&iter->tables_iter);
+}
+
+/*
+ * zfpm_stats_init
+ *
+ * Initialize a statistics block.
+ */
+static inline void
+zfpm_stats_init (zfpm_stats_t *stats)
+{
+ memset (stats, 0, sizeof (*stats));
+}
+
+/*
+ * zfpm_stats_reset
+ */
+static inline void
+zfpm_stats_reset (zfpm_stats_t *stats)
+{
+ zfpm_stats_init (stats);
+}
+
+/*
+ * zfpm_stats_copy
+ */
+static inline void
+zfpm_stats_copy (const zfpm_stats_t *src, zfpm_stats_t *dest)
+{
+ memcpy (dest, src, sizeof (*dest));
+}
+
+/*
+ * zfpm_stats_compose
+ *
+ * Total up the statistics in two stats structures ('s1' and 's2') and
+ * return the result in the third argument, 'result'. Note that the
+ * pointer 'result' may be the same as 's1' or 's2'.
+ *
+ * For simplicity, the implementation below assumes that the stats
+ * structure is composed entirely of counters. This can easily be
+ * changed when necessary.
+ */
+static void
+zfpm_stats_compose (const zfpm_stats_t *s1, const zfpm_stats_t *s2,
+ zfpm_stats_t *result)
+{
+ const unsigned long *p1, *p2;
+ unsigned long *result_p;
+ int i, num_counters;
+
+ p1 = (const unsigned long *) s1;
+ p2 = (const unsigned long *) s2;
+ result_p = (unsigned long *) result;
+
+ num_counters = (sizeof (zfpm_stats_t) / sizeof (unsigned long));
+
+ for (i = 0; i < num_counters; i++)
+ {
+ result_p[i] = p1[i] + p2[i];
+ }
+}
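+
+/*
+ * A compile-time check along the following lines (a sketch, not part
+ * of the interface) would catch a zfpm_stats_t that is not composed
+ * purely of 'unsigned long' counters:
+ *
+ *   typedef char zfpm_stats_all_counters_check
+ *     [(sizeof (zfpm_stats_t) % sizeof (unsigned long)) == 0 ? 1 : -1];
+ */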
+
+/*
+ * zfpm_read_on
+ */
+static inline void
+zfpm_read_on (void)
+{
+ assert (!zfpm_g->t_read);
+ assert (zfpm_g->sock >= 0);
+
+ THREAD_READ_ON (zfpm_g->master, zfpm_g->t_read, zfpm_read_cb, 0,
+ zfpm_g->sock);
+}
+
+/*
+ * zfpm_write_on
+ */
+static inline void
+zfpm_write_on (void)
+{
+ assert (!zfpm_g->t_write);
+ assert (zfpm_g->sock >= 0);
+
+ THREAD_WRITE_ON (zfpm_g->master, zfpm_g->t_write, zfpm_write_cb, 0,
+ zfpm_g->sock);
+}
+
+/*
+ * zfpm_read_off
+ */
+static inline void
+zfpm_read_off (void)
+{
+ THREAD_READ_OFF (zfpm_g->t_read);
+}
+
+/*
+ * zfpm_write_off
+ */
+static inline void
+zfpm_write_off (void)
+{
+ THREAD_WRITE_OFF (zfpm_g->t_write);
+}
+
+/*
+ * zfpm_conn_up_thread_cb
+ *
+ * Callback for actions to be taken when the connection to the FPM
+ * comes up.
+ */
+static int
+zfpm_conn_up_thread_cb (struct thread *thread)
+{
+ struct route_node *rnode;
+ zfpm_rnodes_iter_t *iter;
+ rib_dest_t *dest;
+
+ assert (zfpm_g->t_conn_up);
+ zfpm_g->t_conn_up = NULL;
+
+ iter = &zfpm_g->t_conn_up_state.iter;
+
+ if (zfpm_g->state != ZFPM_STATE_ESTABLISHED)
+ {
+ zfpm_debug ("Connection not up anymore, conn_up thread aborting");
+ zfpm_g->stats.t_conn_up_aborts++;
+ goto done;
+ }
+
+ while ((rnode = zfpm_rnodes_iter_next (iter)))
+ {
+ dest = rib_dest_from_rnode (rnode);
+
+ if (dest)
+ {
+ zfpm_g->stats.t_conn_up_dests_processed++;
+ zfpm_trigger_update (rnode, NULL);
+ }
+
+ /*
+ * Yield if need be.
+ */
+ if (!zfpm_thread_should_yield (thread))
+ continue;
+
+ zfpm_g->stats.t_conn_up_yields++;
+ zfpm_rnodes_iter_pause (iter);
+ zfpm_g->t_conn_up = thread_add_background (zfpm_g->master,
+ zfpm_conn_up_thread_cb,
+ 0, 0);
+ return 0;
+ }
+
+ zfpm_g->stats.t_conn_up_finishes++;
+
+ done:
+ zfpm_rnodes_iter_cleanup (iter);
+ return 0;
+}
+
+/*
+ * zfpm_connection_up
+ *
+ * Called when the connection to the FPM comes up.
+ */
+static void
+zfpm_connection_up (const char *detail)
+{
+ assert (zfpm_g->sock >= 0);
+ zfpm_read_on ();
+ zfpm_write_on ();
+ zfpm_set_state (ZFPM_STATE_ESTABLISHED, detail);
+
+ /*
+ * Start thread to push existing routes to the FPM.
+ */
+ assert (!zfpm_g->t_conn_up);
+
+ zfpm_rnodes_iter_init (&zfpm_g->t_conn_up_state.iter);
+
+ zfpm_debug ("Starting conn_up thread");
+ zfpm_g->t_conn_up = thread_add_background (zfpm_g->master,
+ zfpm_conn_up_thread_cb, 0, 0);
+ zfpm_g->stats.t_conn_up_starts++;
+}
+
+/*
+ * zfpm_connect_check
+ *
+ * Check if an asynchronous connect() to the FPM is complete. Reading
+ * the SO_ERROR socket option is the standard way to retrieve the
+ * result of a non-blocking connect() once the socket is reported
+ * readable or writable.
+ */
+static void
+zfpm_connect_check (void)
+{
+ int status;
+ socklen_t slen;
+ int ret;
+
+ zfpm_read_off ();
+ zfpm_write_off ();
+
+ slen = sizeof (status);
+ ret = getsockopt (zfpm_g->sock, SOL_SOCKET, SO_ERROR, (void *) &status,
+ &slen);
+
+ if (ret >= 0 && status == 0)
+ {
+ zfpm_connection_up ("async connect complete");
+ return;
+ }
+
+ /*
+ * getsockopt() failed or indicated an error on the socket.
+ */
+ close (zfpm_g->sock);
+ zfpm_g->sock = -1;
+
+ zfpm_start_connect_timer ("getsockopt() after async connect failed");
+ return;
+}
+
+/*
+ * zfpm_conn_down_thread_cb
+ *
+ * Callback that is invoked to clean up state after the TCP connection
+ * to the FPM goes down.
+ */
+static int
+zfpm_conn_down_thread_cb (struct thread *thread)
+{
+ struct route_node *rnode;
+ zfpm_rnodes_iter_t *iter;
+ rib_dest_t *dest;
+
+ assert (zfpm_g->state == ZFPM_STATE_IDLE);
+
+ assert (zfpm_g->t_conn_down);
+ zfpm_g->t_conn_down = NULL;
+
+ iter = &zfpm_g->t_conn_down_state.iter;
+
+ while ((rnode = zfpm_rnodes_iter_next (iter)))
+ {
+ dest = rib_dest_from_rnode (rnode);
+
+ if (dest)
+ {
+ if (CHECK_FLAG (dest->flags, RIB_DEST_UPDATE_FPM))
+ {
+ TAILQ_REMOVE (&zfpm_g->dest_q, dest, fpm_q_entries);
+ }
+
+ UNSET_FLAG (dest->flags, RIB_DEST_UPDATE_FPM);
+ UNSET_FLAG (dest->flags, RIB_DEST_SENT_TO_FPM);
+
+ zfpm_g->stats.t_conn_down_dests_processed++;
+
+ /*
+ * Check if the dest should be deleted.
+ */
+ rib_gc_dest(rnode);
+ }
+
+ /*
+ * Yield if need be.
+ */
+ if (!zfpm_thread_should_yield (thread))
+ continue;
+
+ zfpm_g->stats.t_conn_down_yields++;
+ zfpm_rnodes_iter_pause (iter);
+ zfpm_g->t_conn_down = thread_add_background (zfpm_g->master,
+ zfpm_conn_down_thread_cb,
+ 0, 0);
+ return 0;
+ }
+
+ zfpm_g->stats.t_conn_down_finishes++;
+ zfpm_rnodes_iter_cleanup (iter);
+
+ /*
+ * Start the process of connecting to the FPM again.
+ */
+ zfpm_start_connect_timer ("cleanup complete");
+ return 0;
+}
+
+/*
+ * zfpm_connection_down
+ *
+ * Called when the connection to the FPM has gone down.
+ */
+static void
+zfpm_connection_down (const char *detail)
+{
+ if (!detail)
+ detail = "unknown";
+
+ assert (zfpm_g->state == ZFPM_STATE_ESTABLISHED);
+
+ zlog_info ("connection to the FPM has gone down: %s", detail);
+
+ zfpm_read_off ();
+ zfpm_write_off ();
+
+ stream_reset (zfpm_g->ibuf);
+ stream_reset (zfpm_g->obuf);
+
+ if (zfpm_g->sock >= 0) {
+ close (zfpm_g->sock);
+ zfpm_g->sock = -1;
+ }
+
+ /*
+ * Start thread to clean up state after the connection goes down.
+ */
+ assert (!zfpm_g->t_conn_down);
+ zfpm_debug ("Starting conn_down thread");
+ zfpm_rnodes_iter_init (&zfpm_g->t_conn_down_state.iter);
+ zfpm_g->t_conn_down = thread_add_background (zfpm_g->master,
+ zfpm_conn_down_thread_cb, 0, 0);
+ zfpm_g->stats.t_conn_down_starts++;
+
+ zfpm_set_state (ZFPM_STATE_IDLE, detail);
+}
+
+/*
+ * zfpm_read_cb
+ */
+static int
+zfpm_read_cb (struct thread *thread)
+{
+ size_t already;
+ struct stream *ibuf;
+ uint16_t msg_len;
+ fpm_msg_hdr_t *hdr;
+
+ zfpm_g->stats.read_cb_calls++;
+ assert (zfpm_g->t_read);
+ zfpm_g->t_read = NULL;
+
+ /*
+ * Check if async connect is now done.
+ */
+ if (zfpm_g->state == ZFPM_STATE_CONNECTING)
+ {
+ zfpm_connect_check();
+ return 0;
+ }
+
+ assert (zfpm_g->state == ZFPM_STATE_ESTABLISHED);
+ assert (zfpm_g->sock >= 0);
+
+ ibuf = zfpm_g->ibuf;
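+
+  /*
+   * Each FPM message begins with a fixed-size header that carries the
+   * total message length, so we read the header first and then the
+   * rest of the message. stream_read_try() may return fewer bytes
+   * than requested; in that case the partial message stays in 'ibuf'
+   * and we resume when the socket is readable again.
+   */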
+
+ already = stream_get_endp (ibuf);
+ if (already < FPM_MSG_HDR_LEN)
+ {
+ ssize_t nbyte;
+
+ nbyte = stream_read_try (ibuf, zfpm_g->sock, FPM_MSG_HDR_LEN - already);
+ if (nbyte == 0 || nbyte == -1)
+ {
+ zfpm_connection_down ("closed socket in read");
+ return 0;
+ }
+
+ if (nbyte != (ssize_t) (FPM_MSG_HDR_LEN - already))
+ goto done;
+
+ already = FPM_MSG_HDR_LEN;
+ }
+
+ stream_set_getp (ibuf, 0);
+
+ hdr = (fpm_msg_hdr_t *) stream_pnt (ibuf);
+
+ if (!fpm_msg_hdr_ok (hdr))
+ {
+ zfpm_connection_down ("invalid message header");
+ return 0;
+ }
+
+ msg_len = fpm_msg_len (hdr);
+
+ /*
+ * Read out the rest of the packet.
+ */
+ if (already < msg_len)
+ {
+ ssize_t nbyte;
+
+ nbyte = stream_read_try (ibuf, zfpm_g->sock, msg_len - already);
+
+ if (nbyte == 0 || nbyte == -1)
+ {
+ zfpm_connection_down ("failed to read message");
+ return 0;
+ }
+
+ if (nbyte != (ssize_t) (msg_len - already))
+ goto done;
+ }
+
+ zfpm_debug ("Read out a full fpm message");
+
+ /*
+ * Just throw it away for now.
+ */
+ stream_reset (ibuf);
+
+ done:
+ zfpm_read_on ();
+ return 0;
+}
+
+/*
+ * zfpm_writes_pending
+ *
+ * Returns TRUE if we may have something to write to the FPM.
+ */
+static int
+zfpm_writes_pending (void)
+{
+
+ /*
+ * Check if there is any data in the outbound buffer that has not
+ * been written to the socket yet.
+ */
+ if (stream_get_endp (zfpm_g->obuf) - stream_get_getp (zfpm_g->obuf))
+ return 1;
+
+ /*
+ * Check if there are any prefixes on the outbound queue.
+ */
+ if (!TAILQ_EMPTY (&zfpm_g->dest_q))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * zfpm_encode_route
+ *
+ * Encode a message to the FPM with information about the given route.
+ *
+ * Returns the number of bytes written to the buffer. 0 or a negative
+ * value indicates an error.
+ */
+static inline int
+zfpm_encode_route (rib_dest_t *dest, struct rib *rib, char *in_buf,
+ size_t in_buf_len)
+{
+#ifndef HAVE_NETLINK
+ return 0;
+#else
+
+ int cmd;
+
+ cmd = rib ? RTM_NEWROUTE : RTM_DELROUTE;
+
+ return zfpm_netlink_encode_route (cmd, dest, rib, in_buf, in_buf_len);
+
+#endif /* HAVE_NETLINK */
+}
+
+/*
+ * zfpm_route_for_update
+ *
+ * Returns the rib that is to be sent to the FPM for a given dest.
+ */
+static struct rib *
+zfpm_route_for_update (rib_dest_t *dest)
+{
+ struct rib *rib;
+
+ RIB_DEST_FOREACH_ROUTE (dest, rib)
+ {
+ if (!CHECK_FLAG (rib->flags, ZEBRA_FLAG_SELECTED))
+ continue;
+
+ return rib;
+ }
+
+ /*
+ * We have no route for this destination.
+ */
+ return NULL;
+}
+
+/*
+ * zfpm_build_updates
+ *
+ * Process the outgoing queue and write messages to the outbound
+ * buffer.
+ */
+static void
+zfpm_build_updates (void)
+{
+ struct stream *s;
+ rib_dest_t *dest;
+ unsigned char *buf, *data, *buf_end;
+ size_t msg_len;
+ size_t data_len;
+ fpm_msg_hdr_t *hdr;
+ struct rib *rib;
+ int is_add, write_msg;
+
+ s = zfpm_g->obuf;
+
+ assert (stream_empty (s));
+
+ do {
+
+ /*
+ * Make sure there is enough space to write another message.
+ */
+ if (STREAM_WRITEABLE (s) < FPM_MAX_MSG_LEN)
+ break;
+
+ buf = STREAM_DATA (s) + stream_get_endp (s);
+ buf_end = buf + STREAM_WRITEABLE (s);
+
+ dest = TAILQ_FIRST (&zfpm_g->dest_q);
+ if (!dest)
+ break;
+
+ assert (CHECK_FLAG (dest->flags, RIB_DEST_UPDATE_FPM));
+
+ hdr = (fpm_msg_hdr_t *) buf;
+ hdr->version = FPM_PROTO_VERSION;
+ hdr->msg_type = FPM_MSG_TYPE_NETLINK;
+
+ data = fpm_msg_data (hdr);
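+
+    /*
+     * Note that hdr->msg_len is filled in further below, once the
+     * length of the encoded payload is known.
+     */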
+
+ rib = zfpm_route_for_update (dest);
+ is_add = rib ? 1 : 0;
+
+ write_msg = 1;
+
+ /*
+ * If this is a route deletion, and we have not sent the route to
+ * the FPM previously, skip it.
+ */
+ if (!is_add && !CHECK_FLAG (dest->flags, RIB_DEST_SENT_TO_FPM))
+ {
+ write_msg = 0;
+ zfpm_g->stats.nop_deletes_skipped++;
+ }
+
+ if (write_msg) {
+ data_len = zfpm_encode_route (dest, rib, (char *) data, buf_end - data);
+
+ assert (data_len);
+ if (data_len)
+ {
+ msg_len = fpm_data_len_to_msg_len (data_len);
+ hdr->msg_len = htons (msg_len);
+ stream_forward_endp (s, msg_len);
+
+ if (is_add)
+ zfpm_g->stats.route_adds++;
+ else
+ zfpm_g->stats.route_dels++;
+ }
+ }
+
+ /*
+ * Remove the dest from the queue, and reset the flag.
+ */
+ UNSET_FLAG (dest->flags, RIB_DEST_UPDATE_FPM);
+ TAILQ_REMOVE (&zfpm_g->dest_q, dest, fpm_q_entries);
+
+ if (is_add)
+ {
+ SET_FLAG (dest->flags, RIB_DEST_SENT_TO_FPM);
+ }
+ else
+ {
+ UNSET_FLAG (dest->flags, RIB_DEST_SENT_TO_FPM);
+ }
+
+ /*
+ * Delete the destination if necessary.
+ */
+ if (rib_gc_dest (dest->rnode))
+ zfpm_g->stats.dests_del_after_update++;
+
+ } while (1);
+
+}
+
+/*
+ * zfpm_write_cb
+ */
+static int
+zfpm_write_cb (struct thread *thread)
+{
+ struct stream *s;
+ int num_writes;
+
+ zfpm_g->stats.write_cb_calls++;
+ assert (zfpm_g->t_write);
+ zfpm_g->t_write = NULL;
+
+ /*
+ * Check if async connect is now done.
+ */
+ if (zfpm_g->state == ZFPM_STATE_CONNECTING)
+ {
+ zfpm_connect_check ();
+ return 0;
+ }
+
+ assert (zfpm_g->state == ZFPM_STATE_ESTABLISHED);
+ assert (zfpm_g->sock >= 0);
+
+ num_writes = 0;
+
+ do
+ {
+ int bytes_to_write, bytes_written;
+
+ s = zfpm_g->obuf;
+
+ /*
+       * If the stream is empty, try to fill it with data.
+ */
+ if (stream_empty (s))
+ {
+ zfpm_build_updates ();
+ }
+
+ bytes_to_write = stream_get_endp (s) - stream_get_getp (s);
+ if (!bytes_to_write)
+ break;
+
+ bytes_written = write (zfpm_g->sock, STREAM_PNT (s), bytes_to_write);
+ zfpm_g->stats.write_calls++;
+ num_writes++;
+
+ if (bytes_written < 0)
+ {
+ if (ERRNO_IO_RETRY (errno))
+ break;
+
+ zfpm_connection_down ("failed to write to socket");
+ return 0;
+ }
+
+ if (bytes_written != bytes_to_write)
+ {
+
+ /*
+ * Partial write.
+ */
+ stream_forward_getp (s, bytes_written);
+ zfpm_g->stats.partial_writes++;
+ break;
+ }
+
+ /*
+ * We've written out the entire contents of the stream.
+ */
+ stream_reset (s);
+
+ if (num_writes >= ZFPM_MAX_WRITES_PER_RUN)
+ {
+ zfpm_g->stats.max_writes_hit++;
+ break;
+ }
+
+ if (zfpm_thread_should_yield (thread))
+ {
+ zfpm_g->stats.t_write_yields++;
+ break;
+ }
+ } while (1);
+
+ if (zfpm_writes_pending ())
+ zfpm_write_on ();
+
+ return 0;
+}
+
+/*
+ * zfpm_connect_cb
+ */
+static int
+zfpm_connect_cb (struct thread *t)
+{
+ int sock, ret;
+ struct sockaddr_in serv;
+
+ assert (zfpm_g->t_connect);
+ zfpm_g->t_connect = NULL;
+ assert (zfpm_g->state == ZFPM_STATE_ACTIVE);
+
+ sock = socket (AF_INET, SOCK_STREAM, 0);
+ if (sock < 0)
+ {
+ zfpm_debug ("Failed to create socket for connect(): %s", strerror(errno));
+ zfpm_g->stats.connect_no_sock++;
+ return 0;
+ }
+
+ set_nonblocking(sock);
+
+ /* Make server socket. */
+ memset (&serv, 0, sizeof (serv));
+ serv.sin_family = AF_INET;
+ serv.sin_port = htons (zfpm_g->fpm_port);
+#ifdef HAVE_STRUCT_SOCKADDR_IN_SIN_LEN
+ serv.sin_len = sizeof (struct sockaddr_in);
+#endif /* HAVE_STRUCT_SOCKADDR_IN_SIN_LEN */
+ serv.sin_addr.s_addr = htonl (INADDR_LOOPBACK);
+
+ /*
+ * Connect to the FPM.
+ */
+ zfpm_g->connect_calls++;
+ zfpm_g->stats.connect_calls++;
+ zfpm_g->last_connect_call_time = zfpm_get_time ();
+
+ ret = connect (sock, (struct sockaddr *) &serv, sizeof (serv));
+ if (ret >= 0)
+ {
+ zfpm_g->sock = sock;
+ zfpm_connection_up ("connect succeeded");
+ return 1;
+ }
+
+ if (errno == EINPROGRESS)
+ {
+ zfpm_g->sock = sock;
+ zfpm_read_on ();
+ zfpm_write_on ();
+ zfpm_set_state (ZFPM_STATE_CONNECTING, "async connect in progress");
+ return 0;
+ }
+
+ zlog_info ("can't connect to FPM %d: %s", sock, safe_strerror (errno));
+ close (sock);
+
+ /*
+ * Restart timer for retrying connection.
+ */
+ zfpm_start_connect_timer ("connect() failed");
+ return 0;
+}
+
+/*
+ * zfpm_set_state
+ *
+ * Move state machine into the given state.
+ */
+static void
+zfpm_set_state (zfpm_state_t state, const char *reason)
+{
+ zfpm_state_t cur_state = zfpm_g->state;
+
+ if (!reason)
+ reason = "Unknown";
+
+ if (state == cur_state)
+ return;
+
+ zfpm_debug("beginning state transition %s -> %s. Reason: %s",
+ zfpm_state_to_str (cur_state), zfpm_state_to_str (state),
+ reason);
+
+ switch (state) {
+
+ case ZFPM_STATE_IDLE:
+ assert (cur_state == ZFPM_STATE_ESTABLISHED);
+ break;
+
+ case ZFPM_STATE_ACTIVE:
+ assert (cur_state == ZFPM_STATE_IDLE ||
+ cur_state == ZFPM_STATE_CONNECTING);
+ assert (zfpm_g->t_connect);
+ break;
+
+ case ZFPM_STATE_CONNECTING:
+      assert (zfpm_g->sock >= 0);
+ assert (cur_state == ZFPM_STATE_ACTIVE);
+ assert (zfpm_g->t_read);
+ assert (zfpm_g->t_write);
+ break;
+
+ case ZFPM_STATE_ESTABLISHED:
+ assert (cur_state == ZFPM_STATE_ACTIVE ||
+ cur_state == ZFPM_STATE_CONNECTING);
+      assert (zfpm_g->sock >= 0);
+ assert (zfpm_g->t_read);
+ assert (zfpm_g->t_write);
+ break;
+ }
+
+ zfpm_g->state = state;
+}
+
+/*
+ * zfpm_calc_connect_delay
+ *
+ * Returns the number of seconds after which we should attempt to
+ * reconnect to the FPM.
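+ *
+ * For example, if the last connect() call was made 2 seconds ago and
+ * ZFPM_CONNECT_RETRY_IVL is 5, we return a delay of 3 seconds; once 5
+ * or more seconds have elapsed, we return 0.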
+ */
+static long
+zfpm_calc_connect_delay (void)
+{
+ time_t elapsed;
+
+ /*
+ * Return 0 if this is our first attempt to connect.
+ */
+ if (zfpm_g->connect_calls == 0)
+ {
+ return 0;
+ }
+
+ elapsed = zfpm_get_elapsed_time (zfpm_g->last_connect_call_time);
+
+ if (elapsed > ZFPM_CONNECT_RETRY_IVL) {
+ return 0;
+ }
+
+ return ZFPM_CONNECT_RETRY_IVL - elapsed;
+}
+
+/*
+ * zfpm_start_connect_timer
+ */
+static void
+zfpm_start_connect_timer (const char *reason)
+{
+ long delay_secs;
+
+ assert (!zfpm_g->t_connect);
+ assert (zfpm_g->sock < 0);
+
+ assert(zfpm_g->state == ZFPM_STATE_IDLE ||
+ zfpm_g->state == ZFPM_STATE_ACTIVE ||
+ zfpm_g->state == ZFPM_STATE_CONNECTING);
+
+ delay_secs = zfpm_calc_connect_delay();
+ zfpm_debug ("scheduling connect in %ld seconds", delay_secs);
+
+ THREAD_TIMER_ON (zfpm_g->master, zfpm_g->t_connect, zfpm_connect_cb, 0,
+ delay_secs);
+ zfpm_set_state (ZFPM_STATE_ACTIVE, reason);
+}
+
+/*
+ * zfpm_is_enabled
+ *
+ * Returns TRUE if the zebra FPM module has been enabled.
+ */
+static inline int
+zfpm_is_enabled (void)
+{
+ return zfpm_g->enabled;
+}
+
+/*
+ * zfpm_conn_is_up
+ *
+ * Returns TRUE if the connection to the FPM is up.
+ */
+static inline int
+zfpm_conn_is_up (void)
+{
+ if (zfpm_g->state != ZFPM_STATE_ESTABLISHED)
+ return 0;
+
+ assert (zfpm_g->sock >= 0);
+
+ return 1;
+}
+
+/*
+ * zfpm_trigger_update
+ *
+ * The zebra code invokes this function to indicate that we should
+ * send an update to the FPM about the given route_node.
+ */
+void
+zfpm_trigger_update (struct route_node *rn, const char *reason)
+{
+ rib_dest_t *dest;
+ char buf[INET6_ADDRSTRLEN];
+
+ /*
+ * Ignore if the connection is down. We will update the FPM about
+ * all destinations once the connection comes up.
+ */
+ if (!zfpm_conn_is_up ())
+ return;
+
+ dest = rib_dest_from_rnode (rn);
+
+ /*
+ * Ignore the trigger if the dest is not in a table that we would
+ * send to the FPM.
+ */
+ if (!zfpm_is_table_for_fpm (rib_dest_table (dest)))
+ {
+ zfpm_g->stats.non_fpm_table_triggers++;
+ return;
+ }
+
+ if (CHECK_FLAG (dest->flags, RIB_DEST_UPDATE_FPM)) {
+ zfpm_g->stats.redundant_triggers++;
+ return;
+ }
+
+ if (reason)
+ {
+ zfpm_debug ("%s/%d triggering update to FPM - Reason: %s",
+ inet_ntop (rn->p.family, &rn->p.u.prefix, buf, sizeof (buf)),
+ rn->p.prefixlen, reason);
+ }
+
+ SET_FLAG (dest->flags, RIB_DEST_UPDATE_FPM);
+ TAILQ_INSERT_TAIL (&zfpm_g->dest_q, dest, fpm_q_entries);
+ zfpm_g->stats.updates_triggered++;
+
+ /*
+ * Make sure that writes are enabled.
+ */
+ if (zfpm_g->t_write)
+ return;
+
+ zfpm_write_on ();
+}
+
+/*
+ * zfpm_stats_timer_cb
+ */
+static int
+zfpm_stats_timer_cb (struct thread *t)
+{
+ assert (zfpm_g->t_stats);
+ zfpm_g->t_stats = NULL;
+
+ /*
+ * Remember the stats collected in the last interval for display
+ * purposes.
+ */
+ zfpm_stats_copy (&zfpm_g->stats, &zfpm_g->last_ivl_stats);
+
+ /*
+ * Add the current set of stats into the cumulative statistics.
+ */
+ zfpm_stats_compose (&zfpm_g->cumulative_stats, &zfpm_g->stats,
+ &zfpm_g->cumulative_stats);
+
+ /*
+ * Start collecting stats afresh over the next interval.
+ */
+ zfpm_stats_reset (&zfpm_g->stats);
+
+ zfpm_start_stats_timer ();
+
+ return 0;
+}
+
+/*
+ * zfpm_stop_stats_timer
+ */
+static void
+zfpm_stop_stats_timer (void)
+{
+ if (!zfpm_g->t_stats)
+ return;
+
+ zfpm_debug ("Stopping existing stats timer");
+ THREAD_TIMER_OFF (zfpm_g->t_stats);
+}
+
+/*
+ * zfpm_start_stats_timer
+ */
+void
+zfpm_start_stats_timer (void)
+{
+ assert (!zfpm_g->t_stats);
+
+ THREAD_TIMER_ON (zfpm_g->master, zfpm_g->t_stats, zfpm_stats_timer_cb, 0,
+ ZFPM_STATS_IVL_SECS);
+}
+
+/*
+ * Helper macro for zfpm_show_stats() below.
+ */
+#define ZFPM_SHOW_STAT(counter) \
+ do { \
+ vty_out (vty, "%-40s %10lu %16lu%s", #counter, total_stats.counter, \
+ zfpm_g->last_ivl_stats.counter, VTY_NEWLINE); \
+ } while (0)
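+
+/*
+ * The '#counter' stringification in the macro turns the field name
+ * itself into the row label, so the macro can simply be invoked once
+ * per member of zfpm_stats_t.
+ */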
+
+/*
+ * zfpm_show_stats
+ */
+static void
+zfpm_show_stats (struct vty *vty)
+{
+ zfpm_stats_t total_stats;
+ time_t elapsed;
+
+ vty_out (vty, "%s%-40s %10s Last %2d secs%s%s", VTY_NEWLINE, "Counter",
+ "Total", ZFPM_STATS_IVL_SECS, VTY_NEWLINE, VTY_NEWLINE);
+
+ /*
+ * Compute the total stats up to this instant.
+ */
+ zfpm_stats_compose (&zfpm_g->cumulative_stats, &zfpm_g->stats,
+ &total_stats);
+
+ ZFPM_SHOW_STAT (connect_calls);
+ ZFPM_SHOW_STAT (connect_no_sock);
+ ZFPM_SHOW_STAT (read_cb_calls);
+ ZFPM_SHOW_STAT (write_cb_calls);
+ ZFPM_SHOW_STAT (write_calls);
+ ZFPM_SHOW_STAT (partial_writes);
+ ZFPM_SHOW_STAT (max_writes_hit);
+ ZFPM_SHOW_STAT (t_write_yields);
+ ZFPM_SHOW_STAT (nop_deletes_skipped);
+ ZFPM_SHOW_STAT (route_adds);
+ ZFPM_SHOW_STAT (route_dels);
+ ZFPM_SHOW_STAT (updates_triggered);
+ ZFPM_SHOW_STAT (non_fpm_table_triggers);
+ ZFPM_SHOW_STAT (redundant_triggers);
+ ZFPM_SHOW_STAT (dests_del_after_update);
+ ZFPM_SHOW_STAT (t_conn_down_starts);
+ ZFPM_SHOW_STAT (t_conn_down_dests_processed);
+ ZFPM_SHOW_STAT (t_conn_down_yields);
+ ZFPM_SHOW_STAT (t_conn_down_finishes);
+ ZFPM_SHOW_STAT (t_conn_up_starts);
+ ZFPM_SHOW_STAT (t_conn_up_dests_processed);
+ ZFPM_SHOW_STAT (t_conn_up_yields);
+ ZFPM_SHOW_STAT (t_conn_up_aborts);
+ ZFPM_SHOW_STAT (t_conn_up_finishes);
+
+ if (!zfpm_g->last_stats_clear_time)
+ return;
+
+ elapsed = zfpm_get_elapsed_time (zfpm_g->last_stats_clear_time);
+
+ vty_out (vty, "%sStats were cleared %lu seconds ago%s", VTY_NEWLINE,
+ (unsigned long) elapsed, VTY_NEWLINE);
+}
+
+/*
+ * zfpm_clear_stats
+ */
+static void
+zfpm_clear_stats (struct vty *vty)
+{
+ if (!zfpm_is_enabled ())
+ {
+ vty_out (vty, "The FPM module is not enabled...%s", VTY_NEWLINE);
+ return;
+ }
+
+ zfpm_stats_reset (&zfpm_g->stats);
+ zfpm_stats_reset (&zfpm_g->last_ivl_stats);
+ zfpm_stats_reset (&zfpm_g->cumulative_stats);
+
+ zfpm_stop_stats_timer ();
+ zfpm_start_stats_timer ();
+
+ zfpm_g->last_stats_clear_time = zfpm_get_time();
+
+ vty_out (vty, "Cleared FPM stats%s", VTY_NEWLINE);
+}
+
+/*
+ * show_zebra_fpm_stats
+ */
+DEFUN (show_zebra_fpm_stats,
+ show_zebra_fpm_stats_cmd,
+ "show zebra fpm stats",
+ SHOW_STR
+ "Zebra information\n"
+ "Forwarding Path Manager information\n"
+ "Statistics\n")
+{
+ zfpm_show_stats (vty);
+ return CMD_SUCCESS;
+}
+
+/*
+ * clear_zebra_fpm_stats
+ */
+DEFUN (clear_zebra_fpm_stats,
+ clear_zebra_fpm_stats_cmd,
+ "clear zebra fpm stats",
+ CLEAR_STR
+ "Zebra information\n"
+ "Clear Forwarding Path Manager information\n"
+ "Statistics\n")
+{
+ zfpm_clear_stats (vty);
+ return CMD_SUCCESS;
+}
+
+/**
+ * zfpm_init
+ *
+ * One-time initialization of the Zebra FPM module.
+ *
+ * @param[in] master Thread master to schedule the FPM I/O on.
+ * @param[in] enable TRUE if the zebra FPM module should be enabled.
+ * @param[in] port Port at which the FPM is running.
+ *
+ * Returns TRUE on success.
+ */
+int
+zfpm_init (struct thread_master *master, int enable, uint16_t port)
+{
+ static int initialized = 0;
+
+ if (initialized) {
+ return 1;
+ }
+
+ initialized = 1;
+
+ memset (zfpm_g, 0, sizeof (*zfpm_g));
+ zfpm_g->master = master;
+ TAILQ_INIT(&zfpm_g->dest_q);
+ zfpm_g->sock = -1;
+ zfpm_g->state = ZFPM_STATE_IDLE;
+
+ /*
+ * Netlink must currently be available for the Zebra-FPM interface
+ * to be enabled.
+ */
+#ifndef HAVE_NETLINK
+ enable = 0;
+#endif
+
+ zfpm_g->enabled = enable;
+
+ zfpm_stats_init (&zfpm_g->stats);
+ zfpm_stats_init (&zfpm_g->last_ivl_stats);
+ zfpm_stats_init (&zfpm_g->cumulative_stats);
+
+ install_element (ENABLE_NODE, &show_zebra_fpm_stats_cmd);
+ install_element (ENABLE_NODE, &clear_zebra_fpm_stats_cmd);
+
+ if (!enable) {
+ return 1;
+ }
+
+ if (!port)
+ port = FPM_DEFAULT_PORT;
+
+ zfpm_g->fpm_port = port;
+
+ zfpm_g->obuf = stream_new (ZFPM_OBUF_SIZE);
+ zfpm_g->ibuf = stream_new (ZFPM_IBUF_SIZE);
+
+ zfpm_start_stats_timer ();
+ zfpm_start_connect_timer ("initialized");
+
+ return 1;
+}
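+
+/*
+ * A sketch of the expected call from zebra's startup code (the exact
+ * call site is outside this file, and 'zebrad.master' is assumed to
+ * be the daemon's thread master):
+ *
+ *   zfpm_init (zebrad.master, 1, 0);
+ */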
--- /dev/null
+/*
+ * Code for encoding/decoding FPM messages that are in netlink format.
+ *
+ * Copyright (C) 1997, 98, 99 Kunihiro Ishiguro
+ * Copyright (C) 2012 by Open Source Routing.
+ * Copyright (C) 2012 by Internet Systems Consortium, Inc. ("ISC")
+ *
+ * This file is part of GNU Zebra.
+ *
+ * GNU Zebra is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * GNU Zebra is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Zebra; see the file COPYING. If not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <zebra.h>
+
+#include "log.h"
+#include "rib.h"
+
+#include "rt_netlink.h"
+
+#include "zebra_fpm_private.h"
+
+/*
+ * addr_to_a
+ *
+ * Returns string representation of an address of the given AF.
+ */
+static inline const char *
+addr_to_a (u_char af, void *addr)
+{
+ if (!addr)
+ return "<No address>";
+
+ switch (af)
+ {
+
+ case AF_INET:
+ return inet_ntoa (*((struct in_addr *) addr));
+
+#ifdef HAVE_IPV6
+ case AF_INET6:
+ return inet6_ntoa (*((struct in6_addr *) addr));
+#endif
+
+ default:
+ return "<Addr in unknown AF>";
+ }
+}
+
+/*
+ * prefix_addr_to_a
+ *
+ * Convenience wrapper that returns a human-readable string for the
+ * address in a prefix.
+ */
+static const char *
+prefix_addr_to_a (struct prefix *prefix)
+{
+ if (!prefix)
+ return "<No address>";
+
+ return addr_to_a (prefix->family, &prefix->u.prefix);
+}
+
+/*
+ * af_addr_size
+ *
+ * The size of an address in a given address family.
+ */
+static size_t
+af_addr_size (u_char af)
+{
+ switch (af)
+ {
+
+ case AF_INET:
+ return 4;
+
+#ifdef HAVE_IPV6
+ case AF_INET6:
+ return 16;
+#endif
+
+ default:
+ assert(0);
+ return 16;
+ }
+}
+
+/*
+ * netlink_nh_info_t
+ *
+ * Holds information about a single nexthop for netlink. These info
+ * structures are transient and may contain pointers into rib
+ * data structures for convenience.
+ */
+typedef struct netlink_nh_info_t_
+{
+ uint32_t if_index;
+ union g_addr *gateway;
+
+ /*
+ * Information from the struct nexthop from which this nh was
+ * derived. For debug purposes only.
+ */
+ int recursive;
+ enum nexthop_types_t type;
+} netlink_nh_info_t;
+
+/*
+ * netlink_route_info_t
+ *
+ * A structure for holding information for a netlink route message.
+ */
+typedef struct netlink_route_info_t_
+{
+ uint16_t nlmsg_type;
+ u_char rtm_type;
+ uint32_t rtm_table;
+ u_char rtm_protocol;
+ u_char af;
+ struct prefix *prefix;
+ uint32_t *metric;
+ int num_nhs;
+
+ /*
+   * Nexthop structures. We keep things simple for now by enforcing a
+   * maximum of 64 nexthops in case MULTIPATH_NUM is 0.
+ */
+ netlink_nh_info_t nhs[MAX (MULTIPATH_NUM, 64)];
+ union g_addr *pref_src;
+} netlink_route_info_t;
+
+/*
+ * netlink_route_info_add_nh
+ *
+ * Add information about the given nexthop to the given route info
+ * structure.
+ *
+ * Returns TRUE if a nexthop was added, FALSE otherwise.
+ */
+static int
+netlink_route_info_add_nh (netlink_route_info_t *ri, struct nexthop *nexthop)
+{
+ netlink_nh_info_t nhi;
+ union g_addr *src;
+
+ memset (&nhi, 0, sizeof (nhi));
+ src = NULL;
+
+ if (ri->num_nhs >= (int) ZEBRA_NUM_OF (ri->nhs))
+ return 0;
+
+ if (CHECK_FLAG (nexthop->flags, NEXTHOP_FLAG_RECURSIVE))
+ {
+ nhi.recursive = 1;
+ nhi.type = nexthop->rtype;
+
+ if (nexthop->rtype == NEXTHOP_TYPE_IPV4
+ || nexthop->rtype == NEXTHOP_TYPE_IPV4_IFINDEX)
+ {
+ nhi.gateway = &nexthop->rgate;
+ if (nexthop->src.ipv4.s_addr)
+ src = &nexthop->src;
+ }
+
+#ifdef HAVE_IPV6
+ if (nexthop->rtype == NEXTHOP_TYPE_IPV6
+ || nexthop->rtype == NEXTHOP_TYPE_IPV6_IFINDEX
+ || nexthop->rtype == NEXTHOP_TYPE_IPV6_IFNAME)
+ {
+ nhi.gateway = &nexthop->rgate;
+ }
+#endif /* HAVE_IPV6 */
+
+ if (nexthop->rtype == NEXTHOP_TYPE_IFINDEX
+ || nexthop->rtype == NEXTHOP_TYPE_IFNAME
+ || nexthop->rtype == NEXTHOP_TYPE_IPV4_IFINDEX
+ || nexthop->rtype == NEXTHOP_TYPE_IPV6_IFINDEX
+ || nexthop->rtype == NEXTHOP_TYPE_IPV6_IFNAME)
+ {
+ nhi.if_index = nexthop->rifindex;
+ if ((nexthop->rtype == NEXTHOP_TYPE_IPV4_IFINDEX
+ || nexthop->rtype == NEXTHOP_TYPE_IFINDEX)
+ && nexthop->src.ipv4.s_addr)
+ src = &nexthop->src;
+ }
+
+ goto done;
+ }
+
+ nhi.recursive = 0;
+ nhi.type = nexthop->type;
+
+ if (nexthop->type == NEXTHOP_TYPE_IPV4
+ || nexthop->type == NEXTHOP_TYPE_IPV4_IFINDEX)
+ {
+ nhi.gateway = &nexthop->gate;
+ if (nexthop->src.ipv4.s_addr)
+ src = &nexthop->src;
+ }
+
+#ifdef HAVE_IPV6
+ if (nexthop->type == NEXTHOP_TYPE_IPV6
+ || nexthop->type == NEXTHOP_TYPE_IPV6_IFNAME
+ || nexthop->type == NEXTHOP_TYPE_IPV6_IFINDEX)
+ {
+ nhi.gateway = &nexthop->gate;
+ }
+#endif /* HAVE_IPV6 */
+ if (nexthop->type == NEXTHOP_TYPE_IFINDEX
+ || nexthop->type == NEXTHOP_TYPE_IFNAME
+ || nexthop->type == NEXTHOP_TYPE_IPV4_IFINDEX)
+ {
+ nhi.if_index = nexthop->ifindex;
+
+ if (nexthop->src.ipv4.s_addr)
+ src = &nexthop->src;
+ }
+ else if (nexthop->type == NEXTHOP_TYPE_IPV6_IFINDEX
+ || nexthop->type == NEXTHOP_TYPE_IPV6_IFNAME)
+ {
+ nhi.if_index = nexthop->ifindex;
+ }
+
+ /*
+ * Fall through...
+ */
+
+ done:
+ if (!nhi.gateway && nhi.if_index == 0)
+ return 0;
+
+ /*
+ * We have a valid nhi. Copy the structure over to the route_info.
+ */
+ ri->nhs[ri->num_nhs] = nhi;
+ ri->num_nhs++;
+
+ if (src && !ri->pref_src)
+ ri->pref_src = src;
+
+ return 1;
+}
+
+/*
+ * netlink_proto_from_route_type
+ */
+static u_char
+netlink_proto_from_route_type (int type)
+{
+ switch (type)
+ {
+ case ZEBRA_ROUTE_KERNEL:
+ case ZEBRA_ROUTE_CONNECT:
+ return RTPROT_KERNEL;
+
+ default:
+ return RTPROT_ZEBRA;
+ }
+}
+
+/*
+ * netlink_route_info_fill
+ *
+ * Fill out the route information object from the given route.
+ *
+ * Returns TRUE on success and FALSE on failure.
+ */
+static int
+netlink_route_info_fill (netlink_route_info_t *ri, int cmd,
+ rib_dest_t *dest, struct rib *rib)
+{
+ struct nexthop *nexthop = NULL;
+ int discard;
+
+ memset (ri, 0, sizeof (*ri));
+
+ ri->prefix = rib_dest_prefix (dest);
+ ri->af = rib_dest_af (dest);
+
+ ri->nlmsg_type = cmd;
+ ri->rtm_table = rib_dest_vrf (dest)->id;
+ ri->rtm_protocol = RTPROT_UNSPEC;
+
+ /*
+ * An RTM_DELROUTE need not be accompanied by any nexthops,
+ * particularly in our communication with the FPM.
+ */
+ if (cmd == RTM_DELROUTE && !rib)
+ goto skip;
+
+  /*
+   * A non-NULL rib is required beyond this point: RTM_NEWROUTE always
+   * comes with one, and the RTM_DELROUTE case without a rib was
+   * handled above.
+   */
+  assert (rib);
+
+  ri->rtm_protocol = netlink_proto_from_route_type (rib->type);
+
+  if ((rib->flags & ZEBRA_FLAG_BLACKHOLE) || (rib->flags & ZEBRA_FLAG_REJECT))
+ discard = 1;
+ else
+ discard = 0;
+
+ if (cmd == RTM_NEWROUTE)
+ {
+ if (discard)
+ {
+ if (rib->flags & ZEBRA_FLAG_BLACKHOLE)
+ ri->rtm_type = RTN_BLACKHOLE;
+ else if (rib->flags & ZEBRA_FLAG_REJECT)
+ ri->rtm_type = RTN_UNREACHABLE;
+ else
+ assert (0);
+ }
+ else
+ ri->rtm_type = RTN_UNICAST;
+ }
+
+ ri->metric = &rib->metric;
+
+ if (discard)
+ {
+ goto skip;
+ }
+
+  /*
+   * Use a single nexthop if only one is active or if multipath is
+   * disabled; the 'else' arm below handles the multipath case.
+   */
+  if (rib->nexthop_active_num == 1 || MULTIPATH_NUM == 1)
+ {
+ for (nexthop = rib->nexthop; nexthop; nexthop = nexthop->next)
+ {
+
+ if ((cmd == RTM_NEWROUTE
+ && CHECK_FLAG (nexthop->flags, NEXTHOP_FLAG_ACTIVE))
+ || (cmd == RTM_DELROUTE
+ && CHECK_FLAG (nexthop->flags, NEXTHOP_FLAG_FIB)))
+ {
+ netlink_route_info_add_nh (ri, nexthop);
+ break;
+ }
+ }
+ }
+ else
+ {
+ for (nexthop = rib->nexthop;
+ nexthop && (MULTIPATH_NUM == 0 || ri->num_nhs < MULTIPATH_NUM);
+ nexthop = nexthop->next)
+ {
+ if ((cmd == RTM_NEWROUTE
+ && CHECK_FLAG (nexthop->flags, NEXTHOP_FLAG_ACTIVE))
+ || (cmd == RTM_DELROUTE
+ && CHECK_FLAG (nexthop->flags, NEXTHOP_FLAG_FIB)))
+ {
+ netlink_route_info_add_nh (ri, nexthop);
+ }
+ }
+ }
+
+ /* If there is no useful nexthop then return. */
+ if (ri->num_nhs == 0)
+ {
+ zfpm_debug ("netlink_encode_route(): No useful nexthop.");
+ return 0;
+ }
+
+ skip:
+ return 1;
+}
+
+/*
+ * netlink_route_info_encode
+ *
+ * Returns the number of bytes written to the buffer. 0 or a negative
+ * value indicates an error.
+ */
+static int
+netlink_route_info_encode (netlink_route_info_t *ri, char *in_buf,
+ size_t in_buf_len)
+{
+ int bytelen;
+ int nexthop_num = 0;
+ size_t buf_offset;
+ netlink_nh_info_t *nhi;
+
+ struct
+ {
+ struct nlmsghdr n;
+ struct rtmsg r;
+ char buf[1];
+ } *req;
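+
+  /*
+   * 'buf' marks the start of the attribute space that follows the
+   * netlink and rtmsg headers. addattr_l() and friends append
+   * attributes there, growing n.nlmsg_len as they go.
+   */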
+
+ req = (void *) in_buf;
+
+ buf_offset = ((char *) req->buf) - ((char *) req);
+
+ if (in_buf_len < buf_offset) {
+ assert(0);
+ return 0;
+ }
+
+ memset (req, 0, buf_offset);
+
+ bytelen = af_addr_size (ri->af);
+
+ req->n.nlmsg_len = NLMSG_LENGTH (sizeof (struct rtmsg));
+ req->n.nlmsg_flags = NLM_F_CREATE | NLM_F_REQUEST;
+ req->n.nlmsg_type = ri->nlmsg_type;
+ req->r.rtm_family = ri->af;
+ req->r.rtm_table = ri->rtm_table;
+ req->r.rtm_dst_len = ri->prefix->prefixlen;
+ req->r.rtm_protocol = ri->rtm_protocol;
+ req->r.rtm_scope = RT_SCOPE_UNIVERSE;
+
+ addattr_l (&req->n, in_buf_len, RTA_DST, &ri->prefix->u.prefix, bytelen);
+
+ req->r.rtm_type = ri->rtm_type;
+
+ /* Metric. */
+ if (ri->metric)
+ addattr32 (&req->n, in_buf_len, RTA_PRIORITY, *ri->metric);
+
+ if (ri->num_nhs == 0)
+ goto done;
+
+ if (ri->num_nhs == 1)
+ {
+ nhi = &ri->nhs[0];
+
+ if (nhi->gateway)
+ {
+ addattr_l (&req->n, in_buf_len, RTA_GATEWAY, nhi->gateway,
+ bytelen);
+ }
+
+ if (nhi->if_index)
+ {
+ addattr32 (&req->n, in_buf_len, RTA_OIF, nhi->if_index);
+ }
+
+ goto done;
+
+ }
+
+ /*
+ * Multipath case.
+ */
+ char buf[NL_PKT_BUF_SIZE];
+ struct rtattr *rta = (void *) buf;
+ struct rtnexthop *rtnh;
+
+ rta->rta_type = RTA_MULTIPATH;
+ rta->rta_len = RTA_LENGTH (0);
+ rtnh = RTA_DATA (rta);
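+
+  /*
+   * RTA_MULTIPATH carries an array of struct rtnexthop entries, each
+   * optionally followed by nested attributes such as RTA_GATEWAY;
+   * rtnh_len must cover the rtnexthop itself plus any nested
+   * attributes.
+   */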
+
+ for (nexthop_num = 0; nexthop_num < ri->num_nhs; nexthop_num++)
+ {
+ nhi = &ri->nhs[nexthop_num];
+
+ rtnh->rtnh_len = sizeof (*rtnh);
+ rtnh->rtnh_flags = 0;
+ rtnh->rtnh_hops = 0;
+ rtnh->rtnh_ifindex = 0;
+ rta->rta_len += rtnh->rtnh_len;
+
+ if (nhi->gateway)
+ {
+ rta_addattr_l (rta, sizeof (buf), RTA_GATEWAY, nhi->gateway, bytelen);
+ rtnh->rtnh_len += sizeof (struct rtattr) + bytelen;
+ }
+
+ if (nhi->if_index)
+ {
+ rtnh->rtnh_ifindex = nhi->if_index;
+ }
+
+ rtnh = RTNH_NEXT (rtnh);
+ }
+
+ assert (rta->rta_len > RTA_LENGTH (0));
+ addattr_l (&req->n, in_buf_len, RTA_MULTIPATH, RTA_DATA (rta),
+ RTA_PAYLOAD (rta));
+
+done:
+
+ if (ri->pref_src)
+ {
+ addattr_l (&req->n, in_buf_len, RTA_PREFSRC, &ri->pref_src, bytelen);
+ }
+
+ assert (req->n.nlmsg_len < in_buf_len);
+ return req->n.nlmsg_len;
+}
+
+/*
+ * zfpm_log_route_info
+ *
+ * Helper function to log the information in a route_info structure.
+ */
+static void
+zfpm_log_route_info (netlink_route_info_t *ri, const char *label)
+{
+ netlink_nh_info_t *nhi;
+ int i;
+
+ zfpm_debug ("%s : %s %s/%d, Proto: %s, Metric: %u", label,
+ nl_msg_type_to_str (ri->nlmsg_type),
+ prefix_addr_to_a (ri->prefix), ri->prefix->prefixlen,
+ nl_rtproto_to_str (ri->rtm_protocol),
+ ri->metric ? *ri->metric : 0);
+
+ for (i = 0; i < ri->num_nhs; i++)
+ {
+ nhi = &ri->nhs[i];
+ zfpm_debug(" Intf: %u, Gateway: %s, Recursive: %s, Type: %s",
+ nhi->if_index, addr_to_a (ri->af, nhi->gateway),
+ nhi->recursive ? "yes" : "no",
+ nexthop_type_to_str (nhi->type));
+ }
+}
+
+/*
+ * zfpm_netlink_encode_route
+ *
+ * Create a netlink message corresponding to the given route in the
+ * given buffer space.
+ *
+ * Returns the number of bytes written to the buffer. 0 or a negative
+ * value indicates an error.
+ */
+int
+zfpm_netlink_encode_route (int cmd, rib_dest_t *dest, struct rib *rib,
+ char *in_buf, size_t in_buf_len)
+{
+ netlink_route_info_t ri_space, *ri;
+
+ ri = &ri_space;
+
+ if (!netlink_route_info_fill (ri, cmd, dest, rib))
+ return 0;
+
+ zfpm_log_route_info (ri, __FUNCTION__);
+
+ return netlink_route_info_encode (ri, in_buf, in_buf_len);
+}