summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r--lib/event.c85
-rw-r--r--lib/frr_pthread.c8
-rw-r--r--lib/frrcu.c15
-rw-r--r--lib/frrevent.h19
-rw-r--r--lib/sigevent.h27
-rw-r--r--lib/zlog_5424.c12
6 files changed, 132 insertions, 34 deletions
diff --git a/lib/event.c b/lib/event.c
index 6081ba4727..8c09debf91 100644
--- a/lib/event.c
+++ b/lib/event.c
@@ -571,6 +571,11 @@ struct event_loop *event_master_create(const char *name)
rv->spin = true;
rv->handle_signals = true;
+ /* tardy event warnings */
+ monotime(&rv->last_tardy_warning);
+ rv->last_tardy_warning.tv_sec -= (TARDY_WARNING_INTERVAL + TIMER_SECOND_MICRO - 1) /
+ TIMER_SECOND_MICRO;
+
/* Set pthread owner, should be updated by actual owner */
rv->owner = pthread_self();
rv->cancel_req = list_new();
@@ -770,13 +775,13 @@ static struct event *thread_get(struct event_loop *m, uint8_t type,
thread->master = m;
thread->arg = arg;
thread->yield = EVENT_YIELD_TIME_SLOT; /* default */
+ thread->tardy_threshold = 0;
/* thread->ref is zeroed either by XCALLOC above or by memset before
* being put on the "unuse" list by thread_add_unuse().
* Setting it here again makes coverity complain about a missing
* lock :(
*/
/* thread->ref = NULL; */
- thread->ignore_timer_late = false;
/*
* So if the passed in funcname is not what we have
@@ -1023,6 +1028,8 @@ static void _event_add_timer_timeval(const struct xref_eventsched *xref,
return;
thread = thread_get(m, EVENT_TIMER, func, arg, xref);
+ /* default lateness warning: 4s */
+ thread->tardy_threshold = TARDY_DEFAULT_THRESHOLD;
frr_with_mutex (&thread->mtx) {
thread->u.sands = t;
@@ -1685,34 +1692,12 @@ static void thread_process_io(struct event_loop *m, unsigned int num)
static unsigned int thread_process_timers(struct event_loop *m,
struct timeval *timenow)
{
- struct timeval prev = *timenow;
- bool displayed = false;
struct event *thread;
unsigned int ready = 0;
while ((thread = event_timer_list_first(&m->timer))) {
if (timercmp(timenow, &thread->u.sands, <))
break;
- prev = thread->u.sands;
- prev.tv_sec += 4;
- /*
- * If the timer would have popped 4 seconds in the
- * past then we are in a situation where we are
- * really getting behind on handling of events.
- * Let's log it and do the right thing with it.
- */
- if (timercmp(timenow, &prev, >)) {
- atomic_fetch_add_explicit(
- &thread->hist->total_starv_warn, 1,
- memory_order_seq_cst);
- if (!displayed && !thread->ignore_timer_late) {
- flog_warn(
- EC_LIB_STARVE_THREAD,
- "Thread Starvation: %pTHD was scheduled to pop greater than 4s ago",
- thread);
- displayed = true;
- }
- }
event_timer_list_pop(&m->timer);
thread->type = EVENT_READY;
@@ -1946,6 +1931,29 @@ void event_getrusage(RUSAGE_T *r)
#endif
}
+static void event_tardy_warn(struct event *thread, unsigned long since_us)
+{
+ char buf[64];
+ struct fbuf fb = { .buf = buf, .pos = buf, .len = sizeof(buf) };
+ double loadavg[3];
+ int rv;
+
+ rv = getloadavg(loadavg, array_size(loadavg));
+ if (rv < 0)
+ bprintfrr(&fb, "not available");
+ else {
+ for (int i = 0; i < rv; i++) {
+ bprintfrr(&fb, "%.2f", loadavg[i]);
+ if (i < rv - 1)
+ bputs(&fb, ", ");
+ }
+ }
+
+ flog_warn(EC_LIB_STARVE_THREAD,
+ "CPU starvation: %pTHD getting executed %lums late, warning threshold %lums. System load: %pFB",
+ thread, (since_us + 999) / 1000, (thread->tardy_threshold + 999) / 1000, &fb);
+}
+
/*
* Call a thread.
*
@@ -1962,6 +1970,33 @@ void event_call(struct event *thread)
RUSAGE_T before, after;
bool suppress_warnings = EVENT_ARG(thread);
+ if (thread->tardy_threshold) {
+ int64_t timer_late_us = monotime_since(&thread->u.sands, NULL);
+
+ /* Timers have a tardiness warning defaulting to 4s.
+ * It can be customized with event_set_tardy_threshold()
+ * (bfdd does that since the protocol has really short timers)
+ *
+ * If we are more than that threshold late, print a warning
+ * since we're running behind in calling timers (probably due
+ * to high system load.)
+ */
+ if (timer_late_us > (int64_t)thread->tardy_threshold) {
+ int64_t since_last_warning;
+ struct timeval *tw;
+
+ atomic_fetch_add_explicit(&thread->hist->total_starv_warn, 1,
+ memory_order_seq_cst);
+
+ tw = &thread->master->last_tardy_warning;
+ since_last_warning = monotime_since(tw, NULL);
+ if (since_last_warning > TARDY_WARNING_INTERVAL) {
+ event_tardy_warn(thread, timer_late_us);
+ monotime(tw);
+ }
+ }
+ }
+
/* if the thread being called is the CLI, it may change cputime_enabled
* ("service cputime-stats" command), which can result in nonsensical
* and very confusing warnings
@@ -2145,9 +2180,9 @@ static ssize_t printfrr_thread_dbg(struct fbuf *buf, struct printfrr_eargs *ea,
char info[16] = "";
if (!thread)
- return bputs(buf, "{(thread *)NULL}");
+ return bputs(buf, "{(event *)NULL}");
- rv += bprintfrr(buf, "{(thread *)%p arg=%p", thread, thread->arg);
+ rv += bprintfrr(buf, "{(event *)%p arg=%p", thread, thread->arg);
if (thread->type < array_size(types) && types[thread->type])
rv += bprintfrr(buf, " %-6s", types[thread->type]);
diff --git a/lib/frr_pthread.c b/lib/frr_pthread.c
index 3a4bc712fc..0b4d7c77ae 100644
--- a/lib/frr_pthread.c
+++ b/lib/frr_pthread.c
@@ -20,6 +20,7 @@
#include "zlog.h"
#include "libfrr.h"
#include "libfrr_trace.h"
+#include "sigevent.h"
DEFINE_MTYPE_STATIC(LIB, FRR_PTHREAD, "FRR POSIX Thread");
DEFINE_MTYPE_STATIC(LIB, PTHREAD_PRIM, "POSIX sync primitives");
@@ -185,10 +186,9 @@ int frr_pthread_run(struct frr_pthread *fpt, const pthread_attr_t *attr)
assert(frr_is_after_fork || !"trying to start thread before fork()");
- /* Ensure we never handle signals on a background thread by blocking
- * everything here (new thread inherits signal mask)
- */
- sigfillset(&blocksigs);
+ sigemptyset(&blocksigs);
+ frr_sigset_add_mainonly(&blocksigs);
+ /* new thread inherits mask */
pthread_sigmask(SIG_BLOCK, &blocksigs, &oldsigs);
frrtrace(1, frr_libfrr, frr_pthread_run, fpt->name);
diff --git a/lib/frrcu.c b/lib/frrcu.c
index b85c525c58..1e7ed99eff 100644
--- a/lib/frrcu.c
+++ b/lib/frrcu.c
@@ -42,6 +42,7 @@
#include "frrcu.h"
#include "seqlock.h"
#include "atomlist.h"
+#include "sigevent.h"
DEFINE_MTYPE_STATIC(LIB, RCU_THREAD, "RCU thread");
DEFINE_MTYPE_STATIC(LIB, RCU_NEXT, "RCU sequence barrier");
@@ -346,7 +347,19 @@ static void rcu_start(void)
*/
sigset_t oldsigs, blocksigs;
- sigfillset(&blocksigs);
+ /* technically, the RCU thread is very poorly suited to run even just a
+ * crashlog handler, since zlog_sigsafe() could deadlock on transiently
+ * invalid (due to RCU) logging data structures
+ *
+ * but given that when we try to write a crashlog, we're already in
+ * b0rked territory anyway - give the crashlog handler a chance.
+ *
+ * (also cf. the SIGALRM usage in writing crashlogs to avoid hung
+ * processes on any kind of deadlock in crash handlers)
+ */
+ sigemptyset(&blocksigs);
+ frr_sigset_add_mainonly(&blocksigs);
+ /* new thread inherits mask */
pthread_sigmask(SIG_BLOCK, &blocksigs, &oldsigs);
rcu_active = true;
diff --git a/lib/frrevent.h b/lib/frrevent.h
index c35b39a147..61baf7919c 100644
--- a/lib/frrevent.h
+++ b/lib/frrevent.h
@@ -95,6 +95,7 @@ struct event_loop {
bool ready_run_loop;
RUSAGE_T last_getrusage;
+ struct timeval last_tardy_warning;
};
/* Event types. */
@@ -126,11 +127,17 @@ struct event {
struct timeval real;
struct cpu_event_history *hist; /* cache pointer to cpu_history */
unsigned long yield; /* yield time in microseconds */
+ /* lateness warning threshold, usec. 0 if it's not a timer. */
+ unsigned long tardy_threshold;
const struct xref_eventsched *xref; /* origin location */
pthread_mutex_t mtx; /* mutex for thread.c functions */
- bool ignore_timer_late;
};
+/* rate limit late timer warnings */
+#define TARDY_WARNING_INTERVAL 10 * TIMER_SECOND_MICRO
+/* default threshold for late timer warning */
+#define TARDY_DEFAULT_THRESHOLD 4 * TIMER_SECOND_MICRO
+
#ifdef _FRR_ATTRIBUTE_PRINTFRR
#pragma FRR printfrr_ext "%pTH"(struct event *)
#endif
@@ -305,9 +312,17 @@ static inline bool event_is_scheduled(struct event *thread)
/* Debug signal mask */
void debug_signals(const sigset_t *sigs);
+/* getting called more than given microseconds late will print a warning.
+ * Default if not called: 4s. Don't call this on non-timers.
+ */
+static inline void event_set_tardy_threshold(struct event *event, unsigned long thres)
+{
+ event->tardy_threshold = thres;
+}
+
static inline void event_ignore_late_timer(struct event *event)
{
- event->ignore_timer_late = true;
+ event->tardy_threshold = 0;
}
#ifdef __cplusplus
diff --git a/lib/sigevent.h b/lib/sigevent.h
index 0b07f594c1..2c51ba3767 100644
--- a/lib/sigevent.h
+++ b/lib/sigevent.h
@@ -45,6 +45,33 @@ bool frr_sigevent_check(sigset_t *setp);
/* check whether there are signals to handle, process any found */
extern int frr_sigevent_process(void);
+/* Ensure we don't handle "application-type" signals on a secondary thread by
+ * blocking these signals when creating threads
+ *
+ * NB: SIGSEGV, SIGABRT, etc. must be allowed on all threads or we get no
+ * crashlogs. Since signals vary a little bit between platforms, below is a
+ * list of known things to go to the main thread. Any unknown signals should
+ * stay thread-local.
+ */
+static inline void frr_sigset_add_mainonly(sigset_t *blocksigs)
+{
+ /* signals we actively handle */
+ sigaddset(blocksigs, SIGHUP);
+ sigaddset(blocksigs, SIGINT);
+ sigaddset(blocksigs, SIGTERM);
+ sigaddset(blocksigs, SIGUSR1);
+
+ /* signals we don't actively use but that semantically belong */
+ sigaddset(blocksigs, SIGUSR2);
+ sigaddset(blocksigs, SIGQUIT);
+ sigaddset(blocksigs, SIGCHLD);
+ sigaddset(blocksigs, SIGPIPE);
+ sigaddset(blocksigs, SIGTSTP);
+ sigaddset(blocksigs, SIGTTIN);
+ sigaddset(blocksigs, SIGTTOU);
+ sigaddset(blocksigs, SIGWINCH);
+}
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/zlog_5424.c b/lib/zlog_5424.c
index 4c60d4b405..6265ce3b1f 100644
--- a/lib/zlog_5424.c
+++ b/lib/zlog_5424.c
@@ -782,7 +782,7 @@ static void zlog_5424_cycle(struct zlog_cfg_5424 *zcf, int fd)
}
old = zcf->active ? &zcf->active->zt : NULL;
- old = zlog_target_replace(old, &zlt->zt);
+ old = zlog_target_replace(old, zlt ? &zlt->zt : NULL);
zcf->active = zlt;
/* oldt->fd == fd happens for zlog_5424_apply_meta() */
@@ -1076,9 +1076,17 @@ bool zlog_5424_apply_dst(struct zlog_cfg_5424 *zcf)
bool zlog_5424_apply_meta(struct zlog_cfg_5424 *zcf)
{
+ int fd;
+
frr_with_mutex (&zcf->cfg_mtx) {
if (zcf->active)
- zlog_5424_cycle(zcf, zcf->active->fd);
+ fd = zcf->active->fd;
+ else if (zcf->prio_min != ZLOG_DISABLED)
+ fd = zlog_5424_open(zcf, -1);
+ else
+ fd = -1;
+ if (fd >= 0)
+ zlog_5424_cycle(zcf, fd);
}
return true;