diff options
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/event.c | 85 | ||||
| -rw-r--r-- | lib/frr_pthread.c | 8 | ||||
| -rw-r--r-- | lib/frrcu.c | 15 | ||||
| -rw-r--r-- | lib/frrevent.h | 19 | ||||
| -rw-r--r-- | lib/sigevent.h | 27 | ||||
| -rw-r--r-- | lib/zlog_5424.c | 12 |
6 files changed, 132 insertions, 34 deletions
diff --git a/lib/event.c b/lib/event.c index 6081ba4727..8c09debf91 100644 --- a/lib/event.c +++ b/lib/event.c @@ -571,6 +571,11 @@ struct event_loop *event_master_create(const char *name) rv->spin = true; rv->handle_signals = true; + /* tardy event warnings */ + monotime(&rv->last_tardy_warning); + rv->last_tardy_warning.tv_sec -= (TARDY_WARNING_INTERVAL + TIMER_SECOND_MICRO - 1) / + TIMER_SECOND_MICRO; + /* Set pthread owner, should be updated by actual owner */ rv->owner = pthread_self(); rv->cancel_req = list_new(); @@ -770,13 +775,13 @@ static struct event *thread_get(struct event_loop *m, uint8_t type, thread->master = m; thread->arg = arg; thread->yield = EVENT_YIELD_TIME_SLOT; /* default */ + thread->tardy_threshold = 0; /* thread->ref is zeroed either by XCALLOC above or by memset before * being put on the "unuse" list by thread_add_unuse(). * Setting it here again makes coverity complain about a missing * lock :( */ /* thread->ref = NULL; */ - thread->ignore_timer_late = false; /* * So if the passed in funcname is not what we have @@ -1023,6 +1028,8 @@ static void _event_add_timer_timeval(const struct xref_eventsched *xref, return; thread = thread_get(m, EVENT_TIMER, func, arg, xref); + /* default lateness warning: 4s */ + thread->tardy_threshold = TARDY_DEFAULT_THRESHOLD; frr_with_mutex (&thread->mtx) { thread->u.sands = t; @@ -1685,34 +1692,12 @@ static void thread_process_io(struct event_loop *m, unsigned int num) static unsigned int thread_process_timers(struct event_loop *m, struct timeval *timenow) { - struct timeval prev = *timenow; - bool displayed = false; struct event *thread; unsigned int ready = 0; while ((thread = event_timer_list_first(&m->timer))) { if (timercmp(timenow, &thread->u.sands, <)) break; - prev = thread->u.sands; - prev.tv_sec += 4; - /* - * If the timer would have popped 4 seconds in the - * past then we are in a situation where we are - * really getting behind on handling of events. - * Let's log it and do the right thing with it. - */ - if (timercmp(timenow, &prev, >)) { - atomic_fetch_add_explicit( - &thread->hist->total_starv_warn, 1, - memory_order_seq_cst); - if (!displayed && !thread->ignore_timer_late) { - flog_warn( - EC_LIB_STARVE_THREAD, - "Thread Starvation: %pTHD was scheduled to pop greater than 4s ago", - thread); - displayed = true; - } - } event_timer_list_pop(&m->timer); thread->type = EVENT_READY; @@ -1946,6 +1931,29 @@ void event_getrusage(RUSAGE_T *r) #endif } +static void event_tardy_warn(struct event *thread, unsigned long since_us) +{ + char buf[64]; + struct fbuf fb = { .buf = buf, .pos = buf, .len = sizeof(buf) }; + double loadavg[3]; + int rv; + + rv = getloadavg(loadavg, array_size(loadavg)); + if (rv < 0) + bprintfrr(&fb, "not available"); + else { + for (int i = 0; i < rv; i++) { + bprintfrr(&fb, "%.2f", loadavg[i]); + if (i < rv - 1) + bputs(&fb, ", "); + } + } + + flog_warn(EC_LIB_STARVE_THREAD, + "CPU starvation: %pTHD getting executed %lums late, warning threshold %lums. System load: %pFB", + thread, (since_us + 999) / 1000, (thread->tardy_threshold + 999) / 1000, &fb); +} + /* * Call a thread. * @@ -1962,6 +1970,33 @@ void event_call(struct event *thread) RUSAGE_T before, after; bool suppress_warnings = EVENT_ARG(thread); + if (thread->tardy_threshold) { + int64_t timer_late_us = monotime_since(&thread->u.sands, NULL); + + /* Timers have a tardiness warning defaulting to 4s. + * It can be customized with event_set_tardy_threshold() + * (bfdd does that since the protocol has really short timers) + * + * If we are more than that threshold late, print a warning + * since we're running behind in calling timers (probably due + * to high system load.) + */ + if (timer_late_us > (int64_t)thread->tardy_threshold) { + int64_t since_last_warning; + struct timeval *tw; + + atomic_fetch_add_explicit(&thread->hist->total_starv_warn, 1, + memory_order_seq_cst); + + tw = &thread->master->last_tardy_warning; + since_last_warning = monotime_since(tw, NULL); + if (since_last_warning > TARDY_WARNING_INTERVAL) { + event_tardy_warn(thread, timer_late_us); + monotime(tw); + } + } + } + /* if the thread being called is the CLI, it may change cputime_enabled * ("service cputime-stats" command), which can result in nonsensical * and very confusing warnings @@ -2145,9 +2180,9 @@ static ssize_t printfrr_thread_dbg(struct fbuf *buf, struct printfrr_eargs *ea, char info[16] = ""; if (!thread) - return bputs(buf, "{(thread *)NULL}"); + return bputs(buf, "{(event *)NULL}"); - rv += bprintfrr(buf, "{(thread *)%p arg=%p", thread, thread->arg); + rv += bprintfrr(buf, "{(event *)%p arg=%p", thread, thread->arg); if (thread->type < array_size(types) && types[thread->type]) rv += bprintfrr(buf, " %-6s", types[thread->type]); diff --git a/lib/frr_pthread.c b/lib/frr_pthread.c index 3a4bc712fc..0b4d7c77ae 100644 --- a/lib/frr_pthread.c +++ b/lib/frr_pthread.c @@ -20,6 +20,7 @@ #include "zlog.h" #include "libfrr.h" #include "libfrr_trace.h" +#include "sigevent.h" DEFINE_MTYPE_STATIC(LIB, FRR_PTHREAD, "FRR POSIX Thread"); DEFINE_MTYPE_STATIC(LIB, PTHREAD_PRIM, "POSIX sync primitives"); @@ -185,10 +186,9 @@ int frr_pthread_run(struct frr_pthread *fpt, const pthread_attr_t *attr) assert(frr_is_after_fork || !"trying to start thread before fork()"); - /* Ensure we never handle signals on a background thread by blocking - * everything here (new thread inherits signal mask) - */ - sigfillset(&blocksigs); + sigemptyset(&blocksigs); + frr_sigset_add_mainonly(&blocksigs); + /* new thread inherits mask */ pthread_sigmask(SIG_BLOCK, &blocksigs, &oldsigs); frrtrace(1, frr_libfrr, frr_pthread_run, fpt->name); diff --git a/lib/frrcu.c b/lib/frrcu.c index b85c525c58..1e7ed99eff 100644 --- a/lib/frrcu.c +++ b/lib/frrcu.c @@ -42,6 +42,7 @@ #include "frrcu.h" #include "seqlock.h" #include "atomlist.h" +#include "sigevent.h" DEFINE_MTYPE_STATIC(LIB, RCU_THREAD, "RCU thread"); DEFINE_MTYPE_STATIC(LIB, RCU_NEXT, "RCU sequence barrier"); @@ -346,7 +347,19 @@ static void rcu_start(void) */ sigset_t oldsigs, blocksigs; - sigfillset(&blocksigs); + /* technically, the RCU thread is very poorly suited to run even just a + * crashlog handler, since zlog_sigsafe() could deadlock on transiently + * invalid (due to RCU) logging data structures + * + * but given that when we try to write a crashlog, we're already in + * b0rked territory anyway - give the crashlog handler a chance. + * + * (also cf. the SIGALRM usage in writing crashlogs to avoid hung + * processes on any kind of deadlock in crash handlers) + */ + sigemptyset(&blocksigs); + frr_sigset_add_mainonly(&blocksigs); + /* new thread inherits mask */ pthread_sigmask(SIG_BLOCK, &blocksigs, &oldsigs); rcu_active = true; diff --git a/lib/frrevent.h b/lib/frrevent.h index c35b39a147..61baf7919c 100644 --- a/lib/frrevent.h +++ b/lib/frrevent.h @@ -95,6 +95,7 @@ struct event_loop { bool ready_run_loop; RUSAGE_T last_getrusage; + struct timeval last_tardy_warning; }; /* Event types. */ @@ -126,11 +127,17 @@ struct event { struct timeval real; struct cpu_event_history *hist; /* cache pointer to cpu_history */ unsigned long yield; /* yield time in microseconds */ + /* lateness warning threshold, usec. 0 if it's not a timer. */ + unsigned long tardy_threshold; const struct xref_eventsched *xref; /* origin location */ pthread_mutex_t mtx; /* mutex for thread.c functions */ - bool ignore_timer_late; }; +/* rate limit late timer warnings */ +#define TARDY_WARNING_INTERVAL 10 * TIMER_SECOND_MICRO +/* default threshold for late timer warning */ +#define TARDY_DEFAULT_THRESHOLD 4 * TIMER_SECOND_MICRO + #ifdef _FRR_ATTRIBUTE_PRINTFRR #pragma FRR printfrr_ext "%pTH"(struct event *) #endif @@ -305,9 +312,17 @@ static inline bool event_is_scheduled(struct event *thread) /* Debug signal mask */ void debug_signals(const sigset_t *sigs); +/* getting called more than given microseconds late will print a warning. + * Default if not called: 4s. Don't call this on non-timers. + */ +static inline void event_set_tardy_threshold(struct event *event, unsigned long thres) +{ + event->tardy_threshold = thres; +} + static inline void event_ignore_late_timer(struct event *event) { - event->ignore_timer_late = true; + event->tardy_threshold = 0; } #ifdef __cplusplus diff --git a/lib/sigevent.h b/lib/sigevent.h index 0b07f594c1..2c51ba3767 100644 --- a/lib/sigevent.h +++ b/lib/sigevent.h @@ -45,6 +45,33 @@ bool frr_sigevent_check(sigset_t *setp); /* check whether there are signals to handle, process any found */ extern int frr_sigevent_process(void); +/* Ensure we don't handle "application-type" signals on a secondary thread by + * blocking these signals when creating threads + * + * NB: SIGSEGV, SIGABRT, etc. must be allowed on all threads or we get no + * crashlogs. Since signals vary a little bit between platforms, below is a + * list of known things to go to the main thread. Any unknown signals should + * stay thread-local. + */ +static inline void frr_sigset_add_mainonly(sigset_t *blocksigs) +{ + /* signals we actively handle */ + sigaddset(blocksigs, SIGHUP); + sigaddset(blocksigs, SIGINT); + sigaddset(blocksigs, SIGTERM); + sigaddset(blocksigs, SIGUSR1); + + /* signals we don't actively use but that semantically belong */ + sigaddset(blocksigs, SIGUSR2); + sigaddset(blocksigs, SIGQUIT); + sigaddset(blocksigs, SIGCHLD); + sigaddset(blocksigs, SIGPIPE); + sigaddset(blocksigs, SIGTSTP); + sigaddset(blocksigs, SIGTTIN); + sigaddset(blocksigs, SIGTTOU); + sigaddset(blocksigs, SIGWINCH); +} + #ifdef __cplusplus } #endif diff --git a/lib/zlog_5424.c b/lib/zlog_5424.c index 4c60d4b405..6265ce3b1f 100644 --- a/lib/zlog_5424.c +++ b/lib/zlog_5424.c @@ -782,7 +782,7 @@ static void zlog_5424_cycle(struct zlog_cfg_5424 *zcf, int fd) } old = zcf->active ? &zcf->active->zt : NULL; - old = zlog_target_replace(old, &zlt->zt); + old = zlog_target_replace(old, zlt ? &zlt->zt : NULL); zcf->active = zlt; /* oldt->fd == fd happens for zlog_5424_apply_meta() */ @@ -1076,9 +1076,17 @@ bool zlog_5424_apply_dst(struct zlog_cfg_5424 *zcf) bool zlog_5424_apply_meta(struct zlog_cfg_5424 *zcf) { + int fd; + frr_with_mutex (&zcf->cfg_mtx) { if (zcf->active) - zlog_5424_cycle(zcf, zcf->active->fd); + fd = zcf->active->fd; + else if (zcf->prio_min != ZLOG_DISABLED) + fd = zlog_5424_open(zcf, -1); + else + fd = -1; + if (fd >= 0) + zlog_5424_cycle(zcf, fd); } return true; |
