summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Patrick Ruddy <pat@voltanet.io> 2021-04-14 18:33:18 +0100
committer: GitHub <noreply@github.com> 2021-04-14 18:33:18 +0100
commit: 5bb91468c55c17405d112ffe91ba01594583c2e6 (patch)
tree: 10edf4414da92a455cdbc49d43d4f58a07bab3d6
parent: 8a0a716f83c1eecafb8d3a3c7cbf0b153b8c4280 (diff)
parent: 9b8e01cae44cf6cb3a1a08c483ab0deceda21375 (diff)
Merge pull request #8003 from donaldsharp/timings
lib: Differentiate between real and cpu bound processes
-rw-r--r--  lib/lib_errors.c  12
-rw-r--r--  lib/lib_errors.h   3
-rw-r--r--  lib/thread.c      45
-rw-r--r--  lib/thread.h       2
-rw-r--r--  lib/vty.c         14
5 files changed, 58 insertions, 18 deletions
diff --git a/lib/lib_errors.c b/lib/lib_errors.c
index 6e5088142a..17695e6607 100644
--- a/lib/lib_errors.c
+++ b/lib/lib_errors.c
@@ -45,9 +45,15 @@ static struct log_ref ferr_lib_warn[] = {
.suggestion = "Gather log data and open an Issue. restart FRR",
},
{
- .code = EC_LIB_SLOW_THREAD,
- .title = "The Event subsystem has detected a slow process",
- .description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner. This can be either a misconfiguration, bug, or some combination therof.",
+ .code = EC_LIB_SLOW_THREAD_CPU,
+ .title = "The Event subsystem has detected a slow cpu time process",
+ .description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner. This can be either a misconfiguration, bug, or some combination therof. In this case total CPU time was over 5 seconds. Which indicates that FRR is very busy doing some work and should be addressed",
+ .suggestion = "Gather log data and open an Issue",
+ },
+ {
+ .code = EC_LIB_SLOW_THREAD_WALL,
+ .title = "The Event subsystem has detected a slow wall time process",
+ .description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner. This can be either a misconfiguration, bug or some combination therof. In this case total WALL time was over 5 seconds. Which indicates that FRR might be having trouble being scheduled or some system call is delaying",
.suggestion = "Gather log data and open an Issue",
},
{
diff --git a/lib/lib_errors.h b/lib/lib_errors.h
index 4730b6aa33..9f0f58d20b 100644
--- a/lib/lib_errors.h
+++ b/lib/lib_errors.h
@@ -44,7 +44,8 @@ enum lib_log_refs {
EC_LIB_SNMP,
EC_LIB_STREAM,
EC_LIB_LINUX_NS,
- EC_LIB_SLOW_THREAD,
+ EC_LIB_SLOW_THREAD_CPU,
+ EC_LIB_SLOW_THREAD_WALL,
EC_LIB_NO_THREAD,
EC_LIB_RMAP_RECURSION_LIMIT,
EC_LIB_BACKUP_CONFIG,
diff --git a/lib/thread.c b/lib/thread.c
index 866090341e..567516300d 100644
--- a/lib/thread.c
+++ b/lib/thread.c
@@ -124,11 +124,12 @@ static void cpu_record_hash_free(void *a)
static void vty_out_cpu_thread_history(struct vty *vty,
struct cpu_thread_history *a)
{
- vty_out(vty, "%5zu %10zu.%03zu %9zu %8zu %9zu %8zu %9zu",
+ vty_out(vty, "%5zu %10zu.%03zu %9zu %8zu %9zu %8zu %9zu %9zu %9zu",
a->total_active, a->cpu.total / 1000, a->cpu.total % 1000,
- a->total_calls, (a->cpu.total / a->total_calls), a->cpu.max,
- (a->real.total / a->total_calls), a->real.max);
- vty_out(vty, " %c%c%c%c%c %s\n",
+ a->total_calls, (a->cpu.total / a->total_calls), a->cpu.max,
+ (a->real.total / a->total_calls), a->real.max,
+ a->total_cpu_warn, a->total_wall_warn);
+ vty_out(vty, " %c%c%c%c%c %s\n",
a->types & (1 << THREAD_READ) ? 'R' : ' ',
a->types & (1 << THREAD_WRITE) ? 'W' : ' ',
a->types & (1 << THREAD_TIMER) ? 'T' : ' ',
@@ -149,6 +150,10 @@ static void cpu_record_hash_print(struct hash_bucket *bucket, void *args[])
atomic_load_explicit(&a->total_active, memory_order_seq_cst);
copy.total_calls =
atomic_load_explicit(&a->total_calls, memory_order_seq_cst);
+ copy.total_cpu_warn =
+ atomic_load_explicit(&a->total_cpu_warn, memory_order_seq_cst);
+ copy.total_wall_warn =
+ atomic_load_explicit(&a->total_wall_warn, memory_order_seq_cst);
copy.cpu.total =
atomic_load_explicit(&a->cpu.total, memory_order_seq_cst);
copy.cpu.max = atomic_load_explicit(&a->cpu.max, memory_order_seq_cst);
@@ -165,6 +170,8 @@ static void cpu_record_hash_print(struct hash_bucket *bucket, void *args[])
vty_out_cpu_thread_history(vty, &copy);
totals->total_active += copy.total_active;
totals->total_calls += copy.total_calls;
+ totals->total_cpu_warn += copy.total_cpu_warn;
+ totals->total_wall_warn += copy.total_wall_warn;
totals->real.total += copy.real.total;
if (totals->real.max < copy.real.max)
totals->real.max = copy.real.max;
@@ -202,7 +209,7 @@ static void cpu_record_print(struct vty *vty, uint8_t filter)
vty_out(vty,
"Active Runtime(ms) Invoked Avg uSec Max uSecs");
vty_out(vty, " Avg uSec Max uSecs");
- vty_out(vty, " Type Thread\n");
+ vty_out(vty, " CPU_Warn Wall_Warn Type Thread\n");
if (m->cpu_record->count)
hash_iterate(
@@ -223,7 +230,7 @@ static void cpu_record_print(struct vty *vty, uint8_t filter)
vty_out(vty, "%30s %18s %18s\n", "",
"CPU (user+system):", "Real (wall-clock):");
vty_out(vty, "Active Runtime(ms) Invoked Avg uSec Max uSecs");
- vty_out(vty, " Avg uSec Max uSecs");
+ vty_out(vty, " Avg uSec Max uSecs CPU_Warn Wall_Warn");
vty_out(vty, " Type Thread\n");
if (tmp.total_calls > 0)
@@ -1850,15 +1857,33 @@ void thread_call(struct thread *thread)
memory_order_seq_cst);
#ifdef CONSUMED_TIME_CHECK
- if (realtime > CONSUMED_TIME_CHECK) {
+ if (cputime > CONSUMED_TIME_CHECK) {
/*
- * We have a CPU Hog on our hands.
+ * We have a CPU Hog on our hands. The time FRR
+ * has spent doing actual work ( not sleeping )
+ * is greater than 5 seconds.
* Whinge about it now, so we're aware this is yet another task
* to fix.
*/
+ atomic_fetch_add_explicit(&thread->hist->total_cpu_warn,
+ 1, memory_order_seq_cst);
flog_warn(
- EC_LIB_SLOW_THREAD,
- "SLOW THREAD: task %s (%lx) ran for %lums (cpu time %lums)",
+ EC_LIB_SLOW_THREAD_CPU,
+ "CPU HOG: task %s (%lx) ran for %lums (cpu time %lums)",
+ thread->xref->funcname, (unsigned long)thread->func,
+ realtime / 1000, cputime / 1000);
+ } else if (realtime > CONSUMED_TIME_CHECK) {
+ /*
+ * The runtime for a task is greater than 5 seconds, but
+ * the cpu time is under 5 seconds. Let's whine
+ * about this because this could imply some sort of
+ * scheduling issue.
+ */
+ atomic_fetch_add_explicit(&thread->hist->total_wall_warn,
+ 1, memory_order_seq_cst);
+ flog_warn(
+ EC_LIB_SLOW_THREAD_WALL,
+ "STARVATION: task %s (%lx) ran for %lums (cpu time %lums)",
thread->xref->funcname, (unsigned long)thread->func,
realtime / 1000, cputime / 1000);
}
diff --git a/lib/thread.h b/lib/thread.h
index af68331131..fee728dbf9 100644
--- a/lib/thread.h
+++ b/lib/thread.h
@@ -119,6 +119,8 @@ struct thread {
struct cpu_thread_history {
int (*func)(struct thread *);
+ atomic_size_t total_cpu_warn;
+ atomic_size_t total_wall_warn;
atomic_size_t total_calls;
atomic_size_t total_active;
struct time_stats {
diff --git a/lib/vty.c b/lib/vty.c
index 96cfef1c0a..f92c912084 100644
--- a/lib/vty.c
+++ b/lib/vty.c
@@ -515,13 +515,19 @@ static int vty_command(struct vty *vty, char *buf)
#ifdef CONSUMED_TIME_CHECK
GETRUSAGE(&after);
- if ((realtime = thread_consumed_time(&after, &before, &cputime))
- > CONSUMED_TIME_CHECK)
+ realtime = thread_consumed_time(&after, &before, &cputime);
+ if (cputime > CONSUMED_TIME_CHECK) {
/* Warn about CPU hog that must be fixed. */
flog_warn(
- EC_LIB_SLOW_THREAD,
- "SLOW COMMAND: command took %lums (cpu time %lums): %s",
+ EC_LIB_SLOW_THREAD_CPU,
+ "CPU HOG: command took %lums (cpu time %lums): %s",
realtime / 1000, cputime / 1000, buf);
+ } else if (realtime > CONSUMED_TIME_CHECK) {
+ flog_warn(
+ EC_LIB_SLOW_THREAD_WALL,
+ "STARVATION: command took %lums (cpu time %lums): %s",
+ realtime / 1000, cputime / 1000, buf);
+ }
}
#endif /* CONSUMED_TIME_CHECK */