Browse Source

Merge branch 'dgoulet_ticket23753_032_02_squashed' into maint-0.3.2

Nick Mathewson 6 years ago
parent
commit
cb42c62c9e
4 changed files with 58 additions and 6 deletions
  1. 4 0
      changes/bug23753
  2. 41 1
      src/or/scheduler.c
  3. 8 0
      src/or/scheduler.h
  4. 5 5
      src/or/scheduler_kist.c

+ 4 - 0
changes/bug23753

@@ -0,0 +1,4 @@
+  o Minor features (logging, scheduler):
+    - Introduce a SCHED_BUG() function to log extra information about the
+      scheduler state if we ever catch a bug in the scheduler. Closes ticket
+      23753.

+ 41 - 1
src/or/scheduler.c

@@ -9,6 +9,9 @@
 #define SCHEDULER_KIST_PRIVATE
 #include "scheduler.h"
 #include "main.h"
+#include "buffers.h"
+#define TOR_CHANNEL_INTERNAL_
+#include "channeltls.h"
 
 #include <event2/event.h>
 
@@ -604,7 +607,7 @@ scheduler_release_channel,(channel_t *chan))
   }
 
   if (chan->scheduler_state == SCHED_CHAN_PENDING) {
-    if (smartlist_pos(channels_pending, chan) == -1) {
+    if (SCHED_BUG(smartlist_pos(channels_pending, chan) == -1, chan)) {
       log_warn(LD_SCHED, "Scheduler asked to release channel %" PRIu64 " "
                          "but it wasn't in channels_pending",
                chan->global_identifier);
@@ -667,6 +670,43 @@ scheduler_channel_wants_writes(channel_t *chan)
   }
 }
 
+/* Log warn the given channel and extra scheduler context as well. This is
+ * used by SCHED_BUG() in order to be able to extract as much information as
+ * we can when we hit a bug. Channel chan can be NULL. */
+void
+scheduler_bug_occurred(const channel_t *chan)
+{
+  char buf[128];
+
+  if (chan != NULL) {
+    const size_t outbuf_len =
+      buf_datalen(TO_CONN(BASE_CHAN_TO_TLS((channel_t *) chan)->conn)->outbuf);
+    tor_snprintf(buf, sizeof(buf),
+                 "Channel %" PRIu64 " in state %s and scheduler state %d."
+                 " Num cells on cmux: %d. Connection outbuf len: %lu.",
+                 chan->global_identifier,
+                 channel_state_to_string(chan->state),
+                 chan->scheduler_state, circuitmux_num_cells(chan->cmux),
+                 outbuf_len);
+  }
+
+  {
+    char *msg;
+    /* Rate limit every 60 seconds. If we start seeing this every 60 sec, we
+     * know something is stuck/wrong. It *should* be loud but not too much. */
+    static ratelim_t rlimit = RATELIM_INIT(60);
+    if ((msg = rate_limit_log(&rlimit, approx_time()))) {
+      log_warn(LD_BUG, "%s Num pending channels: %d. "
+                       "Channel in pending list: %s.%s",
+               (chan != NULL) ? buf : "No channel in bug context.",
+               smartlist_len(channels_pending),
+               (smartlist_pos(channels_pending, chan) == -1) ? "no" : "yes",
+               msg);
+      tor_free(msg);
+    }
+  }
+}
+
 #ifdef TOR_UNIT_TESTS
 
 /*

+ 8 - 0
src/or/scheduler.h

@@ -143,6 +143,14 @@ MOCK_DECL(void, scheduler_channel_has_waiting_cells, (channel_t *chan));
 /*********************************
  * Defined in scheduler.c
  *********************************/
+
+/* Triggers a BUG() and extra information with chan if available. */
+#define SCHED_BUG(cond, chan) \
+  (PREDICT_UNLIKELY(cond) ? \
+   ((BUG(cond)) ? (scheduler_bug_occurred(chan), 1) : 0) : 0)
+
+void scheduler_bug_occurred(const channel_t *chan);
+
 smartlist_t *get_channels_pending(void);
 MOCK_DECL(int, scheduler_compare_channels,
           (const void *c1_v, const void *c2_v));

+ 5 - 5
src/or/scheduler_kist.c

@@ -116,7 +116,7 @@ channel_outbuf_length(channel_t *chan)
   /* In theory, this can not happen because we can not scheduler a channel
    * without a connection that has its outbuf initialized. Just in case, bug
    * on this so we can understand a bit more why it happened. */
-  if (BUG(BASE_CHAN_TO_TLS(chan)->conn == NULL)) {
+  if (SCHED_BUG(BASE_CHAN_TO_TLS(chan)->conn == NULL, chan)) {
     return 0;
   }
   return buf_datalen(TO_CONN(BASE_CHAN_TO_TLS(chan)->conn)->outbuf);
@@ -371,7 +371,7 @@ socket_can_write(socket_table_t *table, const channel_t *chan)
 {
   socket_table_ent_t *ent = NULL;
   ent = socket_table_search(table, chan);
-  IF_BUG_ONCE(!ent) {
+  if (SCHED_BUG(!ent, chan)) {
     return 1; // Just return true, saying that kist wouldn't limit the socket
   }
 
@@ -391,7 +391,7 @@ update_socket_info(socket_table_t *table, const channel_t *chan)
 {
   socket_table_ent_t *ent = NULL;
   ent = socket_table_search(table, chan);
-  IF_BUG_ONCE(!ent) {
+  if (SCHED_BUG(!ent, chan)) {
     return; // Whelp. Entry didn't exist for some reason so nothing to do.
   }
   update_socket_info_impl(ent);
@@ -403,7 +403,7 @@ update_socket_written(socket_table_t *table, channel_t *chan, size_t bytes)
 {
   socket_table_ent_t *ent = NULL;
   ent = socket_table_search(table, chan);
-  IF_BUG_ONCE(!ent) {
+  if (SCHED_BUG(!ent, chan)) {
     return; // Whelp. Entry didn't exist so nothing to do.
   }
 
@@ -573,7 +573,7 @@ kist_scheduler_run(void)
     /* get best channel */
     chan = smartlist_pqueue_pop(cp, scheduler_compare_channels,
                                 offsetof(channel_t, sched_heap_idx));
-    IF_BUG_ONCE(!chan) {
+    if (SCHED_BUG(!chan, NULL)) {
       /* Some-freaking-how a NULL got into the channels_pending. That should
        * never happen, but it should be harmless to ignore it and keep looping.
        */