Browse Source

Merge branch 'bug12184_diagnostic_squashed'

Nick Mathewson 10 years ago
parent
commit
a58d94fb7c
8 changed files with 149 additions and 8 deletions
  1. 4 0
      changes/bug12184_diagnostic
  2. 63 7
      src/or/circuitbuild.c
  3. 28 1
      src/or/circuitlist.c
  4. 2 0
      src/or/circuitlist.h
  5. 35 0
      src/or/circuitmux.c
  6. 4 0
      src/or/circuitmux.h
  7. 11 0
      src/or/relay.c
  8. 2 0
      src/or/relay.h

+ 4 - 0
changes/bug12184_diagnostic

@@ -0,0 +1,4 @@
+  o Minor features (diagnostic):
+    - Add significant new logging code to attempt to diagnose bug 12184,
+      where relays seem to run out of available circuit IDs.
+

+ 63 - 7
src/or/circuitbuild.c

@@ -96,10 +96,12 @@ get_unique_circ_id_by_chan(channel_t *chan)
  * N=256. */
 #define MAX_CIRCID_ATTEMPTS 64
   int in_use;
-  unsigned n_with_circ = 0, n_pending_destroy = 0;
+  unsigned n_with_circ = 0, n_pending_destroy = 0, n_weird_pending_destroy = 0;
   circid_t test_circ_id;
   circid_t attempts=0;
   circid_t high_bit, max_range, mask;
+  int64_t pending_destroy_time_total = 0;
+  int64_t pending_destroy_time_max = 0;
 
   tor_assert(chan);
 
@@ -130,15 +132,57 @@ get_unique_circ_id_by_chan(channel_t *chan)
        * whole circuit ID space every time we extend a circuit, which is
        * not so great either.
        */
-      log_fn_ratelim(&chan->last_warned_circ_ids_exhausted, LOG_WARN,
-                 LD_CIRC,"No unused circIDs found on channel %s wide "
+      int64_t queued_destroys;
+      char *m = rate_limit_log(&chan->last_warned_circ_ids_exhausted,
+                               approx_time());
+      if (m == NULL)
+        return 0; /* This message has been rate-limited away. */
+      if (n_pending_destroy)
+        pending_destroy_time_total /= n_pending_destroy;
+      log_warn(LD_CIRC,"No unused circIDs found on channel %s wide "
                  "circID support, with %u inbound and %u outbound circuits. "
                  "Found %u circuit IDs in use by circuits, and %u with "
-                 "pending destroy cells."
-                 "Failing a circuit.",
+                 "pending destroy cells. (%u of those were marked bogusly.) "
+                 "The ones with pending destroy cells "
+                 "have been marked unusable for an average of %ld seconds "
+                 "and a maximum of %ld seconds. This channel is %ld seconds "
+                 "old. Failing a circuit.%s",
                  chan->wide_circ_ids ? "with" : "without",
                  chan->num_p_circuits, chan->num_n_circuits,
-                 n_with_circ, n_pending_destroy);
+                 n_with_circ, n_pending_destroy, n_weird_pending_destroy,
+                 (long)pending_destroy_time_total,
+                 (long)pending_destroy_time_max,
+                 (long)(approx_time() - chan->timestamp_created),
+                 m);
+      tor_free(m);
+
+      if (!chan->cmux) {
+        /* This warning should be impossible. */
+        log_warn(LD_BUG, "  This channel somehow has no cmux on it!");
+        return 0;
+      }
+
+      /* analysis so far on 12184 suggests that we're running out of circuit
+         IDs because it looks like we have too many pending destroy
+         cells. Let's see how many we really have pending.
+      */
+      queued_destroys = circuitmux_count_queued_destroy_cells(chan,
+                                                              chan->cmux);
+
+      log_warn(LD_CIRC, "  Circuitmux on this channel has %u circuits, "
+               "of which %u are active. It says it has "I64_FORMAT
+               " destroy cells queued.",
+               circuitmux_num_circuits(chan->cmux),
+               circuitmux_num_active_circuits(chan->cmux),
+               I64_PRINTF_ARG(queued_destroys));
+
+      /* Change this into "if (1)" in order to get more information about
+       * possible failure modes here.  You'll need to know how to use gdb with
+       * Tor: this will make Tor exit with an assertion failure if the cmux is
+       * corrupt. */
+      if (0)
+        circuitmux_assert_okay(chan->cmux);
+
       return 0;
     }
 
@@ -152,8 +196,20 @@ get_unique_circ_id_by_chan(channel_t *chan)
     in_use = circuit_id_in_use_on_channel(test_circ_id, chan);
     if (in_use == 1)
       ++n_with_circ;
-    else if (in_use == 2)
+    else if (in_use == 2) {
+      time_t since_when;
       ++n_pending_destroy;
+      since_when =
+        circuit_id_when_marked_unusable_on_channel(test_circ_id, chan);
+      if (since_when) {
+        time_t waiting = approx_time() - since_when;
+        pending_destroy_time_total += waiting;
+        if (waiting > pending_destroy_time_max)
+          pending_destroy_time_max = waiting;
+      } else {
+        ++n_weird_pending_destroy;
+      }
+    }
   } while (in_use);
   return test_circ_id;
 }

+ 28 - 1
src/or/circuitlist.c

@@ -59,6 +59,8 @@ typedef struct chan_circid_circuit_map_t {
   channel_t *chan;
   circid_t circ_id;
   circuit_t *circuit;
+  /* For debugging 12184: when was this placeholder item added? */
+  time_t made_placeholder_at;
 } chan_circid_circuit_map_t;
 
 /** Helper for hash tables: compare the channel and circuit ID for a and
@@ -184,6 +186,7 @@ circuit_set_circid_chan_helper(circuit_t *circ, int direction,
   found = HT_FIND(chan_circid_map, &chan_circid_map, &search);
   if (found) {
     found->circuit = circ;
+    found->made_placeholder_at = 0;
   } else {
     found = tor_malloc_zero(sizeof(chan_circid_circuit_map_t));
     found->circ_id = id;
@@ -241,11 +244,14 @@ channel_mark_circid_unusable(channel_t *chan, circid_t id)
              "a circuit there.", (unsigned)id, chan);
   } else if (ent) {
     /* It's already marked. */
+    if (!ent->made_placeholder_at)
+      ent->made_placeholder_at = approx_time();
   } else {
     ent = tor_malloc_zero(sizeof(chan_circid_circuit_map_t));
     ent->chan = chan;
     ent->circ_id = id;
-    /* leave circuit at NULL */
+    /* leave circuit at NULL. */
+    ent->made_placeholder_at = approx_time();
     HT_INSERT(chan_circid_map, &chan_circid_map, ent);
   }
 }
@@ -1096,6 +1102,27 @@ circuit_id_in_use_on_channel(circid_t circ_id, channel_t *chan)
   return 0;
 }
 
+/** Helper for debugging 12184.  Returns the time since which 'circ_id' has
+ * been marked unusable on 'chan'. */
+time_t
+circuit_id_when_marked_unusable_on_channel(circid_t circ_id, channel_t *chan)
+{
+  chan_circid_circuit_map_t search;
+  chan_circid_circuit_map_t *found;
+
+  memset(&search, 0, sizeof(search));
+  search.circ_id = circ_id;
+  search.chan = chan;
+
+  found = HT_FIND(chan_circid_map, &chan_circid_map, &search);
+
+  if (! found || found->circuit)
+    return 0;
+
+  return found->made_placeholder_at;
+}
+
+
 /** Return the circuit that a given edge connection is using. */
 circuit_t *
 circuit_get_by_edge_conn(edge_connection_t *conn)

+ 2 - 0
src/or/circuitlist.h

@@ -28,6 +28,8 @@ void circuit_set_n_circid_chan(circuit_t *circ, circid_t id,
                                channel_t *chan);
 void channel_mark_circid_unusable(channel_t *chan, circid_t id);
 void channel_mark_circid_usable(channel_t *chan, circid_t id);
+time_t circuit_id_when_marked_unusable_on_channel(circid_t circ_id,
+                                                  channel_t *chan);
 void circuit_set_state(circuit_t *circ, uint8_t state);
 void circuit_close_all_marked(void);
 int32_t circuit_initial_package_window(void);

+ 35 - 0
src/or/circuitmux.c

@@ -1909,3 +1909,38 @@ circuitmux_append_destroy_cell(channel_t *chan,
   }
 }
 
+/*DOCDOC; for debugging 12184.  This runs slowly. */
+int64_t
+circuitmux_count_queued_destroy_cells(const channel_t *chan,
+                                      const circuitmux_t *cmux)
+{
+  int64_t n_destroy_cells = cmux->destroy_ctr;
+  int64_t destroy_queue_size = cmux->destroy_cell_queue.n;
+
+  int64_t manual_total = 0;
+  int64_t manual_total_in_map = 0;
+  packed_cell_t *cell;
+
+  TOR_SIMPLEQ_FOREACH(cell, &cmux->destroy_cell_queue.head, next) {
+    circid_t id;
+    ++manual_total;
+
+    id = packed_cell_get_circid(cell, chan->wide_circ_ids);
+    if (circuit_id_in_use_on_channel(id, (channel_t*)chan))
+      ++manual_total_in_map;
+  }
+
+  if (n_destroy_cells != destroy_queue_size ||
+      n_destroy_cells != manual_total ||
+      n_destroy_cells != manual_total_in_map) {
+    log_warn(LD_BUG, "  Discrepancy in counts for queued destroy cells on "
+             "circuitmux. n="I64_FORMAT". queue_size="I64_FORMAT". "
+             "manual_total="I64_FORMAT". manual_total_in_map="I64_FORMAT".",
+             I64_PRINTF_ARG(n_destroy_cells),
+             I64_PRINTF_ARG(destroy_queue_size),
+             I64_PRINTF_ARG(manual_total),
+             I64_PRINTF_ARG(manual_total_in_map));
+  }
+
+  return n_destroy_cells;
+}

+ 4 - 0
src/or/circuitmux.h

@@ -121,6 +121,10 @@ unsigned int circuitmux_num_cells(circuitmux_t *cmux);
 unsigned int circuitmux_num_circuits(circuitmux_t *cmux);
 unsigned int circuitmux_num_active_circuits(circuitmux_t *cmux);
 
+/* Debuging interface - slow. */
+int64_t circuitmux_count_queued_destroy_cells(const channel_t *chan,
+                                              const circuitmux_t *cmux);
+
 /* Channel interface */
 circuit_t * circuitmux_get_first_active_circuit(circuitmux_t *cmux,
                                            cell_queue_t **destroy_queue_out);

+ 11 - 0
src/or/relay.c

@@ -2569,6 +2569,17 @@ packed_cell_get_command(const packed_cell_t *cell, int wide_circ_ids)
   }
 }
 
+/** Extract the circuit ID from a packed cell. */
+circid_t
+packed_cell_get_circid(const packed_cell_t *cell, int wide_circ_ids)
+{
+  if (wide_circ_ids) {
+    return ntohl(get_uint32(cell->body));
+  } else {
+    return ntohs(get_uint16(cell->body));
+  }
+}
+
 /** Pull as many cells as possible (but no more than <b>max</b>) from the
  * queue of the first active circuit on <b>chan</b>, and write them to
  * <b>chan</b>-&gt;outbuf.  Return the number of cells written.  Advance

+ 2 - 0
src/or/relay.h

@@ -82,6 +82,8 @@ void stream_choice_seed_weak_rng(void);
 int relay_crypt(circuit_t *circ, cell_t *cell, cell_direction_t cell_direction,
                 crypt_path_t **layer_hint, char *recognized);
 
+circid_t packed_cell_get_circid(const packed_cell_t *cell, int wide_circ_ids);
+
 #ifdef RELAY_PRIVATE
 STATIC int connected_cell_parse(const relay_header_t *rh, const cell_t *cell,
                          tor_addr_t *addr_out, int *ttl_out);