Browse Source

Bug 3443: Don't count ORconn setup in circuit build time.

Also, add a hack Roger suggested where we're more patient if no circuits are
opened yet.
Mike Perry 11 years ago
parent
commit
42e3c04a7a
9 changed files with 157 additions and 33 deletions
  1. 11 0
      changes/bug3443
  2. 13 1
      src/or/circuitbuild.c
  3. 7 2
      src/or/circuitlist.c
  4. 5 5
      src/or/circuitstats.c
  5. 99 19
      src/or/circuituse.c
  6. 2 2
      src/or/control.c
  7. 1 1
      src/or/entrynodes.c
  8. 1 0
      src/or/entrynodes.h
  9. 18 3
      src/or/or.h

+ 11 - 0
changes/bug3443

@@ -0,0 +1,11 @@
+ o Minor bugfixes
+   - Alter circuit build timeout measurement to start at the point
+     where we begin the CREATE/CREATE_FAST step (as opposed to circuit
+     initialization). This should make our timeout measurements more
+     uniform. Previously, we were sometimes including ORconn setup time
+     in our circuit build time measurements. Fixes bug #3443.
+
+ o Minor features
+   - If we have no circuits open, use a relaxed timeout (the 95-percentile
+     cutoff) until a circuit succeeds. This should allow Tor to succeed
+     building circuits if the network connection drastically changes.

+ 13 - 1
src/or/circuitbuild.c

@@ -522,6 +522,18 @@ circuit_deliver_create_cell(circuit_t *circ, uint8_t cell_type,
                                CELL_DIRECTION_OUT, 0);
 
   if (CIRCUIT_IS_ORIGIN(circ)) {
+    /* Update began timestamp for circuits starting their first hop */
+    if (TO_ORIGIN_CIRCUIT(circ)->cpath->state == CPATH_STATE_CLOSED) {
+      if (circ->n_chan->state != CHANNEL_STATE_OPEN) {
+        log_warn(LD_CIRC,
+                 "Got first hop for a circuit without an opened channel. "
+                 "State: %s.", channel_state_to_string(circ->n_chan->state));
+        tor_fragile_assert();
+      }
+
+      tor_gettimeofday(&circ->timestamp_began);
+    }
+
     /* mark it so it gets better rate limiting treatment. */
     channel_timestamp_client(circ->n_chan);
   }
@@ -672,7 +684,7 @@ circuit_send_next_onion_skin(origin_circuit_t *circ)
         struct timeval end;
         long timediff;
         tor_gettimeofday(&end);
-        timediff = tv_mdiff(&circ->base_.timestamp_created, &end);
+        timediff = tv_mdiff(&circ->base_.timestamp_began, &end);
 
         /*
          * If the circuit build time is much greater than we would have cut

+ 7 - 2
src/or/circuitlist.c

@@ -555,6 +555,11 @@ init_circuit_base(circuit_t *circ)
 {
   tor_gettimeofday(&circ->timestamp_created);
 
+  // Gets reset when we send CREATE_FAST.
+  // circuit_expire_building() expects these to be equal
+  // until the orconn is built.
+  circ->timestamp_began = circ->timestamp_created;
+
   circ->package_window = circuit_initial_package_window();
   circ->deliver_window = CIRCWINDOW_START;
 
@@ -777,7 +782,7 @@ circuit_dump_conn_details(int severity,
       "state %d (%s), born %ld:",
       conn_array_index, type, this_circid, other_circid, circ->state,
       circuit_state_to_string(circ->state),
-      (long)circ->timestamp_created.tv_sec);
+      (long)circ->timestamp_began.tv_sec);
   if (CIRCUIT_IS_ORIGIN(circ)) { /* circ starts at this node */
     circuit_log_path(severity, LD_CIRC, TO_ORIGIN_CIRCUIT(circ));
   }
@@ -840,7 +845,7 @@ circuit_dump_chan_details(int severity,
       "state %d (%s), born %ld:",
       chan, type, this_circid, other_circid, circ->state,
       circuit_state_to_string(circ->state),
-      (long)circ->timestamp_created.tv_sec);
+      (long)circ->timestamp_began.tv_sec);
   if (CIRCUIT_IS_ORIGIN(circ)) { /* circ starts at this node */
     circuit_log_path(severity, LD_CIRC, TO_ORIGIN_CIRCUIT(circ));
   }

+ 5 - 5
src/or/circuitstats.c

@@ -1469,11 +1469,6 @@ circuit_build_times_set_timeout_worker(circuit_build_times_t *cbt)
 
   max_time = circuit_build_times_max(cbt);
 
-  /* Sometimes really fast guard nodes give us such a steep curve
-   * that this ends up being not that much greater than timeout_ms.
-   * Make it be at least 1 min to handle this case. */
-  cbt->close_ms = MAX(cbt->close_ms, circuit_build_times_initial_timeout());
-
   if (cbt->timeout_ms > max_time) {
     log_info(LD_CIRC,
                "Circuit build timeout of %dms is beyond the maximum build "
@@ -1490,6 +1485,11 @@ circuit_build_times_set_timeout_worker(circuit_build_times_t *cbt)
     cbt->close_ms = 2*max_time;
   }
 
+  /* Sometimes really fast guard nodes give us such a steep curve
+   * that this ends up being not that much greater than timeout_ms.
+   * Make it be at least 1 min to handle this case. */
+  cbt->close_ms = MAX(cbt->close_ms, circuit_build_times_initial_timeout());
+
   cbt->have_computed_timeout = 1;
   return 1;
 }

+ 99 - 19
src/or/circuituse.c

@@ -176,6 +176,13 @@ circuit_is_better(const origin_circuit_t *oa, const origin_circuit_t *ob,
   const uint8_t purpose = ENTRY_TO_CONN(conn)->purpose;
   int a_bits, b_bits;
 
+  /* If one of the circuits was allowed to live due to relaxing its timeout,
+   * it is definitely worse (it's probably a much slower path). */
+  if (oa->relaxed_timeout && !ob->relaxed_timeout)
+    return 0; /* ob is better. It's not relaxed. */
+  if (!oa->relaxed_timeout && ob->relaxed_timeout)
+    return 1; /* oa is better. It's not relaxed. */
+
   switch (purpose) {
     case CIRCUIT_PURPOSE_C_GENERAL:
       /* if it's used but less dirty it's best;
@@ -187,7 +194,7 @@ circuit_is_better(const origin_circuit_t *oa, const origin_circuit_t *ob,
           return 1;
       } else {
         if (a->timestamp_dirty ||
-            timercmp(&a->timestamp_created, &b->timestamp_created, >))
+            timercmp(&a->timestamp_began, &b->timestamp_began, >))
           return 1;
         if (ob->build_state->is_internal)
           /* XXX023 what the heck is this internal thing doing here. I
@@ -279,7 +286,7 @@ circuit_get_best(const entry_connection_t *conn,
 
     if (purpose == CIRCUIT_PURPOSE_C_INTRODUCE_ACK_WAIT &&
         !must_be_open && circ->state != CIRCUIT_STATE_OPEN &&
-        tv_mdiff(&now, &circ->timestamp_created) > circ_times.timeout_ms) {
+        tv_mdiff(&now, &circ->timestamp_began) > circ_times.timeout_ms) {
       intro_going_on_but_too_old = 1;
       continue;
     }
@@ -365,8 +372,31 @@ circuit_expire_building(void)
   const or_options_t *options = get_options();
   struct timeval now;
   cpath_build_state_t *build_state;
+  int any_opened_circs = 0;
 
   tor_gettimeofday(&now);
+
+  /* Check to see if we have any opened circuits. If we don't,
+   * we want to be more lenient with timeouts, in case the
+   * user has relocated and/or changed network connections.
+   * See bug #3443. */
+  while (next_circ) {
+    if (!CIRCUIT_IS_ORIGIN(next_circ) || /* didn't originate here */
+        next_circ->marked_for_close) /* don't mess with marked circs */
+      continue;
+
+    if (TO_ORIGIN_CIRCUIT(next_circ)->has_opened &&
+        next_circ->state == CIRCUIT_STATE_OPEN &&
+        TO_ORIGIN_CIRCUIT(next_circ)->build_state &&
+        TO_ORIGIN_CIRCUIT(next_circ)->build_state->desired_path_len
+          == DEFAULT_ROUTE_LEN) {
+      any_opened_circs = 1;
+      break;
+    }
+    next_circ = next_circ->next;
+  }
+  next_circ = global_circuitlist;
+
 #define SET_CUTOFF(target, msec) do {                       \
     long ms = tor_lround(msec);                             \
     struct timeval diff;                                    \
@@ -391,8 +421,20 @@ circuit_expire_building(void)
     victim = next_circ;
     next_circ = next_circ->next;
     if (!CIRCUIT_IS_ORIGIN(victim) || /* didn't originate here */
-        victim->marked_for_close) /* don't mess with marked circs */
+        victim->marked_for_close)     /* don't mess with marked circs */
+      continue;
+
+    /* If we haven't yet started the first hop, it means we don't have
+     * any orconns available, and thus have not started counting time yet
+     * for this circuit. See circuit_deliver_create_cell() and uses of
+     * timestamp_began.
+     *
+     * Continue to wait in this case. The ORConn should timeout
+     * independently and kill us then.
+     */
+    if (TO_ORIGIN_CIRCUIT(victim)->cpath->state == CPATH_STATE_CLOSED) {
       continue;
+    }
 
     build_state = TO_ORIGIN_CIRCUIT(victim)->build_state;
     if (build_state && build_state->onehop_tunnel)
@@ -410,9 +452,40 @@ circuit_expire_building(void)
     if (TO_ORIGIN_CIRCUIT(victim)->hs_circ_has_timed_out)
       cutoff = hs_extremely_old_cutoff;
 
-    if (timercmp(&victim->timestamp_created, &cutoff, >))
+    if (timercmp(&victim->timestamp_began, &cutoff, >))
       continue; /* it's still young, leave it alone */
 
+    if (!any_opened_circs) {
+      /* It's still young enough that we wouldn't close it, right? */
+      if (timercmp(&victim->timestamp_began, &close_cutoff, >)) {
+        if (!TO_ORIGIN_CIRCUIT(victim)->relaxed_timeout) {
+          int first_hop_succeeded = TO_ORIGIN_CIRCUIT(victim)->cpath->state
+                                      == CPATH_STATE_OPEN;
+          log_info(LD_CIRC,
+                 "No circuits are opened. Relaxing timeout for "
+                 "a circuit with channel state %s. %d guards are live.",
+                 channel_state_to_string(victim->n_chan->state),
+                 num_live_entry_guards());
+
+          /* We count the timeout here for CBT, because technically this
+           * was a timeout, and the timeout value needs to reset if we
+           * see enough of them. Note this means we also need to avoid
+           * double-counting below, too. */
+          circuit_build_times_count_timeout(&circ_times, first_hop_succeeded);
+          TO_ORIGIN_CIRCUIT(victim)->relaxed_timeout = 1;
+        }
+        continue;
+      } else {
+        log_notice(LD_CIRC,
+                 "No circuits are opened. Relaxed timeout for "
+                 "a circuit with channel state %s to %ldms. "
+                 "However, it appears the circuit has timed out anyway. "
+                 "%d guards are live. ",
+                 channel_state_to_string(victim->n_chan->state),
+                 (long)circ_times.close_ms, num_live_entry_guards());
+      }
+    }
+
 #if 0
     /* some debug logs, to help track bugs */
     if (victim->purpose >= CIRCUIT_PURPOSE_C_INTRODUCING &&
@@ -489,9 +562,12 @@ circuit_expire_building(void)
           /* Record this failure to check for too many timeouts
            * in a row. This function does not record a time value yet
            * (we do that later); it only counts the fact that we did
-           * have a timeout. */
-          circuit_build_times_count_timeout(&circ_times,
-                                            first_hop_succeeded);
+           * have a timeout. We also want to avoid double-counting
+           * already "relaxed" circuits, which are counted above. */
+          if (!TO_ORIGIN_CIRCUIT(victim)->relaxed_timeout) {
+            circuit_build_times_count_timeout(&circ_times,
+                                              first_hop_succeeded);
+          }
           continue;
         }
 
@@ -500,16 +576,16 @@ circuit_expire_building(void)
          * it off at, we probably had a suspend event along this codepath,
          * and we should discard the value.
          */
-        if (timercmp(&victim->timestamp_created, &extremely_old_cutoff, <)) {
+        if (timercmp(&victim->timestamp_began, &extremely_old_cutoff, <)) {
           log_notice(LD_CIRC,
                      "Extremely large value for circuit build timeout: %lds. "
                      "Assuming clock jump. Purpose %d (%s)",
-                     (long)(now.tv_sec - victim->timestamp_created.tv_sec),
+                     (long)(now.tv_sec - victim->timestamp_began.tv_sec),
                      victim->purpose,
                      circuit_purpose_to_string(victim->purpose));
         } else if (circuit_build_times_count_close(&circ_times,
                                          first_hop_succeeded,
-                                         victim->timestamp_created.tv_sec)) {
+                                         victim->timestamp_began.tv_sec)) {
           circuit_build_times_set_timeout(&circ_times);
         }
       }
@@ -791,7 +867,7 @@ circuit_build_needed_circs(time_t now)
     circ = circuit_get_youngest_clean_open(CIRCUIT_PURPOSE_C_GENERAL);
     if (get_options()->RunTesting &&
         circ &&
-        circ->timestamp_created.tv_sec + TESTING_CIRCUIT_INTERVAL < now) {
+        circ->timestamp_began.tv_sec + TESTING_CIRCUIT_INTERVAL < now) {
       log_fn(LOG_INFO,"Creating a new testing circuit.");
       circuit_launch(CIRCUIT_PURPOSE_C_GENERAL, 0);
     }
@@ -911,7 +987,7 @@ circuit_expire_old_circuits_clientside(void)
                 circ->purpose);
       circuit_mark_for_close(circ, END_CIRC_REASON_FINISHED);
     } else if (!circ->timestamp_dirty && circ->state == CIRCUIT_STATE_OPEN) {
-      if (timercmp(&circ->timestamp_created, &cutoff, <)) {
+      if (timercmp(&circ->timestamp_began, &cutoff, <)) {
         if (circ->purpose == CIRCUIT_PURPOSE_C_GENERAL ||
                 circ->purpose == CIRCUIT_PURPOSE_C_MEASURE_TIMEOUT ||
                 circ->purpose == CIRCUIT_PURPOSE_S_ESTABLISH_INTRO ||
@@ -921,7 +997,7 @@ circuit_expire_old_circuits_clientside(void)
                 circ->purpose == CIRCUIT_PURPOSE_S_CONNECT_REND) {
           log_debug(LD_CIRC,
                     "Closing circuit that has been unused for %ld msec.",
-                    tv_mdiff(&circ->timestamp_created, &now));
+                    tv_mdiff(&circ->timestamp_began, &now));
           circuit_mark_for_close(circ, END_CIRC_REASON_FINISHED);
         } else if (!TO_ORIGIN_CIRCUIT(circ)->is_ancient) {
           /* Server-side rend joined circuits can end up really old, because
@@ -935,7 +1011,7 @@ circuit_expire_old_circuits_clientside(void)
                        "Ancient non-dirty circuit %d is still around after "
                        "%ld milliseconds. Purpose: %d (%s)",
                        TO_ORIGIN_CIRCUIT(circ)->global_identifier,
-                       tv_mdiff(&circ->timestamp_created, &now),
+                       tv_mdiff(&circ->timestamp_began, &now),
                        circ->purpose,
                        circuit_purpose_to_string(circ->purpose));
             TO_ORIGIN_CIRCUIT(circ)->is_ancient = 1;
@@ -1316,20 +1392,24 @@ circuit_launch_by_extend_info(uint8_t purpose,
     circ = circuit_find_to_cannibalize(purpose, extend_info, flags);
     if (circ) {
       uint8_t old_purpose = circ->base_.purpose;
-      struct timeval old_timestamp_created;
+      struct timeval old_timestamp_began;
 
       log_info(LD_CIRC,"Cannibalizing circ '%s' for purpose %d (%s)",
                build_state_get_exit_nickname(circ->build_state), purpose,
                circuit_purpose_to_string(purpose));
 
       circuit_change_purpose(TO_CIRCUIT(circ), purpose);
-      /* reset the birth date of this circ, else expire_building
+      /* Reset the start date of this circ, else expire_building
        * will see it and think it's been trying to build since it
-       * began. */
-      tor_gettimeofday(&circ->base_.timestamp_created);
+       * began.
+       *
+       * Technically, the code should reset this when the
+       * create cell is finally sent, but we're close enough
+       * here. */
+      tor_gettimeofday(&circ->base_.timestamp_began);
 
       control_event_circuit_cannibalized(circ, old_purpose,
-                                         &old_timestamp_created);
+                                         &old_timestamp_began);
 
       switch (purpose) {
         case CIRCUIT_PURPOSE_C_ESTABLISH_REND:

+ 2 - 2
src/or/control.c

@@ -3578,9 +3578,9 @@ control_event_circuit_status_minor(origin_circuit_t *circ,
         /* event_tail can currently be up to 130 chars long */
         const char *hs_state_str =
           circuit_purpose_to_controller_hs_state_string(purpose);
-        const struct timeval *old_timestamp_created = tv;
+        const struct timeval *old_timestamp_began = tv;
         char tbuf[ISO_TIME_USEC_LEN+1];
-        format_iso_time_nospace_usec(tbuf, old_timestamp_created);
+        format_iso_time_nospace_usec(tbuf, old_timestamp_began);
 
         tor_snprintf(event_tail, sizeof(event_tail),
                      " OLD_PURPOSE=%s%s%s OLD_TIME_CREATED=%s",

+ 1 - 1
src/or/entrynodes.c

@@ -219,7 +219,7 @@ entry_is_live(entry_guard_t *e, int need_uptime, int need_capacity,
 }
 
 /** Return the number of entry guards that we think are usable. */
-static int
+int
 num_live_entry_guards(void)
 {
   int n = 0;

+ 1 - 0
src/or/entrynodes.h

@@ -52,6 +52,7 @@ typedef struct entry_guard_t {
 entry_guard_t *entry_guard_get_by_id_digest(const char *digest);
 void entry_guards_changed(void);
 const smartlist_t *get_entry_guards(void);
+int num_live_entry_guards(void);
 
 #endif
 

+ 18 - 3
src/or/or.h

@@ -2702,10 +2702,21 @@ typedef struct circuit_t {
     * length ONIONSKIN_CHALLENGE_LEN. */
   char *n_chan_onionskin;
 
-  /** When was this circuit created?  We keep this timestamp with a higher
-   * resolution than most so that the circuit-build-time tracking code can
-   * get millisecond resolution. */
+  /** When did circuit construction actually begin (ie send the
+   * CREATE cell or begin cannibalization).
+   *
+   * Note: This timer will get reset if we decide to cannibalize
+   * a circuit. It may also get reset during certain phases of hidden
+   * service circuit use.
+   *
+   * We keep this timestamp with a higher resolution than most so that the
+   * circuit-build-time tracking code can get millisecond resolution.
+   */
+  struct timeval timestamp_began;
+
+  /** This timestamp marks when the init_circuit_base constructor ran. */
   struct timeval timestamp_created;
+
   /** When the circuit was first used, or 0 if the circuit is clean.
    *
    * XXXX023 Note that some code will artifically adjust this value backward
@@ -2815,6 +2826,10 @@ typedef struct origin_circuit_t {
    * service-side introduction circuits never have this flag set.) */
   unsigned int hs_circ_has_timed_out : 1;
 
+  /** Set iff this circuit has been given a relaxed timeout because
+   * no circuits have opened. Used to prevent spamming logs. */
+  unsigned int relaxed_timeout : 1;
+
   /** Set iff this is a service-side rendezvous circuit for which a
    * new connection attempt has been launched.  We consider launching
    * a new service-side rend circ to a client when the previous one