Browse Source

the third piece of bug 969 fixing

when we write out our stability info, detect relays that have slipped
through the cracks. log about them and correct the problem.

if we continue to see a lot of these over time, it means there's another
spot where relays fall out of the routerlist without being marked as
unreachable.
Roger Dingledine 15 years ago
parent
commit
e7bc189f7c
3 changed files with 24 additions and 7 deletions
  1. 5 4
      src/or/main.c
  2. 1 1
      src/or/or.h
  3. 18 2
      src/or/rephist.c

+ 5 - 4
src/or/main.c

@@ -903,7 +903,7 @@ run_scheduled_events(time_t now)
     time_to_downrate_stability = rep_hist_downrate_old_runs(now);
   if (authdir_mode_tests_reachability(options)) {
     if (time_to_save_stability < now) {
-      if (time_to_save_stability && rep_hist_record_mtbf_data()<0) {
+      if (time_to_save_stability && rep_hist_record_mtbf_data(now, 1)<0) {
         log_warn(LD_GENERAL, "Couldn't store mtbf data.");
       }
 #define SAVE_STABILITY_INTERVAL (30*60)
@@ -1955,14 +1955,15 @@ tor_cleanup(void)
   /* Remove our pid file. We don't care if there was an error when we
    * unlink, nothing we could do about it anyways. */
   if (options->command == CMD_RUN_TOR) {
+    time_t now = time(NULL);
     if (options->PidFile)
       unlink(options->PidFile);
     if (accounting_is_enabled(options))
-      accounting_record_bandwidth_usage(time(NULL), get_or_state());
+      accounting_record_bandwidth_usage(now, get_or_state());
     or_state_mark_dirty(get_or_state(), 0); /* force an immediate save. */
-    or_state_save(time(NULL));
+    or_state_save(now);
     if (authdir_mode_tests_reachability(options))
-      rep_hist_record_mtbf_data();
+      rep_hist_record_mtbf_data(now, 0);
   }
 #ifdef USE_DMALLOC
   dmalloc_log_stats();

+ 1 - 1
src/or/or.h

@@ -3970,7 +3970,7 @@ void rep_history_clean(time_t before);
 
 void rep_hist_note_router_reachable(const char *id, time_t when);
 void rep_hist_note_router_unreachable(const char *id, time_t when);
-int rep_hist_record_mtbf_data(void);
+int rep_hist_record_mtbf_data(time_t now, int missing_means_down);
 int rep_hist_load_mtbf_data(time_t now);
 
 time_t rep_hist_downrate_old_runs(time_t now);

+ 18 - 2
src/or/rephist.c

@@ -683,9 +683,13 @@ rep_history_clean(time_t before)
   }
 }
 
-/** Write MTBF data to disk.  Returns 0 on success, negative on failure. */
+/** Write MTBF data to disk. Return 0 on success, negative on failure.
+ *
+ * If <b>missing_means_down</b>, then if we're about to write an entry
+ * that is still considered up but isn't in our routerlist, consider it
+ * to be down. */
 int
-rep_hist_record_mtbf_data(void)
+rep_hist_record_mtbf_data(time_t now, int missing_means_down)
 {
   char time_buf[ISO_TIME_LEN+1];
 
@@ -745,6 +749,18 @@ rep_hist_record_mtbf_data(void)
     hist = (or_history_t*) or_history_p;
 
     base16_encode(dbuf, sizeof(dbuf), digest, DIGEST_LEN);
+
+    if (missing_means_down && hist->start_of_run &&
+        !router_get_by_digest(digest)) {
+      /* We think this relay is running, but it's not listed in our
+       * routerlist. Somehow it fell out without telling us it went
+       * down. Complain and also correct it. */
+      log_info(LD_HIST,
+               "Relay '%s' is listed as up in rephist, but it's not in "
+               "our routerlist. Correcting.", dbuf);
+      rep_hist_note_router_unreachable(digest, now);
+    }
+
     PRINTF((f, "R %s\n", dbuf));
     if (hist->start_of_run > 0) {
       format_iso_time(time_buf, hist->start_of_run);