
Merge remote-tracking branch 'dgoulet/ticket12541_032_02'

Nick Mathewson 6 years ago
parent
commit
0f4f40b70f

+ 23 - 0
changes/ticket12541

@@ -0,0 +1,23 @@
+  o Major feature (scheduler, channel):
+    - Introduce the KIST scheduler, which stands for Kernel-Informed Socket
+      Transport. It is only available on Linux systems. It is based on a
+      published research paper, which you can find here:
+
+      http://www.robgjansen.com/publications/kist-sec2014.pdf
+      https://arxiv.org/abs/1709.01044
+
+      This also includes a major refactoring of the entire scheduler
+      subsystem to make it more modular, so that new scheduler types can
+      be added more easily in the future. The previous scheduler has been
+      named "Vanilla"; in this version we favor KIST whenever it is
+      available.
+
+      A new torrc option, "Schedulers type1,type2,...", lets the user
+      select which scheduler types tor should use. It can also be changed
+      at runtime. The list is ordered by priority. Because KIST might not
+      be available on all platforms, "KISTLite" is provided as a fallback
+      that uses the same mechanisms but without the kernel support.
+
+      The current default is: Schedulers KIST,KISTLite,Vanilla.
+
+      Closes ticket 12541.

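Since the Schedulers option can be changed at runtime, a controller can switch schedulers on the fly. A hypothetical control-port transcript, for illustration only; the exact syntax is defined by the control-port spec, not by this patch:

    SETCONF Schedulers=Vanilla
    250 OK
    GETCONF Schedulers
    250 Schedulers=Vanilla
    SETCONF Schedulers=KIST,KISTLite,Vanilla
    250 OK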
+ 28 - 0
configure.ac

@@ -792,6 +792,34 @@ AC_CHECK_MEMBERS([SSL.state], , ,
 [#include <openssl/ssl.h>
 ])
 
+dnl Define the set of checks for KIST scheduler support.
+AC_DEFUN([CHECK_KIST_SUPPORT],[
+  dnl KIST needs struct tcp_info and certain of its members to exist.
+  AC_CHECK_MEMBERS(
+    [struct tcp_info.tcpi_unacked, struct tcp_info.tcpi_snd_mss],
+    , ,[[#include <netinet/tcp.h>]])
+  dnl KIST needs SIOCOUTQNSD to exist for an ioctl call.
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [
+                     #include <linux/sockios.h>
+                     #ifndef SIOCOUTQNSD
+                     #error
+                     #endif
+                     ])], have_siocoutqnsd=yes, have_siocoutqnsd=no)
+  if test "x$have_siocoutqnsd" = "xyes"; then
+    if test "x$ac_cv_member_struct_tcp_info_tcpi_unacked" = "xyes"; then
+      if test "x$ac_cv_member_struct_tcp_info_tcpi_snd_mss" = "xyes"; then
+        have_kist_support=yes
+      fi
+    fi
+  fi
+])
+dnl Now, trigger the check.
+CHECK_KIST_SUPPORT
+AS_IF([test "x$have_kist_support" = "xyes"],
+      [AC_DEFINE(HAVE_KIST_SUPPORT, 1, [Defined if KIST scheduler is supported
+                                        on this system])],
+      [AC_MSG_NOTICE([KIST scheduler can't be used. Missing support.])])
+
 LIBS="$save_LIBS"
 LDFLAGS="$save_LDFLAGS"
 CPPFLAGS="$save_CPPFLAGS"

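For context, here is a minimal, hypothetical C sketch (not part of the patch) of the two kernel interfaces the configure check above probes for. KIST needs both on each connected TCP socket; error handling is elided:

    /* Hypothetical sketch, Linux only: read the TCP state KIST relies on. */
    #include <sys/socket.h>      /* getsockopt(), socklen_t */
    #include <sys/ioctl.h>       /* ioctl() */
    #include <netinet/tcp.h>     /* struct tcp_info, TCP_INFO, SOL_TCP */
    #include <linux/sockios.h>   /* SIOCOUTQNSD */
    #include <stdint.h>

    static int
    probe_kist_interfaces(int fd, uint32_t *unacked, uint32_t *mss,
                          int *notsent)
    {
      struct tcp_info info;
      socklen_t len = sizeof(info);
      /* Per-socket TCP state: packets in flight and current send MSS. */
      if (getsockopt(fd, SOL_TCP, TCP_INFO, &info, &len) < 0)
        return -1;
      *unacked = info.tcpi_unacked;
      *mss = info.tcpi_snd_mss;
      /* Bytes queued in the kernel but not yet sent on the wire. */
      if (ioctl(fd, SIOCOUTQNSD, notsent) < 0)
        return -1;
      return 0;
    }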
+ 30 - 0
doc/tor.1.txt

@@ -782,6 +782,36 @@ GENERAL OPTIONS
     option has been set to 1, it cannot be set back to 0 without
     restarting Tor. (Default: 0)
 
+[[Schedulers]] **Schedulers** **KIST**|**KISTLite**|**Vanilla**::
+    Specify the scheduler type that tor should use to handle outbound data on
+    channels. This is a priority-ordered list: the first value is tried first
+    and, if it is unavailable, the second one is tried, and so on. These
+    values can be changed at runtime.
+    (Default: KIST,KISTLite,Vanilla)
++
+    The possible scheduler types are:
++
+    KIST: Kernel-Informed Socket Transport. Tor will use per-socket TCP
+    information from the kernel to make an informed decision about whether it
+    should send data. (Only available on Linux.)
++
+    KISTLite: Same as KIST but without kernel support, meaning that tor uses
+    the same mechanics as KIST but without the TCP information the kernel can
+    provide.
++
+    Vanilla: The scheduler that tor has always used, which writes as much as
+    possible, or AMAP.
+
+[[KISTSchedRunInterval]] **KISTSchedRunInterval** __NUM__ **msec**::
+    If KIST or KISTLite is used in the Schedulers option, this controls at
+    what interval the scheduler tick runs. If the value is 0 msec, the value
+    is taken from the consensus if possible, otherwise it falls back to the
+    default of 10 msec. Maximum possible value is 100 msec. (Default: 0 msec)
+
+[[KISTSockBufSizeFactor]] **KISTSockBufSizeFactor** __NUM__::
+    If KIST is used in Schedulers, this is a multiplier applied to the
+    per-socket limit calculation of the KIST algorithm. (Default: 1.0)
+
 CLIENT OPTIONS
 --------------
 

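A hypothetical torrc fragment exercising the three options documented above (the values shown are the defaults):

    # Prefer KIST; fall back to KISTLite, then to the vanilla scheduler.
    Schedulers KIST,KISTLite,Vanilla
    # 0 msec means: use the consensus value, or 10 msec if the consensus is
    # silent. Values above 100 msec are rejected.
    KISTSchedRunInterval 0 msec
    # Multiplier applied to KIST's per-socket write limit.
    KISTSockBufSizeFactor 1.0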
+ 33 - 1
src/common/sandbox.c

@@ -653,6 +653,25 @@ sb_socketpair(scmp_filter_ctx ctx, sandbox_cfg_t *filter)
   return 0;
 }
 
+#ifdef HAVE_KIST_SUPPORT
+
+#include <linux/sockios.h>
+
+static int
+sb_ioctl(scmp_filter_ctx ctx, sandbox_cfg_t *filter)
+{
+  int rc;
+  (void) filter;
+
+  rc = seccomp_rule_add_1(ctx, SCMP_ACT_ALLOW, SCMP_SYS(ioctl),
+                          SCMP_CMP(1, SCMP_CMP_EQ, SIOCOUTQNSD));
+  if (rc)
+    return rc;
+  return 0;
+}
+
+#endif /* HAVE_KIST_SUPPORT */
+
 /**
  * Function responsible for setting up the setsockopt syscall for
  * the seccomp filter sandbox.
@@ -760,6 +779,15 @@ sb_getsockopt(scmp_filter_ctx ctx, sandbox_cfg_t *filter)
     return rc;
 #endif
 
+#ifdef HAVE_KIST_SUPPORT
+#include <netinet/tcp.h>
+  rc = seccomp_rule_add_2(ctx, SCMP_ACT_ALLOW, SCMP_SYS(getsockopt),
+      SCMP_CMP(1, SCMP_CMP_EQ, SOL_TCP),
+      SCMP_CMP(2, SCMP_CMP_EQ, TCP_INFO));
+  if (rc)
+    return rc;
+#endif
+
   return 0;
 }
 
@@ -1060,7 +1088,11 @@ static sandbox_filter_func_t filter_func[] = {
     sb_socket,
     sb_setsockopt,
     sb_getsockopt,
-    sb_socketpair
+    sb_socketpair,
+
+#ifdef HAVE_KIST_SUPPORT
+    sb_ioctl,
+#endif
 };
 
 const char *

+ 2 - 0
src/ext/ht.h

@@ -150,6 +150,8 @@
 #define HT_CLEAR(name, head)         name##_HT_CLEAR(head)
 #define HT_INIT(name, head)          name##_HT_INIT(head)
 #define HT_REP_IS_BAD_(name, head)    name##_HT_REP_IS_BAD_(head)
+#define HT_FOREACH_FN(name, head, fn, data) \
+   name##_HT_FOREACH_FN((head), (fn), (data))
 /* Helper: */
 static inline unsigned
 ht_improve_hash(unsigned h)

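A minimal, hypothetical sketch of how the new HT_FOREACH_FN() wrapper is used; the entry type, table name, and callback below are invented for illustration, and the matching HT_PROTOTYPE/HT_GENERATE2 calls (with hash and equality helpers) are omitted. Per ht.h, the callback runs once per entry and a nonzero return value removes that entry from the table:

    typedef struct example_ent_s {
      HT_ENTRY(example_ent_s) node;
      int value;
    } example_ent_t;

    typedef HT_HEAD(example_table_s, example_ent_s) example_table_t;

    /* Callback: count entries; returning 0 keeps every entry in the table. */
    static int
    count_entries_cb(example_ent_t *ent, void *data)
    {
      (void) ent;
      (*(int *)data)++;
      return 0;
    }

    static int
    count_entries(example_table_t *table)
    {
      int count = 0;
      HT_FOREACH_FN(example_table_s, table, count_entries_cb, &count);
      return count;
    }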
+ 2 - 6
src/or/channel.c

@@ -2603,8 +2603,8 @@ channel_flush_cells(channel_t *chan)
  * available.
  */
 
-int
-channel_more_to_flush(channel_t *chan)
+MOCK_IMPL(int,
+channel_more_to_flush, (channel_t *chan))
 {
   tor_assert(chan);
 
@@ -4841,8 +4841,6 @@ channel_update_xmit_queue_size(channel_t *chan)
                 U64_FORMAT ", new size is " U64_FORMAT,
                 U64_PRINTF_ARG(adj), U64_PRINTF_ARG(chan->global_identifier),
                 U64_PRINTF_ARG(estimated_total_queue_size));
-      /* Tell the scheduler we're increasing the queue size */
-      scheduler_adjust_queue_size(chan, 1, adj);
     }
   } else if (queued < chan->bytes_queued_for_xmit) {
     adj = chan->bytes_queued_for_xmit - queued;
@@ -4865,8 +4863,6 @@ channel_update_xmit_queue_size(channel_t *chan)
                 U64_FORMAT ", new size is " U64_FORMAT,
                 U64_PRINTF_ARG(adj), U64_PRINTF_ARG(chan->global_identifier),
                 U64_PRINTF_ARG(estimated_total_queue_size));
-      /* Tell the scheduler we're decreasing the queue size */
-      scheduler_adjust_queue_size(chan, -1, adj);
     }
   }
 }

+ 1 - 1
src/or/channel.h

@@ -568,7 +568,7 @@ MOCK_DECL(ssize_t, channel_flush_some_cells,
           (channel_t *chan, ssize_t num_cells));
 
 /* Query if data available on this channel */
-int channel_more_to_flush(channel_t *chan);
+MOCK_DECL(int, channel_more_to_flush, (channel_t *chan));
 
 /* Notify flushed outgoing for dirreq handling */
 void channel_notify_flushed(channel_t *chan);

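channel_more_to_flush() becomes mockable so the new scheduler unit tests can substitute it. A hypothetical test sketch using Tor's MOCK()/UNMOCK() macros from testsupport.h (the test and mock names are invented):

    static int
    mock_channel_more_to_flush(channel_t *chan)
    {
      (void) chan;
      return 0; /* pretend this channel has nothing left to flush */
    }

    static void
    test_scheduler_something(void *arg)
    {
      (void) arg;
      MOCK(channel_more_to_flush, mock_channel_more_to_flush);
      /* ... exercise scheduler code that calls channel_more_to_flush() ... */
      UNMOCK(channel_more_to_flush);
    }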
+ 72 - 19
src/or/config.c

@@ -488,9 +488,12 @@ static config_var_t option_vars_[] = {
   V(ServerDNSSearchDomains,      BOOL,     "0"),
   V(ServerDNSTestAddresses,      CSV,
       "www.google.com,www.mit.edu,www.yahoo.com,www.slashdot.org"),
-  V(SchedulerLowWaterMark__,     MEMUNIT,  "100 MB"),
-  V(SchedulerHighWaterMark__,    MEMUNIT,  "101 MB"),
-  V(SchedulerMaxFlushCells__,    UINT,     "1000"),
+  OBSOLETE("SchedulerLowWaterMark__"),
+  OBSOLETE("SchedulerHighWaterMark__"),
+  OBSOLETE("SchedulerMaxFlushCells__"),
+  V(KISTSchedRunInterval,        MSEC_INTERVAL, "0 msec"),
+  V(KISTSockBufSizeFactor,       DOUBLE,   "1.0"),
+  V(Schedulers,                  CSV,      "KIST,KISTLite,Vanilla"),
   V(ShutdownWaitLength,          INTERVAL, "30 seconds"),
   OBSOLETE("SocksListenAddress"),
   V(SocksPolicy,                 LINELIST, NULL),
@@ -918,6 +921,10 @@ or_options_free(or_options_t *options)
                       rs, routerset_free(rs));
     smartlist_free(options->NodeFamilySets);
   }
+  if (options->SchedulerTypes_) {
+    SMARTLIST_FOREACH(options->SchedulerTypes_, int *, i, tor_free(i));
+    smartlist_free(options->SchedulerTypes_);
+  }
   tor_free(options->BridgePassword_AuthDigest_);
   tor_free(options->command_arg);
   tor_free(options->master_key_fname);
@@ -1828,11 +1835,9 @@ options_act(const or_options_t *old_options)
     return -1;
   }
 
-  /* Set up scheduler thresholds */
-  scheduler_set_watermarks((uint32_t)options->SchedulerLowWaterMark__,
-                           (uint32_t)options->SchedulerHighWaterMark__,
-                           (options->SchedulerMaxFlushCells__ > 0) ?
-                           options->SchedulerMaxFlushCells__ : 1000);
+  /* Inform the scheduler subsystem that a configuration change happened. It
+   * might be a change of scheduler or of a parameter. */
+  scheduler_conf_changed();
 
   /* Set up accounting */
   if (accounting_parse_options(options, 0)<0) {
@@ -2928,6 +2933,61 @@ warn_about_relative_paths(or_options_t *options)
   return n != 0;
 }
 
+/* Validate options related to the scheduler. From the Schedulers list, the
+ * SchedulerTypes_ list is created with int values so that when we select the
+ * scheduler, which can happen at any time at runtime, we don't have to parse
+ * strings and can thus be quick.
+ *
+ * Return 0 on success; otherwise return -1 and set msg to an error message. */
+static int
+options_validate_scheduler(or_options_t *options, char **msg)
+{
+  tor_assert(options);
+  tor_assert(msg);
+
+  if (!options->Schedulers || smartlist_len(options->Schedulers) == 0) {
+    REJECT("Empty Schedulers list. Either remove the option so the defaults "
+           "can be used or set at least one value.");
+  }
+  /* Ok, we do have scheduler types, validate them. */
+  options->SchedulerTypes_ = smartlist_new();
+  SMARTLIST_FOREACH_BEGIN(options->Schedulers, const char *, type) {
+    int *sched_type;
+    if (!strcasecmp("KISTLite", type)) {
+      sched_type = tor_malloc_zero(sizeof(int));
+      *sched_type = SCHEDULER_KIST_LITE;
+      smartlist_add(options->SchedulerTypes_, sched_type);
+    } else if (!strcasecmp("KIST", type)) {
+      sched_type = tor_malloc_zero(sizeof(int));
+      *sched_type = SCHEDULER_KIST;
+      smartlist_add(options->SchedulerTypes_, sched_type);
+    } else if (!strcasecmp("Vanilla", type)) {
+      sched_type = tor_malloc_zero(sizeof(int));
+      *sched_type = SCHEDULER_VANILLA;
+      smartlist_add(options->SchedulerTypes_, sched_type);
+    } else {
+      tor_asprintf(msg, "Unknown type %s in option Schedulers. "
+                        "Possible values are KIST, KISTLite and Vanilla.",
+                   escaped(type));
+      return -1;
+    }
+  } SMARTLIST_FOREACH_END(type);
+
+  if (options->KISTSockBufSizeFactor < 0) {
+    REJECT("KISTSockBufSizeFactor must be at least 0");
+  }
+
+  /* No need to check the interval against a lower bound: zero is valid and
+   * all negative values are valid too (they disable KIST). */
+  if (options->KISTSchedRunInterval > KIST_SCHED_RUN_INTERVAL_MAX) {
+    tor_asprintf(msg, "KISTSchedRunInterval must not be more than %d (ms)",
+                 KIST_SCHED_RUN_INTERVAL_MAX);
+    return -1;
+  }
+
+  return 0;
+}
+
 /* Validate options related to single onion services.
  * Modifies some options that are incompatible with single onion services.
  * On failure returns -1, and sets *msg to an error string.
@@ -3156,17 +3216,6 @@ options_validate(or_options_t *old_options, or_options_t *options,
     routerset_union(options->ExcludeExitNodesUnion_,options->ExcludeNodes);
   }
 
-  if (options->SchedulerLowWaterMark__ == 0 ||
-      options->SchedulerLowWaterMark__ > UINT32_MAX) {
-    log_warn(LD_GENERAL, "Bad SchedulerLowWaterMark__ option");
-    return -1;
-  } else if (options->SchedulerHighWaterMark__ <=
-             options->SchedulerLowWaterMark__ ||
-             options->SchedulerHighWaterMark__ > UINT32_MAX) {
-    log_warn(LD_GENERAL, "Bad SchedulerHighWaterMark option");
-    return -1;
-  }
-
   if (options->NodeFamilies) {
     options->NodeFamilySets = smartlist_new();
     for (cl = options->NodeFamilies; cl; cl = cl->next) {
@@ -4285,6 +4334,10 @@ options_validate(or_options_t *old_options, or_options_t *options,
       REJECT("BridgeRelay is 1, ORPort is not set. This is an invalid "
              "combination.");
 
+  if (options_validate_scheduler(options, msg) < 0) {
+    return -1;
+  }
+
   return 0;
 }
 

+ 2 - 0
src/or/include.am

@@ -99,6 +99,8 @@ LIBTOR_A_SOURCES = \
 	src/or/routerparse.c				\
 	src/or/routerset.c				\
 	src/or/scheduler.c				\
+	src/or/scheduler_kist.c				\
+	src/or/scheduler_vanilla.c			\
 	src/or/statefile.c				\
 	src/or/status.c					\
 	src/or/torcert.c				\

+ 14 - 5
src/or/networkstatus.c

@@ -61,6 +61,7 @@
 #include "router.h"
 #include "routerlist.h"
 #include "routerparse.h"
+#include "scheduler.h"
 #include "shared_random.h"
 #include "transports.h"
 #include "torcert.h"
@@ -1561,6 +1562,15 @@ notify_control_networkstatus_changed(const networkstatus_t *old_c,
   smartlist_free(changed);
 }
 
+/* Called when the consensus has changed from old_c to new_c. */
+static void
+notify_networkstatus_changed(const networkstatus_t *old_c,
+                             const networkstatus_t *new_c)
+{
+  notify_control_networkstatus_changed(old_c, new_c);
+  scheduler_notify_networkstatus_changed(old_c, new_c);
+}
+
 /** Copy all the ancillary information (like router download status and so on)
  * from <b>old_c</b> to <b>new_c</b>. */
 static void
@@ -1886,8 +1896,7 @@ networkstatus_set_current_consensus(const char *consensus,
   const int is_usable_flavor = flav == usable_consensus_flavor();
 
   if (is_usable_flavor) {
-    notify_control_networkstatus_changed(
-                         networkstatus_get_latest_consensus(), c);
+    notify_networkstatus_changed(networkstatus_get_latest_consensus(), c);
   }
   if (flav == FLAV_NS) {
     if (current_ns_consensus) {
@@ -2314,9 +2323,9 @@ get_net_param_from_list(smartlist_t *net_params, const char *param_name,
  * Make sure the value parsed from the consensus is at least
  * <b>min_val</b> and at most <b>max_val</b> and raise/cap the parsed value
  * if necessary. */
-int32_t
-networkstatus_get_param(const networkstatus_t *ns, const char *param_name,
-                        int32_t default_val, int32_t min_val, int32_t max_val)
+MOCK_IMPL(int32_t,
+networkstatus_get_param, (const networkstatus_t *ns, const char *param_name,
+                        int32_t default_val, int32_t min_val, int32_t max_val))
 {
   if (!ns) /* if they pass in null, go find it ourselves */
     ns = networkstatus_get_latest_consensus();

+ 3 - 4
src/or/networkstatus.h

@@ -109,10 +109,9 @@ void signed_descs_update_status_from_consensus_networkstatus(
 char *networkstatus_getinfo_helper_single(const routerstatus_t *rs);
 char *networkstatus_getinfo_by_purpose(const char *purpose_string, time_t now);
 void networkstatus_dump_bridge_status_to_file(time_t now);
-int32_t networkstatus_get_param(const networkstatus_t *ns,
-                                const char *param_name,
-                                int32_t default_val, int32_t min_val,
-                                int32_t max_val);
+MOCK_DECL(int32_t, networkstatus_get_param,
+          (const networkstatus_t *ns, const char *param_name,
+           int32_t default_val, int32_t min_val, int32_t max_val));
 int32_t networkstatus_get_overridable_param(const networkstatus_t *ns,
                                             int32_t torrc_value,
                                             const char *param_name,

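networkstatus_get_param() becomes mockable so scheduler code and its tests can read consensus parameters. A hedged sketch of such a clamped lookup; the parameter name here is purely illustrative (see scheduler_kist.c for the real one):

    static int32_t
    example_sched_interval(const networkstatus_t *ns)
    {
      /* Illustrative parameter name; clamped into the KIST range. */
      return networkstatus_get_param(ns, "ExampleSchedRunInterval",
                                     KIST_SCHED_RUN_INTERVAL_DEFAULT,
                                     KIST_SCHED_RUN_INTERVAL_MIN,
                                     KIST_SCHED_RUN_INTERVAL_MAX);
    }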
+ 15 - 13
src/or/or.h

@@ -4548,19 +4548,6 @@ typedef struct {
   /** How long (seconds) do we keep a guard before picking a new one? */
   int GuardLifetime;
 
-  /** Low-water mark for global scheduler - start sending when estimated
-   * queued size falls below this threshold.
-   */
-  uint64_t SchedulerLowWaterMark__;
-  /** High-water mark for global scheduler - stop sending when estimated
-   * queued size exceeds this threshold.
-   */
-  uint64_t SchedulerHighWaterMark__;
-  /** Flush size for global scheduler - flush this many cells at a time
-   * when sending.
-   */
-  int SchedulerMaxFlushCells__;
-
   /** Is this an exit node?  This is a tristate, where "1" means "yes, and use
    * the default exit policy if none is given" and "0" means "no; exit policy
    * is 'reject *'" and "auto" (-1) means "same as 1, but warn the user."
@@ -4633,6 +4620,21 @@ typedef struct {
   /** Bool (default: 0). Tells Tor to never try to exec another program.
    */
   int NoExec;
+
+  /** Have the KIST scheduler run every X milliseconds. If less than zero, do
+   * not use the KIST scheduler but use the old vanilla scheduler instead. If
+   * zero, use the interval from the consensus, falling back to 10 msec if the
+   * consensus doesn't say anything. */
+  int64_t KISTSchedRunInterval;
+
+  /** A multiplier for the KIST per-socket limit calculation. */
+  double KISTSockBufSizeFactor;
+
+  /** The list of scheduler type strings ordered by priority, that is, the
+   * first one has to be tried first. Default: KIST,KISTLite,Vanilla */
+  smartlist_t *Schedulers;
+  /* An ordered list of scheduler_types mapped from Schedulers. */
+  smartlist_t *SchedulerTypes_;
 } or_options_t;
 
 /** Persistent state for an onion router, as saved to disk. */

+ 324 - 411
src/or/scheduler.c

@@ -2,45 +2,50 @@
 /* See LICENSE for licensing information */
 
 #include "or.h"
-
-#define TOR_CHANNEL_INTERNAL_ /* For channel_flush_some_cells() */
-#include "channel.h"
+#include "config.h"
 
 #include "compat_libevent.h"
 #define SCHEDULER_PRIVATE_
+#define SCHEDULER_KIST_PRIVATE
 #include "scheduler.h"
 
 #include <event2/event.h>
 
-/*
- * Scheduler high/low watermarks
- */
-
-static uint32_t sched_q_low_water = 16384;
-static uint32_t sched_q_high_water = 32768;
-
-/*
- * Maximum cells to flush in a single call to channel_flush_some_cells();
- * setting this low means more calls, but too high and we could overshoot
- * sched_q_high_water.
- */
-
-static uint32_t sched_max_flush_cells = 16;
-
 /**
  * \file scheduler.c
  * \brief Channel scheduling system: decides which channels should send and
  * receive when.
  *
- * This module implements a scheduler algorithm, to decide
- * which channels should send/receive when.
+ * This module contains the global/common parts of the scheduling system, which
+ * decides which channels get to send cells on their circuits, and when.
+ *
+ * Terms:
+ * - "Scheduling system": the collection of scheduler*.{h,c} files and their
+ *   aggregate behavior.
+ * - "Scheduler implementation": a scheduler_t. The scheduling system has one
+ *   active scheduling implementation at a time.
+ *
+ * In this file you will find state that any scheduler implementation can have
+ * access to as well as the functions the rest of Tor uses to interact with the
+ * scheduling system.
  *
  * The earliest versions of Tor approximated a kind of round-robin system
- * among active connections, but only approximated it.
+ * among active connections, but only approximated it. It would only consider
+ * one connection (roughly equal to a channel in today's terms) at a time, and
+ * thus could only prioritize circuits against others on the same connection.
+ *
+ * Then in response to the KIST paper[0], Tor implemented a global
+ * circuit scheduler. It was supposed to prioritize circuits across many
+ * channels, but wasn't effective. It is preserved in scheduler_vanilla.c.
  *
- * Now, write scheduling works by keeping track of which channels can
- * accept cells, and have cells to write.  From the scheduler's perspective,
- * a channel can be in four possible states:
+ * [0]: http://www.robgjansen.com/publications/kist-sec2014.pdf
+ *
+ * Then we actually got around to implementing KIST for real. We decided to
+ * modularize the scheduler so new ones can be implemented. You can find KIST
+ * in scheduler_kist.c.
+ *
+ * Channels have one of four scheduling states based on whether or not they
+ * have cells to send and whether or not they are able to send.
  *
  * <ol>
  * <li>
@@ -125,85 +130,96 @@ static uint32_t sched_max_flush_cells = 16;
  * </ol>
  *
  * Other event-driven parts of the code move channels between these scheduling
- * states by calling scheduler functions; the scheduler only runs on open-for-
- * writes/has-cells channels and is the only path for those to transition to
- * other states.  The scheduler_run() function gives us the opportunity to do
- * scheduling work, and is called from other scheduler functions whenever a
- * state transition occurs, and periodically from the main event loop.
+ * states by calling scheduler functions. The scheduling system builds up a
+ * list of channels in the SCHED_CHAN_PENDING state that the scheduler
+ * implementation should then use when it runs. Scheduling implementations need
+ * to properly update channel states during their scheduler_t->run() function
+ * as that is the only opportunity for channels to move from SCHED_CHAN_PENDING
+ * to any other state.
+ *
+ * The remainder of this file is a small amount of state that any scheduler
+ * implementation should have access to, and the functions the rest of Tor uses
+ * to interact with the scheduling system.
  */
 
-/* Scheduler global data structures */
+/*****************************************************************************
+ * Scheduling system state
+ *
+ * State that can be accessed from any scheduler implementation (but not
+ * outside the scheduling system)
+ *****************************************************************************/
+
+STATIC const scheduler_t *the_scheduler;
 
 /*
  * We keep a list of channels that are pending - i.e, have cells to write
- * and can accept them to send.  The enum scheduler_state in channel_t
+ * and can accept them to send. The enum scheduler_state in channel_t
  * is reserved for our use.
+ *
+ * Priority queue of channels that can write and have cells (pending work)
  */
-
-/* Pqueue of channels that can write and have cells (pending work) */
 STATIC smartlist_t *channels_pending = NULL;
 
 /*
  * This event runs the scheduler from its callback, and is manually
  * activated whenever a channel enters open for writes/cells to send.
  */
-
 STATIC struct event *run_sched_ev = NULL;
 
-/*
- * Queue heuristic; this is not the queue size, but an 'effective queuesize'
- * that ages out contributions from stalled channels.
- */
-
-STATIC uint64_t queue_heuristic = 0;
+/*****************************************************************************
+ * Scheduling system static function definitions
+ *
+ * Functions that can only be accessed from this file.
+ *****************************************************************************/
 
 /*
- * Timestamp for last queue heuristic update
+ * Scheduler event callback; this should get triggered once per event loop
+ * if any scheduling work was created during the event loop.
  */
+static void
+scheduler_evt_callback(evutil_socket_t fd, short events, void *arg)
+{
+  (void) fd;
+  (void) events;
+  (void) arg;
 
-STATIC time_t queue_heuristic_timestamp = 0;
-
-/* Scheduler static function declarations */
+  log_debug(LD_SCHED, "Scheduler event callback called");
 
-static void scheduler_evt_callback(evutil_socket_t fd,
-                                   short events, void *arg);
-static int scheduler_more_work(void);
-static void scheduler_retrigger(void);
-#if 0
-static void scheduler_trigger(void);
-#endif
+  /* Run the scheduler. This is a mandatory function. */
 
-/* Scheduler function implementations */
+  /* We might as well assert on this. If this function doesn't exist, no cells
+   * are getting scheduled. Things are very broken. scheduler_t says the run()
+   * function is mandatory. */
+  tor_assert(the_scheduler->run);
+  the_scheduler->run();
 
-/** Free everything and shut down the scheduling system */
+  /* Schedule itself back in if it has more work. */
 
-void
-scheduler_free_all(void)
-{
-  log_debug(LD_SCHED, "Shutting down scheduler");
+  /* Again, might as well assert on this mandatory scheduler_t function. If it
+   * doesn't exist, there's no way to tell libevent to run the scheduler again
+   * in the future. */
+  tor_assert(the_scheduler->schedule);
+  the_scheduler->schedule();
+}
 
-  if (run_sched_ev) {
-    if (event_del(run_sched_ev) < 0) {
-      log_warn(LD_BUG, "Problem deleting run_sched_ev");
-    }
-    tor_event_free(run_sched_ev);
-    run_sched_ev = NULL;
-  }
+/*****************************************************************************
+ * Scheduling system private function definitions
+ *
+ * Functions that can only be accessed from scheduler*.c
+ *****************************************************************************/
 
-  if (channels_pending) {
-    smartlist_free(channels_pending);
-    channels_pending = NULL;
-  }
+/* Return the pending channel list. */
+smartlist_t *
+get_channels_pending(void)
+{
+  return channels_pending;
 }
 
-/**
- * Comparison function to use when sorting pending channels
- */
-
-MOCK_IMPL(STATIC int,
+/* Comparison function to use when sorting pending channels */
+MOCK_IMPL(int,
 scheduler_compare_channels, (const void *c1_v, const void *c2_v))
 {
-  channel_t *c1 = NULL, *c2 = NULL;
+  const channel_t *c1 = NULL, *c2 = NULL;
   /* These are a workaround for -Wbad-function-cast throwing a fit */
   const circuitmux_policy_t *p1, *p2;
   uintptr_t p1_i, p2_i;
@@ -211,11 +227,8 @@ scheduler_compare_channels, (const void *c1_v, const void *c2_v))
   tor_assert(c1_v);
   tor_assert(c2_v);
 
-  c1 = (channel_t *)(c1_v);
-  c2 = (channel_t *)(c2_v);
-
-  tor_assert(c1);
-  tor_assert(c2);
+  c1 = (const channel_t *)(c1_v);
+  c2 = (const channel_t *)(c2_v);
 
   if (c1 != c2) {
     if (circuitmux_get_policy(c1->cmux) ==
@@ -242,26 +255,158 @@ scheduler_compare_channels, (const void *c1_v, const void *c2_v))
   }
 }
 
+/*****************************************************************************
+ * Scheduling system global functions
+ *
+ * Functions that can be accessed from anywhere in Tor.
+ *****************************************************************************/
+
+/* Using the global options, select the scheduler we should be using. */
+static void
+select_scheduler(void)
+{
+  const char *chosen_sched_type = NULL;
+
+#ifdef TOR_UNIT_TESTS
+  /* It is annoying to set this in the options for every test that passes
+   * through the scheduler, and there are many, so if we don't explicitly have
+   * a list of types set, just use the vanilla one. */
+  if (get_options()->SchedulerTypes_ == NULL) {
+    the_scheduler = get_vanilla_scheduler();
+    return;
+  }
+#endif
+
+  /* This list is ordered by priority: the first entry is tried first. As soon
+   * as we find a scheduler type that we can use, we use it and stop. */
+  SMARTLIST_FOREACH_BEGIN(get_options()->SchedulerTypes_, int *, type) {
+    switch (*type) {
+    case SCHEDULER_VANILLA:
+      the_scheduler = get_vanilla_scheduler();
+      chosen_sched_type = "Vanilla";
+      goto end;
+    case SCHEDULER_KIST:
+      if (!scheduler_can_use_kist()) {
+#ifdef HAVE_KIST_SUPPORT
+        if (get_options()->KISTSchedRunInterval == -1) {
+          log_info(LD_SCHED, "Scheduler type KIST can not be used. It is "
+                             "disabled because KISTSchedRunInterval=-1");
+        } else {
+          log_notice(LD_SCHED, "Scheduler type KIST has been disabled by "
+                               "the consensus.");
+        }
+#else /* HAVE_KIST_SUPPORT */
+        log_info(LD_SCHED, "Scheduler type KIST not built in");
+#endif /* HAVE_KIST_SUPPORT */
+        continue;
+      }
+      the_scheduler = get_kist_scheduler();
+      chosen_sched_type = "KIST";
+      scheduler_kist_set_full_mode();
+      goto end;
+    case SCHEDULER_KIST_LITE:
+      chosen_sched_type = "KISTLite";
+      the_scheduler = get_kist_scheduler();
+      scheduler_kist_set_lite_mode();
+      goto end;
+    default:
+      /* Our option validation should have caught this. */
+      tor_assert_unreached();
+    }
+  } SMARTLIST_FOREACH_END(type);
+
+ end:
+  log_notice(LD_CONFIG, "Scheduler type %s has been enabled.",
+             chosen_sched_type);
+}
+
 /*
- * Scheduler event callback; this should get triggered once per event loop
- * if any scheduling work was created during the event loop.
+ * Little helper function called from a few different places. It changes the
+ * scheduler implementation, if necessary. If it did change, it then tells the
+ * old one to free its state and the new one to initialize.
  */
-
 static void
-scheduler_evt_callback(evutil_socket_t fd, short events, void *arg)
+set_scheduler(void)
 {
-  (void)fd;
-  (void)events;
-  (void)arg;
-  log_debug(LD_SCHED, "Scheduler event callback called");
+  const scheduler_t *old_scheduler = the_scheduler;
 
-  tor_assert(run_sched_ev);
+  /* From the options, select the scheduler type to set. */
+  select_scheduler();
+
+  if (old_scheduler != the_scheduler) {
+    /* Allow the old scheduler to clean up, if needed. */
+    if (old_scheduler && old_scheduler->free_all) {
+      old_scheduler->free_all();
+    }
+    /* We don't clean up the old scheduler_t. We keep any type of scheduler
+     * we've allocated so we can do an easy switch back. */
+
+    /* Initialize the new scheduler. */
+    if (the_scheduler->init) {
+      the_scheduler->init();
+    }
+  }
+}
+
+/*
+ * This is how the scheduling system is notified of Tor's configuration
+ * changing. For example: a SIGHUP was issued.
+ */
+void
+scheduler_conf_changed(void)
+{
+  /* Let the scheduler decide what it should do. */
+  set_scheduler();
+
+  /* Then tell the (possibly new) scheduler that we have new options. */
+  if (the_scheduler->on_new_options) {
+    the_scheduler->on_new_options();
+  }
+}
 
-  /* Run the scheduler */
-  scheduler_run();
+/*
+ * Whenever we get a new consensus, this function is called.
+ */
+void
+scheduler_notify_networkstatus_changed(const networkstatus_t *old_c,
+                                       const networkstatus_t *new_c)
+{
+  /* Then tell the (possibly new) scheduler that we have a new consensus */
+  if (the_scheduler->on_new_consensus) {
+    the_scheduler->on_new_consensus(old_c, new_c);
+  }
+  /* Maybe the consensus param made us change the scheduler. */
+  set_scheduler();
+}
 
-  /* Do we have more work to do? */
-  if (scheduler_more_work()) scheduler_retrigger();
+/*
+ * Free everything scheduling-related from main.c. Note this is only called
+ * when Tor is shutting down, while scheduler_t->free_all() is called both when
+ * Tor is shutting down and when we are switching schedulers.
+ */
+void
+scheduler_free_all(void)
+{
+  log_debug(LD_SCHED, "Shutting down scheduler");
+
+  if (run_sched_ev) {
+    if (event_del(run_sched_ev) < 0) {
+      log_warn(LD_BUG, "Problem deleting run_sched_ev");
+    }
+    tor_event_free(run_sched_ev);
+    run_sched_ev = NULL;
+  }
+
+  if (channels_pending) {
+    /* We don't have ownership of the object in this list. */
+    smartlist_free(channels_pending);
+    channels_pending = NULL;
+  }
+
+  if (the_scheduler && the_scheduler->free_all) {
+    the_scheduler->free_all();
+  }
+  the_scheduler = NULL;
 }
 
 /** Mark a channel as no longer ready to accept writes */
@@ -269,9 +414,12 @@ scheduler_evt_callback(evutil_socket_t fd, short events, void *arg)
 MOCK_IMPL(void,
 scheduler_channel_doesnt_want_writes,(channel_t *chan))
 {
-  tor_assert(chan);
-
-  tor_assert(channels_pending);
+  IF_BUG_ONCE(!chan) {
+    return;
+  }
+  IF_BUG_ONCE(!channels_pending) {
+    return;
+  }
 
   /* If it's already in pending, we can put it in waiting_to_write */
   if (chan->scheduler_state == SCHED_CHAN_PENDING) {
@@ -309,10 +457,12 @@ scheduler_channel_doesnt_want_writes,(channel_t *chan))
 MOCK_IMPL(void,
 scheduler_channel_has_waiting_cells,(channel_t *chan))
 {
-  int became_pending = 0;
-
-  tor_assert(chan);
-  tor_assert(channels_pending);
+  IF_BUG_ONCE(!chan) {
+    return;
+  }
+  IF_BUG_ONCE(!channels_pending) {
+    return;
+  }
 
   /* First, check if this one also writeable */
   if (chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS) {
@@ -330,7 +480,9 @@ scheduler_channel_has_waiting_cells,(channel_t *chan))
               "Channel " U64_FORMAT " at %p went from waiting_for_cells "
               "to pending",
               U64_PRINTF_ARG(chan->global_identifier), chan);
-    became_pending = 1;
+    /* If we made a channel pending, we potentially have scheduling work to
+     * do. */
+    the_scheduler->schedule();
   } else {
     /*
      * It's not in waiting_for_cells, so it can't become pending; it's
@@ -345,240 +497,104 @@ scheduler_channel_has_waiting_cells,(channel_t *chan))
                 U64_PRINTF_ARG(chan->global_identifier), chan);
     }
   }
+}
 
-  /*
-   * If we made a channel pending, we potentially have scheduling work
-   * to do.
-   */
-  if (became_pending) scheduler_retrigger();
+/* Add the scheduler event to the set of pending events, with next_run being
+ * how long libevent should wait before triggering the event. */
+void
+scheduler_ev_add(const struct timeval *next_run)
+{
+  tor_assert(run_sched_ev);
+  tor_assert(next_run);
+  event_add(run_sched_ev, next_run);
 }
 
-/** Set up the scheduling system */
+/* Make the scheduler event active with the given flags. */
+void
+scheduler_ev_active(int flags)
+{
+  tor_assert(run_sched_ev);
+  event_active(run_sched_ev, flags, 1);
+}
 
+/*
+ * Initialize everything scheduling-related from config.c. Note this is only
+ * called when Tor is starting up, while scheduler_t->init() is called both
+ * when Tor is starting up and when we are switching schedulers.
+ */
 void
 scheduler_init(void)
 {
   log_debug(LD_SCHED, "Initting scheduler");
 
-  tor_assert(!run_sched_ev);
+  // Two '!' because we really do want to check if the pointer is non-NULL
+  IF_BUG_ONCE(!!run_sched_ev) {
+    log_warn(LD_SCHED, "We should not already have a libevent scheduler "
+             "event. I'll clean the old one up, but this is odd.");
+    tor_event_free(run_sched_ev);
+    run_sched_ev = NULL;
+  }
   run_sched_ev = tor_event_new(tor_libevent_get_base(), -1,
                                0, scheduler_evt_callback, NULL);
-
   channels_pending = smartlist_new();
-  queue_heuristic = 0;
-  queue_heuristic_timestamp = approx_time();
-}
-
-/** Check if there's more scheduling work */
-
-static int
-scheduler_more_work(void)
-{
-  tor_assert(channels_pending);
-
-  return ((scheduler_get_queue_heuristic() < sched_q_low_water) &&
-          ((smartlist_len(channels_pending) > 0))) ? 1 : 0;
-}
-
-/** Retrigger the scheduler in a way safe to use from the callback */
 
-static void
-scheduler_retrigger(void)
-{
-  tor_assert(run_sched_ev);
-  event_active(run_sched_ev, EV_TIMEOUT, 1);
+  set_scheduler();
 }
 
-/** Notify the scheduler of a channel being closed */
-
+/*
+ * If a channel is going away, this is how the scheduling system is informed
+ * so it can do any freeing necessary. This ultimately calls
+ * scheduler_t->on_channel_free() so the current scheduler can release any
+ * state specific to this channel.
+ */
 MOCK_IMPL(void,
 scheduler_release_channel,(channel_t *chan))
 {
-  tor_assert(chan);
-  tor_assert(channels_pending);
-
-  if (chan->scheduler_state == SCHED_CHAN_PENDING) {
-    smartlist_pqueue_remove(channels_pending,
-                            scheduler_compare_channels,
-                            offsetof(channel_t, sched_heap_idx),
-                            chan);
+  IF_BUG_ONCE(!chan) {
+    return;
+  }
+  IF_BUG_ONCE(!channels_pending) {
+    return;
   }
 
-  chan->scheduler_state = SCHED_CHAN_IDLE;
-}
-
-/** Run the scheduling algorithm if necessary */
-
-MOCK_IMPL(void,
-scheduler_run, (void))
-{
-  int n_cells, n_chans_before, n_chans_after;
-  uint64_t q_len_before, q_heur_before, q_len_after, q_heur_after;
-  ssize_t flushed, flushed_this_time;
-  smartlist_t *to_readd = NULL;
-  channel_t *chan = NULL;
-
-  log_debug(LD_SCHED, "We have a chance to run the scheduler");
-
-  if (scheduler_get_queue_heuristic() < sched_q_low_water) {
-    n_chans_before = smartlist_len(channels_pending);
-    q_len_before = channel_get_global_queue_estimate();
-    q_heur_before = scheduler_get_queue_heuristic();
-
-    while (scheduler_get_queue_heuristic() <= sched_q_high_water &&
-           smartlist_len(channels_pending) > 0) {
-      /* Pop off a channel */
-      chan = smartlist_pqueue_pop(channels_pending,
-                                  scheduler_compare_channels,
-                                  offsetof(channel_t, sched_heap_idx));
-      tor_assert(chan);
-
-      /* Figure out how many cells we can write */
-      n_cells = channel_num_cells_writeable(chan);
-      if (n_cells > 0) {
-        log_debug(LD_SCHED,
-                  "Scheduler saw pending channel " U64_FORMAT " at %p with "
-                  "%d cells writeable",
-                  U64_PRINTF_ARG(chan->global_identifier), chan, n_cells);
-
-        flushed = 0;
-        while (flushed < n_cells &&
-               scheduler_get_queue_heuristic() <= sched_q_high_water) {
-          flushed_this_time =
-            channel_flush_some_cells(chan,
-                                     MIN(sched_max_flush_cells,
-                                         (size_t) n_cells - flushed));
-          if (flushed_this_time <= 0) break;
-          flushed += flushed_this_time;
-        }
-
-        if (flushed < n_cells) {
-          /* We ran out of cells to flush */
-          chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
-          log_debug(LD_SCHED,
-                    "Channel " U64_FORMAT " at %p "
-                    "entered waiting_for_cells from pending",
-                    U64_PRINTF_ARG(chan->global_identifier),
-                    chan);
-        } else {
-          /* The channel may still have some cells */
-          if (channel_more_to_flush(chan)) {
-          /* The channel goes to either pending or waiting_to_write */
-            if (channel_num_cells_writeable(chan) > 0) {
-              /* Add it back to pending later */
-              if (!to_readd) to_readd = smartlist_new();
-              smartlist_add(to_readd, chan);
-              log_debug(LD_SCHED,
-                        "Channel " U64_FORMAT " at %p "
-                        "is still pending",
-                        U64_PRINTF_ARG(chan->global_identifier),
-                        chan);
-            } else {
-              /* It's waiting to be able to write more */
-              chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
-              log_debug(LD_SCHED,
-                        "Channel " U64_FORMAT " at %p "
-                        "entered waiting_to_write from pending",
-                        U64_PRINTF_ARG(chan->global_identifier),
-                        chan);
-            }
-          } else {
-            /* No cells left; it can go to idle or waiting_for_cells */
-            if (channel_num_cells_writeable(chan) > 0) {
-              /*
-               * It can still accept writes, so it goes to
-               * waiting_for_cells
-               */
-              chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
-              log_debug(LD_SCHED,
-                        "Channel " U64_FORMAT " at %p "
-                        "entered waiting_for_cells from pending",
-                        U64_PRINTF_ARG(chan->global_identifier),
-                        chan);
-            } else {
-              /*
-               * We exactly filled up the output queue with all available
-               * cells; go to idle.
-               */
-              chan->scheduler_state = SCHED_CHAN_IDLE;
-              log_debug(LD_SCHED,
-                        "Channel " U64_FORMAT " at %p "
-                        "become idle from pending",
-                        U64_PRINTF_ARG(chan->global_identifier),
-                        chan);
-            }
-          }
-        }
-
-        log_debug(LD_SCHED,
-                  "Scheduler flushed %d cells onto pending channel "
-                  U64_FORMAT " at %p",
-                  (int)flushed, U64_PRINTF_ARG(chan->global_identifier),
-                  chan);
-      } else {
-        log_info(LD_SCHED,
-                 "Scheduler saw pending channel " U64_FORMAT " at %p with "
-                 "no cells writeable",
-                 U64_PRINTF_ARG(chan->global_identifier), chan);
-        /* Put it back to WAITING_TO_WRITE */
-        chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
-      }
-    }
-
-    /* Readd any channels we need to */
-    if (to_readd) {
-      SMARTLIST_FOREACH_BEGIN(to_readd, channel_t *, readd_chan) {
-        readd_chan->scheduler_state = SCHED_CHAN_PENDING;
-        smartlist_pqueue_add(channels_pending,
-                             scheduler_compare_channels,
-                             offsetof(channel_t, sched_heap_idx),
-                             readd_chan);
-      } SMARTLIST_FOREACH_END(readd_chan);
-      smartlist_free(to_readd);
+  if (chan->scheduler_state == SCHED_CHAN_PENDING) {
+    if (smartlist_pos(channels_pending, chan) == -1) {
+      log_warn(LD_SCHED, "Scheduler asked to release channel %" PRIu64 " "
+                         "but it wasn't in channels_pending",
+               chan->global_identifier);
+    } else {
+      smartlist_pqueue_remove(channels_pending,
+                              scheduler_compare_channels,
+                              offsetof(channel_t, sched_heap_idx),
+                              chan);
     }
-
-    n_chans_after = smartlist_len(channels_pending);
-    q_len_after = channel_get_global_queue_estimate();
-    q_heur_after = scheduler_get_queue_heuristic();
-    log_debug(LD_SCHED,
-              "Scheduler handled %d of %d pending channels, queue size from "
-              U64_FORMAT " to " U64_FORMAT ", queue heuristic from "
-              U64_FORMAT " to " U64_FORMAT,
-              n_chans_before - n_chans_after, n_chans_before,
-              U64_PRINTF_ARG(q_len_before), U64_PRINTF_ARG(q_len_after),
-              U64_PRINTF_ARG(q_heur_before), U64_PRINTF_ARG(q_heur_after));
   }
-}
 
-/** Trigger the scheduling event so we run the scheduler later */
-
-#if 0
-static void
-scheduler_trigger(void)
-{
-  log_debug(LD_SCHED, "Triggering scheduler event");
-
-  tor_assert(run_sched_ev);
-
-  event_add(run_sched_ev, EV_TIMEOUT, 1);
+  if (the_scheduler->on_channel_free) {
+    the_scheduler->on_channel_free(chan);
+  }
+  chan->scheduler_state = SCHED_CHAN_IDLE;
 }
-#endif
 
 /** Mark a channel as ready to accept writes */
 
 void
 scheduler_channel_wants_writes(channel_t *chan)
 {
-  int became_pending = 0;
-
-  tor_assert(chan);
-  tor_assert(channels_pending);
+  IF_BUG_ONCE(!chan) {
+    return;
+  }
+  IF_BUG_ONCE(!channels_pending) {
+    return;
+  }
 
   /* If it's already in waiting_to_write, we can put it in pending */
   if (chan->scheduler_state == SCHED_CHAN_WAITING_TO_WRITE) {
     /*
      * It can write now, so it goes to channels_pending.
      */
+    log_debug(LD_SCHED, "chan=%" PRIu64 " became pending",
+        chan->global_identifier);
     smartlist_pqueue_add(channels_pending,
                          scheduler_compare_channels,
                          offsetof(channel_t, sched_heap_idx),
@@ -588,7 +604,8 @@ scheduler_channel_wants_writes(channel_t *chan)
               "Channel " U64_FORMAT " at %p went from waiting_to_write "
               "to pending",
               U64_PRINTF_ARG(chan->global_identifier), chan);
-    became_pending = 1;
+    /* We just made a channel pending, we have scheduling work to do. */
+    the_scheduler->schedule();
   } else {
     /*
      * It's not in SCHED_CHAN_WAITING_TO_WRITE, so it can't become pending;
@@ -602,23 +619,19 @@ scheduler_channel_wants_writes(channel_t *chan)
                 U64_PRINTF_ARG(chan->global_identifier), chan);
     }
   }
-
-  /*
-   * If we made a channel pending, we potentially have scheduling work
-   * to do.
-   */
-  if (became_pending) scheduler_retrigger();
 }
 
-/**
- * Notify the scheduler that a channel's position in the pqueue may have
- * changed
- */
+#ifdef TOR_UNIT_TESTS
 
+/*
+ * Notify scheduler that a channel's queue position may have changed.
+ */
 void
 scheduler_touch_channel(channel_t *chan)
 {
-  tor_assert(chan);
+  IF_BUG_ONCE(!chan) {
+    return;
+  }
 
   if (chan->scheduler_state == SCHED_CHAN_PENDING) {
     /* Remove and re-add it */
@@ -634,105 +647,5 @@ scheduler_touch_channel(channel_t *chan)
   /* else no-op, since it isn't in the queue */
 }
 
-/**
- * Notify the scheduler of a queue size adjustment, to recalculate the
- * queue heuristic.
- */
-
-void
-scheduler_adjust_queue_size(channel_t *chan, int dir, uint64_t adj)
-{
-  time_t now = approx_time();
-
-  log_debug(LD_SCHED,
-            "Queue size adjustment by %s" U64_FORMAT " for channel "
-            U64_FORMAT,
-            (dir >= 0) ? "+" : "-",
-            U64_PRINTF_ARG(adj),
-            U64_PRINTF_ARG(chan->global_identifier));
-
-  /* Get the queue heuristic up to date */
-  scheduler_update_queue_heuristic(now);
-
-  /* Adjust as appropriate */
-  if (dir >= 0) {
-    /* Increasing it */
-    queue_heuristic += adj;
-  } else {
-    /* Decreasing it */
-    if (queue_heuristic > adj) queue_heuristic -= adj;
-    else queue_heuristic = 0;
-  }
-
-  log_debug(LD_SCHED,
-            "Queue heuristic is now " U64_FORMAT,
-            U64_PRINTF_ARG(queue_heuristic));
-}
-
-/**
- * Query the current value of the queue heuristic
- */
-
-STATIC uint64_t
-scheduler_get_queue_heuristic(void)
-{
-  time_t now = approx_time();
-
-  scheduler_update_queue_heuristic(now);
-
-  return queue_heuristic;
-}
-
-/**
- * Adjust the queue heuristic value to the present time
- */
-
-STATIC void
-scheduler_update_queue_heuristic(time_t now)
-{
-  time_t diff;
-
-  if (queue_heuristic_timestamp == 0) {
-    /*
-     * Nothing we can sensibly do; must not have been initted properly.
-     * Oh well.
-     */
-    queue_heuristic_timestamp = now;
-  } else if (queue_heuristic_timestamp < now) {
-    diff = now - queue_heuristic_timestamp;
-    /*
-     * This is a simple exponential age-out; the other proposed alternative
-     * was a linear age-out using the bandwidth history in rephist.c; I'm
-     * going with this out of concern that if an adversary can jam the
-     * scheduler long enough, it would cause the bandwidth to drop to
-     * zero and render the aging mechanism ineffective thereafter.
-     */
-    if (0 <= diff && diff < 64) queue_heuristic >>= diff;
-    else queue_heuristic = 0;
-
-    queue_heuristic_timestamp = now;
-
-    log_debug(LD_SCHED,
-              "Queue heuristic is now " U64_FORMAT,
-              U64_PRINTF_ARG(queue_heuristic));
-  }
-  /* else no update needed, or time went backward */
-}
-
-/**
- * Set scheduler watermarks and flush size
- */
-
-void
-scheduler_set_watermarks(uint32_t lo, uint32_t hi, uint32_t max_flush)
-{
-  /* Sanity assertions - caller should ensure these are true */
-  tor_assert(lo > 0);
-  tor_assert(hi > lo);
-  tor_assert(max_flush > 0);
-
-  sched_q_low_water = lo;
-  sched_q_high_water = hi;
-  sched_max_flush_cells = max_flush;
-}
+#endif /* TOR_UNIT_TESTS */
 

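To make the scheduler.c doc comment above concrete, this is a hedged sketch (not part of the patch) of how the rest of Tor drives the scheduling system: channels only report whether they can write and whether they have cells, and release themselves when closing. The call sites below are collected into one invented function for illustration:

    static void
    example_channel_lifecycle(channel_t *chan)
    {
      /* The transport says the outbuf has room: the channel may become
       * pending if it also has cells queued. */
      scheduler_channel_wants_writes(chan);
      /* A circuit queued a cell on this channel: it may become pending if it
       * can also write. */
      scheduler_channel_has_waiting_cells(chan);
      /* The lower layer is blocked on writes for the moment. */
      scheduler_channel_doesnt_want_writes(chan);
      /* The channel is closing; drop any scheduler state tied to it. */
      scheduler_release_channel(chan);
    }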
+ 175 - 28
src/or/scheduler.h

@@ -1,9 +1,9 @@
-/* * Copyright (c) 2013-2017, The Tor Project, Inc. */
+/* * Copyright (c) 2017, The Tor Project, Inc. */
 /* See LICENSE for licensing information */
 
 /**
  * \file scheduler.h
- * \brief Header file for scheduler.c
+ * \brief Header file for scheduler*.c
  **/
 
 #ifndef TOR_SCHEDULER_H
@@ -13,45 +13,192 @@
 #include "channel.h"
 #include "testsupport.h"
 
-/* Global-visibility scheduler functions */
+/*
+ * A scheduler implementation is a collection of function pointers. If you
+ * would like to add a new scheduler called foo, create scheduler_foo.c,
+ * implement at least the mandatory ones, and implement get_foo_scheduler()
+ * that returns a complete scheduler_t for your foo scheduler. See
+ * scheduler_kist.c for an example.
+ *
+ * These function pointers SHOULD NOT be used anywhere outside of the
+ * scheduling source files. The rest of Tor should communicate with the
+ * scheduling system through the functions near the bottom of this file, and
+ * those functions will call into the current scheduler implementation as
+ * necessary.
+ *
+ * If your scheduler doesn't need to implement something (for example: it
+ * doesn't create any state for itself, thus it has nothing to free when Tor
+ * is shutting down), then set that function pointer to NULL.
+ */
+typedef struct scheduler_s {
+  /* (Optional) To be called when we want to prepare a scheduler for use.
+   * Perhaps Tor just started and we are the lucky chosen scheduler, or
+   * perhaps Tor is switching to this scheduler. No matter the case, this is
+   * where we would prepare any state and initialize parameters. You might
+   * think of this as the opposite of free_all(). */
+  void (*init)(void);
 
-/* Set up and shut down the scheduler from main.c */
-void scheduler_free_all(void);
-void scheduler_init(void);
-MOCK_DECL(void, scheduler_run, (void));
+  /* (Optional) To be called when we want to tell the scheduler to delete all
+   * of its state (if any). Perhaps Tor is shutting down or perhaps we are
+   * switching schedulers. */
+  void (*free_all)(void);
 
-/* Mark channels as having cells or wanting/not wanting writes */
-MOCK_DECL(void,scheduler_channel_doesnt_want_writes,(channel_t *chan));
-MOCK_DECL(void,scheduler_channel_has_waiting_cells,(channel_t *chan));
-void scheduler_channel_wants_writes(channel_t *chan);
+  /* (Mandatory) Libevent controls the main event loop in Tor, and this is
+   * where we register with libevent the next execution of run_sched_ev [which
+   * ultimately calls run()]. */
+  void (*schedule)(void);
 
-/* Notify the scheduler of a channel being closed */
-MOCK_DECL(void,scheduler_release_channel,(channel_t *chan));
+  /* (Mandatory) This is the heart of a scheduler! This is where the
+   * excitement happens! Here libevent has given us the chance to execute, and
+   * we should do whatever we need to do in order to move some cells from
+   * their circuit queues to output buffers in an intelligent manner. We
+   * should do this quickly. When we are done, we'll call schedule() again if
+   * more work needs to be done to set up the next scheduling run. */
+  void (*run)(void);
 
-/* Notify scheduler of queue size adjustments */
-void scheduler_adjust_queue_size(channel_t *chan, int dir, uint64_t adj);
+  /*
+   * External event not related to the scheduler but that can influence it.
+   */
 
-/* Notify scheduler that a channel's queue position may have changed */
-void scheduler_touch_channel(channel_t *chan);
+  /* (Optional) To be called whenever Tor finds out about a new consensus.
+   * First the scheduling system as a whole will react to the new consensus
+   * and change the scheduler if needed. After that, the current scheduler
+   * (which might be new) will call this so it has the chance to react to the
+   * new consensus too. If there's a consensus parameter that your scheduler
+   * wants to keep an eye on, this is where you should check for it.  */
+  void (*on_new_consensus)(const networkstatus_t *old_c,
+                           const networkstatus_t *new_c);
+
+  /* (Optional) To be called when a channel is being freed. Sometimes channels
+   * go away (for example: the relay on the other end is shutting down). If
+   * the scheduler keeps any channel-specific state and has memory to free
+   * when channels go away, implement this and free it here. */
+  void (*on_channel_free)(const channel_t *);
+
+  /* (Optional) To be called whenever Tor is reloading configuration options.
+   * For example: SIGHUP was issued and Tor is rereading its torrc. A
+   * scheduler should use this as an opportunity to parse and cache torrc
+   * options so that it doesn't have to call get_options() all the time. */
+  void (*on_new_options)(void);
+} scheduler_t;
+
+/** Scheduler types. We build an ordered list of these values from the parsed
+ * strings in Schedulers so that we can select the scheduler at any time
+ * without having to parse strings again. */
+typedef enum {
+  SCHEDULER_VANILLA =   1,
+  SCHEDULER_KIST =      2,
+  SCHEDULER_KIST_LITE = 3,
+} scheduler_types_t;
 
-/* Adjust the watermarks from config file*/
-void scheduler_set_watermarks(uint32_t lo, uint32_t hi, uint32_t max_flush);
+/*****************************************************************************
+ * Globally visible scheduler variables/values
+ *
+ * These are variables/constants that all of Tor should be able to see.
+ *****************************************************************************/
 
-/* Things only scheduler.c and its test suite should see */
+/* Default interval that KIST runs (in ms). */
+#define KIST_SCHED_RUN_INTERVAL_DEFAULT 10
+/* Minimum interval that KIST runs. This value disables KIST. */
+#define KIST_SCHED_RUN_INTERVAL_MIN 0
+/* Maximum interval that KIST runs (in ms). */
+#define KIST_SCHED_RUN_INTERVAL_MAX 100
 
+/*****************************************************************************
+ * Globally visible scheduler functions
+ *
+ * These functions are how the rest of Tor communicates with the scheduling
+ * system.
+ *****************************************************************************/
+
+void scheduler_init(void);
+void scheduler_free_all(void);
+void scheduler_conf_changed(void);
+void scheduler_notify_networkstatus_changed(const networkstatus_t *old_c,
+                                            const networkstatus_t *new_c);
+MOCK_DECL(void, scheduler_release_channel, (channel_t *chan));
+
+/*
+ * Ways for a channel to interact with the scheduling system. A channel only
+ * really knows (i) whether or not it has cells it wants to send, and
+ * (ii) whether or not it would like to write.
+ */
+void scheduler_channel_wants_writes(channel_t *chan);
+MOCK_DECL(void, scheduler_channel_doesnt_want_writes, (channel_t *chan));
+MOCK_DECL(void, scheduler_channel_has_waiting_cells, (channel_t *chan));
+
+/*****************************************************************************
+ * Private scheduler functions
+ *
+ * These functions are only visible to the scheduling system, the current
+ * scheduler implementation, and tests.
+ *****************************************************************************/
 #ifdef SCHEDULER_PRIVATE_
-MOCK_DECL(STATIC int, scheduler_compare_channels,
+
+/*********************************
+ * Defined in scheduler.c
+ *********************************/
+smartlist_t *get_channels_pending(void);
+MOCK_DECL(int, scheduler_compare_channels,
           (const void *c1_v, const void *c2_v));
-STATIC uint64_t scheduler_get_queue_heuristic(void);
-STATIC void scheduler_update_queue_heuristic(time_t now);
+void scheduler_ev_active(int flags);
+void scheduler_ev_add(const struct timeval *next_run);
 
 #ifdef TOR_UNIT_TESTS
 extern smartlist_t *channels_pending;
 extern struct event *run_sched_ev;
-extern uint64_t queue_heuristic;
-extern time_t queue_heuristic_timestamp;
-#endif
-#endif
+extern const scheduler_t *the_scheduler;
+void scheduler_touch_channel(channel_t *chan);
+#endif /* TOR_UNIT_TESTS */
+
+/*********************************
+ * Defined in scheduler_kist.c
+ *********************************/
+
+#ifdef SCHEDULER_KIST_PRIVATE
+
+/* Socket table entry which holds a channel's socket and kernel TCP
+ * information. Only used by KIST. */
+typedef struct socket_table_ent_s {
+  HT_ENTRY(socket_table_ent_s) node;
+  const channel_t *chan;
+  /* Amount written this scheduling run */
+  uint64_t written;
+  /* Amount that can be written this scheduling run */
+  uint64_t limit;
+  /* TCP info from the kernel */
+  uint32_t cwnd;
+  uint32_t unacked;
+  uint32_t mss;
+  uint32_t notsent;
+} socket_table_ent_t;
+
+typedef HT_HEAD(outbuf_table_s, outbuf_table_ent_s) outbuf_table_t;
+
+MOCK_DECL(int, channel_should_write_to_kernel,
+          (outbuf_table_t *table, channel_t *chan));
+MOCK_DECL(void, channel_write_to_kernel, (channel_t *chan));
+MOCK_DECL(void, update_socket_info_impl, (socket_table_ent_t *ent));
+
+int scheduler_can_use_kist(void);
+void scheduler_kist_set_full_mode(void);
+void scheduler_kist_set_lite_mode(void);
+scheduler_t *get_kist_scheduler(void);
+int32_t kist_scheduler_run_interval(const networkstatus_t *ns);
+
+#ifdef TOR_UNIT_TESTS
+extern int32_t sched_run_interval;
+#endif /* TOR_UNIT_TESTS */
+
+#endif /* SCHEDULER_KIST_PRIVATE */
+
+/*********************************
+ * Defined in scheduler_vanilla.c
+ *********************************/
+
+scheduler_t *get_vanilla_scheduler(void);
+
+#endif /* SCHEDULER_PRIVATE_ */
 
-#endif /* !defined(TOR_SCHEDULER_H) */
+#endif /* TOR_SCHEDULER_H */
 

+ 774 - 0
src/or/scheduler_kist.c

@@ -0,0 +1,774 @@
+/* Copyright (c) 2017, The Tor Project, Inc. */
+/* See LICENSE for licensing information */
+
+#define SCHEDULER_KIST_PRIVATE
+
+#include <event2/event.h>
+#include <netinet/tcp.h>
+
+#include "or.h"
+#include "buffers.h"
+#include "config.h"
+#include "connection.h"
+#include "networkstatus.h"
+#define TOR_CHANNEL_INTERNAL_
+#include "channel.h"
+#include "channeltls.h"
+#define SCHEDULER_PRIVATE_
+#include "scheduler.h"
+
+#define TLS_PER_CELL_OVERHEAD 29
+
+#ifdef HAVE_KIST_SUPPORT
+/* Kernel interface needed for KIST. */
+#include <linux/sockios.h>
+#endif /* HAVE_KIST_SUPPORT */
+
+/*****************************************************************************
+ * Data structures and supporting functions
+ *****************************************************************************/
+
+#ifdef HAVE_KIST_SUPPORT
+/* Indicate if KIST lite mode is on or off. We can disable it at runtime.
+ * Important to have because of the possible KISTLite -> KIST transition. */
+static unsigned int kist_lite_mode = 0;
+/* Indicate if we don't have the kernel support. This can happen if the kernel
+ * changed and it doesn't recognize the values passed to the syscalls needed
+ * by KIST. In that case, fall back to the naive approach. */
+static unsigned int kist_no_kernel_support = 0;
+#else
+static unsigned int kist_no_kernel_support = 1;
+static unsigned int kist_lite_mode = 1;
+#endif
+
+/* Socket_table hash table stuff. The socket_table keeps track of per-socket
+ * limit information imposed by kist and used by kist. */
+
+static uint32_t
+socket_table_ent_hash(const socket_table_ent_t *ent)
+{
+  return (uint32_t)ent->chan->global_identifier;
+}
+
+static unsigned
+socket_table_ent_eq(const socket_table_ent_t *a, const socket_table_ent_t *b)
+{
+  return a->chan == b->chan;
+}
+
+typedef HT_HEAD(socket_table_s, socket_table_ent_s) socket_table_t;
+
+static socket_table_t socket_table = HT_INITIALIZER();
+
+HT_PROTOTYPE(socket_table_s, socket_table_ent_s, node, socket_table_ent_hash,
+             socket_table_ent_eq)
+HT_GENERATE2(socket_table_s, socket_table_ent_s, node, socket_table_ent_hash,
+             socket_table_ent_eq, 0.6, tor_reallocarray, tor_free_)
+
+/* outbuf_table hash table stuff. The outbuf_table keeps track of which
+ * channels have data sitting in their outbuf so the kist scheduler can force
+ * a write from outbuf to kernel periodically during a run and at the end of a
+ * run. */
+
+typedef struct outbuf_table_ent_s {
+  HT_ENTRY(outbuf_table_ent_s) node;
+  channel_t *chan;
+} outbuf_table_ent_t;
+
+static uint32_t
+outbuf_table_ent_hash(const outbuf_table_ent_t *ent)
+{
+  return (uint32_t)ent->chan->global_identifier;
+}
+
+static unsigned
+outbuf_table_ent_eq(const outbuf_table_ent_t *a, const outbuf_table_ent_t *b)
+{
+  return a->chan->global_identifier == b->chan->global_identifier;
+}
+
+HT_PROTOTYPE(outbuf_table_s, outbuf_table_ent_s, node, outbuf_table_ent_hash,
+             outbuf_table_ent_eq)
+HT_GENERATE2(outbuf_table_s, outbuf_table_ent_s, node, outbuf_table_ent_hash,
+             outbuf_table_ent_eq, 0.6, tor_reallocarray, tor_free_)
+
+/*****************************************************************************
+ * Other internal data
+ *****************************************************************************/
+
+/* Store the last time the scheduler was run so we can decide when to next run
+ * the scheduler based on it. */
+static monotime_t scheduler_last_run;
+/* This is a factor for the extra_space calculation in kist per-socket limits.
+ * It is the number of extra congestion windows we want to write to the kernel.
+ */
+static double sock_buf_size_factor = 1.0;
+/* How often the scheduler runs. */
+STATIC int32_t sched_run_interval = 10;
+
+/*****************************************************************************
+ * Internally called function implementations
+ *****************************************************************************/
+
+/* Little helper function to get the length of a channel's output buffer */
+static inline size_t
+channel_outbuf_length(channel_t *chan)
+{
+  /* In theory, this cannot happen because we cannot schedule a channel
+   * without a connection that has its outbuf initialized. Just in case,
+   * BUG() on this so we can understand a bit more why it happened. */
+  if (BUG(BASE_CHAN_TO_TLS(chan)->conn == NULL)) {
+    return 0;
+  }
+  return buf_datalen(TO_CONN(BASE_CHAN_TO_TLS(chan)->conn)->outbuf);
+}
+
+/* Little helper function for HT_FOREACH_FN. */
+static int
+each_channel_write_to_kernel(outbuf_table_ent_t *ent, void *data)
+{
+  (void) data; /* Make compiler happy. */
+  channel_write_to_kernel(ent->chan);
+  return 0; /* Returning non-zero removes the element from the table. */
+}
+
+/* Free the given outbuf table entry ent. */
+static int
+free_outbuf_info_by_ent(outbuf_table_ent_t *ent, void *data)
+{
+  (void) data; /* Make compiler happy. */
+  log_debug(LD_SCHED, "Freeing outbuf table entry from chan=%" PRIu64,
+            ent->chan->global_identifier);
+  tor_free(ent);
+  return 1; /* So HT_FOREACH_FN will remove the element */
+}
+
+/* Free the given socket table entry ent. */
+static int
+free_socket_info_by_ent(socket_table_ent_t *ent, void *data)
+{
+  (void) data; /* Make compiler happy. */
+  log_debug(LD_SCHED, "Freeing socket table entry from chan=%" PRIu64,
+            ent->chan->global_identifier);
+  tor_free(ent);
+  return 1; /* So HT_FOREACH_FN will remove the element */
+}
+
+/* Clean up socket_table. Probably because the KIST sched impl is going away */
+static void
+free_all_socket_info(void)
+{
+  HT_FOREACH_FN(socket_table_s, &socket_table, free_socket_info_by_ent, NULL);
+}
+
+static socket_table_ent_t *
+socket_table_search(socket_table_t *table, const channel_t *chan)
+{
+  socket_table_ent_t search, *ent = NULL;
+  search.chan = chan;
+  ent = HT_FIND(socket_table_s, table, &search);
+  return ent;
+}
+
+/* Free a socket entry in table for the given chan. */
+static void
+free_socket_info_by_chan(socket_table_t *table, const channel_t *chan)
+{
+  socket_table_ent_t *ent = NULL;
+  ent = socket_table_search(table, chan);
+  if (!ent)
+    return;
+  log_debug(LD_SCHED, "scheduler free socket info for chan=%" PRIu64,
+            chan->global_identifier);
+  HT_REMOVE(socket_table_s, table, ent);
+  free_socket_info_by_ent(ent, NULL);
+}
+
+/* Perform system calls for the given socket in order to calculate kist's
+ * per-socket limit as documented in the function body. */
+MOCK_IMPL(void,
+update_socket_info_impl, (socket_table_ent_t *ent))
+{
+#ifdef HAVE_KIST_SUPPORT
+  int64_t tcp_space, extra_space;
+  const tor_socket_t sock =
+    TO_CONN(BASE_CHAN_TO_TLS((channel_t *) ent->chan)->conn)->s;
+  struct tcp_info tcp;
+  socklen_t tcp_info_len = sizeof(tcp);
+
+  if (kist_no_kernel_support || kist_lite_mode) {
+    goto fallback;
+  }
+
+  /* Gather information */
+  if (getsockopt(sock, SOL_TCP, TCP_INFO, (void *)&(tcp), &tcp_info_len) < 0) {
+    if (errno == EINVAL) {
+      /* Oops, this option is not provided by the kernel, we'll have to
+       * disable KIST entirely. This can happen if tor was built on a machine
+       * with the support previously or if the kernel was updated and lost the
+       * support. */
+      log_notice(LD_SCHED, "Looks like our kernel doesn't have the support "
+                           "for KIST anymore. We will fallback to the naive "
+                           "approach. Set KISTSchedRunInterval=-1 to disable "
+                           "KIST.");
+      kist_no_kernel_support = 1;
+    }
+    goto fallback;
+  }
+  if (ioctl(sock, SIOCOUTQNSD, &(ent->notsent)) < 0) {
+    if (errno == EINVAL) {
+      log_notice(LD_SCHED, "Looks like our kernel doesn't have the support "
+                           "for KIST anymore. We will fallback to the naive "
+                           "approach. Set KISTSchedRunInterval=-1 to disable "
+                           "KIST.");
+      /* Same reason as the above. */
+      kist_no_kernel_support = 1;
+    }
+    goto fallback;
+  }
+  ent->cwnd = tcp.tcpi_snd_cwnd;
+  ent->unacked = tcp.tcpi_unacked;
+  ent->mss = tcp.tcpi_snd_mss;
+
+  /* In order to reduce outbound kernel queuing delays and thus improve Tor's
+   * ability to prioritize circuits, KIST wants to set a socket write limit that
+   * is near the amount that the socket would be able to immediately send into
+   * the Internet.
+   *
+   * We first calculate how much the socket could send immediately (assuming
+   * completely full packets) according to the congestion window and the number
+   * of unacked packets.
+   *
+   * Then we add a little extra space in a controlled way. We do this so that
+   * when the kernel gets ACKs back for data currently sitting in the "TCP
+   * space", it will already have some more data to send immediately. It will
+   * not have to wait for the scheduler to run again. The amount of extra space
+   * is a factor of the current congestion window. With the suggested
+   * sock_buf_size_factor value of 1.0, we allow at most 2*cwnd bytes to sit in
+   * the kernel: 1 cwnd on the wire waiting for ACKs and 1 cwnd ready and
+   * waiting to be sent when those ACKs finally come.
+   *
+   * In the below diagram, we see some bytes in the TCP-space (denoted by '*')
+   * that have been sent onto the wire and are waiting for ACKs. We have a
+   * little more room in "TCP space" that we can fill with data that will be
+   * immediately sent. We also see the "extra space" KIST calculates. The sum
+   * of the empty "TCP space" and the "extra space" is the kist-imposed write
+   * limit for this socket.
+   *
+   * <----------------kernel-outbound-socket-queue----------------|
+   * <*********---------------------------------------------------|
+   * <----TCP-space-----|----extra-space-----|
+   * <------------------|
+   *                    ^ ((cwnd - unacked) * mss) bytes
+   *                    |--------------------|
+   *                                         ^ ((cwnd * mss) * factor) bytes
+   */
+
+  /* Assuming all these values from the kernel are uint32_t still, they will
+   * always fit into an int64_t tcp_space variable. */
+  tcp_space = (ent->cwnd - ent->unacked) * ent->mss;
+  if (tcp_space < 0) {
+    tcp_space = 0;
+  }
+
+  /* The clamp_double_to_int64 makes sure the first part fits into an int64_t.
+   * In fact, if sock_buf_size_factor is still forced to be >= 0 in config.c,
+   * then that part is non-negative for sure. Then we subtract a uint32_t. At
+   * worst we end up negative, but then we just set extra_space to 0 in the
+   * sanity check. */
+  extra_space =
+    clamp_double_to_int64((ent->cwnd * ent->mss) * sock_buf_size_factor) -
+    ent->notsent;
+  if (extra_space < 0) {
+    extra_space = 0;
+  }
+
+  /* Finally we set the limit. Adding two positive int64_t values together
+   * will always fit in a uint64_t. */
+  ent->limit = (uint64_t)tcp_space + (uint64_t)extra_space;
+  return;
+
+#else /* HAVE_KIST_SUPPORT */
+  goto fallback;
+#endif /* HAVE_KIST_SUPPORT */
+
+ fallback:
+  /* If all of a sudden we don't have KIST support, we just zero out all the
+   * variables for this socket since we don't know what they should be. We
+   * also allow the socket to write as much as it wants to the kernel,
+   * effectively returning it to vanilla scheduler behavior. Writes are still
+   * limited by the lower layers of Tor: socket blocking, full outbuf, etc. */
+  ent->cwnd = ent->unacked = ent->mss = ent->notsent = 0;
+  ent->limit = INT_MAX;
+}
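To make the per-socket limit formula above concrete, here is a tiny standalone sketch. The numbers (cwnd, unacked, mss, notsent and the factor) are made up for illustration; they are not taken from this patch.

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
  /* Hypothetical kernel readings for one socket. */
  uint32_t cwnd = 10, unacked = 4, mss = 1448, notsent = 2000;
  double sock_buf_size_factor = 1.0;

  /* Room left in the congestion window: (10 - 4) * 1448 = 8688 bytes. */
  int64_t tcp_space = ((int64_t)cwnd - unacked) * mss;
  if (tcp_space < 0)
    tcp_space = 0;

  /* Extra space: (10 * 1448 * 1.0) - 2000 = 12480 bytes. */
  int64_t extra_space =
    (int64_t)((cwnd * mss) * sock_buf_size_factor) - notsent;
  if (extra_space < 0)
    extra_space = 0;

  /* KIST write limit for this scheduling run: 8688 + 12480 = 21168 bytes. */
  printf("limit = %" PRIu64 " bytes\n", (uint64_t)(tcp_space + extra_space));
  return 0;
}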
+
+/* Given a socket that isn't in the table, add it. Given a socket that is in
+ * the table, reinitialize the values that need resetting every scheduling
+ * run. */
+static void
+init_socket_info(socket_table_t *table, const channel_t *chan)
+{
+  socket_table_ent_t *ent = NULL;
+  ent = socket_table_search(table, chan);
+  if (!ent) {
+    log_debug(LD_SCHED, "scheduler init socket info for chan=%" PRIu64,
+              chan->global_identifier);
+    ent = tor_malloc_zero(sizeof(*ent));
+    ent->chan = chan;
+    HT_INSERT(socket_table_s, table, ent);
+  }
+  ent->written = 0;
+}
+
+/* Add chan to the outbuf table if it isn't already in it. If it is, then don't
+ * do anything */
+static void
+outbuf_table_add(outbuf_table_t *table, channel_t *chan)
+{
+  outbuf_table_ent_t search, *ent;
+  search.chan = chan;
+  ent = HT_FIND(outbuf_table_s, table, &search);
+  if (!ent) {
+    log_debug(LD_SCHED, "scheduler init outbuf info for chan=%" PRIu64,
+              chan->global_identifier);
+    ent = tor_malloc_zero(sizeof(*ent));
+    ent->chan = chan;
+    HT_INSERT(outbuf_table_s, table, ent);
+  }
+}
+
+static void
+outbuf_table_remove(outbuf_table_t *table, channel_t *chan)
+{
+  outbuf_table_ent_t search, *ent;
+  search.chan = chan;
+  ent = HT_FIND(outbuf_table_s, table, &search);
+  if (ent) {
+    HT_REMOVE(outbuf_table_s, table, ent);
+    free_outbuf_info_by_ent(ent, NULL);
+  }
+}
+
+/* Set the scheduler running interval. */
+static void
+set_scheduler_run_interval(const networkstatus_t *ns)
+{
+  int32_t old_sched_run_interval = sched_run_interval;
+  sched_run_interval = kist_scheduler_run_interval(ns);
+  if (old_sched_run_interval != sched_run_interval) {
+    log_info(LD_SCHED, "Scheduler KIST changing its running interval "
+                       "from %" PRId32 " to %" PRId32,
+             old_sched_run_interval, sched_run_interval);
+  }
+}
+
+/* Return true iff the socket associated with the given channel can still
+ * write to the kernel, that is, it hasn't reached its kist-imposed limit. */
+static int
+socket_can_write(socket_table_t *table, const channel_t *chan)
+{
+  socket_table_ent_t *ent = NULL;
+  ent = socket_table_search(table, chan);
+  IF_BUG_ONCE(!ent) {
+    return 1; // Just return true, saying that kist wouldn't limit the socket
+  }
+
+  /* We previously calculated a write limit for this socket. In the below
+   * calculation, first determine how much room is left in bytes. Then divide
+   * that by the amount of space a cell takes. If there's room for at least 1
+   * cell, then KIST will allow the socket to write. */
+  int64_t kist_limit_space =
+    (int64_t) (ent->limit - ent->written) /
+    (CELL_MAX_NETWORK_SIZE + TLS_PER_CELL_OVERHEAD);
+  return kist_limit_space > 0;
+}
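Continuing the hypothetical numbers from the limit sketch earlier: with limit = 21168 bytes, written = 15000 bytes, and a per-cell footprint of 514 + 29 = 543 bytes (taking CELL_MAX_NETWORK_SIZE to be 514, which is an assumption here), kist_limit_space = (21168 - 15000) / 543 = 11, so this socket would still be allowed to write this run.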
+
+/* Update the channel's socket kernel information. */
+static void
+update_socket_info(socket_table_t *table, const channel_t *chan)
+{
+  socket_table_ent_t *ent = NULL;
+  ent = socket_table_search(table, chan);
+  IF_BUG_ONCE(!ent) {
+    return; // Whelp. Entry didn't exist for some reason so nothing to do.
+  }
+  update_socket_info_impl(ent);
+}
+
+/* Increment the channel's socket written value by the number of bytes. */
+static void
+update_socket_written(socket_table_t *table, channel_t *chan, size_t bytes)
+{
+  socket_table_ent_t *ent = NULL;
+  ent = socket_table_search(table, chan);
+  IF_BUG_ONCE(!ent) {
+    return; // Whelp. Entry didn't exist so nothing to do.
+  }
+
+  log_debug(LD_SCHED, "chan=%" PRIu64 " wrote %lu bytes, old was %" PRIi64,
+            chan->global_identifier, bytes, ent->written);
+
+  ent->written += bytes;
+}
+
+/*
+ * A naive KIST impl would write every single cell all the way to the kernel.
+ * That would take a lot of system calls. A less bad KIST impl would write a
+ * channel's outbuf to the kernel only when we are switching to a different
+ * channel. But if we have two channels with equal priority, we end up writing
+ * one cell for each and bouncing back and forth. This KIST impl avoids that
+ * by only writing a channel's outbuf to the kernel if it has 8 cells or more
+ * in it.
+ */
+MOCK_IMPL(int, channel_should_write_to_kernel,
+          (outbuf_table_t *table, channel_t *chan))
+{
+  outbuf_table_add(table, chan);
+  /* CELL_MAX_NETWORK_SIZE * 8 because we only want to write the outbuf to the
+   * kernel if there's 8 or more cells waiting */
+  return channel_outbuf_length(chan) > (CELL_MAX_NETWORK_SIZE * 8);
+}
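For a sense of scale: taking CELL_MAX_NETWORK_SIZE to be 514 bytes (an assumption for this illustration), the outbuf has to hold more than 514 * 8 = 4112 bytes of queued data before a mid-run write to the kernel is considered worthwhile.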
+
+/* Little helper function to write a channel's outbuf all the way to the
+ * kernel */
+MOCK_IMPL(void, channel_write_to_kernel, (channel_t *chan))
+{
+  log_debug(LD_SCHED, "Writing %lu bytes to kernel for chan %" PRIu64,
+            channel_outbuf_length(chan), chan->global_identifier);
+  connection_handle_write(TO_CONN(BASE_CHAN_TO_TLS(chan)->conn), 0);
+}
+
+/* Return true iff the scheduler has work to perform. */
+static int
+have_work(void)
+{
+  smartlist_t *cp = get_channels_pending();
+  IF_BUG_ONCE(!cp) {
+    return 0; // channels_pending doesn't exist so... no work?
+  }
+  return smartlist_len(cp) > 0;
+}
+
+/* Function of the scheduler interface: free_all() */
+static void
+kist_free_all(void)
+{
+  free_all_socket_info();
+}
+
+/* Function of the scheduler interface: on_channel_free() */
+static void
+kist_on_channel_free(const channel_t *chan)
+{
+  free_socket_info_by_chan(&socket_table, chan);
+}
+
+/* Function of the scheduler interface: on_new_consensus() */
+static void
+kist_scheduler_on_new_consensus(const networkstatus_t *old_c,
+                                const networkstatus_t *new_c)
+{
+  (void) old_c;
+  (void) new_c;
+
+  set_scheduler_run_interval(new_c);
+}
+
+/* Function of the scheduler interface: on_new_options() */
+static void
+kist_scheduler_on_new_options(void)
+{
+  sock_buf_size_factor = get_options()->KISTSockBufSizeFactor;
+
+  /* Calls kist_scheduler_run_interval which calls get_options(). */
+  set_scheduler_run_interval(NULL);
+}
+
+/* Function of the scheduler interface: init() */
+static void
+kist_scheduler_init(void)
+{
+  kist_scheduler_on_new_options();
+  IF_BUG_ONCE(sched_run_interval <= 0) {
+    log_warn(LD_SCHED, "We are initing the KIST scheduler and noticed the "
+             "KISTSchedRunInterval is telling us to not use KIST. That's "
+             "weird! We'll continue using KIST, but at %dms.",
+             KIST_SCHED_RUN_INTERVAL_DEFAULT);
+    sched_run_interval = KIST_SCHED_RUN_INTERVAL_DEFAULT;
+  }
+}
+
+/* Function of the scheduler interface: schedule() */
+static void
+kist_scheduler_schedule(void)
+{
+  struct monotime_t now;
+  struct timeval next_run;
+  int32_t diff;
+
+  if (!have_work()) {
+    return;
+  }
+  monotime_get(&now);
+  diff = (int32_t) monotime_diff_msec(&scheduler_last_run, &now);
+  if (diff < sched_run_interval) {
+    next_run.tv_sec = 0;
+    /* 1000 for ms -> us */
+    next_run.tv_usec = (sched_run_interval - diff) * 1000;
+    /* Re-adding an event reschedules it. It does not duplicate it. */
+    scheduler_ev_add(&next_run);
+  } else {
+    scheduler_ev_active(EV_TIMEOUT);
+  }
+}
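A made-up timing example: with sched_run_interval = 10 ms and the last run having finished 4 ms ago, diff = 4, so the event is re-added to fire in (10 - 4) * 1000 = 6000 microseconds; had 12 ms already elapsed, the timeout event would instead be activated immediately.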
+
+/* Function of the scheduler interface: run() */
+static void
+kist_scheduler_run(void)
+{
+  /* Define variables */
+  channel_t *chan = NULL; // current working channel
+  /* The last distinct chan served in a sched loop. */
+  channel_t *prev_chan = NULL;
+  int flush_result; // temporarily store results from flush calls
+  /* Channels to be re-added to pending at the end */
+  smartlist_t *to_readd = NULL;
+  smartlist_t *cp = get_channels_pending();
+
+  outbuf_table_t outbuf_table = HT_INITIALIZER();
+
+  /* For each pending channel, collect new kernel information */
+  SMARTLIST_FOREACH_BEGIN(cp, const channel_t *, pchan) {
+      init_socket_info(&socket_table, pchan);
+      update_socket_info(&socket_table, pchan);
+  } SMARTLIST_FOREACH_END(pchan);
+
+  log_debug(LD_SCHED, "Running the scheduler. %d channels pending",
+            smartlist_len(cp));
+
+  /* The main scheduling loop. Loop until there are no more pending channels */
+  while (smartlist_len(cp) > 0) {
+    /* get best channel */
+    chan = smartlist_pqueue_pop(cp, scheduler_compare_channels,
+                                offsetof(channel_t, sched_heap_idx));
+    IF_BUG_ONCE(!chan) {
+      /* Some-freaking-how a NULL got into the channels_pending. That should
+       * never happen, but it should be harmless to ignore it and keep looping.
+       */
+      continue;
+    }
+    outbuf_table_add(&outbuf_table, chan);
+
+    /* if we have switched to a new channel, consider writing the previous
+     * channel's outbuf to the kernel. */
+    if (!prev_chan) {
+      prev_chan = chan;
+    }
+    if (prev_chan != chan) {
+      if (channel_should_write_to_kernel(&outbuf_table, prev_chan)) {
+        channel_write_to_kernel(prev_chan);
+        outbuf_table_remove(&outbuf_table, prev_chan);
+      }
+      prev_chan = chan;
+    }
+
+    /* Only flush and write if the per-socket limit hasn't been hit */
+    if (socket_can_write(&socket_table, chan)) {
+      /* flush to channel queue/outbuf */
+      flush_result = (int)channel_flush_some_cells(chan, 1); // 1 for num cells
+      /* flush_result has the # cells flushed */
+      if (flush_result > 0) {
+        update_socket_written(&socket_table, chan, flush_result *
+                              (CELL_MAX_NETWORK_SIZE + TLS_PER_CELL_OVERHEAD));
+      }
+      /* XXX What if we didn't flush? */
+    }
+
+    /* Decide what to do with the channel now */
+
+    if (!channel_more_to_flush(chan) &&
+        !socket_can_write(&socket_table, chan)) {
+
+      /* Case 1: no more cells to send, and cannot write */
+
+      /*
+       * You might think we should put the channel in SCHED_CHAN_IDLE. And
+       * you're probably correct. While implementing KIST, we found that the
+       * scheduling system would sometimes lose track of channels when we did
+       * that. We suspect it has to do with the difference between "can't
+       * write because socket/outbuf is full" and KIST's "can't write because
+       * we've arbitrarily decided that that's enough for now." Sometimes
+       * channels run out of cells at the same time they hit their
+       * kist-imposed write limit and maybe the rest of Tor doesn't put the
+       * channel back in pending when it is supposed to.
+       *
+       * This should be investigated again. It is as simple as changing
+       * SCHED_CHAN_WAITING_FOR_CELLS to SCHED_CHAN_IDLE and seeing if Tor
+       * starts having serious throughput issues. Best done in shadow/chutney.
+       */
+      chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
+      log_debug(LD_SCHED, "chan=%" PRIu64 " now waiting_for_cells",
+                chan->global_identifier);
+    } else if (!channel_more_to_flush(chan)) {
+
+      /* Case 2: no more cells to send, but still open for writes */
+
+      chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
+      log_debug(LD_SCHED, "chan=%" PRIu64 " now waiting_for_cells",
+                chan->global_identifier);
+    } else if (!socket_can_write(&socket_table, chan)) {
+
+      /* Case 3: cells to send, but cannot write */
+
+      /*
+       * We want to write, but can't. If we left the channel in
+       * channels_pending, we would never exit the scheduling loop. We need to
+       * add it to a temporary list of channels to be added to channels_pending
+       * after the scheduling loop is over. They can hopefully be taken care of
+       * in the next scheduling round.
+       */
+      chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
+      if (!to_readd) {
+        to_readd = smartlist_new();
+      }
+      smartlist_add(to_readd, chan);
+      log_debug(LD_SCHED, "chan=%" PRIu64 " now waiting_to_write",
+                chan->global_identifier);
+    } else {
+
+      /* Case 4: cells to send, and still open for writes */
+
+      chan->scheduler_state = SCHED_CHAN_PENDING;
+      smartlist_pqueue_add(cp, scheduler_compare_channels,
+                           offsetof(channel_t, sched_heap_idx), chan);
+    }
+  } /* End of main scheduling loop */
+
+  /* Write the outbuf of any channels that still have data */
+  HT_FOREACH_FN(outbuf_table_s, &outbuf_table, each_channel_write_to_kernel,
+                NULL);
+  /* We are done with it. */
+  HT_FOREACH_FN(outbuf_table_s, &outbuf_table, free_outbuf_info_by_ent, NULL);
+  HT_CLEAR(outbuf_table_s, &outbuf_table);
+
+  log_debug(LD_SCHED, "len pending=%d, len to_readd=%d",
+            smartlist_len(cp),
+            (to_readd ? smartlist_len(to_readd) : -1));
+
+  /* Readd any channels we need to */
+  if (to_readd) {
+    SMARTLIST_FOREACH_BEGIN(to_readd, channel_t *, readd_chan) {
+      readd_chan->scheduler_state = SCHED_CHAN_PENDING;
+      if (!smartlist_contains(cp, readd_chan)) {
+        smartlist_pqueue_add(cp, scheduler_compare_channels,
+                             offsetof(channel_t, sched_heap_idx), readd_chan);
+      }
+    } SMARTLIST_FOREACH_END(readd_chan);
+    smartlist_free(to_readd);
+  }
+
+  monotime_get(&scheduler_last_run);
+}
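The four cases decided at the bottom of the loop above can be summarized as follows (this is only a restatement of the code, not new behavior):

  more to flush?   socket can write?   resulting state
  no               no                  SCHED_CHAN_WAITING_FOR_CELLS  (case 1; see the IDLE note)
  no               yes                 SCHED_CHAN_WAITING_FOR_CELLS  (case 2)
  yes              no                  SCHED_CHAN_WAITING_TO_WRITE, re-added next round  (case 3)
  yes              yes                 SCHED_CHAN_PENDING, pushed back on the pqueue  (case 4)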
+
+/*****************************************************************************
+ * Externally called function implementations not called through scheduler_t
+ *****************************************************************************/
+
+/* Stores the kist scheduler function pointers. */
+static scheduler_t kist_scheduler = {
+  .free_all = kist_free_all,
+  .on_channel_free = kist_on_channel_free,
+  .init = kist_scheduler_init,
+  .on_new_consensus = kist_scheduler_on_new_consensus,
+  .schedule = kist_scheduler_schedule,
+  .run = kist_scheduler_run,
+  .on_new_options = kist_scheduler_on_new_options,
+};
+
+/* Return the KIST scheduler object. If it didn't exist, return a newly
+ * allocated one; note that init() is not called on it. */
+scheduler_t *
+get_kist_scheduler(void)
+{
+  return &kist_scheduler;
+}
+
+/* Check the torrc for the configured KIST scheduler run interval.
+ * - If torrc < 0, then return the negative torrc value (shouldn't even be
+ *   using KIST)
+ * - If torrc > 0, then return the positive torrc value (should use KIST, and
+ *   should use the set value)
+ * - If torrc == 0, then look in the consensus for what the value should be.
+ *   - If == 0, then return -1 (don't use KIST)
+ *   - If > 0, then return the positive consensus value
+ *   - If consensus doesn't say anything, return 10 milliseconds
+ */
+int32_t
+kist_scheduler_run_interval(const networkstatus_t *ns)
+{
+  int32_t run_interval = (int32_t)get_options()->KISTSchedRunInterval;
+  if (run_interval != 0) {
+    log_debug(LD_SCHED, "Found KISTSchedRunInterval in torrc. Using that.");
+    return run_interval;
+  }
+
+  log_debug(LD_SCHED, "Turning to the consensus for KISTSchedRunInterval");
+  run_interval = networkstatus_get_param(ns, "KISTSchedRunInterval",
+                                         KIST_SCHED_RUN_INTERVAL_DEFAULT,
+                                         KIST_SCHED_RUN_INTERVAL_MIN,
+                                         KIST_SCHED_RUN_INTERVAL_MAX);
+  if (run_interval <= 0)
+    return -1;
+  return run_interval;
+}
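To illustrate the precedence described above, these torrc lines (the values are only examples) map onto the three branches:

KISTSchedRunInterval 2     # torrc > 0: use KIST and run it every 2 ms
KISTSchedRunInterval -1    # torrc < 0: do not use KIST at all
KISTSchedRunInterval 0     # torrc == 0: defer to the consensus parameter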
+
+/* Set KISTLite mode, that is, KIST without kernel support. */
+void
+scheduler_kist_set_lite_mode(void)
+{
+  kist_lite_mode = 1;
+  log_info(LD_SCHED,
+           "Setting KIST scheduler without kernel support (KISTLite mode)");
+}
+
+/* Set KIST mode, that is, KIST with kernel support. */
+void
+scheduler_kist_set_full_mode(void)
+{
+  kist_lite_mode = 0;
+  log_info(LD_SCHED,
+           "Setting KIST scheduler with kernel support (KIST mode)");
+}
+
+#ifdef HAVE_KIST_SUPPORT
+
+/* Return true iff the scheduler subsystem should use KIST. */
+int
+scheduler_can_use_kist(void)
+{
+  if (kist_no_kernel_support) {
+    /* We have no kernel support so we can't use KIST. */
+    return 0;
+  }
+
+  /* We do have the support. Now check the run interval, which the torrc or
+   * the consensus can set to a non-positive value to disable KIST. */
+  int64_t run_interval = kist_scheduler_run_interval(NULL);
+  log_debug(LD_SCHED, "Determined KIST sched_run_interval should be "
+                      "%" PRId64 ". Can%s use KIST.",
+           run_interval, (run_interval > 0 ? "" : " not"));
+  return run_interval > 0;
+}
+
+#else /* HAVE_KIST_SUPPORT */
+
+int
+scheduler_can_use_kist(void)
+{
+  return 0;
+}
+
+#endif /* HAVE_KIST_SUPPORT */
+

+ 196 - 0
src/or/scheduler_vanilla.c

@@ -0,0 +1,196 @@
+/* Copyright (c) 2017, The Tor Project, Inc. */
+/* See LICENSE for licensing information */
+
+#include <event2/event.h>
+
+#include "or.h"
+#include "config.h"
+#define TOR_CHANNEL_INTERNAL_
+#include "channel.h"
+#define SCHEDULER_PRIVATE_
+#include "scheduler.h"
+
+/*****************************************************************************
+ * Other internal data
+ *****************************************************************************/
+
+/* Maximum cells to flush in a single call to channel_flush_some_cells(). */
+#define MAX_FLUSH_CELLS 1000
+
+/*****************************************************************************
+ * Externally called function implementations
+ *****************************************************************************/
+
+/* Return true iff the scheduler has work to perform. */
+static int
+have_work(void)
+{
+  smartlist_t *cp = get_channels_pending();
+  IF_BUG_ONCE(!cp) {
+    return 0; // channels_pending doesn't exist so... no work?
+  }
+  return smartlist_len(cp) > 0;
+}
+
+/* Retrigger the scheduler in a way safe to use from the callback */
+static void
+vanilla_scheduler_schedule(void)
+{
+  if (!have_work()) {
+    return;
+  }
+
+  /* Activate our event so it can process channels. */
+  scheduler_ev_active(EV_TIMEOUT);
+}
+
+static void
+vanilla_scheduler_run(void)
+{
+  int n_cells, n_chans_before, n_chans_after;
+  ssize_t flushed, flushed_this_time;
+  smartlist_t *cp = get_channels_pending();
+  smartlist_t *to_readd = NULL;
+  channel_t *chan = NULL;
+
+  log_debug(LD_SCHED, "We have a chance to run the scheduler");
+
+  n_chans_before = smartlist_len(cp);
+
+  while (smartlist_len(cp) > 0) {
+    /* Pop off a channel */
+    chan = smartlist_pqueue_pop(cp,
+                                scheduler_compare_channels,
+                                offsetof(channel_t, sched_heap_idx));
+    IF_BUG_ONCE(!chan) {
+      /* Some-freaking-how a NULL got into the channels_pending. That should
+       * never happen, but it should be harmless to ignore it and keep looping.
+       */
+      continue;
+    }
+
+    /* Figure out how many cells we can write */
+    n_cells = channel_num_cells_writeable(chan);
+    if (n_cells > 0) {
+      log_debug(LD_SCHED,
+                "Scheduler saw pending channel " U64_FORMAT " at %p with "
+                "%d cells writeable",
+                U64_PRINTF_ARG(chan->global_identifier), chan, n_cells);
+
+      flushed = 0;
+      while (flushed < n_cells) {
+        flushed_this_time =
+          channel_flush_some_cells(chan,
+                        MIN(MAX_FLUSH_CELLS, (size_t) n_cells - flushed));
+        if (flushed_this_time <= 0) break;
+        flushed += flushed_this_time;
+      }
+
+      if (flushed < n_cells) {
+        /* We ran out of cells to flush */
+        chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
+        log_debug(LD_SCHED,
+                  "Channel " U64_FORMAT " at %p "
+                  "entered waiting_for_cells from pending",
+                  U64_PRINTF_ARG(chan->global_identifier),
+                  chan);
+      } else {
+        /* The channel may still have some cells */
+        if (channel_more_to_flush(chan)) {
+          /* The channel goes to either pending or waiting_to_write */
+          if (channel_num_cells_writeable(chan) > 0) {
+            /* Add it back to pending later */
+            if (!to_readd) to_readd = smartlist_new();
+            smartlist_add(to_readd, chan);
+            log_debug(LD_SCHED,
+                      "Channel " U64_FORMAT " at %p "
+                      "is still pending",
+                      U64_PRINTF_ARG(chan->global_identifier),
+                      chan);
+          } else {
+            /* It's waiting to be able to write more */
+            chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
+            log_debug(LD_SCHED,
+                      "Channel " U64_FORMAT " at %p "
+                      "entered waiting_to_write from pending",
+                      U64_PRINTF_ARG(chan->global_identifier),
+                      chan);
+          }
+        } else {
+          /* No cells left; it can go to idle or waiting_for_cells */
+          if (channel_num_cells_writeable(chan) > 0) {
+            /*
+             * It can still accept writes, so it goes to
+             * waiting_for_cells
+             */
+            chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
+            log_debug(LD_SCHED,
+                      "Channel " U64_FORMAT " at %p "
+                      "entered waiting_for_cells from pending",
+                      U64_PRINTF_ARG(chan->global_identifier),
+                      chan);
+          } else {
+            /*
+             * We exactly filled up the output queue with all available
+             * cells; go to idle.
+             */
+            chan->scheduler_state = SCHED_CHAN_IDLE;
+            log_debug(LD_SCHED,
+                      "Channel " U64_FORMAT " at %p "
+                      "become idle from pending",
+                      U64_PRINTF_ARG(chan->global_identifier),
+                      chan);
+          }
+        }
+      }
+
+      log_debug(LD_SCHED,
+                "Scheduler flushed %d cells onto pending channel "
+                U64_FORMAT " at %p",
+                (int)flushed, U64_PRINTF_ARG(chan->global_identifier),
+                chan);
+    } else {
+      log_info(LD_SCHED,
+               "Scheduler saw pending channel " U64_FORMAT " at %p with "
+               "no cells writeable",
+               U64_PRINTF_ARG(chan->global_identifier), chan);
+      /* Put it back to WAITING_TO_WRITE */
+      chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
+    }
+  }
+
+  /* Readd any channels we need to */
+  if (to_readd) {
+    SMARTLIST_FOREACH_BEGIN(to_readd, channel_t *, readd_chan) {
+      readd_chan->scheduler_state = SCHED_CHAN_PENDING;
+      smartlist_pqueue_add(cp,
+                           scheduler_compare_channels,
+                           offsetof(channel_t, sched_heap_idx),
+                           readd_chan);
+    } SMARTLIST_FOREACH_END(readd_chan);
+    smartlist_free(to_readd);
+  }
+
+  n_chans_after = smartlist_len(cp);
+  log_debug(LD_SCHED, "Scheduler handled %d of %d pending channels",
+            n_chans_before - n_chans_after, n_chans_before);
+}
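For comparison with KIST, the loop above resolves each channel's next state as follows (again only a summary of the code):

  no cells writeable at all this round            -> SCHED_CHAN_WAITING_TO_WRITE
  flushed fewer cells than writeable (ran dry)    -> SCHED_CHAN_WAITING_FOR_CELLS
  more to flush, channel still writeable          -> re-added to pending (SCHED_CHAN_PENDING)
  more to flush, channel no longer writeable      -> SCHED_CHAN_WAITING_TO_WRITE
  nothing left to flush, channel still writeable  -> SCHED_CHAN_WAITING_FOR_CELLS
  nothing left to flush, channel not writeable    -> SCHED_CHAN_IDLE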
+
+/* Stores the vanilla scheduler function pointers. */
+static scheduler_t vanilla_scheduler = {
+  .free_all = NULL,
+  .on_channel_free = NULL,
+  .init = NULL,
+  .on_new_consensus = NULL,
+  .schedule = vanilla_scheduler_schedule,
+  .run = vanilla_scheduler_run,
+  .on_new_options = NULL,
+};
+
+scheduler_t *
+get_vanilla_scheduler(void)
+{
+  return &vanilla_scheduler;
+}
+

+ 68 - 227
src/test/test_options.c

@@ -401,9 +401,8 @@ fixed_get_uname(void)
   "ClientUseIPv4 1\n"                                                     \
   "VirtualAddrNetworkIPv4 127.192.0.0/10\n"                             \
   "VirtualAddrNetworkIPv6 [FE80::]/10\n"                                \
-  "SchedulerHighWaterMark__ 42\n"                                       \
-  "SchedulerLowWaterMark__ 10\n"                                        \
-  "UseEntryGuards 1\n"
+  "UseEntryGuards 1\n"                                                  \
+  "Schedulers Vanilla\n"
 
 typedef struct {
   or_options_t *old_opt;
@@ -507,7 +506,7 @@ test_options_validate__uname_for_server(void *ignored)
   fixed_get_uname_result = "Windows 2000";
   mock_clean_saved_logs();
   options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
-  expect_log_entry();
+  expect_no_log_entry();
   tor_free(msg);
 
  done:
@@ -593,13 +592,14 @@ test_options_validate__nickname(void *ignored)
   tdata = get_options_test_data("Nickname AMoreValidNick");
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
-  tt_assert(!msg);
+  tt_str_op(msg, OP_EQ, "ConnLimit must be greater than 0, but was set to 0");
+  tor_free(msg);
 
   free_options_test_data(tdata);
   tdata = get_options_test_data("DataDirectory /tmp/somewhere");
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
-  tt_assert(!msg);
+  tt_str_op(msg, OP_EQ, "ConnLimit must be greater than 0, but was set to 0");
 
  done:
   free_options_test_data(tdata);
@@ -749,13 +749,13 @@ test_options_validate__authdir(void *ignored)
   mock_clean_saved_logs();
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
-  tt_assert(!msg);
+  tt_str_op(msg, OP_EQ, "Authoritative directory servers must set "
+                        "ContactInfo");
+  tor_free(msg);
 
   free_options_test_data(tdata);
   tdata = get_options_test_data("AuthoritativeDirectory 1\n"
-                                "Address 100.200.10.1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "Address 100.200.10.1\n");
   mock_clean_saved_logs();
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -766,9 +766,7 @@ test_options_validate__authdir(void *ignored)
   free_options_test_data(tdata);
   tdata = get_options_test_data("AuthoritativeDirectory 1\n"
                                 "Address 100.200.10.1\n"
-                                "TestingTorNetwork 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "TestingTorNetwork 1\n");
   mock_clean_saved_logs();
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -779,9 +777,7 @@ test_options_validate__authdir(void *ignored)
   free_options_test_data(tdata);
   tdata = get_options_test_data("AuthoritativeDirectory 1\n"
                                 "Address 100.200.10.1\n"
-                                "ContactInfo hello@hello.com\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ContactInfo hello@hello.com\n");
   mock_clean_saved_logs();
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -793,9 +789,7 @@ test_options_validate__authdir(void *ignored)
   tdata = get_options_test_data("AuthoritativeDirectory 1\n"
                                 "Address 100.200.10.1\n"
                                 "RecommendedVersions 1.2, 3.14\n"
-                                "ContactInfo hello@hello.com\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ContactInfo hello@hello.com\n");
   mock_clean_saved_logs();
   options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_str_op(tdata->opt->RecommendedClientVersions->value, OP_EQ, "1.2, 3.14");
@@ -808,9 +802,7 @@ test_options_validate__authdir(void *ignored)
                                 "RecommendedVersions 1.2, 3.14\n"
                                 "RecommendedClientVersions 25\n"
                                 "RecommendedServerVersions 4.18\n"
-                                "ContactInfo hello@hello.com\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ContactInfo hello@hello.com\n");
   mock_clean_saved_logs();
   options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_str_op(tdata->opt->RecommendedClientVersions->value, OP_EQ, "25");
@@ -824,9 +816,7 @@ test_options_validate__authdir(void *ignored)
                                 "RecommendedVersions 1.2, 3.14\n"
                                 "RecommendedClientVersions 25\n"
                                 "RecommendedServerVersions 4.18\n"
-                                "ContactInfo hello@hello.com\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ContactInfo hello@hello.com\n");
   mock_clean_saved_logs();
   options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_str_op(msg, OP_EQ, "AuthoritativeDir is set, but none of (Bridge/V3)"
@@ -838,9 +828,7 @@ test_options_validate__authdir(void *ignored)
                                 "Address 100.200.10.1\n"
                                 "VersioningAuthoritativeDirectory 1\n"
                                 "RecommendedServerVersions 4.18\n"
-                                "ContactInfo hello@hello.com\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ContactInfo hello@hello.com\n");
   mock_clean_saved_logs();
   options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_str_op(msg, OP_EQ, "Versioning authoritative dir servers must set "
@@ -852,9 +840,7 @@ test_options_validate__authdir(void *ignored)
                                 "Address 100.200.10.1\n"
                                 "VersioningAuthoritativeDirectory 1\n"
                                 "RecommendedClientVersions 4.18\n"
-                                "ContactInfo hello@hello.com\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ContactInfo hello@hello.com\n");
   mock_clean_saved_logs();
   options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_str_op(msg, OP_EQ, "Versioning authoritative dir servers must set "
@@ -865,9 +851,7 @@ test_options_validate__authdir(void *ignored)
   tdata = get_options_test_data("AuthoritativeDirectory 1\n"
                                 "Address 100.200.10.1\n"
                                 "UseEntryGuards 1\n"
-                                "ContactInfo hello@hello.com\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ContactInfo hello@hello.com\n");
   mock_clean_saved_logs();
   options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   expect_log_msg("Authoritative directory servers "
@@ -879,9 +863,7 @@ test_options_validate__authdir(void *ignored)
   tdata = get_options_test_data("AuthoritativeDirectory 1\n"
                                 "Address 100.200.10.1\n"
                                 "V3AuthoritativeDir 1\n"
-                                "ContactInfo hello@hello.com\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ContactInfo hello@hello.com\n");
   mock_clean_saved_logs();
   options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   expect_log_msg("Authoritative directories always try"
@@ -894,9 +876,7 @@ test_options_validate__authdir(void *ignored)
                                 "Address 100.200.10.1\n"
                                 "DownloadExtraInfo 1\n"
                                 "V3AuthoritativeDir 1\n"
-                                "ContactInfo hello@hello.com\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ContactInfo hello@hello.com\n");
   mock_clean_saved_logs();
   options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   expect_no_log_msg("Authoritative directories always try"
@@ -907,9 +887,7 @@ test_options_validate__authdir(void *ignored)
   free_options_test_data(tdata);
   tdata = get_options_test_data("AuthoritativeDirectory 1\n"
                                 "Address 100.200.10.1\n"
-                                "ContactInfo hello@hello.com\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ContactInfo hello@hello.com\n");
   mock_clean_saved_logs();
   options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_str_op(msg, OP_EQ, "AuthoritativeDir is set, but none of (Bridge/V3)"
@@ -921,9 +899,7 @@ test_options_validate__authdir(void *ignored)
                                 "Address 100.200.10.1\n"
                                 "BridgeAuthoritativeDir 1\n"
                                 "ContactInfo hello@hello.com\n"
-                                "V3BandwidthsFile non-existant-file\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "V3BandwidthsFile non-existant-file\n");
   mock_clean_saved_logs();
   options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_str_op(msg, OP_EQ,
@@ -935,9 +911,7 @@ test_options_validate__authdir(void *ignored)
                                 "Address 100.200.10.1\n"
                                 "BridgeAuthoritativeDir 1\n"
                                 "ContactInfo hello@hello.com\n"
-                                "V3BandwidthsFile non-existant-file\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "V3BandwidthsFile non-existant-file\n");
   mock_clean_saved_logs();
   options_validate(NULL, tdata->opt, tdata->def_opt, 0, &msg);
   tt_str_op(msg, OP_EQ,
@@ -949,9 +923,7 @@ test_options_validate__authdir(void *ignored)
                                 "Address 100.200.10.1\n"
                                 "BridgeAuthoritativeDir 1\n"
                                 "ContactInfo hello@hello.com\n"
-                                "GuardfractionFile non-existant-file\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "GuardfractionFile non-existant-file\n");
   mock_clean_saved_logs();
   options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_str_op(msg, OP_EQ,
@@ -963,9 +935,7 @@ test_options_validate__authdir(void *ignored)
                                 "Address 100.200.10.1\n"
                                 "BridgeAuthoritativeDir 1\n"
                                 "ContactInfo hello@hello.com\n"
-                                "GuardfractionFile non-existant-file\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "GuardfractionFile non-existant-file\n");
   mock_clean_saved_logs();
   options_validate(NULL, tdata->opt, tdata->def_opt, 0, &msg);
   tt_str_op(msg, OP_EQ,
@@ -976,9 +946,7 @@ test_options_validate__authdir(void *ignored)
   tdata = get_options_test_data("AuthoritativeDirectory 1\n"
                                 "Address 100.200.10.1\n"
                                 "BridgeAuthoritativeDir 1\n"
-                                "ContactInfo hello@hello.com\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ContactInfo hello@hello.com\n");
   mock_clean_saved_logs();
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -991,9 +959,7 @@ test_options_validate__authdir(void *ignored)
                                 "Address 100.200.10.1\n"
                                 "DirPort 999\n"
                                 "BridgeAuthoritativeDir 1\n"
-                                "ContactInfo hello@hello.com\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ContactInfo hello@hello.com\n");
   mock_clean_saved_logs();
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1010,9 +976,7 @@ test_options_validate__authdir(void *ignored)
   /*                               "ORPort 888\n" */
   /*                               "ClientOnly 1\n" */
   /*                               "BridgeAuthoritativeDir 1\n" */
-  /*                               "ContactInfo hello@hello.com\n" */
-  /*                               "SchedulerHighWaterMark__ 42\n" */
-  /*                               "SchedulerLowWaterMark__ 10\n"); */
+  /*                               "ContactInfo hello@hello.com\n" ); */
   /* mock_clean_saved_logs(); */
   /* ret = options_validate(tdata->old_opt, tdata->opt, */
   /*                        tdata->def_opt, 0, &msg); */
@@ -1156,9 +1120,7 @@ test_options_validate__transproxy(void *ignored)
                                 "TransPort 127.0.0.1:123\n");
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
-  if (msg) {
-    TT_DIE(("Expected NULL but got '%s'", msg));
-  }
+  tt_str_op(msg, OP_EQ, "ConnLimit must be greater than 0, but was set to 0");
 #elif defined(KERNEL_MAY_SUPPORT_IPFW)
   tdata = get_options_test_data("TransProxyType ipfw\n"
                                 "TransPort 127.0.0.1:123\n");
@@ -1260,9 +1222,7 @@ test_options_validate__exclude_nodes(void *ignored)
 
   free_options_test_data(tdata);
   tdata = get_options_test_data("ExcludeNodes {cn}\n"
-                                "StrictNodes 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "StrictNodes 1\n");
   mock_clean_saved_logs();
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1273,9 +1233,7 @@ test_options_validate__exclude_nodes(void *ignored)
   tor_free(msg);
 
   free_options_test_data(tdata);
-  tdata = get_options_test_data("ExcludeNodes {cn}\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+  tdata = get_options_test_data("ExcludeNodes {cn}\n");
   mock_clean_saved_logs();
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1292,49 +1250,6 @@ test_options_validate__exclude_nodes(void *ignored)
   tor_free(msg);
 }
 
-static void
-test_options_validate__scheduler(void *ignored)
-{
-  (void)ignored;
-  int ret;
-  char *msg;
-  setup_capture_of_logs(LOG_DEBUG);
-  options_test_data_t *tdata = get_options_test_data(
-                                            "SchedulerLowWaterMark__ 0\n");
-
-  ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
-  tt_int_op(ret, OP_EQ, -1);
-  expect_log_msg("Bad SchedulerLowWaterMark__ option\n");
-  tor_free(msg);
-
-  // TODO: this test cannot run on platforms where UINT32_MAX == UINT64_MAX.
-  // I suspect it's unlikely this branch can actually happen
-  /* free_options_test_data(tdata); */
-  /* tdata = get_options_test_data( */
-  /*                      "SchedulerLowWaterMark 10000000000000000000\n"); */
-  /* tdata->opt->SchedulerLowWaterMark__ = (uint64_t)UINT32_MAX; */
-  /* tdata->opt->SchedulerLowWaterMark__++; */
-  /* mock_clean_saved_logs(); */
-  /* ret = options_validate(tdata->old_opt, tdata->opt, */
-  /*                        tdata->def_opt, 0, &msg); */
-  /* tt_int_op(ret, OP_EQ, -1); */
-  /* expect_log_msg("Bad SchedulerLowWaterMark__ option\n"); */
-
-  free_options_test_data(tdata);
-  tdata = get_options_test_data("SchedulerLowWaterMark__ 42\n"
-                                "SchedulerHighWaterMark__ 42\n");
-  mock_clean_saved_logs();
-  ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
-  tt_int_op(ret, OP_EQ, -1);
-  expect_log_msg("Bad SchedulerHighWaterMark option\n");
-  tor_free(msg);
-
- done:
-  teardown_capture_of_logs();
-  free_options_test_data(tdata);
-  tor_free(msg);
-}
-
 static void
 test_options_validate__node_families(void *ignored)
 {
@@ -1343,9 +1258,7 @@ test_options_validate__node_families(void *ignored)
   char *msg;
   options_test_data_t *tdata = get_options_test_data(
                                      "NodeFamily flux, flax\n"
-                                     "NodeFamily somewhere\n"
-                                     "SchedulerHighWaterMark__ 42\n"
-                                     "SchedulerLowWaterMark__ 10\n");
+                                     "NodeFamily somewhere\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1363,8 +1276,7 @@ test_options_validate__node_families(void *ignored)
   tor_free(msg);
 
   free_options_test_data(tdata);
-  tdata = get_options_test_data("SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+  tdata = get_options_test_data("");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1372,9 +1284,7 @@ test_options_validate__node_families(void *ignored)
   tor_free(msg);
 
   free_options_test_data(tdata);
-  tdata = get_options_test_data("NodeFamily !flux\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+  tdata = get_options_test_data("NodeFamily !flux\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1423,9 +1333,7 @@ test_options_validate__recommended_packages(void *ignored)
   setup_capture_of_logs(LOG_WARN);
   options_test_data_t *tdata = get_options_test_data(
             "RecommendedPackages foo 1.2 http://foo.com sha1=123123123123\n"
-            "RecommendedPackages invalid-package-line\n"
-            "SchedulerHighWaterMark__ 42\n"
-            "SchedulerLowWaterMark__ 10\n");
+            "RecommendedPackages invalid-package-line\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1447,9 +1355,7 @@ test_options_validate__fetch_dir(void *ignored)
   char *msg;
   options_test_data_t *tdata = get_options_test_data(
                                             "FetchDirInfoExtraEarly 1\n"
-                                            "FetchDirInfoEarly 0\n"
-                                            "SchedulerHighWaterMark__ 42\n"
-                                            "SchedulerLowWaterMark__ 10\n");
+                                            "FetchDirInfoEarly 0\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1459,9 +1365,7 @@ test_options_validate__fetch_dir(void *ignored)
 
   free_options_test_data(tdata);
   tdata = get_options_test_data("FetchDirInfoExtraEarly 1\n"
-                                "FetchDirInfoEarly 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "FetchDirInfoEarly 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1481,9 +1385,7 @@ test_options_validate__conn_limit(void *ignored)
   int ret;
   char *msg;
   options_test_data_t *tdata = get_options_test_data(
-                                            "ConnLimit 0\n"
-                                            "SchedulerHighWaterMark__ 42\n"
-                                            "SchedulerLowWaterMark__ 10\n");
+                                            "ConnLimit 0\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1491,9 +1393,7 @@ test_options_validate__conn_limit(void *ignored)
   tor_free(msg);
 
   free_options_test_data(tdata);
-  tdata = get_options_test_data("ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+  tdata = get_options_test_data("ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1515,9 +1415,7 @@ test_options_validate__paths_needed(void *ignored)
   setup_capture_of_logs(LOG_WARN);
   options_test_data_t *tdata = get_options_test_data(
                                       "PathsNeededToBuildCircuits 0.1\n"
-                                      "ConnLimit 1\n"
-                                      "SchedulerHighWaterMark__ 42\n"
-                                      "SchedulerLowWaterMark__ 10\n");
+                                      "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1530,9 +1428,7 @@ test_options_validate__paths_needed(void *ignored)
   free_options_test_data(tdata);
   mock_clean_saved_logs();
   tdata = get_options_test_data("PathsNeededToBuildCircuits 0.99\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1545,9 +1441,7 @@ test_options_validate__paths_needed(void *ignored)
   free_options_test_data(tdata);
   mock_clean_saved_logs();
   tdata = get_options_test_data("PathsNeededToBuildCircuits 0.91\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1570,9 +1464,7 @@ test_options_validate__max_client_circuits(void *ignored)
   char *msg;
   options_test_data_t *tdata = get_options_test_data(
                                            "MaxClientCircuitsPending 0\n"
-                                           "ConnLimit 1\n"
-                                           "SchedulerHighWaterMark__ 42\n"
-                                           "SchedulerLowWaterMark__ 10\n");
+                                           "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1582,9 +1474,7 @@ test_options_validate__max_client_circuits(void *ignored)
 
   free_options_test_data(tdata);
   tdata = get_options_test_data("MaxClientCircuitsPending 1025\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1594,9 +1484,7 @@ test_options_validate__max_client_circuits(void *ignored)
 
   free_options_test_data(tdata);
   tdata = get_options_test_data("MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1617,9 +1505,7 @@ test_options_validate__ports(void *ignored)
   options_test_data_t *tdata = get_options_test_data(
                                       "FirewallPorts 65537\n"
                                       "MaxClientCircuitsPending 1\n"
-                                      "ConnLimit 1\n"
-                                      "SchedulerHighWaterMark__ 42\n"
-                                      "SchedulerLowWaterMark__ 10\n");
+                                      "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1630,9 +1516,7 @@ test_options_validate__ports(void *ignored)
   tdata = get_options_test_data("FirewallPorts 1\n"
                                 "LongLivedPorts 124444\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1644,9 +1528,7 @@ test_options_validate__ports(void *ignored)
                                 "LongLivedPorts 2\n"
                                 "RejectPlaintextPorts 112233\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1659,9 +1541,7 @@ test_options_validate__ports(void *ignored)
                                 "RejectPlaintextPorts 3\n"
                                 "WarnPlaintextPorts 65536\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1674,9 +1554,7 @@ test_options_validate__ports(void *ignored)
                                 "RejectPlaintextPorts 3\n"
                                 "WarnPlaintextPorts 4\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1698,9 +1576,7 @@ test_options_validate__reachable_addresses(void *ignored)
   options_test_data_t *tdata = get_options_test_data(
                                      "FascistFirewall 1\n"
                                      "MaxClientCircuitsPending 1\n"
-                                     "ConnLimit 1\n"
-                                     "SchedulerHighWaterMark__ 42\n"
-                                     "SchedulerLowWaterMark__ 10\n");
+                                     "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1718,9 +1594,7 @@ test_options_validate__reachable_addresses(void *ignored)
                                 "ReachableDirAddresses *:81\n"
                                 "ReachableORAddresses *:444\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
   tdata->opt->FirewallPorts = smartlist_new();
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1734,9 +1608,7 @@ test_options_validate__reachable_addresses(void *ignored)
   tdata = get_options_test_data("FascistFirewall 1\n"
                                 "FirewallPort 123\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1753,9 +1625,7 @@ test_options_validate__reachable_addresses(void *ignored)
                                 "ReachableAddresses *:83\n"
                                 "ReachableAddresses reject *:*\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1771,9 +1641,7 @@ test_options_validate__reachable_addresses(void *ignored)
   tdata = get_options_test_data("ReachableAddresses *:82\n"
                                 "ORPort 127.0.0.1:5555\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1784,9 +1652,7 @@ test_options_validate__reachable_addresses(void *ignored)
   tdata = get_options_test_data("ReachableORAddresses *:82\n"
                                 "ORPort 127.0.0.1:5555\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1797,9 +1663,7 @@ test_options_validate__reachable_addresses(void *ignored)
   tdata = get_options_test_data("ReachableDirAddresses *:82\n"
                                 "ORPort 127.0.0.1:5555\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1810,9 +1674,7 @@ test_options_validate__reachable_addresses(void *ignored)
   tdata = get_options_test_data("ClientUseIPv4 0\n"
                                 "ORPort 127.0.0.1:5555\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1910,9 +1772,7 @@ test_options_validate__use_bridges(void *ignored)
                                    "ClientUseIPv4 1\n"
                                    "ORPort 127.0.0.1:5555\n"
                                    "MaxClientCircuitsPending 1\n"
-                                   "ConnLimit 1\n"
-                                   "SchedulerHighWaterMark__ 42\n"
-                                   "SchedulerLowWaterMark__ 10\n");
+                                   "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1923,9 +1783,7 @@ test_options_validate__use_bridges(void *ignored)
   free_options_test_data(tdata);
   tdata = get_options_test_data("UseBridges 1\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -1938,9 +1796,7 @@ test_options_validate__use_bridges(void *ignored)
   tdata = get_options_test_data("UseBridges 1\n"
                                 "EntryNodes {cn}\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -2000,9 +1856,7 @@ test_options_validate__entry_nodes(void *ignored)
                                          "EntryNodes {cn}\n"
                                          "UseEntryGuards 0\n"
                                          "MaxClientCircuitsPending 1\n"
-                                         "ConnLimit 1\n"
-                                         "SchedulerHighWaterMark__ 42\n"
-                                         "SchedulerLowWaterMark__ 10\n");
+                                         "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -2014,9 +1868,7 @@ test_options_validate__entry_nodes(void *ignored)
   tdata = get_options_test_data("EntryNodes {cn}\n"
                                 "UseEntryGuards 1\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -2037,9 +1889,7 @@ test_options_validate__safe_logging(void *ignored)
   char *msg;
   options_test_data_t *tdata = get_options_test_data(
                                             "MaxClientCircuitsPending 1\n"
-                                            "ConnLimit 1\n"
-                                            "SchedulerHighWaterMark__ 42\n"
-                                            "SchedulerLowWaterMark__ 10\n");
+                                            "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -2049,9 +1899,7 @@ test_options_validate__safe_logging(void *ignored)
   free_options_test_data(tdata);
   tdata = get_options_test_data("SafeLogging 0\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -2061,9 +1909,7 @@ test_options_validate__safe_logging(void *ignored)
   free_options_test_data(tdata);
   tdata = get_options_test_data("SafeLogging Relay\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -2073,9 +1919,7 @@ test_options_validate__safe_logging(void *ignored)
   free_options_test_data(tdata);
   tdata = get_options_test_data("SafeLogging 1\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -2085,9 +1929,7 @@ test_options_validate__safe_logging(void *ignored)
   free_options_test_data(tdata);
   tdata = get_options_test_data("SafeLogging stuffy\n"
                                 "MaxClientCircuitsPending 1\n"
-                                "ConnLimit 1\n"
-                                "SchedulerHighWaterMark__ 42\n"
-                                "SchedulerLowWaterMark__ 10\n");
+                                "ConnLimit 1\n");
 
   ret = options_validate(tdata->old_opt, tdata->opt, tdata->def_opt, 0, &msg);
   tt_int_op(ret, OP_EQ, -1);
@@ -4416,7 +4258,6 @@ struct testcase_t options_tests[] = {
   LOCAL_VALIDATE_TEST(relay_with_hidden_services),
   LOCAL_VALIDATE_TEST(transproxy),
   LOCAL_VALIDATE_TEST(exclude_nodes),
-  LOCAL_VALIDATE_TEST(scheduler),
   LOCAL_VALIDATE_TEST(node_families),
   LOCAL_VALIDATE_TEST(token_bucket),
   LOCAL_VALIDATE_TEST(recommended_packages),

+ 502 - 190
src/test/test_scheduler.c

@@ -6,11 +6,16 @@
 #include <math.h>
 #include <event2/event.h>
 
+#define SCHEDULER_KIST_PRIVATE
 #define TOR_CHANNEL_INTERNAL_
 #define CHANNEL_PRIVATE_
 #include "or.h"
+#include "config.h"
 #include "compat_libevent.h"
 #include "channel.h"
+#include "channeltls.h"
+#include "connection.h"
+#include "networkstatus.h"
 #define SCHEDULER_PRIVATE_
 #include "scheduler.h"
 
@@ -18,53 +23,89 @@
 #include "test.h"
 #include "fakechans.h"
 
-/* Event base for scheduelr tests */
-static struct event_base *mock_event_base = NULL;
-
-/* Statics controlling mocks */
-static circuitmux_t *mock_ccm_tgt_1 = NULL;
-static circuitmux_t *mock_ccm_tgt_2 = NULL;
+/* Shamelessly stolen from compat_libevent.c */
+#define V(major, minor, patch) \
+  (((major) << 24) | ((minor) << 16) | ((patch) << 8))
 
-static circuitmux_t *mock_cgp_tgt_1 = NULL;
-static circuitmux_policy_t *mock_cgp_val_1 = NULL;
-static circuitmux_t *mock_cgp_tgt_2 = NULL;
-static circuitmux_policy_t *mock_cgp_val_2 = NULL;
+/******************************************************************************
+ * Statistical info
+ *****************************************************************************/
 static int scheduler_compare_channels_mock_ctr = 0;
 static int scheduler_run_mock_ctr = 0;
 
-static void channel_flush_some_cells_mock_free_all(void);
-static void channel_flush_some_cells_mock_set(channel_t *chan,
-                                              ssize_t num_cells);
+/******************************************************************************
+ * Utility functions and things we need to mock
+ *****************************************************************************/
+static or_options_t mocked_options;
+static const or_options_t *
+mock_get_options(void)
+{
+  return &mocked_options;
+}
 
-/* Setup for mock event stuff */
-static void mock_event_free_all(void);
-static void mock_event_init(void);
+static void
+cleanup_scheduler_options(void)
+{
+  if (mocked_options.SchedulerTypes_) {
+    SMARTLIST_FOREACH(mocked_options.SchedulerTypes_, int *, i, tor_free(i));
+    smartlist_free(mocked_options.SchedulerTypes_);
+    mocked_options.SchedulerTypes_ = NULL;
+  }
+}
 
-/* Mocks used by scheduler tests */
-static ssize_t channel_flush_some_cells_mock(channel_t *chan,
-                                             ssize_t num_cells);
-static int circuitmux_compare_muxes_mock(circuitmux_t *cmux_1,
-                                         circuitmux_t *cmux_2);
-static const circuitmux_policy_t * circuitmux_get_policy_mock(
-    circuitmux_t *cmux);
-static int scheduler_compare_channels_mock(const void *c1_v,
-                                           const void *c2_v);
-static void scheduler_run_noop_mock(void);
-static struct event_base * tor_libevent_get_base_mock(void);
-
-/* Scheduler test cases */
-static void test_scheduler_channel_states(void *arg);
-static void test_scheduler_compare_channels(void *arg);
-static void test_scheduler_initfree(void *arg);
-static void test_scheduler_loop(void *arg);
-static void test_scheduler_queue_heuristic(void *arg);
-
-/* Mock event init/free */
+static void
+set_scheduler_options(int val)
+{
+  int *type;
 
-/* Shamelessly stolen from compat_libevent.c */
-#define V(major, minor, patch) \
-  (((major) << 24) | ((minor) << 16) | ((patch) << 8))
+  if (mocked_options.SchedulerTypes_ == NULL) {
+    mocked_options.SchedulerTypes_ = smartlist_new();
+  }
+  type = tor_malloc_zero(sizeof(int));
+  *type = val;
+  smartlist_add(mocked_options.SchedulerTypes_, type);
+}
+
+static void
+clear_options(void)
+{
+  cleanup_scheduler_options();
+  memset(&mocked_options, 0, sizeof(mocked_options));
+}
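+
+/* Typical usage of these helpers in the tests below (a usage sketch, not
+ * part of the scheduler API):
+ *
+ *   MOCK(get_options, mock_get_options);
+ *   clear_options();                           // reset mocked_options
+ *   mocked_options.KISTSchedRunInterval = -1;  // e.g. force-disable KIST
+ *   set_scheduler_options(SCHEDULER_VANILLA);  // build SchedulerTypes_ list
+ *   scheduler_init();
+ *   ...
+ *   UNMOCK(get_options);
+ *   cleanup_scheduler_options();
+ */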
 
+static int32_t
+mock_vanilla_networkstatus_get_param(
+    const networkstatus_t *ns, const char *param_name, int32_t default_val,
+    int32_t min_val, int32_t max_val)
+{
+  (void)ns;
+  (void)default_val;
+  (void)min_val;
+  (void)max_val;
+  // only support KISTSchedRunInterval right now
+  tor_assert(strcmp(param_name, "KISTSchedRunInterval")==0);
+  return -1;
+}
+
+static int32_t
+mock_kist_networkstatus_get_param(
+    const networkstatus_t *ns, const char *param_name, int32_t default_val,
+    int32_t min_val, int32_t max_val)
+{
+  (void)ns;
+  (void)default_val;
+  (void)min_val;
+  (void)max_val;
+  // only support KISTSchedRunInterval right now
+  tor_assert(strcmp(param_name, "KISTSchedRunInterval")==0);
+  return 12;
+}
+
+/* Event base for scheduler tests */
+static struct event_base *mock_event_base = NULL;
+/* Setup for mock event stuff */
+static void mock_event_free_all(void);
+static void mock_event_init(void);
 static void
 mock_event_free_all(void)
 {
@@ -110,7 +151,84 @@ mock_event_init(void)
   return;
 }
 
-/* Mocks */
+static struct event_base *
+tor_libevent_get_base_mock(void)
+{
+  return mock_event_base;
+}
+
+static int
+scheduler_compare_channels_mock(const void *c1_v,
+                                const void *c2_v)
+{
+  uintptr_t p1, p2;
+
+  p1 = (uintptr_t)(c1_v);
+  p2 = (uintptr_t)(c2_v);
+
+  ++scheduler_compare_channels_mock_ctr;
+
+  if (p1 == p2) return 0;
+  else if (p1 < p2) return 1;
+  else return -1;
+}
+
+static void
+scheduler_run_noop_mock(void)
+{
+  ++scheduler_run_mock_ctr;
+}
+
+static circuitmux_t *mock_ccm_tgt_1 = NULL;
+static circuitmux_t *mock_ccm_tgt_2 = NULL;
+static circuitmux_t *mock_cgp_tgt_1 = NULL;
+static circuitmux_policy_t *mock_cgp_val_1 = NULL;
+static circuitmux_t *mock_cgp_tgt_2 = NULL;
+static circuitmux_policy_t *mock_cgp_val_2 = NULL;
+
+static const circuitmux_policy_t *
+circuitmux_get_policy_mock(circuitmux_t *cmux)
+{
+  const circuitmux_policy_t *result = NULL;
+
+  tt_assert(cmux != NULL);
+  if (cmux) {
+    if (cmux == mock_cgp_tgt_1) result = mock_cgp_val_1;
+    else if (cmux == mock_cgp_tgt_2) result = mock_cgp_val_2;
+    else result = circuitmux_get_policy__real(cmux);
+  }
+
+ done:
+  return result;
+}
+
+static int
+circuitmux_compare_muxes_mock(circuitmux_t *cmux_1,
+                              circuitmux_t *cmux_2)
+{
+  int result = 0;
+
+  tt_assert(cmux_1 != NULL);
+  tt_assert(cmux_2 != NULL);
+
+  if (cmux_1 != cmux_2) {
+    if (cmux_1 == mock_ccm_tgt_1 && cmux_2 == mock_ccm_tgt_2) result = -1;
+    else if (cmux_1 == mock_ccm_tgt_2 && cmux_2 == mock_ccm_tgt_1) {
+      result = 1;
+    } else {
+      if (cmux_1 == mock_ccm_tgt_1 || cmux_1 == mock_ccm_tgt_2) result = -1;
+      else if (cmux_2 == mock_ccm_tgt_1 || cmux_2 == mock_ccm_tgt_2) {
+        result = 1;
+      } else {
+        result = circuitmux_compare_muxes__real(cmux_1, cmux_2);
+      }
+    }
+  }
+  /* else result = 0 always */
+
+ done:
+  return result;
+}
 
 typedef struct {
   const channel_t *chan;
@@ -174,6 +292,67 @@ channel_flush_some_cells_mock_set(channel_t *chan, ssize_t num_cells)
   }
 }
 
+static int
+channel_more_to_flush_mock(channel_t *chan)
+{
+  tor_assert(chan);
+
+  flush_mock_channel_t *found_mock_ch = NULL;
+
+  /* Check if we have any queued */
+  if (! TOR_SIMPLEQ_EMPTY(&chan->incoming_queue))
+      return 1;
+
+  SMARTLIST_FOREACH_BEGIN(chans_for_flush_mock,
+                          flush_mock_channel_t *,
+                          flush_mock_ch) {
+    if (flush_mock_ch != NULL && flush_mock_ch->chan != NULL) {
+      if (flush_mock_ch->chan == chan) {
+        /* Found it */
+        found_mock_ch = flush_mock_ch;
+        break;
+      }
+    } else {
+      /* That shouldn't be there... */
+      SMARTLIST_DEL_CURRENT(chans_for_flush_mock, flush_mock_ch);
+      tor_free(flush_mock_ch);
+    }
+  } SMARTLIST_FOREACH_END(flush_mock_ch);
+
+  tor_assert(found_mock_ch);
+
+  /* Check if any circuits would like to queue some cells */
+  /* Special for the mock: return the number of cells (instead of 1), or zero
+   * if there is nothing to flush */
+  return (found_mock_ch->cells > 0 ? (int)found_mock_ch->cells : 0);
+}
+
+static void
+channel_write_to_kernel_mock(channel_t *chan)
+{
+  (void)chan;
+  //log_debug(LD_SCHED, "chan=%d writing to kernel",
+  //    (int)chan->global_identifier);
+}
+
+static int
+channel_should_write_to_kernel_mock(outbuf_table_t *ot, channel_t *chan)
+{
+  (void)ot;
+  (void)chan;
+  return 1;
+  /* We could make this more complicated if we wanted. But I don't think doing
+   * so tests much of anything */
+  //static int called_counter = 0;
+  //if (++called_counter >= 3) {
+  //  called_counter -= 3;
+  //  log_debug(LD_SCHED, "chan=%d should write to kernel",
+  //      (int)chan->global_identifier);
+  //  return 1;
+  //}
+  //return 0;
+}
+
 static ssize_t
 channel_flush_some_cells_mock(channel_t *chan, ssize_t num_cells)
 {
@@ -215,11 +394,6 @@ channel_flush_some_cells_mock(channel_t *chan, ssize_t num_cells)
 
         flushed += max;
         found->cells -= max;
-
-        if (found->cells <= 0) {
-          smartlist_remove(chans_for_flush_mock, found);
-          tor_free(found);
-        }
       }
     }
   }
@@ -228,90 +402,26 @@ channel_flush_some_cells_mock(channel_t *chan, ssize_t num_cells)
   return flushed;
 }
 
-static int
-circuitmux_compare_muxes_mock(circuitmux_t *cmux_1,
-                              circuitmux_t *cmux_2)
-{
-  int result = 0;
-
-  tt_ptr_op(cmux_1, OP_NE, NULL);
-  tt_ptr_op(cmux_2, OP_NE, NULL);
-
-  if (cmux_1 != cmux_2) {
-    if (cmux_1 == mock_ccm_tgt_1 && cmux_2 == mock_ccm_tgt_2) result = -1;
-    else if (cmux_1 == mock_ccm_tgt_2 && cmux_2 == mock_ccm_tgt_1) {
-      result = 1;
-    } else {
-      if (cmux_1 == mock_ccm_tgt_1 || cmux_1 == mock_ccm_tgt_2) result = -1;
-      else if (cmux_2 == mock_ccm_tgt_1 || cmux_2 == mock_ccm_tgt_2) {
-        result = 1;
-      } else {
-        result = circuitmux_compare_muxes__real(cmux_1, cmux_2);
-      }
-    }
-  }
-  /* else result = 0 always */
-
- done:
-  return result;
-}
-
-static const circuitmux_policy_t *
-circuitmux_get_policy_mock(circuitmux_t *cmux)
-{
-  const circuitmux_policy_t *result = NULL;
-
-  tt_ptr_op(cmux, OP_NE, NULL);
-  if (cmux) {
-    if (cmux == mock_cgp_tgt_1) result = mock_cgp_val_1;
-    else if (cmux == mock_cgp_tgt_2) result = mock_cgp_val_2;
-    else result = circuitmux_get_policy__real(cmux);
-  }
-
- done:
-  return result;
-}
-
-static int
-scheduler_compare_channels_mock(const void *c1_v,
-                                const void *c2_v)
-{
-  uintptr_t p1, p2;
-
-  p1 = (uintptr_t)(c1_v);
-  p2 = (uintptr_t)(c2_v);
-
-  ++scheduler_compare_channels_mock_ctr;
-
-  if (p1 == p2) return 0;
-  else if (p1 < p2) return 1;
-  else return -1;
-}
-
 static void
-scheduler_run_noop_mock(void)
+update_socket_info_impl_mock(socket_table_ent_t *ent)
 {
-  ++scheduler_run_mock_ctr;
+  ent->cwnd = ent->unacked = ent->mss = ent->notsent = 0;
+  ent->limit = INT_MAX;
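+  /* (Everything zeroed and limit set to INT_MAX: presumably this makes KIST
+   * treat each socket as having effectively unlimited room to write during
+   * these tests.) */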
 }
 
-static struct event_base *
-tor_libevent_get_base_mock(void)
-{
-  return mock_event_base;
-}
-
-/* Test cases */
-
 static void
-test_scheduler_channel_states(void *arg)
+perform_channel_state_tests(int KISTSchedRunInterval, int sched_type)
 {
   channel_t *ch1 = NULL, *ch2 = NULL;
   int old_count;
 
-  (void)arg;
+  /* setup options so we're sure about what sched we are running */
+  MOCK(get_options, mock_get_options);
+  clear_options();
+  mocked_options.KISTSchedRunInterval = KISTSchedRunInterval;
+  set_scheduler_options(sched_type);
 
   /* Set up libevent and scheduler */
-
   mock_event_init();
   MOCK(tor_libevent_get_base, tor_libevent_get_base_mock);
   scheduler_init();
@@ -324,7 +434,7 @@ test_scheduler_channel_states(void *arg)
    * Disable scheduler_run so we can just check the state transitions
    * without having to make everything it might call work too.
    */
-  MOCK(scheduler_run, scheduler_run_noop_mock);
+  ((scheduler_t *) the_scheduler)->run = scheduler_run_noop_mock;
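+  /* (The run callback is swapped directly on the scheduler_t object rather
+   * than MOCK()ed: with the pluggable-scheduler refactor each scheduler type
+   * provides its own run() implementation through this struct.) */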
 
   tt_int_op(smartlist_len(channels_pending), OP_EQ, 0);
 
@@ -351,7 +461,7 @@ test_scheduler_channel_states(void *arg)
   channel_register(ch2);
   tt_assert(ch2->registered);
 
-  /* Send it to SCHED_CHAN_WAITING_TO_WRITE */
+  /* Send ch1 to SCHED_CHAN_WAITING_TO_WRITE */
   scheduler_channel_has_waiting_cells(ch1);
   tt_int_op(ch1->scheduler_state, OP_EQ, SCHED_CHAN_WAITING_TO_WRITE);
 
@@ -415,8 +525,9 @@ test_scheduler_channel_states(void *arg)
   tor_free(ch2);
 
   UNMOCK(scheduler_compare_channels);
-  UNMOCK(scheduler_run);
   UNMOCK(tor_libevent_get_base);
+  UNMOCK(get_options);
+  cleanup_scheduler_options();
 
   return;
 }
@@ -502,40 +613,22 @@ test_scheduler_compare_channels(void *arg)
   return;
 }
 
-static void
-test_scheduler_initfree(void *arg)
-{
-  (void)arg;
-
-  tt_ptr_op(channels_pending, OP_EQ, NULL);
-  tt_ptr_op(run_sched_ev, OP_EQ, NULL);
-
-  mock_event_init();
-  MOCK(tor_libevent_get_base, tor_libevent_get_base_mock);
-
-  scheduler_init();
-
-  tt_ptr_op(channels_pending, OP_NE, NULL);
-  tt_ptr_op(run_sched_ev, OP_NE, NULL);
-
-  scheduler_free_all();
-
-  UNMOCK(tor_libevent_get_base);
-  mock_event_free_all();
-
-  tt_ptr_op(channels_pending, OP_EQ, NULL);
-  tt_ptr_op(run_sched_ev, OP_EQ, NULL);
-
- done:
-  return;
-}
+/******************************************************************************
+ * The actual tests!
+ *****************************************************************************/
 
 static void
-test_scheduler_loop(void *arg)
+test_scheduler_loop_vanilla(void *arg)
 {
+  (void)arg;
   channel_t *ch1 = NULL, *ch2 = NULL;
+  void (*run_func_ptr)(void);
 
-  (void)arg;
+  /* setup options so we're sure about what sched we are running */
+  MOCK(get_options, mock_get_options);
+  clear_options();
+  set_scheduler_options(SCHEDULER_VANILLA);
+  mocked_options.KISTSchedRunInterval = -1;
 
   /* Set up libevent and scheduler */
 
@@ -551,12 +644,14 @@ test_scheduler_loop(void *arg)
    * Disable scheduler_run so we can just check the state transitions
    * without having to make everything it might call work too.
    */
-  MOCK(scheduler_run, scheduler_run_noop_mock);
+  run_func_ptr = the_scheduler->run;
+  ((scheduler_t *) the_scheduler)->run = scheduler_run_noop_mock;
 
   tt_int_op(smartlist_len(channels_pending), OP_EQ, 0);
 
   /* Set up a fake channel */
   ch1 = new_fake_channel();
+  ch1->magic = TLS_CHAN_MAGIC;
   tt_assert(ch1);
 
   /* Start it off in OPENING */
@@ -574,6 +669,7 @@ test_scheduler_loop(void *arg)
 
   /* Now get another one */
   ch2 = new_fake_channel();
+  ch2->magic = TLS_CHAN_MAGIC;
   tt_assert(ch2);
   ch2->state = CHANNEL_STATE_OPENING;
   ch2->cmux = circuitmux_alloc();
@@ -615,15 +711,9 @@ test_scheduler_loop(void *arg)
 
   /*
    * Now we've got two pending channels and need to fire off
-   * scheduler_run(); first, unmock it.
+   * the scheduler run() that we kept.
    */
-
-  UNMOCK(scheduler_run);
-
-  scheduler_run();
-
-  /* Now re-mock it */
-  MOCK(scheduler_run, scheduler_run_noop_mock);
+  run_func_ptr();
 
   /*
    * Assert that they're still in the states we left and aren't still
@@ -661,15 +751,10 @@ test_scheduler_loop(void *arg)
   channel_flush_some_cells_mock_set(ch2, 48);
 
   /*
-   * And re-run the scheduler_run() loop with non-zero returns from
+   * And re-run the scheduler run() loop with non-zero returns from
    * channel_flush_some_cells() this time.
    */
-  UNMOCK(scheduler_run);
-
-  scheduler_run();
-
-  /* Now re-mock it */
-  MOCK(scheduler_run, scheduler_run_noop_mock);
+  run_func_ptr();
 
   /*
    * ch1 should have gone to SCHED_CHAN_WAITING_FOR_CELLS, with 16 flushed
@@ -704,55 +789,282 @@ test_scheduler_loop(void *arg)
  done:
   tor_free(ch1);
   tor_free(ch2);
+  cleanup_scheduler_options();
 
   UNMOCK(channel_flush_some_cells);
   UNMOCK(scheduler_compare_channels);
-  UNMOCK(scheduler_run);
   UNMOCK(tor_libevent_get_base);
+  UNMOCK(get_options);
 }
 
 static void
-test_scheduler_queue_heuristic(void *arg)
+test_scheduler_loop_kist(void *arg)
 {
-  time_t now = approx_time();
-  uint64_t qh;
+  (void) arg;
 
+#ifndef HAVE_KIST_SUPPORT
+  return;
+#endif
+
+  channel_t *ch1 = new_fake_channel(), *ch2 = new_fake_channel();
+
+  /* setup options so we're sure about what sched we are running */
+  MOCK(get_options, mock_get_options);
+  MOCK(channel_flush_some_cells, channel_flush_some_cells_mock);
+  MOCK(channel_more_to_flush, channel_more_to_flush_mock);
+  MOCK(channel_write_to_kernel, channel_write_to_kernel_mock);
+  MOCK(channel_should_write_to_kernel, channel_should_write_to_kernel_mock);
+  MOCK(update_socket_info_impl, update_socket_info_impl_mock);
+  clear_options();
+  mocked_options.KISTSchedRunInterval = 11;
+  set_scheduler_options(SCHEDULER_KIST);
+  scheduler_init();
+
+  tt_assert(ch1);
+  ch1->magic = TLS_CHAN_MAGIC;
+  ch1->state = CHANNEL_STATE_OPENING;
+  ch1->cmux = circuitmux_alloc();
+  channel_register(ch1);
+  tt_assert(ch1->registered);
+  channel_change_state_open(ch1);
+  scheduler_channel_has_waiting_cells(ch1);
+  scheduler_channel_wants_writes(ch1);
+  channel_flush_some_cells_mock_set(ch1, 5);
+
+  tt_assert(ch2);
+  ch2->magic = TLS_CHAN_MAGIC;
+  ch2->state = CHANNEL_STATE_OPENING;
+  ch2->cmux = circuitmux_alloc();
+  channel_register(ch2);
+  tt_assert(ch2->registered);
+  channel_change_state_open(ch2);
+  scheduler_channel_has_waiting_cells(ch2);
+  scheduler_channel_wants_writes(ch2);
+  channel_flush_some_cells_mock_set(ch2, 5);
+
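+  /* Both channels are now registered, open, flagged as having waiting cells
+   * and wanting writes, and have 5 mock cells each queued; the repeated
+   * run() calls below drive the KIST scheduler over them. */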
+  the_scheduler->run();
+
+  scheduler_channel_has_waiting_cells(ch1);
+  channel_flush_some_cells_mock_set(ch1, 5);
+
+  the_scheduler->run();
+
+  scheduler_channel_has_waiting_cells(ch1);
+  channel_flush_some_cells_mock_set(ch1, 5);
+  scheduler_channel_has_waiting_cells(ch2);
+  channel_flush_some_cells_mock_set(ch2, 5);
+
+  the_scheduler->run();
+
+  channel_flush_some_cells_mock_free_all();
+  tt_int_op(1,==,1);
+
+ done:
+  /* Prep the channel so the free() function doesn't explode. */
+  ch1->state = ch2->state = CHANNEL_STATE_CLOSED;
+  ch1->registered = ch2->registered = 0;
+  channel_free(ch1);
+  channel_free(ch2);
+  UNMOCK(update_socket_info_impl);
+  UNMOCK(channel_should_write_to_kernel);
+  UNMOCK(channel_write_to_kernel);
+  UNMOCK(channel_more_to_flush);
+  UNMOCK(channel_flush_some_cells);
+  UNMOCK(get_options);
+  scheduler_free_all();
+  return;
+}
+
+static void
+test_scheduler_channel_states(void *arg)
+{
   (void)arg;
+  perform_channel_state_tests(-1, SCHEDULER_VANILLA);
+  perform_channel_state_tests(11, SCHEDULER_KIST_LITE);
+#ifdef HAVE_KIST_SUPPORT
+  perform_channel_state_tests(11, SCHEDULER_KIST);
+#endif
+}
 
-  queue_heuristic = 0;
-  queue_heuristic_timestamp = 0;
+static void
+test_scheduler_initfree(void *arg)
+{
+  (void)arg;
 
-  /* Not yet inited case */
-  scheduler_update_queue_heuristic(now - 180);
-  tt_u64_op(queue_heuristic, OP_EQ, 0);
-  tt_int_op(queue_heuristic_timestamp, OP_EQ, now - 180);
+  tt_ptr_op(channels_pending, ==, NULL);
+  tt_ptr_op(run_sched_ev, ==, NULL);
 
-  queue_heuristic = 1000000000L;
-  queue_heuristic_timestamp = now - 120;
+  mock_event_init();
+  MOCK(tor_libevent_get_base, tor_libevent_get_base_mock);
+  MOCK(get_options, mock_get_options);
+  set_scheduler_options(SCHEDULER_KIST);
+  set_scheduler_options(SCHEDULER_KIST_LITE);
+  set_scheduler_options(SCHEDULER_VANILLA);
 
-  scheduler_update_queue_heuristic(now - 119);
-  tt_u64_op(queue_heuristic, OP_EQ, 500000000L);
-  tt_int_op(queue_heuristic_timestamp, OP_EQ, now - 119);
+  scheduler_init();
 
-  scheduler_update_queue_heuristic(now - 116);
-  tt_u64_op(queue_heuristic, OP_EQ, 62500000L);
-  tt_int_op(queue_heuristic_timestamp, OP_EQ, now - 116);
+  tt_ptr_op(channels_pending, !=, NULL);
+  tt_ptr_op(run_sched_ev, !=, NULL);
+  /* We have specified nothing beyond the default scheduler ordering in the
+   * torrc and there's no consensus, so the KIST scheduler with its default
+   * run interval is what should be in use */
+  tt_ptr_op(the_scheduler, ==, get_kist_scheduler());
+  tt_int_op(sched_run_interval, ==, 10);
 
-  qh = scheduler_get_queue_heuristic();
-  tt_u64_op(qh, OP_EQ, 0);
+  scheduler_free_all();
+
+  UNMOCK(tor_libevent_get_base);
+  mock_event_free_all();
+
+  tt_ptr_op(channels_pending, ==, NULL);
+  tt_ptr_op(run_sched_ev, ==, NULL);
 
  done:
+  UNMOCK(get_options);
+  cleanup_scheduler_options();
+  return;
+}
+
+static void
+test_scheduler_can_use_kist(void *arg)
+{
+  (void)arg;
+
+  int res_should, res_freq;
+  MOCK(get_options, mock_get_options);
+
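+  /* The cases below exercise KISTSchedRunInterval as these tests understand
+   * it: -1 force-disables KIST, a positive value force-enables it (when
+   * built with HAVE_KIST_SUPPORT) with that run interval, and 0 defers to
+   * the consensus parameter, falling back to a default interval of 10 when
+   * no consensus value is available. */
+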
+  /* Test force disabling of KIST */
+  clear_options();
+  mocked_options.KISTSchedRunInterval = -1;
+  res_should = scheduler_can_use_kist();
+  res_freq = kist_scheduler_run_interval(NULL);
+  tt_int_op(res_should, ==, 0);
+  tt_int_op(res_freq, ==, -1);
+
+  /* Test force enabling of KIST */
+  clear_options();
+  mocked_options.KISTSchedRunInterval = 1234;
+  res_should = scheduler_can_use_kist();
+  res_freq = kist_scheduler_run_interval(NULL);
+#ifdef HAVE_KIST_SUPPORT
+  tt_int_op(res_should, ==, 1);
+#else /* HAVE_KIST_SUPPORT */
+  tt_int_op(res_should, ==, 0);
+#endif /* HAVE_KIST_SUPPORT */
+  tt_int_op(res_freq, ==, 1234);
+
+  /* Test defer to consensus, but no consensus available */
+  clear_options();
+  mocked_options.KISTSchedRunInterval = 0;
+  res_should = scheduler_can_use_kist();
+  res_freq = kist_scheduler_run_interval(NULL);
+#ifdef HAVE_KIST_SUPPORT
+  tt_int_op(res_should, ==, 1);
+#else /* HAVE_KIST_SUPPORT */
+  tt_int_op(res_should, ==, 0);
+#endif /* HAVE_KIST_SUPPORT */
+  tt_int_op(res_freq, ==, 10);
+
+  /* Test defer to consensus, and kist consensus available */
+  MOCK(networkstatus_get_param, mock_kist_networkstatus_get_param);
+  clear_options();
+  mocked_options.KISTSchedRunInterval = 0;
+  res_should = scheduler_can_use_kist();
+  res_freq = kist_scheduler_run_interval(NULL);
+#ifdef HAVE_KIST_SUPPORT
+  tt_int_op(res_should, ==, 1);
+#else /* HAVE_KIST_SUPPORT */
+  tt_int_op(res_should, ==, 0);
+#endif /* HAVE_KIST_SUPPORT */
+  tt_int_op(res_freq, ==, 12);
+  UNMOCK(networkstatus_get_param);
+
+  /* Test defer to consensus, and vanilla consensus available */
+  MOCK(networkstatus_get_param, mock_vanilla_networkstatus_get_param);
+  clear_options();
+  mocked_options.KISTSchedRunInterval = 0;
+  res_should = scheduler_can_use_kist();
+  res_freq = kist_scheduler_run_interval(NULL);
+  tt_int_op(res_should, ==, 0);
+  tt_int_op(res_freq, ==, -1);
+  UNMOCK(networkstatus_get_param);
+
+ done:
+  UNMOCK(get_options);
+  return;
+}
+
+static void
+test_scheduler_ns_changed(void *arg)
+{
+  (void) arg;
+
+  /*
+   * Currently no scheduler implementations use the old/new consensuses passed
+   * in scheduler_notify_networkstatus_changed, so it is okay to pass NULL.
+   *
+   * "But then what does test actually exercise???" It tests that
+   * scheduler_notify_networkstatus_changed fetches the correct value from the
+   * consensus, and then switches the scheduler if necessasry.
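+   *
+   * Concretely, as exercised by the mocked consensus params below: a
+   * KISTSchedRunInterval of -1 should land us on the vanilla scheduler,
+   * while a positive value should select KIST when the build has
+   * HAVE_KIST_SUPPORT (and fall back to vanilla otherwise).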
+   */
+
+  MOCK(get_options, mock_get_options);
+  clear_options();
+  set_scheduler_options(SCHEDULER_KIST);
+  set_scheduler_options(SCHEDULER_VANILLA);
+
+  tt_ptr_op(the_scheduler, ==, NULL);
+
+  /* Change from vanilla to kist via consensus */
+  the_scheduler = get_vanilla_scheduler();
+  MOCK(networkstatus_get_param, mock_kist_networkstatus_get_param);
+  scheduler_notify_networkstatus_changed(NULL, NULL);
+  UNMOCK(networkstatus_get_param);
+#ifdef HAVE_KIST_SUPPORT
+  tt_ptr_op(the_scheduler, ==, get_kist_scheduler());
+#else
+  tt_ptr_op(the_scheduler, ==, get_vanilla_scheduler());
+#endif
+
+  /* Change from kist to vanilla via consensus */
+  the_scheduler = get_kist_scheduler();
+  MOCK(networkstatus_get_param, mock_vanilla_networkstatus_get_param);
+  scheduler_notify_networkstatus_changed(NULL, NULL);
+  UNMOCK(networkstatus_get_param);
+  tt_ptr_op(the_scheduler, ==, get_vanilla_scheduler());
+
+  /* Doesn't change when using KIST */
+  the_scheduler = get_kist_scheduler();
+  MOCK(networkstatus_get_param, mock_kist_networkstatus_get_param);
+  scheduler_notify_networkstatus_changed(NULL, NULL);
+  UNMOCK(networkstatus_get_param);
+#ifdef HAVE_KIST_SUPPORT
+  tt_ptr_op(the_scheduler, ==, get_kist_scheduler());
+#else
+  tt_ptr_op(the_scheduler, ==, get_vanilla_scheduler());
+#endif
+
+  /* Doesn't change when using vanilla */
+  the_scheduler = get_vanilla_scheduler();
+  MOCK(networkstatus_get_param, mock_vanilla_networkstatus_get_param);
+  scheduler_notify_networkstatus_changed(NULL, NULL);
+  UNMOCK(networkstatus_get_param);
+  tt_ptr_op(the_scheduler, ==, get_vanilla_scheduler());
+
+ done:
+  UNMOCK(get_options);
+  cleanup_scheduler_options();
   return;
 }
 
 struct testcase_t scheduler_tests[] = {
-  { "channel_states", test_scheduler_channel_states, TT_FORK, NULL, NULL },
   { "compare_channels", test_scheduler_compare_channels,
     TT_FORK, NULL, NULL },
+  { "channel_states", test_scheduler_channel_states, TT_FORK, NULL, NULL },
   { "initfree", test_scheduler_initfree, TT_FORK, NULL, NULL },
-  { "loop", test_scheduler_loop, TT_FORK, NULL, NULL },
-  { "queue_heuristic", test_scheduler_queue_heuristic,
-    TT_FORK, NULL, NULL },
+  { "loop_vanilla", test_scheduler_loop_vanilla, TT_FORK, NULL, NULL },
+  { "loop_kist", test_scheduler_loop_kist, TT_FORK, NULL, NULL },
+  { "ns_changed", test_scheduler_ns_changed, TT_FORK, NULL, NULL},
+  { "should_use_kist", test_scheduler_can_use_kist, TT_FORK, NULL, NULL },
   END_OF_TESTCASES
 };