12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711 |
- /* * Copyright (c) 2013-2015, The Tor Project, Inc. */
- /* See LICENSE for licensing information */
- /**
- * \file scheduler.c
- * \brief Relay scheduling system
- **/
- #include "or.h"
- #define TOR_CHANNEL_INTERNAL_ /* For channel_flush_some_cells() */
- #include "channel.h"
- #include "compat_libevent.h"
- #define SCHEDULER_PRIVATE_
- #include "scheduler.h"
- #ifdef HAVE_EVENT2_EVENT_H
- #include <event2/event.h>
- #else
- #include <event.h>
- #endif
/*
 * Scheduler queue watermarks.  scheduler_run() only starts a pass while the
 * queue heuristic is below the low watermark, and keeps flushing channels
 * for as long as the heuristic has not risen above the high watermark.
 */

static uint32_t sched_q_low_water = 16384;
static uint32_t sched_q_high_water = 32768;

/*
 * Upper bound on the number of cells requested from a single call to
 * channel_flush_some_cells().  A small value costs more calls; a large one
 * risks overshooting sched_q_high_water.
 */

static uint32_t sched_max_flush_cells = 16;
/*
 * Write scheduling works by keeping track of which channels can
 * accept cells, and have cells to write.  From the scheduler's perspective,
 * a channel can be in four possible states:
 *
 * 1.) Not open for writes, no cells to send
 *     - Not much to do here, and the channel will have scheduler_state ==
 *       SCHED_CHAN_IDLE
 *     - Transitions from:
 *       - Open for writes/has cells by simultaneously draining all circuit
 *         queues and filling the output buffer.
 *     - Transitions to:
 *       - Not open for writes/has cells by arrival of cells on an attached
 *         circuit (this would be driven from append_cell_to_circuit_queue())
 *       - Open for writes/no cells by a channel type specific path;
 *         driven from connection_or_flushed_some() for channel_tls_t.
 *
 * 2.) Open for writes, no cells to send
 *     - Not much here either; this will be the state an idle but open channel
 *       can be expected to settle in.  It will have scheduler_state ==
 *       SCHED_CHAN_WAITING_FOR_CELLS
 *     - Transitions from:
 *       - Not open for writes/no cells by flushing some of the output
 *         buffer.
 *       - Open for writes/has cells by the scheduler moving cells from
 *         circuit queues to channel output queue, but not having enough
 *         to fill the output queue.
 *     - Transitions to:
 *       - Open for writes/has cells by arrival of new cells on an attached
 *         circuit, in append_cell_to_circuit_queue()
 *
 * 3.) Not open for writes, cells to send
 *     - This is the state of a busy circuit limited by output bandwidth;
 *       cells have piled up in the circuit queues waiting to be relayed.
 *       The channel will have scheduler_state == SCHED_CHAN_WAITING_TO_WRITE.
 *     - Transitions from:
 *       - Not open for writes/no cells by arrival of cells on an attached
 *         circuit
 *       - Open for writes/has cells by filling an output buffer without
 *         draining all cells from attached circuits
 *     - Transitions to:
 *       - Open for writes/has cells by draining some of the output buffer
 *         via the connection_or_flushed_some() path (for channel_tls_t).
 *
 * 4.) Open for writes, cells to send
 *     - This connection is ready to relay some cells and waiting for
 *       the scheduler to choose it.  The channel will have scheduler_state ==
 *       SCHED_CHAN_PENDING.
 *     - Transitions from:
 *       - Not open for writes/has cells by the connection_or_flushed_some()
 *         path
 *       - Open for writes/no cells by the append_cell_to_circuit_queue()
 *         path
 *     - Transitions to:
 *       - Not open for writes/no cells by draining all circuit queues and
 *         simultaneously filling the output buffer.
 *       - Not open for writes/has cells by writing enough cells to fill the
 *         output buffer
 *       - Open for writes/no cells by draining all attached circuit queues
 *         without also filling the output buffer
 *
 * Other event-driven parts of the code move channels between these scheduling
 * states by calling scheduler functions; the scheduler only runs on open-for-
 * writes/has-cells channels and is the only path for those to transition to
 * other states.  The scheduler_run() function gives us the opportunity to do
 * scheduling work, and is called from other scheduler functions whenever a
 * state transition occurs, and periodically from the main event loop.
 */
- /* Scheduler global data structures */
- /*
- * We keep a list of channels that are pending - i.e, have cells to write
- * and can accept them to send. The enum scheduler_state in channel_t
- * is reserved for our use.
- */
- /* Pqueue of channels that can write and have cells (pending work) */
- STATIC smartlist_t *channels_pending = NULL;
- /*
- * This event runs the scheduler from its callback, and is manually
- * activated whenever a channel enters open for writes/cells to send.
- */
- STATIC struct event *run_sched_ev = NULL;
- /*
- * Queue heuristic; this is not the queue size, but an 'effective queuesize'
- * that ages out contributions from stalled channels.
- */
- STATIC uint64_t queue_heuristic = 0;
- /*
- * Timestamp for last queue heuristic update
- */
- STATIC time_t queue_heuristic_timestamp = 0;
- /* Scheduler static function declarations */
- static void scheduler_evt_callback(evutil_socket_t fd,
- short events, void *arg);
- static int scheduler_more_work(void);
- static void scheduler_retrigger(void);
- #if 0
- static void scheduler_trigger(void);
- #endif
- /* Scheduler function implementations */
- /** Free everything and shut down the scheduling system */
- void
- scheduler_free_all(void)
- {
- log_debug(LD_SCHED, "Shutting down scheduler");
- if (run_sched_ev) {
- if (event_del(run_sched_ev) < 0) {
- log_warn(LD_BUG, "Problem deleting run_sched_ev");
- }
- tor_event_free(run_sched_ev);
- run_sched_ev = NULL;
- }
- if (channels_pending) {
- smartlist_free(channels_pending);
- channels_pending = NULL;
- }
- }
- /**
- * Comparison function to use when sorting pending channels
- */
- MOCK_IMPL(STATIC int,
- scheduler_compare_channels, (const void *c1_v, const void *c2_v))
- {
- channel_t *c1 = NULL, *c2 = NULL;
- /* These are a workaround for -Wbad-function-cast throwing a fit */
- const circuitmux_policy_t *p1, *p2;
- uintptr_t p1_i, p2_i;
- tor_assert(c1_v);
- tor_assert(c2_v);
- c1 = (channel_t *)(c1_v);
- c2 = (channel_t *)(c2_v);
- tor_assert(c1);
- tor_assert(c2);
- if (c1 != c2) {
- if (circuitmux_get_policy(c1->cmux) ==
- circuitmux_get_policy(c2->cmux)) {
- /* Same cmux policy, so use the mux comparison */
- return circuitmux_compare_muxes(c1->cmux, c2->cmux);
- } else {
- /*
- * Different policies; not important to get this edge case perfect
- * because the current code never actually gives different channels
- * different cmux policies anyway. Just use this arbitrary but
- * definite choice.
- */
- p1 = circuitmux_get_policy(c1->cmux);
- p2 = circuitmux_get_policy(c2->cmux);
- p1_i = (uintptr_t)p1;
- p2_i = (uintptr_t)p2;
- return (p1_i < p2_i) ? -1 : 1;
- }
- } else {
- /* c1 == c2, so always equal */
- return 0;
- }
- }
- /*
- * Scheduler event callback; this should get triggered once per event loop
- * if any scheduling work was created during the event loop.
- */
- static void
- scheduler_evt_callback(evutil_socket_t fd, short events, void *arg)
- {
- (void)fd;
- (void)events;
- (void)arg;
- log_debug(LD_SCHED, "Scheduler event callback called");
- tor_assert(run_sched_ev);
- /* Run the scheduler */
- scheduler_run();
- /* Do we have more work to do? */
- if (scheduler_more_work()) scheduler_retrigger();
- }
- /** Mark a channel as no longer ready to accept writes */
- MOCK_IMPL(void,
- scheduler_channel_doesnt_want_writes,(channel_t *chan))
- {
- tor_assert(chan);
- tor_assert(channels_pending);
- /* If it's already in pending, we can put it in waiting_to_write */
- if (chan->scheduler_state == SCHED_CHAN_PENDING) {
- /*
- * It's in channels_pending, so it shouldn't be in any of
- * the other lists. It can't write any more, so it goes to
- * channels_waiting_to_write.
- */
- smartlist_pqueue_remove(channels_pending,
- scheduler_compare_channels,
- STRUCT_OFFSET(channel_t, sched_heap_idx),
- chan);
- chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
- log_debug(LD_SCHED,
- "Channel " U64_FORMAT " at %p went from pending "
- "to waiting_to_write",
- U64_PRINTF_ARG(chan->global_identifier), chan);
- } else {
- /*
- * It's not in pending, so it can't become waiting_to_write; it's
- * either not in any of the lists (nothing to do) or it's already in
- * waiting_for_cells (remove it, can't write any more).
- */
- if (chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS) {
- chan->scheduler_state = SCHED_CHAN_IDLE;
- log_debug(LD_SCHED,
- "Channel " U64_FORMAT " at %p left waiting_for_cells",
- U64_PRINTF_ARG(chan->global_identifier), chan);
- }
- }
- }
- /** Mark a channel as having waiting cells */
- MOCK_IMPL(void,
- scheduler_channel_has_waiting_cells,(channel_t *chan))
- {
- int became_pending = 0;
- tor_assert(chan);
- tor_assert(channels_pending);
- /* First, check if this one also writeable */
- if (chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS) {
- /*
- * It's in channels_waiting_for_cells, so it shouldn't be in any of
- * the other lists. It has waiting cells now, so it goes to
- * channels_pending.
- */
- chan->scheduler_state = SCHED_CHAN_PENDING;
- smartlist_pqueue_add(channels_pending,
- scheduler_compare_channels,
- STRUCT_OFFSET(channel_t, sched_heap_idx),
- chan);
- log_debug(LD_SCHED,
- "Channel " U64_FORMAT " at %p went from waiting_for_cells "
- "to pending",
- U64_PRINTF_ARG(chan->global_identifier), chan);
- became_pending = 1;
- } else {
- /*
- * It's not in waiting_for_cells, so it can't become pending; it's
- * either not in any of the lists (we add it to waiting_to_write)
- * or it's already in waiting_to_write or pending (we do nothing)
- */
- if (!(chan->scheduler_state == SCHED_CHAN_WAITING_TO_WRITE ||
- chan->scheduler_state == SCHED_CHAN_PENDING)) {
- chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
- log_debug(LD_SCHED,
- "Channel " U64_FORMAT " at %p entered waiting_to_write",
- U64_PRINTF_ARG(chan->global_identifier), chan);
- }
- }
- /*
- * If we made a channel pending, we potentially have scheduling work
- * to do.
- */
- if (became_pending) scheduler_retrigger();
- }
- /** Set up the scheduling system */
- void
- scheduler_init(void)
- {
- log_debug(LD_SCHED, "Initting scheduler");
- tor_assert(!run_sched_ev);
- run_sched_ev = tor_event_new(tor_libevent_get_base(), -1,
- 0, scheduler_evt_callback, NULL);
- channels_pending = smartlist_new();
- queue_heuristic = 0;
- queue_heuristic_timestamp = approx_time();
- }
- /** Check if there's more scheduling work */
- static int
- scheduler_more_work(void)
- {
- tor_assert(channels_pending);
- return ((scheduler_get_queue_heuristic() < sched_q_low_water) &&
- ((smartlist_len(channels_pending) > 0))) ? 1 : 0;
- }
- /** Retrigger the scheduler in a way safe to use from the callback */
- static void
- scheduler_retrigger(void)
- {
- tor_assert(run_sched_ev);
- event_active(run_sched_ev, EV_TIMEOUT, 1);
- }
- /** Notify the scheduler of a channel being closed */
- MOCK_IMPL(void,
- scheduler_release_channel,(channel_t *chan))
- {
- tor_assert(chan);
- tor_assert(channels_pending);
- if (chan->scheduler_state == SCHED_CHAN_PENDING) {
- smartlist_pqueue_remove(channels_pending,
- scheduler_compare_channels,
- STRUCT_OFFSET(channel_t, sched_heap_idx),
- chan);
- }
- chan->scheduler_state = SCHED_CHAN_IDLE;
- }
- /** Run the scheduling algorithm if necessary */
- MOCK_IMPL(void,
- scheduler_run, (void))
- {
- int n_cells, n_chans_before, n_chans_after;
- uint64_t q_len_before, q_heur_before, q_len_after, q_heur_after;
- ssize_t flushed, flushed_this_time;
- smartlist_t *to_readd = NULL;
- channel_t *chan = NULL;
- log_debug(LD_SCHED, "We have a chance to run the scheduler");
- if (scheduler_get_queue_heuristic() < sched_q_low_water) {
- n_chans_before = smartlist_len(channels_pending);
- q_len_before = channel_get_global_queue_estimate();
- q_heur_before = scheduler_get_queue_heuristic();
- while (scheduler_get_queue_heuristic() <= sched_q_high_water &&
- smartlist_len(channels_pending) > 0) {
- /* Pop off a channel */
- chan = smartlist_pqueue_pop(channels_pending,
- scheduler_compare_channels,
- STRUCT_OFFSET(channel_t, sched_heap_idx));
- tor_assert(chan);
- /* Figure out how many cells we can write */
- n_cells = channel_num_cells_writeable(chan);
- if (n_cells > 0) {
- log_debug(LD_SCHED,
- "Scheduler saw pending channel " U64_FORMAT " at %p with "
- "%d cells writeable",
- U64_PRINTF_ARG(chan->global_identifier), chan, n_cells);
- flushed = 0;
- while (flushed < n_cells &&
- scheduler_get_queue_heuristic() <= sched_q_high_water) {
- flushed_this_time =
- channel_flush_some_cells(chan,
- MIN(sched_max_flush_cells,
- (size_t) n_cells - flushed));
- if (flushed_this_time <= 0) break;
- flushed += flushed_this_time;
- }
- if (flushed < n_cells) {
- /* We ran out of cells to flush */
- chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
- log_debug(LD_SCHED,
- "Channel " U64_FORMAT " at %p "
- "entered waiting_for_cells from pending",
- U64_PRINTF_ARG(chan->global_identifier),
- chan);
- } else {
- /* The channel may still have some cells */
- if (channel_more_to_flush(chan)) {
- /* The channel goes to either pending or waiting_to_write */
- if (channel_num_cells_writeable(chan) > 0) {
- /* Add it back to pending later */
- if (!to_readd) to_readd = smartlist_new();
- smartlist_add(to_readd, chan);
- log_debug(LD_SCHED,
- "Channel " U64_FORMAT " at %p "
- "is still pending",
- U64_PRINTF_ARG(chan->global_identifier),
- chan);
- } else {
- /* It's waiting to be able to write more */
- chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
- log_debug(LD_SCHED,
- "Channel " U64_FORMAT " at %p "
- "entered waiting_to_write from pending",
- U64_PRINTF_ARG(chan->global_identifier),
- chan);
- }
- } else {
- /* No cells left; it can go to idle or waiting_for_cells */
- if (channel_num_cells_writeable(chan) > 0) {
- /*
- * It can still accept writes, so it goes to
- * waiting_for_cells
- */
- chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
- log_debug(LD_SCHED,
- "Channel " U64_FORMAT " at %p "
- "entered waiting_for_cells from pending",
- U64_PRINTF_ARG(chan->global_identifier),
- chan);
- } else {
- /*
- * We exactly filled up the output queue with all available
- * cells; go to idle.
- */
- chan->scheduler_state = SCHED_CHAN_IDLE;
- log_debug(LD_SCHED,
- "Channel " U64_FORMAT " at %p "
- "become idle from pending",
- U64_PRINTF_ARG(chan->global_identifier),
- chan);
- }
- }
- }
- log_debug(LD_SCHED,
- "Scheduler flushed %d cells onto pending channel "
- U64_FORMAT " at %p",
- (int)flushed, U64_PRINTF_ARG(chan->global_identifier),
- chan);
- } else {
- log_info(LD_SCHED,
- "Scheduler saw pending channel " U64_FORMAT " at %p with "
- "no cells writeable",
- U64_PRINTF_ARG(chan->global_identifier), chan);
- /* Put it back to WAITING_TO_WRITE */
- chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
- }
- }
- /* Readd any channels we need to */
- if (to_readd) {
- SMARTLIST_FOREACH_BEGIN(to_readd, channel_t *, chan) {
- chan->scheduler_state = SCHED_CHAN_PENDING;
- smartlist_pqueue_add(channels_pending,
- scheduler_compare_channels,
- STRUCT_OFFSET(channel_t, sched_heap_idx),
- chan);
- } SMARTLIST_FOREACH_END(chan);
- smartlist_free(to_readd);
- }
- n_chans_after = smartlist_len(channels_pending);
- q_len_after = channel_get_global_queue_estimate();
- q_heur_after = scheduler_get_queue_heuristic();
- log_debug(LD_SCHED,
- "Scheduler handled %d of %d pending channels, queue size from "
- U64_FORMAT " to " U64_FORMAT ", queue heuristic from "
- U64_FORMAT " to " U64_FORMAT,
- n_chans_before - n_chans_after, n_chans_before,
- U64_PRINTF_ARG(q_len_before), U64_PRINTF_ARG(q_len_after),
- U64_PRINTF_ARG(q_heur_before), U64_PRINTF_ARG(q_heur_after));
- }
- }
/**
 * Trigger the scheduling event so we run the scheduler later.
 *
 * This function is currently compiled out.
 */

#if 0
static void
scheduler_trigger(void)
{
  log_debug(LD_SCHED, "Triggering scheduler event");

  tor_assert(run_sched_ev);

  /*
   * event_add() takes an event and a timeout pointer, so it cannot be handed
   * an event mask and a call count.  Activate the event directly instead,
   * which is also how scheduler_retrigger() does it.
   */
  event_active(run_sched_ev, EV_TIMEOUT, 1);
}
#endif
- /** Mark a channel as ready to accept writes */
- void
- scheduler_channel_wants_writes(channel_t *chan)
- {
- int became_pending = 0;
- tor_assert(chan);
- tor_assert(channels_pending);
- /* If it's already in waiting_to_write, we can put it in pending */
- if (chan->scheduler_state == SCHED_CHAN_WAITING_TO_WRITE) {
- /*
- * It can write now, so it goes to channels_pending.
- */
- smartlist_pqueue_add(channels_pending,
- scheduler_compare_channels,
- STRUCT_OFFSET(channel_t, sched_heap_idx),
- chan);
- chan->scheduler_state = SCHED_CHAN_PENDING;
- log_debug(LD_SCHED,
- "Channel " U64_FORMAT " at %p went from waiting_to_write "
- "to pending",
- U64_PRINTF_ARG(chan->global_identifier), chan);
- became_pending = 1;
- } else {
- /*
- * It's not in SCHED_CHAN_WAITING_TO_WRITE, so it can't become pending;
- * it's either idle and goes to WAITING_FOR_CELLS, or it's a no-op.
- */
- if (!(chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS ||
- chan->scheduler_state == SCHED_CHAN_PENDING)) {
- chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
- log_debug(LD_SCHED,
- "Channel " U64_FORMAT " at %p entered waiting_for_cells",
- U64_PRINTF_ARG(chan->global_identifier), chan);
- }
- }
- /*
- * If we made a channel pending, we potentially have scheduling work
- * to do.
- */
- if (became_pending) scheduler_retrigger();
- }
- /**
- * Notify the scheduler that a channel's position in the pqueue may have
- * changed
- */
- void
- scheduler_touch_channel(channel_t *chan)
- {
- tor_assert(chan);
- if (chan->scheduler_state == SCHED_CHAN_PENDING) {
- /* Remove and re-add it */
- smartlist_pqueue_remove(channels_pending,
- scheduler_compare_channels,
- STRUCT_OFFSET(channel_t, sched_heap_idx),
- chan);
- smartlist_pqueue_add(channels_pending,
- scheduler_compare_channels,
- STRUCT_OFFSET(channel_t, sched_heap_idx),
- chan);
- }
- /* else no-op, since it isn't in the queue */
- }
- /**
- * Notify the scheduler of a queue size adjustment, to recalculate the
- * queue heuristic.
- */
- void
- scheduler_adjust_queue_size(channel_t *chan, int dir, uint64_t adj)
- {
- time_t now = approx_time();
- log_debug(LD_SCHED,
- "Queue size adjustment by %s" U64_FORMAT " for channel "
- U64_FORMAT,
- (dir >= 0) ? "+" : "-",
- U64_PRINTF_ARG(adj),
- U64_PRINTF_ARG(chan->global_identifier));
- /* Get the queue heuristic up to date */
- scheduler_update_queue_heuristic(now);
- /* Adjust as appropriate */
- if (dir >= 0) {
- /* Increasing it */
- queue_heuristic += adj;
- } else {
- /* Decreasing it */
- if (queue_heuristic > adj) queue_heuristic -= adj;
- else queue_heuristic = 0;
- }
- log_debug(LD_SCHED,
- "Queue heuristic is now " U64_FORMAT,
- U64_PRINTF_ARG(queue_heuristic));
- }
- /**
- * Query the current value of the queue heuristic
- */
- STATIC uint64_t
- scheduler_get_queue_heuristic(void)
- {
- time_t now = approx_time();
- scheduler_update_queue_heuristic(now);
- return queue_heuristic;
- }
- /**
- * Adjust the queue heuristic value to the present time
- */
- STATIC void
- scheduler_update_queue_heuristic(time_t now)
- {
- time_t diff;
- if (queue_heuristic_timestamp == 0) {
- /*
- * Nothing we can sensibly do; must not have been initted properly.
- * Oh well.
- */
- queue_heuristic_timestamp = now;
- } else if (queue_heuristic_timestamp < now) {
- diff = now - queue_heuristic_timestamp;
- /*
- * This is a simple exponential age-out; the other proposed alternative
- * was a linear age-out using the bandwidth history in rephist.c; I'm
- * going with this out of concern that if an adversary can jam the
- * scheduler long enough, it would cause the bandwidth to drop to
- * zero and render the aging mechanism ineffective thereafter.
- */
- if (0 <= diff && diff < 64) queue_heuristic >>= diff;
- else queue_heuristic = 0;
- queue_heuristic_timestamp = now;
- log_debug(LD_SCHED,
- "Queue heuristic is now " U64_FORMAT,
- U64_PRINTF_ARG(queue_heuristic));
- }
- /* else no update needed, or time went backward */
- }
- /**
- * Set scheduler watermarks and flush size
- */
- void
- scheduler_set_watermarks(uint32_t lo, uint32_t hi, uint32_t max_flush)
- {
- /* Sanity assertions - caller should ensure these are true */
- tor_assert(lo > 0);
- tor_assert(hi > lo);
- tor_assert(max_flush > 0);
- sched_q_low_water = lo;
- sched_q_high_water = hi;
- sched_max_flush_cells = max_flush;
- }
|