/* * Copyright (c) 2013, The Tor Project, Inc. */
/* See LICENSE for licensing information */

/**
 * \file scheduler.c
 * \brief Relay scheduling system
 **/

#include "or.h"

#define TOR_CHANNEL_INTERNAL_ /* For channel_flush_some_cells() */
#include "channel.h"

#include "compat_libevent.h"
#include "scheduler.h"

#ifdef HAVE_EVENT2_EVENT_H
#include <event2/event.h>
#else
#include <event.h>
#endif

#define SCHED_Q_LOW_WATER 16384
#define SCHED_Q_HIGH_WATER (2 * SCHED_Q_LOW_WATER)

/*
 * Write scheduling works by keeping track of which channels can
 * accept cells, and have cells to write.  From the scheduler's perspective,
 * a channel can be in four possible states:
 *
 * 1.) Not open for writes, no cells to send
 *     - Not much to do here, and the channel will have scheduler_state ==
 *       SCHED_CHAN_IDLE
 *     - Transitions from:
 *       - Open for writes/has cells by simultaneously draining all circuit
 *         queues and filling the output buffer.
 *     - Transitions to:
 *       - Not open for writes/has cells by arrival of cells on an attached
 *         circuit (this would be driven from append_cell_to_circuit_queue())
 *       - Open for writes/no cells by a channel type specific path;
 *         driven from connection_or_flushed_some() for channel_tls_t.
 *
 * 2.) Open for writes, no cells to send
 *     - Not much here either; this will be the state an idle but open channel
 *       can be expected to settle in.  It will have scheduler_state ==
 *       SCHED_CHAN_WAITING_FOR_CELLS
 *     - Transitions from:
 *       - Not open for writes/no cells by flushing some of the output
 *         buffer.
 *       - Open for writes/has cells by the scheduler moving cells from
 *         circuit queues to channel output queue, but not having enough
 *         to fill the output queue.
 *     - Transitions to:
 *       - Open for writes/has cells by arrival of new cells on an attached
 *         circuit, in append_cell_to_circuit_queue()
 *
 * 3.) Not open for writes, cells to send
 *     - This is the state of a busy circuit limited by output bandwidth;
 *       cells have piled up in the circuit queues waiting to be relayed.
 *       The channel will have scheduler_state == SCHED_CHAN_WAITING_TO_WRITE.
 *     - Transitions from:
 *       - Not open for writes/no cells by arrival of cells on an attached
 *         circuit
 *       - Open for writes/has cells by filling an output buffer without
 *         draining all cells from attached circuits
 *     - Transitions to:
 *       - Open for writes/has cells by draining some of the output buffer
 *         via the connection_or_flushed_some() path (for channel_tls_t).
 *
 * 4.) Open for writes, cells to send
 *     - This connection is ready to relay some cells and waiting for
 *       the scheduler to choose it.  The channel will have scheduler_state ==
 *       SCHED_CHAN_PENDING.
 *     - Transitions from:
 *       - Not open for writes/has cells by the connection_or_flushed_some()
 *         path
 *       - Open for writes/no cells by the append_cell_to_circuit_queue()
 *         path
 *     - Transitions to:
 *       - Not open for writes/no cells by draining all circuit queues and
 *         simultaneously filling the output buffer.
 *       - Not open for writes/has cells by writing enough cells to fill the
 *         output buffer
 *       - Open for writes/no cells by draining all attached circuit queues
 *         without also filling the output buffer
 *
 * Other event-driven parts of the code move channels between these scheduling
 * states by calling scheduler functions; the scheduler only runs on open-for-
 * writes/has-cells channels and is the only path for those to transition to
 * other states.  The scheduler_run() function gives us the opportunity to do
 * scheduling work, and is called from other scheduler functions whenever a
 * state transition occurs, and periodically from the main event loop.
 */
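
/*
 * For quick reference, a sketch of the four scheduler_state values the
 * comment above walks through.  This is illustrative only; the
 * authoritative declaration lives with channel_t in channel.h.
 */
#if 0
enum example_scheduler_state {
  SCHED_CHAN_IDLE,              /* Case 1: can't write, no cells to send */
  SCHED_CHAN_WAITING_FOR_CELLS, /* Case 2: can write, no cells to send */
  SCHED_CHAN_WAITING_TO_WRITE,  /* Case 3: can't write, has cells to send */
  SCHED_CHAN_PENDING            /* Case 4: can write, has cells to send */
};
#endif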

/* Scheduler global data structures */

/*
 * We keep a list of channels that are pending - i.e., have cells to write
 * and are currently able to write them.  The enum scheduler_state in
 * channel_t is reserved for our use.
 */

/* List of channels that can write and have cells (pending work) */
static smartlist_t *channels_pending = NULL;

/*
 * This event runs the scheduler from its callback, and is manually
 * activated whenever a channel becomes open for writes with cells to send.
 */
static struct event *run_sched_ev = NULL;

/*
 * Queue heuristic; this is not the queue size, but an 'effective queue size'
 * that ages out contributions from stalled channels.
 */
static uint64_t queue_heuristic = 0;

/*
 * Timestamp of the last queue heuristic update
 */
static time_t queue_heuristic_timestamp = 0;

/* Scheduler static function declarations */

static int scheduler_compare_channels(const void **c1_v, const void **c2_v);
static void scheduler_evt_callback(evutil_socket_t fd,
                                   short events, void *arg);
static int scheduler_more_work(void);
static void scheduler_retrigger(void);
#if 0
static void scheduler_trigger(void);
#endif
static uint64_t scheduler_get_queue_heuristic(void);
static void scheduler_update_queue_heuristic(time_t now);

/* Scheduler function implementations */

/** Free everything and shut down the scheduling system */

void
scheduler_free_all(void)
{
  log_debug(LD_SCHED, "Shutting down scheduler");

  if (run_sched_ev) {
    event_del(run_sched_ev);
    tor_event_free(run_sched_ev);
    run_sched_ev = NULL;
  }

  if (channels_pending) {
    smartlist_free(channels_pending);
    channels_pending = NULL;
  }
}

/**
 * Comparison function to use when sorting pending channels
 */

static int
scheduler_compare_channels(const void **c1_v, const void **c2_v)
{
  channel_t *c1 = NULL, *c2 = NULL;
  /* These are a workaround for -Wbad-function-cast throwing a fit */
  const circuitmux_policy_t *p1, *p2;
  uintptr_t p1_i, p2_i;

  tor_assert(c1_v);
  tor_assert(c2_v);

  c1 = (channel_t *)(*c1_v);
  c2 = (channel_t *)(*c2_v);

  tor_assert(c1);
  tor_assert(c2);

  if (c1 != c2) {
    if (circuitmux_get_policy(c1->cmux) ==
        circuitmux_get_policy(c2->cmux)) {
      /* Same cmux policy, so use the mux comparison */
      return circuitmux_compare_muxes(c1->cmux, c2->cmux);
    } else {
      /*
       * Different policies; not important to get this edge case perfect
       * because the current code never actually gives different channels
       * different cmux policies anyway.  Just use this arbitrary but
       * definite choice.
       */
      p1 = circuitmux_get_policy(c1->cmux);
      p2 = circuitmux_get_policy(c2->cmux);
      p1_i = (uintptr_t)p1;
      p2_i = (uintptr_t)p2;

      return (p1_i < p2_i) ? -1 : 1;
    }
  } else {
    /* c1 == c2, so always equal */
    return 0;
  }
}

/*
 * Scheduler event callback; this should get triggered once per event loop
 * if any scheduling work was created during the event loop.
 */

static void
scheduler_evt_callback(evutil_socket_t fd, short events, void *arg)
{
  (void)fd;
  (void)events;
  (void)arg;

  log_debug(LD_SCHED, "Scheduler event callback called");

  tor_assert(run_sched_ev);

  /* Run the scheduler */
  scheduler_run();

  /* Do we have more work to do? */
  if (scheduler_more_work()) scheduler_retrigger();
}

/** Mark a channel as no longer ready to accept writes */

void
scheduler_channel_doesnt_want_writes(channel_t *chan)
{
  tor_assert(chan);
  tor_assert(channels_pending);

  /* If it's already in pending, we can put it in waiting_to_write */
  if (chan->scheduler_state == SCHED_CHAN_PENDING) {
    /*
     * It's in channels_pending, so it shouldn't be in any of
     * the other lists.  It can't write any more, so it goes to
     * channels_waiting_to_write.
     */
    smartlist_remove(channels_pending, chan);
    chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
    log_debug(LD_SCHED,
              "Channel " U64_FORMAT " at %p went from pending "
              "to waiting_to_write",
              U64_PRINTF_ARG(chan->global_identifier), chan);
  } else {
    /*
     * It's not in pending, so it can't become waiting_to_write; it's
     * either not in any of the lists (nothing to do) or it's already in
     * waiting_for_cells (remove it, can't write any more).
     */
    if (chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS) {
      chan->scheduler_state = SCHED_CHAN_IDLE;
      log_debug(LD_SCHED,
                "Channel " U64_FORMAT " at %p left waiting_for_cells",
                U64_PRINTF_ARG(chan->global_identifier), chan);
    }
  }
}

/** Mark a channel as having waiting cells */

void
scheduler_channel_has_waiting_cells(channel_t *chan)
{
  int became_pending = 0;

  tor_assert(chan);
  tor_assert(channels_pending);

  /* First, check if this one is also writeable */
  if (chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS) {
    /*
     * It's in channels_waiting_for_cells, so it shouldn't be in any of
     * the other lists.  It has waiting cells now, so it goes to
     * channels_pending.
     */
    chan->scheduler_state = SCHED_CHAN_PENDING;
    smartlist_add(channels_pending, chan);
    log_debug(LD_SCHED,
              "Channel " U64_FORMAT " at %p went from waiting_for_cells "
              "to pending",
              U64_PRINTF_ARG(chan->global_identifier), chan);
    became_pending = 1;
  } else {
    /*
     * It's not in waiting_for_cells, so it can't become pending; it's
     * either not in any of the lists (we add it to waiting_to_write)
     * or it's already in waiting_to_write or pending (we do nothing)
     */
    if (!(chan->scheduler_state == SCHED_CHAN_WAITING_TO_WRITE ||
          chan->scheduler_state == SCHED_CHAN_PENDING)) {
      chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
      log_debug(LD_SCHED,
                "Channel " U64_FORMAT " at %p entered waiting_to_write",
                U64_PRINTF_ARG(chan->global_identifier), chan);
    }
  }

  /*
   * If we made a channel pending, we potentially have scheduling work
   * to do.
   */
  if (became_pending) scheduler_retrigger();
}
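
/*
 * A minimal sketch of the cell-arrival path that drives the transition
 * above.  The real driver is append_cell_to_circuit_queue() in relay.c;
 * this hypothetical helper only illustrates where the scheduler hook sits.
 */
#if 0
static void
example_on_cell_queued(channel_t *chan)
{
  /* A cell was just queued on a circuit attached to chan; notify the
   * scheduler so chan can move to waiting_to_write or pending. */
  scheduler_channel_has_waiting_cells(chan);
}
#endif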

/** Set up the scheduling system */

void
scheduler_init(void)
{
  log_debug(LD_SCHED, "Initting scheduler");

  tor_assert(!run_sched_ev);
  run_sched_ev = tor_event_new(tor_libevent_get_base(), -1,
                               0, scheduler_evt_callback, NULL);

  channels_pending = smartlist_new();
  queue_heuristic = 0;
  queue_heuristic_timestamp = approx_time();
}
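
/*
 * Hypothetical lifecycle sketch: scheduler_init() is expected to run once
 * after the libevent base exists, and scheduler_free_all() once at
 * shutdown.  The exact call sites (e.g., in main.c) are assumptions, not
 * part of this file.
 */
#if 0
static void
example_scheduler_lifecycle(void)
{
  scheduler_init();     /* once, after tor_libevent_get_base() is usable */
  /* ... event loop runs; channels move through the scheduler states ... */
  scheduler_free_all(); /* once, at shutdown */
}
#endif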

/** Check if there's more scheduling work */

static int
scheduler_more_work(void)
{
  tor_assert(channels_pending);

  return ((scheduler_get_queue_heuristic() < SCHED_Q_LOW_WATER) &&
          (smartlist_len(channels_pending) > 0)) ? 1 : 0;
}

/** Retrigger the scheduler in a way safe to use from the callback */

static void
scheduler_retrigger(void)
{
  tor_assert(run_sched_ev);

  event_active(run_sched_ev, EV_TIMEOUT, 1);
}

/** Notify the scheduler of a channel being closed */

void
scheduler_release_channel(channel_t *chan)
{
  tor_assert(chan);
  tor_assert(channels_pending);

  if (chan->scheduler_state == SCHED_CHAN_PENDING) {
    smartlist_remove(channels_pending, chan);
  }

  chan->scheduler_state = SCHED_CHAN_IDLE;
}

/** Run the scheduling algorithm if necessary */

void
scheduler_run(void)
{
  smartlist_t *tmp = NULL;
  int n_cells, n_chans_before, n_chans_after;
  uint64_t q_len_before, q_heur_before, q_len_after, q_heur_after;
  ssize_t flushed, flushed_this_time;

  log_debug(LD_SCHED, "We have a chance to run the scheduler");

  if (scheduler_get_queue_heuristic() < SCHED_Q_LOW_WATER) {
    n_chans_before = smartlist_len(channels_pending);
    q_len_before = channel_get_global_queue_estimate();
    q_heur_before = scheduler_get_queue_heuristic();
    tmp = channels_pending;
    channels_pending = smartlist_new();

    /*
     * UGLY HACK: sort the list on each invocation
     *
     * TODO smarter data structures
     */
    smartlist_sort(tmp, scheduler_compare_channels);

    SMARTLIST_FOREACH_BEGIN(tmp, channel_t *, chan) {
      if (scheduler_get_queue_heuristic() <= SCHED_Q_HIGH_WATER) {
        n_cells = channel_num_cells_writeable(chan);
        if (n_cells > 0) {
          log_debug(LD_SCHED,
                    "Scheduler saw pending channel " U64_FORMAT " at %p with "
                    "%d cells writeable",
                    U64_PRINTF_ARG(chan->global_identifier), chan, n_cells);

          flushed = 0;
          while (flushed < n_cells) {
            flushed_this_time =
              channel_flush_some_cells(chan, n_cells - flushed);
            if (flushed_this_time <= 0) break;
            flushed += flushed_this_time;
          }

          if (flushed < n_cells) {
            /* We ran out of cells to flush */
            chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
          } else {
            /* TODO get this right */
          }

          log_debug(LD_SCHED,
                    "Scheduler flushed %d cells onto pending channel "
                    U64_FORMAT " at %p",
                    (int)flushed, U64_PRINTF_ARG(chan->global_identifier),
                    chan);
        } else {
          log_info(LD_SCHED,
                   "Scheduler saw pending channel " U64_FORMAT " at %p with "
                   "no cells writeable",
                   U64_PRINTF_ARG(chan->global_identifier), chan);
          /* Put it back to WAITING_TO_WRITE */
          chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
        }
      } else {
        /* Not getting it this round; put it back on the list */
        smartlist_add(channels_pending, chan);
        /* It stays in SCHED_CHAN_PENDING */
      }
    } SMARTLIST_FOREACH_END(chan);

    smartlist_free(tmp);

    n_chans_after = smartlist_len(channels_pending);
    q_len_after = channel_get_global_queue_estimate();
    q_heur_after = scheduler_get_queue_heuristic();
    log_debug(LD_SCHED,
              "Scheduler handled %d of %d pending channels, queue size from "
              U64_FORMAT " to " U64_FORMAT ", queue heuristic from "
              U64_FORMAT " to " U64_FORMAT,
              n_chans_before - n_chans_after, n_chans_before,
              U64_PRINTF_ARG(q_len_before), U64_PRINTF_ARG(q_len_after),
              U64_PRINTF_ARG(q_heur_before), U64_PRINTF_ARG(q_heur_after));
  }
}
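
/*
 * Worked example of the watermark hysteresis above, assuming the queue
 * heuristic is measured in bytes: with SCHED_Q_LOW_WATER at 16384 and
 * SCHED_Q_HIGH_WATER at 32768, scheduler_run() is a no-op until the
 * heuristic decays below 16384; once running, it keeps flushing pending
 * channels until the heuristic climbs past 32768, then leaves the rest in
 * channels_pending for the next pass.  The gap between the two thresholds
 * keeps the scheduler from thrashing on and off around a single cutoff.
 */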

/** Trigger the scheduling event so we run the scheduler later */

#if 0
static void
scheduler_trigger(void)
{
  /* Fire on the next event loop iteration; note that event_add() takes the
   * timeout as a struct timeval, not the EV_TIMEOUT flag. */
  static const struct timeval run_now = { 0, 0 };

  log_debug(LD_SCHED, "Triggering scheduler event");

  tor_assert(run_sched_ev);

  event_add(run_sched_ev, &run_now);
}
#endif

/** Mark a channel as ready to accept writes */

void
scheduler_channel_wants_writes(channel_t *chan)
{
  int became_pending = 0;

  tor_assert(chan);
  tor_assert(channels_pending);

  /* If it's already in waiting_to_write, we can put it in pending */
  if (chan->scheduler_state == SCHED_CHAN_WAITING_TO_WRITE) {
    /*
     * It can write now, so it goes to channels_pending.
     */
    smartlist_add(channels_pending, chan);
    chan->scheduler_state = SCHED_CHAN_PENDING;
    log_debug(LD_SCHED,
              "Channel " U64_FORMAT " at %p went from waiting_to_write "
              "to pending",
              U64_PRINTF_ARG(chan->global_identifier), chan);
    became_pending = 1;
  } else {
    /*
     * It's not in SCHED_CHAN_WAITING_TO_WRITE, so it can't become pending;
     * it's either idle and goes to WAITING_FOR_CELLS, or it's a no-op.
     */
    if (!(chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS ||
          chan->scheduler_state == SCHED_CHAN_PENDING)) {
      chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
      log_debug(LD_SCHED,
                "Channel " U64_FORMAT " at %p entered waiting_for_cells",
                U64_PRINTF_ARG(chan->global_identifier), chan);
    }
  }

  /*
   * If we made a channel pending, we potentially have scheduling work
   * to do.
   */
  if (became_pending) scheduler_retrigger();
}
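
/*
 * A minimal sketch of the writeable-again path that drives the transition
 * above.  For channel_tls_t the real driver is connection_or_flushed_some()
 * in connection_or.c; this hypothetical helper only shows where the
 * scheduler hook sits.
 */
#if 0
static void
example_on_flushed_some(channel_t *chan)
{
  /* The channel's output buffer drained enough that it can accept writes
   * again; tell the scheduler. */
  scheduler_channel_wants_writes(chan);
}
#endif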

/**
 * Notify the scheduler of a queue size adjustment, to recalculate the
 * queue heuristic.
 */

void
scheduler_adjust_queue_size(channel_t *chan, char dir, uint64_t adj)
{
  time_t now = approx_time();

  log_debug(LD_SCHED,
            "Queue size adjustment by %s" U64_FORMAT " for channel "
            U64_FORMAT,
            (dir >= 0) ? "+" : "-",
            U64_PRINTF_ARG(adj),
            U64_PRINTF_ARG(chan->global_identifier));

  /* Get the queue heuristic up to date */
  scheduler_update_queue_heuristic(now);

  /* Adjust as appropriate */
  if (dir >= 0) {
    /* Increasing it */
    queue_heuristic += adj;
  } else {
    /* Decreasing it */
    if (queue_heuristic > adj) queue_heuristic -= adj;
    else queue_heuristic = 0;
  }

  log_debug(LD_SCHED,
            "Queue heuristic is now " U64_FORMAT,
            U64_PRINTF_ARG(queue_heuristic));
}
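
/*
 * Worked example of the clamping above: starting from queue_heuristic ==
 * 512, scheduler_adjust_queue_size(chan, 1, 1024) brings it to 1536, and a
 * subsequent scheduler_adjust_queue_size(chan, -1, 4096) clamps it to 0
 * rather than underflowing the unsigned counter.
 */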

/**
 * Query the current value of the queue heuristic
 */

static uint64_t
scheduler_get_queue_heuristic(void)
{
  time_t now = approx_time();

  scheduler_update_queue_heuristic(now);

  return queue_heuristic;
}

/**
 * Adjust the queue heuristic value to the present time
 */

static void
scheduler_update_queue_heuristic(time_t now)
{
  time_t diff;

  if (queue_heuristic_timestamp == 0) {
    /*
     * Nothing we can sensibly do; must not have been initted properly.
     * Oh well.
     */
    queue_heuristic_timestamp = now;
  } else if (queue_heuristic_timestamp < now) {
    diff = now - queue_heuristic_timestamp;
    /*
     * This is a simple exponential age-out; the other proposed alternative
     * was a linear age-out using the bandwidth history in rephist.c; I'm
     * going with this out of concern that if an adversary can jam the
     * scheduler long enough, it would cause the bandwidth to drop to
     * zero and render the aging mechanism ineffective thereafter.
     */
    if (0 <= diff && diff < 64) queue_heuristic >>= diff;
    else queue_heuristic = 0;

    queue_heuristic_timestamp = now;

    log_debug(LD_SCHED,
              "Queue heuristic is now " U64_FORMAT,
              U64_PRINTF_ARG(queue_heuristic));
  }
  /* else no update needed, or time went backward */
}
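
/*
 * Worked example of the exponential age-out above: the heuristic halves
 * for every elapsed second, so with queue_heuristic == 16384 and diff == 3
 * the new value is 16384 >> 3 == 2048.  At 64 or more seconds the shift
 * amount would reach the width of the 64-bit counter, so the heuristic is
 * simply reset to 0 instead.
 */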