scheduler.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562
  1. /* * Copyright (c) 2013, The Tor Project, Inc. */
  2. /* See LICENSE for licensing information */
  3. /**
  4. * \file scheduler.c
  5. * \brief Relay scheduling system
  6. **/
  7. #include "or.h"
  8. #define TOR_CHANNEL_INTERNAL_ /* For channel_flush_some_cells() */
  9. #include "channel.h"
  10. #include "compat_libevent.h"
  11. #include "scheduler.h"
  12. #ifdef HAVE_EVENT2_EVENT_H
  13. #include <event2/event.h>
  14. #else
  15. #include <event.h>
  16. #endif
/*
 * Thresholds compared against the queue heuristic (see
 * scheduler_get_queue_heuristic()).
 *
 * SCHED_Q_LOW_WATER: scheduler_run() only starts a pass, and
 * scheduler_more_work() only reports more work, while the heuristic is
 * strictly below this value.
 */
#define SCHED_Q_LOW_WATER 16384
/*
 * SCHED_Q_HIGH_WATER: within a single scheduler_run() pass, channels are
 * only serviced while the heuristic is at or below this value; the rest are
 * put back on the pending list.
 */
#define SCHED_Q_HIGH_WATER (2 * SCHED_Q_LOW_WATER)
  19. /*
  20. * Write scheduling works by keeping track of lists of channels that can
  21. * accept cells, and have cells to write. From the scheduler's perspective,
  22. * a channel can be in four possible states:
  23. *
  24. * 1.) Not open for writes, no cells to send
  25. * - Not much to do here, and the channel will appear in neither list.
  26. * - Transitions from:
  27. * - Open for writes/has cells by simultaneously draining all circuit
  28. * queues and filling the output buffer.
  29. * - Transitions to:
  30. * - Not open for writes/has cells by arrival of cells on an attached
  31. * circuit (this would be driven from append_cell_to_circuit_queue())
  32. * - Open for writes/no cells by a channel type specific path;
  33. * driven from connection_or_flushed_some() for channel_tls_t.
  34. *
  35. * 2.) Open for writes, no cells to send
  36. * - Not much here either; this will be the state an idle but open channel
  37. * can be expected to settle in.
  38. * - Transitions from:
  39. * - Not open for writes/no cells by flushing some of the output
  40. * buffer.
  41. * - Open for writes/has cells by the scheduler moving cells from
  42. * circuit queues to channel output queue, but not having enough
  43. * to fill the output queue.
  44. * - Transitions to:
  45. * - Open for writes/has cells by arrival of new cells on an attached
  46. * circuit, in append_cell_to_circuit_queue()
  47. *
  48. * 3.) Not open for writes, cells to send
  49. * - This is the state of a busy circuit limited by output bandwidth;
  50. * cells have piled up in the circuit queues waiting to be relayed.
  51. * - Transitions from:
  52. * - Not open for writes/no cells by arrival of cells on an attached
  53. * circuit
  54. * - Open for writes/has cells by filling an output buffer without
  55. * draining all cells from attached circuits
  56. * - Transitions to:
  57. * - Open for writes/has cells by draining some of the output buffer
  58. * via the connection_or_flushed_some() path (for channel_tls_t).
  59. *
  60. * 4.) Open for writes, cells to send
  61. * - This connection is ready to relay some cells and waiting for
  62. * the scheduler to choose it
  63. * - Transitions from:
  64. * - Not open for writes/has cells by the connection_or_flushed_some()
  65. * path
  66. * - Open for writes/no cells by the append_cell_to_circuit_queue()
  67. * path
  68. * - Transitions to:
  69. * - Not open for writes/no cells by draining all circuit queues and
  70. * simultaneously filling the output buffer.
  71. * - Not open for writes/has cells by writing enough cells to fill the
  72. * output buffer
  73. * - Open for writes/no cells by draining all attached circuit queues
  74. * without also filling the output buffer
  75. *
  76. * Other event-driven parts of the code move channels between these scheduling
  77. * states by calling scheduler functions; the scheduler only runs on open-for-
  78. * writes/has-cells channels and is the only path for those to transition to
  79. * other states. The scheduler_run() function gives us the opportunity to do
  80. * scheduling work, and is called from other scheduler functions whenever a
  81. * state transition occurs, and periodically from the main event loop.
  82. */
  83. /* Scheduler global data structures */
  84. /*
  85. * We keep lists of channels that either have cells queued, can accept
  86. * writes, or both (states 2, 3 and 4 above) - no explicit list of state
  87. * 1 channels is kept, so we don't have to worry about registering new
  88. * channels here or anything. The scheduler will learn about them when
  89. * it needs to. We can check how many channels in state 4 in O(1), so
  90. * the test whether we have anything to do in scheduler_run() is fast
  91. * and there's no harm in calling it opportunistically whenever we get
  92. * the chance.
  93. *
  94. * Note that it takes time O(n) to search for a channel in these smartlists
  95. * or move one; I don't think the number of channels on a relay will be large
  96. * enough for this to be a severe problem, but this would benefit from using
  97. * a doubly-linked list rather than smartlist_t, together with a hash map from
  98. * channel identifiers to pointers to list entries, so we can perform those
  99. * operations in O(log(n)).
  100. */
/*
 * List of channels that can write but have no cells (state 2 above).
 * Created in scheduler_init() and freed in scheduler_free_all().
 */
static smartlist_t *channels_waiting_for_cells = NULL;
/*
 * List of channels with cells waiting to write (state 3 above).
 * Created in scheduler_init() and freed in scheduler_free_all().
 */
static smartlist_t *channels_waiting_to_write = NULL;
/*
 * List of channels that can write and have cells (pending work; state 4
 * above). scheduler_run() swaps this list for a fresh one while it works
 * through the previous contents.
 */
static smartlist_t *channels_pending = NULL;
/*
 * This event runs the scheduler from its callback, and is manually
 * activated (see scheduler_retrigger()) whenever a channel enters the
 * open for writes/cells to send state.
 */
static struct event *run_sched_ev = NULL;
/*
 * Queue heuristic; this is not the queue size, but an 'effective queue
 * size' that ages out contributions from stalled channels. It is adjusted
 * by scheduler_adjust_queue_size() and aged by
 * scheduler_update_queue_heuristic().
 */
static uint64_t queue_heuristic = 0;
/*
 * Timestamp of the last queue heuristic update; zero means that no update
 * has happened yet.
 */
static time_t queue_heuristic_timestamp = 0;
/* Scheduler static function declarations */
static void scheduler_evt_callback(evutil_socket_t fd,
                                   short events, void *arg);
static int scheduler_more_work(void);
static void scheduler_retrigger(void);
/* scheduler_trigger() is currently compiled out, and so is its prototype */
#if 0
static void scheduler_trigger(void);
#endif
static uint64_t scheduler_get_queue_heuristic(void);
static void scheduler_update_queue_heuristic(time_t now);
  131. /* Scheduler function implementations */
  132. /** Free everything and shut down the scheduling system */
  133. void
  134. scheduler_free_all(void)
  135. {
  136. log_debug(LD_SCHED, "Shutting down scheduler");
  137. if (run_sched_ev) {
  138. event_del(run_sched_ev);
  139. tor_event_free(run_sched_ev);
  140. run_sched_ev = NULL;
  141. }
  142. if (channels_waiting_for_cells) {
  143. smartlist_free(channels_waiting_for_cells);
  144. channels_waiting_for_cells = NULL;
  145. }
  146. if (channels_waiting_to_write) {
  147. smartlist_free(channels_waiting_to_write);
  148. channels_waiting_to_write = NULL;
  149. }
  150. if (channels_pending) {
  151. smartlist_free(channels_pending);
  152. channels_pending = NULL;
  153. }
  154. }
  155. /*
  156. * Scheduler event callback; this should get triggered once per event loop
  157. * if any scheduling work was created during the event loop.
  158. */
  159. static void
  160. scheduler_evt_callback(evutil_socket_t fd, short events, void *arg)
  161. {
  162. (void)fd;
  163. (void)events;
  164. (void)arg;
  165. log_debug(LD_SCHED, "Scheduler event callback called");
  166. tor_assert(run_sched_ev);
  167. /* Run the scheduler */
  168. scheduler_run();
  169. /* Do we have more work to do? */
  170. if (scheduler_more_work()) scheduler_retrigger();
  171. }
  172. /** Mark a channel as no longer ready to accept writes */
  173. void
  174. scheduler_channel_doesnt_want_writes(channel_t *chan)
  175. {
  176. tor_assert(chan);
  177. tor_assert(channels_waiting_for_cells);
  178. tor_assert(channels_waiting_to_write);
  179. tor_assert(channels_pending);
  180. /* If it's already in pending, we can put it in waiting_to_write */
  181. if (smartlist_contains(channels_pending, chan)) {
  182. /*
  183. * It's in channels_pending, so it shouldn't be in any of
  184. * the other lists. It can't write any more, so it goes to
  185. * channels_waiting_to_write.
  186. */
  187. smartlist_remove(channels_pending, chan);
  188. smartlist_add(channels_waiting_to_write, chan);
  189. log_debug(LD_SCHED,
  190. "Channel " U64_FORMAT " at %p went from pending "
  191. "to waiting_to_write",
  192. U64_PRINTF_ARG(chan->global_identifier), chan);
  193. } else {
  194. /*
  195. * It's not in pending, so it can't become waiting_to_write; it's
  196. * either not in any of the lists (nothing to do) or it's already in
  197. * waiting_for_cells (remove it, can't write any more).
  198. */
  199. if (smartlist_contains(channels_waiting_for_cells, chan)) {
  200. smartlist_remove(channels_waiting_for_cells, chan);
  201. log_debug(LD_SCHED,
  202. "Channel " U64_FORMAT " at %p left waiting_for_cells",
  203. U64_PRINTF_ARG(chan->global_identifier), chan);
  204. }
  205. }
  206. }
  207. /** Mark a channel as having waiting cells */
  208. void
  209. scheduler_channel_has_waiting_cells(channel_t *chan)
  210. {
  211. int became_pending = 0;
  212. tor_assert(chan);
  213. tor_assert(channels_waiting_for_cells);
  214. tor_assert(channels_waiting_to_write);
  215. tor_assert(channels_pending);
  216. /* First, check if this one also writeable */
  217. if (smartlist_contains(channels_waiting_for_cells, chan)) {
  218. /*
  219. * It's in channels_waiting_for_cells, so it shouldn't be in any of
  220. * the other lists. It has waiting cells now, so it goes to
  221. * channels_pending.
  222. */
  223. smartlist_remove(channels_waiting_for_cells, chan);
  224. smartlist_add(channels_pending, chan);
  225. log_debug(LD_SCHED,
  226. "Channel " U64_FORMAT " at %p went from waiting_for_cells "
  227. "to pending",
  228. U64_PRINTF_ARG(chan->global_identifier), chan);
  229. became_pending = 1;
  230. } else {
  231. /*
  232. * It's not in waiting_for_cells, so it can't become pending; it's
  233. * either not in any of the lists (we add it to waiting_to_write)
  234. * or it's already in waiting_to_write or pending (we do nothing)
  235. */
  236. if (!(smartlist_contains(channels_waiting_to_write, chan) ||
  237. smartlist_contains(channels_pending, chan))) {
  238. smartlist_add(channels_waiting_to_write, chan);
  239. log_debug(LD_SCHED,
  240. "Channel " U64_FORMAT " at %p entered waiting_to_write",
  241. U64_PRINTF_ARG(chan->global_identifier), chan);
  242. }
  243. }
  244. /*
  245. * If we made a channel pending, we potentially have scheduling work
  246. * to do.
  247. */
  248. if (became_pending) scheduler_retrigger();
  249. }
  250. /** Set up the scheduling system */
  251. void
  252. scheduler_init(void)
  253. {
  254. log_debug(LD_SCHED, "Initting scheduler");
  255. tor_assert(!run_sched_ev);
  256. run_sched_ev = tor_event_new(tor_libevent_get_base(), -1,
  257. 0, scheduler_evt_callback, NULL);
  258. channels_waiting_for_cells = smartlist_new();
  259. channels_waiting_to_write = smartlist_new();
  260. channels_pending = smartlist_new();
  261. queue_heuristic = 0;
  262. queue_heuristic_timestamp = approx_time();
  263. }
  264. /** Check if there's more scheduling work */
  265. static int
  266. scheduler_more_work(void)
  267. {
  268. tor_assert(channels_pending);
  269. return ((scheduler_get_queue_heuristic() < SCHED_Q_LOW_WATER) &&
  270. ((smartlist_len(channels_pending) > 0))) ? 1 : 0;
  271. }
/**
 * Retrigger the scheduler in a way safe to use from the callback.
 *
 * This activates run_sched_ev by hand, so that scheduler_evt_callback() is
 * invoked by the event loop. The event must already have been created by
 * scheduler_init().
 */
static void
scheduler_retrigger(void)
{
  tor_assert(run_sched_ev);
  /* Make the event active directly rather than waiting for a trigger */
  event_active(run_sched_ev, EV_TIMEOUT, 1);
}
  279. /** Notify the scheduler of a channel being closed */
  280. void
  281. scheduler_release_channel(channel_t *chan)
  282. {
  283. tor_assert(chan);
  284. tor_assert(channels_waiting_for_cells);
  285. tor_assert(channels_waiting_to_write);
  286. tor_assert(channels_pending);
  287. smartlist_remove(channels_waiting_for_cells, chan);
  288. smartlist_remove(channels_waiting_to_write, chan);
  289. smartlist_remove(channels_pending, chan);
  290. }
  291. /** Run the scheduling algorithm if necessary */
  292. void
  293. scheduler_run(void)
  294. {
  295. smartlist_t *tmp = NULL;
  296. int n_cells, n_chans_before, n_chans_after;
  297. uint64_t q_len_before, q_heur_before, q_len_after, q_heur_after;
  298. ssize_t flushed, flushed_this_time;
  299. log_debug(LD_SCHED, "We have a chance to run the scheduler");
  300. if (scheduler_get_queue_heuristic() < SCHED_Q_LOW_WATER) {
  301. n_chans_before = smartlist_len(channels_pending);
  302. q_len_before = channel_get_global_queue_estimate();
  303. q_heur_before = scheduler_get_queue_heuristic();
  304. tmp = channels_pending;
  305. channels_pending = smartlist_new();
  306. /*
  307. * For now, just run the old scheduler on all the chans in the list, until
  308. * we hit the high-water mark. TODO real channel priority API
  309. */
  310. SMARTLIST_FOREACH_BEGIN(tmp, channel_t *, chan) {
  311. if (scheduler_get_queue_heuristic() <= SCHED_Q_HIGH_WATER) {
  312. n_cells = channel_num_cells_writeable(chan);
  313. if (n_cells > 0) {
  314. log_debug(LD_SCHED,
  315. "Scheduler saw pending channel " U64_FORMAT " at %p with "
  316. "%d cells writeable",
  317. U64_PRINTF_ARG(chan->global_identifier), chan, n_cells);
  318. flushed = 0;
  319. while (flushed < n_cells) {
  320. flushed_this_time =
  321. channel_flush_some_cells(chan, n_cells - flushed);
  322. if (flushed_this_time <= 0) break;
  323. flushed += flushed_this_time;
  324. }
  325. log_debug(LD_SCHED,
  326. "Scheduler flushed %d cells onto pending channel "
  327. U64_FORMAT " at %p",
  328. flushed, U64_PRINTF_ARG(chan->global_identifier), chan);
  329. } else {
  330. log_info(LD_SCHED,
  331. "Scheduler saw pending channel " U64_FORMAT " at %p with "
  332. "no cells writeable",
  333. U64_PRINTF_ARG(chan->global_identifier), chan);
  334. }
  335. } else {
  336. /* Not getting it this round; put it back on the list */
  337. smartlist_add(channels_pending, chan);
  338. }
  339. } SMARTLIST_FOREACH_END(chan);
  340. smartlist_free(tmp);
  341. n_chans_after = smartlist_len(channels_pending);
  342. q_len_after = channel_get_global_queue_estimate();
  343. q_heur_after = scheduler_get_queue_heuristic();
  344. log_debug(LD_SCHED,
  345. "Scheduler handled %d of %d pending channels, queue size from "
  346. U64_FORMAT " to " U64_FORMAT ", queue heuristic from "
  347. U64_FORMAT " to " U64_FORMAT,
  348. n_chans_before - n_chans_after, n_chans_before,
  349. U64_PRINTF_ARG(q_len_before), U64_PRINTF_ARG(q_len_after),
  350. U64_PRINTF_ARG(q_heur_before), U64_PRINTF_ARG(q_heur_after));
  351. }
  352. }
/**
 * Trigger the scheduling event so we run the scheduler later.
 *
 * This function is compiled out (#if 0), as is its prototype.
 */
#if 0
static void
scheduler_trigger(void)
{
  log_debug(LD_SCHED, "Triggering scheduler event");
  tor_assert(run_sched_ev);
  /*
   * NOTE(review): these are the arguments scheduler_retrigger() passes to
   * event_active(); libevent's event_add() presumably expects a timeout
   * pointer as its second argument instead -- confirm before re-enabling.
   */
  event_add(run_sched_ev, EV_TIMEOUT, 1);
}
#endif
  363. /** Mark a channel as ready to accept writes */
  364. void
  365. scheduler_channel_wants_writes(channel_t *chan)
  366. {
  367. int became_pending = 0;
  368. tor_assert(chan);
  369. tor_assert(channels_waiting_for_cells);
  370. tor_assert(channels_waiting_to_write);
  371. tor_assert(channels_pending);
  372. /* If it's already in waiting_to_write, we can put it in pending */
  373. if (smartlist_contains(channels_waiting_to_write, chan)) {
  374. /*
  375. * It's in channels_waiting_to_write, so it shouldn't be in any of
  376. * the other lists. It can write now, so it goes to channels_pending.
  377. */
  378. smartlist_remove(channels_waiting_to_write, chan);
  379. smartlist_add(channels_pending, chan);
  380. log_debug(LD_SCHED,
  381. "Channel " U64_FORMAT " at %p went from waiting_to_write "
  382. "to pending",
  383. U64_PRINTF_ARG(chan->global_identifier), chan);
  384. became_pending = 1;
  385. } else {
  386. /*
  387. * It's not in waiting_to_write, so it can't become pending; it's
  388. * either not in any of the lists (we add it to waiting_for_cells)
  389. * or it's already in waiting_for_cells or pending (we do nothing)
  390. */
  391. if (!(smartlist_contains(channels_waiting_for_cells, chan) ||
  392. smartlist_contains(channels_pending, chan))) {
  393. smartlist_add(channels_waiting_for_cells, chan);
  394. log_debug(LD_SCHED,
  395. "Channel " U64_FORMAT " at %p entered waiting_for_cells",
  396. U64_PRINTF_ARG(chan->global_identifier), chan);
  397. }
  398. }
  399. /*
  400. * If we made a channel pending, we potentially have scheduling work
  401. * to do.
  402. */
  403. if (became_pending) scheduler_retrigger();
  404. }
  405. /**
  406. * Notify the scheduler of a queue size adjustment, to recalculate the
  407. * queue heuristic.
  408. */
  409. void
  410. scheduler_adjust_queue_size(channel_t *chan, char dir, uint64_t adj)
  411. {
  412. time_t now = approx_time();
  413. log_debug(LD_SCHED,
  414. "Queue size adjustment by %s" U64_FORMAT " for channel "
  415. U64_FORMAT,
  416. (dir >= 0) ? "+" : "-",
  417. U64_PRINTF_ARG(adj),
  418. U64_PRINTF_ARG(chan->global_identifier));
  419. /* Get the queue heuristic up to date */
  420. scheduler_update_queue_heuristic(now);
  421. /* Adjust as appropriate */
  422. if (dir >= 0) {
  423. /* Increasing it */
  424. queue_heuristic += adj;
  425. } else {
  426. /* Decreasing it */
  427. if (queue_heuristic > adj) queue_heuristic -= adj;
  428. else queue_heuristic = 0;
  429. }
  430. log_debug(LD_SCHED,
  431. "Queue heuristic is now " U64_FORMAT,
  432. U64_PRINTF_ARG(queue_heuristic));
  433. }
  434. /**
  435. * Query the current value of the queue heuristic
  436. */
  437. static uint64_t
  438. scheduler_get_queue_heuristic(void)
  439. {
  440. time_t now = approx_time();
  441. scheduler_update_queue_heuristic(now);
  442. return queue_heuristic;
  443. }
  444. /**
  445. * Adjust the queue heuristic value to the present time
  446. */
  447. static void
  448. scheduler_update_queue_heuristic(time_t now)
  449. {
  450. time_t diff;
  451. if (queue_heuristic_timestamp == 0) {
  452. /*
  453. * Nothing we can sensibly do; must not have been initted properly.
  454. * Oh well.
  455. */
  456. queue_heuristic_timestamp = now;
  457. } else if (queue_heuristic_timestamp < now) {
  458. diff = now - queue_heuristic_timestamp;
  459. /*
  460. * This is a simple exponential age-out; the other proposed alternative
  461. * was a linear age-out using the bandwidth history in rephist.c; I'm
  462. * going with this out of concern that if an adversary can jam the
  463. * scheduler long enough, it would cause the bandwidth to drop to
  464. * zero and render the aging mechanism ineffective thereafter.
  465. */
  466. if (0 <= diff && diff < 64) queue_heuristic >>= diff;
  467. else queue_heuristic = 0;
  468. queue_heuristic_timestamp = now;
  469. log_debug(LD_SCHED,
  470. "Queue heuristic is now " U64_FORMAT,
  471. U64_PRINTF_ARG(queue_heuristic));
  472. }
  473. /* else no update needed, or time went backward */
  474. }