util_string.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543
  1. /* Copyright (c) 2003-2004, Roger Dingledine
  2. * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson.
  3. * Copyright (c) 2007-2018, The Tor Project, Inc. */
  4. /* See LICENSE for licensing information */
  5. /**
  6. * \file util_string.c
  7. * \brief Non-standard string functions used throughout Tor.
  8. **/
  9. #include "lib/string/util_string.h"
  10. #include "lib/string/compat_ctype.h"
  11. #include "lib/err/torerr.h"
  12. #include "lib/ctime/di_ops.h"
  13. #include "lib/defs/digest_sizes.h"
  14. #include <string.h>
  15. #include <stdlib.h>
  16. /** Given <b>hlen</b> bytes at <b>haystack</b> and <b>nlen</b> bytes at
  17. * <b>needle</b>, return a pointer to the first occurrence of the needle
  18. * within the haystack, or NULL if there is no such occurrence.
  19. *
  20. * This function is <em>not</em> timing-safe.
  21. *
  22. * Requires that <b>nlen</b> be greater than zero.
  23. */
  24. const void *
  25. tor_memmem(const void *_haystack, size_t hlen,
  26. const void *_needle, size_t nlen)
  27. {
  28. #if defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2)
  29. raw_assert(nlen);
  30. return memmem(_haystack, hlen, _needle, nlen);
  31. #else
  32. /* This isn't as fast as the GLIBC implementation, but it doesn't need to
  33. * be. */
  34. const char *p, *last_possible_start;
  35. const char *haystack = (const char*)_haystack;
  36. const char *needle = (const char*)_needle;
  37. char first;
  38. raw_assert(nlen);
  39. if (nlen > hlen)
  40. return NULL;
  41. p = haystack;
  42. /* Last position at which the needle could start. */
  43. last_possible_start = haystack + hlen - nlen;
  44. first = *(const char*)needle;
  45. while ((p = memchr(p, first, last_possible_start + 1 - p))) {
  46. if (fast_memeq(p, needle, nlen))
  47. return p;
  48. if (++p > last_possible_start) {
  49. /* This comparison shouldn't be necessary, since if p was previously
  50. * equal to last_possible_start, the next memchr call would be
  51. * "memchr(p, first, 0)", which will return NULL. But it clarifies the
  52. * logic. */
  53. return NULL;
  54. }
  55. }
  56. return NULL;
  57. #endif /* defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2) */
  58. }
  59. const void *
  60. tor_memstr(const void *haystack, size_t hlen, const char *needle)
  61. {
  62. return tor_memmem(haystack, hlen, needle, strlen(needle));
  63. }
  64. /** Return true iff the 'len' bytes at 'mem' are all zero. */
  65. int
  66. tor_mem_is_zero(const char *mem, size_t len)
  67. {
  68. static const char ZERO[] = {
  69. 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  70. };
  71. while (len >= sizeof(ZERO)) {
  72. /* It's safe to use fast_memcmp here, since the very worst thing an
  73. * attacker could learn is how many initial bytes of a secret were zero */
  74. if (fast_memcmp(mem, ZERO, sizeof(ZERO)))
  75. return 0;
  76. len -= sizeof(ZERO);
  77. mem += sizeof(ZERO);
  78. }
  79. /* Deal with leftover bytes. */
  80. if (len)
  81. return fast_memeq(mem, ZERO, len);
  82. return 1;
  83. }
  84. /** Return true iff the DIGEST_LEN bytes in digest are all zero. */
  85. int
  86. tor_digest_is_zero(const char *digest)
  87. {
  88. static const uint8_t ZERO_DIGEST[] = {
  89. 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
  90. };
  91. return tor_memeq(digest, ZERO_DIGEST, DIGEST_LEN);
  92. }
  93. /** Return true iff the DIGEST256_LEN bytes in digest are all zero. */
  94. int
  95. tor_digest256_is_zero(const char *digest)
  96. {
  97. return tor_mem_is_zero(digest, DIGEST256_LEN);
  98. }
  99. /** Remove from the string <b>s</b> every character which appears in
  100. * <b>strip</b>. */
  101. void
  102. tor_strstrip(char *s, const char *strip)
  103. {
  104. char *readp = s;
  105. while (*readp) {
  106. if (strchr(strip, *readp)) {
  107. ++readp;
  108. } else {
  109. *s++ = *readp++;
  110. }
  111. }
  112. *s = '\0';
  113. }
  114. /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
  115. * lowercase. */
  116. void
  117. tor_strlower(char *s)
  118. {
  119. while (*s) {
  120. *s = TOR_TOLOWER(*s);
  121. ++s;
  122. }
  123. }
  124. /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
  125. * lowercase. */
  126. void
  127. tor_strupper(char *s)
  128. {
  129. while (*s) {
  130. *s = TOR_TOUPPER(*s);
  131. ++s;
  132. }
  133. }
  134. /** Return 1 if every character in <b>s</b> is printable, else return 0.
  135. */
  136. int
  137. tor_strisprint(const char *s)
  138. {
  139. while (*s) {
  140. if (!TOR_ISPRINT(*s))
  141. return 0;
  142. s++;
  143. }
  144. return 1;
  145. }
  146. /** Return 1 if no character in <b>s</b> is uppercase, else return 0.
  147. */
  148. int
  149. tor_strisnonupper(const char *s)
  150. {
  151. while (*s) {
  152. if (TOR_ISUPPER(*s))
  153. return 0;
  154. s++;
  155. }
  156. return 1;
  157. }
  158. /** Return true iff every character in <b>s</b> is whitespace space; else
  159. * return false. */
  160. int
  161. tor_strisspace(const char *s)
  162. {
  163. while (*s) {
  164. if (!TOR_ISSPACE(*s))
  165. return 0;
  166. s++;
  167. }
  168. return 1;
  169. }
  170. /** As strcmp, except that either string may be NULL. The NULL string is
  171. * considered to be before any non-NULL string. */
  172. int
  173. strcmp_opt(const char *s1, const char *s2)
  174. {
  175. if (!s1) {
  176. if (!s2)
  177. return 0;
  178. else
  179. return -1;
  180. } else if (!s2) {
  181. return 1;
  182. } else {
  183. return strcmp(s1, s2);
  184. }
  185. }
  186. /** Compares the first strlen(s2) characters of s1 with s2. Returns as for
  187. * strcmp.
  188. */
  189. int
  190. strcmpstart(const char *s1, const char *s2)
  191. {
  192. size_t n = strlen(s2);
  193. return strncmp(s1, s2, n);
  194. }
  195. /** Compare the s1_len-byte string <b>s1</b> with <b>s2</b>,
  196. * without depending on a terminating nul in s1. Sorting order is first by
  197. * length, then lexically; return values are as for strcmp.
  198. */
  199. int
  200. strcmp_len(const char *s1, const char *s2, size_t s1_len)
  201. {
  202. size_t s2_len = strlen(s2);
  203. if (s1_len < s2_len)
  204. return -1;
  205. if (s1_len > s2_len)
  206. return 1;
  207. return fast_memcmp(s1, s2, s2_len);
  208. }
  209. /** Compares the first strlen(s2) characters of s1 with s2. Returns as for
  210. * strcasecmp.
  211. */
  212. int
  213. strcasecmpstart(const char *s1, const char *s2)
  214. {
  215. size_t n = strlen(s2);
  216. return strncasecmp(s1, s2, n);
  217. }
  218. /** Compare the value of the string <b>prefix</b> with the start of the
  219. * <b>memlen</b>-byte memory chunk at <b>mem</b>. Return as for strcmp.
  220. *
  221. * [As fast_memcmp(mem, prefix, strlen(prefix)) but returns -1 if memlen is
  222. * less than strlen(prefix).]
  223. */
  224. int
  225. fast_memcmpstart(const void *mem, size_t memlen,
  226. const char *prefix)
  227. {
  228. size_t plen = strlen(prefix);
  229. if (memlen < plen)
  230. return -1;
  231. return fast_memcmp(mem, prefix, plen);
  232. }
  233. /** Compares the last strlen(s2) characters of s1 with s2. Returns as for
  234. * strcmp.
  235. */
  236. int
  237. strcmpend(const char *s1, const char *s2)
  238. {
  239. size_t n1 = strlen(s1), n2 = strlen(s2);
  240. if (n2>n1)
  241. return strcmp(s1,s2);
  242. else
  243. return strncmp(s1+(n1-n2), s2, n2);
  244. }
  245. /** Compares the last strlen(s2) characters of s1 with s2. Returns as for
  246. * strcasecmp.
  247. */
  248. int
  249. strcasecmpend(const char *s1, const char *s2)
  250. {
  251. size_t n1 = strlen(s1), n2 = strlen(s2);
  252. if (n2>n1) /* then they can't be the same; figure out which is bigger */
  253. return strcasecmp(s1,s2);
  254. else
  255. return strncasecmp(s1+(n1-n2), s2, n2);
  256. }
  257. /** Return a pointer to the first char of s that is not whitespace and
  258. * not a comment, or to the terminating NUL if no such character exists.
  259. */
  260. const char *
  261. eat_whitespace(const char *s)
  262. {
  263. raw_assert(s);
  264. while (1) {
  265. switch (*s) {
  266. case '\0':
  267. default:
  268. return s;
  269. case ' ':
  270. case '\t':
  271. case '\n':
  272. case '\r':
  273. ++s;
  274. break;
  275. case '#':
  276. ++s;
  277. while (*s && *s != '\n')
  278. ++s;
  279. }
  280. }
  281. }
  282. /** Return a pointer to the first char of s that is not whitespace and
  283. * not a comment, or to the terminating NUL if no such character exists.
  284. */
  285. const char *
  286. eat_whitespace_eos(const char *s, const char *eos)
  287. {
  288. raw_assert(s);
  289. raw_assert(eos && s <= eos);
  290. while (s < eos) {
  291. switch (*s) {
  292. case '\0':
  293. default:
  294. return s;
  295. case ' ':
  296. case '\t':
  297. case '\n':
  298. case '\r':
  299. ++s;
  300. break;
  301. case '#':
  302. ++s;
  303. while (s < eos && *s && *s != '\n')
  304. ++s;
  305. }
  306. }
  307. return s;
  308. }
  309. /** Return a pointer to the first char of s that is not a space or a tab
  310. * or a \\r, or to the terminating NUL if no such character exists. */
  311. const char *
  312. eat_whitespace_no_nl(const char *s)
  313. {
  314. while (*s == ' ' || *s == '\t' || *s == '\r')
  315. ++s;
  316. return s;
  317. }
  318. /** As eat_whitespace_no_nl, but stop at <b>eos</b> whether we have
  319. * found a non-whitespace character or not. */
  320. const char *
  321. eat_whitespace_eos_no_nl(const char *s, const char *eos)
  322. {
  323. while (s < eos && (*s == ' ' || *s == '\t' || *s == '\r'))
  324. ++s;
  325. return s;
  326. }
  327. /** Return a pointer to the first char of s that is whitespace or <b>#</b>,
  328. * or to the terminating NUL if no such character exists.
  329. */
  330. const char *
  331. find_whitespace(const char *s)
  332. {
  333. /* tor_assert(s); */
  334. while (1) {
  335. switch (*s)
  336. {
  337. case '\0':
  338. case '#':
  339. case ' ':
  340. case '\r':
  341. case '\n':
  342. case '\t':
  343. return s;
  344. default:
  345. ++s;
  346. }
  347. }
  348. }
  349. /** As find_whitespace, but stop at <b>eos</b> whether we have found a
  350. * whitespace or not. */
  351. const char *
  352. find_whitespace_eos(const char *s, const char *eos)
  353. {
  354. /* tor_assert(s); */
  355. while (s < eos) {
  356. switch (*s)
  357. {
  358. case '\0':
  359. case '#':
  360. case ' ':
  361. case '\r':
  362. case '\n':
  363. case '\t':
  364. return s;
  365. default:
  366. ++s;
  367. }
  368. }
  369. return s;
  370. }
  371. /** Return the first occurrence of <b>needle</b> in <b>haystack</b> that
  372. * occurs at the start of a line (that is, at the beginning of <b>haystack</b>
  373. * or immediately after a newline). Return NULL if no such string is found.
  374. */
  375. const char *
  376. find_str_at_start_of_line(const char *haystack, const char *needle)
  377. {
  378. size_t needle_len = strlen(needle);
  379. do {
  380. if (!strncmp(haystack, needle, needle_len))
  381. return haystack;
  382. haystack = strchr(haystack, '\n');
  383. if (!haystack)
  384. return NULL;
  385. else
  386. ++haystack;
  387. } while (*haystack);
  388. return NULL;
  389. }
  390. /** Returns true if <b>string</b> could be a C identifier.
  391. A C identifier must begin with a letter or an underscore and the
  392. rest of its characters can be letters, numbers or underscores. No
  393. length limit is imposed. */
  394. int
  395. string_is_C_identifier(const char *string)
  396. {
  397. size_t iter;
  398. size_t length = strlen(string);
  399. if (!length)
  400. return 0;
  401. for (iter = 0; iter < length ; iter++) {
  402. if (iter == 0) {
  403. if (!(TOR_ISALPHA(string[iter]) ||
  404. string[iter] == '_'))
  405. return 0;
  406. } else {
  407. if (!(TOR_ISALPHA(string[iter]) ||
  408. TOR_ISDIGIT(string[iter]) ||
  409. string[iter] == '_'))
  410. return 0;
  411. }
  412. }
  413. return 1;
  414. }
  415. /** A byte with the top <b>x</b> bits set. */
  416. #define TOP_BITS(x) ((uint8_t)(0xFF << (8 - (x))))
  417. /** A byte with the lowest <b>x</b> bits set. */
  418. #define LOW_BITS(x) ((uint8_t)(0xFF >> (8 - (x))))
  419. /** Given the leading byte <b>b</b>, return the total number of bytes in the
  420. * UTF-8 character. Returns 0 if it's an invalid leading byte.
  421. */
  422. static uint8_t
  423. bytes_in_char(uint8_t b)
  424. {
  425. if ((TOP_BITS(1) & b) == 0x00)
  426. return 1; // a 1-byte UTF-8 char, aka ASCII
  427. if ((TOP_BITS(3) & b) == TOP_BITS(2))
  428. return 2; // a 2-byte UTF-8 char
  429. if ((TOP_BITS(4) & b) == TOP_BITS(3))
  430. return 3; // a 3-byte UTF-8 char
  431. if ((TOP_BITS(5) & b) == TOP_BITS(4))
  432. return 4; // a 4-byte UTF-8 char
  433. // Invalid: either the top 2 bits are 10, or the top 5 bits are 11111.
  434. return 0;
  435. }
  436. /** Returns true iff <b>b</b> is a UTF-8 continuation byte. */
  437. static bool
  438. is_continuation_byte(uint8_t b)
  439. {
  440. uint8_t top2bits = b & TOP_BITS(2);
  441. return top2bits == TOP_BITS(1);
  442. }
  443. /** Returns true iff the <b>len</b> bytes in <b>c</b> are a valid UTF-8
  444. * character.
  445. */
  446. static bool
  447. validate_char(const uint8_t *c, uint8_t len)
  448. {
  449. if (len == 1)
  450. return true; // already validated this is an ASCII char earlier.
  451. uint8_t mask = LOW_BITS(7 - len); // bitmask for the leading byte.
  452. uint32_t codepoint = c[0] & mask;
  453. mask = LOW_BITS(6); // bitmask for continuation bytes.
  454. for (uint8_t i = 1; i < len; i++) {
  455. if (!is_continuation_byte(c[i]))
  456. return false;
  457. codepoint <<= 6;
  458. codepoint |= (c[i] & mask);
  459. }
  460. if (len == 2 && codepoint <= 0x7f)
  461. return false; // Invalid, overly long encoding, should have fit in 1 byte.
  462. if (len == 3 && codepoint <= 0x7ff)
  463. return false; // Invalid, overly long encoding, should have fit in 2 bytes.
  464. if (len == 4 && codepoint <= 0xffff)
  465. return false; // Invalid, overly long encoding, should have fit in 3 bytes.
  466. if (codepoint >= 0xd800 && codepoint <= 0xdfff)
  467. return false; // Invalid, reserved for UTF-16 surrogate pairs.
  468. return codepoint <= 0x10ffff; // Check if within maximum.
  469. }
  470. /** Returns true iff the first <b>len</b> bytes in <b>str</b> are a
  471. valid UTF-8 string. */
  472. int
  473. string_is_utf8(const char *str, size_t len)
  474. {
  475. for (size_t i = 0; i < len;) {
  476. uint8_t num_bytes = bytes_in_char(str[i]);
  477. if (num_bytes == 0) // Invalid leading byte found.
  478. return false;
  479. size_t next_char = i + num_bytes;
  480. if (next_char > len)
  481. return false;
  482. // Validate the continuation bytes in this multi-byte character,
  483. // and advance to the next character in the string.
  484. if (!validate_char((const uint8_t*)&str[i], num_bytes))
  485. return false;
  486. i = next_char;
  487. }
  488. return true;
  489. }