util_string.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538
  1. /* Copyright (c) 2003-2004, Roger Dingledine
  2. * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson.
  3. * Copyright (c) 2007-2019, The Tor Project, Inc. */
  4. /* See LICENSE for licensing information */
  5. /**
  6. * \file util_string.c
  7. * \brief Non-standard string functions used throughout Tor.
  8. **/
  9. #include "lib/string/util_string.h"
  10. #include "lib/string/compat_ctype.h"
  11. #include "lib/err/torerr.h"
  12. #include "lib/ctime/di_ops.h"
  13. #include "lib/defs/digest_sizes.h"
  14. #include <string.h>
  15. #include <stdlib.h>
  16. /** Given <b>hlen</b> bytes at <b>haystack</b> and <b>nlen</b> bytes at
  17. * <b>needle</b>, return a pointer to the first occurrence of the needle
  18. * within the haystack, or NULL if there is no such occurrence.
  19. *
  20. * This function is <em>not</em> timing-safe.
  21. *
  22. * Requires that <b>nlen</b> be greater than zero.
  23. */
  24. const void *
  25. tor_memmem(const void *_haystack, size_t hlen,
  26. const void *_needle, size_t nlen)
  27. {
  28. #if defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2)
  29. raw_assert(nlen);
  30. return memmem(_haystack, hlen, _needle, nlen);
  31. #else
  32. /* This isn't as fast as the GLIBC implementation, but it doesn't need to
  33. * be. */
  34. const char *p, *last_possible_start;
  35. const char *haystack = (const char*)_haystack;
  36. const char *needle = (const char*)_needle;
  37. char first;
  38. raw_assert(nlen);
  39. if (nlen > hlen)
  40. return NULL;
  41. p = haystack;
  42. /* Last position at which the needle could start. */
  43. last_possible_start = haystack + hlen - nlen;
  44. first = *(const char*)needle;
  45. while ((p = memchr(p, first, last_possible_start + 1 - p))) {
  46. if (fast_memeq(p, needle, nlen))
  47. return p;
  48. if (++p > last_possible_start) {
  49. /* This comparison shouldn't be necessary, since if p was previously
  50. * equal to last_possible_start, the next memchr call would be
  51. * "memchr(p, first, 0)", which will return NULL. But it clarifies the
  52. * logic. */
  53. return NULL;
  54. }
  55. }
  56. return NULL;
  57. #endif /* defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2) */
  58. }
  59. const void *
  60. tor_memstr(const void *haystack, size_t hlen, const char *needle)
  61. {
  62. return tor_memmem(haystack, hlen, needle, strlen(needle));
  63. }
  64. /** Return true iff the 'len' bytes at 'mem' are all zero. */
  65. int
  66. fast_mem_is_zero(const char *mem, size_t len)
  67. {
  68. static const char ZERO[] = {
  69. 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  70. };
  71. while (len >= sizeof(ZERO)) {
  72. /* It's safe to use fast_memcmp here, since the very worst thing an
  73. * attacker could learn is how many initial bytes of a secret were zero */
  74. if (fast_memcmp(mem, ZERO, sizeof(ZERO)))
  75. return 0;
  76. len -= sizeof(ZERO);
  77. mem += sizeof(ZERO);
  78. }
  79. /* Deal with leftover bytes. */
  80. if (len)
  81. return fast_memeq(mem, ZERO, len);
  82. return 1;
  83. }
  84. /** Return true iff the DIGEST_LEN bytes in digest are all zero. */
  85. int
  86. tor_digest_is_zero(const char *digest)
  87. {
  88. return safe_mem_is_zero(digest, DIGEST_LEN);
  89. }
  90. /** Return true iff the DIGEST256_LEN bytes in digest are all zero. */
  91. int
  92. tor_digest256_is_zero(const char *digest)
  93. {
  94. return safe_mem_is_zero(digest, DIGEST256_LEN);
  95. }
  96. /** Remove from the string <b>s</b> every character which appears in
  97. * <b>strip</b>. */
  98. void
  99. tor_strstrip(char *s, const char *strip)
  100. {
  101. char *readp = s;
  102. while (*readp) {
  103. if (strchr(strip, *readp)) {
  104. ++readp;
  105. } else {
  106. *s++ = *readp++;
  107. }
  108. }
  109. *s = '\0';
  110. }
  111. /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
  112. * lowercase. */
  113. void
  114. tor_strlower(char *s)
  115. {
  116. while (*s) {
  117. *s = TOR_TOLOWER(*s);
  118. ++s;
  119. }
  120. }
  121. /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
  122. * lowercase. */
  123. void
  124. tor_strupper(char *s)
  125. {
  126. while (*s) {
  127. *s = TOR_TOUPPER(*s);
  128. ++s;
  129. }
  130. }
  131. /** Return 1 if every character in <b>s</b> is printable, else return 0.
  132. */
  133. int
  134. tor_strisprint(const char *s)
  135. {
  136. while (*s) {
  137. if (!TOR_ISPRINT(*s))
  138. return 0;
  139. s++;
  140. }
  141. return 1;
  142. }
  143. /** Return 1 if no character in <b>s</b> is uppercase, else return 0.
  144. */
  145. int
  146. tor_strisnonupper(const char *s)
  147. {
  148. while (*s) {
  149. if (TOR_ISUPPER(*s))
  150. return 0;
  151. s++;
  152. }
  153. return 1;
  154. }
  155. /** Return true iff every character in <b>s</b> is whitespace space; else
  156. * return false. */
  157. int
  158. tor_strisspace(const char *s)
  159. {
  160. while (*s) {
  161. if (!TOR_ISSPACE(*s))
  162. return 0;
  163. s++;
  164. }
  165. return 1;
  166. }
  167. /** As strcmp, except that either string may be NULL. The NULL string is
  168. * considered to be before any non-NULL string. */
  169. int
  170. strcmp_opt(const char *s1, const char *s2)
  171. {
  172. if (!s1) {
  173. if (!s2)
  174. return 0;
  175. else
  176. return -1;
  177. } else if (!s2) {
  178. return 1;
  179. } else {
  180. return strcmp(s1, s2);
  181. }
  182. }
  183. /** Compares the first strlen(s2) characters of s1 with s2. Returns as for
  184. * strcmp.
  185. */
  186. int
  187. strcmpstart(const char *s1, const char *s2)
  188. {
  189. size_t n = strlen(s2);
  190. return strncmp(s1, s2, n);
  191. }
  192. /** Compares the first strlen(s2) characters of s1 with s2. Returns as for
  193. * strcasecmp.
  194. */
  195. int
  196. strcasecmpstart(const char *s1, const char *s2)
  197. {
  198. size_t n = strlen(s2);
  199. return strncasecmp(s1, s2, n);
  200. }
  201. /** Compare the value of the string <b>prefix</b> with the start of the
  202. * <b>memlen</b>-byte memory chunk at <b>mem</b>. Return as for strcmp.
  203. *
  204. * [As fast_memcmp(mem, prefix, strlen(prefix)) but returns -1 if memlen is
  205. * less than strlen(prefix).]
  206. */
  207. int
  208. fast_memcmpstart(const void *mem, size_t memlen,
  209. const char *prefix)
  210. {
  211. size_t plen = strlen(prefix);
  212. if (memlen < plen)
  213. return -1;
  214. return fast_memcmp(mem, prefix, plen);
  215. }
  216. /** Compares the last strlen(s2) characters of s1 with s2. Returns as for
  217. * strcmp.
  218. */
  219. int
  220. strcmpend(const char *s1, const char *s2)
  221. {
  222. size_t n1 = strlen(s1), n2 = strlen(s2);
  223. if (n2>n1)
  224. return strcmp(s1,s2);
  225. else
  226. return strncmp(s1+(n1-n2), s2, n2);
  227. }
  228. /** Compares the last strlen(s2) characters of s1 with s2. Returns as for
  229. * strcasecmp.
  230. */
  231. int
  232. strcasecmpend(const char *s1, const char *s2)
  233. {
  234. size_t n1 = strlen(s1), n2 = strlen(s2);
  235. if (n2>n1) /* then they can't be the same; figure out which is bigger */
  236. return strcasecmp(s1,s2);
  237. else
  238. return strncasecmp(s1+(n1-n2), s2, n2);
  239. }
  240. /** Return a pointer to the first char of s that is not whitespace and
  241. * not a comment, or to the terminating NUL if no such character exists.
  242. */
  243. const char *
  244. eat_whitespace(const char *s)
  245. {
  246. raw_assert(s);
  247. while (1) {
  248. switch (*s) {
  249. case '\0':
  250. default:
  251. return s;
  252. case ' ':
  253. case '\t':
  254. case '\n':
  255. case '\r':
  256. ++s;
  257. break;
  258. case '#':
  259. ++s;
  260. while (*s && *s != '\n')
  261. ++s;
  262. }
  263. }
  264. }
  265. /** Return a pointer to the first char of s that is not whitespace and
  266. * not a comment, or to the terminating NUL if no such character exists.
  267. */
  268. const char *
  269. eat_whitespace_eos(const char *s, const char *eos)
  270. {
  271. raw_assert(s);
  272. raw_assert(eos && s <= eos);
  273. while (s < eos) {
  274. switch (*s) {
  275. case '\0':
  276. default:
  277. return s;
  278. case ' ':
  279. case '\t':
  280. case '\n':
  281. case '\r':
  282. ++s;
  283. break;
  284. case '#':
  285. ++s;
  286. while (s < eos && *s && *s != '\n')
  287. ++s;
  288. }
  289. }
  290. return s;
  291. }
  292. /** Return a pointer to the first char of s that is not a space or a tab
  293. * or a \\r, or to the terminating NUL if no such character exists. */
  294. const char *
  295. eat_whitespace_no_nl(const char *s)
  296. {
  297. while (*s == ' ' || *s == '\t' || *s == '\r')
  298. ++s;
  299. return s;
  300. }
  301. /** As eat_whitespace_no_nl, but stop at <b>eos</b> whether we have
  302. * found a non-whitespace character or not. */
  303. const char *
  304. eat_whitespace_eos_no_nl(const char *s, const char *eos)
  305. {
  306. while (s < eos && (*s == ' ' || *s == '\t' || *s == '\r'))
  307. ++s;
  308. return s;
  309. }
  310. /** Return a pointer to the first char of s that is whitespace or <b>#</b>,
  311. * or to the terminating NUL if no such character exists.
  312. */
  313. const char *
  314. find_whitespace(const char *s)
  315. {
  316. /* tor_assert(s); */
  317. while (1) {
  318. switch (*s)
  319. {
  320. case '\0':
  321. case '#':
  322. case ' ':
  323. case '\r':
  324. case '\n':
  325. case '\t':
  326. return s;
  327. default:
  328. ++s;
  329. }
  330. }
  331. }
  332. /** As find_whitespace, but stop at <b>eos</b> whether we have found a
  333. * whitespace or not. */
  334. const char *
  335. find_whitespace_eos(const char *s, const char *eos)
  336. {
  337. /* tor_assert(s); */
  338. while (s < eos) {
  339. switch (*s)
  340. {
  341. case '\0':
  342. case '#':
  343. case ' ':
  344. case '\r':
  345. case '\n':
  346. case '\t':
  347. return s;
  348. default:
  349. ++s;
  350. }
  351. }
  352. return s;
  353. }
  354. /** Return the first occurrence of <b>needle</b> in <b>haystack</b> that
  355. * occurs at the start of a line (that is, at the beginning of <b>haystack</b>
  356. * or immediately after a newline). Return NULL if no such string is found.
  357. */
  358. const char *
  359. find_str_at_start_of_line(const char *haystack, const char *needle)
  360. {
  361. size_t needle_len = strlen(needle);
  362. do {
  363. if (!strncmp(haystack, needle, needle_len))
  364. return haystack;
  365. haystack = strchr(haystack, '\n');
  366. if (!haystack)
  367. return NULL;
  368. else
  369. ++haystack;
  370. } while (*haystack);
  371. return NULL;
  372. }
  373. /** Returns true if <b>string</b> could be a C identifier.
  374. A C identifier must begin with a letter or an underscore and the
  375. rest of its characters can be letters, numbers or underscores. No
  376. length limit is imposed. */
  377. int
  378. string_is_C_identifier(const char *string)
  379. {
  380. size_t iter;
  381. size_t length = strlen(string);
  382. if (!length)
  383. return 0;
  384. for (iter = 0; iter < length ; iter++) {
  385. if (iter == 0) {
  386. if (!(TOR_ISALPHA(string[iter]) ||
  387. string[iter] == '_'))
  388. return 0;
  389. } else {
  390. if (!(TOR_ISALPHA(string[iter]) ||
  391. TOR_ISDIGIT(string[iter]) ||
  392. string[iter] == '_'))
  393. return 0;
  394. }
  395. }
  396. return 1;
  397. }
  398. /** A byte with the top <b>x</b> bits set. */
  399. #define TOP_BITS(x) ((uint8_t)(0xFF << (8 - (x))))
  400. /** A byte with the lowest <b>x</b> bits set. */
  401. #define LOW_BITS(x) ((uint8_t)(0xFF >> (8 - (x))))
  402. /** Given the leading byte <b>b</b>, return the total number of bytes in the
  403. * UTF-8 character. Returns 0 if it's an invalid leading byte.
  404. */
  405. static uint8_t
  406. bytes_in_char(uint8_t b)
  407. {
  408. if ((TOP_BITS(1) & b) == 0x00)
  409. return 1; // a 1-byte UTF-8 char, aka ASCII
  410. if ((TOP_BITS(3) & b) == TOP_BITS(2))
  411. return 2; // a 2-byte UTF-8 char
  412. if ((TOP_BITS(4) & b) == TOP_BITS(3))
  413. return 3; // a 3-byte UTF-8 char
  414. if ((TOP_BITS(5) & b) == TOP_BITS(4))
  415. return 4; // a 4-byte UTF-8 char
  416. // Invalid: either the top 2 bits are 10, or the top 5 bits are 11111.
  417. return 0;
  418. }
  419. /** Returns true iff <b>b</b> is a UTF-8 continuation byte. */
  420. static bool
  421. is_continuation_byte(uint8_t b)
  422. {
  423. uint8_t top2bits = b & TOP_BITS(2);
  424. return top2bits == TOP_BITS(1);
  425. }
  426. /** Returns true iff the <b>len</b> bytes in <b>c</b> are a valid UTF-8
  427. * character.
  428. */
  429. static bool
  430. validate_char(const uint8_t *c, uint8_t len)
  431. {
  432. if (len == 1)
  433. return true; // already validated this is an ASCII char earlier.
  434. uint8_t mask = LOW_BITS(7 - len); // bitmask for the leading byte.
  435. uint32_t codepoint = c[0] & mask;
  436. mask = LOW_BITS(6); // bitmask for continuation bytes.
  437. for (uint8_t i = 1; i < len; i++) {
  438. if (!is_continuation_byte(c[i]))
  439. return false;
  440. codepoint <<= 6;
  441. codepoint |= (c[i] & mask);
  442. }
  443. if (len == 2 && codepoint <= 0x7f)
  444. return false; // Invalid, overly long encoding, should have fit in 1 byte.
  445. if (len == 3 && codepoint <= 0x7ff)
  446. return false; // Invalid, overly long encoding, should have fit in 2 bytes.
  447. if (len == 4 && codepoint <= 0xffff)
  448. return false; // Invalid, overly long encoding, should have fit in 3 bytes.
  449. if (codepoint >= 0xd800 && codepoint <= 0xdfff)
  450. return false; // Invalid, reserved for UTF-16 surrogate pairs.
  451. return codepoint <= 0x10ffff; // Check if within maximum.
  452. }
  453. /** Returns true iff the first <b>len</b> bytes in <b>str</b> are a
  454. valid UTF-8 string. */
  455. int
  456. string_is_utf8(const char *str, size_t len)
  457. {
  458. for (size_t i = 0; i < len;) {
  459. uint8_t num_bytes = bytes_in_char(str[i]);
  460. if (num_bytes == 0) // Invalid leading byte found.
  461. return false;
  462. size_t next_char = i + num_bytes;
  463. if (next_char > len)
  464. return false;
  465. // Validate the continuation bytes in this multi-byte character,
  466. // and advance to the next character in the string.
  467. if (!validate_char((const uint8_t*)&str[i], num_bytes))
  468. return false;
  469. i = next_char;
  470. }
  471. return true;
  472. }
  473. /** As string_is_utf8(), but returns false if the string begins with a UTF-8
  474. * byte order mark (BOM).
  475. */
  476. int
  477. string_is_utf8_no_bom(const char *str, size_t len)
  478. {
  479. if (len >= 3 && (!strcmpstart(str, "\uFEFF") ||
  480. !strcmpstart(str, "\uFFFE"))) {
  481. return false;
  482. }
  483. return string_is_utf8(str, len);
  484. }