util_string.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541
  1. /* Copyright (c) 2003-2004, Roger Dingledine
  2. * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson.
  3. * Copyright (c) 2007-2019, The Tor Project, Inc. */
  4. /* See LICENSE for licensing information */
  5. /**
  6. * \file util_string.c
  7. * \brief Non-standard string functions used throughout Tor.
  8. **/
  9. #include "lib/string/util_string.h"
  10. #include "lib/string/compat_ctype.h"
  11. #include "lib/err/torerr.h"
  12. #include "lib/ctime/di_ops.h"
  13. #include "lib/defs/digest_sizes.h"
  14. #include <string.h>
  15. #include <stdlib.h>
  16. /** Given <b>hlen</b> bytes at <b>haystack</b> and <b>nlen</b> bytes at
  17. * <b>needle</b>, return a pointer to the first occurrence of the needle
  18. * within the haystack, or NULL if there is no such occurrence.
  19. *
  20. * This function is <em>not</em> timing-safe.
  21. *
  22. * Requires that <b>nlen</b> be greater than zero.
  23. */
  24. const void *
  25. tor_memmem(const void *_haystack, size_t hlen,
  26. const void *_needle, size_t nlen)
  27. {
  28. #if defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2)
  29. raw_assert(nlen);
  30. return memmem(_haystack, hlen, _needle, nlen);
  31. #else
  32. /* This isn't as fast as the GLIBC implementation, but it doesn't need to
  33. * be. */
  34. const char *p, *last_possible_start;
  35. const char *haystack = (const char*)_haystack;
  36. const char *needle = (const char*)_needle;
  37. char first;
  38. raw_assert(nlen);
  39. if (nlen > hlen)
  40. return NULL;
  41. p = haystack;
  42. /* Last position at which the needle could start. */
  43. last_possible_start = haystack + hlen - nlen;
  44. first = *(const char*)needle;
  45. while ((p = memchr(p, first, last_possible_start + 1 - p))) {
  46. if (fast_memeq(p, needle, nlen))
  47. return p;
  48. if (++p > last_possible_start) {
  49. /* This comparison shouldn't be necessary, since if p was previously
  50. * equal to last_possible_start, the next memchr call would be
  51. * "memchr(p, first, 0)", which will return NULL. But it clarifies the
  52. * logic. */
  53. return NULL;
  54. }
  55. }
  56. return NULL;
  57. #endif /* defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2) */
  58. }
  59. const void *
  60. tor_memstr(const void *haystack, size_t hlen, const char *needle)
  61. {
  62. return tor_memmem(haystack, hlen, needle, strlen(needle));
  63. }
  64. /** Return true iff the 'len' bytes at 'mem' are all zero. */
  65. int
  66. tor_mem_is_zero(const char *mem, size_t len)
  67. {
  68. static const char ZERO[] = {
  69. 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  70. };
  71. while (len >= sizeof(ZERO)) {
  72. /* It's safe to use fast_memcmp here, since the very worst thing an
  73. * attacker could learn is how many initial bytes of a secret were zero */
  74. if (fast_memcmp(mem, ZERO, sizeof(ZERO)))
  75. return 0;
  76. len -= sizeof(ZERO);
  77. mem += sizeof(ZERO);
  78. }
  79. /* Deal with leftover bytes. */
  80. if (len)
  81. return fast_memeq(mem, ZERO, len);
  82. return 1;
  83. }
  84. /** Return true iff the DIGEST_LEN bytes in digest are all zero. */
  85. int
  86. tor_digest_is_zero(const char *digest)
  87. {
  88. static const uint8_t ZERO_DIGEST[] = {
  89. 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
  90. };
  91. return tor_memeq(digest, ZERO_DIGEST, DIGEST_LEN);
  92. }
  93. /** Return true iff the DIGEST256_LEN bytes in digest are all zero. */
  94. int
  95. tor_digest256_is_zero(const char *digest)
  96. {
  97. return tor_mem_is_zero(digest, DIGEST256_LEN);
  98. }
  99. /** Remove from the string <b>s</b> every character which appears in
  100. * <b>strip</b>. */
  101. void
  102. tor_strstrip(char *s, const char *strip)
  103. {
  104. char *readp = s;
  105. while (*readp) {
  106. if (strchr(strip, *readp)) {
  107. ++readp;
  108. } else {
  109. *s++ = *readp++;
  110. }
  111. }
  112. *s = '\0';
  113. }
  114. /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
  115. * lowercase. */
  116. void
  117. tor_strlower(char *s)
  118. {
  119. while (*s) {
  120. *s = TOR_TOLOWER(*s);
  121. ++s;
  122. }
  123. }
  124. /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
  125. * lowercase. */
  126. void
  127. tor_strupper(char *s)
  128. {
  129. while (*s) {
  130. *s = TOR_TOUPPER(*s);
  131. ++s;
  132. }
  133. }
  134. /** Return 1 if every character in <b>s</b> is printable, else return 0.
  135. */
  136. int
  137. tor_strisprint(const char *s)
  138. {
  139. while (*s) {
  140. if (!TOR_ISPRINT(*s))
  141. return 0;
  142. s++;
  143. }
  144. return 1;
  145. }
  146. /** Return 1 if no character in <b>s</b> is uppercase, else return 0.
  147. */
  148. int
  149. tor_strisnonupper(const char *s)
  150. {
  151. while (*s) {
  152. if (TOR_ISUPPER(*s))
  153. return 0;
  154. s++;
  155. }
  156. return 1;
  157. }
  158. /** Return true iff every character in <b>s</b> is whitespace space; else
  159. * return false. */
  160. int
  161. tor_strisspace(const char *s)
  162. {
  163. while (*s) {
  164. if (!TOR_ISSPACE(*s))
  165. return 0;
  166. s++;
  167. }
  168. return 1;
  169. }
  170. /** As strcmp, except that either string may be NULL. The NULL string is
  171. * considered to be before any non-NULL string. */
  172. int
  173. strcmp_opt(const char *s1, const char *s2)
  174. {
  175. if (!s1) {
  176. if (!s2)
  177. return 0;
  178. else
  179. return -1;
  180. } else if (!s2) {
  181. return 1;
  182. } else {
  183. return strcmp(s1, s2);
  184. }
  185. }
  186. /** Compares the first strlen(s2) characters of s1 with s2. Returns as for
  187. * strcmp.
  188. */
  189. int
  190. strcmpstart(const char *s1, const char *s2)
  191. {
  192. size_t n = strlen(s2);
  193. return strncmp(s1, s2, n);
  194. }
  195. /** Compares the first strlen(s2) characters of s1 with s2. Returns as for
  196. * strcasecmp.
  197. */
  198. int
  199. strcasecmpstart(const char *s1, const char *s2)
  200. {
  201. size_t n = strlen(s2);
  202. return strncasecmp(s1, s2, n);
  203. }
  204. /** Compare the value of the string <b>prefix</b> with the start of the
  205. * <b>memlen</b>-byte memory chunk at <b>mem</b>. Return as for strcmp.
  206. *
  207. * [As fast_memcmp(mem, prefix, strlen(prefix)) but returns -1 if memlen is
  208. * less than strlen(prefix).]
  209. */
  210. int
  211. fast_memcmpstart(const void *mem, size_t memlen,
  212. const char *prefix)
  213. {
  214. size_t plen = strlen(prefix);
  215. if (memlen < plen)
  216. return -1;
  217. return fast_memcmp(mem, prefix, plen);
  218. }
  219. /** Compares the last strlen(s2) characters of s1 with s2. Returns as for
  220. * strcmp.
  221. */
  222. int
  223. strcmpend(const char *s1, const char *s2)
  224. {
  225. size_t n1 = strlen(s1), n2 = strlen(s2);
  226. if (n2>n1)
  227. return strcmp(s1,s2);
  228. else
  229. return strncmp(s1+(n1-n2), s2, n2);
  230. }
  231. /** Compares the last strlen(s2) characters of s1 with s2. Returns as for
  232. * strcasecmp.
  233. */
  234. int
  235. strcasecmpend(const char *s1, const char *s2)
  236. {
  237. size_t n1 = strlen(s1), n2 = strlen(s2);
  238. if (n2>n1) /* then they can't be the same; figure out which is bigger */
  239. return strcasecmp(s1,s2);
  240. else
  241. return strncasecmp(s1+(n1-n2), s2, n2);
  242. }
  243. /** Return a pointer to the first char of s that is not whitespace and
  244. * not a comment, or to the terminating NUL if no such character exists.
  245. */
  246. const char *
  247. eat_whitespace(const char *s)
  248. {
  249. raw_assert(s);
  250. while (1) {
  251. switch (*s) {
  252. case '\0':
  253. default:
  254. return s;
  255. case ' ':
  256. case '\t':
  257. case '\n':
  258. case '\r':
  259. ++s;
  260. break;
  261. case '#':
  262. ++s;
  263. while (*s && *s != '\n')
  264. ++s;
  265. }
  266. }
  267. }
  268. /** Return a pointer to the first char of s that is not whitespace and
  269. * not a comment, or to the terminating NUL if no such character exists.
  270. */
  271. const char *
  272. eat_whitespace_eos(const char *s, const char *eos)
  273. {
  274. raw_assert(s);
  275. raw_assert(eos && s <= eos);
  276. while (s < eos) {
  277. switch (*s) {
  278. case '\0':
  279. default:
  280. return s;
  281. case ' ':
  282. case '\t':
  283. case '\n':
  284. case '\r':
  285. ++s;
  286. break;
  287. case '#':
  288. ++s;
  289. while (s < eos && *s && *s != '\n')
  290. ++s;
  291. }
  292. }
  293. return s;
  294. }
  295. /** Return a pointer to the first char of s that is not a space or a tab
  296. * or a \\r, or to the terminating NUL if no such character exists. */
  297. const char *
  298. eat_whitespace_no_nl(const char *s)
  299. {
  300. while (*s == ' ' || *s == '\t' || *s == '\r')
  301. ++s;
  302. return s;
  303. }
  304. /** As eat_whitespace_no_nl, but stop at <b>eos</b> whether we have
  305. * found a non-whitespace character or not. */
  306. const char *
  307. eat_whitespace_eos_no_nl(const char *s, const char *eos)
  308. {
  309. while (s < eos && (*s == ' ' || *s == '\t' || *s == '\r'))
  310. ++s;
  311. return s;
  312. }
  313. /** Return a pointer to the first char of s that is whitespace or <b>#</b>,
  314. * or to the terminating NUL if no such character exists.
  315. */
  316. const char *
  317. find_whitespace(const char *s)
  318. {
  319. /* tor_assert(s); */
  320. while (1) {
  321. switch (*s)
  322. {
  323. case '\0':
  324. case '#':
  325. case ' ':
  326. case '\r':
  327. case '\n':
  328. case '\t':
  329. return s;
  330. default:
  331. ++s;
  332. }
  333. }
  334. }
  335. /** As find_whitespace, but stop at <b>eos</b> whether we have found a
  336. * whitespace or not. */
  337. const char *
  338. find_whitespace_eos(const char *s, const char *eos)
  339. {
  340. /* tor_assert(s); */
  341. while (s < eos) {
  342. switch (*s)
  343. {
  344. case '\0':
  345. case '#':
  346. case ' ':
  347. case '\r':
  348. case '\n':
  349. case '\t':
  350. return s;
  351. default:
  352. ++s;
  353. }
  354. }
  355. return s;
  356. }
  357. /** Return the first occurrence of <b>needle</b> in <b>haystack</b> that
  358. * occurs at the start of a line (that is, at the beginning of <b>haystack</b>
  359. * or immediately after a newline). Return NULL if no such string is found.
  360. */
  361. const char *
  362. find_str_at_start_of_line(const char *haystack, const char *needle)
  363. {
  364. size_t needle_len = strlen(needle);
  365. do {
  366. if (!strncmp(haystack, needle, needle_len))
  367. return haystack;
  368. haystack = strchr(haystack, '\n');
  369. if (!haystack)
  370. return NULL;
  371. else
  372. ++haystack;
  373. } while (*haystack);
  374. return NULL;
  375. }
  376. /** Returns true if <b>string</b> could be a C identifier.
  377. A C identifier must begin with a letter or an underscore and the
  378. rest of its characters can be letters, numbers or underscores. No
  379. length limit is imposed. */
  380. int
  381. string_is_C_identifier(const char *string)
  382. {
  383. size_t iter;
  384. size_t length = strlen(string);
  385. if (!length)
  386. return 0;
  387. for (iter = 0; iter < length ; iter++) {
  388. if (iter == 0) {
  389. if (!(TOR_ISALPHA(string[iter]) ||
  390. string[iter] == '_'))
  391. return 0;
  392. } else {
  393. if (!(TOR_ISALPHA(string[iter]) ||
  394. TOR_ISDIGIT(string[iter]) ||
  395. string[iter] == '_'))
  396. return 0;
  397. }
  398. }
  399. return 1;
  400. }
  401. /** A byte with the top <b>x</b> bits set. */
  402. #define TOP_BITS(x) ((uint8_t)(0xFF << (8 - (x))))
  403. /** A byte with the lowest <b>x</b> bits set. */
  404. #define LOW_BITS(x) ((uint8_t)(0xFF >> (8 - (x))))
  405. /** Given the leading byte <b>b</b>, return the total number of bytes in the
  406. * UTF-8 character. Returns 0 if it's an invalid leading byte.
  407. */
  408. static uint8_t
  409. bytes_in_char(uint8_t b)
  410. {
  411. if ((TOP_BITS(1) & b) == 0x00)
  412. return 1; // a 1-byte UTF-8 char, aka ASCII
  413. if ((TOP_BITS(3) & b) == TOP_BITS(2))
  414. return 2; // a 2-byte UTF-8 char
  415. if ((TOP_BITS(4) & b) == TOP_BITS(3))
  416. return 3; // a 3-byte UTF-8 char
  417. if ((TOP_BITS(5) & b) == TOP_BITS(4))
  418. return 4; // a 4-byte UTF-8 char
  419. // Invalid: either the top 2 bits are 10, or the top 5 bits are 11111.
  420. return 0;
  421. }
  422. /** Returns true iff <b>b</b> is a UTF-8 continuation byte. */
  423. static bool
  424. is_continuation_byte(uint8_t b)
  425. {
  426. uint8_t top2bits = b & TOP_BITS(2);
  427. return top2bits == TOP_BITS(1);
  428. }
  429. /** Returns true iff the <b>len</b> bytes in <b>c</b> are a valid UTF-8
  430. * character.
  431. */
  432. static bool
  433. validate_char(const uint8_t *c, uint8_t len)
  434. {
  435. if (len == 1)
  436. return true; // already validated this is an ASCII char earlier.
  437. uint8_t mask = LOW_BITS(7 - len); // bitmask for the leading byte.
  438. uint32_t codepoint = c[0] & mask;
  439. mask = LOW_BITS(6); // bitmask for continuation bytes.
  440. for (uint8_t i = 1; i < len; i++) {
  441. if (!is_continuation_byte(c[i]))
  442. return false;
  443. codepoint <<= 6;
  444. codepoint |= (c[i] & mask);
  445. }
  446. if (len == 2 && codepoint <= 0x7f)
  447. return false; // Invalid, overly long encoding, should have fit in 1 byte.
  448. if (len == 3 && codepoint <= 0x7ff)
  449. return false; // Invalid, overly long encoding, should have fit in 2 bytes.
  450. if (len == 4 && codepoint <= 0xffff)
  451. return false; // Invalid, overly long encoding, should have fit in 3 bytes.
  452. if (codepoint >= 0xd800 && codepoint <= 0xdfff)
  453. return false; // Invalid, reserved for UTF-16 surrogate pairs.
  454. return codepoint <= 0x10ffff; // Check if within maximum.
  455. }
  456. /** Returns true iff the first <b>len</b> bytes in <b>str</b> are a
  457. valid UTF-8 string. */
  458. int
  459. string_is_utf8(const char *str, size_t len)
  460. {
  461. for (size_t i = 0; i < len;) {
  462. uint8_t num_bytes = bytes_in_char(str[i]);
  463. if (num_bytes == 0) // Invalid leading byte found.
  464. return false;
  465. size_t next_char = i + num_bytes;
  466. if (next_char > len)
  467. return false;
  468. // Validate the continuation bytes in this multi-byte character,
  469. // and advance to the next character in the string.
  470. if (!validate_char((const uint8_t*)&str[i], num_bytes))
  471. return false;
  472. i = next_char;
  473. }
  474. return true;
  475. }
  476. /** As string_is_utf8(), but returns false if the string begins with a UTF-8
  477. * byte order mark (BOM).
  478. */
  479. int
  480. string_is_utf8_no_bom(const char *str, size_t len)
  481. {
  482. if (len >= 3 && (!strcmpstart(str, "\uFEFF") ||
  483. !strcmpstart(str, "\uFFFE"))) {
  484. return false;
  485. }
  486. return string_is_utf8(str, len);
  487. }