regex.cpp 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
  1. //===-------------------------- regex.cpp ---------------------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is dual licensed under the MIT and the University of Illinois Open
  6. // Source Licenses. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. #include <__config>
  10. #if !defined(_LIBCPP_SGX_CONFIG)
  11. #include "regex"
  12. #include "algorithm"
  13. #include "iterator"
  14. _LIBCPP_BEGIN_NAMESPACE_STD
  15. static
  16. const char*
  17. make_error_type_string(regex_constants::error_type ecode)
  18. {
  19. switch (ecode)
  20. {
  21. case regex_constants::error_collate:
  22. return "The expression contained an invalid collating element name.";
  23. case regex_constants::error_ctype:
  24. return "The expression contained an invalid character class name.";
  25. case regex_constants::error_escape:
  26. return "The expression contained an invalid escaped character, or a "
  27. "trailing escape.";
  28. case regex_constants::error_backref:
  29. return "The expression contained an invalid back reference.";
  30. case regex_constants::error_brack:
  31. return "The expression contained mismatched [ and ].";
  32. case regex_constants::error_paren:
  33. return "The expression contained mismatched ( and ).";
  34. case regex_constants::error_brace:
  35. return "The expression contained mismatched { and }.";
  36. case regex_constants::error_badbrace:
  37. return "The expression contained an invalid range in a {} expression.";
  38. case regex_constants::error_range:
  39. return "The expression contained an invalid character range, "
  40. "such as [b-a] in most encodings.";
  41. case regex_constants::error_space:
  42. return "There was insufficient memory to convert the expression into "
  43. "a finite state machine.";
  44. case regex_constants::error_badrepeat:
  45. return "One of *?+{ was not preceded by a valid regular expression.";
  46. case regex_constants::error_complexity:
  47. return "The complexity of an attempted match against a regular "
  48. "expression exceeded a pre-set level.";
  49. case regex_constants::error_stack:
  50. return "There was insufficient memory to determine whether the regular "
  51. "expression could match the specified character sequence.";
  52. case regex_constants::__re_err_grammar:
  53. return "An invalid regex grammar has been requested.";
  54. case regex_constants::__re_err_empty:
  55. return "An empty regex is not allowed in the POSIX grammar.";
  56. default:
  57. break;
  58. }
  59. return "Unknown error type";
  60. }
  61. regex_error::regex_error(regex_constants::error_type ecode)
  62. : runtime_error(make_error_type_string(ecode)),
  63. __code_(ecode)
  64. {}
  65. regex_error::~regex_error() throw() {}
  66. namespace {
  // One row of the collating-element table: a symbolic name and the single
  // character that the name stands for.
  struct collationnames
  {
      const char* elem_;  // symbolic name, NUL-terminated
      char char_;         // character value associated with the name
  };

  // Table of recognised collating-element names.
  //
  // The rows must stay in ascending strcmp() order of elem_, because
  // __get_collation_name() locates entries with lower_bound().  Several names
  // are aliases for the same character value (for example "period" and
  // "full-stop" both map to 0x2e).
  const collationnames collatenames[] =
  {
      {"A", 0x41},
      {"B", 0x42},
      {"C", 0x43},
      {"D", 0x44},
      {"E", 0x45},
      {"F", 0x46},
      {"G", 0x47},
      {"H", 0x48},
      {"I", 0x49},
      {"J", 0x4a},
      {"K", 0x4b},
      {"L", 0x4c},
      {"M", 0x4d},
      {"N", 0x4e},
      {"NUL", 0x00},
      {"O", 0x4f},
      {"P", 0x50},
      {"Q", 0x51},
      {"R", 0x52},
      {"S", 0x53},
      {"T", 0x54},
      {"U", 0x55},
      {"V", 0x56},
      {"W", 0x57},
      {"X", 0x58},
      {"Y", 0x59},
      {"Z", 0x5a},
      {"a", 0x61},
      {"alert", 0x07},
      {"ampersand", 0x26},
      {"apostrophe", 0x27},
      {"asterisk", 0x2a},
      {"b", 0x62},
      {"backslash", 0x5c},
      {"backspace", 0x08},
      {"c", 0x63},
      {"carriage-return", 0x0d},
      {"circumflex", 0x5e},
      {"circumflex-accent", 0x5e},
      {"colon", 0x3a},
      {"comma", 0x2c},
      {"commercial-at", 0x40},
      {"d", 0x64},
      {"dollar-sign", 0x24},
      {"e", 0x65},
      {"eight", 0x38},
      {"equals-sign", 0x3d},
      {"exclamation-mark", 0x21},
      {"f", 0x66},
      {"five", 0x35},
      {"form-feed", 0x0c},
      {"four", 0x34},
      {"full-stop", 0x2e},
      {"g", 0x67},
      {"grave-accent", 0x60},
      {"greater-than-sign", 0x3e},
      {"h", 0x68},
      {"hyphen", 0x2d},
      {"hyphen-minus", 0x2d},
      {"i", 0x69},
      {"j", 0x6a},
      {"k", 0x6b},
      {"l", 0x6c},
      {"left-brace", 0x7b},
      {"left-curly-bracket", 0x7b},
      {"left-parenthesis", 0x28},
      {"left-square-bracket", 0x5b},
      {"less-than-sign", 0x3c},
      {"low-line", 0x5f},
      {"m", 0x6d},
      {"n", 0x6e},
      {"newline", 0x0a},
      {"nine", 0x39},
      {"number-sign", 0x23},
      {"o", 0x6f},
      {"one", 0x31},
      {"p", 0x70},
      {"percent-sign", 0x25},
      {"period", 0x2e},
      {"plus-sign", 0x2b},
      {"q", 0x71},
      {"question-mark", 0x3f},
      {"quotation-mark", 0x22},
      {"r", 0x72},
      {"reverse-solidus", 0x5c},
      {"right-brace", 0x7d},
      {"right-curly-bracket", 0x7d},
      {"right-parenthesis", 0x29},
      {"right-square-bracket", 0x5d},
      {"s", 0x73},
      {"semicolon", 0x3b},
      {"seven", 0x37},
      {"six", 0x36},
      {"slash", 0x2f},
      {"solidus", 0x2f},
      {"space", 0x20},
      {"t", 0x74},
      {"tab", 0x09},
      {"three", 0x33},
      {"tilde", 0x7e},
      {"two", 0x32},
      {"u", 0x75},
      {"underscore", 0x5f},
      {"v", 0x76},
      {"vertical-line", 0x7c},
      {"vertical-tab", 0x0b},
      {"w", 0x77},
      {"x", 0x78},
      {"y", 0x79},
      {"z", 0x7a},
      {"zero", 0x30}
  };
  186. struct classnames
  187. {
  188. const char* elem_;
  189. regex_traits<char>::char_class_type mask_;
  190. };
  191. const classnames ClassNames[] =
  192. {
  193. {"alnum", ctype_base::alnum},
  194. {"alpha", ctype_base::alpha},
  195. {"blank", ctype_base::blank},
  196. {"cntrl", ctype_base::cntrl},
  197. {"d", ctype_base::digit},
  198. {"digit", ctype_base::digit},
  199. {"graph", ctype_base::graph},
  200. {"lower", ctype_base::lower},
  201. {"print", ctype_base::print},
  202. {"punct", ctype_base::punct},
  203. {"s", ctype_base::space},
  204. {"space", ctype_base::space},
  205. {"upper", ctype_base::upper},
  206. {"w", regex_traits<char>::__regex_word},
  207. {"xdigit", ctype_base::xdigit}
  208. };
  209. struct use_strcmp
  210. {
  211. bool operator()(const collationnames& x, const char* y)
  212. {return strcmp(x.elem_, y) < 0;}
  213. bool operator()(const classnames& x, const char* y)
  214. {return strcmp(x.elem_, y) < 0;}
  215. };
  216. }
  217. string
  218. __get_collation_name(const char* s)
  219. {
  220. const collationnames* i =
  221. _VSTD::lower_bound(begin(collatenames), end(collatenames), s, use_strcmp());
  222. string r;
  223. if (i != end(collatenames) && strcmp(s, i->elem_) == 0)
  224. r = char(i->char_);
  225. return r;
  226. }
  227. regex_traits<char>::char_class_type
  228. __get_classname(const char* s, bool __icase)
  229. {
  230. const classnames* i =
  231. _VSTD::lower_bound(begin(ClassNames), end(ClassNames), s, use_strcmp());
  232. regex_traits<char>::char_class_type r = 0;
  233. if (i != end(ClassNames) && strcmp(s, i->elem_) == 0)
  234. {
  235. r = i->mask_;
  236. if (r == regex_traits<char>::__regex_word)
  237. r |= ctype_base::alnum | ctype_base::upper | ctype_base::lower;
  238. else if (__icase)
  239. {
  240. if (r & (ctype_base::lower | ctype_base::upper))
  241. r |= ctype_base::alpha;
  242. }
  243. }
  244. return r;
  245. }
  246. template <>
  247. void
  248. __match_any_but_newline<char>::__exec(__state& __s) const
  249. {
  250. if (__s.__current_ != __s.__last_)
  251. {
  252. switch (*__s.__current_)
  253. {
  254. case '\r':
  255. case '\n':
  256. __s.__do_ = __state::__reject;
  257. __s.__node_ = nullptr;
  258. break;
  259. default:
  260. __s.__do_ = __state::__accept_and_consume;
  261. ++__s.__current_;
  262. __s.__node_ = this->first();
  263. break;
  264. }
  265. }
  266. else
  267. {
  268. __s.__do_ = __state::__reject;
  269. __s.__node_ = nullptr;
  270. }
  271. }
  272. template <>
  273. void
  274. __match_any_but_newline<wchar_t>::__exec(__state& __s) const
  275. {
  276. if (__s.__current_ != __s.__last_)
  277. {
  278. switch (*__s.__current_)
  279. {
  280. case '\r':
  281. case '\n':
  282. case 0x2028:
  283. case 0x2029:
  284. __s.__do_ = __state::__reject;
  285. __s.__node_ = nullptr;
  286. break;
  287. default:
  288. __s.__do_ = __state::__accept_and_consume;
  289. ++__s.__current_;
  290. __s.__node_ = this->first();
  291. break;
  292. }
  293. }
  294. else
  295. {
  296. __s.__do_ = __state::__reject;
  297. __s.__node_ = nullptr;
  298. }
  299. }
  300. _LIBCPP_END_NAMESPACE_STD
  301. #endif // !defined(_LIBCPP_SGX_CONFIG)