pcretest.c 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225
  1. /*************************************************
  2. * PCRE testing program *
  3. *************************************************/
  4. #include <ctype.h>
  5. #include <stdio.h>
  6. #include <string.h>
  7. #include <stdlib.h>
  8. #include <time.h>
  9. #include <locale.h>
  10. /* Use the internal info for displaying the results of pcre_study(). */
  11. #include "internal.h"
  12. /* It is possible to compile this test program without including support for
  13. testing the POSIX interface, though this is not available via the standard
  14. Makefile. */
  15. #if !defined NOPOSIX
  16. #include "pcreposix.h"
  17. #endif
  18. #ifndef CLOCKS_PER_SEC
  19. #ifdef CLK_TCK
  20. #define CLOCKS_PER_SEC CLK_TCK
  21. #else
  22. #define CLOCKS_PER_SEC 100
  23. #endif
  24. #endif
  25. #define LOOPREPEAT 20000
  26. static FILE *outfile;
  27. static int log_store = 0;
  28. static size_t gotten_store;
  29. static int utf8_table1[] = {
  30. 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
  31. static int utf8_table2[] = {
  32. 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
  33. static int utf8_table3[] = {
  34. 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
  35. /*************************************************
  36. * Convert character value to UTF-8 *
  37. *************************************************/
  38. /* This function takes an integer value in the range 0 - 0x7fffffff
  39. and encodes it as a UTF-8 character in 0 to 6 bytes.
  40. Arguments:
  41. cvalue the character value
  42. buffer pointer to buffer for result - at least 6 bytes long
  43. Returns: number of characters placed in the buffer
  44. -1 if input character is negative
  45. 0 if input character is positive but too big (only when
  46. int is longer than 32 bits)
  47. */
  48. static int
  49. ord2utf8(int cvalue, unsigned char *buffer)
  50. {
  51. register int i, j;
  52. for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
  53. if (cvalue <= utf8_table1[i]) break;
  54. if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
  55. if (cvalue < 0) return -1;
  56. *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
  57. cvalue >>= 6 - i;
  58. for (j = 0; j < i; j++)
  59. {
  60. *buffer++ = 0x80 | (cvalue & 0x3f);
  61. cvalue >>= 6;
  62. }
  63. return i + 1;
  64. }
  65. /*************************************************
  66. * Convert UTF-8 string to value *
  67. *************************************************/
  68. /* This function takes one or more bytes that represents a UTF-8 character,
  69. and returns the value of the character.
  70. Argument:
  71. buffer a pointer to the byte vector
  72. vptr a pointer to an int to receive the value
  73. Returns: > 0 => the number of bytes consumed
  74. -6 to 0 => malformed UTF-8 character at offset = (-return)
  75. */
  76. int
  77. utf82ord(unsigned char *buffer, int *vptr)
  78. {
  79. int c = *buffer++;
  80. int d = c;
  81. int i, j, s;
  82. for (i = -1; i < 6; i++) /* i is number of additional bytes */
  83. {
  84. if ((d & 0x80) == 0) break;
  85. d <<= 1;
  86. }
  87. if (i == -1) { *vptr = c; return 1; } /* ascii character */
  88. if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
  89. /* i now has a value in the range 1-5 */
  90. d = c & utf8_table3[i];
  91. s = 6 - i;
  92. for (j = 0; j < i; j++)
  93. {
  94. c = *buffer++;
  95. if ((c & 0xc0) != 0x80) return -(j+1);
  96. d |= (c & 0x3f) << s;
  97. s += 6;
  98. }
  99. /* Check that encoding was the correct unique one */
  100. for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
  101. if (d <= utf8_table1[j]) break;
  102. if (j != i) return -(i+1);
  103. /* Valid value */
  104. *vptr = d;
  105. return i+1;
  106. }
  107. /* Debugging function to print the internal form of the regex. This is the same
  108. code as contained in pcre.c under the DEBUG macro. */
  109. static const char *OP_names[] = {
  110. "End", "\\A", "\\B", "\\b", "\\D", "\\d",
  111. "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
  112. "Opt", "^", "$", "Any", "chars", "not",
  113. "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  114. "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  115. "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  116. "*", "*?", "+", "+?", "?", "??", "{", "{",
  117. "class", "Ref", "Recurse",
  118. "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
  119. "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
  120. "Brazero", "Braminzero", "Bra"
  121. };
  122. static void print_internals(pcre *re)
  123. {
  124. unsigned char *code = ((real_pcre *)re)->code;
  125. fprintf(outfile, "------------------------------------------------------------------\n");
  126. for(;;)
  127. {
  128. int c;
  129. int charlength;
  130. fprintf(outfile, "%3d ", (int)(code - ((real_pcre *)re)->code));
  131. if (*code >= OP_BRA)
  132. {
  133. fprintf(outfile, "%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
  134. code += 2;
  135. }
  136. else switch(*code)
  137. {
  138. case OP_END:
  139. fprintf(outfile, " %s\n", OP_names[*code]);
  140. fprintf(outfile, "------------------------------------------------------------------\n");
  141. return;
  142. case OP_OPT:
  143. fprintf(outfile, " %.2x %s", code[1], OP_names[*code]);
  144. code++;
  145. break;
  146. case OP_COND:
  147. fprintf(outfile, "%3d Cond", (code[1] << 8) + code[2]);
  148. code += 2;
  149. break;
  150. case OP_CREF:
  151. fprintf(outfile, " %.2d %s", code[1], OP_names[*code]);
  152. code++;
  153. break;
  154. case OP_CHARS:
  155. charlength = *(++code);
  156. fprintf(outfile, "%3d ", charlength);
  157. while (charlength-- > 0)
  158. if (isprint(c = *(++code))) fprintf(outfile, "%c", c);
  159. else fprintf(outfile, "\\x%02x", c);
  160. break;
  161. case OP_KETRMAX:
  162. case OP_KETRMIN:
  163. case OP_ALT:
  164. case OP_KET:
  165. case OP_ASSERT:
  166. case OP_ASSERT_NOT:
  167. case OP_ASSERTBACK:
  168. case OP_ASSERTBACK_NOT:
  169. case OP_ONCE:
  170. fprintf(outfile, "%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
  171. code += 2;
  172. break;
  173. case OP_REVERSE:
  174. fprintf(outfile, "%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
  175. code += 2;
  176. break;
  177. case OP_STAR:
  178. case OP_MINSTAR:
  179. case OP_PLUS:
  180. case OP_MINPLUS:
  181. case OP_QUERY:
  182. case OP_MINQUERY:
  183. case OP_TYPESTAR:
  184. case OP_TYPEMINSTAR:
  185. case OP_TYPEPLUS:
  186. case OP_TYPEMINPLUS:
  187. case OP_TYPEQUERY:
  188. case OP_TYPEMINQUERY:
  189. if (*code >= OP_TYPESTAR)
  190. fprintf(outfile, " %s", OP_names[code[1]]);
  191. else if (isprint(c = code[1])) fprintf(outfile, " %c", c);
  192. else fprintf(outfile, " \\x%02x", c);
  193. fprintf(outfile, "%s", OP_names[*code++]);
  194. break;
  195. case OP_EXACT:
  196. case OP_UPTO:
  197. case OP_MINUPTO:
  198. if (isprint(c = code[3])) fprintf(outfile, " %c{", c);
  199. else fprintf(outfile, " \\x%02x{", c);
  200. if (*code != OP_EXACT) fprintf(outfile, ",");
  201. fprintf(outfile, "%d}", (code[1] << 8) + code[2]);
  202. if (*code == OP_MINUPTO) fprintf(outfile, "?");
  203. code += 3;
  204. break;
  205. case OP_TYPEEXACT:
  206. case OP_TYPEUPTO:
  207. case OP_TYPEMINUPTO:
  208. fprintf(outfile, " %s{", OP_names[code[3]]);
  209. if (*code != OP_TYPEEXACT) fprintf(outfile, "0,");
  210. fprintf(outfile, "%d}", (code[1] << 8) + code[2]);
  211. if (*code == OP_TYPEMINUPTO) fprintf(outfile, "?");
  212. code += 3;
  213. break;
  214. case OP_NOT:
  215. if (isprint(c = *(++code))) fprintf(outfile, " [^%c]", c);
  216. else fprintf(outfile, " [^\\x%02x]", c);
  217. break;
  218. case OP_NOTSTAR:
  219. case OP_NOTMINSTAR:
  220. case OP_NOTPLUS:
  221. case OP_NOTMINPLUS:
  222. case OP_NOTQUERY:
  223. case OP_NOTMINQUERY:
  224. if (isprint(c = code[1])) fprintf(outfile, " [^%c]", c);
  225. else fprintf(outfile, " [^\\x%02x]", c);
  226. fprintf(outfile, "%s", OP_names[*code++]);
  227. break;
  228. case OP_NOTEXACT:
  229. case OP_NOTUPTO:
  230. case OP_NOTMINUPTO:
  231. if (isprint(c = code[3])) fprintf(outfile, " [^%c]{", c);
  232. else fprintf(outfile, " [^\\x%02x]{", c);
  233. if (*code != OP_NOTEXACT) fprintf(outfile, ",");
  234. fprintf(outfile, "%d}", (code[1] << 8) + code[2]);
  235. if (*code == OP_NOTMINUPTO) fprintf(outfile, "?");
  236. code += 3;
  237. break;
  238. case OP_REF:
  239. fprintf(outfile, " \\%d", *(++code));
  240. code++;
  241. goto CLASS_REF_REPEAT;
  242. case OP_CLASS:
  243. {
  244. int i, min, max;
  245. code++;
  246. fprintf(outfile, " [");
  247. for (i = 0; i < 256; i++)
  248. {
  249. if ((code[i/8] & (1 << (i&7))) != 0)
  250. {
  251. int j;
  252. for (j = i+1; j < 256; j++)
  253. if ((code[j/8] & (1 << (j&7))) == 0) break;
  254. if (i == '-' || i == ']') fprintf(outfile, "\\");
  255. if (isprint(i)) fprintf(outfile, "%c", i); else fprintf(outfile, "\\x%02x", i);
  256. if (--j > i)
  257. {
  258. fprintf(outfile, "-");
  259. if (j == '-' || j == ']') fprintf(outfile, "\\");
  260. if (isprint(j)) fprintf(outfile, "%c", j); else fprintf(outfile, "\\x%02x", j);
  261. }
  262. i = j;
  263. }
  264. }
  265. fprintf(outfile, "]");
  266. code += 32;
  267. CLASS_REF_REPEAT:
  268. switch(*code)
  269. {
  270. case OP_CRSTAR:
  271. case OP_CRMINSTAR:
  272. case OP_CRPLUS:
  273. case OP_CRMINPLUS:
  274. case OP_CRQUERY:
  275. case OP_CRMINQUERY:
  276. fprintf(outfile, "%s", OP_names[*code]);
  277. break;
  278. case OP_CRRANGE:
  279. case OP_CRMINRANGE:
  280. min = (code[1] << 8) + code[2];
  281. max = (code[3] << 8) + code[4];
  282. if (max == 0) fprintf(outfile, "{%d,}", min);
  283. else fprintf(outfile, "{%d,%d}", min, max);
  284. if (*code == OP_CRMINRANGE) fprintf(outfile, "?");
  285. code += 4;
  286. break;
  287. default:
  288. code--;
  289. }
  290. }
  291. break;
  292. /* Anything else is just a one-node item */
  293. default:
  294. fprintf(outfile, " %s", OP_names[*code]);
  295. break;
  296. }
  297. code++;
  298. fprintf(outfile, "\n");
  299. }
  300. }
  301. /* Character string printing function. A "normal" and a UTF-8 version. */
  302. static void pchars(unsigned char *p, int length, int utf8)
  303. {
  304. int c;
  305. while (length-- > 0)
  306. {
  307. if (utf8)
  308. {
  309. int rc = utf82ord(p, &c);
  310. if (rc > 0)
  311. {
  312. length -= rc - 1;
  313. p += rc;
  314. if (c < 256 && isprint(c)) fprintf(outfile, "%c", c);
  315. else fprintf(outfile, "\\x{%02x}", c);
  316. continue;
  317. }
  318. }
  319. /* Not UTF-8, or malformed UTF-8 */
  320. if (isprint(c = *(p++))) fprintf(outfile, "%c", c);
  321. else fprintf(outfile, "\\x%02x", c);
  322. }
  323. }
  324. /* Alternative malloc function, to test functionality and show the size of the
  325. compiled re. */
  326. static void *new_malloc(size_t size)
  327. {
  328. gotten_store = size;
  329. if (log_store)
  330. fprintf(outfile, "Memory allocation (code space): %d\n",
  331. (int)((int)size - offsetof(real_pcre, code[0])));
  332. return malloc(size);
  333. }
  334. /* Get one piece of information from the pcre_fullinfo() function */
  335. static void new_info(pcre *re, pcre_extra *study, int option, void *ptr)
  336. {
  337. int rc;
  338. if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
  339. fprintf(outfile, "Error %d from pcre_fullinfo(%d)\n", rc, option);
  340. }
  341. /* Read lines from named file or stdin and write to named file or stdout; lines
  342. consist of a regular expression, in delimiters and optionally followed by
  343. options, followed by a set of test data, terminated by an empty line. */
  344. int main(int argc, char **argv)
  345. {
  346. FILE *infile = stdin;
  347. int options = 0;
  348. int study_options = 0;
  349. int op = 1;
  350. int timeit = 0;
  351. int showinfo = 0;
  352. int showstore = 0;
  353. int posix = 0;
  354. int debug = 0;
  355. int done = 0;
  356. unsigned char buffer[30000];
  357. unsigned char dbuffer[1024];
  358. /* Static so that new_malloc can use it. */
  359. outfile = stdout;
  360. /* Scan options */
  361. while (argc > 1 && argv[op][0] == '-')
  362. {
  363. if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
  364. showstore = 1;
  365. else if (strcmp(argv[op], "-t") == 0) timeit = 1;
  366. else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
  367. else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
  368. else if (strcmp(argv[op], "-p") == 0) posix = 1;
  369. else
  370. {
  371. printf("*** Unknown option %s\n", argv[op]);
  372. printf("Usage: pcretest [-d] [-i] [-p] [-s] [-t] [<input> [<output>]]\n");
  373. printf(" -d debug: show compiled code; implies -i\n"
  374. " -i show information about compiled pattern\n"
  375. " -p use POSIX interface\n"
  376. " -s output store information\n"
  377. " -t time compilation and execution\n");
  378. return 1;
  379. }
  380. op++;
  381. argc--;
  382. }
  383. /* Sort out the input and output files */
  384. if (argc > 1)
  385. {
  386. infile = fopen(argv[op], "r");
  387. if (infile == NULL)
  388. {
  389. printf("** Failed to open %s\n", argv[op]);
  390. return 1;
  391. }
  392. }
  393. if (argc > 2)
  394. {
  395. outfile = fopen(argv[op+1], "w");
  396. if (outfile == NULL)
  397. {
  398. printf("** Failed to open %s\n", argv[op+1]);
  399. return 1;
  400. }
  401. }
  402. /* Set alternative malloc function */
  403. pcre_malloc = new_malloc;
  404. /* Heading line, then prompt for first regex if stdin */
  405. fprintf(outfile, "PCRE version %s\n\n", pcre_version());
  406. /* Main loop */
  407. while (!done)
  408. {
  409. pcre *re = NULL;
  410. pcre_extra *extra = NULL;
  411. #if !defined NOPOSIX /* There are still compilers that require no indent */
  412. regex_t preg;
  413. int do_posix = 0;
  414. #endif
  415. const char *error;
  416. unsigned char *p, *pp, *ppp;
  417. unsigned const char *tables = NULL;
  418. int do_study = 0;
  419. int do_debug = debug;
  420. int do_G = 0;
  421. int do_g = 0;
  422. int do_showinfo = showinfo;
  423. int do_showrest = 0;
  424. int utf8 = 0;
  425. int erroroffset, len, delimiter;
  426. if (infile == stdin) printf(" re> ");
  427. if (fgets((char *)buffer, sizeof(buffer), infile) == NULL) break;
  428. if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
  429. p = buffer;
  430. while (isspace(*p)) p++;
  431. if (*p == 0) continue;
  432. /* Get the delimiter and seek the end of the pattern; if it isn't
  433. complete, read more. */
  434. delimiter = *p++;
  435. if (isalnum(delimiter) || delimiter == '\\')
  436. {
  437. fprintf(outfile, "** Delimiter must not be alphameric or \\\n");
  438. goto SKIP_DATA;
  439. }
  440. pp = p;
  441. for(;;)
  442. {
  443. while (*pp != 0)
  444. {
  445. if (*pp == '\\' && pp[1] != 0) pp++;
  446. else if (*pp == delimiter) break;
  447. pp++;
  448. }
  449. if (*pp != 0) break;
  450. len = sizeof(buffer) - (pp - buffer);
  451. if (len < 256)
  452. {
  453. fprintf(outfile, "** Expression too long - missing delimiter?\n");
  454. goto SKIP_DATA;
  455. }
  456. if (infile == stdin) printf(" > ");
  457. if (fgets((char *)pp, len, infile) == NULL)
  458. {
  459. fprintf(outfile, "** Unexpected EOF\n");
  460. done = 1;
  461. goto CONTINUE;
  462. }
  463. if (infile != stdin) fprintf(outfile, "%s", (char *)pp);
  464. }
  465. /* If the first character after the delimiter is backslash, make
  466. the pattern end with backslash. This is purely to provide a way
  467. of testing for the error message when a pattern ends with backslash. */
  468. if (pp[1] == '\\') *pp++ = '\\';
  469. /* Terminate the pattern at the delimiter */
  470. *pp++ = 0;
  471. /* Look for options after final delimiter */
  472. options = 0;
  473. study_options = 0;
  474. log_store = showstore; /* default from command line */
  475. while (*pp != 0)
  476. {
  477. switch (*pp++)
  478. {
  479. case 'g': do_g = 1; break;
  480. case 'i': options |= PCRE_CASELESS; break;
  481. case 'm': options |= PCRE_MULTILINE; break;
  482. case 's': options |= PCRE_DOTALL; break;
  483. case 'x': options |= PCRE_EXTENDED; break;
  484. case '+': do_showrest = 1; break;
  485. case 'A': options |= PCRE_ANCHORED; break;
  486. case 'D': do_debug = do_showinfo = 1; break;
  487. case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
  488. case 'G': do_G = 1; break;
  489. case 'I': do_showinfo = 1; break;
  490. case 'M': log_store = 1; break;
  491. #if !defined NOPOSIX
  492. case 'P': do_posix = 1; break;
  493. #endif
  494. case 'S': do_study = 1; break;
  495. case 'U': options |= PCRE_UNGREEDY; break;
  496. case 'X': options |= PCRE_EXTRA; break;
  497. case '8': options |= PCRE_UTF8; utf8 = 1; break;
  498. case 'L':
  499. ppp = pp;
  500. while (*ppp != '\n' && *ppp != ' ') ppp++;
  501. *ppp = 0;
  502. if (setlocale(LC_CTYPE, (const char *)pp) == NULL)
  503. {
  504. fprintf(outfile, "** Failed to set locale \"%s\"\n", pp);
  505. goto SKIP_DATA;
  506. }
  507. tables = pcre_maketables();
  508. pp = ppp;
  509. break;
  510. case '\n': case ' ': break;
  511. default:
  512. fprintf(outfile, "** Unknown option '%c'\n", pp[-1]);
  513. goto SKIP_DATA;
  514. }
  515. }
  516. /* Handle compiling via the POSIX interface, which doesn't support the
  517. timing, showing, or debugging options, nor the ability to pass over
  518. local character tables. */
  519. #if !defined NOPOSIX
  520. if (posix || do_posix)
  521. {
  522. int rc;
  523. int cflags = 0;
  524. if ((options & PCRE_CASELESS) != 0) cflags |= REG_ICASE;
  525. if ((options & PCRE_MULTILINE) != 0) cflags |= REG_NEWLINE;
  526. rc = regcomp(&preg, (char *)p, cflags);
  527. /* Compilation failed; go back for another re, skipping to blank line
  528. if non-interactive. */
  529. if (rc != 0)
  530. {
  531. (void)regerror(rc, &preg, (char *)buffer, sizeof(buffer));
  532. fprintf(outfile, "Failed: POSIX code %d: %s\n", rc, buffer);
  533. goto SKIP_DATA;
  534. }
  535. }
  536. /* Handle compiling via the native interface */
  537. else
  538. #endif /* !defined NOPOSIX */
  539. {
  540. if (timeit)
  541. {
  542. register int i;
  543. clock_t time_taken;
  544. clock_t start_time = clock();
  545. for (i = 0; i < LOOPREPEAT; i++)
  546. {
  547. re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
  548. if (re != NULL) free(re);
  549. }
  550. time_taken = clock() - start_time;
  551. fprintf(outfile, "Compile time %.3f milliseconds\n",
  552. ((double)time_taken * 1000.0) /
  553. ((double)LOOPREPEAT * (double)CLOCKS_PER_SEC));
  554. }
  555. re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
  556. /* Compilation failed; go back for another re, skipping to blank line
  557. if non-interactive. */
  558. if (re == NULL)
  559. {
  560. fprintf(outfile, "Failed: %s at offset %d\n", error, erroroffset);
  561. SKIP_DATA:
  562. if (infile != stdin)
  563. {
  564. for (;;)
  565. {
  566. if (fgets((char *)buffer, sizeof(buffer), infile) == NULL)
  567. {
  568. done = 1;
  569. goto CONTINUE;
  570. }
  571. len = (int)strlen((char *)buffer);
  572. while (len > 0 && isspace(buffer[len-1])) len--;
  573. if (len == 0) break;
  574. }
  575. fprintf(outfile, "\n");
  576. }
  577. goto CONTINUE;
  578. }
  579. /* Compilation succeeded; print data if required. There are now two
  580. info-returning functions. The old one has a limited interface and
  581. returns only limited data. Check that it agrees with the newer one. */
  582. if (do_showinfo)
  583. {
  584. int old_first_char, old_options, old_count;
  585. int count, backrefmax, first_char, need_char;
  586. size_t size;
  587. if (do_debug) print_internals(re);
  588. new_info(re, NULL, PCRE_INFO_OPTIONS, &options);
  589. new_info(re, NULL, PCRE_INFO_SIZE, &size);
  590. new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
  591. new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax);
  592. new_info(re, NULL, PCRE_INFO_FIRSTCHAR, &first_char);
  593. new_info(re, NULL, PCRE_INFO_LASTLITERAL, &need_char);
  594. old_count = pcre_info(re, &old_options, &old_first_char);
  595. if (count < 0) fprintf(outfile,
  596. "Error %d from pcre_info()\n", count);
  597. else
  598. {
  599. if (old_count != count) fprintf(outfile,
  600. "Count disagreement: pcre_fullinfo=%d pcre_info=%d\n", count,
  601. old_count);
  602. if (old_first_char != first_char) fprintf(outfile,
  603. "First char disagreement: pcre_fullinfo=%d pcre_info=%d\n",
  604. first_char, old_first_char);
  605. if (old_options != options) fprintf(outfile,
  606. "Options disagreement: pcre_fullinfo=%d pcre_info=%d\n", options,
  607. old_options);
  608. }
  609. if (size != gotten_store) fprintf(outfile,
  610. "Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
  611. size, gotten_store);
  612. fprintf(outfile, "Capturing subpattern count = %d\n", count);
  613. if (backrefmax > 0)
  614. fprintf(outfile, "Max back reference = %d\n", backrefmax);
  615. if (options == 0) fprintf(outfile, "No options\n");
  616. else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s\n",
  617. ((options & PCRE_ANCHORED) != 0)? " anchored" : "",
  618. ((options & PCRE_CASELESS) != 0)? " caseless" : "",
  619. ((options & PCRE_EXTENDED) != 0)? " extended" : "",
  620. ((options & PCRE_MULTILINE) != 0)? " multiline" : "",
  621. ((options & PCRE_DOTALL) != 0)? " dotall" : "",
  622. ((options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
  623. ((options & PCRE_EXTRA) != 0)? " extra" : "",
  624. ((options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
  625. ((options & PCRE_UTF8) != 0)? " utf8" : "");
  626. if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
  627. fprintf(outfile, "Case state changes\n");
  628. if (first_char == -1)
  629. {
  630. fprintf(outfile, "First char at start or follows \\n\n");
  631. }
  632. else if (first_char < 0)
  633. {
  634. fprintf(outfile, "No first char\n");
  635. }
  636. else
  637. {
  638. if (isprint(first_char))
  639. fprintf(outfile, "First char = \'%c\'\n", first_char);
  640. else
  641. fprintf(outfile, "First char = %d\n", first_char);
  642. }
  643. if (need_char < 0)
  644. {
  645. fprintf(outfile, "No need char\n");
  646. }
  647. else
  648. {
  649. if (isprint(need_char))
  650. fprintf(outfile, "Need char = \'%c\'\n", need_char);
  651. else
  652. fprintf(outfile, "Need char = %d\n", need_char);
  653. }
  654. }
  655. /* If /S was present, study the regexp to generate additional info to
  656. help with the matching. */
  657. if (do_study)
  658. {
  659. if (timeit)
  660. {
  661. register int i;
  662. clock_t time_taken;
  663. clock_t start_time = clock();
  664. for (i = 0; i < LOOPREPEAT; i++)
  665. extra = pcre_study(re, study_options, &error);
  666. time_taken = clock() - start_time;
  667. if (extra != NULL) free(extra);
  668. fprintf(outfile, " Study time %.3f milliseconds\n",
  669. ((double)time_taken * 1000.0)/
  670. ((double)LOOPREPEAT * (double)CLOCKS_PER_SEC));
  671. }
  672. extra = pcre_study(re, study_options, &error);
  673. if (error != NULL)
  674. fprintf(outfile, "Failed to study: %s\n", error);
  675. else if (extra == NULL)
  676. fprintf(outfile, "Study returned NULL\n");
  677. else if (do_showinfo)
  678. {
  679. uschar *start_bits = NULL;
  680. new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
  681. if (start_bits == NULL)
  682. fprintf(outfile, "No starting character set\n");
  683. else
  684. {
  685. int i;
  686. int c = 24;
  687. fprintf(outfile, "Starting character set: ");
  688. for (i = 0; i < 256; i++)
  689. {
  690. if ((start_bits[i/8] & (1<<(i%8))) != 0)
  691. {
  692. if (c > 75)
  693. {
  694. fprintf(outfile, "\n ");
  695. c = 2;
  696. }
  697. if (isprint(i) && i != ' ')
  698. {
  699. fprintf(outfile, "%c ", i);
  700. c += 2;
  701. }
  702. else
  703. {
  704. fprintf(outfile, "\\x%02x ", i);
  705. c += 5;
  706. }
  707. }
  708. }
  709. fprintf(outfile, "\n");
  710. }
  711. }
  712. }
  713. }
  714. /* Read data lines and test them */
  715. for (;;)
  716. {
  717. unsigned char *q;
  718. unsigned char *bptr = dbuffer;
  719. int count, c;
  720. int copystrings = 0;
  721. int getstrings = 0;
  722. int getlist = 0;
  723. int gmatched = 0;
  724. int start_offset = 0;
  725. int g_notempty = 0;
  726. int offsets[45];
  727. int size_offsets = sizeof(offsets)/sizeof(int);
  728. options = 0;
  729. if (infile == stdin) printf("data> ");
  730. if (fgets((char *)buffer, sizeof(buffer), infile) == NULL)
  731. {
  732. done = 1;
  733. goto CONTINUE;
  734. }
  735. if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
  736. len = (int)strlen((char *)buffer);
  737. while (len > 0 && isspace(buffer[len-1])) len--;
  738. buffer[len] = 0;
  739. if (len == 0) break;
  740. p = buffer;
  741. while (isspace(*p)) p++;
  742. q = dbuffer;
  743. while ((c = *p++) != 0)
  744. {
  745. int i = 0;
  746. int n = 0;
  747. if (c == '\\') switch ((c = *p++))
  748. {
  749. case 'a': c = 7; break;
  750. case 'b': c = '\b'; break;
  751. case 'e': c = 27; break;
  752. case 'f': c = '\f'; break;
  753. case 'n': c = '\n'; break;
  754. case 'r': c = '\r'; break;
  755. case 't': c = '\t'; break;
  756. case 'v': c = '\v'; break;
  757. case '0': case '1': case '2': case '3':
  758. case '4': case '5': case '6': case '7':
  759. c -= '0';
  760. while (i++ < 2 && isdigit(*p) && *p != '8' && *p != '9')
  761. c = c * 8 + *p++ - '0';
  762. break;
  763. case 'x':
  764. /* Handle \x{..} specially - new Perl thing for utf8 */
  765. if (*p == '{')
  766. {
  767. unsigned char *pt = p;
  768. c = 0;
  769. while (isxdigit(*(++pt)))
  770. c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'W');
  771. if (*pt == '}')
  772. {
  773. unsigned char buffer[8];
  774. int ii, utn;
  775. utn = ord2utf8(c, buffer);
  776. for (ii = 0; ii < utn - 1; ii++) *q++ = buffer[ii];
  777. c = buffer[ii]; /* Last byte */
  778. p = pt + 1;
  779. break;
  780. }
  781. /* Not correct form; fall through */
  782. }
  783. /* Ordinary \x */
  784. c = 0;
  785. while (i++ < 2 && isxdigit(*p))
  786. {
  787. c = c * 16 + tolower(*p) - ((isdigit(*p))? '0' : 'W');
  788. p++;
  789. }
  790. break;
  791. case 0: /* Allows for an empty line */
  792. p--;
  793. continue;
  794. case 'A': /* Option setting */
  795. options |= PCRE_ANCHORED;
  796. continue;
  797. case 'B':
  798. options |= PCRE_NOTBOL;
  799. continue;
  800. case 'C':
  801. while(isdigit(*p)) n = n * 10 + *p++ - '0';
  802. copystrings |= 1 << n;
  803. continue;
  804. case 'G':
  805. while(isdigit(*p)) n = n * 10 + *p++ - '0';
  806. getstrings |= 1 << n;
  807. continue;
  808. case 'L':
  809. getlist = 1;
  810. continue;
  811. case 'N':
  812. options |= PCRE_NOTEMPTY;
  813. continue;
  814. case 'O':
  815. while(isdigit(*p)) n = n * 10 + *p++ - '0';
  816. if (n <= (int)(sizeof(offsets)/sizeof(int))) size_offsets = n;
  817. continue;
  818. case 'Z':
  819. options |= PCRE_NOTEOL;
  820. continue;
  821. }
  822. *q++ = c;
  823. }
  824. *q = 0;
  825. len = q - dbuffer;
  826. /* Handle matching via the POSIX interface, which does not
  827. support timing. */
  828. #if !defined NOPOSIX
  829. if (posix || do_posix)
  830. {
  831. int rc;
  832. int eflags = 0;
  833. regmatch_t pmatch[sizeof(offsets)/sizeof(int)];
  834. if ((options & PCRE_NOTBOL) != 0) eflags |= REG_NOTBOL;
  835. if ((options & PCRE_NOTEOL) != 0) eflags |= REG_NOTEOL;
  836. rc = regexec(&preg, (const char *)bptr, size_offsets, pmatch, eflags);
  837. if (rc != 0)
  838. {
  839. (void)regerror(rc, &preg, (char *)buffer, sizeof(buffer));
  840. fprintf(outfile, "No match: POSIX code %d: %s\n", rc, buffer);
  841. }
  842. else
  843. {
  844. size_t i;
  845. for (i = 0; i < size_offsets; i++)
  846. {
  847. if (pmatch[i].rm_so >= 0)
  848. {
  849. fprintf(outfile, "%2d: ", (int)i);
  850. pchars(dbuffer + pmatch[i].rm_so,
  851. pmatch[i].rm_eo - pmatch[i].rm_so, utf8);
  852. fprintf(outfile, "\n");
  853. if (i == 0 && do_showrest)
  854. {
  855. fprintf(outfile, " 0+ ");
  856. pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo, utf8);
  857. fprintf(outfile, "\n");
  858. }
  859. }
  860. }
  861. }
  862. }
  863. /* Handle matching via the native interface - repeats for /g and /G */
  864. else
  865. #endif /* !defined NOPOSIX */
  866. for (;; gmatched++) /* Loop for /g or /G */
  867. {
  868. if (timeit)
  869. {
  870. register int i;
  871. clock_t time_taken;
  872. clock_t start_time = clock();
  873. for (i = 0; i < LOOPREPEAT; i++)
  874. count = pcre_exec(re, extra, (char *)bptr, len,
  875. start_offset, options | g_notempty, offsets, size_offsets);
  876. time_taken = clock() - start_time;
  877. fprintf(outfile, "Execute time %.3f milliseconds\n",
  878. ((double)time_taken * 1000.0)/
  879. ((double)LOOPREPEAT * (double)CLOCKS_PER_SEC));
  880. }
  881. count = pcre_exec(re, extra, (char *)bptr, len,
  882. start_offset, options | g_notempty, offsets, size_offsets);
  883. if (count == 0)
  884. {
  885. fprintf(outfile, "Matched, but too many substrings\n");
  886. count = size_offsets/3;
  887. }
  888. /* Matched */
  889. if (count >= 0)
  890. {
  891. int i;
  892. for (i = 0; i < count * 2; i += 2)
  893. {
  894. if (offsets[i] < 0)
  895. fprintf(outfile, "%2d: <unset>\n", i/2);
  896. else
  897. {
  898. fprintf(outfile, "%2d: ", i/2);
  899. pchars(bptr + offsets[i], offsets[i+1] - offsets[i], utf8);
  900. fprintf(outfile, "\n");
  901. if (i == 0)
  902. {
  903. if (do_showrest)
  904. {
  905. fprintf(outfile, " 0+ ");
  906. pchars(bptr + offsets[i+1], len - offsets[i+1], utf8);
  907. fprintf(outfile, "\n");
  908. }
  909. }
  910. }
  911. }
  912. for (i = 0; i < 32; i++)
  913. {
  914. if ((copystrings & (1 << i)) != 0)
  915. {
  916. char copybuffer[16];
  917. int rc = pcre_copy_substring((char *)bptr, offsets, count,
  918. i, copybuffer, sizeof(copybuffer));
  919. if (rc < 0)
  920. fprintf(outfile, "copy substring %d failed %d\n", i, rc);
  921. else
  922. fprintf(outfile, "%2dC %s (%d)\n", i, copybuffer, rc);
  923. }
  924. }
  925. for (i = 0; i < 32; i++)
  926. {
  927. if ((getstrings & (1 << i)) != 0)
  928. {
  929. const char *substring;
  930. int rc = pcre_get_substring((char *)bptr, offsets, count,
  931. i, &substring);
  932. if (rc < 0)
  933. fprintf(outfile, "get substring %d failed %d\n", i, rc);
  934. else
  935. {
  936. fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
  937. /* free((void *)substring); */
  938. pcre_free_substring(substring);
  939. }
  940. }
  941. }
  942. if (getlist)
  943. {
  944. const char **stringlist;
  945. int rc = pcre_get_substring_list((char *)bptr, offsets, count,
  946. &stringlist);
  947. if (rc < 0)
  948. fprintf(outfile, "get substring list failed %d\n", rc);
  949. else
  950. {
  951. for (i = 0; i < count; i++)
  952. fprintf(outfile, "%2dL %s\n", i, stringlist[i]);
  953. if (stringlist[i] != NULL)
  954. fprintf(outfile, "string list not terminated by NULL\n");
  955. /* free((void *)stringlist); */
  956. pcre_free_substring_list(stringlist);
  957. }
  958. }
  959. }
  960. /* Failed to match. If this is a /g or /G loop and we previously set
  961. g_notempty after a null match, this is not necessarily the end.
  962. We want to advance the start offset, and continue. Fudge the offset
  963. values to achieve this. We won't be at the end of the string - that
  964. was checked before setting g_notempty. */
  965. else
  966. {
  967. if (g_notempty != 0)
  968. {
  969. offsets[0] = start_offset;
  970. offsets[1] = start_offset + 1;
  971. }
  972. else
  973. {
  974. if (gmatched == 0) /* Error if no previous matches */
  975. {
  976. if (count == -1) fprintf(outfile, "No match\n");
  977. else fprintf(outfile, "Error %d\n", count);
  978. }
  979. break; /* Out of the /g loop */
  980. }
  981. }
  982. /* If not /g or /G we are done */
  983. if (!do_g && !do_G) break;
  984. /* If we have matched an empty string, first check to see if we are at
  985. the end of the subject. If so, the /g loop is over. Otherwise, mimic
  986. what Perl's /g option does. This turns out to be rather cunning. First
  987. we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the
  988. same point. If this fails (picked up above) we advance to the next
  989. character. */
  990. g_notempty = 0;
  991. if (offsets[0] == offsets[1])
  992. {
  993. if (offsets[0] == len) break;
  994. g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;
  995. }
  996. /* For /g, update the start offset, leaving the rest alone */
  997. if (do_g) start_offset = offsets[1];
  998. /* For /G, update the pointer and length */
  999. else
  1000. {
  1001. bptr += offsets[1];
  1002. len -= offsets[1];
  1003. }
  1004. } /* End of loop for /g and /G */
  1005. } /* End of loop for data lines */
  1006. CONTINUE:
  1007. #if !defined NOPOSIX
  1008. if (posix || do_posix) regfree(&preg);
  1009. #endif
  1010. if (re != NULL) free(re);
  1011. if (extra != NULL) free(extra);
  1012. if (tables != NULL)
  1013. {
  1014. free((void *)tables);
  1015. setlocale(LC_CTYPE, "C");
  1016. }
  1017. }
  1018. fprintf(outfile, "\n");
  1019. return 0;
  1020. }
  1021. /* End */