pcrs.c 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236
  1. /*********************************************************************
  2. *
  3. * File : $Source: /cvsroot/ijbswa/current/pcrs.c,v $
  4. *
  5. * Purpose : pcrs is a supplement to the pcre library by Philip Hazel
  6. * <ph10@cam.ac.uk> and adds Perl-style substitution. That
  7. * is, it mimics Perl's 's' operator. See pcrs(3) for details.
  8. *
  9. * WARNING: This file contains additional functions and bug
  10. * fixes that aren't part of the latest official pcrs package
  11. * (which apparently is no longer maintained).
  12. *
  13. * Copyright : Written and Copyright (C) 2000, 2001 by Andreas S. Oesterhelt
  14. * <andreas@oesterhelt.org>
  15. *
  16. * Copyright (C) 2006, 2007 Fabian Keil <fk@fabiankeil.de>
  17. *
  18. * This program is free software; you can redistribute it
  19. * and/or modify it under the terms of the GNU General
  20. * Public License as published by the Free Software
  21. * Foundation; either version 2 of the License, or (at
  22. * your option) any later version.
  23. *
  24. * This program is distributed in the hope that it will
  25. * be useful, but WITHOUT ANY WARRANTY; without even the
  26. * implied warranty of MERCHANTABILITY or FITNESS FOR A
  27. * PARTICULAR PURPOSE. See the GNU General Public
  28. * License for more details.
  29. *
  30. * The GNU General Public License should be included with
  31. * this file. If not, you can view it at
  32. * http://www.gnu.org/copyleft/gpl.html
  33. * or write to the Free Software Foundation, Inc., 59
  34. * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  35. *
  36. *********************************************************************/
  37. #include <string.h>
  38. #include <ctype.h>
  39. #include <assert.h>
  40. /*
  41. * Include project.h just so that the right pcre.h gets
  42. * included from there
  43. */
  44. #include "project.h"
  45. /* For snprintf only */
  46. #include "miscutil.h"
  47. /* For xtoi */
  48. #include "encode.h"
  49. #include "pcrs.h"
  50. /*
  51. * Internal prototypes
  52. */
  53. static int pcrs_parse_perl_options(const char *optstring, int *flags);
  54. static pcrs_substitute *pcrs_compile_replacement(const char *replacement, int trivialflag,
  55. int capturecount, int *errptr);
  56. static int is_hex_sequence(const char *sequence);
  57. /*********************************************************************
  58. *
  59. * Function : pcrs_strerror
  60. *
  61. * Description : Return a string describing a given error code.
  62. *
  63. * Parameters :
  64. * 1 : error = the error code
  65. *
  66. * Returns : char * to the descriptive string
  67. *
  68. *********************************************************************/
  69. const char *pcrs_strerror(const int error)
  70. {
  71. static char buf[100];
  72. if (error != 0)
  73. {
  74. switch (error)
  75. {
  76. /* Passed-through PCRE error: */
  77. case PCRE_ERROR_NOMEMORY: return "(pcre:) No memory";
  78. /* Shouldn't happen unless PCRE or PCRS bug, or user messed with compiled job: */
  79. case PCRE_ERROR_NULL: return "(pcre:) NULL code or subject or ovector";
  80. case PCRE_ERROR_BADOPTION: return "(pcre:) Unrecognized option bit";
  81. case PCRE_ERROR_BADMAGIC: return "(pcre:) Bad magic number in code";
  82. case PCRE_ERROR_UNKNOWN_NODE: return "(pcre:) Bad node in pattern";
  83. /* Can't happen / not passed: */
  84. case PCRE_ERROR_NOSUBSTRING: return "(pcre:) Fire in power supply";
  85. case PCRE_ERROR_NOMATCH: return "(pcre:) Water in power supply";
  86. #ifdef PCRE_ERROR_MATCHLIMIT
  87. /*
  88. * Only reported by PCRE versions newer than our own.
  89. */
  90. case PCRE_ERROR_MATCHLIMIT: return "(pcre:) Match limit reached";
  91. #endif /* def PCRE_ERROR_MATCHLIMIT */
  92. /* PCRS errors: */
  93. case PCRS_ERR_NOMEM: return "(pcrs:) No memory";
  94. case PCRS_ERR_CMDSYNTAX: return "(pcrs:) Syntax error while parsing command";
  95. case PCRS_ERR_STUDY: return "(pcrs:) PCRE error while studying the pattern";
  96. case PCRS_ERR_BADJOB: return "(pcrs:) Bad job - NULL job, pattern or substitute";
  97. case PCRS_WARN_BADREF: return "(pcrs:) Backreference out of range";
  98. case PCRS_WARN_TRUNCATION:
  99. return "(pcrs:) At least one variable was too big and has been truncated before compilation";
  100. /*
  101. * XXX: With the exception of PCRE_ERROR_MATCHLIMIT we
  102. * only catch PCRE errors that can happen with our internal
  103. * version. If Privoxy is linked against a newer
  104. * PCRE version all bets are off ...
  105. */
  106. default:
  107. snprintf(buf, sizeof(buf),
  108. "Error code %d. For details, check the pcre documentation.",
  109. error);
  110. return buf;
  111. }
  112. }
  113. /* error >= 0: No error */
  114. return "(pcrs:) Everything's just fine. Thanks for asking.";
  115. }
  116. /*********************************************************************
  117. *
  118. * Function : pcrs_parse_perl_options
  119. *
  120. * Description : This function parses a string containing the options to
  121. * Perl's s/// operator. It returns an integer that is the
  122. * pcre equivalent of the symbolic optstring.
  123. * Since pcre doesn't know about Perl's 'g' (global) or pcrs',
  124. * 'T' (trivial) options but pcrs needs them, the corresponding
  125. * flags are set if 'g'or 'T' is encountered.
  126. * Note: The 'T' and 'U' options do not conform to Perl.
  127. *
  128. * Parameters :
  129. * 1 : optstring = string with options in perl syntax
  130. * 2 : flags = see description
  131. *
  132. * Returns : option integer suitable for pcre
  133. *
  134. *********************************************************************/
  135. static int pcrs_parse_perl_options(const char *optstring, int *flags)
  136. {
  137. size_t i;
  138. int rc = 0;
  139. *flags = 0;
  140. if (NULL == optstring) return 0;
  141. for (i = 0; i < strlen(optstring); i++)
  142. {
  143. switch(optstring[i])
  144. {
  145. case 'e': break; /* ToDo ;-) */
  146. case 'g': *flags |= PCRS_GLOBAL; break;
  147. case 'i': rc |= PCRE_CASELESS; break;
  148. case 'm': rc |= PCRE_MULTILINE; break;
  149. case 'o': break;
  150. case 's': rc |= PCRE_DOTALL; break;
  151. case 'x': rc |= PCRE_EXTENDED; break;
  152. case 'D': *flags |= PCRS_DYNAMIC; break;
  153. case 'U': rc |= PCRE_UNGREEDY; break;
  154. case 'T': *flags |= PCRS_TRIVIAL; break;
  155. default: break;
  156. }
  157. }
  158. return rc;
  159. }
  #ifdef FUZZ
  /*********************************************************************
   *
   * Function    :  pcrs_compile_fuzzed_replacement
   *
   * Description :  Fuzzing entry point that forwards to
   *                pcrs_compile_replacement() with non-trivial mode and
   *                PCRS_MAX_SUBMATCHES as the capture count.
   *
   * Parameters  :
   *          1  :  replacement = replacement part of s/// operator
   *                              in perl syntax
   *          2  :  errptr = pointer to an integer in which error
   *                         conditions can be returned.
   *
   * Returns     :  pcrs_substitute data structure, or NULL if an
   *                error is encountered. In that case, *errptr has
   *                the reason.
   *
   *********************************************************************/
  extern pcrs_substitute *pcrs_compile_fuzzed_replacement(const char *replacement, int *errptr)
  {
     /* Trivial mode is left off: fuzzing strncpy() is of no interest. */
     const int no_trivial_mode = 0;
     /* XXX: fuzzworthy? */
     const int assumed_captures = PCRS_MAX_SUBMATCHES;

     /* XXX: Should pcrs_compile_replacement() do this? */
     *errptr = 0;

     return pcrs_compile_replacement(replacement, no_trivial_mode,
        assumed_captures, errptr);
  }
  #endif
  187. /*********************************************************************
  188. *
  189. * Function : pcrs_compile_replacement
  190. *
  191. * Description : This function takes a Perl-style replacement (2nd argument
  192. * to the s/// operator and returns a compiled pcrs_substitute,
  193. * or NULL if memory allocation for the substitute structure
  194. * fails.
  195. *
  196. * Parameters :
  197. * 1 : replacement = replacement part of s/// operator
  198. * in perl syntax
  199. * 2 : trivialflag = Flag that causes backreferences to be
  200. * ignored.
  201. * 3 : capturecount = Number of capturing subpatterns in
  202. * the pattern. Needed for $+ handling.
  203. * 4 : errptr = pointer to an integer in which error
  204. * conditions can be returned.
  205. *
  206. * Returns : pcrs_substitute data structure, or NULL if an
  207. * error is encountered. In that case, *errptr has
  208. * the reason.
  209. *
  210. *********************************************************************/
  211. static pcrs_substitute *pcrs_compile_replacement(const char *replacement, int trivialflag, int capturecount, int *errptr)
  212. {
  213. int i, k, l, quoted;
  214. char *text;
  215. pcrs_substitute *r;
  216. #ifndef FUZZ
  217. size_t length;
  218. #else
  219. static size_t length;
  220. #endif
  221. i = k = l = quoted = 0;
  222. /*
  223. * Sanity check
  224. */
  225. if (NULL == replacement)
  226. {
  227. replacement = "";
  228. }
  229. /*
  230. * Get memory or fail
  231. */
  232. if (NULL == (r = (pcrs_substitute *)malloc(sizeof(pcrs_substitute))))
  233. {
  234. *errptr = PCRS_ERR_NOMEM;
  235. return NULL;
  236. }
  237. memset(r, '\0', sizeof(pcrs_substitute));
  238. length = strlen(replacement);
  239. if (NULL == (text = (char *)malloc(length + 1)))
  240. {
  241. free(r);
  242. *errptr = PCRS_ERR_NOMEM;
  243. return NULL;
  244. }
  245. memset(text, '\0', length + 1);
  246. /*
  247. * In trivial mode, just copy the substitute text
  248. */
  249. if (trivialflag)
  250. {
  251. text = strncpy(text, replacement, length + 1);
  252. k = (int)length;
  253. }
  254. /*
  255. * Else, parse, cut out and record all backreferences
  256. */
  257. else
  258. {
  259. while (i < (int)length)
  260. {
  261. /* Quoting */
  262. if (replacement[i] == '\\')
  263. {
  264. if (quoted)
  265. {
  266. text[k++] = replacement[i++];
  267. quoted = 0;
  268. }
  269. else
  270. {
  271. if (replacement[i+1] && strchr("tnrfae0", replacement[i+1]))
  272. {
  273. switch (replacement[++i])
  274. {
  275. case 't':
  276. text[k++] = '\t';
  277. break;
  278. case 'n':
  279. text[k++] = '\n';
  280. break;
  281. case 'r':
  282. text[k++] = '\r';
  283. break;
  284. case 'f':
  285. text[k++] = '\f';
  286. break;
  287. case 'a':
  288. text[k++] = 7;
  289. break;
  290. case 'e':
  291. text[k++] = 27;
  292. break;
  293. case '0':
  294. text[k++] = '\0';
  295. break;
  296. }
  297. i++;
  298. }
  299. else if (is_hex_sequence(&replacement[i]))
  300. {
  301. /*
  302. * Replace a hex sequence with a single
  303. * character with the sequence's ascii value.
  304. * e.g.: '\x7e' => '~'
  305. */
  306. const int ascii_value = xtoi(&replacement[i+2]);
  307. assert(ascii_value >= 0);
  308. assert(ascii_value < 256);
  309. text[k++] = (char)ascii_value;
  310. i += 4;
  311. }
  312. else
  313. {
  314. quoted = 1;
  315. i++;
  316. }
  317. }
  318. continue;
  319. }
  320. /* Backreferences */
  321. if (replacement[i] == '$' && !quoted && i < (int)(length - 1))
  322. {
  323. char *symbol, symbols[] = "'`+&";
  324. if (l >= PCRS_MAX_SUBMATCHES)
  325. {
  326. freez(text);
  327. freez(r);
  328. *errptr = PCRS_WARN_BADREF;
  329. return NULL;
  330. }
  331. r->block_length[l] = (size_t)(k - r->block_offset[l]);
  332. /* Numerical backreferences */
  333. if (isdigit((int)replacement[i + 1]))
  334. {
  335. while (i < (int)length && isdigit((int)replacement[++i]))
  336. {
  337. r->backref[l] = r->backref[l] * 10 + replacement[i] - 48;
  338. }
  339. if (r->backref[l] > capturecount)
  340. {
  341. freez(text);
  342. freez(r);
  343. *errptr = PCRS_WARN_BADREF;
  344. return NULL;
  345. }
  346. }
  347. /* Symbolic backreferences: */
  348. else if (NULL != (symbol = strchr(symbols, replacement[i + 1])))
  349. {
  350. if (symbol - symbols == 2) /* $+ */
  351. {
  352. r->backref[l] = capturecount;
  353. }
  354. else if (symbol - symbols == 3) /* $& */
  355. {
  356. r->backref[l] = 0;
  357. }
  358. else /* $' or $` */
  359. {
  360. r->backref[l] = (int)(PCRS_MAX_SUBMATCHES + 1 - (symbol - symbols));
  361. }
  362. i += 2;
  363. }
  364. /* Invalid backref -> plain '$' */
  365. else
  366. {
  367. goto plainchar;
  368. }
  369. assert(r->backref[l] < PCRS_MAX_SUBMATCHES + 2);
  370. /* Valid and in range? -> record */
  371. if ((0 <= r->backref[l]) &&
  372. (r->backref[l] < PCRS_MAX_SUBMATCHES + 2) &&
  373. (l < PCRS_MAX_SUBMATCHES - 1))
  374. {
  375. r->backref_count[r->backref[l]] += 1;
  376. r->block_offset[++l] = k;
  377. }
  378. else
  379. {
  380. freez(text);
  381. freez(r);
  382. *errptr = PCRS_WARN_BADREF;
  383. return NULL;
  384. }
  385. continue;
  386. }
  387. plainchar:
  388. /* Plain chars are copied */
  389. text[k++] = replacement[i++];
  390. quoted = 0;
  391. }
  392. } /* -END- if (!trivialflag) */
  393. /*
  394. * Finish & return
  395. */
  396. r->text = text;
  397. r->backrefs = l;
  398. r->length = (size_t)k;
  399. r->block_length[l] = (size_t)(k - r->block_offset[l]);
  400. return r;
  401. }
  402. /*********************************************************************
  403. *
  404. * Function : pcrs_free_job
  405. *
  406. * Description : Frees the memory used by a pcrs_job struct and its
  407. * dependent structures.
  408. *
  409. * Parameters :
  410. * 1 : job = pointer to the pcrs_job structure to be freed
  411. *
  412. * Returns : a pointer to the next job, if there was any, or
  413. * NULL otherwise.
  414. *
  415. *********************************************************************/
  416. pcrs_job *pcrs_free_job(pcrs_job *job)
  417. {
  418. pcrs_job *next;
  419. if (job == NULL)
  420. {
  421. return NULL;
  422. }
  423. else
  424. {
  425. next = job->next;
  426. if (job->pattern != NULL) free(job->pattern);
  427. if (job->hints != NULL)
  428. {
  429. #ifdef PCRE_CONFIG_JIT
  430. pcre_free_study(job->hints);
  431. #else
  432. free(job->hints);
  433. #endif
  434. }
  435. if (job->substitute != NULL)
  436. {
  437. if (job->substitute->text != NULL) free(job->substitute->text);
  438. free(job->substitute);
  439. }
  440. free(job);
  441. }
  442. return next;
  443. }
  444. /*********************************************************************
  445. *
  446. * Function : pcrs_free_joblist
  447. *
  448. * Description : Iterates through a chained list of pcrs_job's and
  449. * frees them using pcrs_free_job.
  450. *
  451. * Parameters :
  452. * 1 : joblist = pointer to the first pcrs_job structure to
  453. * be freed
  454. *
  455. * Returns : N/A
  456. *
  457. *********************************************************************/
  458. void pcrs_free_joblist(pcrs_job *joblist)
  459. {
  460. while (NULL != (joblist = pcrs_free_job(joblist))) {};
  461. return;
  462. }
  463. /*********************************************************************
  464. *
  465. * Function : pcrs_compile_command
  466. *
  467. * Description : Parses a string with a Perl-style s/// command,
  468. * calls pcrs_compile, and returns a corresponding
  469. * pcrs_job, or NULL if parsing or compiling the job
  470. * fails.
  471. *
  472. * Parameters :
  473. * 1 : command = string with perl-style s/// command
  474. * 2 : errptr = pointer to an integer in which error
  475. * conditions can be returned.
  476. *
  477. * Returns : a corresponding pcrs_job data structure, or NULL
  478. * if an error was encountered. In that case, *errptr
  479. * has the reason.
  480. *
  481. *********************************************************************/
  482. pcrs_job *pcrs_compile_command(const char *command, int *errptr)
  483. {
  484. int i, k, l, quoted = FALSE;
  485. size_t limit;
  486. char delimiter;
  487. char *tokens[4];
  488. pcrs_job *newjob;
  489. k = l = 0;
  490. /*
  491. * Tokenize the perl command
  492. */
  493. limit = strlen(command);
  494. if (limit < 4)
  495. {
  496. *errptr = PCRS_ERR_CMDSYNTAX;
  497. return NULL;
  498. }
  499. else
  500. {
  501. delimiter = command[1];
  502. }
  503. tokens[l] = (char *) malloc(limit + 1);
  504. for (i = 0; i <= (int)limit; i++)
  505. {
  506. if (command[i] == delimiter && !quoted)
  507. {
  508. if (l == 3)
  509. {
  510. l = -1;
  511. break;
  512. }
  513. tokens[0][k++] = '\0';
  514. tokens[++l] = tokens[0] + k;
  515. continue;
  516. }
  517. else if (command[i] == '\\' && !quoted)
  518. {
  519. quoted = TRUE;
  520. if (command[i+1] == delimiter) continue;
  521. }
  522. else
  523. {
  524. quoted = FALSE;
  525. }
  526. tokens[0][k++] = command[i];
  527. }
  528. /*
  529. * Syntax error ?
  530. */
  531. if (l != 3)
  532. {
  533. *errptr = PCRS_ERR_CMDSYNTAX;
  534. free(tokens[0]);
  535. return NULL;
  536. }
  537. newjob = pcrs_compile(tokens[1], tokens[2], tokens[3], errptr);
  538. free(tokens[0]);
  539. return newjob;
  540. }
  541. /*********************************************************************
  542. *
  543. * Function : pcrs_compile
  544. *
  545. * Description : Takes the three arguments to a perl s/// command
  546. * and compiles a pcrs_job structure from them.
  547. *
  548. * Parameters :
  549. * 1 : pattern = string with perl-style pattern
  550. * 2 : substitute = string with perl-style substitute
  551. * 3 : options = string with perl-style options
  552. * 4 : errptr = pointer to an integer in which error
  553. * conditions can be returned.
  554. *
  555. * Returns : a corresponding pcrs_job data structure, or NULL
  556. * if an error was encountered. In that case, *errptr
  557. * has the reason.
  558. *
  559. *********************************************************************/
  560. pcrs_job *pcrs_compile(const char *pattern, const char *substitute, const char *options, int *errptr)
  561. {
  562. pcrs_job *newjob;
  563. int flags;
  564. int capturecount;
  565. const char *error;
  566. int pcre_study_options = 0;
  567. *errptr = 0;
  568. /*
  569. * Handle NULL arguments
  570. */
  571. if (pattern == NULL) pattern = "";
  572. if (substitute == NULL) substitute = "";
  573. /*
  574. * Get and init memory
  575. */
  576. if (NULL == (newjob = (pcrs_job *)malloc(sizeof(pcrs_job))))
  577. {
  578. *errptr = PCRS_ERR_NOMEM;
  579. return NULL;
  580. }
  581. memset(newjob, '\0', sizeof(pcrs_job));
  582. /*
  583. * Evaluate the options
  584. */
  585. newjob->options = pcrs_parse_perl_options(options, &flags);
  586. newjob->flags = flags;
  587. /*
  588. * Compile the pattern
  589. */
  590. newjob->pattern = pcre_compile(pattern, newjob->options, &error, errptr, NULL);
  591. if (newjob->pattern == NULL)
  592. {
  593. pcrs_free_job(newjob);
  594. return NULL;
  595. }
  596. #ifdef PCRE_STUDY_JIT_COMPILE
  597. if (!(flags & PCRS_DYNAMIC))
  598. {
  599. pcre_study_options = PCRE_STUDY_JIT_COMPILE;
  600. }
  601. #endif
  602. /*
  603. * Generate hints. This has little overhead, since the
  604. * hints will be NULL for a boring pattern anyway.
  605. */
  606. newjob->hints = pcre_study(newjob->pattern, pcre_study_options, &error);
  607. if (error != NULL)
  608. {
  609. *errptr = PCRS_ERR_STUDY;
  610. pcrs_free_job(newjob);
  611. return NULL;
  612. }
  613. /*
  614. * Determine the number of capturing subpatterns.
  615. * This is needed for handling $+ in the substitute.
  616. */
  617. if (0 > (*errptr = pcre_fullinfo(newjob->pattern, newjob->hints, PCRE_INFO_CAPTURECOUNT, &capturecount)))
  618. {
  619. pcrs_free_job(newjob);
  620. return NULL;
  621. }
  622. /*
  623. * Compile the substitute
  624. */
  625. if (NULL == (newjob->substitute = pcrs_compile_replacement(substitute, newjob->flags & PCRS_TRIVIAL, capturecount, errptr)))
  626. {
  627. pcrs_free_job(newjob);
  628. return NULL;
  629. }
  630. return newjob;
  631. }
  632. /*********************************************************************
  633. *
  634. * Function : pcrs_execute_list
  635. *
  636. * Description : This is a multiple job wrapper for pcrs_execute().
  637. * Apply the regular substitutions defined by the jobs in
  638. * the joblist to the subject.
  639. * The subject itself is left untouched, memory for the result
  640. * is malloc()ed and it is the caller's responsibility to free
  641. * the result when it's no longer needed.
  642. *
  643. * Note: For convenient string handling, a null byte is
  644. * appended to the result. It does not count towards the
  645. * result_length, though.
  646. *
  647. *
  648. * Parameters :
  649. * 1 : joblist = the chained list of pcrs_jobs to be executed
  650. * 2 : subject = the subject string
  651. * 3 : subject_length = the subject's length
  652. * 4 : result = char** for returning the result
  653. * 5 : result_length = size_t* for returning the result's length
  654. *
  655. * Returns : On success, the number of substitutions that were made.
  656. * May be > 1 if job->flags contained PCRS_GLOBAL
  657. * On failure, the (negative) pcre error code describing the
  658. * failure, which may be translated to text using pcrs_strerror().
  659. *
  660. *********************************************************************/
  661. int pcrs_execute_list(pcrs_job *joblist, char *subject, size_t subject_length, char **result, size_t *result_length)
  662. {
  663. pcrs_job *job;
  664. char *old, *new = NULL;
  665. int hits, total_hits;
  666. old = subject;
  667. *result_length = subject_length;
  668. total_hits = 0;
  669. for (job = joblist; job != NULL; job = job->next)
  670. {
  671. hits = pcrs_execute(job, old, *result_length, &new, result_length);
  672. if (old != subject) free(old);
  673. if (hits < 0)
  674. {
  675. return(hits);
  676. }
  677. else
  678. {
  679. total_hits += hits;
  680. old = new;
  681. }
  682. }
  683. *result = new;
  684. return(total_hits);
  685. }
  686. /*********************************************************************
  687. *
  688. * Function : pcrs_execute
  689. *
  690. * Description : Apply the regular substitution defined by the job to the
  691. * subject.
  692. * The subject itself is left untouched, memory for the result
  693. * is malloc()ed and it is the caller's responsibility to free
  694. * the result when it's no longer needed.
  695. *
  696. * Note: For convenient string handling, a null byte is
  697. * appended to the result. It does not count towards the
  698. * result_length, though.
  699. *
  700. * Parameters :
  701. * 1 : job = the pcrs_job to be executed
  702. * 2 : subject = the subject (== original) string
  703. * 3 : subject_length = the subject's length
  704. * 4 : result = char** for returning the result (NULL on error)
  705. * 5 : result_length = size_t* for returning the result's length
  706. *
  707. * Returns : On success, the number of substitutions that were made.
  708. * May be > 1 if job->flags contained PCRS_GLOBAL
  709. * On failure, the (negative) pcre error code describing the
  710. * failure, which may be translated to text using pcrs_strerror().
  711. *
  712. *********************************************************************/
  713. int pcrs_execute(pcrs_job *job, const char *subject, size_t subject_length, char **result, size_t *result_length)
  714. {
  715. int offsets[3 * PCRS_MAX_SUBMATCHES],
  716. offset,
  717. i, k,
  718. matches_found,
  719. submatches,
  720. max_matches = PCRS_MAX_MATCH_INIT;
  721. size_t newsize;
  722. pcrs_match *matches, *dummy;
  723. char *result_offset;
  724. offset = i = 0;
  725. *result = NULL;
  726. /*
  727. * Sanity check & memory allocation
  728. */
  729. if (job == NULL || job->pattern == NULL || job->substitute == NULL || NULL == subject)
  730. {
  731. return(PCRS_ERR_BADJOB);
  732. }
  733. if (NULL == (matches = (pcrs_match *)malloc((size_t)max_matches * sizeof(pcrs_match))))
  734. {
  735. return(PCRS_ERR_NOMEM);
  736. }
  737. memset(matches, '\0', (size_t)max_matches * sizeof(pcrs_match));
  738. /*
  739. * Find the pattern and calculate the space
  740. * requirements for the result
  741. */
  742. newsize = subject_length;
  743. while ((submatches = pcre_exec(job->pattern, job->hints, subject, (int)subject_length, offset, 0, offsets, 3 * PCRS_MAX_SUBMATCHES)) > 0)
  744. {
  745. job->flags |= PCRS_SUCCESS;
  746. matches[i].submatches = submatches;
  747. for (k = 0; k < submatches; k++)
  748. {
  749. matches[i].submatch_offset[k] = offsets[2 * k];
  750. /* Note: Non-found optional submatches have length -1-(-1)==0 */
  751. matches[i].submatch_length[k] = (size_t)(offsets[2 * k + 1] - offsets[2 * k]);
  752. /* reserve mem for each submatch as often as it is ref'd */
  753. newsize += matches[i].submatch_length[k] * (size_t)job->substitute->backref_count[k];
  754. }
  755. /* plus replacement text size minus match text size */
  756. newsize += job->substitute->length - matches[i].submatch_length[0];
  757. /* chunk before match */
  758. matches[i].submatch_offset[PCRS_MAX_SUBMATCHES] = 0;
  759. matches[i].submatch_length[PCRS_MAX_SUBMATCHES] = (size_t)offsets[0];
  760. newsize += (size_t)offsets[0] * (size_t)job->substitute->backref_count[PCRS_MAX_SUBMATCHES];
  761. /* chunk after match */
  762. matches[i].submatch_offset[PCRS_MAX_SUBMATCHES + 1] = offsets[1];
  763. matches[i].submatch_length[PCRS_MAX_SUBMATCHES + 1] = subject_length - (size_t)offsets[1] - 1;
  764. newsize += (subject_length - (size_t)offsets[1]) * (size_t)job->substitute->backref_count[PCRS_MAX_SUBMATCHES + 1];
  765. /* Storage for matches exhausted? -> Extend! */
  766. if (++i >= max_matches)
  767. {
  768. max_matches = (int)(max_matches * PCRS_MAX_MATCH_GROW);
  769. if (NULL == (dummy = (pcrs_match *)realloc(matches, (size_t)max_matches * sizeof(pcrs_match))))
  770. {
  771. free(matches);
  772. return(PCRS_ERR_NOMEM);
  773. }
  774. matches = dummy;
  775. }
  776. /* Non-global search or limit reached? */
  777. if (!(job->flags & PCRS_GLOBAL)) break;
  778. /* Don't loop on empty matches */
  779. if (offsets[1] == offset)
  780. if ((size_t)offset < subject_length)
  781. offset++;
  782. else
  783. break;
  784. /* Go find the next one */
  785. else
  786. offset = offsets[1];
  787. }
  788. /* Pass pcre error through if (bad) failure */
  789. if (submatches < PCRE_ERROR_NOMATCH)
  790. {
  791. free(matches);
  792. return submatches;
  793. }
  794. matches_found = i;
  795. /*
  796. * Get memory for the result (must be freed by caller!)
  797. * and append terminating null byte.
  798. */
  799. if ((*result = (char *)malloc(newsize + 1)) == NULL)
  800. {
  801. free(matches);
  802. return PCRS_ERR_NOMEM;
  803. }
  804. else
  805. {
  806. (*result)[newsize] = '\0';
  807. }
  808. /*
  809. * Replace
  810. */
  811. offset = 0;
  812. result_offset = *result;
  813. for (i = 0; i < matches_found; i++)
  814. {
  815. /* copy the chunk preceding the match */
  816. memcpy(result_offset, subject + offset, (size_t)(matches[i].submatch_offset[0] - offset));
  817. result_offset += matches[i].submatch_offset[0] - offset;
  818. /* For every segment of the substitute.. */
  819. for (k = 0; k <= job->substitute->backrefs; k++)
  820. {
  821. /* ...copy its text.. */
  822. memcpy(result_offset, job->substitute->text + job->substitute->block_offset[k], job->substitute->block_length[k]);
  823. result_offset += job->substitute->block_length[k];
  824. /* ..plus, if it's not the last chunk, i.e.: There *is* a backref.. */
  825. if (k != job->substitute->backrefs
  826. /* ..in legal range.. */
  827. && job->substitute->backref[k] < PCRS_MAX_SUBMATCHES + 2
  828. /* ..and referencing a real submatch.. */
  829. && job->substitute->backref[k] < matches[i].submatches
  830. /* ..that is nonempty.. */
  831. && matches[i].submatch_length[job->substitute->backref[k]] > 0)
  832. {
  833. /* ..copy the submatch that is ref'd. */
  834. memcpy(
  835. result_offset,
  836. subject + matches[i].submatch_offset[job->substitute->backref[k]],
  837. matches[i].submatch_length[job->substitute->backref[k]]
  838. );
  839. result_offset += matches[i].submatch_length[job->substitute->backref[k]];
  840. }
  841. }
  842. offset = matches[i].submatch_offset[0] + (int)matches[i].submatch_length[0];
  843. }
  844. /* Copy the rest. */
  845. memcpy(result_offset, subject + offset, subject_length - (size_t)offset);
  846. *result_length = newsize;
  847. free(matches);
  848. return matches_found;
  849. }
  /*
   * Evaluates to 1 if x is a hexadecimal digit (0-9, a-f, A-F) and
   * to 0 otherwise, including for the terminating null byte.
   *
   * The argument is converted to unsigned char before it is handed to
   * the <ctype.h> function: passing a negative char value (any byte
   * >= 0x80 on platforms where char is signed) would be undefined
   * behaviour.
   */
  #define is_hex_digit(x) (isxdigit((unsigned char)(x)) != 0)

  /*********************************************************************
   *
   * Function    :  is_hex_sequence
   *
   * Description :  Checks the first four characters of a string
   *                and decides if they are a valid hex sequence
   *                (like '\x40').
   *
   *                The checks are short-circuited from left to right,
   *                so nothing behind a terminating null byte is read
   *                if the string is shorter than four characters.
   *
   * Parameters  :
   *          1  :  sequence = The null-terminated string to check
   *
   * Returns     :  Non-zero if it's valid sequence, or
   *                Zero if it isn't.
   *
   *********************************************************************/
  static int is_hex_sequence(const char *sequence)
  {
     return (sequence[0] == '\\' &&
             sequence[1] == 'x' &&
             is_hex_digit(sequence[2]) &&
             is_hex_digit(sequence[3]));
  }
  873. /*
  874. * Functions below this line are only part of the pcrs version
  875. * included in Privoxy. If you use any of them you should not
  876. * try to dynamically link against external pcrs versions.
  877. */
  /*********************************************************************
   *
   * Function    :  pcrs_job_is_dynamic
   *
   * Description :  Checks if a job has the "D" (dynamic) option set.
   *
   *                The second character of the job is taken to be the
   *                delimiter. Everything behind the last occurrence of
   *                that delimiter is treated as the option list and
   *                searched for a 'D'.
   *
   * Parameters  :
   *          1  :  job = The job to check (null-terminated string)
   *
   * Returns     :  TRUE if the job is indeed dynamic, otherwise
   *                FALSE. Jobs shorter than five characters are
   *                never considered dynamic.
   *
   *********************************************************************/
  int pcrs_job_is_dynamic(char *job)
  {
     const size_t length = strlen(job);
     const char *option;
     char delimiter;

     if (length < 5)
     {
        /*
         * The shortest valid (but useless)
         * dynamic pattern is "s@@@D"
         */
        return FALSE;
     }

     /*
      * Only look at the delimiter once we know that the job is long
      * enough; for an empty string job[1] is already out of bounds.
      */
     delimiter = job[1];

     /*
      * Everything between the last character
      * and the last delimiter is an option ...
      *
      * The backwards scan always terminates, at the latest when it
      * reaches job[1], which is the delimiter by definition.
      */
     for (option = job + length - 1; *option != delimiter; option--)
     {
        if (*option == 'D')
        {
           /*
            * ... and if said option is 'D' the job is dynamic.
            */
           return TRUE;
        }
     }

     return FALSE;
  }
  /*********************************************************************
   *
   * Function    :  pcrs_get_delimiter
   *
   * Description :  Picks a character that can serve as pcrs delimiter
   *                for the given string, i.e. one that does not occur
   *                anywhere in it. The candidates are tried in a
   *                fixed order and the first suitable one wins.
   *
   * Parameters  :
   *          1  :  string = The string to search in
   *
   * Returns     :  A safe delimiter if one was found, otherwise '\0'.
   *
   *********************************************************************/
  char pcrs_get_delimiter(const char *string)
  {
     /*
      * Candidate characters that rarely show up
      * in pcrs replacement strings.
      */
     static const char candidates[] = "><#+*~%^-:;!@";
     size_t i;

     for (i = 0; candidates[i] != '\0'; i++)
     {
        if (strchr(string, candidates[i]) == NULL)
        {
           /* Not part of the string, so it's safe to use. */
           return candidates[i];
        }
     }

     /* Every single candidate is already used in the string. */
     return '\0';
  }
  948. /*********************************************************************
  949. *
  950. * Function : pcrs_execute_single_command
  951. *
  952. * Description : Apply single pcrs command to the subject.
  953. * The subject itself is left untouched, memory for the result
  954. * is malloc()ed and it is the caller's responsibility to free
  955. * the result when it's no longer needed.
  956. *
  957. * Parameters :
  958. * 1 : subject = the subject (== original) string
  959. * 2 : pcrs_command = the pcrs command as string (s@foo@bar@)
  960. * 3 : hits = int* for returning the number of modifications
  961. *
  962. * Returns : NULL in case of errors, otherwise the
  963. * result of the pcrs command.
  964. *
  965. *********************************************************************/
  966. char *pcrs_execute_single_command(const char *subject, const char *pcrs_command, int *hits)
  967. {
  968. size_t size;
  969. char *result = NULL;
  970. pcrs_job *job;
  971. assert(subject);
  972. assert(pcrs_command);
  973. *hits = 0;
  974. size = strlen(subject);
  975. job = pcrs_compile_command(pcrs_command, hits);
  976. if (NULL != job)
  977. {
  978. *hits = pcrs_execute(job, subject, size, &result, &size);
  979. if (*hits < 0)
  980. {
  981. freez(result);
  982. }
  983. pcrs_free_job(job);
  984. }
  985. return result;
  986. }
  987. /*********************************************************************
  988. *
  989. * Function : pcrs_compile_dynamic_command
  990. *
  991. * Description : Takes a dynamic pcrs command, fills in the
  992. * values of the variables and compiles it.
  993. *
  994. * Parameters :
  995. * 1 : pcrs_command = The dynamic pcrs command to compile
  996. * 2 : v = NULL terminated array of variables and their values.
  997. * 3 : error = pcrs error code
  998. *
  999. * Returns : NULL in case of hard errors, otherwise the
  1000. * compiled pcrs job.
  1001. *
  1002. *********************************************************************/
  1003. pcrs_job *pcrs_compile_dynamic_command(char *pcrs_command, const struct pcrs_variable v[], int *error)
  1004. {
  1005. char buf[PCRS_BUFFER_SIZE];
  1006. const char *original_pcrs_command = pcrs_command;
  1007. char *pcrs_command_tmp = NULL;
  1008. pcrs_job *job = NULL;
  1009. int truncation = 0;
  1010. char d;
  1011. int ret;
  1012. while ((NULL != v->name) && (NULL != pcrs_command))
  1013. {
  1014. assert(NULL != v->value);
  1015. if (NULL == strstr(pcrs_command, v->name))
  1016. {
  1017. /*
  1018. * Skip the substitution if the variable
  1019. * name isn't part of the pattern.
  1020. */
  1021. v++;
  1022. continue;
  1023. }
  1024. /* Use pcrs to replace the variable with its value. */
  1025. d = pcrs_get_delimiter(v->value);
  1026. if ('\0' == d)
  1027. {
  1028. /* No proper delimiter found */
  1029. *error = PCRS_ERR_CMDSYNTAX;
  1030. freez(pcrs_command_tmp);
  1031. return NULL;
  1032. }
  1033. /*
  1034. * Variable names are supposed to contain alpha
  1035. * numerical characters plus '_' only.
  1036. */
  1037. assert(NULL == strchr(v->name, d));
  1038. ret = snprintf(buf, sizeof(buf), "s%c\\$%s%c%s%cDgT", d, v->name, d, v->value, d);
  1039. assert(ret >= 0);
  1040. if (ret >= sizeof(buf))
  1041. {
  1042. /*
  1043. * Value didn't completely fit into buffer,
  1044. * overwrite the end of the substitution text
  1045. * with a truncation message and close the pattern
  1046. * properly.
  1047. */
  1048. static const char warning[] = "... [too long, truncated]";
  1049. const size_t trailer_size = sizeof(warning) + 4; /* 4 for d + "DgT" */
  1050. char *trailer_start = buf + sizeof(buf) - trailer_size;
  1051. ret = snprintf(trailer_start, trailer_size, "%s%cDgT", warning, d);
  1052. assert(ret == trailer_size - 1);
  1053. assert(sizeof(buf) == strlen(buf) + 1);
  1054. truncation = 1;
  1055. }
  1056. pcrs_command_tmp = pcrs_execute_single_command(pcrs_command, buf, error);
  1057. if (NULL == pcrs_command_tmp)
  1058. {
  1059. return NULL;
  1060. }
  1061. if (pcrs_command != original_pcrs_command)
  1062. {
  1063. freez(pcrs_command);
  1064. }
  1065. pcrs_command = pcrs_command_tmp;
  1066. v++;
  1067. }
  1068. job = pcrs_compile_command(pcrs_command, error);
  1069. if (pcrs_command != original_pcrs_command)
  1070. {
  1071. freez(pcrs_command);
  1072. }
  1073. if (truncation)
  1074. {
  1075. *error = PCRS_WARN_TRUNCATION;
  1076. }
  1077. return job;
  1078. }
  1079. /*
  1080. Local Variables:
  1081. tab-width: 3
  1082. end:
  1083. */