encode.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462
  1. /*********************************************************************
  2. *
  3. * File : $Source: /cvsroot/ijbswa/current/encode.c,v $
  4. *
  5. * Purpose : Functions to encode and decode URLs, and also to
  6. * encode cookies and HTML text.
  7. *
  8. * Copyright : Written by and Copyright (C) 2001 the
  9. * Privoxy team. https://www.privoxy.org/
  10. *
  11. * Based on the Internet Junkbuster originally written
  12. * by and Copyright (C) 1997 Anonymous Coders and
  13. * Junkbusters Corporation. http://www.junkbusters.com
  14. *
  15. * This program is free software; you can redistribute it
  16. * and/or modify it under the terms of the GNU General
  17. * Public License as published by the Free Software
  18. * Foundation; either version 2 of the License, or (at
  19. * your option) any later version.
  20. *
  21. * This program is distributed in the hope that it will
  22. * be useful, but WITHOUT ANY WARRANTY; without even the
  23. * implied warranty of MERCHANTABILITY or FITNESS FOR A
  24. * PARTICULAR PURPOSE. See the GNU General Public
  25. * License for more details.
  26. *
  27. * The GNU General Public License should be included with
  28. * this file. If not, you can view it at
  29. * http://www.gnu.org/copyleft/gpl.html
  30. * or write to the Free Software Foundation, Inc., 59
  31. * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  32. *
  33. *********************************************************************/
  34. #include "config.h"
  35. #include <stdio.h>
  36. #include <stdlib.h>
  37. #include <string.h>
  38. #include <assert.h>
  39. #include "miscutil.h"
  40. #include "encode.h"
  41. /* Maps special characters in a URL to their equivalent % codes. */
  42. static const char url_code_map[256][4] = {
  43. "", "%01", "%02", "%03", "%04", "%05", "%06", "%07", "%08", "%09",
  44. "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", "%10", "%11", "%12", "%13",
  45. "%14", "%15", "%16", "%17", "%18", "%19", "%1A", "%1B", "%1C", "%1D",
  46. "%1E", "%1F", "%20", "%21", "%22", "%23", "%24", "%25", "%26", "%27",
  47. "%28", "%29", "", "%2B", "%2C", "", "", "%2F", "", "",
  48. "", "", "", "", "", "", "", "", "%3A", "%3B",
  49. "%3C", "%3D", "%3E", "%3F", "", "", "", "", "", "",
  50. "", "", "", "", "", "", "", "", "", "",
  51. "", "", "", "", "", "", "", "", "", "",
  52. "", "%5B", "%5C", "%5D", "%5E", "", "%60", "", "", "",
  53. "", "", "", "", "", "", "", "", "", "",
  54. "", "", "", "", "", "", "", "", "", "",
  55. "", "", "", "%7B", "%7C", "%7D", "%7E", "%7F", "%80", "%81",
  56. "%82", "%83", "%84", "%85", "%86", "%87", "%88", "%89", "%8A", "%8B",
  57. "%8C", "%8D", "%8E", "%8F", "%90", "%91", "%92", "%93", "%94", "%95",
  58. "%96", "%97", "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
  59. "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", "%A8", "%A9",
  60. "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", "%B0", "%B1", "%B2", "%B3",
  61. "%B4", "%B5", "%B6", "%B7", "%B8", "%B9", "%BA", "%BB", "%BC", "%BD",
  62. "%BE", "%BF", "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
  63. "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", "%D0", "%D1",
  64. "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", "%D8", "%D9", "%DA", "%DB",
  65. "%DC", "%DD", "%DE", "%DF", "%E0", "%E1", "%E2", "%E3", "%E4", "%E5",
  66. "%E6", "%E7", "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
  67. "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", "%F8", "%F9",
  68. "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
  69. };
  70. /* Maps special characters in HTML to their equivalent entities. */
  71. static const char * const html_code_map[256] = {
  72. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  73. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  74. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  75. NULL, NULL, NULL, NULL,"&quot;",NULL,NULL,NULL,"&amp;","&#39;",
  76. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  77. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  78. "&lt;",NULL,"&gt;",NULL,NULL, NULL, NULL, NULL, NULL, NULL,
  79. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  80. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  81. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  82. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  83. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  84. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  85. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  86. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  87. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  88. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  89. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  90. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  91. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  92. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  93. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  94. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  95. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  96. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  97. NULL, NULL, NULL, NULL, NULL, NULL
  98. };
  99. /*********************************************************************
  100. *
  101. * Function : html_encode
  102. *
  103. * Description : Encodes a string so it's not interpreted as
  104. * containing HTML tags or entities.
  105. * Replaces <, >, &, and " with the appropriate HTML
  106. * entities.
  107. *
  108. * Parameters :
  109. * 1 : s = String to encode. Null-terminated.
  110. *
  111. * Returns : Encoded string, newly allocated on the heap.
  112. * Caller is responsible for freeing it with free().
  113. * If s is NULL, or on out-of memory, returns NULL.
  114. *
  115. *********************************************************************/
  116. char * html_encode(const char *s)
  117. {
  118. char * buf;
  119. size_t buf_size;
  120. if (s == NULL)
  121. {
  122. return NULL;
  123. }
  124. /* each input char can expand to at most 6 chars */
  125. buf_size = (strlen(s) * 6) + 1;
  126. buf = (char *) malloc(buf_size);
  127. if (buf)
  128. {
  129. char c;
  130. char * p = buf;
  131. while ((c = *s++) != '\0')
  132. {
  133. const char * replace_with = html_code_map[(unsigned char) c];
  134. if (replace_with != NULL)
  135. {
  136. const size_t bytes_written = (size_t)(p - buf);
  137. assert(bytes_written < buf_size);
  138. p += strlcpy(p, replace_with, buf_size - bytes_written);
  139. }
  140. else
  141. {
  142. *p++ = c;
  143. }
  144. }
  145. *p = '\0';
  146. assert(strlen(buf) < buf_size);
  147. }
  148. return(buf);
  149. }
  150. /*********************************************************************
  151. *
  152. * Function : html_encode_and_free_original
  153. *
  154. * Description : Encodes a string so it's not interpreted as
  155. * containing HTML tags or entities.
  156. * Replaces <, >, &, and " with the appropriate HTML
  157. * entities. Free()s original string.
  158. * If original string is NULL, simply returns NULL.
  159. *
  160. * Parameters :
  161. * 1 : s = String to encode. Null-terminated.
  162. *
  163. * Returns : Encoded string, newly allocated on the heap.
  164. * Caller is responsible for freeing it with free().
  165. * If s is NULL, or on out-of memory, returns NULL.
  166. *
  167. *********************************************************************/
  168. char * html_encode_and_free_original(char *s)
  169. {
  170. char * result;
  171. if (s == NULL)
  172. {
  173. return NULL;
  174. }
  175. result = html_encode(s);
  176. free(s);
  177. return result;
  178. }
  179. /*********************************************************************
  180. *
  181. * Function : url_encode
  182. *
  183. * Description : Encodes a string so it can be used in a URL
  184. * query string. Replaces special characters with
  185. * the appropriate %xx codes.
  186. *
  187. * XXX: url_query_encode() would be a more fitting
  188. * name.
  189. *
  190. * Parameters :
  191. * 1 : s = String to encode. Null-terminated.
  192. *
  193. * Returns : Encoded string, newly allocated on the heap.
  194. * Caller is responsible for freeing it with free().
  195. * If s is NULL, or on out-of memory, returns NULL.
  196. *
  197. *********************************************************************/
  198. char * url_encode(const char *s)
  199. {
  200. char * buf;
  201. size_t buf_size;
  202. if (s == NULL)
  203. {
  204. return NULL;
  205. }
  206. /* each input char can expand to at most 3 chars */
  207. buf_size = (strlen(s) * 3) + 1;
  208. buf = (char *) malloc(buf_size);
  209. if (buf)
  210. {
  211. char c;
  212. char * p = buf;
  213. while((c = *s++) != '\0')
  214. {
  215. const char *replace_with = url_code_map[(unsigned char) c];
  216. if (*replace_with != '\0')
  217. {
  218. const size_t bytes_written = (size_t)(p - buf);
  219. assert(bytes_written < buf_size);
  220. p += strlcpy(p, replace_with, buf_size - bytes_written);
  221. }
  222. else
  223. {
  224. *p++ = c;
  225. }
  226. }
  227. *p = '\0';
  228. assert(strlen(buf) < buf_size);
  229. }
  230. return(buf);
  231. }
  232. /*********************************************************************
  233. *
  234. * Function : xdtoi
  235. *
  236. * Description : Converts a single hex digit to an integer.
  237. *
  238. * Parameters :
  239. * 1 : d = in the range of ['0'..'9', 'A'..'F', 'a'..'f']
  240. *
  241. * Returns : The integer value, or -1 for non-hex characters.
  242. *
  243. *********************************************************************/
  244. static int xdtoi(const int d)
  245. {
  246. if ((d >= '0') && (d <= '9'))
  247. {
  248. return(d - '0');
  249. }
  250. else if ((d >= 'a') && (d <= 'f'))
  251. {
  252. return(d - 'a' + 10);
  253. }
  254. else if ((d >= 'A') && (d <= 'F'))
  255. {
  256. return(d - 'A' + 10);
  257. }
  258. else
  259. {
  260. return(-1);
  261. }
  262. }
  263. /*********************************************************************
  264. *
  265. * Function : xtoi
  266. *
  267. * Description : Hex string to integer conversion.
  268. *
  269. * Parameters :
  270. * 1 : s = a 2 digit hex string (e.g. "1f"). Only the
  271. * first two characters will be looked at.
  272. *
  273. * Returns : The integer value, or 0 for non-hex strings.
  274. *
  275. *********************************************************************/
  276. int xtoi(const char *s)
  277. {
  278. int d1;
  279. d1 = xdtoi(*s);
  280. if (d1 >= 0)
  281. {
  282. int d2 = xdtoi(*(s+1));
  283. if (d2 >= 0)
  284. {
  285. return (d1 << 4) + d2;
  286. }
  287. }
  288. return 0;
  289. }
  290. /*********************************************************************
  291. *
  292. * Function : url_decode
  293. *
  294. * Description : Decodes a URL query string, replacing %xx codes
  295. * with their decoded form.
  296. *
  297. * Parameters :
  298. * 1 : s = String to decode. Null-terminated.
  299. *
  300. * Returns : Decoded string, newly allocated on the heap.
  301. * Caller is responsible for freeing it with free().
  302. *
  303. *********************************************************************/
  304. char *url_decode(const char * s)
  305. {
  306. char *buf = malloc(strlen(s) + 1);
  307. char *q = buf;
  308. if (buf)
  309. {
  310. while (*s)
  311. {
  312. switch (*s)
  313. {
  314. case '+':
  315. s++;
  316. *q++ = ' ';
  317. break;
  318. case '%':
  319. if ((*q = (char)xtoi(s + 1)) != '\0')
  320. {
  321. s += 3;
  322. q++;
  323. }
  324. else
  325. {
  326. /* malformed, just use it */
  327. *q++ = *s++;
  328. }
  329. break;
  330. default:
  331. *q++ = *s++;
  332. break;
  333. }
  334. }
  335. *q = '\0';
  336. }
  337. return(buf);
  338. }
  339. /*********************************************************************
  340. *
  341. * Function : percent_encode_url
  342. *
  343. * Description : Percent-encodes a string so it no longer contains
  344. * any characters that aren't valid in an URL according
  345. * to RFC 3986.
  346. *
  347. * XXX: Do not confuse with encode_url()
  348. *
  349. * Parameters :
  350. * 1 : s = String to encode. Null-terminated.
  351. *
  352. * Returns : Encoded string, newly allocated on the heap.
  353. * Caller is responsible for freeing it with free().
  354. * If s is NULL, or on out-of memory, returns NULL.
  355. *
  356. *********************************************************************/
  357. char *percent_encode_url(const char *s)
  358. {
  359. static const char allowed_characters[128] = {
  360. '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
  361. '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
  362. '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
  363. '\0', '\0', '\0', '!', '\0', '#', '$', '%', '&', '\'',
  364. '(', ')', '*', '+', ',', '-', '.', '/', '0', '1',
  365. '2', '3', '4', '5', '6', '7', '8', '9', ':', ';',
  366. '\0', '=', '\0', '?', '@', 'A', 'B', 'C', 'D', 'E',
  367. 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
  368. 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y',
  369. 'Z', '[', '\0', ']', '\0', '_', '\0', 'a', 'b', 'c',
  370. 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  371. 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
  372. 'x', 'y', 'z', '\0', '\0', '\0', '~', '\0'
  373. };
  374. char *buf;
  375. size_t buf_size;
  376. assert(s != NULL);
  377. /* Each input char can expand to at most 3 chars. */
  378. buf_size = (strlen(s) * 3) + 1;
  379. buf = (char *)malloc(buf_size);
  380. if (buf != NULL)
  381. {
  382. char c;
  383. char *p = buf;
  384. while ((c = *s++) != '\0')
  385. {
  386. const unsigned int i = (unsigned char)c;
  387. if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i])
  388. {
  389. const char *replace_with = url_code_map[i];
  390. assert(*replace_with != '\0');
  391. if (*replace_with != '\0')
  392. {
  393. const size_t bytes_written = (size_t)(p - buf);
  394. assert(bytes_written < buf_size);
  395. p += strlcpy(p, replace_with, buf_size - bytes_written);
  396. }
  397. }
  398. else
  399. {
  400. *p++ = c;
  401. }
  402. }
  403. *p = '\0';
  404. assert(strlen(buf) < buf_size);
  405. }
  406. return(buf);
  407. }
  408. /*
  409. Local Variables:
  410. tab-width: 3
  411. end:
  412. */