polipo.texi 82 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964
  1. \input texinfo @c -*-texinfo-*-
  2. @c %**start of header
  3. @setfilename polipo.info
  4. @settitle The Polipo Manual
  5. @afourpaper
  6. @c %**end of header
  7. @dircategory Network Applications
  8. @direntry
  9. * Polipo: (polipo). The Polipo caching web proxy.
  10. @end direntry
  11. @copying
  12. Copyright @copyright{} 2003 -- 2014 by Juliusz Chroboczek.
  13. @end copying
  14. @titlepage
  15. @title The Polipo Manual
  16. @author Juliusz Chroboczek
  17. @page
  18. @vskip 0pt plus 1fill
  19. Polipo is a caching web proxy designed to be used as a personal
  20. cache or a cache shared among a few users.
  21. @vskip 0pt plus 1fill
  22. @insertcopying
  23. @end titlepage
  24. @contents
  25. @ifnottex
  26. @node Top, Background, (dir), (dir)
  27. @top Polipo
  28. Polipo is a caching web proxy designed to be used as a personal
  29. cache or a cache shared among a few users.
  30. @ifhtml
  31. The latest version of Polipo can be found on
  32. @uref{http://www.pps.univ-paris-diderot.fr/~jch/software/polipo/,the Polipo web page}.
  33. @end ifhtml
  34. This manual was written by
  35. @uref{http://www.pps.univ-paris-diderot.fr/~jch/,,Juliusz Chroboczek}.
  36. @end ifnottex
  37. @menu
  38. * Background:: Background information.
  39. * Running:: Running Polipo
  40. * Network:: Polipo and the network.
  41. * Caching:: Caching.
  42. * Memory usage:: Limiting Polipo's memory usage.
  43. * Copying:: Your rights and mine.
  44. * Variable index:: Variable index.
  45. * Concept index:: Concept index.
  46. @end menu
  47. @node Background, Running, Top, Top
  48. @chapter Background
  49. @menu
  50. * The web:: The web and HTTP.
  51. * Proxies and caches:: Proxies and caches.
  52. * Latency and throughput:: Optimise latency, not throughput.
  53. * Network traffic:: Be nice to the net.
  54. * Partial instances:: Don't discard data.
  55. * POST and PUT:: Other requests
  56. @end menu
  57. @node The web, Proxies and caches, Background, Background
  58. @section The web and HTTP
  59. @cindex URL
  60. @cindex resource
  61. @cindex instance
  62. @cindex entity
  63. @cindex HTTP
  64. The web is a wide-scale decentralised distributed hypertext system,
  65. something that's obviously impossible to achieve reliably.
  66. The web is a collection of @dfn{resources} which are identified by
  67. @dfn{URLs}, strings starting with @code{http://}. At any point in
  68. time, a resource has a certain value, which is called an
  69. @dfn{instance} of the resource.
  70. The fundamental protocol of the web is HTTP, a simple request/response
  71. protocol. With HTTP, a client can make a request for a resource to a
  72. server, and the server replies with an @dfn{entity}, which is an
  73. on-the-wire representation of an instance or of a fragment thereof.
  74. @node Proxies and caches, Latency and throughput, The web, Background
  75. @section Proxies and caches
  76. @cindex proxy
  77. @cindex caching
  78. A proxy is a program that acts as both a client and a server. It
  79. listens for client requests and forwards them to servers, and forwards
  80. the servers' replies to clients.
  81. An HTTP proxy can optimise web traffic away by @dfn{caching} server
  82. replies, storing them in memory in case they are needed again. If a
  83. reply has been cached, a later client request may, under some
  84. conditions, be satisfied without going to the source again.
  85. In addition to taking the shortcuts made possible by caching, proxies
  86. can improve performance by generating better network traffic than the
  87. client applications would do.
  88. Proxies are also useful in ways unrelated to raw performance. A proxy
  89. can be used to contact a server that is not directly accessible to the
  90. client, for example because there is a firewall in the way
  91. (@pxref{Parent proxies}), or because the client and the server use
  92. different lower layer protocols (for example IPv4 and IPv6). Another
  93. common application of proxies is to modify the data sent to servers
  94. and returned to clients, for example by censoring headers that expose
  95. too much about the client's identity (@pxref{Censoring headers}) or
  96. removing advertisements from the data returned by the server
  97. (@pxref{Forbidden}).
  98. Polipo is a caching HTTP proxy that was originally designed as
  99. a @dfn{personal} proxy, i.e.@: a proxy that is used by a single user
  100. or a small group of users. However, it has successfully been used by
  101. larger groups.
  102. @node Latency and throughput, Network traffic, Proxies and caches, Background
  103. @section Latency and throughput
  104. @cindex throughput
  105. @cindex latency
  106. Most network benchmarks consider @dfn{throughput}, or the average
  107. amount of data being pushed around per unit of time. While important
  108. for batch applications (for example benchmarks), average throughput is
  109. mostly irrelevant when it comes to interactive web usage. What is more
  110. important is a transaction's median @dfn{latency}, or whether the data
  111. starts to trickle down before the user gets annoyed.
  112. Typical web caches optimise for throughput --- for example, by
  113. consulting sibling caches before accessing a remote resource. By
  114. doing so, they significantly add to the median latency, and therefore
  115. to the average user frustration.
  116. Polipo was designed to minimise latency.
  117. @node Network traffic, Partial instances, Latency and throughput, Background
  118. @section Network traffic
  119. The web was developed by people who were interested in text processing
  120. rather than in networking and, unsurprisingly enough, the first
  121. versions of the HTTP protocol did not make very good use of network
  122. resources. The main problem in HTTP/0.9 and early versions of
  123. HTTP/1.0 was that a separate TCP connection (``virtual circuit'' for
  124. them telecom people) was created for every entity transferred.
  125. Opening multiple TCP connections has significant performance
  126. implications. Obviously, connection setup and teardown require
  127. additional packet exchanges which increase network usage and, more
  128. importantly, latency.
  129. Less obviously, TCP is not optimised for that sort of usage. TCP aims
  130. to avoid network @dfn{congestion}, a situation in which the network
  131. becomes unusable due to overly aggressive traffic patterns. A correct
  132. TCP implementation will very carefully probe the network at the
  133. beginning of every connection, which means that a TCP connection is
  134. very slow during the first couple of kilobytes transferred, and only
  135. gets up to speed later. Because most HTTP entities are small (in the
  136. 1 to 10 kilobytes range), HTTP/0.9 uses TCP where it is most inefficient.
  137. @menu
  138. * Persistent connections:: Don't shut connections down.
  139. * Pipelining:: Send a bunch of requests at once.
  140. * Poor Mans Multiplexing:: Split requests.
  141. @end menu
  142. @node Persistent connections, Pipelining, Network traffic, Network traffic
  143. @subsection Persistent connections
  144. @cindex persistent connection
  145. @cindex keep-alive connection
  146. Later HTTP versions allow the transfer of multiple entities on a
  147. single connection. A connection that carries multiple entities is
  148. said to be @dfn{persistent} (or sometimes @dfn{keep-alive}).
  149. Unfortunately, persistent connections are an optional feature of HTTP,
  150. even in version 1.1.
  151. Polipo will attempt to use persistent connections on the server side,
  152. and will honour persistent connection requests from clients.
  153. @node Pipelining, Poor Mans Multiplexing, Persistent connections, Network traffic
  154. @subsection Pipelining
  155. @cindex Pipelining
  156. With persistent connections it becomes possible to @dfn{pipeline} or
  157. @dfn{stream} requests, i.e.@: to send multiple requests on a single
  158. connection without waiting for the replies to come back. Because this
  159. technique gets the requests to the server faster, it reduces latency.
  160. Additionally, because multiple requests can often be sent in a single
  161. packet, pipelining reduces network traffic.
  162. Pipelining is a fairly common technique@footnote{The X11 protocol
  163. fundamentally relies on pipelining. NNTP does support pipelining.
  164. SMTP doesn't, while ESMTP makes it an option. FTP does support
  165. pipelining on the control connection.}, but it is not supported by
  166. HTTP/1.0. HTTP/1.1 makes pipelining support compulsory in every
  167. server implementation that can use persistent connections, but there
  168. are a number of buggy servers that claim to implement HTTP/1.1 but
  169. don't support pipelining.
  170. Polipo carefully probes for pipelining support in a server and uses
  171. pipelining if it believes that it is reliable. Polipo also deeply
  172. enjoys being pipelined at by a client@footnote{Other client-side
  173. implementations of HTTP that make use of pipelining include
  174. @uref{http://www.opera.com/,,Opera},
  175. @uref{http://www.mozilla.org,,Mozilla}, APT (the package downloader
  176. used by @uref{http://www.debian.org,,Debian} GNU/Linux) and LFTP.}.
  177. @node Poor Mans Multiplexing, , Pipelining, Network traffic
  178. @subsection Poor Man's Multiplexing
  179. @cindex Poor Man's Multiplexing
  180. @cindex multiplexing
  181. A major weakness of the HTTP protocol is its inability to share a
  182. single connection between multiple simultaneous transactions --- to
  183. @dfn{multiplex} a number of transactions over a single connection. In
  184. HTTP, a client can either request all instances sequentially, which
  185. significantly increases latency, or else open multiple concurrent
  186. connections, with all the problems that this implies
  187. (@pxref{Persistent connections}).
  188. Poor Man's Multiplexing (PMM) is a technique that simulates
  189. multiplexing by requesting an instance in multiple segments; because
  190. the segments are fetched in independent transactions, they can be
  191. interleaved with requests for other resources.
  192. Obviously, PMM only makes sense in the presence of persistent
  193. connections; additionally, it is only effective in the presence of
  194. pipelining (@pxref{Pipelining}).
  195. PMM poses a number of reliability issues. If the resource being
  196. fetched is dynamic, it is quite possible that it will change between
  197. segments; thus, an implementation making use of PMM needs to be able
  198. to switch to full-resource retrieval when it detects a dynamic
  199. resource.
  200. Polipo supports PMM, but it is disabled by default (@pxref{PMM}).
  201. @node Partial instances, POST and PUT, Network traffic, Background
  202. @section Caching partial instances
  203. @cindex partial instance
  204. @cindex range request
  205. A partial instance is an instance that is being cached but only part
  206. of which is available in the local cache. There are three ways in
  207. which partial instances can arise: client applications requesting only
  208. part of an instance (Adobe's Acrobat Reader plugin is famous for
  209. that), a server dropping a connection mid-transfer (because it is
  210. short on resources, or, surprisingly often, because it is buggy), or a
  211. client dropping a connection (usually because the user pressed
  212. @emph{stop}).
  213. When an instance is requested that is only partially cached, it is
  214. possible to request just the missing data by using a feature of HTTP
  215. known as a @dfn{range} request. While support for range requests is
  216. optional, most servers honour them in case of static data (data that
  217. are stored on disk, rather than being generated on the fly e.g.@: by a
  218. CGI script).
  219. Caching partial instances has a number of positive effects. Obviously,
  220. it reduces the amount of data transmitted as the available data
  221. needn't be fetched again. Because it prevents partial data from being
  222. discarded, it makes it reasonable for a proxy to unconditionally abort
  223. a download when requested by the user, and therefore reduces network
  224. traffic.
  225. Polipo caches arbitrary partial instances in its in-memory cache. It
  226. will only store the initial segment of a partial instance (from its
  227. beginning up to its first hole) in its on-disk cache, though. In
  228. either case, it will attempt to use range requests to fetch the
  229. missing data.
  230. @node POST and PUT, , Partial instances, Background
  231. @section Other requests
  232. @cindex GET request
  233. @cindex HEAD request
  234. @cindex PUT request
  235. @cindex POST request
  236. @cindex OPTIONS request
  237. @cindex DELETE request
  238. @cindex PROPFIND request
  239. The previous sections pretend that there is only one kind of request
  240. in HTTP --- the @samp{GET} request. In fact, there are some others.
  241. The @samp{HEAD} request method retrieves data about a resource. Polipo
  242. does not normally use @samp{HEAD}, but will fall back to using it for
  243. validation if it finds that a given server fails to cooperate with its
  244. standard validation methods (@pxref{Cache transparency}). Polipo will
  245. correctly reply to a client's @samp{HEAD} request.
  246. The @samp{POST} method is used to request that the server should do
  247. something rather than merely sending an entity; it is usually used
  248. with HTML forms that have an effect@footnote{HTML forms should use the
  249. @samp{GET} method when the form has no side-effect as this makes the
  250. results cacheable.}. The @samp{PUT} method is used to replace a
  251. resource with a different instance; it is typically used by web
  252. publishing applications.
  253. @samp{POST}, @samp{PUT}, @samp{OPTIONS} and @samp{DELETE} requests are handled by
  254. Polipo pretty much like @samp{GET} and @samp{HEAD}; however, for various
  255. reasons, some precautions must be taken. In particular, any cached data
  256. for the resource they refer to must be discarded, and they can never be
  257. pipelined.
  258. Finally, HTTP/1.1 includes a convenient backdoor with the
  259. @samp{CONNECT} method. For more information, please see
  260. @ref{Tunnelling connections}.
  261. Polipo does not currently handle the more exotic methods such as
  262. @samp{PROPFIND}.
  263. @node Running, Network, Background, Top
  264. @chapter Running Polipo
  265. @menu
  266. * Polipo Invocation:: Starting Polipo.
  267. * Browser configuration:: Configuring your browser.
  268. * Stopping:: Stopping and refreshing Polipo.
  269. * Local server:: The local web server and web interface.
  270. @end menu
  271. @node Polipo Invocation, Browser configuration, Running, Running
  272. @section Starting Polipo
  273. @cindex invocation
  274. By default, Polipo runs as a normal foreground job in a terminal in
  275. which it can log random ``How do you do?'' messages. With the right
  276. configuration options, Polipo can run as a daemon.
  277. Polipo is run with the following command line:
  278. @example
  279. $ polipo [ -h ] [ -v ] [ -x ] [ -c @var{config} ] [ @var{var}=@var{val}... ]
  280. @end example
  281. All flags are optional. The flag @option{-h} causes Polipo to print a
  282. short help message and to quit. The flag @option{-v} causes Polipo to
  283. list all of its configuration variables and quit. The flag
  284. @option{-x} causes Polipo to purge its on-disk cache and then quit
  285. (@pxref{Purging}). The flag @option{-c} specifies the configuration
  286. file to use (by default @file{~/.polipo} or
  287. @file{/etc/polipo/config}). Finally, Polipo's configuration can be
  288. changed on the command line by assigning values to given configuration
  289. variables.
  290. @menu
  291. * Configuring Polipo:: Plenty of options.
  292. * Daemon:: Running in the background.
  293. * Logging:: Funnelling status messages.
  294. @end menu
  295. @node Configuring Polipo, Daemon, Polipo Invocation, Polipo Invocation
  296. @subsection Configuration
  297. @cindex runtime configuration
  298. @cindex variable
  299. @cindex configuration variable
  300. @cindex configuration file
  301. There are a number of variables that you can tweak in order to
  302. configure Polipo, and they should all be described in this manual
  303. (@pxref{Variable index}). You can display the complete, most
  304. up-to-date list of configuration variables by using the @option{-v}
  305. command line flag or by accessing the ``current configuration'' page
  306. of Polipo's web interface (@pxref{Web interface}). Configuration
  307. variables can be set either on the command line or else in the
  308. configuration file given by the @option{-c} command-line flag.
  309. Configuration variables are typed, and @option{-v} will display their
  310. types. The type can be one of the following:
  311. @itemize @bullet
  312. @item
  313. @samp{integer} or @samp{float}: a numeric value;
  314. @item
  315. @samp{boolean}: a truth value, one of @samp{true} or @samp{false};
  316. @item
  317. @samp{tristate}: one of @samp{false}, @samp{maybe} or @samp{true};
  318. @item
  319. @samp{4-state}, one of @samp{false}, @samp{reluctantly},
  320. @samp{happily} or @samp{true};
  321. @item
  322. @samp{5-state}, one of @samp{false}, @samp{reluctantly}, @samp{maybe},
  323. @samp{happily} or @samp{true};
  324. @item
  325. @samp{atom}, a string (written within double quotes @samp{"});
  326. @item
  327. @samp{list}, a comma-separated list of strings;
  328. @item
  329. @samp{intlist}, a comma-separated list of integers and ranges of
  330. integers (of the form `@var{n}--@var{m}').
  331. @end itemize
  332. The configuration file has a very simple syntax. All blank lines are
  333. ignored, as are lines starting with a hash sign @samp{#}. Other lines
  334. must be of the form
  335. @example
  336. @var{var} = @var{val}
  337. @end example
  338. where @var{var} is a variable to set and @var{val} is the value to set
  339. it to.
  340. It is possible to change the configuration of a running Polipo by
  341. using the local configuration interface (@pxref{Web interface}).
  342. @node Daemon, Logging, Configuring Polipo, Polipo Invocation
  343. @subsection Running as a daemon
  344. @cindex daemon
  345. @cindex terminal
  346. @cindex pid
  347. @vindex daemonise
  348. @vindex pidFile
  349. If the configuration variable @code{daemonise} is set to true, Polipo
  350. will run as a daemon: it will fork and detach from its controlling
  351. terminal (if any). The variable @code{daemonise} defaults to false.
  352. When Polipo is run as a daemon, it can be useful to get it to
  353. atomically write its @emph{pid} to a file. If the variable
  354. @code{pidFile} is defined, it should be the name of a file where
  355. Polipo will write its @emph{pid}. If the file already exists when it
  356. is started, Polipo will refuse to run.
  357. @node Logging, , Daemon, Polipo Invocation
  358. @subsection Logging
  359. @cindex logging
  360. @vindex logLevel
  361. @vindex logFile
  362. @vindex logFilePermissions
  363. @vindex logSyslog
  364. @vindex logFacility
  365. @vindex scrubLogs
  366. When it encounters a difficulty, Polipo will print a friendly message.
  367. The location where these messages go is controlled by the
  368. configuration variables @code{logFile} and @code{logSyslog}.
  369. If @code{logSyslog} is @code{true}, error messages go to the system log
  370. facility given by @code{logFacility}. If @code{logFile} is set, it is
  371. the name of a file where all output will accumulate. If @code{logSyslog}
  372. is @code{false} and @code{logFile} is empty, messages go to the error
  373. output of the process (normally the terminal).
  374. The variable @code{logFile} defaults to empty if @code{daemonise} is
  375. false, and to @samp{/var/log/polipo} otherwise. The variable
  376. @code{logSyslog} defaults to @code{false}, and @code{logFacility}
  377. defaults to @samp{user}.
  378. If @code{logFile} is set, then the variable @code{logFilePermissions}
  379. controls the Unix permissions with which the log file will be created if
  380. it doesn't exist. It defaults to 0640.
  381. The amount of logging is controlled by the variable @code{logLevel}.
  382. Please see the file @samp{log.h} in the Polipo sources for the
  383. possible values of @code{logLevel}.
  384. Keeping extensive logs on your users' browsing habits is probably
  385. a severe violation of their privacy. If the variable @code{scrubLogs}
  386. is set, then Polipo will scrub most, if not all, private information
  387. from its logs.
  388. @node Browser configuration, Stopping, Polipo Invocation, Running
  389. @section Configuring your browser
  390. @cindex browser configuration
  391. @cindex user-agent configuration
  392. Telling your user-agent (web browser) to use Polipo is an operation
  393. that depends on the browser. Many user-agents will transparently use
  394. Polipo if the environment variable @samp{http_proxy} points at it;
  395. e.g.@:
  396. @example
  397. $ export http_proxy=http://localhost:8123/
  398. @end example
  399. Netscape Navigator, Mozilla, Mozilla Firefox, KDE's Konqueror and
  400. probably other browsers require that you configure them manually
  401. through their @emph{Preferences} or @emph{Configure} menu.
  402. If your user-agent sports such options, tell it to use persistent
  403. connections when speaking to proxies, to speak HTTP/1.1 and to use
  404. HTTP/1.1 pipelining.
  405. @node Stopping, Local server, Browser configuration, Running
  406. @section Stopping Polipo and getting it to reload
  407. @cindex signals
  408. @cindex shutting down
  409. @cindex stopping
  410. Polipo will shut down cleanly if it receives @code{SIGHUP},
  411. @code{SIGTERM} or @code{SIGINT} signals; this will normally happen
  412. when a Polipo in the foreground receives a @code{^C} key press, when
  413. your system shuts down, or when you use the @code{kill} command with
  414. no flags. Polipo will then write out all its in-memory data to disk
  415. and quit.
  416. If Polipo receives the @code{SIGUSR1} signal, it will write out all
  417. the in-memory data to disk (but won't discard them), reopen the log
  418. file, and then reload the forbidden URLs file (@pxref{Forbidden}).
  419. Finally, if Polipo receives the @code{SIGUSR2} signal, it will write
  420. out all the in-memory data to disk and discard as much of the memory
  421. cache as possible. It will then reopen the log file and reload the
  422. forbidden URLs file.
  423. @node Local server, , Stopping, Running
  424. @section The local web server
  425. @vindex localDocumentRoot
  426. @vindex disableProxy
  427. @cindex web server
  428. @cindex local server
  429. Polipo includes a local web server, which is accessible on the same
  430. port as the one the proxy listens to. Therefore, by default you can
  431. access Polipo's local web server as @samp{http://localhost:8123/}.
  432. The data for the local web server can be configured by setting
  433. @code{localDocumentRoot}, which defaults to
  434. @file{/usr/share/polipo/www/}. Setting this variable to @samp{""}
  435. will disable the local server.
  436. Polipo assumes that the local web tree doesn't change behind its back.
  437. If you change any of the local files, you will need to notify Polipo
  438. by sending it a @code{SIGUSR2} signal (@pxref{Stopping}).
  439. If you use Polipo as a publicly accessible web server, you might want
  440. to set the variable @code{disableProxy}, which will prevent it from
  441. acting as a web proxy. (You will also want to set
  442. @code{disableLocalInterface} (@pxref{Web interface}), and perhaps run
  443. Polipo in a @emph{chroot} jail.)
  444. @menu
  445. * Web interface:: The web interface.
  446. @end menu
  447. @node Web interface, , Local server, Local server
  448. @subsection The web interface
  449. @cindex runtime configuration
  450. @cindex web interface
  451. @vindex disableLocalInterface
  452. @vindex disableConfiguration
  453. @vindex disableServersList
  454. The subtree of the local web space rooted at
  455. @samp{http://localhost:8123/polipo/} is treated specially: URLs under
  456. this root do not correspond to on-disk files, but are generated by
  457. Polipo on-the-fly. We call this subtree Polipo's @dfn{local web
  458. interface}.
  459. The page @samp{http://localhost:8123/polipo/config?} contains the
  460. values of all configuration variables, and allows setting most of them.
  461. The page @samp{http://localhost:8123/polipo/status?} provides a summary
  462. status report about the running Polipo, and allows performing a number
  463. of actions on the proxy, notably flushing the in-memory cache.
  464. The page @samp{http://localhost:8123/polipo/servers?} contains the list
  465. of known servers, and the statistics maintained about them
  466. (@pxref{Server statistics}).
  467. The pages starting with @samp{http://localhost:8123/polipo/index?}
  468. contain indices of the disk cache. For example, the following page
  469. contains the index of the cached pages from the server of some random
  470. company:
  471. @example
  472. http://localhost:8123/polipo/index?http://www.microsoft.com/
  473. @end example
  474. The pages starting with
  475. @samp{http://localhost:8123/polipo/recursive-index?} contain recursive
  476. indices of various servers. This functionality is disabled by
  477. default, and can be enabled by setting the variable
  478. @code{disableIndexing} to @code{false}.
  479. If you have multiple users, you will probably want to disable the
  480. local interface by setting the variable @code{disableLocalInterface}.
  481. You may also selectively control setting of variables, indexing and
  482. listing known servers by setting the variables
  483. @code{disableConfiguration}, @code{disableIndexing} and
  484. @code{disableServersList}.
  485. @node Network, Caching, Running, Top
  486. @chapter Polipo and the network
  487. @menu
  488. * Client connections:: Speaking to clients
  489. * Contacting servers:: Contacting servers.
  490. * HTTP tuning:: Tuning at the HTTP level.
  491. * Offline browsing:: Browsing with poor connectivity.
  492. * Server statistics:: Polipo keeps statistics about servers.
  493. * Server-side behaviour:: Tuning the server-side behaviour.
  494. * PMM:: Poor Man's Multiplexing.
  495. * Forbidden:: You can forbid some URLs.
  496. * DNS:: How Polipo finds hosts.
  497. * Parent proxies:: Fetching data from other proxies.
  498. * Tuning POST and PUT:: Tuning POST and PUT requests.
  499. * Tunnelling connections:: Tunnelling foreign protocols and https.
  500. @end menu
  501. @node Client connections, Contacting servers, Network, Network
  502. @section Client connections
  503. @vindex proxyAddress
  504. @vindex proxyPort
  505. @vindex proxyName
  506. @vindex displayName
  507. @cindex address
  508. @cindex port
  509. @cindex IPv6
  510. @cindex proxy loop
  511. @cindex loop
  512. @cindex proxy name
  513. @cindex via
  514. @cindex loopback address
  515. @cindex security
  516. There are three fundamental values that control how Polipo speaks to
  517. clients. The variable @code{proxyAddress} defines the IP address on
  518. which Polipo will listen; by default, its value is the @dfn{loopback
  519. address} @code{"127.0.0.1"}, meaning that Polipo will listen on the
  520. IPv4 loopback interface (the local host) only. By setting this
  521. variable to a global IP address or to one of the special values
  522. @code{"::"} or @code{"0.0.0.0"}, it is possible to allow Polipo to
  523. serve remote clients. This is likely to be a security hole unless you
  524. set @code{allowedClients} to a reasonable value (@pxref{Access control}).
  525. Note that the type of address that you specify for @code{proxyAddress}
  526. will determine whether Polipo listens to IPv4 or IPv6. Currently, the
  527. only way to have Polipo listen to both protocols is to specify the
  528. IPv6 unspecified address (@code{"::"}) for @code{proxyAddress}.
  529. The variable @code{proxyPort}, by default 8123, defines the TCP port
  530. on which Polipo will listen.
  531. The variable @code{proxyName}, which defaults to the host name of the
  532. machine on which Polipo is running, defines the @dfn{name} of the
  533. proxy. This can be an arbitrary string that should be unique among
  534. all instances of Polipo that you are running. Polipo uses it in error
  535. messages and optionally for detecting proxy loops (by using the
  536. @samp{Via} HTTP header, @pxref{Censoring headers}). Finally, the
  537. @code{displayName} variable specifies the name used in user-visible
  538. error messages (default ``Polipo'').
  539. @menu
  540. * Access control:: Deciding who can connect.
  541. @end menu
  542. @node Access control, , Client connections, Client connections
  543. @subsection Access control
  544. @vindex proxyAddress
  545. @vindex authCredentials
  546. @vindex authRealm
  547. @vindex allowedClients
  548. @cindex access control
  549. @cindex authentication
  550. @cindex loopback address
  551. @cindex security
  552. @cindex username
  553. @cindex password
  554. By making it possible to have Polipo listen on a non-routable address
  555. (for example the loopback address @samp{127.0.0.1}), the variable
  556. @code{proxyAddress} provides a very crude form of @dfn{access
  557. control}: the ability to decide which hosts are allowed to connect.
  558. A finer form of access control can be implemented by specifying
  559. explicitly a number of client addresses or ranges of addresses
  560. (networks) that a client is allowed to connect from. This is done
  561. by setting the variable @code{allowedClients}.
  562. Every entry in @code{allowedClients} can be an IP address, for example
  563. @samp{134.157.168.57} or @samp{::1}. It can also be a network
  564. address, i.e.@: an IP address and the number of bits in the network
  565. prefix, for example @samp{134.157.168.0/24} or
  566. @samp{2001:660:116::/48}. Typical uses of the @code{allowedClients}
  567. variable include
  568. @example
  569. allowedClients = 127.0.0.1, ::1, 134.157.168.0/24, 2001:660:116::/48
  570. @end example
  571. or, for an IPv4-only version of Polipo,
  572. @example
  573. allowedClients = 127.0.0.1, 134.157.168.0/24
  574. @end example
  575. A different form of access control can be implemented by requiring
  576. each client to @dfn{authenticate}, i.e.@: to prove its identity before
  577. connecting. Polipo currently only implements the most insecure form
  578. of authentication, @dfn{HTTP basic authentication}, which sends
  579. usernames and passwords in clear over the network. HTTP basic
  580. authentication is required when the variable @code{authCredentials} is
  581. not null; its value should be of the form @samp{username:password}.
  582. Note that both IP-based authentication and HTTP basic authentication
  583. are insecure: the former is vulnerable to IP address spoofing, the
  584. latter to replay attacks. If you need to access Polipo over the
  585. public Internet, the only secure option is to have it listen over the
  586. loopback interface only and use an ssh tunnel (@pxref{Parent
  587. proxies})@footnote{It is not quite clear to me whether HTTP digest
  588. authentication is worth implementing. On the one hand, if implemented
  589. correctly, it appears to provide secure authentication; on the other
  590. hand, and unlike ssh or SSL, it doesn't make any attempt at ensuring
  591. privacy, and its optional integrity guarantees are impossible to
  592. implement without significantly impairing latency.}.
  593. @node Contacting servers, HTTP tuning, Client connections, Network
  594. @section Contacting servers
  595. @cindex multiple addresses
  596. @cindex IPv6
  597. @vindex useTemporarySourceAddress
  598. @vindex proxyOutgoingAddress
  599. A server can have multiple addresses, for example if it is
  600. @dfn{multihomed} (connected to multiple networks) or if it can speak
  601. both IPv4 and IPv6. Polipo will try all of a host's addresses in turn;
  602. once it has found one that works, it will stick to that address until
  603. it fails again.
  604. If your host has multiple IP addresses, you can specify an IP address
  605. to use for outgoing connections with the @code{proxyOutgoingAddress}
  606. variable. If not specified (the default), it will be determined by
  607. the host OS.
  608. If connecting via IPv6 there is the possibility to use temporary
  609. source addresses to increase privacy (RFC@tie{}3041). The variable
  610. @code{useTemporarySourceAddress} controls the use of temporary
  611. addresses for outgoing connections; if set to @code{true},
  612. temporary addresses are preferred; if set to @code{false}, static addresses
  613. are used; and if set to @code{maybe} (the default), the operating
  614. system default is in effect. This setting is not available
  615. on all operating systems.
  616. @menu
  617. * Allowed ports:: Where the proxy is allowed to connect.
  618. @end menu
  619. @node Allowed ports, , Contacting servers, Contacting servers
  620. @subsection Allowed ports
  621. @cindex Allowed ports
  622. @cindex Forbidden ports
  623. @cindex ports
  624. @vindex allowedPorts
  625. A TCP service is identified not only by the IP address of the machine
  626. it is running on, but also by a small integer, the TCP @dfn{port} it
  627. is @dfn{listening} on. Normally, web servers listen on port 80, but
  628. it is not uncommon to have them listen on different ports; Polipo's
  629. internal web server, for example, listens on port 8123 by default.
  630. The variable @code{allowedPorts} contains the list of ports that
  631. Polipo will accept to connect to on behalf of clients; it defaults to
  632. @samp{80-100, 1024-65535}. Set this variable to @samp{1-65535} if your
  633. clients (and the web pages they consult!) are fully trusted. (The
  634. variable @code{allowedPorts} is not considered for tunnelled
  635. connections; @pxref{Tunnelling connections}).
  636. @node HTTP tuning, Offline browsing, Contacting servers, Network
  637. @section Tuning at the HTTP level
  638. @cindex HTTP
  639. @cindex headers
  640. @menu
  641. * Tuning the HTTP parser:: Tuning parsing of HTTP headers.
  642. * Censoring headers:: Censoring HTTP headers.
  643. * Intermediate proxies:: Adjusting intermediate proxy behaviour.
  644. @end menu
  645. @node Tuning the HTTP parser, Censoring headers, HTTP tuning, HTTP tuning
  646. @subsection Tuning the HTTP parser
  647. @vindex laxHttpParser
  648. @vindex bigBufferSize
  649. As a number of HTTP servers and CGI scripts serve incorrect HTTP
  650. headers, Polipo uses a @emph{lax} parser, meaning that incorrect HTTP
  651. headers will be ignored (a warning will be logged by default). If the
  652. variable @code{laxHttpParser} is not set (it is set by default),
  653. Polipo will use a @emph{strict} parser, and refuse to serve an
  654. instance unless it could parse all the headers.
  655. When the amount of headers exceeds one chunk's worth (@pxref{Chunk
  656. memory}), Polipo will allocate a @dfn{big buffer} in order to store
  657. the headers. The size of big buffers, and therefore the maximum
  658. amount of headers Polipo can parse, is specified by the variable
  659. @code{bigBufferSize} (32@dmn{kB} by default).
  660. @node Censoring headers, Intermediate proxies, Tuning the HTTP parser, HTTP tuning
  661. @subsection Censoring headers
  662. @cindex privacy
  663. @cindex anonymity
  664. @cindex Referer
  665. @cindex cookies
  666. @vindex censorReferer
  667. @vindex censoredHeaders
  668. @vindex proxyName
  669. @vindex disableVia
  670. Polipo offers the option to censor given HTTP headers in both client
  671. requests and server replies. The main application of this feature is
  672. to very slightly improve the user's privacy by eliminating cookies and
  673. some content-negotiation headers.
  674. It is important to understand that these features merely make it
  675. slightly more difficult to gather statistics about the user's
  676. behaviour. While they do not actually prevent such statistics from
  677. being collected, they might make it less cost-effective to do so.
  678. The general mechanism is controlled by the variable
  679. @code{censoredHeaders}, the value of which is a case-insensitive list
  680. of headers to unconditionally censor. By default, it is empty, but
  681. I recommend that you set it to @samp{From, Accept-Language}. Adding
  682. headers such as @samp{Set-Cookie}, @samp{Set-Cookie2}, @samp{Cookie},
  683. @samp{Cookie2} or @samp{User-Agent} to this list will probably break
  684. many web sites.
  685. The case of the @samp{Referer}@footnote{HTTP contains many mistakes
  686. and even one spelling error.} header is treated specially because many
  687. sites will refuse to serve pages when it is not provided. If
  688. @code{censorReferer} is @code{false} (the default), @samp{Referer}
  689. headers are passed unchanged to the server. If @code{censorReferer}
  690. is @code{maybe}, @samp{Referer} headers are passed to the server only
  691. when they refer to the same host as the resource being fetched. If
  692. @code{censorReferer} is @code{true}, all @samp{Referer} headers are
  693. censored. I recommend setting @code{censorReferer} to @code{maybe}.
  694. Another header that can have privacy implications is the @samp{Via}
  695. header, which is used to specify the chain of proxies through which
  696. a given request has passed. Polipo will generate @samp{Via} headers
  697. if the variable @code{disableVia} is @code{false} (it is true by
  698. default). If you choose to generate @samp{Via} headers, you may want
  699. to set the @code{proxyName} variable to some innocuous string
  700. (@pxref{Client connections}).
  701. @menu
  702. * Censor Accept-Language:: Why Accept-Language is evil.
  703. @end menu
  704. @node Censor Accept-Language, , Censoring headers, Censoring headers
  705. @subsubsection Why censor Accept-Language
  706. @cindex negotiation
  707. @cindex content negotiation
  708. @cindex Accept-Language
  709. Recent versions of HTTP include a mechanism known as @dfn{content
  710. negotiation} which allows a user-agent and a server to negotiate the
  711. best representation (instance) for a given resource. For example, a
  712. server that provides both PNG and GIF versions of an image will serve
  713. the PNG version to user-agents that support PNG, and the GIF version
  714. to Internet Explorer.
  715. Content negotiation requires that a client should send with every
  716. single request a number of headers specifying the user's cultural and
  717. technical preferences. Most of these headers do not expose sensitive
  718. information (who cares whether your browser supports PNG?). The
  719. @samp{Accept-Language} header, however, is meant to convey the user's
  720. linguistic preferences. In some cases, this information is sufficient
  721. to pinpoint with great precision the user's origins and even his
  722. political or religious opinions; think, for example, of the
  723. implications of sending @samp{Accept-Language: yi} or @samp{ar_PS}.
  724. At any rate, @samp{Accept-Language} is not useful. Its design is
  725. based on the assumption that language is merely another representation
  726. for the same information, and @samp{Accept-Language} simply carries a
  727. prioritised list of languages, which is not enough to usefully
  728. describe a literate user's preferences. A typical French user, for
  729. example, will prefer an English-language original to a French
  730. (mis-)translation, while still wanting to see French language texts
  731. when they are original. Such a situation cannot be described by the
  732. simple-minded @samp{Accept-Language} header.
  733. @node Intermediate proxies, , Censoring headers, HTTP tuning
  734. @subsection Adjusting intermediate proxy behaviour
  735. @vindex alwaysAddNoTransform
  736. @cindex intermediate proxies
  737. Implementors of intermediate caches (proxies) have found it useful to
  738. convert the media type of certain entity bodies. A non-transparent
  739. proxy might, for example, convert between image formats in order to
  740. save cache space or to reduce the amount of traffic on a slow link.
  741. If @code{alwaysAddNoTransform} is true (it is false by default),
  742. Polipo will add a 'no-transform' cache control directive to all
  743. outgoing requests. This directive forbids (compliant) intermediate
  744. caches from responding with an object that was compressed or
  745. transformed in any way.
  746. @node Offline browsing, Server statistics, HTTP tuning, Network
  747. @section Offline browsing
  748. @vindex proxyOffline
  749. @cindex offline browsing
  750. @cindex browsing offline
  751. @cindex connectivity
  752. @cindex warning
  753. @cindex shift-click
  754. In an ideal world, all machines would have perfect connectivity to the
  755. network at all times and servers would never crash. In the real
  756. world, it may be necessary to avoid hitting the network and have
  757. Polipo serve stale objects from its cache.
  758. Setting @code{proxyOffline} to @code{true} prevents Polipo from
  759. contacting remote servers, no matter what. This setting is suitable
  760. when you have no network connection whatsoever.
  761. If @code{proxyOffline} is false, Polipo's caching behaviour is
  762. controlled by a number of variables documented in @ref{Tweaking validation}.
  763. @node Server statistics, Server-side behaviour, Offline browsing, Network
  764. @section Server statistics
  765. @vindex serverExpireTime
  766. @cindex server statistics
  767. @cindex round-trip time
  768. @cindex transfer rate
  769. In order to decide when to pipeline requests (@pxref{Pipelining}) and
  770. whether to perform Poor Man's Multiplexing
  771. (@pxref{Poor Mans Multiplexing}), Polipo needs to keep statistics
  772. about servers. These include the server's ability to handle
  773. persistent connections, the server's ability to handle pipelined
  774. requests, the round-trip time to the server, and the server's transfer
  775. rate. The statistics are accessible from Polipo's web interface
  776. (@pxref{Web interface}).
  777. The variable @code{serverExpireTime} (default 1 day) specifies how
  778. long such information remains valid. If a server has not been
  779. accessed for a time interval of at least @code{serverExpireTime},
  780. information about it will be discarded.
  781. As Polipo will eventually recover from incorrect information about a
  782. server, this value can be made fairly large. The reason why it exists
  783. at all is to limit the amount of memory used up by information about
  784. servers.
  785. @node Server-side behaviour, PMM, Server statistics, Network
  786. @section Tweaking server-side behaviour
  787. @vindex serverSlots
  788. @vindex serverSlots1
  789. @vindex serverMaxSlots
  790. @vindex smallRequestTime
  791. @vindex replyUnpipelineTime
  792. @vindex replyUnpipelineSize
  793. @vindex maxPipelineTrain
  794. @vindex pipelineAdditionalRequests
  795. @vindex maxSideBuffering
  796. @cindex small request
  797. @cindex large request
  798. @cindex breaking pipelines
  799. The most important piece of information about a server is whether it
  800. supports persistent connections. If this is the case, Polipo will
  801. open at most @code{serverSlots} connections to that server
  802. (@code{serverSlots1} if the server only implements HTTP/1.0), and
  803. attempt to pipeline; if not, Polipo will hit the server harder,
  804. opening up to @code{serverMaxSlots} connections.
  805. Another use of server information is to decide whether to pipeline
  806. additional requests on a connection that already has in-flight
  807. requests. This is controlled by the variable
  808. @code{pipelineAdditionalRequests}; if it is @code{false}, no
  809. additional requests will be pipelined. If it is @code{true},
  810. additional requests will be pipelined whenever possible. If it is
  811. @code{maybe} (the default), additional requests will only be pipelined
  812. following @dfn{small} requests, where a small request is one whose
  813. download is estimated to take no more than @code{smallRequestTime}
  814. (default 5@dmn{s}).
  815. Sometimes, a request has been pipelined after a request that prompts a
  816. very large reply from the server; when that happens, the pipeline
  817. needs to be broken in order to reduce latency. A reply is @dfn{large}
  818. and will cause a pipeline to be broken if either its size is at least
  819. @code{replyUnpipelineSize} (default one megabyte) or else the server's
  820. transfer rate is known and the body is expected to take at least
  821. @code{replyUnpipelineTime} to download (default 15@dmn{s}).
  822. The variable @code{maxPipelineTrain} defines the maximum number of
  823. requests that will be pipelined in a single write (default 10).
  824. Setting this variable to a very low value might (or might not) fix
  825. interaction with some unreliable servers that the normal heuristics
  826. are unable to detect.
  827. The variable @code{maxSideBuffering} specifies how much data will be
  828. buffered in a PUT or POST request; it defaults to 1500 bytes. Setting
  829. this variable to 0 may cause some media players that abuse the HTTP
  830. protocol to work.
  831. @node PMM, Forbidden, Server-side behaviour, Network
  832. @section Poor Man's Multiplexing
  833. @cindex Poor Man's Multiplexing
  834. @cindex multiplexing
  835. @vindex pmmSize
  836. @vindex pmmFirstSize
  837. By default, Polipo does not use Poor Man's Multiplexing (@pxref{Poor
  838. Mans Multiplexing}). If the variable @code{pmmSize} is set to a
  839. positive value, Polipo will use PMM when speaking to servers that are
  840. known to support pipelining. It will request resources by segments of
  841. @code{pmmSize} bytes. The first segment requested has a size of
  842. @code{pmmFirstSize}, which defaults to twice @code{pmmSize}.
  843. PMM is an intrinsically unreliable technique. Polipo makes heroic
  844. efforts to make it at least usable, requesting that the server disable
  845. PMM when not useful (by using the @samp{If-Range} header) and
  846. disabling it on its own if a resource turns out to be dynamic.
  847. Notwithstanding these precautions, unless the server
  848. cooperates@footnote{More precisely, unless CGI scripts cooperate.},
  849. you will see failures when using PMM, which will usually result in
  850. blank pages and broken image icons; hitting @emph{Reload} on your
  851. browser will usually cause Polipo to notice that something went wrong
  852. and correct the problem.
  853. @node Forbidden, DNS, PMM, Network
  854. @section Forbidden and redirected URLs
  855. @cindex forbidden
  856. @cindex redirect
  857. @cindex web counter
  858. @cindex counter
  859. @cindex web bug
  860. @cindex bug
  861. @cindex advertisement
  862. @cindex web ad
  863. @cindex banner ad
  864. The web contains advertisements that a user-agent is supposed to
  865. download together with the requested pages. Not only do
  866. advertisements pollute the user's brain, pushing them around takes
  867. time and uses up network bandwidth.
  868. Many so-called content providers also track user activities by using
  869. @dfn{web bugs}, tiny embedded images that cause a server to log where
  870. they are requested from. Such images can be detected because they are
  871. usually uncacheable (@pxref{Cache transparency}) and therefore logged
  872. by Polipo by default.
  873. Polipo can be configured to prevent certain URLs from reaching the
  874. browser, either by returning a @emph{forbidden} error message to the
  875. user, or by @emph{redirecting} such URLs to some other URL.
  876. Some content providers attempt to subvert content filtering as well as
  877. malware scans by tunnelling their questionable content as https or other
  878. encrypted protocols. Other content providers are so clueless as to inject
  879. content from external providers into supposedly safe webpages.
  880. Polipo therefore has the ability to selectively block tunnelled connections
  881. based on hostname and port information.
  882. @menu
  883. * Internal forbidden list:: Specifying forbidden URLs.
  884. * External redirectors:: Using an external redirector.
  885. * Forbidden Tunnels:: Specifying hosts forbidden for tunnelling.
  886. @end menu
  887. @node Internal forbidden list, External redirectors, Forbidden, Forbidden
  888. @subsection Internal forbidden list
  889. @cindex forbidden
  890. @cindex redirect
  891. @vindex forbiddenFile
  892. @vindex forbiddenUrl
  893. @vindex forbiddenRedirectCode
  894. The file pointed at by the variable @code{forbiddenFile} (defaults to
  895. @file{~/.polipo-forbidden} or @file{/etc/polipo/forbidden}, whichever
  896. exists) specifies the set of URLs that should never be fetched. If
  897. @code{forbiddenFile} is a directory, it will be recursively searched
  898. for files with forbidden URLs.
  899. Every line in a file listing forbidden URLs can either be a domain
  900. name --- a string that doesn't contain any of @samp{/}, @samp{*} or
  901. @samp{\} ---, or a POSIX extended regular expression. Blank lines are
  902. ignored, as are those that start with a hash sign @samp{#}.
  903. By default, whenever it attempts to fetch a forbidden URL, the browser
  904. will receive a @emph{403 forbidden} error from Polipo. Some users
  905. prefer to have the browser display a different page or an image.
  906. If @code{forbiddenUrl} is not null, it should represent a URL to which
  907. all forbidden URLs will be redirected. The kind of redirection used
  908. is specified by @code{forbiddenRedirectCode}; if this is 302 (the
  909. default) the redirection will be marked as temporary, if 301 it will
  910. be a permanent one.
  911. @node External redirectors, Forbidden Tunnels, Internal forbidden list, Forbidden
  912. @subsection External redirectors
  913. @cindex forbidden
  914. @cindex redirect
  915. @cindex redirector
  916. @cindex Squid-style redirector
  917. @cindex Adzapper
  918. @vindex redirector
  919. @vindex redirectorRedirectCode
  920. Polipo can also use an external process (a @dfn{Squid-style
  921. redirector}) to determine which URLs should be redirected. The name
  922. of the redirector binary is determined from the variable
  923. @code{redirector}, and the kind of redirection generated is specified
  924. by @code{redirectorRedirectCode}, which should be 302 (the default) or
  925. 301.
  926. For example, to use Adzapper to redirect ads to an innocuous image, just set
  927. @example
  928. redirector = /usr/bin/adzapper
  929. @end example
  930. @node Forbidden Tunnels, , External redirectors, Forbidden
  931. @subsection Forbidden Tunnels
  932. By default, Polipo allows tunnelled connections
  933. (@pxref{Tunnelling connections}); however, it is sometimes desirable to
  934. block connections selectively.
  935. Because Polipo merely passes tunnelled connections through, filtering is
  936. possible based on hostname and port information only. Filtering based on
  937. protocol-specific information such as the pathname is not possible.
  938. Obviously the web browser (and other software) must be configured to use
  939. Polipo as a tunnelling proxy for this to work. The tunnelled traffic is neither
  940. touched nor inspected in any way by Polipo, thus encryption, certification
  941. and all other security and integrity guarantees implemented in the browser
  942. are not in any way affected.
  943. The file pointed at by the variable @code{forbiddenTunnelsFile} (defaults to
  944. @file{~/.polipo-forbiddenTunnels} or @file{/etc/polipo/forbiddenTunnels},
  945. whichever exists) specifies the set of tunnel specifications that should
  946. be blocked.
  947. Every line in a file listing forbidden Tunnels can either be a domain
  948. name --- a string that doesn't contain any of @samp{/}, @samp{*} or
  949. @samp{\} ---, or a POSIX extended regular expression. Blank lines are
  950. ignored, as are those that start with a hash sign @samp{#}.
  951. Entries in the form of regular expressions will be matched against
  952. tunnel requests of the form @code{hostname:portnumber}.
  953. Tunnelled and blocked connections will be logged if the configuration variable
  954. @code{logLevel} is set to a value such that @code{((logLevel & 0x80) != 0)}.
  955. Example @code{forbiddenTunnelsFile}:
  956. @example
  957. # simple case, exact match of hostnames
  958. www.massfuel.com
  959. # match hostname against regexp
  960. \.hitbox\.
  961. # match hostname and port against regexp
  962. # this will block tunnels to example.com but also www.example.com
  963. # for ports in the range 600-999
  964. # Also watch for effects of 'tunnelAllowedPorts'
  965. example.com\:[6-9][0-9][0-9]
  966. # random examples
  967. \.liveperson\.
  968. \.atdmt\.com
  969. .*doubleclick\.net
  970. .*webtrekk\.de
  971. ^count\..*
  972. .*\.offerstrategy\.com
  973. .*\.ivwbox\.de
  974. .*adwords.*
  975. .*\.sitestat\.com
  976. \.xiti\.com
  977. webtrekk\..*
  978. @end example
  979. @node DNS, Parent proxies, Forbidden, Network
  980. @section The domain name service
  981. @cindex DNS
  982. @cindex name server
  983. @cindex gethostbyname
  984. @cindex resolver
  985. @cindex IPv6
  986. @vindex dnsMaxTimeout
  987. @vindex dnsUseGethostbyname
  988. @vindex dnsNameServer
  989. @vindex dnsNameServerPort
  990. @vindex dnsNegativeTtl
  991. @vindex dnsGethostbynameTtl
  992. @vindex dnsQueryIPv6
  993. The low-level protocols beneath HTTP identify machines by IP
  994. addresses, sequences of four 8-bit integers such as
  995. @samp{199.232.41.10}@footnote{Or sequences of eight 16-bit integers if
  996. you are running IPv6.}. HTTP, on the other hand, and most application
  997. protocols, manipulate host names, strings such as @samp{www.polipo.org}.
  998. The @dfn{domain name service} (DNS) is a distributed database that
  999. maps host names to IP addresses. When an application wants to make
  1000. use of the DNS, it invokes a @dfn{resolver}, a local library or
  1001. process that contacts remote name servers.
  1002. Polipo usually tries to speak the DNS protocol itself rather than
  1003. using the system resolver@footnote{The Unix interface to the resolver
  1004. is provided by the @code{gethostbyname}(3) library call
  1005. (@code{getaddrinfo}(3) on recent systems), which was designed at
  1006. a time when a host lookup consisted in searching for one of five hosts
  1007. in a @samp{HOSTS.TXT} file. The @code{gethostbyname} call is
  1008. @dfn{blocking}, meaning that all activity must cease while a host
  1009. lookup is in progress. When the call eventually returns, it doesn't
  1010. provide a @dfn{time to live} (TTL) value to indicate how long the
  1011. address may be cached. For these reasons, @code{gethostbyname} is
  1012. hardly useful for programs that need to contact more than a few hosts.
  1013. (Recent systems replace @code{gethostbyname}(3) by
  1014. @code{getaddrinfo}(3), which is reentrant. While this removes one
  1015. important problem that multi-threaded programs encounter, it doesn't
  1016. solve any of the other issues with @code{gethostbyname}.)}. Its
  1017. precise behaviour is controlled by the value of
  1018. @code{dnsUseGethostbyname}. If @code{dnsUseGethostbyname} is
  1019. @code{false}, Polipo never uses the system resolver. If it is
  1020. @code{reluctantly} (the default), Polipo tries to speak DNS and falls
  1021. back to the system resolver if a name server could not be contacted.
  1022. If it is @code{happily}, Polipo tries to speak DNS, and falls back to
  1023. the system resolver if the host couldn't be found for any reason (this
  1024. is not a good idea for shared proxies). Finally, if
  1025. @code{dnsUseGethostbyname} is @code{true}, Polipo never tries to speak
  1026. DNS itself and uses the system resolver straight away (this is not
  1027. recommended).
  1028. If the internal DNS support is used, Polipo must be given a recursive
  1029. name server to speak to. By default, this information is taken from
  1030. the @samp{/etc/resolv.conf} file at startup; however, if you wish to use
  1031. a different name server, you may set the @code{dnsNameServer} and
  1032. optionally @code{dnsNameServerPort} variables to an IP address and port
  1033. number of a listening DNS server@footnote{While Polipo does its own
  1034. caching of DNS data, I recommend that you run a local caching name server.
  1035. I am very happy with @uref{http://www.thekelleys.org.uk/dnsmasq/doc.html,,@code{dnsmasq}}.}.
  1036. When the reply to a DNS request is late to come, Polipo will retry
  1037. multiple times using an exponentially increasing timeout. The maximum
  1038. timeout used before Polipo gives up is defined by @code{dnsMaxTimeout}
  1039. (default 60@dmn{s}); the total time before Polipo gives up on a DNS
  1040. query will be roughly twice @code{dnsMaxTimeout}.
  1041. The variable @code{dnsNegativeTtl} specifies the time during which
  1042. negative DNS information (information that a host @emph{doesn't}
  1043. exist) will be cached; this defaults to 120@dmn{s}. Increasing this
  1044. value reduces both latency and network traffic but may cause a failed
  1045. host not to be noticed when it comes back up.
  1046. The variable @code{dnsQueryIPv6} specifies whether to query for IPv4
  1047. or IPv6 addresses. If @code{dnsQueryIPv6} is @code{false}, only IPv4
  1048. addresses are queried. If @code{dnsQueryIPv6} is @code{reluctantly},
  1049. both types of addresses are queried, but IPv4 addresses are preferred.
  1050. If @code{dnsQueryIPv6} is @code{happily} (the default), IPv6 addresses
  1051. are preferred. Finally, if @code{dnsQueryIPv6} is @code{true}, only
  1052. IPv6 addresses are queried.
  1053. If the system resolver is used, the value @code{dnsGethostbynameTtl}
  1054. specifies the time during which a @code{gethostbyname} reply will be
  1055. cached (default 5 minutes).
  1056. @node Parent proxies, Tuning POST and PUT, DNS, Network
  1057. @section Parent proxies
  1058. Polipo will usually fetch instances directly from source servers as
  1059. this configuration minimises latency. In some cases, however, it may
  1060. be useful to have Polipo fetch instances from a @dfn{parent} proxy.
  1061. Polipo can use two protocols to speak to a parent proxy: HTTP and
  1062. SOCKS. When configured to use both HTTP and SOCKS proxying, Polipo
  1063. will contact an HTTP proxy over SOCKS --- in other words, SOCKS is
  1064. considered as being at a lower (sub)layer than HTTP.
  1065. @menu
  1066. * HTTP parent proxies:: Using an HTTP parent proxy.
  1067. * SOCKS parent proxies:: Using a SOCKS parent proxy.
  1068. @end menu
  1069. @node HTTP parent proxies, SOCKS parent proxies, Parent proxies, Parent proxies
  1070. @subsection HTTP parent proxies
  1071. @vindex parentProxy
  1072. @vindex parentAuthCredentials
  1073. @cindex parent proxy
  1074. @cindex upstream proxy
  1075. @cindex firewall
  1076. @cindex authentication
  1077. The variable @code{parentProxy} specifies the hostname and port number
  1078. of an HTTP parent proxy; it should have the form @samp{host:port}.
  1079. If the parent proxy requires authorisation, the username and password
  1080. should be specified in the variable @code{parentAuthCredentials} in
  1081. the form @samp{username:password}. Only @emph{Basic} authentication
  1082. is supported, which is vulnerable to replay attacks.
  1083. The main application of the parent proxy support is to cross
  1084. firewalls. Given a machine, say @code{trurl}, with unrestricted
  1085. access to the web, the following evades a firewall by using an
  1086. encrypted compressed @code{ssh} link:
  1087. @example
  1088. $ ssh -f -C -L 8124:localhost:8123 trurl polipo
  1089. $ polipo parentProxy=localhost:8124
  1090. @end example
  1091. @node SOCKS parent proxies, , HTTP parent proxies, Parent proxies
  1092. @subsection SOCKS parent proxies
  1093. @cindex SOCKS
  1094. @vindex socksParentProxy
  1095. @vindex socksAuthCredentials
  1096. @vindex socksProxyType
  1097. The variable @code{socksParentProxy} specifies the hostname and port
  1098. number of a SOCKS parent proxy; it should have the form
  1099. @samp{host:port}. The variant of the SOCKS protocol being used is
  1100. defined by @code{socksProxyType}, which can be either @samp{socks4a}
  1101. or @samp{socks5}; the latter value specifies ``SOCKS5 with
  1102. hostnames'', and is the default.
  1103. The variable @code{socksAuthCredentials} can be used if your SOCKS
  1104. proxy requires authentication. For SOCKS4 and 4a, it is just
  1105. a username; for SOCKS5 it is of the form @samp{username:password}.
  1106. The main application of the SOCKS support is to use
  1107. @uref{http://tor.eff.org,,Tor} to evade overly restrictive or
  1108. misconfigured firewalls. Assuming you have a Tor client running on
  1109. the local host listening on the default port (9050), the following
  1110. uses Tor for all outgoing HTTP traffic:
  1111. @example
  1112. $ polipo socksParentProxy=localhost:9050
  1113. @end example
  1114. @node Tuning POST and PUT, Tunnelling connections, Parent proxies, Network
  1115. @section Tuning POST and PUT requests
  1116. @cindex POST request
  1117. @cindex PUT request
  1118. @vindex expectContinue
  1119. The main assumption behind the design of the HTTP protocol is that
  1120. requests are idempotent: since a request can be repeated by a client,
  1121. a server is allowed to drop a connection at any time. This fact, more
  1122. than anything else, explains the amazing scalability of the protocol.
  1123. This assumption breaks down in the case of POST requests. Indeed, a
  1124. POST request usually causes some action to be performed (a page to be
  1125. printed, a significant amount of money to be transferred from your
  1126. bank account, or, in Florida, a vote to be registered), and such a
  1127. request should not be repeated.
  1128. The only solution to this problem is to reserve HTTP for idempotent
  1129. activities, and use reliable protocols for action-effecting ones.
  1130. Notwithstanding that, HTTP/1.1 makes a weak attempt at making POST
  1131. requests slightly more reliable and efficient than they are in
  1132. HTTP/1.0.
  1133. When speaking to an HTTP/1.1 server, an HTTP client is allowed to
  1134. request that the server check @emph{a priori} whether it intends to
  1135. honour a POST request. This is done by sending @dfn{an expectation},
  1136. a specific header with the request, @samp{Expect: 100-continue}, and
  1137. waiting for either an error message or a @samp{100 Continue} reply
  1138. from the server. If the latter arrives, the client is welcome to send
  1139. the rest of the POST request@footnote{This, of course, is only part of
  1140. the story. Additionally, the server is not required to reply with
  1141. @samp{100 Continue}, hence the client must implement a timeout.
  1142. Furthermore, according to the obsolete RFC2068, the server is
  1143. allowed to spontaneously send @samp{100 Continue}, so the client must
  1144. be prepared to ignore such a reply at any time.}.
  1145. Polipo's behaviour w.r.t.@: client expectations is controlled by the
  1146. variable @code{expectContinue}. If this variable is false, Polipo
  1147. will never send an expectation to the server; if a client sends an
  1148. expectation, Polipo will fail the expectation straight away, causing
  1149. the client (if correctly implemented) to retry with no expectation.
  1150. If @code{expectContinue} is @code{maybe} (the default), Polipo will
  1151. behave in a standards-compliant manner: it will forward expectations
  1152. to the server when allowed to do so, and fail client expectations
  1153. otherwise. Finally, if @code{expectContinue} is @code{true}, Polipo
  1154. will always send expectations when it is reasonable to do so; this
  1155. violates the relevant standards and will break some websites, but
  1156. might decrease network traffic under some circumstances.
  1157. @node Tunnelling connections, , Tuning POST and PUT, Network
  1158. @section Tunnelling connections
  1159. @cindex tunnel
  1160. @cindex tunnelling proxy
  1161. @cindex https
  1162. @cindex HTTP/SSL
  1163. @cindex rsync
  1164. @cindex CONNECT
  1165. @vindex tunnelAllowedPorts
  1166. Polipo is an HTTP proxy; it proxies HTTP traffic, and clients using
  1167. other protocols should either establish a direct connection to the
  1168. server or use an @emph{ad hoc} proxy.
  1169. In many circumstances, however, it is not possible to establish
  1170. a direct connection to the server, for example due to mis-configured
  1171. firewalls or when trying to access the IPv4 Internet from an IPv6-only
  1172. host. In such situations, it is possible to have Polipo behave as
  1173. a @emph{tunnelling} proxy --- a proxy that merely forwards traffic
  1174. between the client and the server without understanding it. Polipo
  1175. enters tunnel mode when the client requests it by using the HTTP
  1176. @samp{CONNECT} method.
  1177. Most web browsers will use this technique for HTTP over SSL if
  1178. configured to use Polipo as their `https proxy'. More generally, the
  1179. author has successfully used it to cross mis-configured firewalls
  1180. using OpenSSH, rsync, Jabber, IRC, etc.
  1181. The variable @code{tunnelAllowedPorts} specifies the set of ports to
  1182. which Polipo will agree to tunnel traffic. It defaults to allowing ssh,
  1183. HTTP, https, rsync, IMAP, imaps, POP, pops, Jabber, CVS and Git traffic.
  1184. It is possible to selectively block tunnelled connections
  1185. (@pxref{Forbidden Tunnels}).
  1186. @node Caching, Memory usage, Network, Top
  1187. @chapter Caching
  1188. @menu
  1189. * Cache transparency:: Fresh and stale data.
  1190. * Memory cache:: The in-memory cache.
  1191. * Disk cache:: The on-disk cache.
  1192. @end menu
  1193. @node Cache transparency, Memory cache, Caching, Caching
  1194. @section Cache transparency and validation
  1195. @cindex transparent cache
  1196. @cindex cache transparency
  1197. @cindex out-of-date instances
  1198. @cindex validation
  1199. @cindex revalidation
  1200. @cindex expire
  1201. @cindex stale
  1202. @cindex fresh
  1203. If resources on a server change, it is possible for a cached instance
  1204. to become out-of-date. Ideally, a cache would be perfectly
  1205. @dfn{transparent}, meaning that it never serves an out-of-date
  1206. instance; in a universe with a finite speed of signal propagation,
  1207. however, this ideal is impossible to achieve.
  1208. If a caching proxy decides that a cached instance is new enough to
  1209. likely still be valid, it will directly serve the instance to the
  1210. client; we then say that the cache decided that the instance is
  1211. @dfn{fresh}. When an instance is @dfn{stale} (not fresh), the cache
  1212. will check with the upstream server whether the resource has changed;
  1213. we say that the cached instance is being @dfn{revalidated}.
  1214. In HTTP/1.1, responsibility for revalidation is shared between the
  1215. client, the server and the proxy itself. The client can override
  1216. revalidation policy by using the @samp{Cache-Control}
  1217. header@footnote{Or the obsolete @samp{Pragma} header.}; for example,
  1218. some user-agents will request end-to-end revalidation in this way when
  1219. the user shift-clicks on @emph{reload}. The server may choose to
  1220. specify revalidation policy by using the @samp{Expires} and
  1221. @samp{Cache-Control} headers. As to the proxy, it needs to choose a
  1222. revalidation policy for instances with neither server- nor client-side
  1223. cache control information. Of course, nothing (except the HTTP/1.1
  1224. spec, but that is easily ignored) prevents a proxy from overriding the
  1225. client's and server's cache control directives.
  1226. @menu
  1227. * Tuning validation:: Tuning Polipo's validation behaviour.
  1228. * Tweaking validation:: Further tweaking of validation.
  1229. @end menu
  1230. @node Tuning validation, Tweaking validation, Cache transparency, Cache transparency
  1231. @subsection Tuning validation behaviour
  1232. @cindex age
  1233. @vindex maxAge
  1234. @vindex maxAgeFraction
  1235. @vindex maxExpiresAge
  1236. @vindex maxNoModifiedAge
  1237. Polipo's revalidation behaviour is controlled by a number of
  1238. variables. In the following, a resource's @dfn{age} is the time since
  1239. it was last validated, either because it was fetched from the server
  1240. or because it was revalidated.
  1241. The policy defining when cached instances become stale in the absence
  1242. of server-provided information is controlled by the variables
  1243. @code{maxAge}, @code{maxAgeFraction}, @code{maxExpiresAge} and
  1244. @code{maxNoModifiedAge}. If an instance has an @samp{Expires} header,
  1245. it becomes stale at the date given by that header, or when its age
  1246. becomes larger than @code{maxExpiresAge}, whichever happens first. If
  1247. an instance has no @samp{Expires} header but has a @samp{Last-Modified}
  1248. header, it becomes stale when its age reaches either
  1249. @code{maxAgeFraction} of the time since it was last modified or else
  1250. the absolute value @code{maxAge}, whichever happens first. Finally,
  1251. if an instance has neither @samp{Expires} nor @samp{Last-Modified}, it
  1252. will become stale when its age reaches @code{maxNoModifiedAge}.
  1253. @node Tweaking validation, , Tuning validation, Cache transparency
  1254. @subsection Further tweaking of validation behaviour
  1255. @cindex uncachable
  1256. @cindex vary
  1257. @vindex cacheIsShared
  1258. @vindex mindlesslyCacheVary
  1259. @vindex uncachableFile
  1260. @vindex dontCacheCookies
  1261. @vindex dontCacheRedirects
  1262. @vindex dontTrustVaryETag
  1263. If @code{cacheIsShared} is false (it is true by default), Polipo will
  1264. ignore the server-side @samp{Cache-Control} directives @samp{private},
  1265. @samp{s-maxage} and @samp{proxy-revalidate}. This is highly
  1266. desirable behaviour when the proxy is used by just one user, but might
  1267. break some sites if the proxy is shared.
  1268. When connectivity is very poor, the variable @code{relaxTransparency}
  1269. can be used to cause Polipo to serve stale instances under some
  1270. circumstances. If @code{relaxTransparency} is @code{false} (the
  1271. default), all stale instances are validated (@pxref{Cache
  1272. transparency}), and failures to connect are reported to the client.
  1273. This is the default mode of operation of most other proxies, and the
  1274. least likely to surprise the user.
  1275. If @code{relaxTransparency} is @code{maybe}, all stale instances are
  1276. still validated, but a failure to connect is only reported as an error
  1277. if no data is available in the cache. If a connection fails and stale
  1278. data is available, it is served to the client with a suitable HTTP/1.1
  1279. @samp{Warning} header. Current user-agents do not provide visible
  1280. indication of such warnings, however, and this setting will typically
  1281. cause the browser to display stale data with no indication that
  1282. anything went wrong. It is useful when you are consulting a live web
  1283. site but don't want to be bothered with failed revalidations.
  1284. If @code{relaxTransparency} is @code{true}, missing data is fetched
  1285. from remote servers, but stale data is unconditionally served with no
  1286. validation. Client-side @samp{Cache-Control} directives are still
  1287. honoured, which means that you can force an end-to-end revalidation
  1288. from the browser's interface (typically by shift-clicking on
  1289. ``reload''). This setting is only useful if you have very bad network
  1290. connectivity or are consulting a very slow web site or one that
  1291. provides incorrect cache control information@footnote{This is for
  1292. example the case of @code{www.microsoft.com}, and also of websites
  1293. generated by a popular Free content management system written in
  1294. Python.} and are willing to manually revalidate pages that you suspect
  1295. are stale.
  1296. If @code{mindlesslyCacheVary} is true, the presence of a @samp{Vary}
  1297. header (which indicates that content-negotiation occurred,
  1298. @pxref{Censor Accept-Language}) is ignored, and cached negotiated
  1299. instances are mindlessly returned to the client. If it is false (the
  1300. default), negotiated instances are revalidated on every client
  1301. request.
  1302. Unfortunately, a number of servers (most notably some versions of
  1303. Apache's @code{mod_deflate} module) send objects with an @samp{ETag}
  1304. header that will confuse Polipo in the presence of a @samp{Vary}
  1305. header. Polipo will make a reasonable check for consistency if
  1306. @code{dontTrustVaryETag} is set to @samp{maybe} (the default); it will
  1307. systematically ignore @samp{ETag} headers on objects with @samp{Vary}
  1308. headers if it is set to @samp{true}.
  1309. A number of websites incorrectly mark variable resources as cachable;
  1310. such issues can be worked around in Polipo by manually marking given
  1311. categories of objects as uncachable. If @code{dontCacheCookies} is
  1312. true, all pages carrying HTTP cookies will be treated as uncachable.
  1313. If @code{dontCacheRedirects} is true, all redirects (301 and 302) will
  1314. be treated as uncachable. Finally, if everything else fails, a list
  1315. of uncachable URLs can be given in the file specified by
  1316. @code{uncachableFile}, which has the same format as the
  1317. @code{forbiddenFile} (@pxref{Internal forbidden list}). If not
  1318. specified, its location defaults to @samp{~/.polipo-uncachable} or
  1319. @samp{/etc/polipo/uncachable}, whichever exists.
  1320. @node Memory cache, Disk cache, Cache transparency, Caching
  1321. @section The in-memory cache
  1322. The in-memory cache consists of a list of HTTP and DNS objects
  1323. maintained in least-recently-used order. An index to the in-memory
  1324. cache is maintained as a (closed) hash table.
  1325. When the in-memory cache grows beyond a certain size (controlled by a
  1326. number of variables, @pxref{Memory usage}), or when a hash table
  1327. collision occurs, resources are written out to disk.
  1328. @node Disk cache, , Memory cache, Caching
  1329. @section The on-disk cache
  1330. @cindex filesystem
  1331. @cindex NFS
  1332. @vindex diskCacheRoot
  1333. @vindex maxDiskEntries
  1334. @vindex diskCacheWriteoutOnClose
  1335. @vindex diskCacheFilePermissions
  1336. @vindex diskCacheDirectoryPermissions
  1337. @vindex maxDiskCacheEntrySize
  1338. The on-disk cache consists of a filesystem subtree rooted at
  1339. a location defined by the variable @code{diskCacheRoot}, by default
  1340. @code{"/var/cache/polipo/"}. This directory should normally be
  1341. writeable, readable and seekable by the user running Polipo. While it
  1342. is best to use a local filesystem for the on-disk cache, an NFSv3- or
  1343. AFS-mounted filesystem should be safe in most implementations. Do not
  1344. use NFSv2, as it will cause cache corruption@footnote{Polipo assumes
  1345. that @samp{open(O_CREAT | O_EXCL)} works reliably.}.
  1346. If @code{diskCacheRoot} is an empty string, no disk cache is used.
  1347. The value @code{maxDiskEntries} (32 by default) is the absolute
  1348. maximum number of file descriptors held open for on-disk objects. When this
  1349. limit is reached, Polipo will close descriptors on
  1350. a least-recently-used basis. This value should be set to be slightly
  1351. larger than the number of resources that you expect to be live at
  1352. a single time; defining the right notion of liveness is left as an
  1353. exercise for the interested reader.
  1354. The value @code{diskCacheWriteoutOnClose} (64@dmn{kB} by default) is
  1355. the amount of data that Polipo will write out when closing a disk
  1356. file. Writing out data when closing a file can avoid subsequently
  1357. reopening it, but causes unnecessary work if the instance is later
  1358. superseded.
  1359. The integers @code{diskCacheDirectoryPermissions} and
  1360. @code{diskCacheFilePermissions} are the Unix filesystem permissions
  1361. with which directories and files are created in the on-disk cache;
  1362. they default to @samp{0700} and @samp{0600} respectively.
  1363. The variable @code{maxDiskCacheEntrySize} specifies the maximum size,
  1364. in bytes, of an instance that is stored in the on-disk cache. If set
  1365. to -1 (the default), all objects are stored in the on-disk cache.
  1366. @menu
  1367. * Asynchronous writing:: Writing out data when idle.
  1368. * Purging:: Purging the on-disk cache.
  1369. * Disk format:: Format of the on-disk cache.
  1370. * Modifying the on-disk cache::
  1371. @end menu
  1372. @node Asynchronous writing, Purging, Disk cache, Disk cache
  1373. @subsection Asynchronous writing
  1374. @vindex idleTime
  1375. @vindex maxObjectsWhenIdle
  1376. @vindex maxWriteoutWhenIdle
  1377. When Polipo runs out of memory (@pxref{Limiting memory usage}), it
  1378. will start discarding instances from its memory cache. If a disk
  1379. cache has been configured, it will write out any instance that it
  1380. discards. Any memory allocation that prompted the purge must then
  1381. wait for the write to complete.
  1382. In order to avoid the latency hit that this causes, Polipo will
  1383. preemptively write out instances to the disk cache whenever it is
  1384. idle. The integer @code{idleTime} specifies the time during which
  1385. Polipo will remain idle before it starts writing out random objects to
  1386. the on-disk cache; this value defaults to 20@dmn{s}. You may want to
  1387. decrease this value for a busy cache with little memory, or increase
  1388. it if your cache is often idle and has a lot of memory.
  1389. The value @code{maxObjectsWhenIdle} (default 32) specifies the maximum
  1390. number of instances that an idle Polipo will write out without
  1391. checking whether there's any new work to do. The value
  1392. @code{maxWriteoutWhenIdle} specifies the maximum amount of data
  1393. (default 64@dmn{kB}) that Polipo will write out without checking for
  1394. new activity. Increasing these values will make asynchronous
  1395. write-out slightly faster, at the cost of possibly increasing Polipo's
  1396. latency in some rare circumstances.
  1397. @node Purging, Disk format, Asynchronous writing, Disk cache
  1398. @subsection Purging the on-disk cache
  1399. @cindex purging
  1400. @vindex diskCacheUnlinkTime
  1401. @vindex diskCacheTruncateTime
  1402. @vindex diskCacheTruncateSize
  1403. @vindex preciseExpiry
  1404. Polipo never removes a file in its on-disk cache, except when it finds
  1405. that the instance that it represents has been superseded by a newer
  1406. version. In order to keep the on-disk cache from growing without
  1407. bound, it is necessary to @dfn{purge} it once in a while. Purging the
  1408. cache typically consists in removing some files, truncating large
  1409. files (@pxref{Partial instances}) or moving them to off-line storage.
  1410. Polipo itself can be used to purge its on-disk cache; this is done by
  1411. invoking Polipo with the @option{-x} flag. This can safely be done
  1412. when Polipo is running (@pxref{Modifying the on-disk cache}).
  1413. For a purge to be effective, it is necessary to cause Polipo to
  1414. write out its in-memory cache to disk (@pxref{Stopping}).
  1415. Additionally, Polipo will not necessarily notice the changed files
  1416. until it attempts to access them; thus, you will want it to discard
  1417. its in-memory cache after performing the purge. The safe way to
  1418. perform a purge is therefore:
  1419. @example
  1420. $ kill -USR1 @var{polipo-pid}
  1421. $ sleep 1
  1422. $ polipo -x
  1423. $ kill -USR2 @var{polipo-pid}
  1424. @end example
  1425. The behaviour of the @option{-x} flag is controlled by three
  1426. configuration variables. The variable @code{diskCacheUnlinkTime}
  1427. specifies the time during which an on-disk entry should remain unused
  1428. before it is eligible for removal; it defaults to 32 days.
  1429. The variable @code{diskCacheTruncateTime} specifies the time for which
  1430. an on-disk entry should remain unused before it is eligible for
  1431. truncation; it defaults to four and a half days. The variable
  1432. @code{diskCacheTruncateSize} specifies the size at which files are
  1433. truncated after they have not been accessed for
  1434. @code{diskCacheTruncateTime}; it defaults to 1@dmn{MB}.
  1435. Usually, Polipo uses a file's modification time in order to determine
  1436. whether it is old enough to be expirable. This heuristic can be
  1437. disabled by setting the variable @code{preciseExpiry} to true.
  1438. @node Disk format, Modifying the on-disk cache, Purging, Disk cache
  1439. @subsection Format of the on-disk cache
  1440. @vindex DISK_CACHE_BODY_OFFSET
  1441. @cindex on-disk file
  1442. @cindex on-disk cache
  1443. The on-disk cache consists of a collection of files, one per instance.
  1444. The format of an on-disk resource is similar to that of an HTTP
  1445. message: it starts with an HTTP status line, followed by HTTP headers,
  1446. followed by a blank line (@samp{\r\n\r\n}). The blank line is
  1447. optionally followed by a number of binary zeroes. The body of the
  1448. instance follows.
  1449. The headers of an on-disk file have a few minor differences with HTTP
  1450. messages. Obviously, there is never a @samp{Transfer-Encoding} line.
  1451. A few additional headers are used by Polipo for its internal
  1452. bookkeeping:
  1453. @itemize
  1454. @item
  1455. @samp{X-Polipo-Location}: this is the URL of the resource stored in this
  1456. file. This is always present.
  1457. @item
  1458. @samp{X-Polipo-Date}: this is Polipo's estimation of the date at which
  1459. this instance was last validated, and is used for generating the
  1460. @samp{Age} header of HTTP messages. This is optional, and only stored
  1461. if different from the instance's date.
  1462. @item
  1463. @samp{X-Polipo-Access}: this is the date when the instance was last
  1464. accessed by Polipo, and is used for cache purging (@pxref{Purging}).
  1465. This is optional, and is absent if the instance was never accessed.
  1466. @item
  1467. @samp{X-Polipo-Body-Offset}: the presence of this line indicates that
  1468. the blank line following the headers is followed by a number of zero
  1469. bytes. Its value is an integer, which indicates the offset from the
  1470. beginning of the file at which the instance body actually starts.
  1471. This line is optional, and if absent the body starts immediately after
  1472. the blank line.
  1473. @end itemize
  1474. @node Modifying the on-disk cache, , Disk format, Disk cache
  1475. @subsection Modifying the on-disk cache
  1476. @cindex on-disk cache
  1477. It is safe to modify the on-disk cache while Polipo is running as long
  1478. as no file is ever modified in place. More precisely, the only safe
  1479. operations are to unlink (remove, delete) files in the disk cache, or
  1480. to atomically add new files to the cache (by performing an exclusive
  1481. open, or by using one of the @samp{link} or @samp{rename} system
  1482. calls). It is @emph{not} safe to truncate a file in place.
  1483. @node Memory usage, Copying, Caching, Top
  1484. @chapter Memory usage
  1485. @cindex memory
  1486. Polipo uses two distinct pools of memory, the @dfn{chunk pool} and
  1487. the @dfn{malloc pool}.
  1488. @menu
  1489. * Chunk memory:: Chunk memory.
  1490. * Malloc memory:: Malloc memory.
  1491. * Limiting memory usage:: Limiting Polipo's memory usage.
  1492. @end menu
  1493. @node Chunk memory, Malloc memory, Memory usage, Memory usage
  1494. @section Chunk memory
  1495. @vindex CHUNK_SIZE
  1496. @vindex MALLOC_CHUNKS
  1497. @cindex chunk
  1498. @cindex memory
  1499. Most of the memory used by Polipo is stored in chunks, fixed-size
  1500. blocks of memory; the size of a chunk is defined by the compile-time
  1501. constant @code{CHUNK_SIZE}, and defaults to 4096 bytes on 32-bit
  1502. platforms, 8192 on 64-bit ones. Chunks are used for storing object
  1503. data (bodies of instances) and for temporary I/O buffers. Increasing
  1504. the chunk size increases performance somewhat, but at the cost of
  1505. larger granularity of allocation and hence larger memory usage.
  1506. By default, Polipo uses a hand-crafted memory allocator based on
  1507. @code{mmap}(2) (@code{VirtualAlloc} under Windows) for allocating
  1508. chunks; while this is very slightly faster than the stock memory
  1509. allocator, its main benefit is that it limits memory fragmentation.
  1510. It is possible to disable the chunk allocator, and use
  1511. @code{malloc}(3) for all memory allocation, by defining
  1512. @code{MALLOC_CHUNKS} at compile time; this is probably only useful for
  1513. debugging.
  1514. There is one assumption made about @code{CHUNK_SIZE}:
  1515. @code{CHUNK_SIZE} multiplied by the number of bits in an
  1516. @code{unsigned long} (actually in a @code{ChunkBitmap} --- see
  1517. @file{chunk.c}) must be a multiple of the page size, which is 4096 on
  1518. most systems (8192 on Alpha, and apparently 65536 on Windows).
  1519. As all network I/O will be performed in units of one to two chunks,
  1520. @code{CHUNK_SIZE} should be at least equal to your network interface's
  1521. MTU (typically 1500 bytes). Additionally, as much I/O will be done at
  1522. @code{CHUNK_SIZE}-aligned addresses, @code{CHUNK_SIZE} should ideally
  1523. be a multiple of the page size.
  1524. In summary, 2048, 4096, 8192 and 16384 are good choices for
  1525. @code{CHUNK_SIZE}.
  1526. @node Malloc memory, Limiting memory usage, Chunk memory, Memory usage
  1527. @section Malloc allocation
  1528. @cindex malloc
  1529. @cindex memory
  1530. Polipo uses the standard @code{malloc}(3) memory allocator for
  1531. allocating small data structures (up to 100 bytes), small strings and
  1532. atoms (unique strings).
  1533. @node Limiting memory usage, , Malloc memory, Memory usage
  1534. @section Limiting Polipo's memory usage
  1535. @cindex limiting memory
  1536. @cindex memory
  1537. Polipo is designed to work well when given little memory, but will
  1538. happily scale to larger configurations. For that reason, you need to
  1539. inform it of the amount of memory it can use.
  1540. @menu
  1541. * Limiting chunk usage:: Discard objects when low on chunks.
  1542. * Limiting object usage:: Limit the number of objects.
  1543. * OS usage limits:: Don't impose OS limits.
  1544. @end menu
  1545. @node Limiting chunk usage, Limiting object usage, Limiting memory usage, Limiting memory usage
  1546. @subsection Limiting chunk usage
  1547. @vindex chunkHighMark
  1548. @vindex chunkCriticalMark
  1549. @vindex chunkLowMark
  1550. @vindex CHUNK_SIZE
  1551. @cindex memory
  1552. @cindex chunk
  1553. You can limit Polipo's usage of chunk memory by setting
  1554. @code{chunkHighMark} and @code{chunkLowMark}.
  1555. The value @code{chunkHighMark} is the absolute maximum number of bytes
  1556. of allocated chunk memory. When this value is reached, Polipo will try
  1557. to purge objects from its in-memory cache; if that fails to free memory,
  1558. Polipo will start dropping connections. This value defaults to
  1559. 24@dmn{MB} or one quarter of the machine's physical memory, whichever is
  1560. less.
  1561. When chunk usage falls back below @code{chunkLowMark}, Polipo will
  1562. stop discarding in-memory objects. The value
  1563. @code{chunkCriticalMark}, which should be somewhere between
  1564. @code{chunkLowMark} and @code{chunkHighMark}, specifies the value
  1565. above which Polipo will make heroic efforts to free memory, including
  1566. punching holes in the middle of instances, but without dropping
  1567. connections.
  1568. Unless set explicitly, both @code{chunkLowMark} and
  1569. @code{chunkCriticalMark} are computed automatically from
  1570. @code{chunkHighMark}.
  1571. @node Limiting object usage, OS usage limits, Limiting chunk usage, Limiting memory usage
  1572. @subsection Limiting object usage
  1573. @vindex objectHighMark
  1574. @vindex publicObjectLowMark
  1575. @vindex objectHashTableSize
  1576. Besides limiting chunk usage, it is possible to limit Polipo's memory
  1577. usage by bounding the number of objects it keeps in memory at any given
  1578. time. This is done with @code{objectHighMark} and
  1579. @code{publicObjectLowMark}.
  1580. The value @code{objectHighMark} is the absolute maximum number of objects
  1581. held in memory (including resources and server addresses). When the
  1582. number of in-memory objects that haven't been superseded yet falls
  1583. below @code{publicObjectLowMark}, Polipo will stop writing out objects
  1584. to disk (superseded objects are discarded as soon as possible).
  1585. On 32-bit architectures, every object costs 108 bytes of memory, plus
  1586. storage for every globally unique header that is not handled specially
  1587. (hopefully negligible), plus an overhead of one word (4 bytes) for
  1588. every chunk of data in the object.
  1589. You may also want to change @code{objectHashTableSize}. This is the
  1590. size of the hash table used for holding objects; it should be a power
  1591. of two and defaults to eight times @code{objectHighMark}. Increasing
  1592. this value will reduce the number of objects being written out to disk
  1593. due to hash table collisions. Every hash table entry costs one word.
  1594. @node OS usage limits, , Limiting object usage, Limiting memory usage
  1595. @subsection OS usage limits
  1596. @cindex usage limit
  1597. @cindex ulimit
  1598. @cindex OOM killer
  1599. Many operating systems permit limiting a process's memory usage by
  1600. setting a @dfn{usage limit}; on most Unix-like systems, this is done
  1601. with the @option{-v} option to the @command{ulimit} command.
  1602. Typically, the effect is to cause calls to the @code{malloc} and
  1603. @code{mmap} library functions to fail.
  1604. Polipo will usually react gracefully to failures to allocate
  1605. memory@footnote{There are exactly three places in the code where
  1606. Polipo will give up and exit if out of memory; all three are extremely
  1607. unlikely to happen in practice.}. Nonetheless, you should avoid using
  1608. OS limits to limit Polipo's memory usage: when it hits an OS limit,
  1609. Polipo cannot allocate the memory needed to schedule recovery from the
  1610. out-of-memory condition, and has no choice other than to drop a
  1611. connection.
  1612. Unfortunately, some operating system kernels (notably certain Linux
  1613. releases) fail to fail an allocation if no usage limit is given;
  1614. instead, they either crash when memory is exhausted, or else start
  1615. killing random processes with no advance warning@footnote{How I wish
  1616. for a @samp{SIGXMEM} signal.}. On such systems, imposing an
  1617. (unrealistically large) usage limit on Polipo is the safe thing to do.
  1618. @node Copying, Variable index, Memory usage, Top
  1619. @unnumbered Copying
  1620. You are allowed to do anything you wish with Polipo as long as you
  1621. don't deny my right to be recognised as its author and you don't blame
  1622. me if anything goes wrong.
  1623. More formally, Polipo is distributed under the following terms:
  1624. @quotation
  1625. Copyright @copyright{} 2003--2006 by Juliusz Chroboczek
  1626. Permission is hereby granted, free of charge, to any person obtaining a copy
  1627. of this software and associated documentation files (the "Software"), to deal
  1628. in the Software without restriction, including without limitation the rights
  1629. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  1630. copies of the Software, and to permit persons to whom the Software is
  1631. furnished to do so, subject to the following conditions:
  1632. The above copyright notice and this permission notice shall be included in
  1633. all copies or substantial portions of the Software.
  1634. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  1635. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  1636. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  1637. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  1638. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  1639. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  1640. THE SOFTWARE.
  1641. @end quotation
  1642. The last sentence is what happens when you allow lawyers to have it
  1643. their way with a language.
  1644. @node Variable index, Concept index, Copying, Top
  1645. @unnumbered Variable index
  1646. @printindex vr
  1647. @node Concept index, , Variable index, Top
  1648. @unnumbered Concept index
  1649. @printindex cp
  1650. @bye