12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523 |
- /*********************************************************************
- *
- * File : $Source: /cvsroot/ijbswa/current/urlmatch.c,v $
- *
- * Purpose : Declares functions to match URLs against URL
- * patterns.
- *
- * Copyright : Written by and Copyright (C) 2001-2020
- * the Privoxy team. https://www.privoxy.org/
- *
- * Based on the Internet Junkbuster originally written
- * by and Copyright (C) 1997 Anonymous Coders and
- * Junkbusters Corporation. http://www.junkbusters.com
- *
- * This program is free software; you can redistribute it
- * and/or modify it under the terms of the GNU General
- * Public License as published by the Free Software
- * Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will
- * be useful, but WITHOUT ANY WARRANTY; without even the
- * implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public
- * License for more details.
- *
- * The GNU General Public License should be included with
- * this file. If not, you can view it at
- * http://www.gnu.org/copyleft/gpl.html
- * or write to the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- *********************************************************************/
- #include "config.h"
- #ifndef _WIN32
- #include <stdio.h>
- #include <sys/types.h>
- #endif
- #include <stdlib.h>
- #include <ctype.h>
- #include <assert.h>
- #include <string.h>
- #if !defined(_WIN32)
- #include <unistd.h>
- #endif
- #include "project.h"
- #include "urlmatch.h"
- #include "ssplit.h"
- #include "miscutil.h"
- #include "errlog.h"
/*
 * Selects how compile_pattern() decorates a regular expression
 * before handing it to regcomp().
 */
enum regex_anchoring
{
   NO_ANCHORING        = 0, /* compiled as given:          "%s"       */
   LEFT_ANCHORED       = 1, /* must match at the start:    "^%s"      */
   RIGHT_ANCHORED      = 2, /* must match at the end:      "%s$"      */
   RIGHT_ANCHORED_HOST = 3  /* end, optional trailing dot: "%s\\.?$"  */
};
- static jb_err compile_vanilla_host_pattern(struct pattern_spec *url, const char *host_pattern);
- #ifdef FEATURE_PCRE_HOST_PATTERNS
- static jb_err compile_pcre_host_pattern(struct pattern_spec *url, const char *host_pattern);
- #endif
- /*********************************************************************
- *
- * Function : free_http_request
- *
- * Description : Freez a http_request structure
- *
- * Parameters :
- * 1 : http = points to a http_request structure to free
- *
- * Returns : N/A
- *
- *********************************************************************/
- void free_http_request(struct http_request *http)
- {
- assert(http);
- freez(http->cmd);
- freez(http->ocmd);
- freez(http->gpc);
- freez(http->host);
- freez(http->url);
- freez(http->hostport);
- freez(http->path);
- freez(http->version);
- freez(http->host_ip_addr_str);
- freez(http->dbuffer);
- freez(http->dvec);
- http->dcount = 0;
- }
- /*********************************************************************
- *
- * Function : init_domain_components
- *
- * Description : Splits the domain name so we can compare it
- * against wildcards. It used to be part of
- * parse_http_url, but was separated because the
- * same code is required in chat in case of
- * intercepted requests.
- *
- * Parameters :
- * 1 : http = pointer to the http structure to hold elements.
- *
- * Returns : JB_ERR_OK on success
- * JB_ERR_PARSE on malformed command/URL
- * or >100 domains deep.
- *
- *********************************************************************/
- jb_err init_domain_components(struct http_request *http)
- {
- char *vec[BUFFER_SIZE];
- size_t size;
- char *p;
- http->dbuffer = strdup_or_die(http->host);
- /* map to lower case */
- for (p = http->dbuffer; *p ; p++)
- {
- *p = (char)privoxy_tolower(*p);
- }
- /* split the domain name into components */
- http->dcount = ssplit(http->dbuffer, ".", vec, SZ(vec));
- if (http->dcount <= 0)
- {
- /*
- * Error: More than SZ(vec) components in domain
- * or: no components in domain
- */
- log_error(LOG_LEVEL_ERROR, "More than SZ(vec) components in domain or none at all.");
- return JB_ERR_PARSE;
- }
- /* save a copy of the pointers in dvec */
- size = (size_t)http->dcount * sizeof(*http->dvec);
- http->dvec = malloc_or_die(size);
- memcpy(http->dvec, vec, size);
- return JB_ERR_OK;
- }
- /*********************************************************************
- *
- * Function : url_requires_percent_encoding
- *
- * Description : Checks if an URL contains invalid characters
- * according to RFC 3986 that should be percent-encoded.
- * Does not verify whether or not the passed string
- * actually is a valid URL.
- *
- * Parameters :
- * 1 : url = URL to check
- *
- * Returns : True in case of valid URLs, false otherwise
- *
- *********************************************************************/
- int url_requires_percent_encoding(const char *url)
- {
- static const char allowed_characters[128] = {
- '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
- '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
- '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
- '\0', '\0', '\0', '!', '\0', '#', '$', '%', '&', '\'',
- '(', ')', '*', '+', ',', '-', '.', '/', '0', '1',
- '2', '3', '4', '5', '6', '7', '8', '9', ':', ';',
- '\0', '=', '\0', '?', '@', 'A', 'B', 'C', 'D', 'E',
- 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
- 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y',
- 'Z', '[', '\0', ']', '\0', '_', '\0', 'a', 'b', 'c',
- 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
- 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
- 'x', 'y', 'z', '\0', '\0', '\0', '~', '\0'
- };
- while (*url != '\0')
- {
- const unsigned int i = (unsigned char)*url++;
- if (i >= sizeof(allowed_characters) || '\0' == allowed_characters[i])
- {
- return TRUE;
- }
- }
- return FALSE;
- }
- /*********************************************************************
- *
- * Function : parse_http_url
- *
- * Description : Parse out the host and port from the URL. Find the
- * hostname & path, port (if ':'), and/or password (if '@')
- *
- * Parameters :
- * 1 : url = URL (or is it URI?) to break down
- * 2 : http = pointer to the http structure to hold elements.
- * Must be initialized with valid values (like NULLs).
- * 3 : require_protocol = Whether or not URLs without
- * protocol are acceptable.
- *
- * Returns : JB_ERR_OK on success
- * JB_ERR_PARSE on malformed command/URL
- * or >100 domains deep.
- *
- *********************************************************************/
- jb_err parse_http_url(const char *url, struct http_request *http, int require_protocol)
- {
- int host_available = 1; /* A proxy can dream. */
- /*
- * Save our initial URL
- */
- http->url = strdup_or_die(url);
- /*
- * Check for * URI. If found, we're done.
- */
- if (*http->url == '*')
- {
- http->path = strdup_or_die("*");
- http->hostport = strdup_or_die("");
- if (http->url[1] != '\0')
- {
- return JB_ERR_PARSE;
- }
- return JB_ERR_OK;
- }
- /*
- * Split URL into protocol,hostport,path.
- */
- {
- char *buf;
- char *url_noproto;
- char *url_path;
- buf = strdup_or_die(url);
- /* Find the start of the URL in our scratch space */
- url_noproto = buf;
- if (strncmpic(url_noproto, "http://", 7) == 0)
- {
- url_noproto += 7;
- }
- else if (strncmpic(url_noproto, "https://", 8) == 0)
- {
- /*
- * Should only happen when called from cgi_show_url_info().
- */
- url_noproto += 8;
- http->ssl = 1;
- }
- else if (*url_noproto == '/')
- {
- /*
- * Short request line without protocol and host.
- * Most likely because the client's request
- * was intercepted and redirected into Privoxy.
- */
- http->host = NULL;
- host_available = 0;
- }
- else if (require_protocol)
- {
- freez(buf);
- return JB_ERR_PARSE;
- }
- url_path = strchr(url_noproto, '/');
- if (url_path != NULL)
- {
- /*
- * Got a path.
- *
- * If FEATURE_HTTPS_INSPECTION isn't available, ignore the
- * path for https URLs so that we get consistent behaviour
- * if a https URL is parsed. When the URL is actually
- * retrieved, https hides the path part.
- */
- http->path = strdup_or_die(
- #ifndef FEATURE_HTTPS_INSPECTION
- http->ssl ? "/" :
- #endif
- url_path
- );
- *url_path = '\0';
- http->hostport = string_tolower(url_noproto);
- }
- else
- {
- /*
- * Repair broken HTTP requests that don't contain a path,
- * or CONNECT requests
- */
- http->path = strdup_or_die("/");
- http->hostport = string_tolower(url_noproto);
- }
- freez(buf);
- if (http->hostport == NULL)
- {
- return JB_ERR_PARSE;
- }
- }
- if (!host_available)
- {
- /* Without host, there is nothing left to do here */
- return JB_ERR_OK;
- }
- /*
- * Split hostport into user/password (ignored), host, port.
- */
- {
- char *buf;
- char *host;
- char *port;
- buf = strdup_or_die(http->hostport);
- /* check if url contains username and/or password */
- host = strchr(buf, '@');
- if (host != NULL)
- {
- /* Contains username/password, skip it and the @ sign. */
- host++;
- }
- else
- {
- /* No username or password. */
- host = buf;
- }
- /* Move after hostname before port number */
- if (*host == '[')
- {
- /* Numeric IPv6 address delimited by brackets */
- host++;
- port = strchr(host, ']');
- if (port == NULL)
- {
- /* Missing closing bracket */
- freez(buf);
- return JB_ERR_PARSE;
- }
- *port++ = '\0';
- if (*port == '\0')
- {
- port = NULL;
- }
- else if (*port != ':')
- {
- /* Garbage after closing bracket */
- freez(buf);
- return JB_ERR_PARSE;
- }
- }
- else
- {
- /* Plain non-escaped hostname */
- port = strchr(host, ':');
- }
- /* check if url contains port */
- if (port != NULL)
- {
- /* Contains port */
- char *endptr;
- long parsed_port;
- /* Terminate hostname and point to start of port string */
- *port++ = '\0';
- parsed_port = strtol(port, &endptr, 10);
- if ((parsed_port <= 0) || (parsed_port > 65535) || (*endptr != '\0'))
- {
- log_error(LOG_LEVEL_ERROR, "Invalid port in URL: %s.", url);
- freez(buf);
- return JB_ERR_PARSE;
- }
- http->port = (int)parsed_port;
- }
- else
- {
- /* No port specified. */
- http->port = (http->ssl ? 443 : 80);
- }
- http->host = strdup_or_die(host);
- freez(buf);
- }
- /* Split domain name so we can compare it against wildcards */
- return init_domain_components(http);
- }
- /*********************************************************************
- *
- * Function : unknown_method
- *
- * Description : Checks whether a method is unknown.
- *
- * Parameters :
- * 1 : method = points to a http method
- *
- * Returns : TRUE if it's unknown, FALSE otherwise.
- *
- *********************************************************************/
- static int unknown_method(const char *method)
- {
- static const char * const known_http_methods[] = {
- /* Basic HTTP request type */
- "GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "CONNECT",
- /* webDAV extensions (RFC2518) */
- "PROPFIND", "PROPPATCH", "MOVE", "COPY", "MKCOL", "LOCK", "UNLOCK",
- /*
- * Microsoft webDAV extension for Exchange 2000. See:
- * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
- * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
- */
- "BCOPY", "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH",
- /*
- * Another Microsoft webDAV extension for Exchange 2000. See:
- * http://systems.cs.colorado.edu/grunwald/MobileComputing/Papers/draft-cohen-gena-p-base-00.txt
- * http://lists.w3.org/Archives/Public/w3c-dist-auth/2002JanMar/0001.html
- * http://msdn.microsoft.com/library/en-us/wss/wss/_webdav_methods.asp
- */
- "SUBSCRIBE", "UNSUBSCRIBE", "NOTIFY", "POLL",
- /*
- * Yet another WebDAV extension, this time for
- * Web Distributed Authoring and Versioning (RFC3253)
- */
- "VERSION-CONTROL", "REPORT", "CHECKOUT", "CHECKIN", "UNCHECKOUT",
- "MKWORKSPACE", "UPDATE", "LABEL", "MERGE", "BASELINE-CONTROL", "MKACTIVITY",
- /*
- * The PATCH method is defined by RFC5789, the format of the
- * actual patch in the body depends on the application, but from
- * Privoxy's point of view it doesn't matter.
- */
- "PATCH",
- };
- int i;
- for (i = 0; i < SZ(known_http_methods); i++)
- {
- if (0 == strcmpic(method, known_http_methods[i]))
- {
- return FALSE;
- }
- }
- return TRUE;
- }
- /*********************************************************************
- *
- * Function : normalize_http_version
- *
- * Description : Take a supported HTTP version string and remove
- * leading zeroes etc., reject unsupported versions.
- *
- * This is an explicit RFC 2616 (3.1) MUST and
- * RFC 7230 mandates that intermediaries send their
- * own HTTP-version in forwarded messages.
- *
- * Parameters :
- * 1 : http_version = HTTP version string
- *
- * Returns : JB_ERR_OK on success
- * JB_ERR_PARSE if the HTTP version is unsupported
- *
- *********************************************************************/
- static jb_err normalize_http_version(char *http_version)
- {
- unsigned int major_version;
- unsigned int minor_version;
- if (2 != sscanf(http_version, "HTTP/%u.%u", &major_version, &minor_version))
- {
- log_error(LOG_LEVEL_ERROR, "Unsupported HTTP version: %s", http_version);
- return JB_ERR_PARSE;
- }
- if (major_version != 1 || (minor_version != 0 && minor_version != 1))
- {
- log_error(LOG_LEVEL_ERROR, "The only supported HTTP "
- "versions are 1.0 and 1.1. This rules out: %s", http_version);
- return JB_ERR_PARSE;
- }
- assert(strlen(http_version) >= 8);
- snprintf(http_version, 9, "HTTP/%u.%u", major_version, minor_version);
- return JB_ERR_OK;
- }
- /*********************************************************************
- *
- * Function : parse_http_request
- *
- * Description : Parse out the host and port from the URL. Find the
- * hostname & path, port (if ':'), and/or password (if '@')
- *
- * Parameters :
- * 1 : req = HTTP request line to break down
- * 2 : http = pointer to the http structure to hold elements
- *
- * Returns : JB_ERR_OK on success
- * JB_ERR_CGI_PARAMS on malformed command/URL
- * or >100 domains deep.
- *
- *********************************************************************/
- jb_err parse_http_request(const char *req, struct http_request *http)
- {
- char *buf;
- char *v[3];
- int n;
- jb_err err;
- memset(http, '\0', sizeof(*http));
- buf = strdup_or_die(req);
- n = ssplit(buf, " \r\n", v, SZ(v));
- if (n != 3)
- {
- freez(buf);
- return JB_ERR_PARSE;
- }
- /*
- * Fail in case of unknown methods
- * which we might not handle correctly.
- *
- * XXX: There should be a config option
- * to forward requests with unknown methods
- * anyway. Most of them don't need special
- * steps.
- */
- if (unknown_method(v[0]))
- {
- log_error(LOG_LEVEL_ERROR, "Unknown HTTP method detected: %s", v[0]);
- freez(buf);
- return JB_ERR_PARSE;
- }
- if (JB_ERR_OK != normalize_http_version(v[2]))
- {
- freez(buf);
- return JB_ERR_PARSE;
- }
- http->ssl = !strcmpic(v[0], "CONNECT");
- err = parse_http_url(v[1], http, !http->ssl);
- if (err)
- {
- freez(buf);
- return err;
- }
- /*
- * Copy the details into the structure
- */
- http->cmd = strdup_or_die(req);
- http->gpc = strdup_or_die(v[0]);
- http->version = strdup_or_die(v[2]);
- http->ocmd = strdup_or_die(http->cmd);
- freez(buf);
- return JB_ERR_OK;
- }
- /*********************************************************************
- *
- * Function : compile_pattern
- *
- * Description : Compiles a host, domain or TAG pattern.
- *
- * Parameters :
- * 1 : pattern = The pattern to compile.
- * 2 : anchoring = How the regex should be modified
- * before compilation. Can be either
- * one of NO_ANCHORING, LEFT_ANCHORED,
- * RIGHT_ANCHORED or RIGHT_ANCHORED_HOST.
- * 3 : url = In case of failures, the spec member is
- * logged and the structure freed.
- * 4 : regex = Where the compiled regex should be stored.
- *
- * Returns : JB_ERR_OK - Success
- * JB_ERR_PARSE - Cannot parse regex
- *
- *********************************************************************/
- static jb_err compile_pattern(const char *pattern, enum regex_anchoring anchoring,
- struct pattern_spec *url, regex_t **regex)
- {
- int errcode;
- const char *fmt = NULL;
- char *rebuf;
- size_t rebuf_size;
- assert(pattern);
- if (pattern[0] == '\0')
- {
- *regex = NULL;
- return JB_ERR_OK;
- }
- switch (anchoring)
- {
- case NO_ANCHORING:
- fmt = "%s";
- break;
- case RIGHT_ANCHORED:
- fmt = "%s$";
- break;
- case RIGHT_ANCHORED_HOST:
- fmt = "%s\\.?$";
- break;
- case LEFT_ANCHORED:
- fmt = "^%s";
- break;
- default:
- log_error(LOG_LEVEL_FATAL,
- "Invalid anchoring in compile_pattern %d", anchoring);
- }
- rebuf_size = strlen(pattern) + strlen(fmt);
- rebuf = malloc_or_die(rebuf_size);
- *regex = zalloc_or_die(sizeof(**regex));
- snprintf(rebuf, rebuf_size, fmt, pattern);
- errcode = regcomp(*regex, rebuf, (REG_EXTENDED|REG_NOSUB|REG_ICASE));
- if (errcode)
- {
- size_t errlen = regerror(errcode, *regex, rebuf, rebuf_size);
- if (errlen > (rebuf_size - (size_t)1))
- {
- errlen = rebuf_size - (size_t)1;
- }
- rebuf[errlen] = '\0';
- log_error(LOG_LEVEL_ERROR, "error compiling %s from %s: %s",
- pattern, url->spec, rebuf);
- free_pattern_spec(url);
- freez(rebuf);
- return JB_ERR_PARSE;
- }
- freez(rebuf);
- return JB_ERR_OK;
- }
- /*********************************************************************
- *
- * Function : compile_url_pattern
- *
- * Description : Compiles the three parts of an URL pattern.
- *
- * Parameters :
- * 1 : url = Target pattern_spec to be filled in.
- * 2 : buf = The url pattern to compile. Will be messed up.
- *
- * Returns : JB_ERR_OK - Success
- * JB_ERR_MEMORY - Out of memory
- * JB_ERR_PARSE - Cannot parse regex
- *
- *********************************************************************/
- static jb_err compile_url_pattern(struct pattern_spec *url, char *buf)
- {
- char *p;
- #ifdef FEATURE_PCRE_HOST_PATTERNS
- const size_t prefix_length = 18;
- if (strncmpic(buf, "PCRE-HOST-PATTERN:", prefix_length) == 0)
- {
- url->pattern.url_spec.host_regex_type = PCRE_HOST_PATTERN;
- /* Overwrite the "PCRE-HOST-PATTERN:" prefix */
- memmove(buf, buf+prefix_length, strlen(buf+prefix_length)+1);
- }
- else
- {
- url->pattern.url_spec.host_regex_type = VANILLA_HOST_PATTERN;
- }
- #endif
- p = strchr(buf, '/');
- if (NULL != p)
- {
- /*
- * Only compile the regex if it consists of more than
- * a single slash, otherwise it wouldn't affect the result.
- */
- if (p[1] != '\0')
- {
- /*
- * XXX: does it make sense to compile the slash at the beginning?
- */
- jb_err err = compile_pattern(p, LEFT_ANCHORED, url, &url->pattern.url_spec.preg);
- if (JB_ERR_OK != err)
- {
- return err;
- }
- }
- *p = '\0';
- }
- /*
- * IPv6 numeric hostnames can contain colons, thus we need
- * to delimit the hostname before the real port separator.
- * As brackets are already used in the hostname pattern,
- * we use angle brackets ('<', '>') instead.
- */
- if ((buf[0] == '<') && (NULL != (p = strchr(buf + 1, '>'))))
- {
- *p++ = '\0';
- buf++;
- if (*p == '\0')
- {
- /* IPv6 address without port number */
- p = NULL;
- }
- else if (*p != ':')
- {
- /* Garbage after address delimiter */
- return JB_ERR_PARSE;
- }
- }
- else
- {
- p = strchr(buf, ':');
- }
- if (NULL != p)
- {
- *p++ = '\0';
- url->pattern.url_spec.port_list = strdup_or_die(p);
- }
- else
- {
- url->pattern.url_spec.port_list = NULL;
- }
- if (buf[0] != '\0')
- {
- #ifdef FEATURE_PCRE_HOST_PATTERNS
- if (url->pattern.url_spec.host_regex_type == PCRE_HOST_PATTERN)
- {
- return compile_pcre_host_pattern(url, buf);
- }
- else
- #endif
- {
- return compile_vanilla_host_pattern(url, buf);
- }
- }
- return JB_ERR_OK;
- }
#ifdef FEATURE_PCRE_HOST_PATTERNS
/*********************************************************************
 *
 * Function    :  compile_pcre_host_pattern
 *
 * Description :  Compiles a host pattern given as regular
 *                expression. The work is delegated to
 *                compile_pattern() with RIGHT_ANCHORED_HOST
 *                anchoring, the result ends up in the host_regex
 *                member of the url pattern.
 *
 * Parameters  :
 *          1  :  url = Target pattern_spec to be filled in.
 *          2  :  host_pattern = Host pattern to compile.
 *
 * Returns     :  Whatever compile_pattern() returns:
 *                JB_ERR_OK - Success
 *                JB_ERR_PARSE - Cannot parse regex
 *
 *********************************************************************/
static jb_err compile_pcre_host_pattern(struct pattern_spec *url, const char *host_pattern)
{
   regex_t **compiled_host = &url->pattern.url_spec.host_regex;

   return compile_pattern(host_pattern, RIGHT_ANCHORED_HOST, url, compiled_host);
}
#endif /* def FEATURE_PCRE_HOST_PATTERNS */
- /*********************************************************************
- *
- * Function : compile_vanilla_host_pattern
- *
- * Description : Parses and "compiles" an old-school host pattern.
- *
- * Parameters :
- * 1 : url = Target pattern_spec to be filled in.
- * 2 : host_pattern = Host pattern to parse.
- *
- * Returns : JB_ERR_OK - Success
- * JB_ERR_PARSE - Cannot parse regex
- *
- *********************************************************************/
- static jb_err compile_vanilla_host_pattern(struct pattern_spec *url, const char *host_pattern)
- {
- char *v[150];
- size_t size;
- char *p;
- /*
- * Parse domain part
- */
- if (host_pattern[strlen(host_pattern) - 1] == '.')
- {
- url->pattern.url_spec.unanchored |= ANCHOR_RIGHT;
- }
- if (host_pattern[0] == '.')
- {
- url->pattern.url_spec.unanchored |= ANCHOR_LEFT;
- }
- /*
- * Split domain into components
- */
- url->pattern.url_spec.dbuffer = strdup_or_die(host_pattern);
- /*
- * Map to lower case
- */
- for (p = url->pattern.url_spec.dbuffer; *p ; p++)
- {
- *p = (char)privoxy_tolower(*p);
- }
- /*
- * Split the domain name into components
- */
- url->pattern.url_spec.dcount = ssplit(url->pattern.url_spec.dbuffer, ".", v, SZ(v));
- if (url->pattern.url_spec.dcount < 0)
- {
- free_pattern_spec(url);
- return JB_ERR_PARSE;
- }
- else if (url->pattern.url_spec.dcount != 0)
- {
- /*
- * Save a copy of the pointers in dvec
- */
- size = (size_t)url->pattern.url_spec.dcount * sizeof(*url->pattern.url_spec.dvec);
- url->pattern.url_spec.dvec = malloc_or_die(size);
- memcpy(url->pattern.url_spec.dvec, v, size);
- }
- /*
- * else dcount == 0 in which case we needn't do anything,
- * since dvec will never be accessed and the pattern will
- * match all domains.
- */
- return JB_ERR_OK;
- }
/*********************************************************************
 *
 * Function    :  simplematch
 *
 * Description :  String matching, with a (greedy) '*' wildcard that
 *                stands for zero or more arbitrary characters, a
 *                '?' that stands for exactly one arbitrary
 *                character, and character classes in [], which
 *                take both enumerations and ranges.
 *
 * Parameters  :
 *          1  :  pattern = pattern for matching
 *          2  :  text    = text to be matched
 *
 * Returns     :  0 if match, else nonzero
 *
 *********************************************************************/
static int simplematch(const char *pattern, const char *text)
{
   const unsigned char *pat = (const unsigned char *)pattern;
   const unsigned char *txt = (const unsigned char *)text;
   const unsigned char *fallback = pat;
   int wildcard = 0;

   unsigned char lastchar = 'a';
   unsigned i;
   /*
    * One bit per possible character value. Zero-initialized because
    * the map is consulted whenever the pattern is positioned at a ']',
    * which can happen without a preceding '[' that would reset it.
    */
   unsigned char charmap[32] = { 0 };

   while (*txt)
   {

      /* EOF pattern but !EOF text? */
      if (*pat == '\0')
      {
         if (wildcard)
         {
            /* Retry with the part of the pattern after the last '*' */
            pat = fallback;
         }
         else
         {
            return 1;
         }
      }

      /* '*' in the pattern?  */
      if (*pat == '*')
      {

         /* The pattern ends afterwards? Speed up the return. */
         if (*++pat == '\0')
         {
            return 0;
         }

         /* Else, set wildcard mode and remember position after '*' */
         wildcard = 1;
         fallback = pat;
      }

      /* Character range specification? */
      if (*pat == '[')
      {
         /* Start with an empty class, pat ends up on the closing ']' */
         memset(charmap, '\0', sizeof(charmap));

         while (*++pat != ']')
         {
            if (!*pat)
            {
               /* Unterminated class */
               return 1;
            }
            else if (*pat == '-')
            {
               /* A range needs an upper bound */
               if ((*++pat == ']') || *pat == '\0')
               {
                  return(1);
               }
               /* Mark everything from the last single char up to the bound */
               for (i = lastchar; i <= *pat; i++)
               {
                  charmap[i / 8] |= (unsigned char)(1 << (i % 8));
               }
            }
            else
            {
               charmap[*pat / 8] |= (unsigned char)(1 << (*pat % 8));
               lastchar = *pat;
            }
         }
      } /* -END- if Character range specification */


      /*
       * Char match, or char range match?
       */
      if ((*pat == *txt)
       || (*pat == '?')
       || ((*pat == ']') && (charmap[*txt / 8] & (1 << (*txt % 8)))))
      {
         /*
          * Success: Go ahead
          */
         pat++;
      }
      else if (!wildcard)
      {
         /*
          * No match && no wildcard: No luck
          */
         return 1;
      }
      else if (pat != fallback)
      {
         /*
          * Increment text pointer if in char range matching
          */
         if (*pat == ']')
         {
            txt++;
         }
         /*
          * Wildcard mode && nonmatch beyond fallback: Rewind pattern
          */
         pat = fallback;
         /*
          * Restart matching from current text pointer
          */
         continue;
      }
      txt++;
   }

   /* Cut off extra '*'s */
   if (*pat == '*') pat++;

   /* If this is the pattern's end, fine! */
   return(*pat);

}
/*********************************************************************
 *
 * Function    :  simple_domaincmp
 *
 * Description :  Compares two equally long vectors of domain
 *                components pairwise with simplematch(). The
 *                comparison is both left- and right-anchored and
 *                stops at the first pair that doesn't match.
 *                This is only used by domain_match.
 *
 * Parameters  :
 *          1  :  pv = array of patterns to compare
 *          2  :  fv = array of domain components to compare
 *          3  :  len = length of the arrays (both arrays are the
 *                      same length - if they weren't, it couldn't
 *                      possibly be a match).
 *
 * Returns     :  0 => domains are equivalent, else no match.
 *
 *********************************************************************/
static int simple_domaincmp(char **pv, char **fv, int len)
{
   int idx = 0;

   while (idx < len)
   {
      if (0 != simplematch(pv[idx], fv[idx]))
      {
         /* One mismatching component is enough to fail. */
         return 1;
      }
      idx++;
   }

   return 0;
}
- /*********************************************************************
- *
- * Function : domain_match
- *
- * Description : Domain-wise Compare fqdn's. Governed by the bimap in
- * p.pattern->unachored, the comparison is un-, left-,
- * right-anchored, or both.
- * The individual domain names are compared with
- * simplematch().
- *
- * Parameters :
- * 1 : p = a domain that may contain a '*' as a wildcard.
- * 2 : fqdn = domain name against which the patterns are compared.
- *
- * Returns : 0 => domains are equivalent, else no match.
- *
- *********************************************************************/
- static int domain_match(const struct pattern_spec *p, const struct http_request *fqdn)
- {
- char **pv, **fv; /* vectors */
- int plen, flen;
- int unanchored = p->pattern.url_spec.unanchored & (ANCHOR_RIGHT | ANCHOR_LEFT);
- plen = p->pattern.url_spec.dcount;
- flen = fqdn->dcount;
- if (flen < plen)
- {
- /* fqdn is too short to match this pattern */
- return 1;
- }
- pv = p->pattern.url_spec.dvec;
- fv = fqdn->dvec;
- if (unanchored == ANCHOR_LEFT)
- {
- /*
- * Right anchored.
- *
- * Convert this into a fully anchored pattern with
- * the fqdn and pattern the same length
- */
- fv += (flen - plen); /* flen - plen >= 0 due to check above */
- return simple_domaincmp(pv, fv, plen);
- }
- else if (unanchored == 0)
- {
- /* Fully anchored, check length */
- if (flen != plen)
- {
- return 1;
- }
- return simple_domaincmp(pv, fv, plen);
- }
- else if (unanchored == ANCHOR_RIGHT)
- {
- /* Left anchored, ignore all extra in fqdn */
- return simple_domaincmp(pv, fv, plen);
- }
- else
- {
- /* Unanchored */
- int n;
- int maxn = flen - plen;
- for (n = 0; n <= maxn; n++)
- {
- if (!simple_domaincmp(pv, fv, plen))
- {
- return 0;
- }
- /*
- * Doesn't match from start of fqdn
- * Try skipping first part of fqdn
- */
- fv++;
- }
- return 1;
- }
- }
- /*********************************************************************
- *
- * Function : create_pattern_spec
- *
- * Description : Creates a "pattern_spec" structure from a string.
- * When finished, free with free_pattern_spec().
- *
- * Parameters :
- * 1 : pattern = Target pattern_spec to be filled in.
- * Will be zeroed before use.
- * 2 : buf = Source pattern, null terminated. NOTE: The
- * contents of this buffer are destroyed by this
- * function. If this function succeeds, the
- * buffer is copied to pattern->spec. If this
- * function fails, the contents of the buffer
- * are lost forever.
- *
- * Returns : JB_ERR_OK - Success
- * JB_ERR_PARSE - Cannot parse regex (Detailed message
- * written to system log)
- *
- *********************************************************************/
- jb_err create_pattern_spec(struct pattern_spec *pattern, char *buf)
- {
- static const struct
- {
- /** The tag pattern prefix to match */
- const char *prefix;
- /** The length of the prefix to match */
- const size_t prefix_length;
- /** The pattern flag */
- const unsigned flag;
- } tag_pattern[] = {
- { "TAG:", 4, PATTERN_SPEC_TAG_PATTERN},
- #ifdef FEATURE_CLIENT_TAGS
- { "CLIENT-TAG:", 11, PATTERN_SPEC_CLIENT_TAG_PATTERN},
- #endif
- { "NO-REQUEST-TAG:", 15, PATTERN_SPEC_NO_REQUEST_TAG_PATTERN},
- { "NO-RESPONSE-TAG:", 16, PATTERN_SPEC_NO_RESPONSE_TAG_PATTERN}
- };
- int i;
- assert(pattern);
- assert(buf);
- memset(pattern, '\0', sizeof(*pattern));
- /* Remember the original specification for the CGI pages. */
- pattern->spec = strdup_or_die(buf);
- /* Check if it's a tag pattern */
- for (i = 0; i < SZ(tag_pattern); i++)
- {
- if (0 == strncmpic(pattern->spec, tag_pattern[i].prefix, tag_pattern[i].prefix_length))
- {
- /* The regex starts after the prefix */
- const char *tag_regex = buf + tag_pattern[i].prefix_length;
- pattern->flags |= tag_pattern[i].flag;
- return compile_pattern(tag_regex, NO_ANCHORING, pattern,
- &pattern->pattern.tag_regex);
- }
- }
- /* If it isn't a tag pattern it must be an URL pattern. */
- pattern->flags |= PATTERN_SPEC_URL_PATTERN;
- return compile_url_pattern(pattern, buf);
- }
- /*********************************************************************
- *
- * Function : free_pattern_spec
- *
- * Description : Called from the "unloaders". Freez the pattern
- * structure elements.
- *
- * Parameters :
- * 1 : pattern = pointer to a pattern_spec structure.
- *
- * Returns : N/A
- *
- *********************************************************************/
- void free_pattern_spec(struct pattern_spec *pattern)
- {
- if (pattern == NULL) return;
- freez(pattern->spec);
- #ifdef FEATURE_PCRE_HOST_PATTERNS
- if (pattern->pattern.url_spec.host_regex)
- {
- regfree(pattern->pattern.url_spec.host_regex);
- freez(pattern->pattern.url_spec.host_regex);
- }
- #endif /* def FEATURE_PCRE_HOST_PATTERNS */
- freez(pattern->pattern.url_spec.dbuffer);
- freez(pattern->pattern.url_spec.dvec);
- pattern->pattern.url_spec.dcount = 0;
- freez(pattern->pattern.url_spec.port_list);
- if (pattern->pattern.url_spec.preg)
- {
- regfree(pattern->pattern.url_spec.preg);
- freez(pattern->pattern.url_spec.preg);
- }
- if (pattern->pattern.tag_regex)
- {
- regfree(pattern->pattern.tag_regex);
- freez(pattern->pattern.tag_regex);
- }
- }
/*********************************************************************
 *
 * Function    :  port_matches
 *
 * Description :  Checks a port against the port list of a pattern.
 *                A missing port list accepts every port.
 *
 * Parameters  :
 *          1  :  port = The port to check.
 *          2  :  port_list = The list of ports to compare with,
 *                            or NULL if there is no restriction.
 *
 * Returns     :  TRUE for yes, FALSE otherwise.
 *
 *********************************************************************/
static int port_matches(const int port, const char *port_list)
{
   if (port_list == NULL)
   {
      /* Nothing to compare with, so the port is acceptable. */
      return 1;
   }

   return (match_portlist(port_list, port) != 0);
}
- /*********************************************************************
- *
- * Function : host_matches
- *
- * Description : Compares a host against a host pattern.
- *
- * Parameters :
- * 1 : url = The URL to match
- * 2 : pattern = The URL pattern
- *
- * Returns : TRUE for yes, FALSE otherwise.
- *
- *********************************************************************/
- static int host_matches(const struct http_request *http,
- const struct pattern_spec *pattern)
- {
- assert(http->host != NULL);
- #ifdef FEATURE_PCRE_HOST_PATTERNS
- if (pattern->pattern.url_spec.host_regex_type == PCRE_HOST_PATTERN)
- {
- return ((NULL == pattern->pattern.url_spec.host_regex)
- || (0 == regexec(pattern->pattern.url_spec.host_regex,
- http->host, 0, NULL, 0)));
- }
- #endif
- return ((NULL == pattern->pattern.url_spec.dbuffer) || (0 == domain_match(pattern, http)));
- }
- /*********************************************************************
- *
- * Function : path_matches
- *
- * Description : Compares a path against a path pattern.
- *
- * Parameters :
- * 1 : path = The path to match
- * 2 : pattern = The URL pattern
- *
- * Returns : TRUE for yes, FALSE otherwise.
- *
- *********************************************************************/
- static int path_matches(const char *path, const struct pattern_spec *pattern)
- {
- return ((NULL == pattern->pattern.url_spec.preg)
- || (0 == regexec(pattern->pattern.url_spec.preg, path, 0, NULL, 0)));
- }
- /*********************************************************************
- *
- * Function : url_match
- *
- * Description : Compare a URL against a URL pattern.
- *
- * Parameters :
- * 1 : pattern = a URL pattern
- * 2 : url = URL to match
- *
- * Returns : Nonzero if the URL matches the pattern, else 0.
- *
- *********************************************************************/
- int url_match(const struct pattern_spec *pattern,
- const struct http_request *http)
- {
- if (!(pattern->flags & PATTERN_SPEC_URL_PATTERN))
- {
- /* It's not an URL pattern and thus shouldn't be matched against URLs */
- return 0;
- }
- return (port_matches(http->port, pattern->pattern.url_spec.port_list)
- && host_matches(http, pattern) && path_matches(http->path, pattern));
- }
/*********************************************************************
 *
 * Function    :  match_portlist
 *
 * Description :  Check if a given number is covered by a comma
 *                separated list of numbers and ranges (a,b-c,d,..)
 *
 *                The list is scanned in place; it is neither copied
 *                nor modified. An item without a dash matches when
 *                it equals the port. An item with a dash is a range
 *                "min-max": a missing min counts as 0, a missing
 *                (or zero) max counts as 65535.
 *
 * Parameters  :
 *          1  :  portlist = String with list. A NULL list matches
 *                           nothing.
 *          2  :  port = port to check
 *
 * Returns     :  0 => no match
 *                1 => match
 *
 *********************************************************************/
int match_portlist(const char *portlist, int port)
{
   const char *item = portlist;

   while (item != NULL)
   {
      /* The current item ends at the next comma or at the end of the list. */
      const char *comma = strchr(item, ',');
      const size_t item_len = (comma != NULL) ? (size_t)(comma - item) : strlen(item);
      const char *dash = memchr(item, '-', item_len);

      if (dash == NULL)
      {
         /*
          * No dash, check for equality. strtol() cannot run into
          * the next item because it stops at the comma.
          */
         if (port == (int)strtol(item, NULL, 10))
         {
            return 1;
         }
      }
      else
      {
         /*
          * This is a range, so check if between min and max,
          * or, if max was omitted, between min and 65K
          */
         char *parsed_end;
         int low = (int)strtol(item, &parsed_end, 10);
         int high = (int)strtol(dash + 1, NULL, 10);

         if (parsed_end > dash)
         {
            /*
             * Nothing numeric in front of the dash: strtol() took
             * the range separator for a minus sign and parsed the
             * upper bound instead. An empty lower bound is 0.
             */
            low = 0;
         }
         if (high == 0)
         {
            high = 65535;
         }
         if ((port >= low) && (port <= high))
         {
            return 1;
         }
      }

      /* Jump to next item */
      item = (comma != NULL) ? comma + 1 : NULL;
   }

   return 0;
}
- /*********************************************************************
- *
- * Function : parse_forwarder_address
- *
- * Description : Parse out the username, password, host and port from
- * a forwarder address.
- *
- * Parameters :
- * 1 : address = The forwarder address to parse.
- * 2 : hostname = Used to return the hostname. NULL on error.
- * 3 : port = Used to return the port. Untouched if no port
- * is specified.
- * 4 : username = Used to return the username if any.
- * 5 : password = Used to return the password if any.
- *
- * Returns : JB_ERR_OK on success
- * JB_ERR_MEMORY on out of memory
- * JB_ERR_PARSE on malformed address.
- *
- *********************************************************************/
- jb_err parse_forwarder_address(char *address, char **hostname, int *port,
- char **username, char **password)
- {
- char *p;
- char *tmp;
- tmp = *hostname = strdup_or_die(address);
- /* Parse username and password */
- if (username && password && (NULL != (p = strchr(*hostname, '@'))))
- {
- *p++ = '\0';
- *username = strdup_or_die(*hostname);
- *hostname = strdup_or_die(p);
- if (NULL != (p = strchr(*username, ':')))
- {
- *p++ = '\0';
- *password = strdup_or_die(p);
- }
- freez(tmp);
- }
- /* Parse hostname and port */
- p = *hostname;
- if ((*p == '[') && (NULL == strchr(p, ']')))
- {
- /* XXX: Should do some more validity checks here. */
- return JB_ERR_PARSE;
- }
- if ((**hostname == '[') && (NULL != (p = strchr(*hostname, ']'))))
- {
- *p++ = '\0';
- memmove(*hostname, (*hostname + 1), (size_t)(p - *hostname));
- if (*p == ':')
- {
- *port = (int)strtol(++p, NULL, 0);
- }
- }
- else if (NULL != (p = strchr(*hostname, ':')))
- {
- *p++ = '\0';
- *port = (int)strtol(p, NULL, 0);
- }
- return JB_ERR_OK;
- }
- /*
- Local Variables:
- tab-width: 3
- end:
- */
|