# HG changeset patch # User Mahlon E. Smith # Date 1320882877 28800 # Node ID d4ce82194b640f863a817fdd085f06b19b232005 # Parent bd746609ba460dbebca68b9267c343dc22afe778 - 1st pass at documentation with the README. - Attempt to detect trivial redirect loops. - Loosen the request parser enough to just accept URLs, so you can easily test rewrite behavior by just running volta and pasting URLs. - Alter the whitelist "negative" rule syntax to more closely match normal rules. Rename references to negative rules from "whitelist" rules, to avoid confusion. diff -r bd746609ba46 -r d4ce82194b64 Makefile --- a/Makefile Mon Nov 07 10:43:09 2011 -0800 +++ b/Makefile Wed Nov 09 15:54:37 2011 -0800 @@ -17,7 +17,7 @@ debug: CFLAGS += $(CFLAGS_DEBUG) debug: LIBS += -lprofiler else -volta: CFLAGS += -L/opt/local/lib -I/opt/local/include +volta: CFLAGS += -L/opt/local/lib -I/opt/local/include -L/usr/local/lib -I/usr/local/include debug: CFLAGS += $(CFLAGS_DEBUG)\ $(shell pkg-config --cflags-only-I --libs-only-L $(DEPS_DEBUG)) debug: LIBS += $(shell pkg-config --libs-only-l $(DEPS_DEBUG)) diff -r bd746609ba46 -r d4ce82194b64 README --- a/README Mon Nov 07 10:43:09 2011 -0800 +++ b/README Wed Nov 09 15:54:37 2011 -0800 @@ -3,14 +3,173 @@ ===== What is volta? - - high performance / low resource redirector +-------------- + +Volta is a high performance, low resource URI rewriter for use with the +Squid caching proxy server (http://www.squid-cache.org/.) With it, you +can dynamically alter URI requests that pass through Squid based on +various criteria. + +It uses a state machine to parse URIs and rules, and a constant database +to store and access those rules. + + +Why is it called "volta"? +------------------------- -Why "volta"? - - latin term, turn +It's a type of old Italian music written in triple-time. Quick! + + +How fast is it? +--------------- + +On a 2Ghz Xeon 5130, it can process a million squid requests against +10000 rules in less than 8 seconds, using about 800k of ram. 
On a +1.8Ghz Intel E4300, it can do it in 3 seconds. + +Your mileage may vary, but for almost all intents and purposes the answer +is "definitely fast enough." + Configuring squid +----------------- + +You must enable url rewriting from within the squid.conf file. + + url_rewrite_program /usr/local/bin/volta + +... and that's it. You may need some additional customization, like where +the volta database is stored on disk: + + url_rewrite_program /usr/local/bin/volta -f /var/db/squid/volta.db + +Busy servers: + +Make sure url_rewrite_concurrency is disabled; volta is single threaded. +Instead, just add more volta children. They are lightweight, so load em +up. A proxy at my $DAYJOB is in use by around 450 people, and we get by +nicely with 10 volta children. + + url_rewrite_concurrency 0 + url_rewrite_children 10 + Using volta +----------- -How to +See the INSTALL file for instructions on how to compile volta. + +Volta reads its rewrite rules from a local database. You can create the +rules in a text editor, then convert them to the database like so: + + % volta -c rules.txt + +You'll be left with a "volta.db" file in the current directory. Put it +wherever you please, and use the -f flag to point to it. + + +Rule file syntax +---------------- + +Volta's rule syntax is designed to be easy to parse by humans and +machines. Blank lines are skipped, as is any line that starts with the +'#' character, so you can keep the ascii version of your rules well +documented and in version control. + +When compiling the ruleset into the database format, volta detects +malformed rules and stops if there are any problems, leaving your +original database intact. You can change the ruleset at any time while +volta is running, and the new rules will take effect within about 10 +seconds. No need to restart squid! + +There are two types of rules -- positive matches, and negative matches. +Positive matches cause the rewrite, negative matches allow the original +request to pass. 
Rule order is consistent, top-down, first match wins. +Fields are separated by any amount of whitespace (spaces or tabs.) + + +### Positive matches: + + First field: the hostname to match. + + You can use an exact hostname (www.example.com), or the top level + domain (tld) if you want to match everything under a specific host + (example.com.) You can also use a single '*' to match every request, + though this essentially bypasses a lot of what makes volta quick; it + is included for completeness. You may have an unlimited number of + rules per hostname. Hostnames are compared without case sensitivity. + + + Second field: the path to match. + + This can be an exact match ('/path/to/something.html'), a regular + expression ('\.(jpg|gif|png)$'), or a single '*' to match any + path. Regular expressions are matched without case sensitivity. There + is currently no support for capturing, though this may be added in + a future release. + + + Third field: The redirect code and url to rewrite to. + Any pieces of a url that are omitted are automatically replaced + with the original request's element -- the exception is a hostname, + which is required. If you omit a redirect code, the URL rewrite is + transparent to the client. You can attach a 301: or 302: prefix to + send a permanent or temporary redirect code, respectively, instead. + + +### Negative matches: + + First field: the hostname to match. + + See above -- all the same rules apply. + + + Second field: the path to match. + + See above -- all the same rules apply. + + + Third field: the 'negative' marker. + + This is simply the '-' character, which signals to volta that this is + a negative matching rule. + + +You can easily test your rules by running volta on the command line, and +pasting URLs into it. Boost the debug level (-d4) if you're having any issues. 
+ + +Examples +-------- + +Rewrite all requests to Google to the SSL version: + + google.com * 302:https://www.google.com + + This will redirect the request "http://www.google.com/search?q=test" to + "https://www.google.com/search?q=test". + + +Transparently alter all uploaded images on imgur to be my face: :) + + i.imgur.com \.(gif|png|jpg)$ http://www.martini.nu/images/mahlon.jpg + + +Expand a local, non qualified hostname to a FQDN (useful alongside the +'dns_defnames' squid setting to enforce browser proxy behaviors): + + local-example * local-example.company.com + + +Cause all blog content except for 2011 posts to permanently redirect to +an archival page: + + martini.nu /blog/2011 - + martini.nu /blog 301:martini.nu/content-archived.html + + +Turn off rewriting for specific network segment or IP address: + + Squid has this ability built in -- see the 'url_rewrite_access' setting. + diff -r bd746609ba46 -r d4ce82194b64 db.c --- a/db.c Mon Nov 07 10:43:09 2011 -0800 +++ b/db.c Wed Nov 09 15:54:37 2011 -0800 @@ -80,13 +80,11 @@ unsigned short int db_create_new( char *txt ) { + FILE *txt_f = NULL; + parsed *rule = NULL; + char buf[ LINE_BUFSIZE*10 ], tmpfile[25]; + int tmp_fd, linenum = 0, parsed = 0, error = 0; struct cdb_make cdbm; - - char buf[ LINE_BUFSIZE*10 ]; - char tmpfile[25]; - int tmp_fd; - FILE *txt_f = NULL; - int linenum = 0, parsed = 0; struct db_input *dbline; /* open temporary file */ @@ -112,19 +110,33 @@ /* skip blank lines and comments */ if ( strlen(buf) == 1 || buf[0] == '#' ) continue; - /* validate and add! 
*/ + /* validate line */ dbline = parse_dbinput( buf ); if ( dbline == NULL ) { - debug( 0, LOC, "Invalid rule (line %d), skipping: %s", linenum, buf ); - continue; + debug( 0, LOC, "Invalid rule (line %d), stopping: %s", linenum, buf ); + error = 1; + break; } + /* validate rule */ + rule = parse_rule( dbline->val ); + if ( rule == NULL || + ( rule->negate == 1 && rule->host != NULL ) || + ( rule->negate == 0 && rule->host == NULL )) { + + debug( 0, LOC, "Invalid rule (line %d), stopping: %s", linenum, buf ); + error = 1; + break; + } + + /* looking good, add rule */ cdb_make_add( &cdbm, dbline->key, dbline->klen, dbline->val, dbline->vlen ); parsed++; free( dbline->key ); free( dbline->val ); - free( dbline ); + free( dbline ), dbline = NULL; + finish_parsed( rule ), rule = NULL; } /* write indexes */ @@ -132,13 +144,22 @@ cdb_make_finish( &cdbm ); close( tmp_fd ); - /* move cdb into place */ - if ( (rename( tmpfile, v.dbname )) == -1 ) { - debug( 0, LOC, "Unable to move temp cdb into place: %s", strerror(errno) ); - return( 1 ); + if ( error == 1 ) { + /* delete the tmp db on errors */ + if ( (unlink( tmpfile )) != 0 ) { + debug( 0, LOC, "Unable to remove temp cdb: %s", strerror(errno) ); + return( 1 ); + } + } + else { + /* move cdb into place */ + if ( (rename( tmpfile, v.dbname )) == -1 ) { + debug( 0, LOC, "Unable to move temp cdb into place: %s", strerror(errno) ); + return( 1 ); + } + debug( 0, LOC, "Added %d rules to %s.\n", parsed, v.dbname ); } - debug( 0, LOC, "Added %d rules to %s.\n", parsed, v.dbname ); return( 0 ); } @@ -169,7 +190,7 @@ vlen = cdb_datalen( &v.db ); /* pull the value from the db */ - if ( (val = calloc( vlen, sizeof(char) )) == NULL ) { + if ( (val = calloc( vlen+1, sizeof(char) )) == NULL ) { debug( 5, LOC, "Unable to allocate memory for DB value storage: %s\n", strerror(errno) ); return( NULL ); @@ -179,6 +200,7 @@ /* check it against the request */ debug( 4, LOC, "DB match for key '%s': %s\n", key, val ); rule = parse_rule( val 
); + free( val ); if ( rule != NULL ) { if ( check_rule( rule, p_request ) == 0 ) { diff -r bd746609ba46 -r d4ce82194b64 parser.rl --- a/parser.rl Mon Nov 07 10:43:09 2011 -0800 +++ b/parser.rl Wed Nov 09 15:54:37 2011 -0800 @@ -66,7 +66,7 @@ parsed * parse_request( char *line ) { - /* machine required vars */ + /* machine required vars */ unsigned short int cs = 1; char *p = line; char *pe = p + strlen(p); @@ -149,16 +149,20 @@ client_ip = ipv4 >c_ip_start %c_ip_finish @!c_ip_error; method = upper+ >meth_start %meth_finish @!meth_error; - SquidLine = ( - start: channel_id? -> Url, - Url: scheme? host port? path? -> Client, + URL = ( + start: channel_id? -> Url, + Url: scheme? host port? path? -> final + ); + + Squidvars = ( + start: Client: space client_ip '/' ( hostname | '-' ) -> User, User: space pchar+ -> Method, Method: space method -> KVPairs, KVPairs: ( space any+ )? -> final ); - main := SquidLine '\n'; + main := URL Squidvars? '\n'; }%% /* state machine */ @@ -186,7 +190,7 @@ parsed * parse_rule( char *rewrite ) { - /* machine required vars */ + /* machine required vars */ unsigned short int cs = 1; char *p = rewrite; char *pe = p + strlen(p); @@ -203,7 +207,7 @@ action match_finish { MARK_E(path_re) } action redir_start { MARK_S(redir) } action redir_finish { p_parsed->tokens.redir_length = 3; } # strip trailing colon - action wl_finish { p_parsed->wl = 1; } + action negate_finish { p_parsed->negate = 1; } action scheme_start { MARK_S(scheme) } action scheme_finish { MARK_E(scheme) } @@ -225,18 +229,18 @@ ipv4 = digit{1,3} '.' digit{1,3} '.' digit{1,3} '.' 
digit{1,3}; ipv6 = ( xdigit | ':' )+; - whitelist = ( '-' sep ) %wl_finish; - path_re = ( any - space )+ >match_start %match_finish @!match_error; + negate = ( '-' ) %negate_finish; + path_re = ( any - space )+ >match_start %match_finish @!match_error; - redir = ( digit{3} ':' ) >redir_start %redir_finish; - scheme = ( alpha{3,5} '://' ) >scheme_start %scheme_finish; - host = ( hostname | ipv4 ) >host_start %host_finish @!host_error; - port = ( ':' digit{1,5} ) >port_start %port_finish; - path = path_segment* >path_start %path_finish; + redir = ( ('301' | '302') ':' ) >redir_start %redir_finish; + scheme = ( alpha{3,5} '://' ) >scheme_start %scheme_finish; + host = ( hostname | ipv4 ) >host_start %host_finish @!host_error; + port = ( ':' digit{1,5} ) >port_start %port_finish; + path = path_segment* >path_start %path_finish; - rewrite = ( sep redir? scheme? host port? path? ); + rewrite = ( redir? scheme? host port? path? ); - main := whitelist? path_re rewrite?; + main := path_re sep ( rewrite | negate ); }%% /* state machine */ @@ -264,7 +268,7 @@ struct db_input * parse_dbinput( char *line ) { - /* machine required vars */ + /* machine required vars */ unsigned short int cs = 1; char *p = line; char *pe = p + strlen(p); @@ -301,9 +305,9 @@ redir = ( digit{3} ':' ); host = ( hostname | ipv4 ); - key = ( host | '*' ) >key_start %key_finish @!key_error; - val = ( (token sep)? token (sep token)? 
) >val_start %val_finish @!val_error; - # wl regex rewrite + key = ( host | '*' ) >key_start %key_finish @!key_error; + val = ( token sep token ) >val_start %val_finish @!val_error; + # regex rewrite or negate main:= key sep val '\n'; }%% @@ -351,7 +355,7 @@ } p_parsed->type = 0; - p_parsed->wl = 0; + p_parsed->negate = 0; p_parsed->path_re = NULL; p_parsed->redir = NULL; p_parsed->scheme = NULL; diff -r bd746609ba46 -r d4ce82194b64 process.c --- a/process.c Mon Nov 07 10:43:09 2011 -0800 +++ b/process.c Wed Nov 09 15:54:37 2011 -0800 @@ -47,11 +47,8 @@ /* If request parsing failed, return a blank line to squid to allow the request to pass through unmolested. */ - if ( p_request == NULL ) { - out( "\n" ); - finish_parsed( p_request ); - return; - } + if ( p_request == NULL ) + return pass( p_request, rule ); /* * Main rewrite logic. @@ -78,14 +75,23 @@ if ( rule == NULL ) rule = find_rule( p_request->tld, p_request ); if ( rule == NULL ) rule = find_rule( "*", p_request ); - /* no matching rule still or whitelist rule? no need to rewrite anything. */ - if ( rule == NULL || rule->wl ) { - out( "\n" ); + /* no matching rule still or negated rule? no need to rewrite anything. */ + if ( rule == NULL || rule->negate ) + return pass( p_request, rule ); + + /* avoid trivial redirect loops */ + if ( + ( rule->redir ) && + ( rule->scheme == NULL || ( strcmp(p_request->scheme, rule->scheme) == 0) ) && + ( rule->path == NULL || ( strcmp(p_request->path, rule->path) == 0) ) && + ( strcmp( p_request->host, rule->host) == 0 ) + ) { + debug( 2, LOC, "Potential rewrite loop, skipping rewrite.\n" ); + return pass( p_request, rule ); } + /* otherwise, perform the rewrite. */ - else { - rewrite( p_request, rule ); - } + rewrite( p_request, rule ); finish_parsed( rule ); finish_parsed( p_request ); @@ -94,6 +100,25 @@ /* + * Allow the request to pass through without being rewritten. 
+ * + */ +void +pass( parsed *request, parsed *rule ) +{ + finish_parsed( rule ); + finish_parsed( request ); + + if ( v.debugmode >= 5 ) return; + + printf( "\n" ); + fflush( stdout ); + + return; +} + + +/* * Output a rewritten URL for squid. * */ @@ -109,6 +134,7 @@ printf("\n"); fflush( stdout ); + return; } diff -r bd746609ba46 -r d4ce82194b64 util.c --- a/util.c Mon Nov 07 10:43:09 2011 -0800 +++ b/util.c Wed Nov 09 15:54:37 2011 -0800 @@ -82,19 +82,6 @@ /* - * Output to stdout for squid, unless the debug level is at or above 5. - */ -void -out( const char *str ) -{ - if ( v.debugmode >= 5 ) return; - fprintf( stdout, "%s", str ); - fflush( stdout ); - return; -} - - -/* * Given a string, reverse it in place. */ void @@ -177,47 +164,6 @@ } -/* - * Read an entire file into memory, returning a pointer to the contents. - * Returns NULL on error. - * - */ -char * -slurp_file( char *file ) -{ - FILE *fh = NULL; - char *contents = NULL; - struct stat sb; - - if ( stat( file, &sb ) != 0 ) { - debug( 1, LOC, "Unable to stat() file '%s': %s\n", - file, strerror(errno) ); - return( NULL ); - } - - if ( (contents = malloc( sb.st_size + 1 )) == NULL ) { - debug( 5, LOC, "Unable to allocate memory for file '%s': %s\n", - file, strerror(errno) ); - return( NULL ); - } - - if ( (fh = fopen( file, "r" )) == NULL ) { - debug( 1, LOC, "Could not open file for reading '%s': %s\n", - file, strerror(errno) ); - return( NULL ); - } - - if ( fread( contents, sizeof(char), sb.st_size, fh ) != (unsigned int)sb.st_size ) { - debug( 5, LOC, "Short read for file '%s'?: %s\n", file ); - fclose( fh ); - return( NULL ); - } - - fclose( fh ); - return( contents ); -} - - /* * Allocate memory and copy +length+ bytes (plus 1 for null) from the given * +string+ into a new string, returning a pointer to it. 
diff -r bd746609ba46 -r d4ce82194b64 volta.h --- a/volta.h Mon Nov 07 10:43:09 2011 -0800 +++ b/volta.h Wed Nov 09 15:54:37 2011 -0800 @@ -117,7 +117,7 @@ */ typedef struct parsed { unsigned short int type; - unsigned short int wl; + unsigned short int negate; char *path_re; char *redir; char *scheme; @@ -162,11 +162,9 @@ void usage( char * ); void debug( int, char *, int, const char *, ... ); -void out( const char * ); void reverse_str( char * ); void lowercase_str( char *, unsigned short int ); void report_speed( void ); -char *slurp_file( char * ); char *extend_line( char *, const char * ); char *copy_string_token( char *, unsigned short int ); /* struct in_addr *copy_ipv4_token( char *, unsigned short int ); */ @@ -181,6 +179,7 @@ void finish_parsed( parsed * ); void reset_results( parsed **, unsigned int ); unsigned short int check_rule( parsed *, parsed * ); +void pass( parsed *, parsed * ); void rewrite( parsed *, parsed * ); #endif