- 1st pass at documentation with the README.
authorMahlon E. Smith <mahlon@laika.com>
Wed, 09 Nov 2011 15:54:37 -0800
changeset 18 d4ce82194b64
parent 17 bd746609ba46
child 19 e8d144ca2acd
child 26 7b28fb383da2
- 1st pass at documentation with the README. - Attempt to detect trivial redirect loops. - Loosen the request parser enough to just accept URLs, so you can easily test rewrite behavior by just running volta and pasting URLs. - Alter the whitelist "negative" rule syntax to more closely match normal rules. Rename references to negative rules from "whitelist" rules, to avoid confusion.
Makefile
README
db.c
parser.rl
process.c
util.c
volta.h
--- a/Makefile	Mon Nov 07 10:43:09 2011 -0800
+++ b/Makefile	Wed Nov 09 15:54:37 2011 -0800
@@ -17,7 +17,7 @@
 debug: CFLAGS += $(CFLAGS_DEBUG)
 debug: LIBS += -lprofiler
 else
-volta: CFLAGS += -L/opt/local/lib -I/opt/local/include
+volta: CFLAGS += -L/opt/local/lib -I/opt/local/include -L/usr/local/lib -I/usr/local/include
 debug: CFLAGS += $(CFLAGS_DEBUG)\
 	$(shell pkg-config --cflags-only-I --libs-only-L $(DEPS_DEBUG))
 debug: LIBS += $(shell pkg-config --libs-only-l $(DEPS_DEBUG))
--- a/README	Mon Nov 07 10:43:09 2011 -0800
+++ b/README	Wed Nov 09 15:54:37 2011 -0800
@@ -3,14 +3,173 @@
 =====
 
 What is volta?
-	- high performance / low resource redirector
+--------------
+
+Volta is a high performance, low resource URI rewriter for use with the
+Squid caching proxy server (http://www.squid-cache.org/).  With it, you
+can dynamically alter URI requests that pass through Squid based on
+various criteria.
+
+It uses a state machine to parse URIs and rules, and a constant database
+to store and access those rules.
+
+
+Why is it called "volta"?
+-------------------------
 
-Why "volta"?
-	- latin term, turn
+It's a type of old Italian music written in triple-time.  Quick!
+
+
+How fast is it?
+---------------
+
+On a 2GHz Xeon 5130, it can process a million squid requests against
+10000 rules in less than 8 seconds, using about 800k of ram.  On a
+1.8GHz Intel E4300, it can do it in 3 seconds.
+
+Your mileage may vary, but for most all intents and purposes the answer
+is "definitely fast enough."
+
 
 Configuring squid
+-----------------
+
+You must enable url rewriting from within the squid.conf file.
+
+	url_rewrite_program /usr/local/bin/volta
+
+... and that's it.  You may need some additional customization, like where
+the volta database is stored on disk:
+
+	url_rewrite_program /usr/local/bin/volta -f /var/db/squid/volta.db
+
+Busy servers:
+
+Make sure url_rewrite_concurrency is disabled; volta is single threaded.
+Instead, just add more volta children.  They are lightweight, so load em
+up.  A proxy at my $DAYJOB is in use by around 450 people, and we get by
+nicely with 10 volta children.
+
+	url_rewrite_concurrency 0
+	url_rewrite_children 10
+
 
 Using volta
+-----------
 
-How to
+See the INSTALL file for instructions on how to compile volta.
+
+Volta reads its rewrite rules from a local database.  You can create the
+rules in a text editor, then convert them to the database like so:
+
+	% volta -c rules.txt
+
+You'll be left with a "volta.db" file in the current directory.  Put it
+wherever you please, and use the -f flag to point to it.
+
+
+Rule file syntax
+----------------
+
+Volta's rule syntax is designed to be easy to parse by humans and
+machines.  Blank lines are skipped, as is any line that starts with the
+'#' character, so you can keep the ascii version of your rules well
+documented and in version control.
+
+When compiling the ruleset into the database format, volta detects
+malformed rules and stops if there are any problems, leaving your
+original database intact.  You can change the ruleset at any time while
+volta is running, and the new rules will take effect within about 10
+seconds.  No need to restart squid!
+
+There are two types of rules -- positive matches, and negative matches.
+Positive matches cause the rewrite, negative matches allow the original
+request to pass.  Rule order is consistent, top-down, first match wins.
+Fields are separated by any amount of whitespace (spaces or tabs.)
+
+
+### Positive matches:
+
+    First field: the hostname to match.
+
+      You can use an exact hostname (www.example.com), or the top level
+      domain (tld) if you want to match everything under a specific host
+      (example.com.)  You can also use a single '*' to match every request,
+      though this essentially bypasses a lot of what makes volta quick; it
+      is included for completeness.  You may have an unlimited number of
+      rules per hostname.  Hostnames are compared without case sensitivity.
+
+
+    Second field: the path to match.
+
+	  This can be an exact match ('/path/to/something.html'), a regular
+	  expression ('\.(jpg|gif|png)$'), or a single '*' to match for any
+	  path. Regular expressions are matched without case sensitivity.  There
+	  is currently no support for capturing, though this may be added in
+	  a future release.
+
+
+    Third field: The redirect code and url to rewrite to.
 
+      Any pieces of a url that are omitted are automatically replaced
+      with the original request's element -- the exception is a hostname,
+      which is required.  If you omit a redirect code, the URL rewrite is
+      transparent to the client.  You can attach a 301: or 302: prefix to
+      cause a permanent or temporary code to be respectively sent, instead.
+
+
+### Negative matches:
+
+    First field: the hostname to match.
+
+	  See above -- all the same rules apply.
+
+
+    Second field: the path to match.
+
+	  See above -- all the same rules apply.
+
+
+    Third field: the 'negative' marker.
+
+	  This is simply the '-' character, which signals to volta that this is
+	  a negative matching rule.
+
+
+You can easily test your rules by running volta on the command line, and
+pasting URLs into it.   Boost the debug level (-d4) if you're having any issues.
+
+
+Examples
+--------
+
+Rewrite all requests to Google to the SSL version:
+
+    google.com * 302:https://www.google.com
+
+	This will redirect the request "http://www.google.com/search?q=test" to
+	"https://www.google.com/search?q=test".
+
+
+Transparently alter all uploaded images on imgur to be my face:  :)
+
+	i.imgur.com \.(gif|png|jpg)$ http://www.martini.nu/images/mahlon.jpg
+
+
+Expand a local, non qualified hostname to a FQDN (useful alongside the
+'dns_defnames' squid setting to enforce browser proxy behaviors):
+
+	local-example * local-example.company.com
+
+
+Cause all blog content except for 2011 posts to permanently redirect to
+an archival page:
+
+	martini.nu /blog/2011 -
+	martini.nu /blog 301:martini.nu/content-archived.html
+
+
+Turn off rewriting for a specific network segment or IP address:
+
+	Squid has this ability built in -- see the 'url_rewrite_access' setting.
+
--- a/db.c	Mon Nov 07 10:43:09 2011 -0800
+++ b/db.c	Wed Nov 09 15:54:37 2011 -0800
@@ -80,13 +80,11 @@
 unsigned short int
 db_create_new( char *txt )
 {
+	FILE *txt_f  = NULL;
+	parsed *rule = NULL;
+	char buf[ LINE_BUFSIZE*10 ], tmpfile[25];
+	int  tmp_fd, linenum = 0, parsed = 0, error = 0;
 	struct cdb_make cdbm;
-
-	char buf[ LINE_BUFSIZE*10 ];
-	char tmpfile[25];
-	int  tmp_fd;
-	FILE *txt_f = NULL;
-	int  linenum = 0, parsed = 0;
 	struct db_input *dbline;
 
 	/* open temporary file */
@@ -112,19 +110,33 @@
 		/* skip blank lines and comments */
 		if ( strlen(buf) == 1 || buf[0] == '#' ) continue;
 
-		/* validate and add! */
+		/* validate line */
 		dbline = parse_dbinput( buf );
 		if ( dbline == NULL ) {
-			debug( 0, LOC, "Invalid rule (line %d), skipping: %s", linenum, buf );
-			continue;
+			debug( 0, LOC, "Invalid rule (line %d), stopping: %s", linenum, buf );
+			error = 1;
+			break;
 		}
 
+		/* validate rule */
+		rule = parse_rule( dbline->val );
+		if ( rule == NULL ||
+			( rule->negate == 1 && rule->host != NULL ) ||
+			( rule->negate == 0 && rule->host == NULL )) {
+
+			debug( 0, LOC, "Invalid rule (line %d), stopping: %s", linenum, buf );
+			error = 1;
+			break;
+		}
+
+		/* looking good, add rule */
 		cdb_make_add( &cdbm, dbline->key, dbline->klen, dbline->val, dbline->vlen );
 		parsed++;
 
 		free( dbline->key );
 		free( dbline->val );
-		free( dbline );
+		free( dbline ), dbline = NULL;
+		finish_parsed( rule ), rule = NULL;
 	}
 
 	/* write indexes */
@@ -132,13 +144,22 @@
 	cdb_make_finish( &cdbm );
 	close( tmp_fd );
 
-	/* move cdb into place */
-	if ( (rename( tmpfile, v.dbname )) == -1 ) {
-		debug( 0, LOC, "Unable to move temp cdb into place: %s", strerror(errno) );
-		return( 1 );
+	if ( error == 1 ) {
+		/* delete the tmp db on errors */
+		if ( (unlink( tmpfile )) != 0 ) {
+			debug( 0, LOC, "Unable to remove temp cdb: %s", strerror(errno) );
+			return( 1 );
+		}
+	}
+	else {
+		/* move cdb into place */
+		if ( (rename( tmpfile, v.dbname )) == -1 ) {
+			debug( 0, LOC, "Unable to move temp cdb into place: %s", strerror(errno) );
+			return( 1 );
+		}
+		debug( 0, LOC, "Added %d rules to %s.\n", parsed, v.dbname );
 	}
 
-	debug( 0, LOC, "Added %d rules to %s.\n", parsed, v.dbname );
 	return( 0 );
 }
 
@@ -169,7 +190,7 @@
 		vlen = cdb_datalen( &v.db );
 
 		/* pull the value from the db */
-		if ( (val = calloc( vlen, sizeof(char) )) == NULL ) {
+		if ( (val = calloc( vlen+1, sizeof(char) )) == NULL ) {
 			debug( 5, LOC, "Unable to allocate memory for DB value storage: %s\n",
 					strerror(errno) );
 			return( NULL );
@@ -179,6 +200,7 @@
 		/* check it against the request */
 		debug( 4, LOC, "DB match for key '%s': %s\n", key, val );
 		rule = parse_rule( val );
+
 		free( val );
 		if ( rule != NULL ) {
 			if ( check_rule( rule, p_request ) == 0 ) {
--- a/parser.rl	Mon Nov 07 10:43:09 2011 -0800
+++ b/parser.rl	Wed Nov 09 15:54:37 2011 -0800
@@ -66,7 +66,7 @@
 parsed *
 parse_request( char *line )
 {
-   	/* machine required vars */
+	/* machine required vars */
 	unsigned short int cs = 1;
 	char *p   = line;
 	char *pe  = p + strlen(p);
@@ -149,16 +149,20 @@
 	client_ip      = ipv4                  >c_ip_start   %c_ip_finish   @!c_ip_error;
 	method         = upper+                >meth_start   %meth_finish   @!meth_error;
 
-	SquidLine = (
- 		start:   channel_id?                            -> Url,
-		Url:     scheme? host port? path?               -> Client,
+	URL = (
+		start:   channel_id?                            -> Url,
+		Url:     scheme? host port? path?               -> final
+ 	);
+
+	Squidvars = (
+		start:
 		Client:  space client_ip '/' ( hostname | '-' ) -> User,
 		User:    space pchar+                           -> Method,
 		Method:  space method                           -> KVPairs,
 		KVPairs: ( space any+ )?                        -> final
  	);
 
-	main := SquidLine '\n';
+	main := URL Squidvars? '\n';
 }%%
 
 	/* state machine */
@@ -186,7 +190,7 @@
 parsed *
 parse_rule( char *rewrite )
 {
-   	/* machine required vars */
+	/* machine required vars */
 	unsigned short int cs = 1;
 	char *p   = rewrite;
 	char *pe  = p + strlen(p);
@@ -203,7 +207,7 @@
 	action match_finish  { MARK_E(path_re) }
 	action redir_start   { MARK_S(redir) }
 	action redir_finish  { p_parsed->tokens.redir_length = 3; } # strip trailing colon
-	action wl_finish     { p_parsed->wl = 1; }
+	action negate_finish { p_parsed->negate = 1; }
 
 	action scheme_start  { MARK_S(scheme) }
 	action scheme_finish { MARK_E(scheme) }
@@ -225,18 +229,18 @@
 	ipv4      = digit{1,3} '.' digit{1,3} '.' digit{1,3} '.' digit{1,3};
 	ipv6      = ( xdigit | ':' )+;
 
-	whitelist = ( '-' sep )          %wl_finish;
-	path_re   = ( any - space )+     >match_start  %match_finish @!match_error;
+	negate    = ( '-' )                 %negate_finish;
+	path_re   = ( any - space )+        >match_start  %match_finish @!match_error;
 
-	redir     = ( digit{3} ':' )     >redir_start  %redir_finish;
-	scheme    = ( alpha{3,5} '://' ) >scheme_start %scheme_finish;
-	host      = ( hostname | ipv4 )  >host_start   %host_finish   @!host_error;
-	port      = ( ':' digit{1,5} )   >port_start   %port_finish;
-	path      = path_segment*        >path_start   %path_finish;
+	redir     = ( ('301' | '302') ':' ) >redir_start  %redir_finish;
+	scheme    = ( alpha{3,5} '://' )    >scheme_start %scheme_finish;
+	host      = ( hostname | ipv4 )     >host_start   %host_finish   @!host_error;
+	port      = ( ':' digit{1,5} )      >port_start   %port_finish;
+	path      = path_segment*           >path_start   %path_finish;
 
-	rewrite   = ( sep redir? scheme? host port? path? );
+	rewrite   = ( redir? scheme? host port? path? );
 
-	main := whitelist? path_re rewrite?;
+	main := path_re sep ( rewrite | negate );
 }%%
 
 	/* state machine */
@@ -264,7 +268,7 @@
 struct db_input *
 parse_dbinput( char *line )
 {
-   	/* machine required vars */
+	/* machine required vars */
 	unsigned short int cs = 1;
 	char *p   = line;
 	char *pe  = p + strlen(p);
@@ -301,9 +305,9 @@
 	redir          = ( digit{3} ':' );
 	host           = ( hostname | ipv4 );
 
-	key = ( host | '*' )                      >key_start %key_finish @!key_error;
-	val = ( (token sep)? token (sep token)? ) >val_start %val_finish @!val_error;
-	#           wl       regex   rewrite
+	key = ( host | '*' )      >key_start %key_finish @!key_error;
+	val = ( token sep token ) >val_start %val_finish @!val_error;
+	#       regex     rewrite or negate
 	
 	main:= key sep val '\n';
 }%%
@@ -351,7 +355,7 @@
 	}
 
 	p_parsed->type      = 0;
-	p_parsed->wl        = 0;
+	p_parsed->negate    = 0;
 	p_parsed->path_re   = NULL;
 	p_parsed->redir     = NULL;
 	p_parsed->scheme    = NULL;
--- a/process.c	Mon Nov 07 10:43:09 2011 -0800
+++ b/process.c	Wed Nov 09 15:54:37 2011 -0800
@@ -47,11 +47,8 @@
 
 	/* If request parsing failed, return a blank line to squid
 	   to allow the request to pass through unmolested. */
-	if ( p_request == NULL ) {
-		out( "\n" );
-		finish_parsed( p_request );
-		return;
-	}
+	if ( p_request == NULL )
+		return pass( p_request, rule );
 
 	/*
 	 * Main rewrite logic.
@@ -78,14 +75,23 @@
 	if ( rule == NULL ) rule = find_rule( p_request->tld, p_request );
 	if ( rule == NULL ) rule = find_rule( "*", p_request );
 
-	/* no matching rule still or whitelist rule?  no need to rewrite anything. */
-	if ( rule == NULL || rule->wl ) {
-		out( "\n" );
+	/* no matching rule still or negated rule?  no need to rewrite anything. */
+	if ( rule == NULL || rule->negate )
+		return pass( p_request, rule );
+
+	/* avoid trivial redirect loops */
+	if (
+		( rule->redir ) &&
+		( rule->scheme == NULL || ( strcmp(p_request->scheme, rule->scheme) == 0) ) &&
+		( rule->path   == NULL || ( strcmp(p_request->path,     rule->path) == 0) ) &&
+		( strcmp( p_request->host, rule->host) == 0 )
+	   ) {
+		debug( 2, LOC, "Potential rewrite loop, skipping rewrite.\n" );
+		return pass( p_request, rule );
 	}
+
 	/* otherwise, perform the rewrite. */
-	else {
-		rewrite( p_request, rule );
-	}
+	rewrite( p_request, rule );
 
 	finish_parsed( rule );
 	finish_parsed( p_request );
@@ -94,6 +100,25 @@
 
 
 /*
+ * Allow the request to pass through without being rewritten.
+ * Hands both parsed structs to finish_parsed(), then prints a blank line.
+ */
+void
+pass( parsed *request, parsed *rule )
+{
+	finish_parsed( rule );
+	finish_parsed( request );
+
+	if ( v.debugmode >= 5 ) return; /* no output for squid at debug level 5+ */
+
+	printf( "\n" ); /* a blank line lets squid pass the request through as-is */
+	fflush( stdout );
+
+	return;
+}
+
+
+/*
  * Output a rewritten URL for squid.
  *
  */
@@ -109,6 +134,7 @@
 
 	printf("\n");
 	fflush( stdout );
+
 	return;
 }
 
--- a/util.c	Mon Nov 07 10:43:09 2011 -0800
+++ b/util.c	Wed Nov 09 15:54:37 2011 -0800
@@ -82,19 +82,6 @@
 
 
 /*
- * Output to stdout for squid, unless the debug level is at or above 5.
- */
-void
-out( const char *str )
-{
-	if ( v.debugmode >= 5 ) return;
-	fprintf( stdout, "%s", str );
-	fflush( stdout );
-	return;
-}
-
-
-/*
  * Given a string, reverse it in place.
  */
 void
@@ -177,47 +164,6 @@
 }
 
 
-/*
- * Read an entire file into memory, returning a pointer to the contents.
- * Returns NULL on error.
- *
- */
-char *
-slurp_file( char *file )
-{
-	FILE *fh       = NULL;
-	char *contents = NULL;
-	struct stat sb;
-
-	if ( stat( file, &sb ) != 0 ) {
-		debug( 1, LOC, "Unable to stat() file '%s': %s\n",
-				file, strerror(errno) );
-		return( NULL );
-	}
-
-	if ( (contents = malloc( sb.st_size + 1 )) == NULL ) {
-		debug( 5, LOC, "Unable to allocate memory for file '%s': %s\n",
-				file, strerror(errno) );
-		return( NULL );
-	}
-
-	if ( (fh = fopen( file, "r" )) == NULL ) {
-		debug( 1, LOC, "Could not open file for reading '%s': %s\n",
-				file, strerror(errno) );
-		return( NULL );
-	}
-
-	if ( fread( contents, sizeof(char), sb.st_size, fh ) != (unsigned int)sb.st_size ) {
-		debug( 5, LOC, "Short read for file '%s'?: %s\n", file );
-		fclose( fh );
-		return( NULL );
-	}
-
-	fclose( fh );
-	return( contents );
-}
-
-
 /* 
  * Allocate memory and copy +length+ bytes (plus 1 for null) from the given
  * +string+ into a new string, returning a pointer to it.
--- a/volta.h	Mon Nov 07 10:43:09 2011 -0800
+++ b/volta.h	Wed Nov 09 15:54:37 2011 -0800
@@ -117,7 +117,7 @@
  */
 typedef struct parsed {
 	unsigned short int type;
-	unsigned short int wl;
+	unsigned short int negate;
 	char   *path_re;
 	char   *redir;
 	char   *scheme;
@@ -162,11 +162,9 @@
 
 void usage( char * );
 void debug( int, char *, int, const char *, ... );
-void out( const char * );
 void reverse_str( char * );
 void lowercase_str( char *, unsigned short int );
 void report_speed( void );
-char *slurp_file( char * );
 char *extend_line( char *, const char * );
 char *copy_string_token( char *, unsigned short int );
 /* struct in_addr *copy_ipv4_token( char *, unsigned short int ); */
@@ -181,6 +179,7 @@
 void finish_parsed( parsed * );
 void reset_results( parsed **, unsigned int );
 unsigned short int check_rule( parsed *, parsed * );
+void pass( parsed *, parsed * );
 void rewrite( parsed *, parsed * );
 
 #endif