parser.rl
changeset 10 d07309450285
parent 2 8c88756f81b0
child 12 191b3c25974a
--- a/parser.rl	Wed Sep 28 09:04:16 2011 -0700
+++ b/parser.rl	Mon Oct 17 09:12:00 2011 -0700
@@ -28,122 +28,258 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-/*
-Squid docs:
----------------------------------------------------------------------------
-TAG: url_rewrite_program
-Specify the location of the executable for the URL rewriter.
-Since they can perform almost any function there isn't one included.
-
-For each requested URL rewriter will receive on line with the format
-
-URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL>
-
-In the future, the rewriter interface will be extended with
-key=value pairs ("kvpairs" shown above).  Rewriter programs
-should be prepared to receive and possibly ignore additional
-whitespace-separated tokens on each input line.
-
-And the rewriter may return a rewritten URL. The other components of
-the request line does not need to be returned (ignored if they are).
-
-The rewriter can also indicate that a client-side redirect should
-be performed to the new URL. This is done by prefixing the returned
-URL with "301:" (moved permanently) or 302: (moved temporarily).
-
-By default, a URL rewriter is not used.
----------------------------------------------------------------------------
-*/
-
 #include "volta.h"
 
 %%{
 	machine redirector;
 
-	action parse_error {
-		debug( 2, LOC, "parse error\n" );
-		return( NULL );
-	}
+	action success { valid = 1; }
+	action error   { valid = 0; }
 
-	action yay {
-		printf( "I saw: %s", p+1 );
+	action channel_id_found  {
+		debug( 1, LOC, "Channel ID found in redirector input.  Set 'url_rewrite_concurrency' to '0' in squid.\n" );
+		fbreak;
 	}
 
-	# http://, ftp://, https://, etc
-	proto = alpha{3,5} . '://';
+	action scheme_start  { p_request->tokens.scheme_start  = fpc; }   # *_start: remember where the token begins
+	action scheme_finish { p_request->tokens.scheme_length = fpc - p_request->tokens.scheme_start; }   # *_finish: bytes from that mark to the current position
+	action scheme_error  { debug( 3, LOC, "Unable to parse scheme.\n" ); }
+
+	action host_start  { p_request->tokens.host_start  = fpc; }
+	action host_finish { p_request->tokens.host_length = fpc - p_request->tokens.host_start; }
+	action host_error  { debug( 3, LOC, "Unable to parse hostname.\n" ); }
+
+	action port_start  { p_request->tokens.port_start  = fpc; }
+	action port_finish { p_request->tokens.port_length = fpc - p_request->tokens.port_start; }
+
+	action path_start  { p_request->tokens.path_start  = fpc; }
+	action path_finish { p_request->tokens.path_length = fpc - p_request->tokens.path_start; }
+
+	action meth_start  { p_request->tokens.meth_start  = fpc; }
+	action meth_finish { p_request->tokens.meth_length = fpc - p_request->tokens.meth_start; }
+	action meth_error  { debug( 3, LOC, "Unable to parse method.\n" ); }
+
+	action c_ip_start  { p_request->tokens.c_ip_start  = fpc; }
+	action c_ip_finish { p_request->tokens.c_ip_length = fpc - p_request->tokens.c_ip_start; }
+	action c_ip_error  { debug( 3, LOC, "Unable to parse the client IP address.\n" ); }
 
-	# http://mahlon:password@example.com or http://mahlon@example.com
-    #       username              optional password
-	creds = ( alnum | [+._\-] )+ . ( ':' . any+ )? . '@';
+    # 
+    # Squid line: URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL>
+    # 
+    # URI Syntax (RFC 3986) misc notes:
+    #
+    # - Scheme isn't passed to redirectors on CONNECT method requests
+    #
+    # - Hostname segments aren't supposed to be individually greater than 63 chars,
+    #   and the hostname in total shouldn't exceed 255.  They also shouldn't be entirely
+    #   made up of digits, or contain underscores.  In practice, these rules appear to
+    #   be violated constantly by some pretty big sites. I'm looking at you, facebook.
+    #   (( alnum ) | ( alnum . [a-zA-Z0-9\-]{0,63} . alnum )) & !( digit+ );
+    #
+    # - ipv6 has some utterly insane rules (RFC 5952) in the name of "shortcuts", which
+    #   only seem like shortcuts to someone writing IP addresses by hand.  Anyone that
+    #   has to parse (or even just read) them has a bunch of seemingly arbitrary work
+    #   dumped in their lap.  Heck, it's impossible to even search for an ipv6 address
+    #   that contains zeros in a text editor, because you have no idea how it might
+    #   be represented.  Rad!
+    #
+    #   The parser just trusts any ipv6 squid hands us as being valid, without
+    #   any real parsing/validation, other than it consists of hex digits and colons.
+    #
+    # - This parser originally validated path/query/fragment as well, but there were
+    #   enough inconsistencies with unescaped chars and other real-life RFC deviations
+    #   that I opted to just accept what we get from squid.
+	#
+	# - Redirectors aren't handed any userinfo (http://mahlon:password@example.com),
+    #   so no need to check for that.
+    #
+
+	host_component = alnum | ( alnum [a-zA-Z0-9\-_]* alnum );   # alnum at both ends; '-' and '_' allowed inside
+	pchar          = ( alnum | [\-._~!$%&'()*+,;=] );   # characters accepted in the user field
+	path_segment   = '/' ( any - space )*;   # a slash followed by any run of non-space characters
 
-	main  := ( proto . creds ) | proto @yay '\n';
+	hostname       = host_component ( '.' host_component )* '.'?;   # dot-separated components, optional trailing dot
+	ipv4           = digit{1,3} '.' digit{1,3} '.' digit{1,3} '.' digit{1,3};   # digit counts only; octet range is not checked
+	ipv6           = ( xdigit | ':' )+;   # NOTE(review): defined but not referenced by host or client_ip below -- confirm intent
+
+	channel_id     = ( digit+ space )      %channel_id_found;
+	scheme         = ( alpha{3,5} '://' )  >scheme_start %scheme_finish @!scheme_error;   # recorded token includes the '://'
+	host           = ( hostname | ipv4 )   >host_start   %host_finish   @!host_error;
+	port           = ( ':' digit{1,5} )    >port_start   %port_finish;   # recorded token includes the leading ':'
+	path           = path_segment*         >path_start   %path_finish;
+	client_ip      = ipv4                  >c_ip_start   %c_ip_finish   @!c_ip_error;
+	method         = upper+                >meth_start   %meth_finish   @!meth_error;
+
+	Line = (
+ 		start: (
+			channel_id? -> Url
+		),
+
+		Url: (
+		   	scheme? host port? path? space -> Client
+ 		),
+
+		Client: (
+			client_ip '/' ( hostname | '-' ) space -> User
+		),
+
+		User: (
+			pchar+ space -> Method
+		),
+
+		Method: (
+			method -> KVPairs
+		),
+
+		KVPairs: (
+			( space any+ )? -> final
+		)
+ 	) %success @!error;
+
+
+	main := Line '\n';
 }%%
 %% write data;
 
-/*
-%%{
-	machine redirector;
-
-	action yay {
-		printf( "I saw: %s", p+1 );
-	}
-
-	# http://, ftp://, https://, etc
-	proto = alpha{3,5} . '://';
-
-	# http://mahlon:password@example.com or http://mahlon@example.com
-    #       username              optional password
-	creds = ( alnum | [+._\-] )+ . ( ':' . any+ )? . '@';
-
-	main := ( proto . creds ) | proto @yay '\n';
-}%%
-%% write data;
+/* 
+ * Tokenize an incoming line from squid, returning a parsed and populated
+ * structure to make redirection decisions against.  This pointer should
+ * be freed using cleanup_request() after use.
+ * 
+ * Squid documentation about redirectors:
+ * ---------------------------------------------------------------------------
+ * TAG: url_rewrite_program
+ * Specify the location of the executable for the URL rewriter.
+ * Since they can perform almost any function there isn't one included.
+ * 
+ * For each requested URL rewriter will receive on line with the format
+ * 
+ * URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL>
+ * 
+ * In the future, the rewriter interface will be extended with
+ * key=value pairs ("kvpairs" shown above).  Rewriter programs
+ * should be prepared to receive and possibly ignore additional
+ * whitespace-separated tokens on each input line.
+ * 
+ * And the rewriter may return a rewritten URL. The other components of
+ * the request line does not need to be returned (ignored if they are).
+ * 
+ * The rewriter can also indicate that a client-side redirect should
+ * be performed to the new URL. This is done by prefixing the returned
+ * URL with "301:" (moved permanently) or 302: (moved temporarily).
+ * 
+ * By default, a URL rewriter is not used.
+ * ---------------------------------------------------------------------------
 */
-
-
-/*
-%%{
-	machine foo;
-
-	OPEN = 0;
-	CLOSE = 1;
+request *
+parse( char *line )
+{
+   	/* machine required vars */
+	int  cs   = 0;
+	char *p   = line;
+	char *pe  = p + strlen(p);
+	char *eof = NULL;
 
-	main :=
-		start:
-		door_closed: (
-			OPEN -> door_open -> final
-		),
-		door_open: (
-			CLOSE -> door_closed
-		);
-}%%
-*/
-
-struct request *
-parse( char *p )
-{
-   	/* initial machine state */
-	short int cs = 0;
-
-	/* the client request object */
-	request c_request;
-	request *cp_request = &c_request;
-
-	/*
-	char ip[ INET_ADDRSTRLEN ];
-	inet_pton( AF_INET, "127.0.0.1", &cp_request->ip );
-	inet_ntop( AF_INET, &cp_request->ip, ip, INET_ADDRSTRLEN );
-	*/
-
-	/* initalize state machine with current line */
-	char *pe = p + strlen(p) + 1;
+	unsigned char valid = 0;              /* set to 1 by the machine's success action */
+	request *p_request = init_request();  /* the client request pointer */
+	if ( p_request == NULL ) return( NULL );  /* allocation failed; the machine actions write through p_request */
 
 	/* enter state machine */
 	%% write init;
 	%% write exec;
 
-	/* reset the request */
-	/* c_request = reset_request; */
-	return( cp_request );
+	/* If we were given an invalid line, bail early (nothing has been copied out yet) */
+	if ( valid == 0 ) {
+		free( p_request ), p_request = NULL;
+		debug( 3, LOC, "Invalid line (%d), skipped\n", v.timer.lines + 1 );
+		debug( 4, LOC, "%s", line );
+		return( NULL );
+	}
+
+	(void)populate_request( p_request );
+	return( p_request );  /* caller releases it with cleanup_request() */
 }
 
+
+/*
+ * Allocate a request struct and return it with every member cleared.
+ * Logs the failure and returns NULL if the allocation does not succeed.
+ */
+request *
+init_request( void )
+{
+	request *req = malloc( sizeof *req );
+	if ( req == NULL ) {
+		debug( 1, LOC, "Unable to allocate memory for request struct: %s\n", strerror(errno) );
+		return( NULL );
+	}
+	/* token markers: nothing has been located in the input yet */
+	req->tokens.scheme_start  = NULL;
+	req->tokens.scheme_length = 0;
+	req->tokens.host_start    = NULL;
+	req->tokens.host_length   = 0;
+	req->tokens.port_start    = NULL;
+	req->tokens.port_length   = 0;
+	req->tokens.path_start    = NULL;
+	req->tokens.path_length   = 0;
+	req->tokens.meth_start    = NULL;
+	req->tokens.meth_length   = 0;
+	req->tokens.c_ip_start    = NULL;
+	req->tokens.c_ip_length   = 0;
+	/* copied-out values: none exist until the tokens are populated */
+	req->scheme    = NULL;
+	req->host      = NULL;
+	req->port      = NULL;
+	req->path      = NULL;
+	req->user      = NULL;
+	req->method    = NULL;
+	req->client_ip = NULL;
+	return( req );
+}
+
+
+/*
+ * Fill in the request's value members from the token start/length pairs
+ * recorded during parsing.
+ */
+void
+populate_request( struct request *p_request )
+{
+	p_request->scheme = copy_string_token(
+		p_request->tokens.scheme_start, p_request->tokens.scheme_length );
+	p_request->host = copy_string_token(
+		p_request->tokens.host_start, p_request->tokens.host_length );
+	p_request->port = copy_string_token(
+		p_request->tokens.port_start, p_request->tokens.port_length );
+	p_request->path = copy_string_token(
+		p_request->tokens.path_start, p_request->tokens.path_length );
+	p_request->method = copy_string_token(
+		p_request->tokens.meth_start, p_request->tokens.meth_length );
+
+	/* the client address is the one member handed to copy_ipv4_token instead */
+	p_request->client_ip = copy_ipv4_token(
+		p_request->tokens.c_ip_start, p_request->tokens.c_ip_length );
+}
+
+
+/*
+ * Free a request together with the member values it holds; NULL is ignored.
+ * NOTE(review): the 'user' member is not freed here.  Nothing visible in this
+ * changeset assigns it anything but NULL -- confirm no other code allocates it.
+ */
+void
+cleanup_request( struct request *p_request )
+{
+	if ( p_request != NULL ) {
+		/* members first, then the struct that holds them */
+		free( p_request->scheme );
+		free( p_request->host );
+		free( p_request->port );
+		free( p_request->path );
+		free( p_request->method );
+		free( p_request->client_ip );
+
+		free( p_request );
+	}
+}
+