parser.rl
changeset 12 191b3c25974a
parent 10 d07309450285
child 13 23a242d7b7fa
--- a/parser.rl	Mon Oct 17 09:12:00 2011 -0700
+++ b/parser.rl	Sun Oct 23 22:59:59 2011 -0700
@@ -30,116 +30,8 @@
 
 #include "volta.h"
 
-%%{
-	machine redirector;
-
-	action success { valid = 1; }
-	action error   { valid = 0; }
-
-	action channel_id_found  {
-		debug( 1, LOC, "Channel ID found in redirector input.  Set 'url_rewrite_concurrency' to '0' in squid.\n" );
-		fbreak;
-	}
-
-	action scheme_start  { p_request->tokens.scheme_start  = fpc; }
-	action scheme_finish { p_request->tokens.scheme_length = fpc - ( *pe + p_request->tokens.scheme_start ); }
-	action scheme_error  { debug( 3, LOC, "Unable to parse scheme.\n" ); }
-
-	action host_start  { p_request->tokens.host_start  = fpc; }
-	action host_finish { p_request->tokens.host_length = fpc - ( *pe + p_request->tokens.host_start ); }
-	action host_error  { debug( 3, LOC, "Unable to parse hostname.\n" ); }
-
-	action port_start  { p_request->tokens.port_start  = fpc; }
-	action port_finish { p_request->tokens.port_length = fpc - ( *pe + p_request->tokens.port_start ); }
-
-	action path_start  { p_request->tokens.path_start  = fpc; }
-	action path_finish { p_request->tokens.path_length = fpc - ( *pe + p_request->tokens.path_start ); }
-
-	action meth_start  { p_request->tokens.meth_start  = fpc; }
-	action meth_finish { p_request->tokens.meth_length = fpc - ( *pe + p_request->tokens.meth_start ); }
-	action meth_error  { debug( 3, LOC, "Unable to parse method.\n" ); }
-
-	action c_ip_start  { p_request->tokens.c_ip_start  = fpc; }
-	action c_ip_finish { p_request->tokens.c_ip_length = fpc - ( *pe + p_request->tokens.c_ip_start ); }
-	action c_ip_error  { debug( 3, LOC, "Unable to parse the client IP address.\n" ); }
-
-    # 
-    # Squid line: URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL>
-    # 
-    # URI Syntax (RFC 3986) misc notes:
-    #
-    # - Scheme isn't passed to redirectors on CONNECT method requests
-    #
-    # - Hostname segments aren't supposed to be individually greater than 63 chars,
-    #   and the hostname in total shouldn't exceed 255.  They also shouldn't be entirely
-    #   made up of digits, or contain underscores.  In practice, these rules appear to
-    #   be violated constantly by some pretty big sites. I'm looking at you, facebook.
-    #   (( alnum ) | ( alnum . [a-zA-Z0-9\-]{0,63} . alnum )) & !( digit+ );
-    #
-    # - ipv6 has some utterly insane rules (RFC 5952) in the name of "shortcuts", which
-    #   only seem like shortcuts to someone writing IP addresses by hand.  Anyone that
-    #   has to parse (or even just read) them has a bunch of seemingly arbitrary work
-    #   dumped in their lap.  Heck, it's impossible to even search for an ipv6 address
-    #   that contains zeros in a text editor, because you have no idea what how it might
-    #   be represented.  Rad!
-    #
-    #   The parser just trusts any ipv6 squid hands us as being valid, without
-    #   any real parsing/validation, other than it consists of hex digits and colons.
-    #
-    # - This parser originally validated path/query/fragment as well, but there were
-    #   enough inconsistencies with unescaped chars and other real-life RFC deviations
-    #   that I opted to just accept what we get from squid.
-	#
-	# - Redirectors aren't handed any userinfo (http://mahlon:password@example.com),
-    #   so no need to check for that.
-    #
-
-	host_component = alnum | ( alnum [a-zA-Z0-9\-_]* alnum );
-	pchar          = ( alnum | [\-._~!$%&'()*+,;=] );
-	path_segment   = '/' ( any - space )*;
-
-	hostname       = host_component ( '.' host_component )* '.'?;
-	ipv4           = digit{1,3} '.' digit{1,3} '.' digit{1,3} '.' digit{1,3};
-	ipv6           = ( xdigit | ':' )+;
-
-	channel_id     = ( digit+ space )      %channel_id_found;
-	scheme         = ( alpha{3,5} '://' )  >scheme_start %scheme_finish @!scheme_error;
-	host           = ( hostname | ipv4 )   >host_start   %host_finish   @!host_error;
-	port           = ( ':' digit{1,5} )    >port_start   %port_finish;
-	path           = path_segment*         >path_start   %path_finish;
-	client_ip      = ipv4                  >c_ip_start   %c_ip_finish   @!c_ip_error;
-	method         = upper+                >meth_start   %meth_finish   @!meth_error;
-
-	Line = (
- 		start: (
-			channel_id? -> Url
-		),
-
-		Url: (
-		   	scheme? host port? path? space -> Client
- 		),
-
-		Client: (
-			client_ip '/' ( hostname | '-' ) space -> User
-		),
-
-		User: (
-			pchar+ space -> Method
-		),
-
-		Method: (
-			method -> KVPairs
-		),
-
-		KVPairs: (
-			( space any+ )? -> final
-		)
- 	) %success @!error;
-
-
-	main := Line '\n';
-}%%
-%% write data;
+#define MARK_S( LBL ) p_request->tokens.LBL ## _start = p;
+#define MARK_E( LBL ) p_request->tokens.LBL ## _length = p - ( *pe + p_request->tokens.LBL ## _start );
 
 /* 
  * Tokenize an incoming line from squid, returning a parsed and populated
@@ -175,21 +67,104 @@
 parse( char *line )
 {
    	/* machine required vars */
-	int  cs   = 0;
+	unsigned short int cs = 1;
 	char *p   = line;
 	char *pe  = p + strlen(p);
 	char *eof = NULL;
 
 	/* the client request pointer */
-	unsigned char valid = 0;
 	request *p_request = init_request();
 
-	/* enter state machine */
-	%% write init;
+%%{
+	machine input_parser;
+
+	action channel_id_found  {
+		debug( 1, LOC, "Channel ID found in redirector input.  Set 'url_rewrite_concurrency' to '0' in squid.\n" );
+		fbreak;
+	}
+
+	action scheme_start  { MARK_S(scheme) }
+	action scheme_finish { MARK_E(scheme) }
+	action host_start    { MARK_S(host) }
+	action host_finish   { MARK_E(host) }
+	action port_start    { MARK_S(port) }
+	action port_finish   { MARK_E(port) }
+	action path_start    { MARK_S(path) }
+	action path_finish   { MARK_E(path) }
+	action meth_start    { MARK_S(meth) }
+	action meth_finish   { MARK_E(meth) }
+	action c_ip_start    { MARK_S(c_ip) }
+	action c_ip_finish   { MARK_E(c_ip) }
+
+	action host_error   { debug( 3, LOC, "Unable to parse hostname.\n" ); }
+	action scheme_error { debug( 3, LOC, "Unable to parse scheme.\n" ); }
+	action meth_error   { debug( 3, LOC, "Unable to parse method.\n" ); }
+	action c_ip_error   { debug( 3, LOC, "Unable to parse the client IP address.\n" ); }
+
+	# 
+	# Squid line: URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL>
+	# 
+	# URI Syntax (RFC 3986) misc notes:
+	#
+	# - Scheme isn't passed to redirectors on CONNECT method requests
+	#
+	# - Hostname segments aren't supposed to be individually greater than 63 chars,
+	#   and the hostname in total shouldn't exceed 255.  They also shouldn't be entirely
+	#   made up of digits, or contain underscores.  In practice, these rules appear to
+	#   be violated constantly by some pretty big sites.
+	#   (( alnum ) | ( alnum . [a-zA-Z0-9\-]{0,63} . alnum )) & !( digit+ );
+	#
+	# - ipv6 has some utterly insane rules (RFC 5952) in the name of "shortcuts", which
+	#   only seem like shortcuts to someone writing IP addresses by hand.  Anyone that
+	#   has to parse (or even just read) them has a bunch of seemingly arbitrary work
+	#   dumped in their lap.  Heck, it's impossible to even search for an ipv6 address
+	#   that contains zeros in a text editor, because you have no idea what how it might
+	#   be represented.  Rad!
+	#
+	#   The parser just trusts any ipv6 squid hands us as being valid, without
+	#   any real parsing/validation, other than it consists of hex digits and colons.
+	#
+	# - This parser originally validated path/query/fragment as well, but there were
+	#   enough inconsistencies with unescaped chars and other real-life RFC deviations
+	#   that I opted to just accept what we get from squid.
+	#
+	# - Redirectors aren't handed any userinfo (http://mahlon:password@example.com),
+	#   so no need to check for that.
+	#
+
+	host_component = alnum | ( alnum [a-zA-Z0-9\-_]* alnum );
+	pchar          = ( alnum | [\-._~!$%&'()*+,;=] );
+	path_segment   = '/' ( any - space )*;
+
+	hostname       = host_component ( '.' host_component )* '.'?;
+	ipv4           = digit{1,3} '.' digit{1,3} '.' digit{1,3} '.' digit{1,3};
+	ipv6           = ( xdigit | ':' )+;
+
+	channel_id     = ( digit+ space )      %channel_id_found;
+	scheme         = ( alpha{3,5} '://' )  >scheme_start %scheme_finish @!scheme_error;
+	host           = ( hostname | ipv4 )   >host_start   %host_finish   @!host_error;
+	port           = ( ':' digit{1,5} )    >port_start   %port_finish;
+	path           = path_segment*         >path_start   %path_finish;
+	client_ip      = ipv4                  >c_ip_start   %c_ip_finish   @!c_ip_error;
+	method         = upper+                >meth_start   %meth_finish   @!meth_error;
+
+	SquidLine = (
+ 		start:   channel_id?                            -> Url,
+		Url:     scheme? host port? path? space         -> Client,
+		Client:  client_ip '/' ( hostname | '-' ) space -> User,
+		User:    pchar+ space                           -> Method,
+		Method:  method                                 -> KVPairs,
+		KVPairs: ( space any+ )?                        -> final
+ 	);
+
+	main := SquidLine '\n';
+}%%
+
+	/* state machine */
 	%% write exec;
 
 	/* If we were given an invalid line, bail early */
-	if ( valid == 0 ) {
+	if ( cs < %%{ write first_final; }%% ) {
 		free( p_request ), p_request = NULL;
 		debug( 3, LOC, "Invalid line (%d), skipped\n", v.timer.lines + 1 );
 		debug( 4, LOC, "%s", line );
@@ -210,11 +185,13 @@
 {
 	request *p_request = NULL;
 	if ( (p_request = malloc( sizeof(request) )) == NULL ) {
-		debug( 1, LOC, "Unable to allocate memory for request struct: %s\n", strerror(errno) );
+		debug( 5, LOC, "Unable to allocate memory for request struct: %s\n", strerror(errno) );
 		return( NULL );
 	}
+
 	p_request->scheme    = NULL;
 	p_request->host      = NULL;
+	p_request->tld       = NULL;
 	p_request->port      = NULL;
 	p_request->path      = NULL;
 	p_request->user      = NULL;
@@ -222,48 +199,95 @@
 	p_request->client_ip = NULL;
 
 	p_request->tokens.scheme_start  = NULL;
+	p_request->tokens.scheme_length = 0;
 	p_request->tokens.host_start    = NULL;
+	p_request->tokens.host_length   = 0;
 	p_request->tokens.port_start    = NULL;
+	p_request->tokens.port_length   = 0;
 	p_request->tokens.path_start    = NULL;
+	p_request->tokens.path_length   = 0;
 	p_request->tokens.meth_start    = NULL;
+	p_request->tokens.meth_length   = 0;
 	p_request->tokens.c_ip_start    = NULL;
-	p_request->tokens.scheme_length = 0;
-	p_request->tokens.host_length   = 0;
-	p_request->tokens.port_length   = 0;
-	p_request->tokens.path_length   = 0;
-	p_request->tokens.meth_length   = 0;
 	p_request->tokens.c_ip_length   = 0;
 
 	return p_request;
 }
 
 
+#define COPY_STR( LBL ) copy_string_token( p_request->tokens.LBL ## _start, p_request->tokens.LBL ## _length )
+#define COPY_IP4( LBL ) copy_ipv4_token(   p_request->tokens.LBL ## _start, p_request->tokens.LBL ## _length )
+
 /*
  * Take the previously parsed token locations and copy them into the request struct.
  *
  */
 void
-populate_request( struct request *p_request )
+populate_request( request *p_request )
 {
-	p_request->scheme =
-		copy_string_token( p_request->tokens.scheme_start, p_request->tokens.scheme_length );
-	p_request->host =
-		copy_string_token( p_request->tokens.host_start, p_request->tokens.host_length );
-	p_request->port =
-		copy_string_token( p_request->tokens.port_start, p_request->tokens.port_length );
-	p_request->path =
-		copy_string_token( p_request->tokens.path_start, p_request->tokens.path_length );
-	p_request->method =
-		copy_string_token( p_request->tokens.meth_start, p_request->tokens.meth_length );
-	p_request->client_ip =
-		copy_ipv4_token( p_request->tokens.c_ip_start, p_request->tokens.c_ip_length );
+	p_request->scheme    = COPY_STR( scheme );
+	p_request->host      = COPY_STR( host );
+	p_request->port      = COPY_STR( port );
+	p_request->path      = COPY_STR( path );
+	p_request->method    = COPY_STR( meth );
+	p_request->client_ip = COPY_IP4( c_ip );
+
+	parse_tld( p_request );
 
 	return;
 }
 
 
 /*
- * Release memory used by request struct.
+ * Pull the top level domain out of the requested hostname.
+ *
+ */
+void
+parse_tld( request *p_request )
+{
+	unsigned short int cs = 5, mark = 0;
+	char *p   = p_request->host;
+	char *pe  = p + p_request->tokens.host_length;
+	char *ts  = 0, *te = 0, *eof = NULL;
+
+%%{
+    machine tld_parser;
+
+	host_component = alnum | ( alnum [a-zA-Z0-9\-_]* alnum );
+	tld = ( host_component '.' host_component );
+
+	main := |*
+		tld => { mark = ( p_request->tokens.host_length - (int)strlen(te) ); };
+	*|;
+}%%
+
+	/* It's far easier (and faster) to scan from left to right rather than
+	   backtrack, so start by reversing the requested host string. */
+	reverse_str( p_request->host );
+
+	/* scanner */
+	%% write exec;
+
+	/* If there was a mark, then allocate memory and copy. */
+	if ( mark != 0 ) {
+		if ( (p_request->tld = calloc( mark + 1, sizeof(char) )) == NULL ) {
+			debug( 5, LOC, "Unable to allocate memory for tld token: %s\n", strerror(errno) );
+			reverse_str( p_request->host );
+			return;
+		}
+
+		memcpy( p_request->tld, p_request->host, mark );
+		reverse_str( p_request->tld );
+	}
+
+	/* restore the hostname. */
+	reverse_str( p_request->host );
+	return;
+}
+
+
+/*
+ * Release memory used by the request struct.
  *
  */
 void
@@ -273,6 +297,7 @@
 
 	free( p_request->scheme );
 	free( p_request->host );
+	free( p_request->tld );
 	free( p_request->port );
 	free( p_request->path );
 	free( p_request->method );