diff -r 9aa5114326e8 -r 191b3c25974a parser.rl --- a/parser.rl Mon Oct 17 09:12:00 2011 -0700 +++ b/parser.rl Sun Oct 23 22:59:59 2011 -0700 @@ -30,116 +30,8 @@ #include "volta.h" -%%{ - machine redirector; - - action success { valid = 1; } - action error { valid = 0; } - - action channel_id_found { - debug( 1, LOC, "Channel ID found in redirector input. Set 'url_rewrite_concurrency' to '0' in squid.\n" ); - fbreak; - } - - action scheme_start { p_request->tokens.scheme_start = fpc; } - action scheme_finish { p_request->tokens.scheme_length = fpc - ( *pe + p_request->tokens.scheme_start ); } - action scheme_error { debug( 3, LOC, "Unable to parse scheme.\n" ); } - - action host_start { p_request->tokens.host_start = fpc; } - action host_finish { p_request->tokens.host_length = fpc - ( *pe + p_request->tokens.host_start ); } - action host_error { debug( 3, LOC, "Unable to parse hostname.\n" ); } - - action port_start { p_request->tokens.port_start = fpc; } - action port_finish { p_request->tokens.port_length = fpc - ( *pe + p_request->tokens.port_start ); } - - action path_start { p_request->tokens.path_start = fpc; } - action path_finish { p_request->tokens.path_length = fpc - ( *pe + p_request->tokens.path_start ); } - - action meth_start { p_request->tokens.meth_start = fpc; } - action meth_finish { p_request->tokens.meth_length = fpc - ( *pe + p_request->tokens.meth_start ); } - action meth_error { debug( 3, LOC, "Unable to parse method.\n" ); } - - action c_ip_start { p_request->tokens.c_ip_start = fpc; } - action c_ip_finish { p_request->tokens.c_ip_length = fpc - ( *pe + p_request->tokens.c_ip_start ); } - action c_ip_error { debug( 3, LOC, "Unable to parse the client IP address.\n" ); } - - # - # Squid line: URL client_ip "/" fqdn user method [ kvpairs] - # - # URI Syntax (RFC 3986) misc notes: - # - # - Scheme isn't passed to redirectors on CONNECT method requests - # - # - Hostname segments aren't supposed to be individually greater than 63 chars, - # and the hostname in total shouldn't exceed 255. They also shouldn't be entirely - # made up of digits, or contain underscores. In practice, these rules appear to - # be violated constantly by some pretty big sites. I'm looking at you, facebook. - # (( alnum ) | ( alnum . [a-zA-Z0-9\-]{0,63} . alnum )) & !( digit+ ); - # - # - ipv6 has some utterly insane rules (RFC 5952) in the name of "shortcuts", which - # only seem like shortcuts to someone writing IP addresses by hand. Anyone that - # has to parse (or even just read) them has a bunch of seemingly arbitrary work - # dumped in their lap. Heck, it's impossible to even search for an ipv6 address - # that contains zeros in a text editor, because you have no idea what how it might - # be represented. Rad! - # - # The parser just trusts any ipv6 squid hands us as being valid, without - # any real parsing/validation, other than it consists of hex digits and colons. - # - # - This parser originally validated path/query/fragment as well, but there were - # enough inconsistencies with unescaped chars and other real-life RFC deviations - # that I opted to just accept what we get from squid. - # - # - Redirectors aren't handed any userinfo (http://mahlon:password@example.com), - # so no need to check for that. - # - - host_component = alnum | ( alnum [a-zA-Z0-9\-_]* alnum ); - pchar = ( alnum | [\-._~!$%&'()*+,;=] ); - path_segment = '/' ( any - space )*; - - hostname = host_component ( '.' host_component )* '.'?; - ipv4 = digit{1,3} '.' digit{1,3} '.' digit{1,3} '.' digit{1,3}; - ipv6 = ( xdigit | ':' )+; - - channel_id = ( digit+ space ) %channel_id_found; - scheme = ( alpha{3,5} '://' ) >scheme_start %scheme_finish @!scheme_error; - host = ( hostname | ipv4 ) >host_start %host_finish @!host_error; - port = ( ':' digit{1,5} ) >port_start %port_finish; - path = path_segment* >path_start %path_finish; - client_ip = ipv4 >c_ip_start %c_ip_finish @!c_ip_error; - method = upper+ >meth_start %meth_finish @!meth_error; - - Line = ( - start: ( - channel_id? -> Url - ), - - Url: ( - scheme? host port? path? space -> Client - ), - - Client: ( - client_ip '/' ( hostname | '-' ) space -> User - ), - - User: ( - pchar+ space -> Method - ), - - Method: ( - method -> KVPairs - ), - - KVPairs: ( - ( space any+ )? -> final - ) - ) %success @!error; - - - main := Line '\n'; -}%% -%% write data; +#define MARK_S( LBL ) p_request->tokens.LBL ## _start = p; +#define MARK_E( LBL ) p_request->tokens.LBL ## _length = p - ( *pe + p_request->tokens.LBL ## _start ); /* * Tokenize an incoming line from squid, returning a parsed and populated @@ -175,21 +67,104 @@ parse( char *line ) { /* machine required vars */ - int cs = 0; + unsigned short int cs = 1; char *p = line; char *pe = p + strlen(p); char *eof = NULL; /* the client request pointer */ - unsigned char valid = 0; request *p_request = init_request(); - /* enter state machine */ - %% write init; +%%{ + machine input_parser; + + action channel_id_found { + debug( 1, LOC, "Channel ID found in redirector input. Set 'url_rewrite_concurrency' to '0' in squid.\n" ); + fbreak; + } + + action scheme_start { MARK_S(scheme) } + action scheme_finish { MARK_E(scheme) } + action host_start { MARK_S(host) } + action host_finish { MARK_E(host) } + action port_start { MARK_S(port) } + action port_finish { MARK_E(port) } + action path_start { MARK_S(path) } + action path_finish { MARK_E(path) } + action meth_start { MARK_S(meth) } + action meth_finish { MARK_E(meth) } + action c_ip_start { MARK_S(c_ip) } + action c_ip_finish { MARK_E(c_ip) } + + action host_error { debug( 3, LOC, "Unable to parse hostname.\n" ); } + action scheme_error { debug( 3, LOC, "Unable to parse scheme.\n" ); } + action meth_error { debug( 3, LOC, "Unable to parse method.\n" ); } + action c_ip_error { debug( 3, LOC, "Unable to parse the client IP address.\n" ); } + + # + # Squid line: URL client_ip "/" fqdn user method [ kvpairs] + # + # URI Syntax (RFC 3986) misc notes: + # + # - Scheme isn't passed to redirectors on CONNECT method requests + # + # - Hostname segments aren't supposed to be individually greater than 63 chars, + # and the hostname in total shouldn't exceed 255. They also shouldn't be entirely + # made up of digits, or contain underscores. In practice, these rules appear to + # be violated constantly by some pretty big sites. + # (( alnum ) | ( alnum . [a-zA-Z0-9\-]{0,63} . alnum )) & !( digit+ ); + # + # - ipv6 has some utterly insane rules (RFC 5952) in the name of "shortcuts", which + # only seem like shortcuts to someone writing IP addresses by hand. Anyone that + # has to parse (or even just read) them has a bunch of seemingly arbitrary work + # dumped in their lap. Heck, it's impossible to even search for an ipv6 address + # that contains zeros in a text editor, because you have no idea what how it might + # be represented. Rad! + # + # The parser just trusts any ipv6 squid hands us as being valid, without + # any real parsing/validation, other than it consists of hex digits and colons. + # + # - This parser originally validated path/query/fragment as well, but there were + # enough inconsistencies with unescaped chars and other real-life RFC deviations + # that I opted to just accept what we get from squid. + # + # - Redirectors aren't handed any userinfo (http://mahlon:password@example.com), + # so no need to check for that. + # + + host_component = alnum | ( alnum [a-zA-Z0-9\-_]* alnum ); + pchar = ( alnum | [\-._~!$%&'()*+,;=] ); + path_segment = '/' ( any - space )*; + + hostname = host_component ( '.' host_component )* '.'?; + ipv4 = digit{1,3} '.' digit{1,3} '.' digit{1,3} '.' digit{1,3}; + ipv6 = ( xdigit | ':' )+; + + channel_id = ( digit+ space ) %channel_id_found; + scheme = ( alpha{3,5} '://' ) >scheme_start %scheme_finish @!scheme_error; + host = ( hostname | ipv4 ) >host_start %host_finish @!host_error; + port = ( ':' digit{1,5} ) >port_start %port_finish; + path = path_segment* >path_start %path_finish; + client_ip = ipv4 >c_ip_start %c_ip_finish @!c_ip_error; + method = upper+ >meth_start %meth_finish @!meth_error; + + SquidLine = ( + start: channel_id? -> Url, + Url: scheme? host port? path? space -> Client, + Client: client_ip '/' ( hostname | '-' ) space -> User, + User: pchar+ space -> Method, + Method: method -> KVPairs, + KVPairs: ( space any+ )? -> final + ); + + main := SquidLine '\n'; +}%% + + /* state machine */ %% write exec; /* If we were given an invalid line, bail early */ - if ( valid == 0 ) { + if ( cs < %%{ write first_final; }%% ) { free( p_request ), p_request = NULL; debug( 3, LOC, "Invalid line (%d), skipped\n", v.timer.lines + 1 ); debug( 4, LOC, "%s", line ); @@ -210,11 +185,13 @@ { request *p_request = NULL; if ( (p_request = malloc( sizeof(request) )) == NULL ) { - debug( 1, LOC, "Unable to allocate memory for request struct: %s\n", strerror(errno) ); + debug( 5, LOC, "Unable to allocate memory for request struct: %s\n", strerror(errno) ); return( NULL ); } + p_request->scheme = NULL; p_request->host = NULL; + p_request->tld = NULL; p_request->port = NULL; p_request->path = NULL; p_request->user = NULL; @@ -222,48 +199,95 @@ p_request->client_ip = NULL; p_request->tokens.scheme_start = NULL; + p_request->tokens.scheme_length = 0; p_request->tokens.host_start = NULL; + p_request->tokens.host_length = 0; p_request->tokens.port_start = NULL; + p_request->tokens.port_length = 0; p_request->tokens.path_start = NULL; + p_request->tokens.path_length = 0; p_request->tokens.meth_start = NULL; + p_request->tokens.meth_length = 0; p_request->tokens.c_ip_start = NULL; - p_request->tokens.scheme_length = 0; - p_request->tokens.host_length = 0; - p_request->tokens.port_length = 0; - p_request->tokens.path_length = 0; - p_request->tokens.meth_length = 0; p_request->tokens.c_ip_length = 0; return p_request; } +#define COPY_STR( LBL ) copy_string_token( p_request->tokens.LBL ## _start, p_request->tokens.LBL ## _length ) +#define COPY_IP4( LBL ) copy_ipv4_token( p_request->tokens.LBL ## _start, p_request->tokens.LBL ## _length ) + /* * Take the previously parsed token locations and copy them into the request struct. * */ void -populate_request( struct request *p_request ) +populate_request( request *p_request ) { - p_request->scheme = - copy_string_token( p_request->tokens.scheme_start, p_request->tokens.scheme_length ); - p_request->host = - copy_string_token( p_request->tokens.host_start, p_request->tokens.host_length ); - p_request->port = - copy_string_token( p_request->tokens.port_start, p_request->tokens.port_length ); - p_request->path = - copy_string_token( p_request->tokens.path_start, p_request->tokens.path_length ); - p_request->method = - copy_string_token( p_request->tokens.meth_start, p_request->tokens.meth_length ); - p_request->client_ip = - copy_ipv4_token( p_request->tokens.c_ip_start, p_request->tokens.c_ip_length ); + p_request->scheme = COPY_STR( scheme ); + p_request->host = COPY_STR( host ); + p_request->port = COPY_STR( port ); + p_request->path = COPY_STR( path ); + p_request->method = COPY_STR( meth ); + p_request->client_ip = COPY_IP4( c_ip ); + + parse_tld( p_request ); return; } /* - * Release memory used by request struct. + * Pull the top level domain out of the requested hostname. + * + */ +void +parse_tld( request *p_request ) +{ + unsigned short int cs = 5, mark = 0; + char *p = p_request->host; + char *pe = p + p_request->tokens.host_length; + char *ts = 0, *te = 0, *eof = NULL; + +%%{ + machine tld_parser; + + host_component = alnum | ( alnum [a-zA-Z0-9\-_]* alnum ); + tld = ( host_component '.' host_component ); + + main := |* + tld => { mark = ( p_request->tokens.host_length - (int)strlen(te) ); }; + *|; +}%% + + /* It's far easier (and faster) to scan from left to right rather than + backtrack, so start by reversing the requested host string. */ + reverse_str( p_request->host ); + + /* scanner */ + %% write exec; + + /* If there was a mark, then allocate memory and copy. */ + if ( mark != 0 ) { + if ( (p_request->tld = calloc( mark + 1, sizeof(char) )) == NULL ) { + debug( 5, LOC, "Unable to allocate memory for tld token: %s\n", strerror(errno) ); + reverse_str( p_request->host ); + return; + } + + memcpy( p_request->tld, p_request->host, mark ); + reverse_str( p_request->tld ); + } + + /* restore the hostname. */ + reverse_str( p_request->host ); + return; +} + + +/* + * Release memory used by the request struct. * */ void @@ -273,6 +297,7 @@ free( p_request->scheme ); free( p_request->host ); + free( p_request->tld ); free( p_request->port ); free( p_request->path ); free( p_request->method );