parser.rl
changeset 12 191b3c25974a
parent 10 d07309450285
child 13 23a242d7b7fa
equal deleted inserted replaced
11:9aa5114326e8 12:191b3c25974a
    28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    29 */
    29 */
    30 
    30 
    31 #include "volta.h"
    31 #include "volta.h"
    32 
    32 
       
    33 #define MARK_S( LBL ) p_request->tokens.LBL ## _start = p;
       
    34 #define MARK_E( LBL ) p_request->tokens.LBL ## _length = p - ( *pe + p_request->tokens.LBL ## _start );
       
    35 
       
    36 /* 
       
    37  * Tokenize an incoming line from squid, returning a parsed and populated
       
    38  * structure to make redirection decisions against.  This pointer should
       
    39  * be freed using cleanup_request() after use.
       
    40  * 
       
    41  * Squid documentation about redirectors:
       
    42  * ---------------------------------------------------------------------------
       
    43  * TAG: url_rewrite_program
       
    44  * Specify the location of the executable for the URL rewriter.
       
    45  * Since they can perform almost any function there isn't one included.
       
    46  * 
       
    47  * For each requested URL, the rewriter will receive one line with the format
       
    48  * 
       
    49  * URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL>
       
    50  * 
       
    51  * In the future, the rewriter interface will be extended with
       
    52  * key=value pairs ("kvpairs" shown above).  Rewriter programs
       
    53  * should be prepared to receive and possibly ignore additional
       
    54  * whitespace-separated tokens on each input line.
       
    55  * 
       
    56  * And the rewriter may return a rewritten URL. The other components of
       
    57  * the request line do not need to be returned (ignored if they are).
       
    58  * 
       
    59  * The rewriter can also indicate that a client-side redirect should
       
    60  * be performed to the new URL. This is done by prefixing the returned
       
    61  * URL with "301:" (moved permanently) or 302: (moved temporarily).
       
    62  * 
       
    63  * By default, a URL rewriter is not used.
       
    64  * ---------------------------------------------------------------------------
       
    65 */
       
    66 request *
       
    67 parse( char *line )
       
    68 {
       
    69    	/* machine required vars */
       
    70 	unsigned short int cs = 1;
       
    71 	char *p   = line;
       
    72 	char *pe  = p + strlen(p);
       
    73 	char *eof = NULL;
       
    74 
       
    75 	/* the client request pointer */
       
    76 	request *p_request = init_request();
       
    77 
    33 %%{
    78 %%{
    34 	machine redirector;
    79 	machine input_parser;
    35 
       
    36 	action success { valid = 1; }
       
    37 	action error   { valid = 0; }
       
    38 
    80 
    39 	action channel_id_found  {
    81 	action channel_id_found  {
    40 		debug( 1, LOC, "Channel ID found in redirector input.  Set 'url_rewrite_concurrency' to '0' in squid.\n" );
    82 		debug( 1, LOC, "Channel ID found in redirector input.  Set 'url_rewrite_concurrency' to '0' in squid.\n" );
    41 		fbreak;
    83 		fbreak;
    42 	}
    84 	}
    43 
    85 
    44 	action scheme_start  { p_request->tokens.scheme_start  = fpc; }
    86 	action scheme_start  { MARK_S(scheme) }
    45 	action scheme_finish { p_request->tokens.scheme_length = fpc - ( *pe + p_request->tokens.scheme_start ); }
    87 	action scheme_finish { MARK_E(scheme) }
    46 	action scheme_error  { debug( 3, LOC, "Unable to parse scheme.\n" ); }
    88 	action host_start    { MARK_S(host) }
    47 
    89 	action host_finish   { MARK_E(host) }
    48 	action host_start  { p_request->tokens.host_start  = fpc; }
    90 	action port_start    { MARK_S(port) }
    49 	action host_finish { p_request->tokens.host_length = fpc - ( *pe + p_request->tokens.host_start ); }
    91 	action port_finish   { MARK_E(port) }
    50 	action host_error  { debug( 3, LOC, "Unable to parse hostname.\n" ); }
    92 	action path_start    { MARK_S(path) }
    51 
    93 	action path_finish   { MARK_E(path) }
    52 	action port_start  { p_request->tokens.port_start  = fpc; }
    94 	action meth_start    { MARK_S(meth) }
    53 	action port_finish { p_request->tokens.port_length = fpc - ( *pe + p_request->tokens.port_start ); }
    95 	action meth_finish   { MARK_E(meth) }
    54 
    96 	action c_ip_start    { MARK_S(c_ip) }
    55 	action path_start  { p_request->tokens.path_start  = fpc; }
    97 	action c_ip_finish   { MARK_E(c_ip) }
    56 	action path_finish { p_request->tokens.path_length = fpc - ( *pe + p_request->tokens.path_start ); }
    98 
    57 
    99 	action host_error   { debug( 3, LOC, "Unable to parse hostname.\n" ); }
    58 	action meth_start  { p_request->tokens.meth_start  = fpc; }
   100 	action scheme_error { debug( 3, LOC, "Unable to parse scheme.\n" ); }
    59 	action meth_finish { p_request->tokens.meth_length = fpc - ( *pe + p_request->tokens.meth_start ); }
   101 	action meth_error   { debug( 3, LOC, "Unable to parse method.\n" ); }
    60 	action meth_error  { debug( 3, LOC, "Unable to parse method.\n" ); }
   102 	action c_ip_error   { debug( 3, LOC, "Unable to parse the client IP address.\n" ); }
    61 
   103 
    62 	action c_ip_start  { p_request->tokens.c_ip_start  = fpc; }
   104 	# 
    63 	action c_ip_finish { p_request->tokens.c_ip_length = fpc - ( *pe + p_request->tokens.c_ip_start ); }
   105 	# Squid line: URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL>
    64 	action c_ip_error  { debug( 3, LOC, "Unable to parse the client IP address.\n" ); }
   106 	# 
    65 
   107 	# URI Syntax (RFC 3986) misc notes:
    66     # 
   108 	#
    67     # Squid line: URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL>
   109 	# - Scheme isn't passed to redirectors on CONNECT method requests
    68     # 
   110 	#
    69     # URI Syntax (RFC 3986) misc notes:
   111 	# - Hostname segments aren't supposed to be individually greater than 63 chars,
    70     #
   112 	#   and the hostname in total shouldn't exceed 255.  They also shouldn't be entirely
    71     # - Scheme isn't passed to redirectors on CONNECT method requests
   113 	#   made up of digits, or contain underscores.  In practice, these rules appear to
    72     #
   114 	#   be violated constantly by some pretty big sites.
    73     # - Hostname segments aren't supposed to be individually greater than 63 chars,
   115 	#   (( alnum ) | ( alnum . [a-zA-Z0-9\-]{0,63} . alnum )) & !( digit+ );
    74     #   and the hostname in total shouldn't exceed 255.  They also shouldn't be entirely
   116 	#
    75     #   made up of digits, or contain underscores.  In practice, these rules appear to
   117 	# - ipv6 has some utterly insane rules (RFC 5952) in the name of "shortcuts", which
    76     #   be violated constantly by some pretty big sites. I'm looking at you, facebook.
   118 	#   only seem like shortcuts to someone writing IP addresses by hand.  Anyone that
    77     #   (( alnum ) | ( alnum . [a-zA-Z0-9\-]{0,63} . alnum )) & !( digit+ );
   119 	#   has to parse (or even just read) them has a bunch of seemingly arbitrary work
    78     #
   120 	#   dumped in their lap.  Heck, it's impossible to even search for an ipv6 address
    79     # - ipv6 has some utterly insane rules (RFC 5952) in the name of "shortcuts", which
   121 	#   that contains zeros in a text editor, because you have no idea how it might
    80     #   only seem like shortcuts to someone writing IP addresses by hand.  Anyone that
   122 	#   be represented.  Rad!
    81     #   has to parse (or even just read) them has a bunch of seemingly arbitrary work
   123 	#
    82     #   dumped in their lap.  Heck, it's impossible to even search for an ipv6 address
   124 	#   The parser just trusts any ipv6 squid hands us as being valid, without
    83     #   that contains zeros in a text editor, because you have no idea how it might
   125 	#   any real parsing/validation, other than it consists of hex digits and colons.
    84     #   be represented.  Rad!
   126 	#
    85     #
   127 	# - This parser originally validated path/query/fragment as well, but there were
    86     #   The parser just trusts any ipv6 squid hands us as being valid, without
   128 	#   enough inconsistencies with unescaped chars and other real-life RFC deviations
    87     #   any real parsing/validation, other than it consists of hex digits and colons.
   129 	#   that I opted to just accept what we get from squid.
    88     #
       
    89     # - This parser originally validated path/query/fragment as well, but there were
       
    90     #   enough inconsistencies with unescaped chars and other real-life RFC deviations
       
    91     #   that I opted to just accept what we get from squid.
       
    92 	#
   130 	#
    93 	# - Redirectors aren't handed any userinfo (http://mahlon:password@example.com),
   131 	# - Redirectors aren't handed any userinfo (http://mahlon:password@example.com),
    94     #   so no need to check for that.
   132 	#   so no need to check for that.
    95     #
   133 	#
    96 
   134 
    97 	host_component = alnum | ( alnum [a-zA-Z0-9\-_]* alnum );
   135 	host_component = alnum | ( alnum [a-zA-Z0-9\-_]* alnum );
    98 	pchar          = ( alnum | [\-._~!$%&'()*+,;=] );
   136 	pchar          = ( alnum | [\-._~!$%&'()*+,;=] );
    99 	path_segment   = '/' ( any - space )*;
   137 	path_segment   = '/' ( any - space )*;
   100 
   138 
   108 	port           = ( ':' digit{1,5} )    >port_start   %port_finish;
   146 	port           = ( ':' digit{1,5} )    >port_start   %port_finish;
   109 	path           = path_segment*         >path_start   %path_finish;
   147 	path           = path_segment*         >path_start   %path_finish;
   110 	client_ip      = ipv4                  >c_ip_start   %c_ip_finish   @!c_ip_error;
   148 	client_ip      = ipv4                  >c_ip_start   %c_ip_finish   @!c_ip_error;
   111 	method         = upper+                >meth_start   %meth_finish   @!meth_error;
   149 	method         = upper+                >meth_start   %meth_finish   @!meth_error;
   112 
   150 
   113 	Line = (
   151 	SquidLine = (
   114  		start: (
   152  		start:   channel_id?                            -> Url,
   115 			channel_id? -> Url
   153 		Url:     scheme? host port? path? space         -> Client,
   116 		),
   154 		Client:  client_ip '/' ( hostname | '-' ) space -> User,
   117 
   155 		User:    pchar+ space                           -> Method,
   118 		Url: (
   156 		Method:  method                                 -> KVPairs,
   119 		   	scheme? host port? path? space -> Client
   157 		KVPairs: ( space any+ )?                        -> final
   120  		),
   158  	);
   121 
   159 
   122 		Client: (
   160 	main := SquidLine '\n';
   123 			client_ip '/' ( hostname | '-' ) space -> User
       
   124 		),
       
   125 
       
   126 		User: (
       
   127 			pchar+ space -> Method
       
   128 		),
       
   129 
       
   130 		Method: (
       
   131 			method -> KVPairs
       
   132 		),
       
   133 
       
   134 		KVPairs: (
       
   135 			( space any+ )? -> final
       
   136 		)
       
   137  	) %success @!error;
       
   138 
       
   139 
       
   140 	main := Line '\n';
       
   141 }%%
   161 }%%
   142 %% write data;
   162 
   143 
   163 	/* state machine */
   144 /* 
       
   145  * Tokenize an incoming line from squid, returning a parsed and populated
       
   146  * structure to make redirection decisions against.  This pointer should
       
   147  * be freed using cleanup_request() after use.
       
   148  * 
       
   149  * Squid documentation about redirectors:
       
   150  * ---------------------------------------------------------------------------
       
   151  * TAG: url_rewrite_program
       
   152  * Specify the location of the executable for the URL rewriter.
       
   153  * Since they can perform almost any function there isn't one included.
       
   154  * 
       
   155  * For each requested URL, the rewriter will receive one line with the format
       
   156  * 
       
   157  * URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL>
       
   158  * 
       
   159  * In the future, the rewriter interface will be extended with
       
   160  * key=value pairs ("kvpairs" shown above).  Rewriter programs
       
   161  * should be prepared to receive and possibly ignore additional
       
   162  * whitespace-separated tokens on each input line.
       
   163  * 
       
   164  * And the rewriter may return a rewritten URL. The other components of
       
   165  * the request line do not need to be returned (ignored if they are).
       
   166  * 
       
   167  * The rewriter can also indicate that a client-side redirect should
       
   168  * be performed to the new URL. This is done by prefixing the returned
       
   169  * URL with "301:" (moved permanently) or 302: (moved temporarily).
       
   170  * 
       
   171  * By default, a URL rewriter is not used.
       
   172  * ---------------------------------------------------------------------------
       
   173 */
       
   174 request *
       
   175 parse( char *line )
       
   176 {
       
   177    	/* machine required vars */
       
   178 	int  cs   = 0;
       
   179 	char *p   = line;
       
   180 	char *pe  = p + strlen(p);
       
   181 	char *eof = NULL;
       
   182 
       
   183 	/* the client request pointer */
       
   184 	unsigned char valid = 0;
       
   185 	request *p_request = init_request();
       
   186 
       
   187 	/* enter state machine */
       
   188 	%% write init;
       
   189 	%% write exec;
   164 	%% write exec;
   190 
   165 
   191 	/* If we were given an invalid line, bail early */
   166 	/* If we were given an invalid line, bail early */
   192 	if ( valid == 0 ) {
   167 	if ( cs < %%{ write first_final; }%% ) {
   193 		free( p_request ), p_request = NULL;
   168 		free( p_request ), p_request = NULL;
   194 		debug( 3, LOC, "Invalid line (%d), skipped\n", v.timer.lines + 1 );
   169 		debug( 3, LOC, "Invalid line (%d), skipped\n", v.timer.lines + 1 );
   195 		debug( 4, LOC, "%s", line );
   170 		debug( 4, LOC, "%s", line );
   196 		return( NULL );
   171 		return( NULL );
   197 	}
   172 	}
   208 request *
   183 request *
   209 init_request( void )
   184 init_request( void )
   210 {
   185 {
   211 	request *p_request = NULL;
   186 	request *p_request = NULL;
   212 	if ( (p_request = malloc( sizeof(request) )) == NULL ) {
   187 	if ( (p_request = malloc( sizeof(request) )) == NULL ) {
   213 		debug( 1, LOC, "Unable to allocate memory for request struct: %s\n", strerror(errno) );
   188 		debug( 5, LOC, "Unable to allocate memory for request struct: %s\n", strerror(errno) );
   214 		return( NULL );
   189 		return( NULL );
   215 	}
   190 	}
       
   191 
   216 	p_request->scheme    = NULL;
   192 	p_request->scheme    = NULL;
   217 	p_request->host      = NULL;
   193 	p_request->host      = NULL;
       
   194 	p_request->tld       = NULL;
   218 	p_request->port      = NULL;
   195 	p_request->port      = NULL;
   219 	p_request->path      = NULL;
   196 	p_request->path      = NULL;
   220 	p_request->user      = NULL;
   197 	p_request->user      = NULL;
   221 	p_request->method    = NULL;
   198 	p_request->method    = NULL;
   222 	p_request->client_ip = NULL;
   199 	p_request->client_ip = NULL;
   223 
   200 
   224 	p_request->tokens.scheme_start  = NULL;
   201 	p_request->tokens.scheme_start  = NULL;
       
   202 	p_request->tokens.scheme_length = 0;
   225 	p_request->tokens.host_start    = NULL;
   203 	p_request->tokens.host_start    = NULL;
       
   204 	p_request->tokens.host_length   = 0;
   226 	p_request->tokens.port_start    = NULL;
   205 	p_request->tokens.port_start    = NULL;
       
   206 	p_request->tokens.port_length   = 0;
   227 	p_request->tokens.path_start    = NULL;
   207 	p_request->tokens.path_start    = NULL;
       
   208 	p_request->tokens.path_length   = 0;
   228 	p_request->tokens.meth_start    = NULL;
   209 	p_request->tokens.meth_start    = NULL;
       
   210 	p_request->tokens.meth_length   = 0;
   229 	p_request->tokens.c_ip_start    = NULL;
   211 	p_request->tokens.c_ip_start    = NULL;
   230 	p_request->tokens.scheme_length = 0;
       
   231 	p_request->tokens.host_length   = 0;
       
   232 	p_request->tokens.port_length   = 0;
       
   233 	p_request->tokens.path_length   = 0;
       
   234 	p_request->tokens.meth_length   = 0;
       
   235 	p_request->tokens.c_ip_length   = 0;
   212 	p_request->tokens.c_ip_length   = 0;
   236 
   213 
   237 	return p_request;
   214 	return p_request;
   238 }
   215 }
   239 
   216 
       
   217 
       
   218 #define COPY_STR( LBL ) copy_string_token( p_request->tokens.LBL ## _start, p_request->tokens.LBL ## _length )
       
   219 #define COPY_IP4( LBL ) copy_ipv4_token(   p_request->tokens.LBL ## _start, p_request->tokens.LBL ## _length )
   240 
   220 
   241 /*
   221 /*
   242  * Take the previously parsed token locations and copy them into the request struct.
   222  * Take the previously parsed token locations and copy them into the request struct.
   243  *
   223  *
   244  */
   224  */
   245 void
   225 void
   246 populate_request( struct request *p_request )
   226 populate_request( request *p_request )
   247 {
   227 {
   248 	p_request->scheme =
   228 	p_request->scheme    = COPY_STR( scheme );
   249 		copy_string_token( p_request->tokens.scheme_start, p_request->tokens.scheme_length );
   229 	p_request->host      = COPY_STR( host );
   250 	p_request->host =
   230 	p_request->port      = COPY_STR( port );
   251 		copy_string_token( p_request->tokens.host_start, p_request->tokens.host_length );
   231 	p_request->path      = COPY_STR( path );
   252 	p_request->port =
   232 	p_request->method    = COPY_STR( meth );
   253 		copy_string_token( p_request->tokens.port_start, p_request->tokens.port_length );
   233 	p_request->client_ip = COPY_IP4( c_ip );
   254 	p_request->path =
   234 
   255 		copy_string_token( p_request->tokens.path_start, p_request->tokens.path_length );
   235 	parse_tld( p_request );
   256 	p_request->method =
       
   257 		copy_string_token( p_request->tokens.meth_start, p_request->tokens.meth_length );
       
   258 	p_request->client_ip =
       
   259 		copy_ipv4_token( p_request->tokens.c_ip_start, p_request->tokens.c_ip_length );
       
   260 
   236 
   261 	return;
   237 	return;
   262 }
   238 }
   263 
   239 
   264 
   240 
   265 /*
   241 /*
   266  * Release memory used by request struct.
   242  * Pull the top level domain out of the requested hostname.
       
   243  *
       
   244  */
       
   245 void
       
   246 parse_tld( request *p_request )
       
   247 {
       
   248 	unsigned short int cs = 5, mark = 0;
       
   249 	char *p   = p_request->host;
       
   250 	char *pe  = p + p_request->tokens.host_length;
       
   251 	char *ts  = 0, *te = 0, *eof = NULL;
       
   252 
       
   253 %%{
       
   254     machine tld_parser;
       
   255 
       
   256 	host_component = alnum | ( alnum [a-zA-Z0-9\-_]* alnum );
       
   257 	tld = ( host_component '.' host_component );
       
   258 
       
   259 	main := |*
       
   260 		tld => { mark = ( p_request->tokens.host_length - (int)strlen(te) ); };
       
   261 	*|;
       
   262 }%%
       
   263 
       
   264 	/* It's far easier (and faster) to scan from left to right rather than
       
   265 	   backtrack, so start by reversing the requested host string. */
       
   266 	reverse_str( p_request->host );
       
   267 
       
   268 	/* scanner */
       
   269 	%% write exec;
       
   270 
       
   271 	/* If there was a mark, then allocate memory and copy. */
       
   272 	if ( mark != 0 ) {
       
   273 		if ( (p_request->tld = calloc( mark + 1, sizeof(char) )) == NULL ) {
       
   274 			debug( 5, LOC, "Unable to allocate memory for tld token: %s\n", strerror(errno) );
       
   275 			reverse_str( p_request->host );
       
   276 			return;
       
   277 		}
       
   278 
       
   279 		memcpy( p_request->tld, p_request->host, mark );
       
   280 		reverse_str( p_request->tld );
       
   281 	}
       
   282 
       
   283 	/* restore the hostname. */
       
   284 	reverse_str( p_request->host );
       
   285 	return;
       
   286 }
       
   287 
       
   288 
       
   289 /*
       
   290  * Release memory used by the request struct.
   267  *
   291  *
   268  */
   292  */
   269 void
   293 void
   270 cleanup_request( struct request *p_request )
   294 cleanup_request( struct request *p_request )
   271 {
   295 {
   272 	if ( p_request == NULL ) return;
   296 	if ( p_request == NULL ) return;
   273 
   297 
   274 	free( p_request->scheme );
   298 	free( p_request->scheme );
   275 	free( p_request->host );
   299 	free( p_request->host );
       
   300 	free( p_request->tld );
   276 	free( p_request->port );
   301 	free( p_request->port );
   277 	free( p_request->path );
   302 	free( p_request->path );
   278 	free( p_request->method );
   303 	free( p_request->method );
   279 	free( p_request->client_ip );
   304 	free( p_request->client_ip );
   280 
   305