parser.rl
changeset 10 d07309450285
parent 2 8c88756f81b0
child 12 191b3c25974a
equal deleted inserted replaced
9:bdf20e6eefd7 10:d07309450285
    26 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    26 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    29 */
    29 */
    30 
    30 
    31 /*
       
    32 Squid docs:
       
    33 ---------------------------------------------------------------------------
       
    34 TAG: url_rewrite_program
       
    35 Specify the location of the executable for the URL rewriter.
       
    36 Since they can perform almost any function there isn't one included.
       
    37 
       
    38 For each requested URL rewriter will receive on line with the format
       
    39 
       
    40 URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL>
       
    41 
       
    42 In the future, the rewriter interface will be extended with
       
    43 key=value pairs ("kvpairs" shown above).  Rewriter programs
       
    44 should be prepared to receive and possibly ignore additional
       
    45 whitespace-separated tokens on each input line.
       
    46 
       
    47 And the rewriter may return a rewritten URL. The other components of
       
    48 the request line does not need to be returned (ignored if they are).
       
    49 
       
    50 The rewriter can also indicate that a client-side redirect should
       
    51 be performed to the new URL. This is done by prefixing the returned
       
    52 URL with "301:" (moved permanently) or 302: (moved temporarily).
       
    53 
       
    54 By default, a URL rewriter is not used.
       
    55 ---------------------------------------------------------------------------
       
    56 */
       
    57 
       
    58 #include "volta.h"
    31 #include "volta.h"
    59 
    32 
    60 %%{
    33 %%{
    61 	machine redirector;
    34 	machine redirector;
    62 
    35 
    63 	action parse_error {
    36 	action success { valid = 1; }
    64 		debug( 2, LOC, "parse error\n" );
    37 	action error   { valid = 0; }
    65 		return( NULL );
    38 
       
    39 	action channel_id_found  {
       
    40 		debug( 1, LOC, "Channel ID found in redirector input.  Set 'url_rewrite_concurrency' to '0' in squid.\n" );
       
    41 		fbreak;
    66 	}
    42 	}
    67 
    43 
    68 	action yay {
    44 	action scheme_start  { p_request->tokens.scheme_start  = fpc; }
    69 		printf( "I saw: %s", p+1 );
    45 	action scheme_finish { p_request->tokens.scheme_length = fpc - ( *pe + p_request->tokens.scheme_start ); }
    70 	}
    46 	action scheme_error  { debug( 3, LOC, "Unable to parse scheme.\n" ); }
    71 
    47 
    72 	# http://, ftp://, https://, etc
    48 	action host_start  { p_request->tokens.host_start  = fpc; }
    73 	proto = alpha{3,5} . '://';
    49 	action host_finish { p_request->tokens.host_length = fpc - ( *pe + p_request->tokens.host_start ); }
    74 
    50 	action host_error  { debug( 3, LOC, "Unable to parse hostname.\n" ); }
    75 	# http://mahlon:password@example.com or http://mahlon@example.com
    51 
    76     #       username              optional password
    52 	action port_start  { p_request->tokens.port_start  = fpc; }
    77 	creds = ( alnum | [+._\-] )+ . ( ':' . any+ )? . '@';
    53 	action port_finish { p_request->tokens.port_length = fpc - ( *pe + p_request->tokens.port_start ); }
    78 
    54 
    79 	main  := ( proto . creds ) | proto @yay '\n';
    55 	action path_start  { p_request->tokens.path_start  = fpc; }
       
    56 	action path_finish { p_request->tokens.path_length = fpc - ( *pe + p_request->tokens.path_start ); }
       
    57 
       
    58 	action meth_start  { p_request->tokens.meth_start  = fpc; }
       
    59 	action meth_finish { p_request->tokens.meth_length = fpc - ( *pe + p_request->tokens.meth_start ); }
       
    60 	action meth_error  { debug( 3, LOC, "Unable to parse method.\n" ); }
       
    61 
       
    62 	action c_ip_start  { p_request->tokens.c_ip_start  = fpc; }
       
    63 	action c_ip_finish { p_request->tokens.c_ip_length = fpc - ( *pe + p_request->tokens.c_ip_start ); }
       
    64 	action c_ip_error  { debug( 3, LOC, "Unable to parse the client IP address.\n" ); }
       
    65 
       
    66     # 
       
    67     # Squid line: URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL>
       
    68     # 
       
    69     # URI Syntax (RFC 3986) misc notes:
       
    70     #
       
    71     # - Scheme isn't passed to redirectors on CONNECT method requests
       
    72     #
       
    73     # - Hostname segments aren't supposed to be individually greater than 63 chars,
       
    74     #   and the hostname in total shouldn't exceed 255.  They also shouldn't be entirely
       
    75     #   made up of digits, or contain underscores.  In practice, these rules appear to
       
    76     #   be violated constantly by some pretty big sites. I'm looking at you, facebook.
       
    77     #   (( alnum ) | ( alnum . [a-zA-Z0-9\-]{0,63} . alnum )) & !( digit+ );
       
    78     #
       
    79     # - ipv6 has some utterly insane rules (RFC 5952) in the name of "shortcuts", which
       
    80     #   only seem like shortcuts to someone writing IP addresses by hand.  Anyone that
       
    81     #   has to parse (or even just read) them has a bunch of seemingly arbitrary work
       
    82     #   dumped in their lap.  Heck, it's impossible to even search for an ipv6 address
       
    83     #   that contains zeros in a text editor, because you have no idea how it might
       
    84     #   be represented.  Rad!
       
    85     #
       
    86     #   The parser just trusts any ipv6 squid hands us as being valid, without
       
    87     #   any real parsing/validation, other than it consists of hex digits and colons.
       
    88     #
       
    89     # - This parser originally validated path/query/fragment as well, but there were
       
    90     #   enough inconsistencies with unescaped chars and other real-life RFC deviations
       
    91     #   that I opted to just accept what we get from squid.
       
    92 	#
       
    93 	# - Redirectors aren't handed any userinfo (http://mahlon:password@example.com),
       
    94     #   so no need to check for that.
       
    95     #
       
    96 
       
    97 	host_component = alnum | ( alnum [a-zA-Z0-9\-_]* alnum );
       
    98 	pchar          = ( alnum | [\-._~!$%&'()*+,;=] );
       
    99 	path_segment   = '/' ( any - space )*;
       
   100 
       
   101 	hostname       = host_component ( '.' host_component )* '.'?;
       
   102 	ipv4           = digit{1,3} '.' digit{1,3} '.' digit{1,3} '.' digit{1,3};
       
   103 	ipv6           = ( xdigit | ':' )+;
       
   104 
       
   105 	channel_id     = ( digit+ space )      %channel_id_found;
       
   106 	scheme         = ( alpha{3,5} '://' )  >scheme_start %scheme_finish @!scheme_error;
       
   107 	host           = ( hostname | ipv4 )   >host_start   %host_finish   @!host_error;
       
   108 	port           = ( ':' digit{1,5} )    >port_start   %port_finish;
       
   109 	path           = path_segment*         >path_start   %path_finish;
       
   110 	client_ip      = ipv4                  >c_ip_start   %c_ip_finish   @!c_ip_error;
       
   111 	method         = upper+                >meth_start   %meth_finish   @!meth_error;
       
   112 
       
   113 	Line = (
       
   114  		start: (
       
   115 			channel_id? -> Url
       
   116 		),
       
   117 
       
   118 		Url: (
       
   119 		   	scheme? host port? path? space -> Client
       
   120  		),
       
   121 
       
   122 		Client: (
       
   123 			client_ip '/' ( hostname | '-' ) space -> User
       
   124 		),
       
   125 
       
   126 		User: (
       
   127 			pchar+ space -> Method
       
   128 		),
       
   129 
       
   130 		Method: (
       
   131 			method -> KVPairs
       
   132 		),
       
   133 
       
   134 		KVPairs: (
       
   135 			( space any+ )? -> final
       
   136 		)
       
   137  	) %success @!error;
       
   138 
       
   139 
       
   140 	main := Line '\n';
    80 }%%
   141 }%%
    81 %% write data;
   142 %% write data;
    82 
   143 
    83 /*
   144 /* 
    84 %%{
   145  * Tokenize an incoming line from squid, returning a parsed and populated
    85 	machine redirector;
   146  * structure to make redirection decisions against.  This pointer should
    86 
   147  * be freed using cleanup_request() after use.
    87 	action yay {
   148  * 
    88 		printf( "I saw: %s", p+1 );
   149  * Squid documentation about redirectors:
    89 	}
   150  * ---------------------------------------------------------------------------
    90 
   151  * TAG: url_rewrite_program
    91 	# http://, ftp://, https://, etc
   152  * Specify the location of the executable for the URL rewriter.
    92 	proto = alpha{3,5} . '://';
   153  * Since they can perform almost any function there isn't one included.
    93 
   154  * 
    94 	# http://mahlon:password@example.com or http://mahlon@example.com
   155  * For each requested URL, the rewriter will receive one line with the format
    95     #       username              optional password
   156  * 
    96 	creds = ( alnum | [+._\-] )+ . ( ':' . any+ )? . '@';
   157  * URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL>
    97 
   158  * 
    98 	main := ( proto . creds ) | proto @yay '\n';
   159  * In the future, the rewriter interface will be extended with
    99 }%%
   160  * key=value pairs ("kvpairs" shown above).  Rewriter programs
   100 %% write data;
   161  * should be prepared to receive and possibly ignore additional
       
   162  * whitespace-separated tokens on each input line.
       
   163  * 
       
   164  * And the rewriter may return a rewritten URL. The other components of
       
   165  * the request line does not need to be returned (ignored if they are).
       
   166  * 
       
   167  * The rewriter can also indicate that a client-side redirect should
       
   168  * be performed to the new URL. This is done by prefixing the returned
       
   169  * URL with "301:" (moved permanently) or 302: (moved temporarily).
       
   170  * 
       
   171  * By default, a URL rewriter is not used.
       
   172  * ---------------------------------------------------------------------------
   101 */
   173 */
   102 
   174 request *
   103 
   175 parse( char *line )
   104 /*
   176 {
   105 %%{
   177    	/* machine required vars */
   106 	machine foo;
   178 	int  cs   = 0;
   107 
   179 	char *p   = line;
   108 	OPEN = 0;
   180 	char *pe  = p + strlen(p);
   109 	CLOSE = 1;
   181 	char *eof = NULL;
   110 
   182 
   111 	main :=
   183 	/* the client request pointer */
   112 		start:
   184 	unsigned char valid = 0;
   113 		door_closed: (
   185 	request *p_request = init_request();
   114 			OPEN -> door_open -> final
       
   115 		),
       
   116 		door_open: (
       
   117 			CLOSE -> door_closed
       
   118 		);
       
   119 }%%
       
   120 */
       
   121 
       
   122 struct request *
       
   123 parse( char *p )
       
   124 {
       
   125    	/* initial machine state */
       
   126 	short int cs = 0;
       
   127 
       
   128 	/* the client request object */
       
   129 	request c_request;
       
   130 	request *cp_request = &c_request;
       
   131 
       
   132 	/*
       
   133 	char ip[ INET_ADDRSTRLEN ];
       
   134 	inet_pton( AF_INET, "127.0.0.1", &cp_request->ip );
       
   135 	inet_ntop( AF_INET, &cp_request->ip, ip, INET_ADDRSTRLEN );
       
   136 	*/
       
   137 
       
   138 	/* initalize state machine with current line */
       
   139 	char *pe = p + strlen(p) + 1;
       
   140 
   186 
   141 	/* enter state machine */
   187 	/* enter state machine */
   142 	%% write init;
   188 	%% write init;
   143 	%% write exec;
   189 	%% write exec;
   144 
   190 
   145 	/* reset the request */
   191 	/* If we were given an invalid line, bail early */
   146 	/* c_request = reset_request; */
   192 	if ( valid == 0 ) {
   147 	return( cp_request );
   193 		free( p_request ), p_request = NULL;
   148 }
   194 		debug( 3, LOC, "Invalid line (%d), skipped\n", v.timer.lines + 1 );
   149 
   195 		debug( 4, LOC, "%s", line );
       
   196 		return( NULL );
       
   197 	}
       
   198 
       
   199 	(void)populate_request( p_request );
       
   200 	return( p_request );
       
   201 }
       
   202 
       
   203 
       
   204 /*
       
   205  * Initialize and return a pointer to a new request object.
       
   206  *
       
   207  */
       
   208 request *
       
   209 init_request( void )
       
   210 {
       
   211 	request *p_request = NULL;
       
   212 	if ( (p_request = malloc( sizeof(request) )) == NULL ) {
       
   213 		debug( 1, LOC, "Unable to allocate memory for request struct: %s\n", strerror(errno) );
       
   214 		return( NULL );
       
   215 	}
       
   216 	p_request->scheme    = NULL;
       
   217 	p_request->host      = NULL;
       
   218 	p_request->port      = NULL;
       
   219 	p_request->path      = NULL;
       
   220 	p_request->user      = NULL;
       
   221 	p_request->method    = NULL;
       
   222 	p_request->client_ip = NULL;
       
   223 
       
   224 	p_request->tokens.scheme_start  = NULL;
       
   225 	p_request->tokens.host_start    = NULL;
       
   226 	p_request->tokens.port_start    = NULL;
       
   227 	p_request->tokens.path_start    = NULL;
       
   228 	p_request->tokens.meth_start    = NULL;
       
   229 	p_request->tokens.c_ip_start    = NULL;
       
   230 	p_request->tokens.scheme_length = 0;
       
   231 	p_request->tokens.host_length   = 0;
       
   232 	p_request->tokens.port_length   = 0;
       
   233 	p_request->tokens.path_length   = 0;
       
   234 	p_request->tokens.meth_length   = 0;
       
   235 	p_request->tokens.c_ip_length   = 0;
       
   236 
       
   237 	return p_request;
       
   238 }
       
   239 
       
   240 
       
   241 /*
       
   242  * Take the previously parsed token locations and copy them into the request struct.
       
   243  *
       
   244  */
       
   245 void
       
   246 populate_request( struct request *p_request )
       
   247 {
       
   248 	p_request->scheme =
       
   249 		copy_string_token( p_request->tokens.scheme_start, p_request->tokens.scheme_length );
       
   250 	p_request->host =
       
   251 		copy_string_token( p_request->tokens.host_start, p_request->tokens.host_length );
       
   252 	p_request->port =
       
   253 		copy_string_token( p_request->tokens.port_start, p_request->tokens.port_length );
       
   254 	p_request->path =
       
   255 		copy_string_token( p_request->tokens.path_start, p_request->tokens.path_length );
       
   256 	p_request->method =
       
   257 		copy_string_token( p_request->tokens.meth_start, p_request->tokens.meth_length );
       
   258 	p_request->client_ip =
       
   259 		copy_ipv4_token( p_request->tokens.c_ip_start, p_request->tokens.c_ip_length );
       
   260 
       
   261 	return;
       
   262 }
       
   263 
       
   264 
       
   265 /*
       
   266  * Release memory used by request struct.
       
   267  *
       
   268  */
       
   269 void
       
   270 cleanup_request( struct request *p_request )
       
   271 {
       
   272 	if ( p_request == NULL ) return;
       
   273 
       
   274 	free( p_request->scheme );
       
   275 	free( p_request->host );
       
   276 	free( p_request->port );
       
   277 	free( p_request->path );
       
   278 	free( p_request->method );
       
   279 	free( p_request->client_ip );
       
   280 
       
   281 	free( p_request ), p_request = NULL;
       
   282 
       
   283 	return;
       
   284 }
       
   285