parser.rl
changeset 14 51eb85ae4de4
parent 13 23a242d7b7fa
child 15 2706fc514dea
--- a/parser.rl	Mon Oct 31 17:17:07 2011 -0700
+++ b/parser.rl	Fri Nov 04 20:34:28 2011 -0700
@@ -30,13 +30,13 @@
 
 #include "volta.h"
 
-#define MARK_S( LBL ) p_request->tokens.LBL ## _start = p;
-#define MARK_E( LBL ) p_request->tokens.LBL ## _length = p - ( *pe + p_request->tokens.LBL ## _start );
+#define MARK_S( LBL ) p_parsed->tokens.LBL ## _start = p; /* remember the scan position p as the start of token LBL */
+#define MARK_E( LBL ) p_parsed->tokens.LBL ## _length = p - ( *pe + p_parsed->tokens.LBL ## _start ); /* length of token LBL up to p; the byte at *pe is added to the start pointer, presumably the terminating NUL (0) -- TODO confirm */
 
 /* 
  * Tokenize an incoming line from squid, returning a parsed and populated
  * structure to make redirection decisions against.  This pointer should
- * be freed using cleanup_request() after use.
+ * be freed using finish_parsed() after use.
  * 
  * Squid documentation about redirectors:
  * ---------------------------------------------------------------------------
@@ -63,8 +63,8 @@
  * By default, a URL rewriter is not used.
  * ---------------------------------------------------------------------------
 */
-request *
-parse( char *line )
+parsed *
+parse_request( char *line )
 {
    	/* machine required vars */
 	unsigned short int cs = 1;
@@ -73,10 +73,11 @@
 	char *eof = pe;
 
 	/* the client request pointer */
-	request *p_request = init_request();
+	parsed *p_parsed = init_parsed();
+	p_parsed->type   = REQUEST;
 
 %%{
-	machine squidline_parser;
+	machine request_parser;
 
 	action channel_id_found  {
 		debug( 1, LOC, "Channel ID found in redirector input.  Set 'url_rewrite_concurrency' to '0' in squid.\n" );
@@ -87,7 +88,7 @@
 	action scheme_finish { MARK_E(scheme) }
 	action host_start    { MARK_S(host) }
 	action host_finish   { MARK_E(host) }
-	action port_start    { p_request->tokens.port_start = p+1; } # strip leading colon
+	action port_start    { p_parsed->tokens.port_start = p+1; } # strip leading colon
 	action port_finish   { MARK_E(port) }
 	action path_start    { MARK_S(path) }
 	action path_finish   { MARK_E(path) }
@@ -96,9 +97,9 @@
 	action c_ip_start    { MARK_S(c_ip) }
 	action c_ip_finish   { MARK_E(c_ip) }
 
-	action host_error   { debug( 3, LOC, "Unable to parse hostname.\n" ); }
-	action scheme_error { debug( 3, LOC, "Unable to parse scheme.\n" ); }
-	action meth_error   { debug( 3, LOC, "Unable to parse method.\n" ); }
+	action host_error   { debug( 3, LOC, "Unable to parse the request hostname.\n" ); }
+	action scheme_error { debug( 3, LOC, "Unable to parse the request scheme.\n" ); }
+	action meth_error   { debug( 3, LOC, "Unable to parse the request method.\n" ); }
 	action c_ip_error   { debug( 3, LOC, "Unable to parse the client IP address.\n" ); }
 
 	# 
@@ -150,10 +151,10 @@
 
 	SquidLine = (
  		start:   channel_id?                            -> Url,
-		Url:     scheme? host port? path? space         -> Client,
-		Client:  client_ip '/' ( hostname | '-' ) space -> User,
-		User:    pchar+ space                           -> Method,
-		Method:  method                                 -> KVPairs,
+		Url:     scheme? host port? path?               -> Client,
+		Client:  space client_ip '/' ( hostname | '-' ) -> User,
+		User:    space pchar+                           -> Method,
+		Method:  space method                           -> KVPairs,
 		KVPairs: ( space any+ )?                        -> final
  	);
 
@@ -165,94 +166,277 @@
 
 	/* If we were given an invalid line, bail early */
 	if ( cs < %%{ write first_final; }%% ) {
-		free( p_request ), p_request = NULL;
-		debug( 3, LOC, "Invalid line (%d), skipped\n", v.timer.lines + 1 );
+		free( p_parsed ), p_parsed = NULL;
+		debug( 3, LOC, "Invalid request line (%d), skipped\n", v.timer.lines + 1 );
 		debug( 4, LOC, "%s", line );
 		return( NULL );
 	}
 
-	(void)populate_request( p_request );
-	return( p_request );
+	debug( 6, LOC, "%s", line );
+	(void)populate_parsed( p_parsed );
+	return( p_parsed );
+}
+
+
+/*
+ * Tokenize a value string from a successful database lookup, returning a parsed
+ * and populated structure, or NULL when allocation fails or the rule is invalid.
+ * A non-NULL result should be freed using finish_parsed() after use.
+ */
+parsed *
+parse_rule( char *rewrite )
+{
+   	/* machine required vars */
+	unsigned short int cs = 1;
+	char *p   = rewrite;
+	char *pe  = p + strlen(p);
+	char *eof = pe;
+
+	/* the parsed rule; init_parsed() yields NULL when its malloc fails */
+	parsed *p_parsed = init_parsed();
+	if ( p_parsed == NULL ) return( NULL );
+	p_parsed->type   = RULE;
+%%{
+	machine rule_parser;
+
+	action match_start    { MARK_S(path_re) }
+	action match_finish   { MARK_E(path_re) }
+	action redir_start    { MARK_S(redir) }
+	action redir_finish   { p_parsed->tokens.redir_length = 3; } # strip trailing colon
+
+	action scheme_start  { MARK_S(scheme) }
+	action scheme_finish { MARK_E(scheme) }
+	action host_start    { MARK_S(host) }
+	action host_finish   { MARK_E(host) }
+	action port_start    { p_parsed->tokens.port_start = p+1; } # strip leading colon
+	action port_finish   { MARK_E(port) }
+	action path_start    { MARK_S(path) }
+	action path_finish   { MARK_E(path) }
+
+	action match_error { debug( 3, LOC, "Unable to parse the rule path matcher.\n" ); }
+	action host_error  { debug( 3, LOC, "Unable to parse the rule hostname.\n" ); }
+
+	host_component  = alnum | ( alnum [a-zA-Z0-9\-_]* alnum );
+	path_segment    = '/' ( any - space )*;
+
+	sep            = space+;
+	hostname       = host_component ( '.' host_component )* '.'?;
+	ipv4           = digit{1,3} '.' digit{1,3} '.' digit{1,3} '.' digit{1,3};
+	ipv6           = ( xdigit | ':' )+;
+
+	path_re        = ( any - space )+      >match_start  %match_finish @!match_error;
+	redir          = ( digit{3} ':' )      >redir_start  %redir_finish;
+
+	scheme         = ( alpha{3,5} '://' )  >scheme_start %scheme_finish;
+	host           = ( hostname | ipv4 )   >host_start   %host_finish   @!host_error;
+	port           = ( ':' digit{1,5} )    >port_start   %port_finish;
+	path           = path_segment*         >path_start   %path_finish;
+
+	main := path_re sep ( redir? scheme? host port? path? );
+}%%
+
+	/* state machine */
+	%% write exec;
+
+	/* If we were given an invalid rule, bail early */
+	if ( cs < %%{ write first_final; }%% ) {
+		free( p_parsed ), p_parsed = NULL;
+		debug( 3, LOC, "Invalid rule\n" );
+		debug( 4, LOC, "%s\n", rewrite );
+		return( NULL );
+	}
+
+	(void)populate_parsed( p_parsed );
+	return( p_parsed );
 }
 
 
 /*
- * Initialize and return a pointer to a new request object.
+ * Tokenize one line of the ascii database ( key, whitespace, rule, newline ),
+ * validating it before it goes into a new cdb file.  Returns a malloc'd
+ * db_input holding copies of the key and value, or NULL if anything fails.
  *
  */
-request *
-init_request( void )
+struct db_input *
+parse_dbinput( char *line )
 {
-	request *p_request = NULL;
-	if ( (p_request = malloc( sizeof(request) )) == NULL ) {
-		debug( 5, LOC, "Unable to allocate memory for request struct: %s\n", strerror(errno) );
+   	/* machine required vars */
+	unsigned short int cs = 1;
+	char *p   = line;
+	char *pe  = p + strlen(p);
+	char *eof = pe;
+
+	/* the db line input pointer */
+	struct db_input *dbline = NULL;
+	if ( (dbline = malloc( sizeof(struct db_input) )) == NULL ) {
+		debug( 5, LOC, "Unable to allocate memory for db input: %s\n", strerror(errno) );
+		return( NULL );
+	}
+	dbline->klen   = 0;
+	dbline->vlen   = 0;
+	dbline->kstart = NULL;
+	dbline->key    = NULL;
+	dbline->vstart = NULL;
+	dbline->val    = NULL;
+
+%%{
+	machine dbinput_parser;
+
+	action key_start  { dbline->kstart = p; }
+	action key_finish { dbline->klen = p - ( *pe + dbline->kstart ); }
+	action key_error  { debug( 0, LOC, "Invalid key format\n" ); }
+	action val_start  { dbline->vstart = p; }
+	action val_finish { dbline->vlen = p - ( *pe + dbline->vstart ); }
+	action val_error  { debug( 0, LOC, "Invalid rewrite value\n" ); }
+
+	sep            = space+;
+	host_component = alnum | ( alnum [a-zA-Z0-9\-_]* alnum );
+	hostname       = host_component ( '.' host_component )* '.'?;
+	ipv4           = digit{1,3} '.' digit{1,3} '.' digit{1,3} '.' digit{1,3};
+	token          = ( any - space )+;
+	redir          = ( digit{3} ':' );
+	host           = ( hostname | ipv4 );
+
+	key = ( host | '*' )      >key_start %key_finish @!key_error;
+	val = ( token sep token ) >val_start %val_finish @!val_error;
+	
+	main:= key sep val '\n';
+}%%
+
+	/* state machine */
+	%% write exec;
+
+	/* If the input line was invalid, bail early */
+	if ( cs < %%{ write first_final; }%% ) {
+		free( dbline ), dbline = NULL;
 		return( NULL );
 	}
 
-	p_request->scheme    = NULL;
-	p_request->host      = NULL;
-	p_request->tld       = NULL;
-	p_request->port      = 0;
-	p_request->path      = NULL;
-	p_request->user      = NULL;
-	p_request->method    = NULL;
-	p_request->client_ip = NULL;
+	/* populate struct */
+	dbline->key = copy_string_token( dbline->kstart, dbline->klen );
+	dbline->val = copy_string_token( dbline->vstart, dbline->vlen );
 
-	p_request->tokens.scheme_start  = NULL;
-	p_request->tokens.scheme_length = 0;
-	p_request->tokens.host_start    = NULL;
-	p_request->tokens.host_length   = 0;
-	p_request->tokens.port_start    = NULL;
-	p_request->tokens.port_length   = 0;
-	p_request->tokens.path_start    = NULL;
-	p_request->tokens.path_length   = 0;
-	p_request->tokens.meth_start    = NULL;
-	p_request->tokens.meth_length   = 0;
-	p_request->tokens.c_ip_start    = NULL;
-	p_request->tokens.c_ip_length   = 0;
+	/* both copies must exist ( parse_rule() strlen()s its argument ) and val must be a valid rewrite rule */
+	parsed *valstr = NULL;
+	if ( dbline->key != NULL && dbline->val != NULL ) valstr = parse_rule( dbline->val );
+	if ( valstr == NULL ) {
+		free( dbline->key );
+		free( dbline->val );
+		free( dbline );
+		return( NULL );
+	}
+	finish_parsed( valstr );
 
-	return p_request;
+	return( dbline );
 }
 
 
-#define COPY_STR( LBL ) copy_string_token( p_request->tokens.LBL ## _start, p_request->tokens.LBL ## _length )
-#define COPY_IP4( LBL ) copy_ipv4_token(   p_request->tokens.LBL ## _start, p_request->tokens.LBL ## _length )
+
+/*
+ * Allocate a parsed struct with every member cleared (pointers NULL, type and
+ * lengths zero).  Logs the failure and returns NULL if allocation fails.
+ */
+parsed *
+init_parsed( void )
+{
+	parsed *fresh = malloc( sizeof(parsed) );
+	if ( fresh == NULL ) {
+		debug( 5, LOC, "Unable to allocate memory for parsed struct: %s\n", strerror(errno) );
+		return( NULL );
+	}
+
+	fresh->type      = 0;
+	fresh->scheme    = NULL;
+	fresh->host      = NULL;
+	fresh->tld       = NULL;
+	fresh->port      = NULL;
+	fresh->path      = NULL;
+	fresh->path_re   = NULL;
+	fresh->redir     = NULL;
+	fresh->user      = NULL;
+	fresh->method    = NULL;
+	fresh->client_ip = NULL;
+
+	/* token markers: where each token starts in the input, and how long it is */
+	fresh->tokens.scheme_start   = NULL;
+	fresh->tokens.scheme_length  = 0;
+	fresh->tokens.host_start     = NULL;
+	fresh->tokens.host_length    = 0;
+	fresh->tokens.port_start     = NULL;
+	fresh->tokens.port_length    = 0;
+	fresh->tokens.path_start     = NULL;
+	fresh->tokens.path_length    = 0;
+	fresh->tokens.path_re_start  = NULL;
+	fresh->tokens.path_re_length = 0;
+	fresh->tokens.redir_start    = NULL;
+	fresh->tokens.redir_length   = 0;
+	fresh->tokens.meth_start     = NULL;
+	fresh->tokens.meth_length    = 0;
+	fresh->tokens.c_ip_start     = NULL;
+	fresh->tokens.c_ip_length    = 0;
+	return( fresh );
+}
+
+
+/*
+ * Free a parsed struct along with the strings it owns; NULL is ignored.
+ * The pointer must not be used again after this call.
+ */
+void
+finish_parsed( parsed *p_parsed )
+{
+	if ( p_parsed != NULL ) {
+		/* members specific to one kind of parse */
+		if ( p_parsed->type == RULE ) {
+			free( p_parsed->redir );
+			free( p_parsed->path_re );
+		}
+		if ( p_parsed->type == REQUEST ) {
+			free( p_parsed->client_ip );
+			free( p_parsed->method );
+			free( p_parsed->tld );
+		}
+
+		/* members shared by requests and rules */
+		free( p_parsed->port );
+		free( p_parsed->path );
+		free( p_parsed->host );
+		free( p_parsed->scheme );
+
+		/* finally the struct itself */
+		free( p_parsed );
+	}
+}
+
+
+/* COPY_STR( LBL ): copy token LBL out of p_parsed->tokens via copy_string_token(); the old COPY_IP4 variant is retired. */
+#define COPY_STR( LBL ) copy_string_token( p_parsed->tokens.LBL ## _start, p_parsed->tokens.LBL ## _length )
 
 /*
  * Take the previously parsed token locations and copy them into the request struct.
  *
  */
 void
-populate_request( request *p_request )
+populate_parsed( parsed *p_parsed )
 {
-	p_request->scheme    = COPY_STR( scheme );
-	p_request->host      = COPY_STR( host );
-	p_request->path      = COPY_STR( path );
-	p_request->method    = COPY_STR( meth );
-	p_request->client_ip = COPY_IP4( c_ip );
-
-	(void)parse_port( p_request );
-	(void)parse_tld(  p_request );
-
-	return;
-}
-
+	p_parsed->scheme = COPY_STR( scheme ); /* these four are copied for both requests and rules */
+	p_parsed->host   = COPY_STR( host );
+	p_parsed->path   = COPY_STR( path );
+	p_parsed->port   = COPY_STR( port );   /* port is kept as a string copy, no numeric conversion */
 
-/*
- * Pull out the port number and convert it to an integer before
- * storing in the request struct.
- *
- */
-void
-parse_port( request *p_request )
-{
-	if ( p_request->tokens.port_start == NULL || p_request->tokens.port_length == 0 ) return;
+	if ( p_parsed->type == REQUEST ) { /* members only parse_request() marks tokens for */
+		p_parsed->method    = COPY_STR( meth );
+		p_parsed->client_ip = COPY_STR( c_ip );
+		/* client_ip is kept as text; the old COPY_IP4 conversion is disabled */
 
-	char port[5];
+		(void)lowercase_str( p_parsed->host, p_parsed->tokens.host_length ); /* presumably folds the host copy to lower case in place -- confirm in helper */
+		(void)parse_tld( p_parsed ); /* runs after the host copy exists; parse_tld() scans p_parsed->host */
+	}
 
-	(void)strncpy( port, p_request->tokens.port_start, p_request->tokens.port_length );
-	port[ p_request->tokens.port_length ] = '\0';
-	(void)sscanf( port, "%hu", &p_request->port );
+	if ( p_parsed->type == RULE ) { /* members only parse_rule() marks tokens for */
+		p_parsed->path_re = COPY_STR( path_re );
+		p_parsed->redir   = COPY_STR( redir );
+	}
 
 	return;
 }
@@ -263,7 +447,7 @@
  *
  */
 void
-parse_tld( request *p_request )
+parse_tld( parsed *p_request )
 {
 	unsigned short int cs = 5, mark = 0;
 	char *p   = p_request->host;
@@ -305,25 +489,3 @@
 	return;
 }
 
-
-/*
- * Release memory used by the request struct.
- *
- */
-void
-finish_request( request *p_request )
-{
-	if ( p_request == NULL ) return;
-
-	free( p_request->scheme );
-	free( p_request->host );
-	free( p_request->tld );
-	free( p_request->path );
-	free( p_request->method );
-	free( p_request->client_ip );
-
-	free( p_request ), p_request = NULL;
-
-	return;
-}
-