28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 */ |
29 */ |
30 |
30 |
31 #include "volta.h" |
31 #include "volta.h" |
32 |
32 |
|
/*
 * Token bookkeeping helpers used from the Ragel actions.  Both expect the
 * machine pointer 'p' and the request pointer 'p_request' to be in scope.
 *
 *   MARK_S( LBL )  remembers the current position as the start of token LBL.
 *   MARK_E( LBL )  stores the distance from that start to the current
 *                  position as the length of token LBL.
 *
 * The trailing semicolons are intentional: the actions invoke these macros
 * without one, e.g.  action host_start { MARK_S(host) }
 */
#define MARK_S( LBL ) p_request->tokens.LBL ## _start = p;
#define MARK_E( LBL ) p_request->tokens.LBL ## _length = p - p_request->tokens.LBL ## _start;
|
35 |
|
36 /* |
|
37 * Tokenize an incoming line from squid, returning a parsed and populated |
|
38 * structure to make redirection decisions against. This pointer should |
|
39 * be freed using cleanup_request() after use. |
|
40 * |
|
41 * Squid documentation about redirectors: |
|
42 * --------------------------------------------------------------------------- |
|
43 * TAG: url_rewrite_program |
|
44 * Specify the location of the executable for the URL rewriter. |
|
45 * Since they can perform almost any function there isn't one included. |
|
46 * |
|
47 * For each requested URL, the rewriter will receive one line with the format
|
48 * |
|
49 * URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL> |
|
50 * |
|
51 * In the future, the rewriter interface will be extended with |
|
52 * key=value pairs ("kvpairs" shown above). Rewriter programs |
|
53 * should be prepared to receive and possibly ignore additional |
|
54 * whitespace-separated tokens on each input line. |
|
55 * |
|
56 * And the rewriter may return a rewritten URL. The other components of |
|
57 * the request line do not need to be returned (ignored if they are).
|
58 * |
|
59 * The rewriter can also indicate that a client-side redirect should |
|
60 * be performed to the new URL. This is done by prefixing the returned |
|
61 * URL with "301:" (moved permanently) or 302: (moved temporarily). |
|
62 * |
|
63 * By default, a URL rewriter is not used. |
|
64 * --------------------------------------------------------------------------- |
|
65 */ |
|
66 request * |
|
67 parse( char *line ) |
|
68 { |
|
69 /* machine required vars */ |
|
70 unsigned short int cs = 1; |
|
71 char *p = line; |
|
72 char *pe = p + strlen(p); |
|
73 char *eof = NULL; |
|
74 |
|
75 /* the client request pointer */ |
|
76 request *p_request = init_request(); |
|
77 |
33 %%{ |
78 %%{ |
34 machine redirector; |
79 machine input_parser; |
35 |
|
36 action success { valid = 1; } |
|
37 action error { valid = 0; } |
|
38 |
80 |
39 action channel_id_found { |
81 action channel_id_found { |
40 debug( 1, LOC, "Channel ID found in redirector input. Set 'url_rewrite_concurrency' to '0' in squid.\n" ); |
82 debug( 1, LOC, "Channel ID found in redirector input. Set 'url_rewrite_concurrency' to '0' in squid.\n" ); |
41 fbreak; |
83 fbreak; |
42 } |
84 } |
43 |
85 |
44 action scheme_start { p_request->tokens.scheme_start = fpc; } |
86 action scheme_start { MARK_S(scheme) } |
45 action scheme_finish { p_request->tokens.scheme_length = fpc - ( *pe + p_request->tokens.scheme_start ); } |
87 action scheme_finish { MARK_E(scheme) } |
46 action scheme_error { debug( 3, LOC, "Unable to parse scheme.\n" ); } |
88 action host_start { MARK_S(host) } |
47 |
89 action host_finish { MARK_E(host) } |
48 action host_start { p_request->tokens.host_start = fpc; } |
90 action port_start { MARK_S(port) } |
49 action host_finish { p_request->tokens.host_length = fpc - ( *pe + p_request->tokens.host_start ); } |
91 action port_finish { MARK_E(port) } |
50 action host_error { debug( 3, LOC, "Unable to parse hostname.\n" ); } |
92 action path_start { MARK_S(path) } |
51 |
93 action path_finish { MARK_E(path) } |
52 action port_start { p_request->tokens.port_start = fpc; } |
94 action meth_start { MARK_S(meth) } |
53 action port_finish { p_request->tokens.port_length = fpc - ( *pe + p_request->tokens.port_start ); } |
95 action meth_finish { MARK_E(meth) } |
54 |
96 action c_ip_start { MARK_S(c_ip) } |
55 action path_start { p_request->tokens.path_start = fpc; } |
97 action c_ip_finish { MARK_E(c_ip) } |
56 action path_finish { p_request->tokens.path_length = fpc - ( *pe + p_request->tokens.path_start ); } |
98 |
57 |
99 action host_error { debug( 3, LOC, "Unable to parse hostname.\n" ); } |
58 action meth_start { p_request->tokens.meth_start = fpc; } |
100 action scheme_error { debug( 3, LOC, "Unable to parse scheme.\n" ); } |
59 action meth_finish { p_request->tokens.meth_length = fpc - ( *pe + p_request->tokens.meth_start ); } |
101 action meth_error { debug( 3, LOC, "Unable to parse method.\n" ); } |
60 action meth_error { debug( 3, LOC, "Unable to parse method.\n" ); } |
102 action c_ip_error { debug( 3, LOC, "Unable to parse the client IP address.\n" ); } |
61 |
103 |
62 action c_ip_start { p_request->tokens.c_ip_start = fpc; } |
104 # |
63 action c_ip_finish { p_request->tokens.c_ip_length = fpc - ( *pe + p_request->tokens.c_ip_start ); } |
105 # Squid line: URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL> |
64 action c_ip_error { debug( 3, LOC, "Unable to parse the client IP address.\n" ); } |
106 # |
65 |
107 # URI Syntax (RFC 3986) misc notes: |
66 # |
108 # |
67 # Squid line: URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL> |
109 # - Scheme isn't passed to redirectors on CONNECT method requests |
68 # |
110 # |
69 # URI Syntax (RFC 3986) misc notes: |
111 # - Hostname segments aren't supposed to be individually greater than 63 chars, |
70 # |
112 # and the hostname in total shouldn't exceed 255. They also shouldn't be entirely |
71 # - Scheme isn't passed to redirectors on CONNECT method requests |
113 # made up of digits, or contain underscores. In practice, these rules appear to |
72 # |
114 # be violated constantly by some pretty big sites. |
73 # - Hostname segments aren't supposed to be individually greater than 63 chars, |
115 # (( alnum ) | ( alnum . [a-zA-Z0-9\-]{0,63} . alnum )) & !( digit+ ); |
74 # and the hostname in total shouldn't exceed 255. They also shouldn't be entirely |
116 # |
75 # made up of digits, or contain underscores. In practice, these rules appear to |
117 # - ipv6 has some utterly insane rules (RFC 5952) in the name of "shortcuts", which |
76 # be violated constantly by some pretty big sites. I'm looking at you, facebook. |
118 # only seem like shortcuts to someone writing IP addresses by hand. Anyone that |
77 # (( alnum ) | ( alnum . [a-zA-Z0-9\-]{0,63} . alnum )) & !( digit+ ); |
119 # has to parse (or even just read) them has a bunch of seemingly arbitrary work |
78 # |
120 # dumped in their lap. Heck, it's impossible to even search for an ipv6 address |
79 # - ipv6 has some utterly insane rules (RFC 5952) in the name of "shortcuts", which |
121 # that contains zeros in a text editor, because you have no idea how it might
80 # only seem like shortcuts to someone writing IP addresses by hand. Anyone that |
122 # be represented. Rad! |
81 # has to parse (or even just read) them has a bunch of seemingly arbitrary work |
123 # |
82 # dumped in their lap. Heck, it's impossible to even search for an ipv6 address |
124 # The parser just trusts any ipv6 squid hands us as being valid, without |
83 # that contains zeros in a text editor, because you have no idea how it might
125 # any real parsing/validation, other than it consists of hex digits and colons. |
84 # be represented. Rad! |
126 # |
85 # |
127 # - This parser originally validated path/query/fragment as well, but there were |
86 # The parser just trusts any ipv6 squid hands us as being valid, without |
128 # enough inconsistencies with unescaped chars and other real-life RFC deviations |
87 # any real parsing/validation, other than it consists of hex digits and colons. |
129 # that I opted to just accept what we get from squid. |
88 # |
|
89 # - This parser originally validated path/query/fragment as well, but there were |
|
90 # enough inconsistencies with unescaped chars and other real-life RFC deviations |
|
91 # that I opted to just accept what we get from squid. |
|
92 # |
130 # |
93 # - Redirectors aren't handed any userinfo (http://mahlon:password@example.com), |
131 # - Redirectors aren't handed any userinfo (http://mahlon:password@example.com), |
94 # so no need to check for that. |
132 # so no need to check for that. |
95 # |
133 # |
96 |
134 |
97 host_component = alnum | ( alnum [a-zA-Z0-9\-_]* alnum ); |
135 host_component = alnum | ( alnum [a-zA-Z0-9\-_]* alnum ); |
98 pchar = ( alnum | [\-._~!$%&'()*+,;=] ); |
136 pchar = ( alnum | [\-._~!$%&'()*+,;=] ); |
99 path_segment = '/' ( any - space )*; |
137 path_segment = '/' ( any - space )*; |
100 |
138 |
108 port = ( ':' digit{1,5} ) >port_start %port_finish; |
146 port = ( ':' digit{1,5} ) >port_start %port_finish; |
109 path = path_segment* >path_start %path_finish; |
147 path = path_segment* >path_start %path_finish; |
110 client_ip = ipv4 >c_ip_start %c_ip_finish @!c_ip_error; |
148 client_ip = ipv4 >c_ip_start %c_ip_finish @!c_ip_error; |
111 method = upper+ >meth_start %meth_finish @!meth_error; |
149 method = upper+ >meth_start %meth_finish @!meth_error; |
112 |
150 |
113 Line = ( |
151 SquidLine = ( |
114 start: ( |
152 start: channel_id? -> Url, |
115 channel_id? -> Url |
153 Url: scheme? host port? path? space -> Client, |
116 ), |
154 Client: client_ip '/' ( hostname | '-' ) space -> User, |
117 |
155 User: pchar+ space -> Method, |
118 Url: ( |
156 Method: method -> KVPairs, |
119 scheme? host port? path? space -> Client |
157 KVPairs: ( space any+ )? -> final |
120 ), |
158 ); |
121 |
159 |
122 Client: ( |
160 main := SquidLine '\n'; |
123 client_ip '/' ( hostname | '-' ) space -> User |
|
124 ), |
|
125 |
|
126 User: ( |
|
127 pchar+ space -> Method |
|
128 ), |
|
129 |
|
130 Method: ( |
|
131 method -> KVPairs |
|
132 ), |
|
133 |
|
134 KVPairs: ( |
|
135 ( space any+ )? -> final |
|
136 ) |
|
137 ) %success @!error; |
|
138 |
|
139 |
|
140 main := Line '\n'; |
|
141 }%% |
161 }%% |
142 %% write data; |
162 |
143 |
163 /* state machine */ |
144 /* |
|
145 * Tokenize an incoming line from squid, returning a parsed and populated |
|
146 * structure to make redirection decisions against. This pointer should |
|
147 * be freed using cleanup_request() after use. |
|
148 * |
|
149 * Squid documentation about redirectors: |
|
150 * --------------------------------------------------------------------------- |
|
151 * TAG: url_rewrite_program |
|
152 * Specify the location of the executable for the URL rewriter. |
|
153 * Since they can perform almost any function there isn't one included. |
|
154 * |
|
155 * For each requested URL, the rewriter will receive one line with the format
|
156 * |
|
157 * URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL> |
|
158 * |
|
159 * In the future, the rewriter interface will be extended with |
|
160 * key=value pairs ("kvpairs" shown above). Rewriter programs |
|
161 * should be prepared to receive and possibly ignore additional |
|
162 * whitespace-separated tokens on each input line. |
|
163 * |
|
164 * And the rewriter may return a rewritten URL. The other components of |
|
165 * the request line do not need to be returned (ignored if they are).
|
166 * |
|
167 * The rewriter can also indicate that a client-side redirect should |
|
168 * be performed to the new URL. This is done by prefixing the returned |
|
169 * URL with "301:" (moved permanently) or 302: (moved temporarily). |
|
170 * |
|
171 * By default, a URL rewriter is not used. |
|
172 * --------------------------------------------------------------------------- |
|
173 */ |
|
174 request * |
|
175 parse( char *line ) |
|
176 { |
|
177 /* machine required vars */ |
|
178 int cs = 0; |
|
179 char *p = line; |
|
180 char *pe = p + strlen(p); |
|
181 char *eof = NULL; |
|
182 |
|
183 /* the client request pointer */ |
|
184 unsigned char valid = 0; |
|
185 request *p_request = init_request(); |
|
186 |
|
187 /* enter state machine */ |
|
188 %% write init; |
|
189 %% write exec; |
164 %% write exec; |
190 |
165 |
191 /* If we were given an invalid line, bail early */ |
166 /* If we were given an invalid line, bail early */ |
192 if ( valid == 0 ) { |
167 if ( cs < %%{ write first_final; }%% ) { |
193 free( p_request ), p_request = NULL; |
168 free( p_request ), p_request = NULL; |
194 debug( 3, LOC, "Invalid line (%d), skipped\n", v.timer.lines + 1 ); |
169 debug( 3, LOC, "Invalid line (%d), skipped\n", v.timer.lines + 1 ); |
195 debug( 4, LOC, "%s", line ); |
170 debug( 4, LOC, "%s", line ); |
196 return( NULL ); |
171 return( NULL ); |
197 } |
172 } |
208 request * |
183 request * |
209 init_request( void ) |
184 init_request( void ) |
210 { |
185 { |
211 request *p_request = NULL; |
186 request *p_request = NULL; |
212 if ( (p_request = malloc( sizeof(request) )) == NULL ) { |
187 if ( (p_request = malloc( sizeof(request) )) == NULL ) { |
213 debug( 1, LOC, "Unable to allocate memory for request struct: %s\n", strerror(errno) ); |
188 debug( 5, LOC, "Unable to allocate memory for request struct: %s\n", strerror(errno) ); |
214 return( NULL ); |
189 return( NULL ); |
215 } |
190 } |
|
191 |
216 p_request->scheme = NULL; |
192 p_request->scheme = NULL; |
217 p_request->host = NULL; |
193 p_request->host = NULL; |
|
194 p_request->tld = NULL; |
218 p_request->port = NULL; |
195 p_request->port = NULL; |
219 p_request->path = NULL; |
196 p_request->path = NULL; |
220 p_request->user = NULL; |
197 p_request->user = NULL; |
221 p_request->method = NULL; |
198 p_request->method = NULL; |
222 p_request->client_ip = NULL; |
199 p_request->client_ip = NULL; |
223 |
200 |
224 p_request->tokens.scheme_start = NULL; |
201 p_request->tokens.scheme_start = NULL; |
|
202 p_request->tokens.scheme_length = 0; |
225 p_request->tokens.host_start = NULL; |
203 p_request->tokens.host_start = NULL; |
|
204 p_request->tokens.host_length = 0; |
226 p_request->tokens.port_start = NULL; |
205 p_request->tokens.port_start = NULL; |
|
206 p_request->tokens.port_length = 0; |
227 p_request->tokens.path_start = NULL; |
207 p_request->tokens.path_start = NULL; |
|
208 p_request->tokens.path_length = 0; |
228 p_request->tokens.meth_start = NULL; |
209 p_request->tokens.meth_start = NULL; |
|
210 p_request->tokens.meth_length = 0; |
229 p_request->tokens.c_ip_start = NULL; |
211 p_request->tokens.c_ip_start = NULL; |
230 p_request->tokens.scheme_length = 0; |
|
231 p_request->tokens.host_length = 0; |
|
232 p_request->tokens.port_length = 0; |
|
233 p_request->tokens.path_length = 0; |
|
234 p_request->tokens.meth_length = 0; |
|
235 p_request->tokens.c_ip_length = 0; |
212 p_request->tokens.c_ip_length = 0; |
236 |
213 |
237 return p_request; |
214 return p_request; |
238 } |
215 } |
239 |
216 |
|
217 |
|
218 #define COPY_STR( LBL ) copy_string_token( p_request->tokens.LBL ## _start, p_request->tokens.LBL ## _length ) |
|
219 #define COPY_IP4( LBL ) copy_ipv4_token( p_request->tokens.LBL ## _start, p_request->tokens.LBL ## _length ) |
240 |
220 |
241 /* |
221 /* |
242 * Take the previously parsed token locations and copy them into the request struct. |
222 * Take the previously parsed token locations and copy them into the request struct. |
243 * |
223 * |
244 */ |
224 */ |
245 void |
225 void |
246 populate_request( struct request *p_request ) |
226 populate_request( request *p_request ) |
247 { |
227 { |
248 p_request->scheme = |
228 p_request->scheme = COPY_STR( scheme ); |
249 copy_string_token( p_request->tokens.scheme_start, p_request->tokens.scheme_length ); |
229 p_request->host = COPY_STR( host ); |
250 p_request->host = |
230 p_request->port = COPY_STR( port ); |
251 copy_string_token( p_request->tokens.host_start, p_request->tokens.host_length ); |
231 p_request->path = COPY_STR( path ); |
252 p_request->port = |
232 p_request->method = COPY_STR( meth ); |
253 copy_string_token( p_request->tokens.port_start, p_request->tokens.port_length ); |
233 p_request->client_ip = COPY_IP4( c_ip ); |
254 p_request->path = |
234 |
255 copy_string_token( p_request->tokens.path_start, p_request->tokens.path_length ); |
235 parse_tld( p_request ); |
256 p_request->method = |
|
257 copy_string_token( p_request->tokens.meth_start, p_request->tokens.meth_length ); |
|
258 p_request->client_ip = |
|
259 copy_ipv4_token( p_request->tokens.c_ip_start, p_request->tokens.c_ip_length ); |
|
260 |
236 |
261 return; |
237 return; |
262 } |
238 } |
263 |
239 |
264 |
240 |
265 /* |
241 /* |
266 * Release memory used by request struct. |
242 * Pull the top level domain out of the requested hostname. |
|
243 * |
|
244 */ |
|
void
parse_tld( request *p_request )
{
	/*
	 * Scans p_request->host and, when the scanner matches a
	 * "component.component" pair, stores a freshly allocated copy of it in
	 * p_request->tld.  p_request->tld is left untouched when nothing
	 * matches or when the allocation fails.  The host string is reversed
	 * in place while scanning and restored before returning.
	 */

	/* NOTE(review): cs is a hard-coded start state for the tld_parser
	   machine rather than the result of '%% write init' -- confirm it
	   still matches the generated code whenever the machine changes. */
	unsigned short int cs = 5, mark = 0;
	char *p  = NULL;
	char *pe = NULL;
	char *ts = 0, *te = 0, *eof = NULL;

	/* Without a host there is nothing to scan, and neither the pointer
	   arithmetic below nor reverse_str() should be handed NULL. */
	if ( p_request == NULL || p_request->host == NULL ) return;

	p  = p_request->host;
	pe = p + p_request->tokens.host_length;

%%{
	machine tld_parser;

	host_component = alnum | ( alnum [a-zA-Z0-9\-_]* alnum );
	tld = ( host_component '.' host_component );

	main := |*
		# 'te' points just past the match, so strlen(te) is what remains
		# of the (reversed) host.  Subtracting it from the host length
		# leaves the number of leading characters that were matched.
		tld => { mark = ( p_request->tokens.host_length - (int)strlen(te) ); };
	*|;
}%%

	/* It's far easier (and faster) to scan from left to right rather than
	   backtrack, so start by reversing the requested host string. */
	reverse_str( p_request->host );

	/* scanner */
	%% write exec;

	/* If there was a mark, allocate room for it (plus the terminator that
	   calloc zero-fills) and copy it out of the still-reversed host. */
	if ( mark != 0 ) {
		p_request->tld = calloc( mark + 1, sizeof(char) );

		if ( p_request->tld == NULL ) {
			debug( 5, LOC, "Unable to allocate memory for tld token: %s\n", strerror(errno) );
		}
		else {
			memcpy( p_request->tld, p_request->host, mark );
			/* The copy was taken from the reversed host; flip it back. */
			reverse_str( p_request->tld );
		}
	}

	/* restore the hostname (on the allocation failure path as well). */
	reverse_str( p_request->host );
	return;
}
|
287 |
|
288 |
|
289 /* |
|
290 * Release memory used by the request struct. |
267 * |
291 * |
268 */ |
292 */ |
269 void |
293 void |
270 cleanup_request( struct request *p_request ) |
294 cleanup_request( struct request *p_request ) |
271 { |
295 { |
272 if ( p_request == NULL ) return; |
296 if ( p_request == NULL ) return; |
273 |
297 |
274 free( p_request->scheme ); |
298 free( p_request->scheme ); |
275 free( p_request->host ); |
299 free( p_request->host ); |
|
300 free( p_request->tld ); |
276 free( p_request->port ); |
301 free( p_request->port ); |
277 free( p_request->path ); |
302 free( p_request->path ); |
278 free( p_request->method ); |
303 free( p_request->method ); |
279 free( p_request->client_ip ); |
304 free( p_request->client_ip ); |
280 |
305 |