26 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
26 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 */ |
29 */ |
30 |
30 |
31 /* |
|
32 Squid docs: |
|
33 --------------------------------------------------------------------------- |
|
34 TAG: url_rewrite_program |
|
35 Specify the location of the executable for the URL rewriter. |
|
36 Since they can perform almost any function there isn't one included. |
|
37 |
|
38 For each requested URL rewriter will receive on line with the format |
|
39 |
|
40 URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL> |
|
41 |
|
42 In the future, the rewriter interface will be extended with |
|
43 key=value pairs ("kvpairs" shown above). Rewriter programs |
|
44 should be prepared to receive and possibly ignore additional |
|
45 whitespace-separated tokens on each input line. |
|
46 |
|
47 And the rewriter may return a rewritten URL. The other components of |
|
48 the request line does not need to be returned (ignored if they are). |
|
49 |
|
50 The rewriter can also indicate that a client-side redirect should |
|
51 be performed to the new URL. This is done by prefixing the returned |
|
52 URL with "301:" (moved permanently) or 302: (moved temporarily). |
|
53 |
|
54 By default, a URL rewriter is not used. |
|
55 --------------------------------------------------------------------------- |
|
56 */ |
|
57 |
|
58 #include "volta.h" |
31 #include "volta.h" |
59 |
32 |
60 %%{ |
33 %%{ |
61 machine redirector; |
34 machine redirector; |
62 |
35 |
63 action parse_error { |
36 action success { valid = 1; } |
64 debug( 2, LOC, "parse error\n" ); |
37 action error { valid = 0; } |
65 return( NULL ); |
38 |
|
39 action channel_id_found { |
|
40 debug( 1, LOC, "Channel ID found in redirector input. Set 'url_rewrite_concurrency' to '0' in squid.\n" ); |
|
41 fbreak; |
66 } |
42 } |
67 |
43 |
68 action yay { |
44 action scheme_start { p_request->tokens.scheme_start = fpc; } |
69 printf( "I saw: %s", p+1 ); |
45 action scheme_finish { p_request->tokens.scheme_length = fpc - ( *pe + p_request->tokens.scheme_start ); } |
70 } |
46 action scheme_error { debug( 3, LOC, "Unable to parse scheme.\n" ); } |
71 |
47 |
72 # http://, ftp://, https://, etc |
48 action host_start { p_request->tokens.host_start = fpc; } |
73 proto = alpha{3,5} . '://'; |
49 action host_finish { p_request->tokens.host_length = fpc - ( *pe + p_request->tokens.host_start ); } |
74 |
50 action host_error { debug( 3, LOC, "Unable to parse hostname.\n" ); } |
75 # http://mahlon:password@example.com or http://mahlon@example.com |
51 |
76 # username optional password |
52 action port_start { p_request->tokens.port_start = fpc; } |
77 creds = ( alnum | [+._\-] )+ . ( ':' . any+ )? . '@'; |
53 action port_finish { p_request->tokens.port_length = fpc - ( *pe + p_request->tokens.port_start ); } |
78 |
54 |
79 main := ( proto . creds ) | proto @yay '\n'; |
55 action path_start { p_request->tokens.path_start = fpc; } |
|
56 action path_finish { p_request->tokens.path_length = fpc - ( *pe + p_request->tokens.path_start ); } |
|
57 |
|
58 action meth_start { p_request->tokens.meth_start = fpc; } |
|
59 action meth_finish { p_request->tokens.meth_length = fpc - ( *pe + p_request->tokens.meth_start ); } |
|
60 action meth_error { debug( 3, LOC, "Unable to parse method.\n" ); } |
|
61 |
|
62 action c_ip_start { p_request->tokens.c_ip_start = fpc; } |
|
63 action c_ip_finish { p_request->tokens.c_ip_length = fpc - ( *pe + p_request->tokens.c_ip_start ); } |
|
64 action c_ip_error { debug( 3, LOC, "Unable to parse the client IP address.\n" ); } |
|
65 |
|
66 # |
|
67 # Squid line: URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL> |
|
68 # |
|
69 # URI Syntax (RFC 3986) misc notes: |
|
70 # |
|
71 # - Scheme isn't passed to redirectors on CONNECT method requests |
|
72 # |
|
73 # - Hostname segments aren't supposed to be individually greater than 63 chars, |
|
74 # and the hostname in total shouldn't exceed 255. They also shouldn't be entirely |
|
75 # made up of digits, or contain underscores. In practice, these rules appear to |
|
76 # be violated constantly by some pretty big sites. I'm looking at you, facebook. |
|
77 # (( alnum ) | ( alnum . [a-zA-Z0-9\-]{0,63} . alnum )) & !( digit+ ); |
|
78 # |
|
79 # - ipv6 has some utterly insane rules (RFC 5952) in the name of "shortcuts", which |
|
80 # only seem like shortcuts to someone writing IP addresses by hand. Anyone that |
|
81 # has to parse (or even just read) them has a bunch of seemingly arbitrary work |
|
82 # dumped in their lap. Heck, it's impossible to even search for an ipv6 address |
|
83 # that contains zeros in a text editor, because you have no idea how it might |
|
84 # be represented. Rad! |
|
85 # |
|
86 # The parser just trusts any ipv6 squid hands us as being valid, without |
|
87 # any real parsing/validation, other than it consists of hex digits and colons. |
|
88 # |
|
89 # - This parser originally validated path/query/fragment as well, but there were |
|
90 # enough inconsistencies with unescaped chars and other real-life RFC deviations |
|
91 # that I opted to just accept what we get from squid. |
|
92 # |
|
93 # - Redirectors aren't handed any userinfo (http://mahlon:password@example.com), |
|
94 # so no need to check for that. |
|
95 # |
|
96 |
|
97 host_component = alnum | ( alnum [a-zA-Z0-9\-_]* alnum ); |
|
98 pchar = ( alnum | [\-._~!$%&'()*+,;=] ); |
|
99 path_segment = '/' ( any - space )*; |
|
100 |
|
101 hostname = host_component ( '.' host_component )* '.'?; |
|
102 ipv4 = digit{1,3} '.' digit{1,3} '.' digit{1,3} '.' digit{1,3}; |
|
103 ipv6 = ( xdigit | ':' )+; |
|
104 |
|
105 channel_id = ( digit+ space ) %channel_id_found; |
|
106 scheme = ( alpha{3,5} '://' ) >scheme_start %scheme_finish @!scheme_error; |
|
107 host = ( hostname | ipv4 ) >host_start %host_finish @!host_error; |
|
108 port = ( ':' digit{1,5} ) >port_start %port_finish; |
|
109 path = path_segment* >path_start %path_finish; |
|
110 client_ip = ipv4 >c_ip_start %c_ip_finish @!c_ip_error; |
|
111 method = upper+ >meth_start %meth_finish @!meth_error; |
|
112 |
|
113 Line = ( |
|
114 start: ( |
|
115 channel_id? -> Url |
|
116 ), |
|
117 |
|
118 Url: ( |
|
119 scheme? host port? path? space -> Client |
|
120 ), |
|
121 |
|
122 Client: ( |
|
123 client_ip '/' ( hostname | '-' ) space -> User |
|
124 ), |
|
125 |
|
126 User: ( |
|
127 pchar+ space -> Method |
|
128 ), |
|
129 |
|
130 Method: ( |
|
131 method -> KVPairs |
|
132 ), |
|
133 |
|
134 KVPairs: ( |
|
135 ( space any+ )? -> final |
|
136 ) |
|
137 ) %success @!error; |
|
138 |
|
139 |
|
140 main := Line '\n'; |
80 }%% |
141 }%% |
81 %% write data; |
142 %% write data; |
82 |
143 |
83 /* |
144 /* |
84 %%{ |
145 * Tokenize an incoming line from squid, returning a parsed and populated |
85 machine redirector; |
146 * structure to make redirection decisions against. This pointer should |
86 |
147 * be freed using cleanup_request() after use. |
87 action yay { |
148 * |
88 printf( "I saw: %s", p+1 ); |
149 * Squid documentation about redirectors: |
89 } |
150 * --------------------------------------------------------------------------- |
90 |
151 * TAG: url_rewrite_program |
91 # http://, ftp://, https://, etc |
152 * Specify the location of the executable for the URL rewriter. |
92 proto = alpha{3,5} . '://'; |
153 * Since they can perform almost any function there isn't one included. |
93 |
154 * |
94 # http://mahlon:password@example.com or http://mahlon@example.com |
155 * For each requested URL, the rewriter will receive one line with the format |
95 # username optional password |
156 * |
96 creds = ( alnum | [+._\-] )+ . ( ':' . any+ )? . '@'; |
157 * URL <SP> client_ip "/" fqdn <SP> user <SP> method [<SP> kvpairs]<NL> |
97 |
158 * |
98 main := ( proto . creds ) | proto @yay '\n'; |
159 * In the future, the rewriter interface will be extended with |
99 }%% |
160 * key=value pairs ("kvpairs" shown above). Rewriter programs |
100 %% write data; |
161 * should be prepared to receive and possibly ignore additional |
|
162 * whitespace-separated tokens on each input line. |
|
163 * |
|
164 * And the rewriter may return a rewritten URL. The other components of |
|
165 * the request line does not need to be returned (ignored if they are). |
|
166 * |
|
167 * The rewriter can also indicate that a client-side redirect should |
|
168 * be performed to the new URL. This is done by prefixing the returned |
|
169 * URL with "301:" (moved permanently) or 302: (moved temporarily). |
|
170 * |
|
171 * By default, a URL rewriter is not used. |
|
172 * --------------------------------------------------------------------------- |
101 */ |
173 */ |
102 |
174 request * |
103 |
175 parse( char *line ) |
104 /* |
176 { |
105 %%{ |
177 /* machine required vars */ |
106 machine foo; |
178 int cs = 0; |
107 |
179 char *p = line; |
108 OPEN = 0; |
180 char *pe = p + strlen(p); |
109 CLOSE = 1; |
181 char *eof = NULL; |
110 |
182 |
111 main := |
183 /* the client request pointer */ |
112 start: |
184 unsigned char valid = 0; |
113 door_closed: ( |
185 request *p_request = init_request(); |
114 OPEN -> door_open -> final |
|
115 ), |
|
116 door_open: ( |
|
117 CLOSE -> door_closed |
|
118 ); |
|
119 }%% |
|
120 */ |
|
121 |
|
122 struct request * |
|
123 parse( char *p ) |
|
124 { |
|
125 /* initial machine state */ |
|
126 short int cs = 0; |
|
127 |
|
128 /* the client request object */ |
|
129 request c_request; |
|
130 request *cp_request = &c_request; |
|
131 |
|
132 /* |
|
133 char ip[ INET_ADDRSTRLEN ]; |
|
134 inet_pton( AF_INET, "127.0.0.1", &cp_request->ip ); |
|
135 inet_ntop( AF_INET, &cp_request->ip, ip, INET_ADDRSTRLEN ); |
|
136 */ |
|
137 |
|
138 /* initialize state machine with current line */ |
|
139 char *pe = p + strlen(p) + 1; |
|
140 |
186 |
141 /* enter state machine */ |
187 /* enter state machine */ |
142 %% write init; |
188 %% write init; |
143 %% write exec; |
189 %% write exec; |
144 |
190 |
145 /* reset the request */ |
191 /* If we were given an invalid line, bail early */ |
146 /* c_request = reset_request; */ |
192 if ( valid == 0 ) { |
147 return( cp_request ); |
193 free( p_request ), p_request = NULL; |
148 } |
194 debug( 3, LOC, "Invalid line (%d), skipped\n", v.timer.lines + 1 ); |
149 |
195 debug( 4, LOC, "%s", line ); |
|
196 return( NULL ); |
|
197 } |
|
198 |
|
199 (void)populate_request( p_request ); |
|
200 return( p_request ); |
|
201 } |
|
202 |
|
203 |
|
204 /* |
|
205 * Initialize and return a pointer to a new request object. |
|
206 * |
|
207 */ |
|
208 request * |
|
209 init_request( void ) |
|
210 { |
|
211 request *p_request = NULL; |
|
212 if ( (p_request = malloc( sizeof(request) )) == NULL ) { |
|
213 debug( 1, LOC, "Unable to allocate memory for request struct: %s\n", strerror(errno) ); |
|
214 return( NULL ); |
|
215 } |
|
216 p_request->scheme = NULL; |
|
217 p_request->host = NULL; |
|
218 p_request->port = NULL; |
|
219 p_request->path = NULL; |
|
220 p_request->user = NULL; |
|
221 p_request->method = NULL; |
|
222 p_request->client_ip = NULL; |
|
223 |
|
224 p_request->tokens.scheme_start = NULL; |
|
225 p_request->tokens.host_start = NULL; |
|
226 p_request->tokens.port_start = NULL; |
|
227 p_request->tokens.path_start = NULL; |
|
228 p_request->tokens.meth_start = NULL; |
|
229 p_request->tokens.c_ip_start = NULL; |
|
230 p_request->tokens.scheme_length = 0; |
|
231 p_request->tokens.host_length = 0; |
|
232 p_request->tokens.port_length = 0; |
|
233 p_request->tokens.path_length = 0; |
|
234 p_request->tokens.meth_length = 0; |
|
235 p_request->tokens.c_ip_length = 0; |
|
236 |
|
237 return p_request; |
|
238 } |
|
239 |
|
240 |
|
241 /* |
|
242 * Take the previously parsed token locations and copy them into the request struct. |
|
243 * |
|
244 */ |
|
245 void |
|
246 populate_request( struct request *p_request ) |
|
247 { |
|
248 p_request->scheme = |
|
249 copy_string_token( p_request->tokens.scheme_start, p_request->tokens.scheme_length ); |
|
250 p_request->host = |
|
251 copy_string_token( p_request->tokens.host_start, p_request->tokens.host_length ); |
|
252 p_request->port = |
|
253 copy_string_token( p_request->tokens.port_start, p_request->tokens.port_length ); |
|
254 p_request->path = |
|
255 copy_string_token( p_request->tokens.path_start, p_request->tokens.path_length ); |
|
256 p_request->method = |
|
257 copy_string_token( p_request->tokens.meth_start, p_request->tokens.meth_length ); |
|
258 p_request->client_ip = |
|
259 copy_ipv4_token( p_request->tokens.c_ip_start, p_request->tokens.c_ip_length ); |
|
260 |
|
261 return; |
|
262 } |
|
263 |
|
264 |
|
265 /* |
|
266 * Release memory used by request struct. |
|
267 * |
|
268 */ |
|
269 void |
|
270 cleanup_request( struct request *p_request ) |
|
271 { |
|
272 if ( p_request == NULL ) return; |
|
273 |
|
274 free( p_request->scheme ); |
|
275 free( p_request->host ); |
|
276 free( p_request->port ); |
|
277 free( p_request->path ); |
|
278 free( p_request->method ); |
|
279 free( p_request->client_ip ); |
|
280 |
|
281 free( p_request ), p_request = NULL; |
|
282 |
|
283 return; |
|
284 } |
|
285 |