1 /*
2 * ====================================================================
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 * ====================================================================
20 *
21 * This software consists of voluntary contributions made by many
22 * individuals on behalf of the Apache Software Foundation. For more
23 * information on the Apache Software Foundation, please see
24 * <http://www.apache.org/>.
25 *
26 */
27
28 package org.apache.http.client.utils;
29
30 import java.io.IOException;
31 import java.net.URI;
32 import java.nio.ByteBuffer;
33 import java.nio.CharBuffer;
34 import java.nio.charset.Charset;
35 import java.util.ArrayList;
36 import java.util.BitSet;
37 import java.util.Collections;
38 import java.util.List;
39 import java.util.Scanner;
40
41 import org.apache.http.annotation.Immutable;
42 import org.apache.http.entity.ContentType;
43
44 import org.apache.http.Consts;
45 import org.apache.http.Header;
46 import org.apache.http.HeaderElement;
47 import org.apache.http.HttpEntity;
48 import org.apache.http.NameValuePair;
49 import org.apache.http.message.BasicHeaderValueParser;
50 import org.apache.http.message.BasicNameValuePair;
51 import org.apache.http.message.ParserCursor;
52 import org.apache.http.protocol.HTTP;
53 import org.apache.http.util.CharArrayBuffer;
54 import org.apache.http.util.EntityUtils;
55
56 /**
57 * A collection of utilities for encoding URLs.
58 *
59 * @since 4.0
60 */
61 @Immutable
62 public class URLEncodedUtils {
63
/** MIME type that identifies URL-encoded form content; compared case-insensitively against entity content types. */
public static final String CONTENT_TYPE = "application/x-www-form-urlencoded";
/** Delimiter between successive name/value pairs, used both when parsing and when formatting. */
private static final String PARAMETER_SEPARATOR = "&";
/** Delimiter between a parameter name and its value, used both when parsing and when formatting. */
private static final String NAME_VALUE_SEPARATOR = "=";
67
68 /**
69 * Returns a list of {@link NameValuePair NameValuePairs} as built from the
70 * URI's query portion. For example, a URI of
71 * http://example.org/path/to/file?a=1&b=2&c=3 would return a list of three
72 * NameValuePairs, one for a=1, one for b=2, and one for c=3.
73 * <p>
74 * This is typically useful while parsing an HTTP PUT.
75 *
76 * @param uri
77 * uri to parse
78 * @param encoding
79 * encoding to use while parsing the query
80 */
81 public static List <NameValuePair> parse (final URI uri, final String encoding) {
82 final String query = uri.getRawQuery();
83 if (query != null && query.length() > 0) {
84 List<NameValuePair> result = new ArrayList<NameValuePair>();
85 Scanner scanner = new Scanner(query);
86 parse(result, scanner, encoding);
87 return result;
88 } else {
89 return Collections.emptyList();
90 }
91 }
92
93 /**
94 * Returns a list of {@link NameValuePair NameValuePairs} as parsed from an
95 * {@link HttpEntity}. The encoding is taken from the entity's
96 * Content-Encoding header.
97 * <p>
98 * This is typically used while parsing an HTTP POST.
99 *
100 * @param entity
101 * The entity to parse
102 * @throws IOException
103 * If there was an exception getting the entity's data.
104 */
105 public static List <NameValuePair> parse (
106 final HttpEntity entity) throws IOException {
107 ContentType contentType = ContentType.get(entity);
108 if (contentType != null && contentType.getMimeType().equalsIgnoreCase(CONTENT_TYPE)) {
109 String content = EntityUtils.toString(entity, Consts.ASCII);
110 if (content != null && content.length() > 0) {
111 Charset charset = contentType.getCharset();
112 if (charset == null) {
113 charset = HTTP.DEF_CONTENT_CHARSET;
114 }
115 return parse(content, charset);
116 }
117 }
118 return Collections.emptyList();
119 }
120
121 /**
122 * Returns true if the entity's Content-Type header is
123 * <code>application/x-www-form-urlencoded</code>.
124 */
125 public static boolean isEncoded (final HttpEntity entity) {
126 Header h = entity.getContentType();
127 if (h != null) {
128 HeaderElement[] elems = h.getElements();
129 if (elems.length > 0) {
130 String contentType = elems[0].getName();
131 return contentType.equalsIgnoreCase(CONTENT_TYPE);
132 } else {
133 return false;
134 }
135 } else {
136 return false;
137 }
138 }
139
140 /**
141 * Adds all parameters within the Scanner to the list of
142 * <code>parameters</code>, as encoded by <code>encoding</code>. For
143 * example, a scanner containing the string <code>a=1&b=2&c=3</code> would
144 * add the {@link NameValuePair NameValuePairs} a=1, b=2, and c=3 to the
145 * list of parameters.
146 *
147 * @param parameters
148 * List to add parameters to.
149 * @param scanner
150 * Input that contains the parameters to parse.
151 * @param charset
152 * Encoding to use when decoding the parameters.
153 */
154 public static void parse (
155 final List <NameValuePair> parameters,
156 final Scanner scanner,
157 final String charset) {
158 scanner.useDelimiter(PARAMETER_SEPARATOR);
159 while (scanner.hasNext()) {
160 String name = null;
161 String value = null;
162 String token = scanner.next();
163 int i = token.indexOf(NAME_VALUE_SEPARATOR);
164 if (i != -1) {
165 name = decodeFormFields(token.substring(0, i).trim(), charset);
166 value = decodeFormFields(token.substring(i + 1).trim(), charset);
167 } else {
168 name = decodeFormFields(token.trim(), charset);
169 }
170 parameters.add(new BasicNameValuePair(name, value));
171 }
172 }
173
174 private static final char[] DELIM = new char[] { '&' };
175
176 /**
177 * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string
178 * using the given character encoding.
179 *
180 * @param s
181 * text to parse.
182 * @param charset
183 * Encoding to use when decoding the parameters.
184 *
185 * @since 4.2
186 */
187 public static List<NameValuePair> parse (final String s, final Charset charset) {
188 if (s == null) {
189 return Collections.emptyList();
190 }
191 BasicHeaderValueParser parser = BasicHeaderValueParser.DEFAULT;
192 CharArrayBuffer buffer = new CharArrayBuffer(s.length());
193 buffer.append(s);
194 ParserCursor cursor = new ParserCursor(0, buffer.length());
195 List<NameValuePair> list = new ArrayList<NameValuePair>();
196 while (!cursor.atEnd()) {
197 NameValuePair nvp = parser.parseNameValuePair(buffer, cursor, DELIM);
198 if (nvp.getName().length() > 0) {
199 list.add(new BasicNameValuePair(
200 decodeFormFields(nvp.getName(), charset),
201 decodeFormFields(nvp.getValue(), charset)));
202 }
203 }
204 return list;
205 }
206
207 /**
208 * Returns a String that is suitable for use as an <code>application/x-www-form-urlencoded</code>
209 * list of parameters in an HTTP PUT or HTTP POST.
210 *
211 * @param parameters The parameters to include.
212 * @param encoding The encoding to use.
213 */
214 public static String format (
215 final List <? extends NameValuePair> parameters,
216 final String encoding) {
217 final StringBuilder result = new StringBuilder();
218 for (final NameValuePair parameter : parameters) {
219 final String encodedName = encodeFormFields(parameter.getName(), encoding);
220 final String encodedValue = encodeFormFields(parameter.getValue(), encoding);
221 if (result.length() > 0) {
222 result.append(PARAMETER_SEPARATOR);
223 }
224 result.append(encodedName);
225 if (encodedValue != null) {
226 result.append(NAME_VALUE_SEPARATOR);
227 result.append(encodedValue);
228 }
229 }
230 return result.toString();
231 }
232
233 /**
234 * Returns a String that is suitable for use as an <code>application/x-www-form-urlencoded</code>
235 * list of parameters in an HTTP PUT or HTTP POST.
236 *
237 * @param parameters The parameters to include.
238 * @param charset The encoding to use.
239 *
240 * @since 4.2
241 */
242 public static String format (
243 final Iterable<? extends NameValuePair> parameters,
244 final Charset charset) {
245 final StringBuilder result = new StringBuilder();
246 for (final NameValuePair parameter : parameters) {
247 final String encodedName = encodeFormFields(parameter.getName(), charset);
248 final String encodedValue = encodeFormFields(parameter.getValue(), charset);
249 if (result.length() > 0) {
250 result.append(PARAMETER_SEPARATOR);
251 }
252 result.append(encodedName);
253 if (encodedValue != null) {
254 result.append(NAME_VALUE_SEPARATOR);
255 result.append(encodedValue);
256 }
257 }
258 return result.toString();
259 }
260
/**
 * Unreserved characters, i.e. alphanumeric, plus: {@code _ - ! . ~ ' ( ) *}
 * <p>
 * This list is the same as the {@code unreserved} list in
 * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
 */
private static final BitSet UNRESERVED = new BitSet(256);

/**
 * Punctuation characters: , ; : $ & + =
 * <p>
 * These are the additional characters allowed by userinfo.
 */
private static final BitSet PUNCT = new BitSet(256);

/**
 * Reserved characters, i.e. {@code ;/?:@&=+$,[]}
 * <p>
 * This list is the same as the {@code reserved} list in
 * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
 * as augmented by
 * <a href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>
 */
private static final BitSet RESERVED = new BitSet(256);

/** Characters which are safe to use in userinfo, i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation */
private static final BitSet USERINFO = new BitSet(256);

/** Characters which are safe to use in a path, i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation plus / @ */
private static final BitSet PATHSAFE = new BitSet(256);

/** Characters which are safe to use in a fragment, i.e. {@link #RESERVED} plus {@link #UNRESERVED} */
private static final BitSet FRAGMENT = new BitSet(256);

/**
 * Safe characters for x-www-form-urlencoded data, as per java.net.URLEncoder and browser behaviour,
 * i.e. alphanumeric plus {@code "-", "_", ".", "*"}
 */
private static final BitSet URLENCODER = new BitSet(256);

static {
    // Alphanumerics (the upper bound of BitSet.set(from, to) is exclusive).
    UNRESERVED.set('a', 'z' + 1);
    UNRESERVED.set('A', 'Z' + 1);
    UNRESERVED.set('0', '9' + 1);

    // The characters of the "mark" list that form encoding also leaves as is.
    for (final char mark : "_-.*".toCharArray()) {
        UNRESERVED.set(mark);
    }
    // Copy taken at this point on purpose: the marks added next must not be form-safe.
    URLENCODER.or(UNRESERVED);
    for (final char mark : "!~'()".toCharArray()) {
        UNRESERVED.set(mark);
    }

    // Punctuation additionally allowed in userinfo.
    for (final char punct : ",;:$&+=".toCharArray()) {
        PUNCT.set(punct);
    }

    // Safe for userinfo.
    USERINFO.or(UNRESERVED);
    USERINFO.or(PUNCT);

    // URL path safe: '/' separates segments, ';' separates params, and the
    // rest follows the list in RFC 2396, i.e. : @ & = + $ ,
    PATHSAFE.or(UNRESERVED);
    for (final char pathChar : "/;:@&=+$,".toCharArray()) {
        PATHSAFE.set(pathChar);
    }

    // '[' and ']' were added by RFC 2732.
    for (final char reserved : ";/?:@&=+$,[]".toCharArray()) {
        RESERVED.set(reserved);
    }

    FRAGMENT.or(RESERVED);
    FRAGMENT.or(UNRESERVED);
}
361
362 private static final int RADIX = 16;
363
364 private static String urlencode(
365 final String content,
366 final Charset charset,
367 final BitSet safechars,
368 final boolean blankAsPlus) {
369 if (content == null) {
370 return null;
371 }
372 StringBuilder buf = new StringBuilder();
373 ByteBuffer bb = charset.encode(content);
374 while (bb.hasRemaining()) {
375 int b = bb.get() & 0xff;
376 if (safechars.get(b)) {
377 buf.append((char) b);
378 } else if (blankAsPlus && b == ' ') {
379 buf.append('+');
380 } else {
381 buf.append("%");
382 char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, RADIX));
383 char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, RADIX));
384 buf.append(hex1);
385 buf.append(hex2);
386 }
387 }
388 return buf.toString();
389 }
390
391 /**
392 * Decode/unescape a portion of a URL, to use with the query part ensure {@code plusAsBlank} is true.
393 *
394 * @param content the portion to decode
395 * @param charset the charset to use
396 * @param plusAsBlank if {@code true}, then convert '+' to space (e.g. for www-url-form-encoded content), otherwise leave as is.
397 * @return encoded string
398 */
399 private static String urldecode(
400 final String content,
401 final Charset charset,
402 final boolean plusAsBlank) {
403 if (content == null) {
404 return null;
405 }
406 ByteBuffer bb = ByteBuffer.allocate(content.length());
407 CharBuffer cb = CharBuffer.wrap(content);
408 while (cb.hasRemaining()) {
409 char c = cb.get();
410 if (c == '%' && cb.remaining() >= 2) {
411 char uc = cb.get();
412 char lc = cb.get();
413 int u = Character.digit(uc, 16);
414 int l = Character.digit(lc, 16);
415 if (u != -1 && l != -1) {
416 bb.put((byte) ((u << 4) + l));
417 } else {
418 bb.put((byte) '%');
419 bb.put((byte) uc);
420 bb.put((byte) lc);
421 }
422 } else if (plusAsBlank && c == '+') {
423 bb.put((byte) ' ');
424 } else {
425 bb.put((byte) c);
426 }
427 }
428 bb.flip();
429 return charset.decode(bb).toString();
430 }
431
432 /**
433 * Decode/unescape www-url-form-encoded content.
434 *
435 * @param content the content to decode, will decode '+' as space
436 * @param charset the charset to use
437 * @return encoded string
438 */
439 private static String decodeFormFields (final String content, final String charset) {
440 if (content == null) {
441 return null;
442 }
443 return urldecode(content, charset != null ? Charset.forName(charset) : Consts.UTF_8, true);
444 }
445
446 /**
447 * Decode/unescape www-url-form-encoded content.
448 *
449 * @param content the content to decode, will decode '+' as space
450 * @param charset the charset to use
451 * @return encoded string
452 */
453 private static String decodeFormFields (final String content, final Charset charset) {
454 if (content == null) {
455 return null;
456 }
457 return urldecode(content, charset != null ? charset : Consts.UTF_8, true);
458 }
459
460 /**
461 * Encode/escape www-url-form-encoded content.
462 * <p>
463 * Uses the {@link #URLENCODER} set of characters, rather than
464 * the {@link #UNRSERVED} set; this is for compatibilty with previous
465 * releases, URLEncoder.encode() and most browsers.
466 *
467 * @param content the content to encode, will convert space to '+'
468 * @param charset the charset to use
469 * @return encoded string
470 */
471 private static String encodeFormFields (final String content, final String charset) {
472 if (content == null) {
473 return null;
474 }
475 return urlencode(content, charset != null ? Charset.forName(charset) :
476 Consts.UTF_8, URLENCODER, true);
477 }
478
479 /**
480 * Encode/escape www-url-form-encoded content.
481 * <p>
482 * Uses the {@link #URLENCODER} set of characters, rather than
483 * the {@link #UNRSERVED} set; this is for compatibilty with previous
484 * releases, URLEncoder.encode() and most browsers.
485 *
486 * @param content the content to encode, will convert space to '+'
487 * @param charset the charset to use
488 * @return encoded string
489 */
490 private static String encodeFormFields (final String content, final Charset charset) {
491 if (content == null) {
492 return null;
493 }
494 return urlencode(content, charset != null ? charset : Consts.UTF_8, URLENCODER, true);
495 }
496
/**
 * Encode a String using the {@link #USERINFO} set of characters.
 * <p>
 * Used by URIBuilder to encode the userinfo segment.
 *
 * @param content the string to encode, does not convert space to '+'
 * @param charset the charset to use; must not be {@code null}, since no
 *        default is substituted here
 * @return the encoded string, or {@code null} if {@code content} is {@code null}
 */
static String encUserInfo(final String content, final Charset charset) {
    return urlencode(content, charset, USERINFO, false);
}
509
/**
 * Encode a String using the {@link #FRAGMENT} set of characters.
 * <p>
 * Used by URIBuilder to encode the fragment.
 *
 * @param content the string to encode, does not convert space to '+'
 * @param charset the charset to use; must not be {@code null}, since no
 *        default is substituted here
 * @return the encoded string, or {@code null} if {@code content} is {@code null}
 */
static String encFragment(final String content, final Charset charset) {
    return urlencode(content, charset, FRAGMENT, false);
}
522
/**
 * Encode a String using the {@link #PATHSAFE} set of characters.
 * <p>
 * Used by URIBuilder to encode path segments.
 *
 * @param content the string to encode, does not convert space to '+'
 * @param charset the charset to use; must not be {@code null}, since no
 *        default is substituted here
 * @return the encoded string, or {@code null} if {@code content} is {@code null}
 */
static String encPath(final String content, final Charset charset) {
    return urlencode(content, charset, PATHSAFE, false);
}
535
536 }