1 /*
2 * ====================================================================
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 * ====================================================================
20 *
21 * This software consists of voluntary contributions made by many
22 * individuals on behalf of the Apache Software Foundation. For more
23 * information on the Apache Software Foundation, please see
24 * <http://www.apache.org/>.
25 *
26 */
27
28 package org.apache.http.client.utils;
29
30 import java.io.IOException;
31 import java.net.URI;
32 import java.nio.ByteBuffer;
33 import java.nio.CharBuffer;
34 import java.nio.charset.Charset;
35 import java.util.ArrayList;
36 import java.util.BitSet;
37 import java.util.Collections;
38 import java.util.List;
39 import java.util.Scanner;
40
41 import org.apache.http.Consts;
42 import org.apache.http.Header;
43 import org.apache.http.HeaderElement;
44 import org.apache.http.HttpEntity;
45 import org.apache.http.NameValuePair;
46 import org.apache.http.annotation.Immutable;
47 import org.apache.http.entity.ContentType;
48 import org.apache.http.message.BasicHeaderValueParser;
49 import org.apache.http.message.BasicNameValuePair;
50 import org.apache.http.message.ParserCursor;
51 import org.apache.http.protocol.HTTP;
52 import org.apache.http.util.CharArrayBuffer;
53 import org.apache.http.util.EntityUtils;
54
55 /**
56 * A collection of utilities for encoding URLs.
57 *
58 * @since 4.0
59 */
60 @Immutable
61 public class URLEncodedUtils {
62
63 /**
64 * The default HTML form content type.
65 */
66 public static final String CONTENT_TYPE = "application/x-www-form-urlencoded";
67
68 private static final char QP_SEP_A = '&';
69 private static final char QP_SEP_S = ';';
70 private static final String NAME_VALUE_SEPARATOR = "=";
71
72 /**
73 * Returns a list of {@link NameValuePair NameValuePairs} as built from the URI's query portion. For example, a URI
74 * of http://example.org/path/to/file?a=1&b=2&c=3 would return a list of three NameValuePairs, one for a=1, one for
75 * b=2, and one for c=3. By convention, {@code '&'} and {@code ';'} are accepted as parameter separators.
76 * <p>
77 * This is typically useful while parsing an HTTP PUT.
78 *
79 * This API is currently only used for testing.
80 *
81 * @param uri
82 * URI to parse
83 * @param charset
84 * Charset name to use while parsing the query
85 * @return a list of {@link NameValuePair} as built from the URI's query portion.
86 */
87 public static List <NameValuePair> parse(final URI uri, final String charset) {
88 final String query = uri.getRawQuery();
89 if (query != null && query.length() > 0) {
90 final List<NameValuePair> result = new ArrayList<NameValuePair>();
91 final Scanner scanner = new Scanner(query);
92 parse(result, scanner, QP_SEP_PATTERN, charset);
93 return result;
94 }
95 return Collections.emptyList();
96 }
97
98 /**
99 * Returns a list of {@link NameValuePair NameValuePairs} as parsed from an {@link HttpEntity}. The encoding is
100 * taken from the entity's Content-Encoding header.
101 * <p>
102 * This is typically used while parsing an HTTP POST.
103 *
104 * @param entity
105 * The entity to parse
106 * @return a list of {@link NameValuePair} as built from the URI's query portion.
107 * @throws IOException
108 * If there was an exception getting the entity's data.
109 */
110 public static List <NameValuePair> parse(
111 final HttpEntity entity) throws IOException {
112 final ContentType contentType = ContentType.get(entity);
113 if (contentType != null && contentType.getMimeType().equalsIgnoreCase(CONTENT_TYPE)) {
114 final String content = EntityUtils.toString(entity, Consts.ASCII);
115 if (content != null && content.length() > 0) {
116 Charset charset = contentType.getCharset();
117 if (charset == null) {
118 charset = HTTP.DEF_CONTENT_CHARSET;
119 }
120 return parse(content, charset, QP_SEPS);
121 }
122 }
123 return Collections.emptyList();
124 }
125
126 /**
127 * Returns true if the entity's Content-Type header is
128 * <code>application/x-www-form-urlencoded</code>.
129 */
130 public static boolean isEncoded(final HttpEntity entity) {
131 final Header h = entity.getContentType();
132 if (h != null) {
133 final HeaderElement[] elems = h.getElements();
134 if (elems.length > 0) {
135 final String contentType = elems[0].getName();
136 return contentType.equalsIgnoreCase(CONTENT_TYPE);
137 } else {
138 return false;
139 }
140 } else {
141 return false;
142 }
143 }
144
145 /**
146 * Adds all parameters within the Scanner to the list of <code>parameters</code>, as encoded by
147 * <code>encoding</code>. For example, a scanner containing the string <code>a=1&b=2&c=3</code> would add the
148 * {@link NameValuePair NameValuePairs} a=1, b=2, and c=3 to the list of parameters. By convention, {@code '&'} and
149 * {@code ';'} are accepted as parameter separators.
150 *
151 * @param parameters
152 * List to add parameters to.
153 * @param scanner
154 * Input that contains the parameters to parse.
155 * @param charset
156 * Encoding to use when decoding the parameters.
157 */
158 public static void parse(
159 final List <NameValuePair> parameters,
160 final Scanner scanner,
161 final String charset) {
162 parse(parameters, scanner, QP_SEP_PATTERN, charset);
163 }
164
165 /**
166 * Adds all parameters within the Scanner to the list of
167 * <code>parameters</code>, as encoded by <code>encoding</code>. For
168 * example, a scanner containing the string <code>a=1&b=2&c=3</code> would
169 * add the {@link NameValuePair NameValuePairs} a=1, b=2, and c=3 to the
170 * list of parameters.
171 *
172 * @param parameters
173 * List to add parameters to.
174 * @param scanner
175 * Input that contains the parameters to parse.
176 * @param parameterSepartorPattern
177 * The Pattern string for parameter separators, by convention {@code "[&;]"}
178 * @param charset
179 * Encoding to use when decoding the parameters.
180 */
181 public static void parse(
182 final List <NameValuePair> parameters,
183 final Scanner scanner,
184 final String parameterSepartorPattern,
185 final String charset) {
186 scanner.useDelimiter(parameterSepartorPattern);
187 while (scanner.hasNext()) {
188 String name = null;
189 String value = null;
190 final String token = scanner.next();
191 final int i = token.indexOf(NAME_VALUE_SEPARATOR);
192 if (i != -1) {
193 name = decodeFormFields(token.substring(0, i).trim(), charset);
194 value = decodeFormFields(token.substring(i + 1).trim(), charset);
195 } else {
196 name = decodeFormFields(token.trim(), charset);
197 }
198 parameters.add(new BasicNameValuePair(name, value));
199 }
200 }
201
202 /**
203 * Query parameter separators.
204 */
205 private static final char[] QP_SEPS = new char[] { QP_SEP_A, QP_SEP_S };
206
207 /**
208 * Query parameter separator pattern.
209 */
210 private static final String QP_SEP_PATTERN = "[" + new String(QP_SEPS) + "]";
211
212 /**
213 * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using the given character
214 * encoding. By convention, {@code '&'} and {@code ';'} are accepted as parameter separators.
215 *
216 * @param s
217 * text to parse.
218 * @param charset
219 * Encoding to use when decoding the parameters.
220 * @return a list of {@link NameValuePair} as built from the URI's query portion.
221 *
222 * @since 4.2
223 */
224 public static List<NameValuePair> parse(final String s, final Charset charset) {
225 return parse(s, charset, QP_SEPS);
226 }
227
228 /**
229 * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using the given character
230 * encoding.
231 *
232 * @param s
233 * text to parse.
234 * @param charset
235 * Encoding to use when decoding the parameters.
236 * @param parameterSeparator
237 * The characters used to separate parameters, by convention, {@code '&'} and {@code ';'}.
238 * @return a list of {@link NameValuePair} as built from the URI's query portion.
239 *
240 * @since 4.3
241 */
242 public static List<NameValuePair> parse(final String s, final Charset charset, final char... parameterSeparator) {
243 if (s == null) {
244 return Collections.emptyList();
245 }
246 final BasicHeaderValueParser parser = BasicHeaderValueParser.INSTANCE;
247 final CharArrayBuffer buffer = new CharArrayBuffer(s.length());
248 buffer.append(s);
249 final ParserCursor cursor = new ParserCursor(0, buffer.length());
250 final List<NameValuePair> list = new ArrayList<NameValuePair>();
251 while (!cursor.atEnd()) {
252 final NameValuePair nvp = parser.parseNameValuePair(buffer, cursor, parameterSeparator);
253 if (nvp.getName().length() > 0) {
254 list.add(new BasicNameValuePair(
255 decodeFormFields(nvp.getName(), charset),
256 decodeFormFields(nvp.getValue(), charset)));
257 }
258 }
259 return list;
260 }
261
262 /**
263 * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
264 * list of parameters in an HTTP PUT or HTTP POST.
265 *
266 * @param parameters The parameters to include.
267 * @param charset The encoding to use.
268 * @return An {@code application/x-www-form-urlencoded} string
269 */
270 public static String format(
271 final List <? extends NameValuePair> parameters,
272 final String charset) {
273 return format(parameters, QP_SEP_A, charset);
274 }
275
276 /**
277 * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
278 * list of parameters in an HTTP PUT or HTTP POST.
279 *
280 * @param parameters The parameters to include.
281 * @param parameterSeparator The parameter separator, by convention, {@code '&'} or {@code ';'}.
282 * @param charset The encoding to use.
283 * @return An {@code application/x-www-form-urlencoded} string
284 *
285 * @since 4.3
286 */
287 public static String format(
288 final List <? extends NameValuePair> parameters,
289 final char parameterSeparator,
290 final String charset) {
291 final StringBuilder result = new StringBuilder();
292 for (final NameValuePair parameter : parameters) {
293 final String encodedName = encodeFormFields(parameter.getName(), charset);
294 final String encodedValue = encodeFormFields(parameter.getValue(), charset);
295 if (result.length() > 0) {
296 result.append(parameterSeparator);
297 }
298 result.append(encodedName);
299 if (encodedValue != null) {
300 result.append(NAME_VALUE_SEPARATOR);
301 result.append(encodedValue);
302 }
303 }
304 return result.toString();
305 }
306
307 /**
308 * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
309 * list of parameters in an HTTP PUT or HTTP POST.
310 *
311 * @param parameters The parameters to include.
312 * @param charset The encoding to use.
313 * @return An {@code application/x-www-form-urlencoded} string
314 *
315 * @since 4.2
316 */
317 public static String format(
318 final Iterable<? extends NameValuePair> parameters,
319 final Charset charset) {
320 return format(parameters, QP_SEP_A, charset);
321 }
322
323 /**
324 * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
325 * list of parameters in an HTTP PUT or HTTP POST.
326 *
327 * @param parameters The parameters to include.
328 * @param parameterSeparator The parameter separator, by convention, {@code '&'} or {@code ';'}.
329 * @param charset The encoding to use.
330 * @return An {@code application/x-www-form-urlencoded} string
331 *
332 * @since 4.3
333 */
334 public static String format(
335 final Iterable<? extends NameValuePair> parameters,
336 final char parameterSeparator,
337 final Charset charset) {
338 final StringBuilder result = new StringBuilder();
339 for (final NameValuePair parameter : parameters) {
340 final String encodedName = encodeFormFields(parameter.getName(), charset);
341 final String encodedValue = encodeFormFields(parameter.getValue(), charset);
342 if (result.length() > 0) {
343 result.append(parameterSeparator);
344 }
345 result.append(encodedName);
346 if (encodedValue != null) {
347 result.append(NAME_VALUE_SEPARATOR);
348 result.append(encodedValue);
349 }
350 }
351 return result.toString();
352 }
353
354 /**
355 * Unreserved characters, i.e. alphanumeric, plus: {@code _ - ! . ~ ' ( ) *}
356 * <p>
357 * This list is the same as the {@code unreserved} list in
358 * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
359 */
360 private static final BitSet UNRESERVED = new BitSet(256);
361 /**
362 * Punctuation characters: , ; : $ & + =
363 * <p>
364 * These are the additional characters allowed by userinfo.
365 */
366 private static final BitSet PUNCT = new BitSet(256);
367 /** Characters which are safe to use in userinfo,
368 * i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation */
369 private static final BitSet USERINFO = new BitSet(256);
370 /** Characters which are safe to use in a path,
371 * i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation plus / @ */
372 private static final BitSet PATHSAFE = new BitSet(256);
373 /** Characters which are safe to use in a query or a fragment,
374 * i.e. {@link #RESERVED} plus {@link #UNRESERVED} */
375 private static final BitSet URIC = new BitSet(256);
376
377 /**
378 * Reserved characters, i.e. {@code ;/?:@&=+$,[]}
379 * <p>
380 * This list is the same as the {@code reserved} list in
381 * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
382 * as augmented by
383 * <a href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>
384 */
385 private static final BitSet RESERVED = new BitSet(256);
386
387
388 /**
389 * Safe characters for x-www-form-urlencoded data, as per java.net.URLEncoder and browser behaviour,
390 * i.e. alphanumeric plus {@code "-", "_", ".", "*"}
391 */
392 private static final BitSet URLENCODER = new BitSet(256);
393
394 static {
395 // unreserved chars
396 // alpha characters
397 for (int i = 'a'; i <= 'z'; i++) {
398 UNRESERVED.set(i);
399 }
400 for (int i = 'A'; i <= 'Z'; i++) {
401 UNRESERVED.set(i);
402 }
403 // numeric characters
404 for (int i = '0'; i <= '9'; i++) {
405 UNRESERVED.set(i);
406 }
407 UNRESERVED.set('_'); // these are the charactes of the "mark" list
408 UNRESERVED.set('-');
409 UNRESERVED.set('.');
410 UNRESERVED.set('*');
411 URLENCODER.or(UNRESERVED); // skip remaining unreserved characters
412 UNRESERVED.set('!');
413 UNRESERVED.set('~');
414 UNRESERVED.set('\'');
415 UNRESERVED.set('(');
416 UNRESERVED.set(')');
417 // punct chars
418 PUNCT.set(',');
419 PUNCT.set(';');
420 PUNCT.set(':');
421 PUNCT.set('$');
422 PUNCT.set('&');
423 PUNCT.set('+');
424 PUNCT.set('=');
425 // Safe for userinfo
426 USERINFO.or(UNRESERVED);
427 USERINFO.or(PUNCT);
428
429 // URL path safe
430 PATHSAFE.or(UNRESERVED);
431 PATHSAFE.set('/'); // segment separator
432 PATHSAFE.set(';'); // param separator
433 PATHSAFE.set(':'); // rest as per list in 2396, i.e. : @ & = + $ ,
434 PATHSAFE.set('@');
435 PATHSAFE.set('&');
436 PATHSAFE.set('=');
437 PATHSAFE.set('+');
438 PATHSAFE.set('$');
439 PATHSAFE.set(',');
440
441 RESERVED.set(';');
442 RESERVED.set('/');
443 RESERVED.set('?');
444 RESERVED.set(':');
445 RESERVED.set('@');
446 RESERVED.set('&');
447 RESERVED.set('=');
448 RESERVED.set('+');
449 RESERVED.set('$');
450 RESERVED.set(',');
451 RESERVED.set('['); // added by RFC 2732
452 RESERVED.set(']'); // added by RFC 2732
453
454 URIC.or(RESERVED);
455 URIC.or(UNRESERVED);
456 }
457
458 private static final int RADIX = 16;
459
460 private static String urlEncode(
461 final String content,
462 final Charset charset,
463 final BitSet safechars,
464 final boolean blankAsPlus) {
465 if (content == null) {
466 return null;
467 }
468 final StringBuilder buf = new StringBuilder();
469 final ByteBuffer bb = charset.encode(content);
470 while (bb.hasRemaining()) {
471 final int b = bb.get() & 0xff;
472 if (safechars.get(b)) {
473 buf.append((char) b);
474 } else if (blankAsPlus && b == ' ') {
475 buf.append('+');
476 } else {
477 buf.append("%");
478 final char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, RADIX));
479 final char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, RADIX));
480 buf.append(hex1);
481 buf.append(hex2);
482 }
483 }
484 return buf.toString();
485 }
486
487 /**
488 * Decode/unescape a portion of a URL, to use with the query part ensure {@code plusAsBlank} is true.
489 *
490 * @param content the portion to decode
491 * @param charset the charset to use
492 * @param plusAsBlank if {@code true}, then convert '+' to space (e.g. for www-url-form-encoded content), otherwise leave as is.
493 * @return encoded string
494 */
495 private static String urlDecode(
496 final String content,
497 final Charset charset,
498 final boolean plusAsBlank) {
499 if (content == null) {
500 return null;
501 }
502 final ByteBuffer bb = ByteBuffer.allocate(content.length());
503 final CharBuffer cb = CharBuffer.wrap(content);
504 while (cb.hasRemaining()) {
505 final char c = cb.get();
506 if (c == '%' && cb.remaining() >= 2) {
507 final char uc = cb.get();
508 final char lc = cb.get();
509 final int u = Character.digit(uc, 16);
510 final int l = Character.digit(lc, 16);
511 if (u != -1 && l != -1) {
512 bb.put((byte) ((u << 4) + l));
513 } else {
514 bb.put((byte) '%');
515 bb.put((byte) uc);
516 bb.put((byte) lc);
517 }
518 } else if (plusAsBlank && c == '+') {
519 bb.put((byte) ' ');
520 } else {
521 bb.put((byte) c);
522 }
523 }
524 bb.flip();
525 return charset.decode(bb).toString();
526 }
527
528 /**
529 * Decode/unescape www-url-form-encoded content.
530 *
531 * @param content the content to decode, will decode '+' as space
532 * @param charset the charset to use
533 * @return encoded string
534 */
535 private static String decodeFormFields (final String content, final String charset) {
536 if (content == null) {
537 return null;
538 }
539 return urlDecode(content, charset != null ? Charset.forName(charset) : Consts.UTF_8, true);
540 }
541
542 /**
543 * Decode/unescape www-url-form-encoded content.
544 *
545 * @param content the content to decode, will decode '+' as space
546 * @param charset the charset to use
547 * @return encoded string
548 */
549 private static String decodeFormFields (final String content, final Charset charset) {
550 if (content == null) {
551 return null;
552 }
553 return urlDecode(content, charset != null ? charset : Consts.UTF_8, true);
554 }
555
556 /**
557 * Encode/escape www-url-form-encoded content.
558 * <p>
559 * Uses the {@link #URLENCODER} set of characters, rather than
560 * the {@link #UNRSERVED} set; this is for compatibilty with previous
561 * releases, URLEncoder.encode() and most browsers.
562 *
563 * @param content the content to encode, will convert space to '+'
564 * @param charset the charset to use
565 * @return encoded string
566 */
567 private static String encodeFormFields(final String content, final String charset) {
568 if (content == null) {
569 return null;
570 }
571 return urlEncode(content, charset != null ? Charset.forName(charset) : Consts.UTF_8, URLENCODER, true);
572 }
573
574 /**
575 * Encode/escape www-url-form-encoded content.
576 * <p>
577 * Uses the {@link #URLENCODER} set of characters, rather than
578 * the {@link #UNRSERVED} set; this is for compatibilty with previous
579 * releases, URLEncoder.encode() and most browsers.
580 *
581 * @param content the content to encode, will convert space to '+'
582 * @param charset the charset to use
583 * @return encoded string
584 */
585 private static String encodeFormFields (final String content, final Charset charset) {
586 if (content == null) {
587 return null;
588 }
589 return urlEncode(content, charset != null ? charset : Consts.UTF_8, URLENCODER, true);
590 }
591
592 /**
593 * Encode a String using the {@link #USERINFO} set of characters.
594 * <p>
595 * Used by URIBuilder to encode the userinfo segment.
596 *
597 * @param content the string to encode, does not convert space to '+'
598 * @param charset the charset to use
599 * @return the encoded string
600 */
601 static String encUserInfo(final String content, final Charset charset) {
602 return urlEncode(content, charset, USERINFO, false);
603 }
604
605 /**
606 * Encode a String using the {@link #URIC} set of characters.
607 * <p>
608 * Used by URIBuilder to encode the query and fragment segments.
609 *
610 * @param content the string to encode, does not convert space to '+'
611 * @param charset the charset to use
612 * @return the encoded string
613 */
614 static String encUric(final String content, final Charset charset) {
615 return urlEncode(content, charset, URIC, false);
616 }
617
618 /**
619 * Encode a String using the {@link #PATHSAFE} set of characters.
620 * <p>
621 * Used by URIBuilder to encode path segments.
622 *
623 * @param content the string to encode, does not convert space to '+'
624 * @param charset the charset to use
625 * @return the encoded string
626 */
627 static String encPath(final String content, final Charset charset) {
628 return urlEncode(content, charset, PATHSAFE, false);
629 }
630
631 }