View Javadoc

1   /*
2    * ====================================================================
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *   http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing,
14   * software distributed under the License is distributed on an
15   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16   * KIND, either express or implied.  See the License for the
17   * specific language governing permissions and limitations
18   * under the License.
19   * ====================================================================
20   *
21   * This software consists of voluntary contributions made by many
22   * individuals on behalf of the Apache Software Foundation.  For more
23   * information on the Apache Software Foundation, please see
24   * <http://www.apache.org/>.
25   *
26   */
27  
28  package org.apache.http.client.utils;
29  
30  import java.io.IOException;
31  import java.net.URI;
32  import java.nio.ByteBuffer;
33  import java.nio.CharBuffer;
34  import java.nio.charset.Charset;
35  import java.util.ArrayList;
36  import java.util.BitSet;
37  import java.util.Collections;
38  import java.util.List;
39  import java.util.Scanner;
40  
41  import org.apache.http.Consts;
42  import org.apache.http.Header;
43  import org.apache.http.HeaderElement;
44  import org.apache.http.HttpEntity;
45  import org.apache.http.NameValuePair;
46  import org.apache.http.annotation.Immutable;
47  import org.apache.http.entity.ContentType;
48  import org.apache.http.message.BasicHeaderValueParser;
49  import org.apache.http.message.BasicNameValuePair;
50  import org.apache.http.message.ParserCursor;
51  import org.apache.http.protocol.HTTP;
52  import org.apache.http.util.CharArrayBuffer;
53  import org.apache.http.util.EntityUtils;
54  
55  /**
56   * A collection of utilities for encoding URLs.
57   *
58   * @since 4.0
59   */
60  @Immutable
61  public class URLEncodedUtils {
62  
63      /**
64       * The default HTML form content type.
65       */
66      public static final String CONTENT_TYPE = "application/x-www-form-urlencoded";
67  
68      private static final char QP_SEP_A = '&';
69      private static final char QP_SEP_S = ';';
70      private static final String NAME_VALUE_SEPARATOR = "=";
71  
72      /**
73       * Returns a list of {@link NameValuePair NameValuePairs} as built from the URI's query portion. For example, a URI
74       * of http://example.org/path/to/file?a=1&b=2&c=3 would return a list of three NameValuePairs, one for a=1, one for
75       * b=2, and one for c=3. By convention, {@code '&'} and {@code ';'} are accepted as parameter separators.
76       * <p>
77       * This is typically useful while parsing an HTTP PUT.
78       *
79       * This API is currently only used for testing.
80       *
81       * @param uri
82       *            URI to parse
83       * @param charset
84       *            Charset name to use while parsing the query
85       * @return a list of {@link NameValuePair} as built from the URI's query portion.
86       */
87      public static List <NameValuePair> parse(final URI uri, final String charset) {
88          final String query = uri.getRawQuery();
89          if (query != null && query.length() > 0) {
90              final List<NameValuePair> result = new ArrayList<NameValuePair>();
91              final Scanner scanner = new Scanner(query);
92              parse(result, scanner, QP_SEP_PATTERN, charset);
93              return result;
94          }
95          return Collections.emptyList();
96      }
97  
98      /**
99       * Returns a list of {@link NameValuePair NameValuePairs} as parsed from an {@link HttpEntity}. The encoding is
100      * taken from the entity's Content-Encoding header.
101      * <p>
102      * This is typically used while parsing an HTTP POST.
103      *
104      * @param entity
105      *            The entity to parse
106      * @return a list of {@link NameValuePair} as built from the URI's query portion.
107      * @throws IOException
108      *             If there was an exception getting the entity's data.
109      */
110     public static List <NameValuePair> parse(
111             final HttpEntity entity) throws IOException {
112         final ContentType contentType = ContentType.get(entity);
113         if (contentType != null && contentType.getMimeType().equalsIgnoreCase(CONTENT_TYPE)) {
114             final String content = EntityUtils.toString(entity, Consts.ASCII);
115             if (content != null && content.length() > 0) {
116                 Charset charset = contentType.getCharset();
117                 if (charset == null) {
118                     charset = HTTP.DEF_CONTENT_CHARSET;
119                 }
120                 return parse(content, charset, QP_SEPS);
121             }
122         }
123         return Collections.emptyList();
124     }
125 
126     /**
127      * Returns true if the entity's Content-Type header is
128      * <code>application/x-www-form-urlencoded</code>.
129      */
130     public static boolean isEncoded(final HttpEntity entity) {
131         final Header h = entity.getContentType();
132         if (h != null) {
133             final HeaderElement[] elems = h.getElements();
134             if (elems.length > 0) {
135                 final String contentType = elems[0].getName();
136                 return contentType.equalsIgnoreCase(CONTENT_TYPE);
137             }
138         }
139         return false;
140     }
141 
142     /**
143      * Adds all parameters within the Scanner to the list of <code>parameters</code>, as encoded by
144      * <code>encoding</code>. For example, a scanner containing the string <code>a=1&b=2&c=3</code> would add the
145      * {@link NameValuePair NameValuePairs} a=1, b=2, and c=3 to the list of parameters. By convention, {@code '&'} and
146      * {@code ';'} are accepted as parameter separators.
147      *
148      * @param parameters
149      *            List to add parameters to.
150      * @param scanner
151      *            Input that contains the parameters to parse.
152      * @param charset
153      *            Encoding to use when decoding the parameters.
154      */
155     public static void parse(
156             final List <NameValuePair> parameters,
157             final Scanner scanner,
158             final String charset) {
159         parse(parameters, scanner, QP_SEP_PATTERN, charset);
160     }
161 
162     /**
163      * Adds all parameters within the Scanner to the list of
164      * <code>parameters</code>, as encoded by <code>encoding</code>. For
165      * example, a scanner containing the string <code>a=1&b=2&c=3</code> would
166      * add the {@link NameValuePair NameValuePairs} a=1, b=2, and c=3 to the
167      * list of parameters.
168      *
169      * @param parameters
170      *            List to add parameters to.
171      * @param scanner
172      *            Input that contains the parameters to parse.
173      * @param parameterSepartorPattern
174      *            The Pattern string for parameter separators, by convention {@code "[&;]"}
175      * @param charset
176      *            Encoding to use when decoding the parameters.
177      */
178     public static void parse(
179             final List <NameValuePair> parameters,
180             final Scanner scanner,
181             final String parameterSepartorPattern,
182             final String charset) {
183         scanner.useDelimiter(parameterSepartorPattern);
184         while (scanner.hasNext()) {
185             String name = null;
186             String value = null;
187             final String token = scanner.next();
188             final int i = token.indexOf(NAME_VALUE_SEPARATOR);
189             if (i != -1) {
190                 name = decodeFormFields(token.substring(0, i).trim(), charset);
191                 value = decodeFormFields(token.substring(i + 1).trim(), charset);
192             } else {
193                 name = decodeFormFields(token.trim(), charset);
194             }
195             parameters.add(new BasicNameValuePair(name, value));
196         }
197     }
198 
199     /**
200      * Query parameter separators.
201      */
202     private static final char[] QP_SEPS = new char[] { QP_SEP_A, QP_SEP_S };
203 
204     /**
205      * Query parameter separator pattern.
206      */
207     private static final String QP_SEP_PATTERN = "[" + new String(QP_SEPS) + "]";
208 
209     /**
210      * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using the given character
211      * encoding. By convention, {@code '&'} and {@code ';'} are accepted as parameter separators.
212      *
213      * @param s
214      *            text to parse.
215      * @param charset
216      *            Encoding to use when decoding the parameters.
217      * @return a list of {@link NameValuePair} as built from the URI's query portion.
218      *
219      * @since 4.2
220      */
221     public static List<NameValuePair> parse(final String s, final Charset charset) {
222         return parse(s, charset, QP_SEPS);
223     }
224 
225     /**
226      * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using the given character
227      * encoding.
228      *
229      * @param s
230      *            text to parse.
231      * @param charset
232      *            Encoding to use when decoding the parameters.
233      * @param parameterSeparator
234      *            The characters used to separate parameters, by convention, {@code '&'} and {@code ';'}.
235      * @return a list of {@link NameValuePair} as built from the URI's query portion.
236      *
237      * @since 4.3
238      */
239     public static List<NameValuePair> parse(final String s, final Charset charset, final char... parameterSeparator) {
240         if (s == null) {
241             return Collections.emptyList();
242         }
243         final BasicHeaderValueParser parser = BasicHeaderValueParser.INSTANCE;
244         final CharArrayBuffer buffer = new CharArrayBuffer(s.length());
245         buffer.append(s);
246         final ParserCursor cursor = new ParserCursor(0, buffer.length());
247         final List<NameValuePair> list = new ArrayList<NameValuePair>();
248         while (!cursor.atEnd()) {
249             final NameValuePair nvp = parser.parseNameValuePair(buffer, cursor, parameterSeparator);
250             if (nvp.getName().length() > 0) {
251                 list.add(new BasicNameValuePair(
252                         decodeFormFields(nvp.getName(), charset),
253                         decodeFormFields(nvp.getValue(), charset)));
254             }
255         }
256         return list;
257     }
258 
259     /**
260      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
261      * list of parameters in an HTTP PUT or HTTP POST.
262      *
263      * @param parameters  The parameters to include.
264      * @param charset The encoding to use.
265      * @return An {@code application/x-www-form-urlencoded} string
266      */
267     public static String format(
268             final List <? extends NameValuePair> parameters,
269             final String charset) {
270         return format(parameters, QP_SEP_A, charset);
271     }
272 
273     /**
274      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
275      * list of parameters in an HTTP PUT or HTTP POST.
276      *
277      * @param parameters  The parameters to include.
278      * @param parameterSeparator The parameter separator, by convention, {@code '&'} or {@code ';'}.
279      * @param charset The encoding to use.
280      * @return An {@code application/x-www-form-urlencoded} string
281      *
282      * @since 4.3
283      */
284     public static String format(
285             final List <? extends NameValuePair> parameters,
286             final char parameterSeparator,
287             final String charset) {
288         final StringBuilder result = new StringBuilder();
289         for (final NameValuePair parameter : parameters) {
290             final String encodedName = encodeFormFields(parameter.getName(), charset);
291             final String encodedValue = encodeFormFields(parameter.getValue(), charset);
292             if (result.length() > 0) {
293                 result.append(parameterSeparator);
294             }
295             result.append(encodedName);
296             if (encodedValue != null) {
297                 result.append(NAME_VALUE_SEPARATOR);
298                 result.append(encodedValue);
299             }
300         }
301         return result.toString();
302     }
303 
304     /**
305      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
306      * list of parameters in an HTTP PUT or HTTP POST.
307      *
308      * @param parameters  The parameters to include.
309      * @param charset The encoding to use.
310      * @return An {@code application/x-www-form-urlencoded} string
311      *
312      * @since 4.2
313      */
314     public static String format(
315             final Iterable<? extends NameValuePair> parameters,
316             final Charset charset) {
317         return format(parameters, QP_SEP_A, charset);
318     }
319 
320     /**
321      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
322      * list of parameters in an HTTP PUT or HTTP POST.
323      *
324      * @param parameters  The parameters to include.
325      * @param parameterSeparator The parameter separator, by convention, {@code '&'} or {@code ';'}.
326      * @param charset The encoding to use.
327      * @return An {@code application/x-www-form-urlencoded} string
328      *
329      * @since 4.3
330      */
331     public static String format(
332             final Iterable<? extends NameValuePair> parameters,
333             final char parameterSeparator,
334             final Charset charset) {
335         final StringBuilder result = new StringBuilder();
336         for (final NameValuePair parameter : parameters) {
337             final String encodedName = encodeFormFields(parameter.getName(), charset);
338             final String encodedValue = encodeFormFields(parameter.getValue(), charset);
339             if (result.length() > 0) {
340                 result.append(parameterSeparator);
341             }
342             result.append(encodedName);
343             if (encodedValue != null) {
344                 result.append(NAME_VALUE_SEPARATOR);
345                 result.append(encodedValue);
346             }
347         }
348         return result.toString();
349     }
350 
351     /**
352      * Unreserved characters, i.e. alphanumeric, plus: {@code _ - ! . ~ ' ( ) *}
353      * <p>
354      *  This list is the same as the {@code unreserved} list in
355      *  <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
356      */
357     private static final BitSet UNRESERVED   = new BitSet(256);
358     /**
359      * Punctuation characters: , ; : $ & + =
360      * <p>
361      * These are the additional characters allowed by userinfo.
362      */
363     private static final BitSet PUNCT        = new BitSet(256);
364     /** Characters which are safe to use in userinfo,
365      * i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation */
366     private static final BitSet USERINFO     = new BitSet(256);
367     /** Characters which are safe to use in a path,
368      * i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation plus / @ */
369     private static final BitSet PATHSAFE     = new BitSet(256);
370     /** Characters which are safe to use in a query or a fragment,
371      * i.e. {@link #RESERVED} plus {@link #UNRESERVED} */
372     private static final BitSet URIC     = new BitSet(256);
373 
374     /**
375      * Reserved characters, i.e. {@code ;/?:@&=+$,[]}
376      * <p>
377      *  This list is the same as the {@code reserved} list in
378      *  <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
379      *  as augmented by
380      *  <a href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>
381      */
382     private static final BitSet RESERVED     = new BitSet(256);
383 
384 
385     /**
386      * Safe characters for x-www-form-urlencoded data, as per java.net.URLEncoder and browser behaviour,
387      * i.e. alphanumeric plus {@code "-", "_", ".", "*"}
388      */
389     private static final BitSet URLENCODER   = new BitSet(256);
390 
391     static {
392         // unreserved chars
393         // alpha characters
394         for (int i = 'a'; i <= 'z'; i++) {
395             UNRESERVED.set(i);
396         }
397         for (int i = 'A'; i <= 'Z'; i++) {
398             UNRESERVED.set(i);
399         }
400         // numeric characters
401         for (int i = '0'; i <= '9'; i++) {
402             UNRESERVED.set(i);
403         }
404         UNRESERVED.set('_'); // these are the charactes of the "mark" list
405         UNRESERVED.set('-');
406         UNRESERVED.set('.');
407         UNRESERVED.set('*');
408         URLENCODER.or(UNRESERVED); // skip remaining unreserved characters
409         UNRESERVED.set('!');
410         UNRESERVED.set('~');
411         UNRESERVED.set('\'');
412         UNRESERVED.set('(');
413         UNRESERVED.set(')');
414         // punct chars
415         PUNCT.set(',');
416         PUNCT.set(';');
417         PUNCT.set(':');
418         PUNCT.set('$');
419         PUNCT.set('&');
420         PUNCT.set('+');
421         PUNCT.set('=');
422         // Safe for userinfo
423         USERINFO.or(UNRESERVED);
424         USERINFO.or(PUNCT);
425 
426         // URL path safe
427         PATHSAFE.or(UNRESERVED);
428         PATHSAFE.set('/'); // segment separator
429         PATHSAFE.set(';'); // param separator
430         PATHSAFE.set(':'); // rest as per list in 2396, i.e. : @ & = + $ ,
431         PATHSAFE.set('@');
432         PATHSAFE.set('&');
433         PATHSAFE.set('=');
434         PATHSAFE.set('+');
435         PATHSAFE.set('$');
436         PATHSAFE.set(',');
437 
438         RESERVED.set(';');
439         RESERVED.set('/');
440         RESERVED.set('?');
441         RESERVED.set(':');
442         RESERVED.set('@');
443         RESERVED.set('&');
444         RESERVED.set('=');
445         RESERVED.set('+');
446         RESERVED.set('$');
447         RESERVED.set(',');
448         RESERVED.set('['); // added by RFC 2732
449         RESERVED.set(']'); // added by RFC 2732
450 
451         URIC.or(RESERVED);
452         URIC.or(UNRESERVED);
453     }
454 
455     private static final int RADIX = 16;
456 
457     private static String urlEncode(
458             final String content,
459             final Charset charset,
460             final BitSet safechars,
461             final boolean blankAsPlus) {
462         if (content == null) {
463             return null;
464         }
465         final StringBuilder buf = new StringBuilder();
466         final ByteBuffer bb = charset.encode(content);
467         while (bb.hasRemaining()) {
468             final int b = bb.get() & 0xff;
469             if (safechars.get(b)) {
470                 buf.append((char) b);
471             } else if (blankAsPlus && b == ' ') {
472                 buf.append('+');
473             } else {
474                 buf.append("%");
475                 final char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, RADIX));
476                 final char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, RADIX));
477                 buf.append(hex1);
478                 buf.append(hex2);
479             }
480         }
481         return buf.toString();
482     }
483 
484     /**
485      * Decode/unescape a portion of a URL, to use with the query part ensure {@code plusAsBlank} is true.
486      *
487      * @param content the portion to decode
488      * @param charset the charset to use
489      * @param plusAsBlank if {@code true}, then convert '+' to space (e.g. for www-url-form-encoded content), otherwise leave as is.
490      * @return encoded string
491      */
492     private static String urlDecode(
493             final String content,
494             final Charset charset,
495             final boolean plusAsBlank) {
496         if (content == null) {
497             return null;
498         }
499         final ByteBuffer bb = ByteBuffer.allocate(content.length());
500         final CharBuffer cb = CharBuffer.wrap(content);
501         while (cb.hasRemaining()) {
502             final char c = cb.get();
503             if (c == '%' && cb.remaining() >= 2) {
504                 final char uc = cb.get();
505                 final char lc = cb.get();
506                 final int u = Character.digit(uc, 16);
507                 final int l = Character.digit(lc, 16);
508                 if (u != -1 && l != -1) {
509                     bb.put((byte) ((u << 4) + l));
510                 } else {
511                     bb.put((byte) '%');
512                     bb.put((byte) uc);
513                     bb.put((byte) lc);
514                 }
515             } else if (plusAsBlank && c == '+') {
516                 bb.put((byte) ' ');
517             } else {
518                 bb.put((byte) c);
519             }
520         }
521         bb.flip();
522         return charset.decode(bb).toString();
523     }
524 
525     /**
526      * Decode/unescape www-url-form-encoded content.
527      *
528      * @param content the content to decode, will decode '+' as space
529      * @param charset the charset to use
530      * @return encoded string
531      */
532     private static String decodeFormFields (final String content, final String charset) {
533         if (content == null) {
534             return null;
535         }
536         return urlDecode(content, charset != null ? Charset.forName(charset) : Consts.UTF_8, true);
537     }
538 
539     /**
540      * Decode/unescape www-url-form-encoded content.
541      *
542      * @param content the content to decode, will decode '+' as space
543      * @param charset the charset to use
544      * @return encoded string
545      */
546     private static String decodeFormFields (final String content, final Charset charset) {
547         if (content == null) {
548             return null;
549         }
550         return urlDecode(content, charset != null ? charset : Consts.UTF_8, true);
551     }
552 
553     /**
554      * Encode/escape www-url-form-encoded content.
555      * <p>
556      * Uses the {@link #URLENCODER} set of characters, rather than
557      * the {@link #UNRSERVED} set; this is for compatibilty with previous
558      * releases, URLEncoder.encode() and most browsers.
559      *
560      * @param content the content to encode, will convert space to '+'
561      * @param charset the charset to use
562      * @return encoded string
563      */
564     private static String encodeFormFields(final String content, final String charset) {
565         if (content == null) {
566             return null;
567         }
568         return urlEncode(content, charset != null ? Charset.forName(charset) : Consts.UTF_8, URLENCODER, true);
569     }
570 
571     /**
572      * Encode/escape www-url-form-encoded content.
573      * <p>
574      * Uses the {@link #URLENCODER} set of characters, rather than
575      * the {@link #UNRSERVED} set; this is for compatibilty with previous
576      * releases, URLEncoder.encode() and most browsers.
577      *
578      * @param content the content to encode, will convert space to '+'
579      * @param charset the charset to use
580      * @return encoded string
581      */
582     private static String encodeFormFields (final String content, final Charset charset) {
583         if (content == null) {
584             return null;
585         }
586         return urlEncode(content, charset != null ? charset : Consts.UTF_8, URLENCODER, true);
587     }
588 
589     /**
590      * Encode a String using the {@link #USERINFO} set of characters.
591      * <p>
592      * Used by URIBuilder to encode the userinfo segment.
593      *
594      * @param content the string to encode, does not convert space to '+'
595      * @param charset the charset to use
596      * @return the encoded string
597      */
598     static String encUserInfo(final String content, final Charset charset) {
599         return urlEncode(content, charset, USERINFO, false);
600     }
601 
602     /**
603      * Encode a String using the {@link #URIC} set of characters.
604      * <p>
605      * Used by URIBuilder to encode the query and fragment segments.
606      *
607      * @param content the string to encode, does not convert space to '+'
608      * @param charset the charset to use
609      * @return the encoded string
610      */
611     static String encUric(final String content, final Charset charset) {
612         return urlEncode(content, charset, URIC, false);
613     }
614 
615     /**
616      * Encode a String using the {@link #PATHSAFE} set of characters.
617      * <p>
618      * Used by URIBuilder to encode path segments.
619      *
620      * @param content the string to encode, does not convert space to '+'
621      * @param charset the charset to use
622      * @return the encoded string
623      */
624     static String encPath(final String content, final Charset charset) {
625         return urlEncode(content, charset, PATHSAFE, false);
626     }
627 
628 }