View Javadoc

1   /*
2    * ====================================================================
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *   http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing,
14   * software distributed under the License is distributed on an
15   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16   * KIND, either express or implied.  See the License for the
17   * specific language governing permissions and limitations
18   * under the License.
19   * ====================================================================
20   *
21   * This software consists of voluntary contributions made by many
22   * individuals on behalf of the Apache Software Foundation.  For more
23   * information on the Apache Software Foundation, please see
24   * <http://www.apache.org/>.
25   *
26   */
27  
28  package org.apache.http.client.utils;
29  
30  import java.io.IOException;
31  import java.net.URI;
32  import java.nio.ByteBuffer;
33  import java.nio.CharBuffer;
34  import java.nio.charset.Charset;
35  import java.util.ArrayList;
36  import java.util.BitSet;
37  import java.util.Collections;
38  import java.util.List;
39  import java.util.Scanner;
40  
41  import org.apache.http.Consts;
42  import org.apache.http.Header;
43  import org.apache.http.HeaderElement;
44  import org.apache.http.HttpEntity;
45  import org.apache.http.NameValuePair;
46  import org.apache.http.annotation.Immutable;
47  import org.apache.http.entity.ContentType;
48  import org.apache.http.message.BasicHeaderValueParser;
49  import org.apache.http.message.BasicNameValuePair;
50  import org.apache.http.message.ParserCursor;
51  import org.apache.http.protocol.HTTP;
52  import org.apache.http.util.CharArrayBuffer;
53  import org.apache.http.util.EntityUtils;
54  
55  /**
56   * A collection of utilities for encoding URLs.
57   *
58   * @since 4.0
59   */
60  @Immutable
61  public class URLEncodedUtils {
62  
63      /**
64       * The default HTML form content type.
65       */
66      public static final String CONTENT_TYPE = "application/x-www-form-urlencoded";
67  
68      private static final char QP_SEP_A = '&';
69      private static final char QP_SEP_S = ';';
70      private static final String NAME_VALUE_SEPARATOR = "=";
71  
72      /**
73       * Returns a list of {@link NameValuePair NameValuePairs} as built from the URI's query portion. For example, a URI
74       * of http://example.org/path/to/file?a=1&b=2&c=3 would return a list of three NameValuePairs, one for a=1, one for
75       * b=2, and one for c=3. By convention, {@code '&'} and {@code ';'} are accepted as parameter separators.
76       * <p>
77       * This is typically useful while parsing an HTTP PUT.
78       *
79       * This API is currently only used for testing.
80       *
81       * @param uri
82       *            URI to parse
83       * @param charset
84       *            Charset name to use while parsing the query
85       * @return a list of {@link NameValuePair} as built from the URI's query portion.
86       */
87      public static List <NameValuePair> parse(final URI uri, final String charset) {
88          final String query = uri.getRawQuery();
89          if (query != null && query.length() > 0) {
90              final List<NameValuePair> result = new ArrayList<NameValuePair>();
91              final Scanner scanner = new Scanner(query);
92              parse(result, scanner, QP_SEP_PATTERN, charset);
93              return result;
94          }
95          return Collections.emptyList();
96      }
97  
98      /**
99       * Returns a list of {@link NameValuePair NameValuePairs} as parsed from an {@link HttpEntity}. The encoding is
100      * taken from the entity's Content-Encoding header.
101      * <p>
102      * This is typically used while parsing an HTTP POST.
103      *
104      * @param entity
105      *            The entity to parse
106      * @return a list of {@link NameValuePair} as built from the URI's query portion.
107      * @throws IOException
108      *             If there was an exception getting the entity's data.
109      */
110     public static List <NameValuePair> parse(
111             final HttpEntity entity) throws IOException {
112         final ContentType contentType = ContentType.get(entity);
113         if (contentType != null && contentType.getMimeType().equalsIgnoreCase(CONTENT_TYPE)) {
114             final String content = EntityUtils.toString(entity, Consts.ASCII);
115             if (content != null && content.length() > 0) {
116                 Charset charset = contentType.getCharset();
117                 if (charset == null) {
118                     charset = HTTP.DEF_CONTENT_CHARSET;
119                 }
120                 return parse(content, charset, QP_SEPS);
121             }
122         }
123         return Collections.emptyList();
124     }
125 
126     /**
127      * Returns true if the entity's Content-Type header is
128      * <code>application/x-www-form-urlencoded</code>.
129      */
130     public static boolean isEncoded(final HttpEntity entity) {
131         final Header h = entity.getContentType();
132         if (h != null) {
133             final HeaderElement[] elems = h.getElements();
134             if (elems.length > 0) {
135                 final String contentType = elems[0].getName();
136                 return contentType.equalsIgnoreCase(CONTENT_TYPE);
137             } else {
138                 return false;
139             }
140         } else {
141             return false;
142         }
143     }
144 
145     /**
146      * Adds all parameters within the Scanner to the list of <code>parameters</code>, as encoded by
147      * <code>encoding</code>. For example, a scanner containing the string <code>a=1&b=2&c=3</code> would add the
148      * {@link NameValuePair NameValuePairs} a=1, b=2, and c=3 to the list of parameters. By convention, {@code '&'} and
149      * {@code ';'} are accepted as parameter separators.
150      *
151      * @param parameters
152      *            List to add parameters to.
153      * @param scanner
154      *            Input that contains the parameters to parse.
155      * @param charset
156      *            Encoding to use when decoding the parameters.
157      */
158     public static void parse(
159             final List <NameValuePair> parameters,
160             final Scanner scanner,
161             final String charset) {
162         parse(parameters, scanner, QP_SEP_PATTERN, charset);
163     }
164 
165     /**
166      * Adds all parameters within the Scanner to the list of
167      * <code>parameters</code>, as encoded by <code>encoding</code>. For
168      * example, a scanner containing the string <code>a=1&b=2&c=3</code> would
169      * add the {@link NameValuePair NameValuePairs} a=1, b=2, and c=3 to the
170      * list of parameters.
171      *
172      * @param parameters
173      *            List to add parameters to.
174      * @param scanner
175      *            Input that contains the parameters to parse.
176      * @param parameterSepartorPattern
177      *            The Pattern string for parameter separators, by convention {@code "[&;]"}
178      * @param charset
179      *            Encoding to use when decoding the parameters.
180      */
181     public static void parse(
182             final List <NameValuePair> parameters,
183             final Scanner scanner,
184             final String parameterSepartorPattern,
185             final String charset) {
186         scanner.useDelimiter(parameterSepartorPattern);
187         while (scanner.hasNext()) {
188             String name = null;
189             String value = null;
190             final String token = scanner.next();
191             final int i = token.indexOf(NAME_VALUE_SEPARATOR);
192             if (i != -1) {
193                 name = decodeFormFields(token.substring(0, i).trim(), charset);
194                 value = decodeFormFields(token.substring(i + 1).trim(), charset);
195             } else {
196                 name = decodeFormFields(token.trim(), charset);
197             }
198             parameters.add(new BasicNameValuePair(name, value));
199         }
200     }
201 
202     /**
203      * Query parameter separators.
204      */
205     private static final char[] QP_SEPS = new char[] { QP_SEP_A, QP_SEP_S };
206 
207     /**
208      * Query parameter separator pattern.
209      */
210     private static final String QP_SEP_PATTERN = "[" + new String(QP_SEPS) + "]";
211 
212     /**
213      * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using the given character
214      * encoding. By convention, {@code '&'} and {@code ';'} are accepted as parameter separators.
215      *
216      * @param s
217      *            text to parse.
218      * @param charset
219      *            Encoding to use when decoding the parameters.
220      * @return a list of {@link NameValuePair} as built from the URI's query portion.
221      *
222      * @since 4.2
223      */
224     public static List<NameValuePair> parse(final String s, final Charset charset) {
225         return parse(s, charset, QP_SEPS);
226     }
227 
228     /**
229      * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using the given character
230      * encoding.
231      *
232      * @param s
233      *            text to parse.
234      * @param charset
235      *            Encoding to use when decoding the parameters.
236      * @param parameterSeparator
237      *            The characters used to separate parameters, by convention, {@code '&'} and {@code ';'}.
238      * @return a list of {@link NameValuePair} as built from the URI's query portion.
239      *
240      * @since 4.3
241      */
242     public static List<NameValuePair> parse(final String s, final Charset charset, final char... parameterSeparator) {
243         if (s == null) {
244             return Collections.emptyList();
245         }
246         final BasicHeaderValueParser parser = BasicHeaderValueParser.INSTANCE;
247         final CharArrayBuffer buffer = new CharArrayBuffer(s.length());
248         buffer.append(s);
249         final ParserCursor cursor = new ParserCursor(0, buffer.length());
250         final List<NameValuePair> list = new ArrayList<NameValuePair>();
251         while (!cursor.atEnd()) {
252             final NameValuePair nvp = parser.parseNameValuePair(buffer, cursor, parameterSeparator);
253             if (nvp.getName().length() > 0) {
254                 list.add(new BasicNameValuePair(
255                         decodeFormFields(nvp.getName(), charset),
256                         decodeFormFields(nvp.getValue(), charset)));
257             }
258         }
259         return list;
260     }
261 
262     /**
263      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
264      * list of parameters in an HTTP PUT or HTTP POST.
265      *
266      * @param parameters  The parameters to include.
267      * @param charset The encoding to use.
268      * @return An {@code application/x-www-form-urlencoded} string
269      */
270     public static String format(
271             final List <? extends NameValuePair> parameters,
272             final String charset) {
273         return format(parameters, QP_SEP_A, charset);
274     }
275 
276     /**
277      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
278      * list of parameters in an HTTP PUT or HTTP POST.
279      *
280      * @param parameters  The parameters to include.
281      * @param parameterSeparator The parameter separator, by convention, {@code '&'} or {@code ';'}.
282      * @param charset The encoding to use.
283      * @return An {@code application/x-www-form-urlencoded} string
284      *
285      * @since 4.3
286      */
287     public static String format(
288             final List <? extends NameValuePair> parameters,
289             final char parameterSeparator,
290             final String charset) {
291         final StringBuilder result = new StringBuilder();
292         for (final NameValuePair parameter : parameters) {
293             final String encodedName = encodeFormFields(parameter.getName(), charset);
294             final String encodedValue = encodeFormFields(parameter.getValue(), charset);
295             if (result.length() > 0) {
296                 result.append(parameterSeparator);
297             }
298             result.append(encodedName);
299             if (encodedValue != null) {
300                 result.append(NAME_VALUE_SEPARATOR);
301                 result.append(encodedValue);
302             }
303         }
304         return result.toString();
305     }
306 
307     /**
308      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
309      * list of parameters in an HTTP PUT or HTTP POST.
310      *
311      * @param parameters  The parameters to include.
312      * @param charset The encoding to use.
313      * @return An {@code application/x-www-form-urlencoded} string
314      *
315      * @since 4.2
316      */
317     public static String format(
318             final Iterable<? extends NameValuePair> parameters,
319             final Charset charset) {
320         return format(parameters, QP_SEP_A, charset);
321     }
322 
323     /**
324      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
325      * list of parameters in an HTTP PUT or HTTP POST.
326      *
327      * @param parameters  The parameters to include.
328      * @param parameterSeparator The parameter separator, by convention, {@code '&'} or {@code ';'}.
329      * @param charset The encoding to use.
330      * @return An {@code application/x-www-form-urlencoded} string
331      *
332      * @since 4.3
333      */
334     public static String format(
335             final Iterable<? extends NameValuePair> parameters,
336             final char parameterSeparator,
337             final Charset charset) {
338         final StringBuilder result = new StringBuilder();
339         for (final NameValuePair parameter : parameters) {
340             final String encodedName = encodeFormFields(parameter.getName(), charset);
341             final String encodedValue = encodeFormFields(parameter.getValue(), charset);
342             if (result.length() > 0) {
343                 result.append(parameterSeparator);
344             }
345             result.append(encodedName);
346             if (encodedValue != null) {
347                 result.append(NAME_VALUE_SEPARATOR);
348                 result.append(encodedValue);
349             }
350         }
351         return result.toString();
352     }
353 
354     /**
355      * Unreserved characters, i.e. alphanumeric, plus: {@code _ - ! . ~ ' ( ) *}
356      * <p>
357      *  This list is the same as the {@code unreserved} list in
358      *  <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
359      */
360     private static final BitSet UNRESERVED   = new BitSet(256);
361     /**
362      * Punctuation characters: , ; : $ & + =
363      * <p>
364      * These are the additional characters allowed by userinfo.
365      */
366     private static final BitSet PUNCT        = new BitSet(256);
367     /** Characters which are safe to use in userinfo,
368      * i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation */
369     private static final BitSet USERINFO     = new BitSet(256);
370     /** Characters which are safe to use in a path,
371      * i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation plus / @ */
372     private static final BitSet PATHSAFE     = new BitSet(256);
373     /** Characters which are safe to use in a query or a fragment,
374      * i.e. {@link #RESERVED} plus {@link #UNRESERVED} */
375     private static final BitSet URIC     = new BitSet(256);
376 
377     /**
378      * Reserved characters, i.e. {@code ;/?:@&=+$,[]}
379      * <p>
380      *  This list is the same as the {@code reserved} list in
381      *  <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
382      *  as augmented by
383      *  <a href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>
384      */
385     private static final BitSet RESERVED     = new BitSet(256);
386 
387 
388     /**
389      * Safe characters for x-www-form-urlencoded data, as per java.net.URLEncoder and browser behaviour,
390      * i.e. alphanumeric plus {@code "-", "_", ".", "*"}
391      */
392     private static final BitSet URLENCODER   = new BitSet(256);
393 
394     static {
395         // unreserved chars
396         // alpha characters
397         for (int i = 'a'; i <= 'z'; i++) {
398             UNRESERVED.set(i);
399         }
400         for (int i = 'A'; i <= 'Z'; i++) {
401             UNRESERVED.set(i);
402         }
403         // numeric characters
404         for (int i = '0'; i <= '9'; i++) {
405             UNRESERVED.set(i);
406         }
407         UNRESERVED.set('_'); // these are the charactes of the "mark" list
408         UNRESERVED.set('-');
409         UNRESERVED.set('.');
410         UNRESERVED.set('*');
411         URLENCODER.or(UNRESERVED); // skip remaining unreserved characters
412         UNRESERVED.set('!');
413         UNRESERVED.set('~');
414         UNRESERVED.set('\'');
415         UNRESERVED.set('(');
416         UNRESERVED.set(')');
417         // punct chars
418         PUNCT.set(',');
419         PUNCT.set(';');
420         PUNCT.set(':');
421         PUNCT.set('$');
422         PUNCT.set('&');
423         PUNCT.set('+');
424         PUNCT.set('=');
425         // Safe for userinfo
426         USERINFO.or(UNRESERVED);
427         USERINFO.or(PUNCT);
428 
429         // URL path safe
430         PATHSAFE.or(UNRESERVED);
431         PATHSAFE.set('/'); // segment separator
432         PATHSAFE.set(';'); // param separator
433         PATHSAFE.set(':'); // rest as per list in 2396, i.e. : @ & = + $ ,
434         PATHSAFE.set('@');
435         PATHSAFE.set('&');
436         PATHSAFE.set('=');
437         PATHSAFE.set('+');
438         PATHSAFE.set('$');
439         PATHSAFE.set(',');
440 
441         RESERVED.set(';');
442         RESERVED.set('/');
443         RESERVED.set('?');
444         RESERVED.set(':');
445         RESERVED.set('@');
446         RESERVED.set('&');
447         RESERVED.set('=');
448         RESERVED.set('+');
449         RESERVED.set('$');
450         RESERVED.set(',');
451         RESERVED.set('['); // added by RFC 2732
452         RESERVED.set(']'); // added by RFC 2732
453 
454         URIC.or(RESERVED);
455         URIC.or(UNRESERVED);
456     }
457 
458     private static final int RADIX = 16;
459 
460     private static String urlEncode(
461             final String content,
462             final Charset charset,
463             final BitSet safechars,
464             final boolean blankAsPlus) {
465         if (content == null) {
466             return null;
467         }
468         final StringBuilder buf = new StringBuilder();
469         final ByteBuffer bb = charset.encode(content);
470         while (bb.hasRemaining()) {
471             final int b = bb.get() & 0xff;
472             if (safechars.get(b)) {
473                 buf.append((char) b);
474             } else if (blankAsPlus && b == ' ') {
475                 buf.append('+');
476             } else {
477                 buf.append("%");
478                 final char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, RADIX));
479                 final char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, RADIX));
480                 buf.append(hex1);
481                 buf.append(hex2);
482             }
483         }
484         return buf.toString();
485     }
486 
487     /**
488      * Decode/unescape a portion of a URL, to use with the query part ensure {@code plusAsBlank} is true.
489      *
490      * @param content the portion to decode
491      * @param charset the charset to use
492      * @param plusAsBlank if {@code true}, then convert '+' to space (e.g. for www-url-form-encoded content), otherwise leave as is.
493      * @return encoded string
494      */
495     private static String urlDecode(
496             final String content,
497             final Charset charset,
498             final boolean plusAsBlank) {
499         if (content == null) {
500             return null;
501         }
502         final ByteBuffer bb = ByteBuffer.allocate(content.length());
503         final CharBuffer cb = CharBuffer.wrap(content);
504         while (cb.hasRemaining()) {
505             final char c = cb.get();
506             if (c == '%' && cb.remaining() >= 2) {
507                 final char uc = cb.get();
508                 final char lc = cb.get();
509                 final int u = Character.digit(uc, 16);
510                 final int l = Character.digit(lc, 16);
511                 if (u != -1 && l != -1) {
512                     bb.put((byte) ((u << 4) + l));
513                 } else {
514                     bb.put((byte) '%');
515                     bb.put((byte) uc);
516                     bb.put((byte) lc);
517                 }
518             } else if (plusAsBlank && c == '+') {
519                 bb.put((byte) ' ');
520             } else {
521                 bb.put((byte) c);
522             }
523         }
524         bb.flip();
525         return charset.decode(bb).toString();
526     }
527 
528     /**
529      * Decode/unescape www-url-form-encoded content.
530      *
531      * @param content the content to decode, will decode '+' as space
532      * @param charset the charset to use
533      * @return encoded string
534      */
535     private static String decodeFormFields (final String content, final String charset) {
536         if (content == null) {
537             return null;
538         }
539         return urlDecode(content, charset != null ? Charset.forName(charset) : Consts.UTF_8, true);
540     }
541 
542     /**
543      * Decode/unescape www-url-form-encoded content.
544      *
545      * @param content the content to decode, will decode '+' as space
546      * @param charset the charset to use
547      * @return encoded string
548      */
549     private static String decodeFormFields (final String content, final Charset charset) {
550         if (content == null) {
551             return null;
552         }
553         return urlDecode(content, charset != null ? charset : Consts.UTF_8, true);
554     }
555 
556     /**
557      * Encode/escape www-url-form-encoded content.
558      * <p>
559      * Uses the {@link #URLENCODER} set of characters, rather than
560      * the {@link #UNRSERVED} set; this is for compatibilty with previous
561      * releases, URLEncoder.encode() and most browsers.
562      *
563      * @param content the content to encode, will convert space to '+'
564      * @param charset the charset to use
565      * @return encoded string
566      */
567     private static String encodeFormFields(final String content, final String charset) {
568         if (content == null) {
569             return null;
570         }
571         return urlEncode(content, charset != null ? Charset.forName(charset) : Consts.UTF_8, URLENCODER, true);
572     }
573 
574     /**
575      * Encode/escape www-url-form-encoded content.
576      * <p>
577      * Uses the {@link #URLENCODER} set of characters, rather than
578      * the {@link #UNRSERVED} set; this is for compatibilty with previous
579      * releases, URLEncoder.encode() and most browsers.
580      *
581      * @param content the content to encode, will convert space to '+'
582      * @param charset the charset to use
583      * @return encoded string
584      */
585     private static String encodeFormFields (final String content, final Charset charset) {
586         if (content == null) {
587             return null;
588         }
589         return urlEncode(content, charset != null ? charset : Consts.UTF_8, URLENCODER, true);
590     }
591 
592     /**
593      * Encode a String using the {@link #USERINFO} set of characters.
594      * <p>
595      * Used by URIBuilder to encode the userinfo segment.
596      *
597      * @param content the string to encode, does not convert space to '+'
598      * @param charset the charset to use
599      * @return the encoded string
600      */
601     static String encUserInfo(final String content, final Charset charset) {
602         return urlEncode(content, charset, USERINFO, false);
603     }
604 
605     /**
606      * Encode a String using the {@link #URIC} set of characters.
607      * <p>
608      * Used by URIBuilder to encode the query and fragment segments.
609      *
610      * @param content the string to encode, does not convert space to '+'
611      * @param charset the charset to use
612      * @return the encoded string
613      */
614     static String encUric(final String content, final Charset charset) {
615         return urlEncode(content, charset, URIC, false);
616     }
617 
618     /**
619      * Encode a String using the {@link #PATHSAFE} set of characters.
620      * <p>
621      * Used by URIBuilder to encode path segments.
622      *
623      * @param content the string to encode, does not convert space to '+'
624      * @param charset the charset to use
625      * @return the encoded string
626      */
627     static String encPath(final String content, final Charset charset) {
628         return urlEncode(content, charset, PATHSAFE, false);
629     }
630 
631 }