View Javadoc

1   /*
2    * ====================================================================
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *   http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing,
14   * software distributed under the License is distributed on an
15   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16   * KIND, either express or implied.  See the License for the
17   * specific language governing permissions and limitations
18   * under the License.
19   * ====================================================================
20   *
21   * This software consists of voluntary contributions made by many
22   * individuals on behalf of the Apache Software Foundation.  For more
23   * information on the Apache Software Foundation, please see
24   * <http://www.apache.org/>.
25   *
26   */
27  
28  package org.apache.http.client.utils;
29  
30  import java.io.IOException;
31  import java.io.InputStream;
32  import java.io.InputStreamReader;
33  import java.io.Reader;
34  import java.net.URI;
35  import java.nio.ByteBuffer;
36  import java.nio.CharBuffer;
37  import java.nio.charset.Charset;
38  import java.util.ArrayList;
39  import java.util.BitSet;
40  import java.util.Collections;
41  import java.util.List;
42  import java.util.Scanner;
43  
44  import org.apache.http.Consts;
45  import org.apache.http.Header;
46  import org.apache.http.HeaderElement;
47  import org.apache.http.HttpEntity;
48  import org.apache.http.NameValuePair;
49  import org.apache.http.annotation.Immutable;
50  import org.apache.http.entity.ContentType;
51  import org.apache.http.message.BasicNameValuePair;
52  import org.apache.http.message.ParserCursor;
53  import org.apache.http.message.TokenParser;
54  import org.apache.http.protocol.HTTP;
55  import org.apache.http.util.Args;
56  import org.apache.http.util.CharArrayBuffer;
57  
58  /**
59   * A collection of utilities for encoding URLs.
60   *
61   * @since 4.0
62   */
63  @Immutable
64  public class URLEncodedUtils {
65  
    /**
     * The default HTML form content type.
     */
    public static final String CONTENT_TYPE = "application/x-www-form-urlencoded";

    /** Ampersand parameter separator; the separator used by default when formatting parameters. */
    private static final char QP_SEP_A = '&';
    /** Semicolon parameter separator; accepted in addition to {@code '&'} when parsing query strings. */
    private static final char QP_SEP_S = ';';
    /** Separator placed between a parameter name and its value. */
    private static final String NAME_VALUE_SEPARATOR = "=";
74  
75      /**
76       * Returns a list of {@link NameValuePair NameValuePairs} as built from the URI's query portion. For example, a URI
77       * of {@code http://example.org/path/to/file?a=1&b=2&c=3} would return a list of three NameValuePairs, one for a=1,
78       * one for b=2, and one for c=3. By convention, {@code '&'} and {@code ';'} are accepted as parameter separators.
79       * <p>
80       * This is typically useful while parsing an HTTP PUT.
81       *
82       * This API is currently only used for testing.
83       *
84       * @param uri
85       *        URI to parse
86       * @param charset
87       *        Charset name to use while parsing the query
88       * @return a list of {@link NameValuePair} as built from the URI's query portion.
89       */
90      public static List <NameValuePair> parse(final URI uri, final String charset) {
91          final String query = uri.getRawQuery();
92          if (query != null && !query.isEmpty()) {
93              return parse(query, Charset.forName(charset));
94          }
95          return Collections.emptyList();
96      }
97  
98      /**
99       * Returns a list of {@link NameValuePair NameValuePairs} as parsed from an {@link HttpEntity}.
100      * The encoding is taken from the entity's Content-Encoding header.
101      * <p>
102      * This is typically used while parsing an HTTP POST.
103      *
104      * @param entity
105      *            The entity to parse
106      * @return a list of {@link NameValuePair} as built from the URI's query portion.
107      * @throws IOException
108      *             If there was an exception getting the entity's data.
109      */
110     public static List <NameValuePair> parse(
111             final HttpEntity entity) throws IOException {
112         final ContentType contentType = ContentType.get(entity);
113         if (contentType == null || !contentType.getMimeType().equalsIgnoreCase(CONTENT_TYPE)) {
114             return Collections.emptyList();
115         }
116         final long len = entity.getContentLength();
117         Args.check(len <= Integer.MAX_VALUE, "HTTP entity is too large");
118         final Charset charset = contentType.getCharset() != null ? contentType.getCharset() : HTTP.DEF_CONTENT_CHARSET;
119         final InputStream instream = entity.getContent();
120         if (instream == null) {
121             return Collections.emptyList();
122         }
123         final CharArrayBuffer buf;
124         try {
125             buf = new CharArrayBuffer(len > 0 ? (int) len : 1024);
126             final Reader reader = new InputStreamReader(instream, charset);
127             final char[] tmp = new char[1024];
128             int l;
129             while((l = reader.read(tmp)) != -1) {
130                 buf.append(tmp, 0, l);
131             }
132 
133         } finally {
134             instream.close();
135         }
136         if (buf.length() == 0) {
137             return Collections.emptyList();
138         }
139         return parse(buf, charset, QP_SEP_A);
140     }
141 
142     /**
143      * Returns true if the entity's Content-Type header is
144      * {@code application/x-www-form-urlencoded}.
145      */
146     public static boolean isEncoded(final HttpEntity entity) {
147         final Header h = entity.getContentType();
148         if (h != null) {
149             final HeaderElement[] elems = h.getElements();
150             if (elems.length > 0) {
151                 final String contentType = elems[0].getName();
152                 return contentType.equalsIgnoreCase(CONTENT_TYPE);
153             }
154         }
155         return false;
156     }
157 
158     /**
159      * Adds all parameters within the Scanner to the list of {@code parameters}, as encoded by
160      * {@code encoding}. For example, a scanner containing the string {@code a=1&b=2&c=3} would add the
161      * {@link NameValuePair NameValuePairs} a=1, b=2, and c=3 to the list of parameters. By convention, {@code '&'} and
162      * {@code ';'} are accepted as parameter separators.
163      *
164      * @param parameters
165      *            List to add parameters to.
166      * @param scanner
167      *            Input that contains the parameters to parse.
168      * @param charset
169      *            Encoding to use when decoding the parameters.
170      *
171      * @deprecated (4.4) use {@link #parse(String, java.nio.charset.Charset)}
172      */
173     @Deprecated
174     public static void parse(
175             final List<NameValuePair> parameters,
176             final Scanner scanner,
177             final String charset) {
178         parse(parameters, scanner, "[" + QP_SEP_A + QP_SEP_S + "]", charset);
179     }
180 
181     /**
182      * Adds all parameters within the Scanner to the list of
183      * {@code parameters}, as encoded by {@code encoding}. For
184      * example, a scanner containing the string {@code a=1&b=2&c=3} would
185      * add the {@link NameValuePair NameValuePairs} a=1, b=2, and c=3 to the
186      * list of parameters.
187      *
188      * @param parameters
189      *            List to add parameters to.
190      * @param scanner
191      *            Input that contains the parameters to parse.
192      * @param parameterSepartorPattern
193      *            The Pattern string for parameter separators, by convention {@code "[&;]"}
194      * @param charset
195      *            Encoding to use when decoding the parameters.
196      *
197      * @deprecated (4.4) use {@link #parse(org.apache.http.util.CharArrayBuffer, java.nio.charset.Charset, char...)}
198      */
199     @Deprecated
200     public static void parse(
201             final List <NameValuePair> parameters,
202             final Scanner scanner,
203             final String parameterSepartorPattern,
204             final String charset) {
205         scanner.useDelimiter(parameterSepartorPattern);
206         while (scanner.hasNext()) {
207             final String name;
208             final String value;
209             final String token = scanner.next();
210             final int i = token.indexOf(NAME_VALUE_SEPARATOR);
211             if (i != -1) {
212                 name = decodeFormFields(token.substring(0, i).trim(), charset);
213                 value = decodeFormFields(token.substring(i + 1).trim(), charset);
214             } else {
215                 name = decodeFormFields(token.trim(), charset);
216                 value = null;
217             }
218             parameters.add(new BasicNameValuePair(name, value));
219         }
220     }
221 
222     /**
223      * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using the given character
224      * encoding. By convention, {@code '&'} and {@code ';'} are accepted as parameter separators.
225      *
226      * @param s
227      *            text to parse.
228      * @param charset
229      *            Encoding to use when decoding the parameters.
230      * @return a list of {@link NameValuePair} as built from the URI's query portion.
231      *
232      * @since 4.2
233      */
234     public static List<NameValuePair> parse(final String s, final Charset charset) {
235         final CharArrayBuffer buffer = new CharArrayBuffer(s.length());
236         buffer.append(s);
237         return parse(buffer, charset, QP_SEP_A, QP_SEP_S);
238     }
239 
240     /**
241      * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using the given character
242      * encoding.
243      *
244      * @param s
245      *            text to parse.
246      * @param charset
247      *            Encoding to use when decoding the parameters.
248      * @param separators
249      *            element separators.
250      * @return a list of {@link NameValuePair} as built from the URI's query portion.
251      *
252      * @since 4.3
253      */
254     public static List<NameValuePair> parse(final String s, final Charset charset, final char... separators) {
255         if (s == null) {
256             return Collections.emptyList();
257         }
258         final CharArrayBuffer buffer = new CharArrayBuffer(s.length());
259         buffer.append(s);
260         return parse(buffer, charset, separators);
261     }
262 
263     /**
264      * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using
265      * the given character encoding.
266      *
267      * @param buf
268      *            text to parse.
269      * @param charset
270      *            Encoding to use when decoding the parameters.
271      * @param separators
272      *            element separators.
273      * @return a list of {@link NameValuePair} as built from the URI's query portion.
274      *
275      * @since 4.4
276      */
277     public static List<NameValuePair> parse(
278             final CharArrayBuffer buf, final Charset charset, final char... separators) {
279         Args.notNull(buf, "Char array buffer");
280         final TokenParser tokenParser = TokenParser.INSTANCE;
281         final BitSet delimSet = new BitSet();
282         for (char separator: separators) {
283             delimSet.set(separator);
284         }
285         final ParserCursor cursor = new ParserCursor(0, buf.length());
286         final List<NameValuePair> list = new ArrayList<NameValuePair>();
287         while (!cursor.atEnd()) {
288             delimSet.set('=');
289             final String name = tokenParser.parseToken(buf, cursor, delimSet);
290             String value = null;
291             if (!cursor.atEnd()) {
292                 final int delim = buf.charAt(cursor.getPos());
293                 cursor.updatePos(cursor.getPos() + 1);
294                 if (delim == '=') {
295                     delimSet.clear('=');
296                     value = tokenParser.parseValue(buf, cursor, delimSet);
297                     if (!cursor.atEnd()) {
298                         cursor.updatePos(cursor.getPos() + 1);
299                     }
300                 }
301             }
302             if (!name.isEmpty()) {
303                 list.add(new BasicNameValuePair(
304                         decodeFormFields(name, charset),
305                         decodeFormFields(value, charset)));
306             }
307         }
308         return list;
309     }
310 
311     /**
312      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
313      * list of parameters in an HTTP PUT or HTTP POST.
314      *
315      * @param parameters  The parameters to include.
316      * @param charset The encoding to use.
317      * @return An {@code application/x-www-form-urlencoded} string
318      */
319     public static String format(
320             final List <? extends NameValuePair> parameters,
321             final String charset) {
322         return format(parameters, QP_SEP_A, charset);
323     }
324 
325     /**
326      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
327      * list of parameters in an HTTP PUT or HTTP POST.
328      *
329      * @param parameters  The parameters to include.
330      * @param parameterSeparator The parameter separator, by convention, {@code '&'} or {@code ';'}.
331      * @param charset The encoding to use.
332      * @return An {@code application/x-www-form-urlencoded} string
333      *
334      * @since 4.3
335      */
336     public static String format(
337             final List <? extends NameValuePair> parameters,
338             final char parameterSeparator,
339             final String charset) {
340         final StringBuilder result = new StringBuilder();
341         for (final NameValuePair parameter : parameters) {
342             final String encodedName = encodeFormFields(parameter.getName(), charset);
343             final String encodedValue = encodeFormFields(parameter.getValue(), charset);
344             if (result.length() > 0) {
345                 result.append(parameterSeparator);
346             }
347             result.append(encodedName);
348             if (encodedValue != null) {
349                 result.append(NAME_VALUE_SEPARATOR);
350                 result.append(encodedValue);
351             }
352         }
353         return result.toString();
354     }
355 
356     /**
357      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
358      * list of parameters in an HTTP PUT or HTTP POST.
359      *
360      * @param parameters  The parameters to include.
361      * @param charset The encoding to use.
362      * @return An {@code application/x-www-form-urlencoded} string
363      *
364      * @since 4.2
365      */
366     public static String format(
367             final Iterable<? extends NameValuePair> parameters,
368             final Charset charset) {
369         return format(parameters, QP_SEP_A, charset);
370     }
371 
372     /**
373      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
374      * list of parameters in an HTTP PUT or HTTP POST.
375      *
376      * @param parameters  The parameters to include.
377      * @param parameterSeparator The parameter separator, by convention, {@code '&'} or {@code ';'}.
378      * @param charset The encoding to use.
379      * @return An {@code application/x-www-form-urlencoded} string
380      *
381      * @since 4.3
382      */
383     public static String format(
384             final Iterable<? extends NameValuePair> parameters,
385             final char parameterSeparator,
386             final Charset charset) {
387         final StringBuilder result = new StringBuilder();
388         for (final NameValuePair parameter : parameters) {
389             final String encodedName = encodeFormFields(parameter.getName(), charset);
390             final String encodedValue = encodeFormFields(parameter.getValue(), charset);
391             if (result.length() > 0) {
392                 result.append(parameterSeparator);
393             }
394             result.append(encodedName);
395             if (encodedValue != null) {
396                 result.append(NAME_VALUE_SEPARATOR);
397                 result.append(encodedValue);
398             }
399         }
400         return result.toString();
401     }
402 
    /**
     * Unreserved characters, i.e. alphanumeric, plus: {@code _ - ! . ~ ' ( ) *}
     * <p>
     *  This list is the same as the {@code unreserved} list in
     *  <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
     * <p>
     * Populated, like all character sets of this class, by the static initializer.
     */
    private static final BitSet UNRESERVED   = new BitSet(256);
    /**
     * Punctuation characters: , ; : $ & + =
     * <p>
     * These are the additional characters allowed by userinfo.
     */
    private static final BitSet PUNCT        = new BitSet(256);
    /** Characters which are safe to use in userinfo,
     * i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation */
    private static final BitSet USERINFO     = new BitSet(256);
    /** Characters which are safe to use in a path,
     * i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation plus / @ */
    private static final BitSet PATHSAFE     = new BitSet(256);
    /** Characters which are safe to use in a query or a fragment,
     * i.e. {@link #RESERVED} plus {@link #UNRESERVED} */
    private static final BitSet URIC     = new BitSet(256);

    /**
     * Reserved characters, i.e. {@code ;/?:@&=+$,[]}
     * <p>
     *  This list is the same as the {@code reserved} list in
     *  <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
     *  as augmented by
     *  <a href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>
     */
    private static final BitSet RESERVED     = new BitSet(256);


    /**
     * Safe characters for x-www-form-urlencoded data, as per java.net.URLEncoder and browser behaviour,
     * i.e. alphanumeric plus {@code "-", "_", ".", "*"}
     * <p>
     * This is a subset of {@link #UNRESERVED}: it omits {@code ! ~ ' ( )}.
     */
    private static final BitSet URLENCODER   = new BitSet(256);
442 
443     static {
444         // unreserved chars
445         // alpha characters
446         for (int i = 'a'; i <= 'z'; i++) {
447             UNRESERVED.set(i);
448         }
449         for (int i = 'A'; i <= 'Z'; i++) {
450             UNRESERVED.set(i);
451         }
452         // numeric characters
453         for (int i = '0'; i <= '9'; i++) {
454             UNRESERVED.set(i);
455         }
456         UNRESERVED.set('_'); // these are the charactes of the "mark" list
457         UNRESERVED.set('-');
458         UNRESERVED.set('.');
459         UNRESERVED.set('*');
460         URLENCODER.or(UNRESERVED); // skip remaining unreserved characters
461         UNRESERVED.set('!');
462         UNRESERVED.set('~');
463         UNRESERVED.set('\'');
464         UNRESERVED.set('(');
465         UNRESERVED.set(')');
466         // punct chars
467         PUNCT.set(',');
468         PUNCT.set(';');
469         PUNCT.set(':');
470         PUNCT.set('$');
471         PUNCT.set('&');
472         PUNCT.set('+');
473         PUNCT.set('=');
474         // Safe for userinfo
475         USERINFO.or(UNRESERVED);
476         USERINFO.or(PUNCT);
477 
478         // URL path safe
479         PATHSAFE.or(UNRESERVED);
480         PATHSAFE.set('/'); // segment separator
481         PATHSAFE.set(';'); // param separator
482         PATHSAFE.set(':'); // rest as per list in 2396, i.e. : @ & = + $ ,
483         PATHSAFE.set('@');
484         PATHSAFE.set('&');
485         PATHSAFE.set('=');
486         PATHSAFE.set('+');
487         PATHSAFE.set('$');
488         PATHSAFE.set(',');
489 
490         RESERVED.set(';');
491         RESERVED.set('/');
492         RESERVED.set('?');
493         RESERVED.set(':');
494         RESERVED.set('@');
495         RESERVED.set('&');
496         RESERVED.set('=');
497         RESERVED.set('+');
498         RESERVED.set('$');
499         RESERVED.set(',');
500         RESERVED.set('['); // added by RFC 2732
501         RESERVED.set(']'); // added by RFC 2732
502 
503         URIC.or(RESERVED);
504         URIC.or(UNRESERVED);
505     }
506 
507     private static final int RADIX = 16;
508 
509     private static String urlEncode(
510             final String content,
511             final Charset charset,
512             final BitSet safechars,
513             final boolean blankAsPlus) {
514         if (content == null) {
515             return null;
516         }
517         final StringBuilder buf = new StringBuilder();
518         final ByteBuffer bb = charset.encode(content);
519         while (bb.hasRemaining()) {
520             final int b = bb.get() & 0xff;
521             if (safechars.get(b)) {
522                 buf.append((char) b);
523             } else if (blankAsPlus && b == ' ') {
524                 buf.append('+');
525             } else {
526                 buf.append("%");
527                 final char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, RADIX));
528                 final char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, RADIX));
529                 buf.append(hex1);
530                 buf.append(hex2);
531             }
532         }
533         return buf.toString();
534     }
535 
536     /**
537      * Decode/unescape a portion of a URL, to use with the query part ensure {@code plusAsBlank} is true.
538      *
539      * @param content the portion to decode
540      * @param charset the charset to use
541      * @param plusAsBlank if {@code true}, then convert '+' to space (e.g. for www-url-form-encoded content), otherwise leave as is.
542      * @return encoded string
543      */
544     private static String urlDecode(
545             final String content,
546             final Charset charset,
547             final boolean plusAsBlank) {
548         if (content == null) {
549             return null;
550         }
551         final ByteBuffer bb = ByteBuffer.allocate(content.length());
552         final CharBuffer cb = CharBuffer.wrap(content);
553         while (cb.hasRemaining()) {
554             final char c = cb.get();
555             if (c == '%' && cb.remaining() >= 2) {
556                 final char uc = cb.get();
557                 final char lc = cb.get();
558                 final int u = Character.digit(uc, 16);
559                 final int l = Character.digit(lc, 16);
560                 if (u != -1 && l != -1) {
561                     bb.put((byte) ((u << 4) + l));
562                 } else {
563                     bb.put((byte) '%');
564                     bb.put((byte) uc);
565                     bb.put((byte) lc);
566                 }
567             } else if (plusAsBlank && c == '+') {
568                 bb.put((byte) ' ');
569             } else {
570                 bb.put((byte) c);
571             }
572         }
573         bb.flip();
574         return charset.decode(bb).toString();
575     }
576 
577     /**
578      * Decode/unescape www-url-form-encoded content.
579      *
580      * @param content the content to decode, will decode '+' as space
581      * @param charset the charset to use
582      * @return encoded string
583      */
584     private static String decodeFormFields (final String content, final String charset) {
585         if (content == null) {
586             return null;
587         }
588         return urlDecode(content, charset != null ? Charset.forName(charset) : Consts.UTF_8, true);
589     }
590 
591     /**
592      * Decode/unescape www-url-form-encoded content.
593      *
594      * @param content the content to decode, will decode '+' as space
595      * @param charset the charset to use
596      * @return encoded string
597      */
598     private static String decodeFormFields (final String content, final Charset charset) {
599         if (content == null) {
600             return null;
601         }
602         return urlDecode(content, charset != null ? charset : Consts.UTF_8, true);
603     }
604 
605     /**
606      * Encode/escape www-url-form-encoded content.
607      * <p>
608      * Uses the {@link #URLENCODER} set of characters, rather than
609      * the {@link #UNRESERVED} set; this is for compatibilty with previous
610      * releases, URLEncoder.encode() and most browsers.
611      *
612      * @param content the content to encode, will convert space to '+'
613      * @param charset the charset to use
614      * @return encoded string
615      */
616     private static String encodeFormFields(final String content, final String charset) {
617         if (content == null) {
618             return null;
619         }
620         return urlEncode(content, charset != null ? Charset.forName(charset) : Consts.UTF_8, URLENCODER, true);
621     }
622 
623     /**
624      * Encode/escape www-url-form-encoded content.
625      * <p>
626      * Uses the {@link #URLENCODER} set of characters, rather than
627      * the {@link #UNRESERVED} set; this is for compatibilty with previous
628      * releases, URLEncoder.encode() and most browsers.
629      *
630      * @param content the content to encode, will convert space to '+'
631      * @param charset the charset to use
632      * @return encoded string
633      */
634     private static String encodeFormFields (final String content, final Charset charset) {
635         if (content == null) {
636             return null;
637         }
638         return urlEncode(content, charset != null ? charset : Consts.UTF_8, URLENCODER, true);
639     }
640 
641     /**
642      * Encode a String using the {@link #USERINFO} set of characters.
643      * <p>
644      * Used by URIBuilder to encode the userinfo segment.
645      *
646      * @param content the string to encode, does not convert space to '+'
647      * @param charset the charset to use
648      * @return the encoded string
649      */
650     static String encUserInfo(final String content, final Charset charset) {
651         return urlEncode(content, charset, USERINFO, false);
652     }
653 
654     /**
655      * Encode a String using the {@link #URIC} set of characters.
656      * <p>
657      * Used by URIBuilder to encode the query and fragment segments.
658      *
659      * @param content the string to encode, does not convert space to '+'
660      * @param charset the charset to use
661      * @return the encoded string
662      */
663     static String encUric(final String content, final Charset charset) {
664         return urlEncode(content, charset, URIC, false);
665     }
666 
667     /**
668      * Encode a String using the {@link #PATHSAFE} set of characters.
669      * <p>
670      * Used by URIBuilder to encode path segments.
671      *
672      * @param content the string to encode, does not convert space to '+'
673      * @param charset the charset to use
674      * @return the encoded string
675      */
676     static String encPath(final String content, final Charset charset) {
677         return urlEncode(content, charset, PATHSAFE, false);
678     }
679 
680 }