View Javadoc

1   /*
2    * ====================================================================
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *   http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing,
14   * software distributed under the License is distributed on an
15   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16   * KIND, either express or implied.  See the License for the
17   * specific language governing permissions and limitations
18   * under the License.
19   * ====================================================================
20   *
21   * This software consists of voluntary contributions made by many
22   * individuals on behalf of the Apache Software Foundation.  For more
23   * information on the Apache Software Foundation, please see
24   * <http://www.apache.org/>.
25   *
26   */
27  
28  package org.apache.http.client.utils;
29  
30  import java.io.IOException;
31  import java.net.URI;
32  import java.nio.ByteBuffer;
33  import java.nio.CharBuffer;
34  import java.nio.charset.Charset;
35  import java.util.ArrayList;
36  import java.util.BitSet;
37  import java.util.Collections;
38  import java.util.List;
39  import java.util.Scanner;
40  
41  import org.apache.http.annotation.Immutable;
42  import org.apache.http.entity.ContentType;
43  
44  import org.apache.http.Consts;
45  import org.apache.http.Header;
46  import org.apache.http.HeaderElement;
47  import org.apache.http.HttpEntity;
48  import org.apache.http.NameValuePair;
49  import org.apache.http.message.BasicHeaderValueParser;
50  import org.apache.http.message.BasicNameValuePair;
51  import org.apache.http.message.ParserCursor;
52  import org.apache.http.protocol.HTTP;
53  import org.apache.http.util.CharArrayBuffer;
54  import org.apache.http.util.EntityUtils;
55  
56  /**
57   * A collection of utilities for encoding URLs.
58   *
59   * @since 4.0
60   */
61  @Immutable
62  public class URLEncodedUtils {
63  
64      public static final String CONTENT_TYPE = "application/x-www-form-urlencoded"; // MIME type that entity content types are compared against (case-insensitively)
65      private static final String PARAMETER_SEPARATOR = "&"; // delimiter between parameters, used for both scanning and formatting
66      private static final String NAME_VALUE_SEPARATOR = "="; // delimiter between a parameter's name and its value
67  
68      /**
69       * Returns a list of {@link NameValuePair NameValuePairs} as built from the
70       * URI's query portion. For example, a URI of
71       * http://example.org/path/to/file?a=1&b=2&c=3 would return a list of three
72       * NameValuePairs, one for a=1, one for b=2, and one for c=3.
73       * <p>
74       * This is typically useful while parsing an HTTP PUT.
75       *
76       * @param uri
77       *            uri to parse
78       * @param encoding
79       *            encoding to use while parsing the query
80       */
81      public static List <NameValuePair> parse (final URI uri, final String encoding) {
82          final String query = uri.getRawQuery();
83          if (query != null && query.length() > 0) {
84              List<NameValuePair> result = new ArrayList<NameValuePair>();
85              Scanner scanner = new Scanner(query);
86              parse(result, scanner, encoding);
87              return result;
88          } else {
89              return Collections.emptyList();
90          }
91      }
92  
93      /**
94       * Returns a list of {@link NameValuePair NameValuePairs} as parsed from an
95       * {@link HttpEntity}. The encoding is taken from the entity's
96       * Content-Encoding header.
97       * <p>
98       * This is typically used while parsing an HTTP POST.
99       *
100      * @param entity
101      *            The entity to parse
102      * @throws IOException
103      *             If there was an exception getting the entity's data.
104      */
105     public static List <NameValuePair> parse (
106             final HttpEntity entity) throws IOException {
107         ContentType contentType = ContentType.get(entity);
108         if (contentType != null && contentType.getMimeType().equalsIgnoreCase(CONTENT_TYPE)) {
109             String content = EntityUtils.toString(entity, Consts.ASCII);
110             if (content != null && content.length() > 0) {
111                 Charset charset = contentType.getCharset();
112                 if (charset == null) {
113                     charset = HTTP.DEF_CONTENT_CHARSET;
114                 }
115                 return parse(content, charset);
116             }
117         }
118         return Collections.emptyList();
119     }
120 
121     /**
122      * Returns true if the entity's Content-Type header is
123      * <code>application/x-www-form-urlencoded</code>.
124      */
125     public static boolean isEncoded (final HttpEntity entity) {
126         Header h = entity.getContentType();
127         if (h != null) {
128             HeaderElement[] elems = h.getElements();
129             if (elems.length > 0) {
130                 String contentType = elems[0].getName();
131                 return contentType.equalsIgnoreCase(CONTENT_TYPE);
132             } else {
133                 return false;
134             }
135         } else {
136             return false;
137         }
138     }
139 
140     /**
141      * Adds all parameters within the Scanner to the list of
142      * <code>parameters</code>, as encoded by <code>encoding</code>. For
143      * example, a scanner containing the string <code>a=1&b=2&c=3</code> would
144      * add the {@link NameValuePair NameValuePairs} a=1, b=2, and c=3 to the
145      * list of parameters.
146      *
147      * @param parameters
148      *            List to add parameters to.
149      * @param scanner
150      *            Input that contains the parameters to parse.
151      * @param charset
152      *            Encoding to use when decoding the parameters.
153      */
154     public static void parse (
155             final List <NameValuePair> parameters,
156             final Scanner scanner,
157             final String charset) {
158         scanner.useDelimiter(PARAMETER_SEPARATOR);
159         while (scanner.hasNext()) {
160             String name = null;
161             String value = null;
162             String token = scanner.next();
163             int i = token.indexOf(NAME_VALUE_SEPARATOR);
164             if (i != -1) {
165                 name = decodeFormFields(token.substring(0, i).trim(), charset);
166                 value = decodeFormFields(token.substring(i + 1).trim(), charset);
167             } else {
168                 name = decodeFormFields(token.trim(), charset);
169             }
170             parameters.add(new BasicNameValuePair(name, value));
171         }
172     }
173 
174     private static final char[] DELIM = new char[] { '&' };
175 
176     /**
177      * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string
178      * using the given character encoding.
179      *
180      * @param s
181      *            text to parse.
182      * @param charset
183      *            Encoding to use when decoding the parameters.
184      *
185      * @since 4.2
186      */
187     public static List<NameValuePair> parse (final String s, final Charset charset) {
188         if (s == null) {
189             return Collections.emptyList();
190         }
191         BasicHeaderValueParser parser = BasicHeaderValueParser.DEFAULT;
192         CharArrayBuffer buffer = new CharArrayBuffer(s.length());
193         buffer.append(s);
194         ParserCursor cursor = new ParserCursor(0, buffer.length());
195         List<NameValuePair> list = new ArrayList<NameValuePair>();
196         while (!cursor.atEnd()) {
197             NameValuePair nvp = parser.parseNameValuePair(buffer, cursor, DELIM);
198             if (nvp.getName().length() > 0) {
199                 list.add(new BasicNameValuePair(
200                         decodeFormFields(nvp.getName(), charset),
201                         decodeFormFields(nvp.getValue(), charset)));
202             }
203         }
204         return list;
205     }
206 
207     /**
208      * Returns a String that is suitable for use as an <code>application/x-www-form-urlencoded</code>
209      * list of parameters in an HTTP PUT or HTTP POST.
210      *
211      * @param parameters  The parameters to include.
212      * @param encoding The encoding to use.
213      */
214     public static String format (
215             final List <? extends NameValuePair> parameters,
216             final String encoding) {
217         final StringBuilder result = new StringBuilder();
218         for (final NameValuePair parameter : parameters) {
219             final String encodedName = encodeFormFields(parameter.getName(), encoding);
220             final String encodedValue = encodeFormFields(parameter.getValue(), encoding);
221             if (result.length() > 0) {
222                 result.append(PARAMETER_SEPARATOR);
223             }
224             result.append(encodedName);
225             if (encodedValue != null) {
226                 result.append(NAME_VALUE_SEPARATOR);
227                 result.append(encodedValue);
228             }
229         }
230         return result.toString();
231     }
232 
233     /**
234      * Returns a String that is suitable for use as an <code>application/x-www-form-urlencoded</code>
235      * list of parameters in an HTTP PUT or HTTP POST.
236      *
237      * @param parameters  The parameters to include.
238      * @param charset The encoding to use.
239      *
240      * @since 4.2
241      */
242     public static String format (
243             final Iterable<? extends NameValuePair> parameters,
244             final Charset charset) {
245         final StringBuilder result = new StringBuilder();
246         for (final NameValuePair parameter : parameters) {
247             final String encodedName = encodeFormFields(parameter.getName(), charset);
248             final String encodedValue = encodeFormFields(parameter.getValue(), charset);
249             if (result.length() > 0) {
250                 result.append(PARAMETER_SEPARATOR);
251             }
252             result.append(encodedName);
253             if (encodedValue != null) {
254                 result.append(NAME_VALUE_SEPARATOR);
255                 result.append(encodedValue);
256             }
257         }
258         return result.toString();
259     }
260 
261     /** 
262      * Unreserved characters, i.e. alphanumeric, plus: {@code _ - ! . ~ ' ( ) *}
263      * <p>
264      *  This list is the same as the {@code unreserved} list in
265      *  <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
266      */
267     private static final BitSet UNRESERVED   = new BitSet(256);
268     /**
269      * Punctuation characters: , ; : $ & + =
270      * <p>
271      * These are the additional characters allowed by userinfo.
272      */
273     private static final BitSet PUNCT        = new BitSet(256);
274     /** Characters which are safe to use in userinfo, i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation */
275     private static final BitSet USERINFO     = new BitSet(256);
276     /** Characters which are safe to use in a path, i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation plus / @ */
277     private static final BitSet PATHSAFE     = new BitSet(256);
278     /** Characters which are safe to use in a fragment, i.e. {@link #RESERVED} plus {@link #UNRESERVED} */
279     private static final BitSet FRAGMENT     = new BitSet(256);
280 
281     /** 
282      * Reserved characters, i.e. {@code ;/?:@&=+$,[]}
283      * <p>
284      *  This list is the same as the {@code reserved} list in 
285      *  <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
286      *  as augmented by
287      *  <a href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>
288      */
289     private static final BitSet RESERVED     = new BitSet(256);
290 
291     
292     /** 
293      * Safe characters for x-www-form-urlencoded data, as per java.net.URLEncoder and browser behaviour,
294      * i.e. alphanumeric plus {@code "-", "_", ".", "*"}
295      */
296     private static final BitSet URLENCODER   = new BitSet(256);
297 
298     static {
299         // unreserved chars
300         // alpha characters
301         for (int i = 'a'; i <= 'z'; i++) {
302             UNRESERVED.set(i);
303         }
304         for (int i = 'A'; i <= 'Z'; i++) {
305             UNRESERVED.set(i);
306         }
307         // numeric characters
308         for (int i = '0'; i <= '9'; i++) {
309             UNRESERVED.set(i);
310         }
311         UNRESERVED.set('_'); // these are the charactes of the "mark" list
312         UNRESERVED.set('-');
313         UNRESERVED.set('.');
314         UNRESERVED.set('*');
315         URLENCODER.or(UNRESERVED); // skip remaining unreserved characters
316         UNRESERVED.set('!');
317         UNRESERVED.set('~');
318         UNRESERVED.set('\'');
319         UNRESERVED.set('(');
320         UNRESERVED.set(')');
321         // punct chars
322         PUNCT.set(',');
323         PUNCT.set(';');
324         PUNCT.set(':');
325         PUNCT.set('$');
326         PUNCT.set('&');
327         PUNCT.set('+');
328         PUNCT.set('=');
329         // Safe for userinfo
330         USERINFO.or(UNRESERVED);
331         USERINFO.or(PUNCT);
332 
333         // URL path safe
334         PATHSAFE.or(UNRESERVED);
335         PATHSAFE.set('/'); // segment separator
336         PATHSAFE.set(';'); // param separator
337         PATHSAFE.set(':'); // rest as per list in 2396, i.e. : @ & = + $ ,
338         PATHSAFE.set('@');
339         PATHSAFE.set('&');
340         PATHSAFE.set('=');
341         PATHSAFE.set('+');
342         PATHSAFE.set('$');
343         PATHSAFE.set(',');
344         
345         RESERVED.set(';');
346         RESERVED.set('/');
347         RESERVED.set('?');
348         RESERVED.set(':');
349         RESERVED.set('@');
350         RESERVED.set('&');
351         RESERVED.set('=');
352         RESERVED.set('+');
353         RESERVED.set('$');
354         RESERVED.set(',');
355         RESERVED.set('['); // added by RFC 2732
356         RESERVED.set(']'); // added by RFC 2732
357         
358         FRAGMENT.or(RESERVED);
359         FRAGMENT.or(UNRESERVED);
360     }
361 
362     private static final int RADIX = 16;
363 
364     private static String urlencode(
365             final String content,
366             final Charset charset,
367             final BitSet safechars,
368             final boolean blankAsPlus) {
369         if (content == null) {
370             return null;
371         }
372         StringBuilder buf = new StringBuilder();
373         ByteBuffer bb = charset.encode(content);
374         while (bb.hasRemaining()) {
375             int b = bb.get() & 0xff;
376             if (safechars.get(b)) {
377                 buf.append((char) b);
378             } else if (blankAsPlus && b == ' ') {
379                 buf.append('+');
380             } else {
381                 buf.append("%");
382                 char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, RADIX));
383                 char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, RADIX));
384                 buf.append(hex1);
385                 buf.append(hex2);
386             }
387         }
388         return buf.toString();
389     }
390 
391     /**
392      * Decode/unescape a portion of a URL, to use with the query part ensure {@code plusAsBlank} is true.
393      * 
394      * @param content the portion to decode
395      * @param charset the charset to use
396      * @param plusAsBlank if {@code true}, then convert '+' to space (e.g. for www-url-form-encoded content), otherwise leave as is.
397      * @return encoded string
398      */
399     private static String urldecode(
400             final String content,
401             final Charset charset,
402             final boolean plusAsBlank) {
403         if (content == null) {
404             return null;
405         }
406         ByteBuffer bb = ByteBuffer.allocate(content.length());
407         CharBuffer cb = CharBuffer.wrap(content);
408         while (cb.hasRemaining()) {
409             char c = cb.get();
410             if (c == '%' && cb.remaining() >= 2) {
411                 char uc = cb.get();
412                 char lc = cb.get();
413                 int u = Character.digit(uc, 16);
414                 int l = Character.digit(lc, 16);
415                 if (u != -1 && l != -1) {
416                     bb.put((byte) ((u << 4) + l));
417                 } else {
418                     bb.put((byte) '%');
419                     bb.put((byte) uc);
420                     bb.put((byte) lc);
421                 }
422             } else if (plusAsBlank && c == '+') {
423                 bb.put((byte) ' ');
424             } else {
425                 bb.put((byte) c);
426             }
427         }
428         bb.flip();
429         return charset.decode(bb).toString();
430     }
431 
432     /**
433      * Decode/unescape www-url-form-encoded content.
434      * 
435      * @param content the content to decode, will decode '+' as space
436      * @param charset the charset to use
437      * @return encoded string
438      */
439     private static String decodeFormFields (final String content, final String charset) {
440         if (content == null) {
441             return null;
442         }
443         return urldecode(content, charset != null ? Charset.forName(charset) : Consts.UTF_8, true);
444     }
445 
446     /**
447      * Decode/unescape www-url-form-encoded content.
448      * 
449      * @param content the content to decode, will decode '+' as space
450      * @param charset the charset to use
451      * @return encoded string
452      */
453     private static String decodeFormFields (final String content, final Charset charset) {
454         if (content == null) {
455             return null;
456         }
457         return urldecode(content, charset != null ? charset : Consts.UTF_8, true);
458     }
459 
460     /**
461      * Encode/escape www-url-form-encoded content.
462      * <p>
463      * Uses the {@link #URLENCODER} set of characters, rather than
464      * the {@link #UNRSERVED} set; this is for compatibilty with previous
465      * releases, URLEncoder.encode() and most browsers.
466      * 
467      * @param content the content to encode, will convert space to '+'
468      * @param charset the charset to use
469      * @return encoded string
470      */
471     private static String encodeFormFields (final String content, final String charset) {
472         if (content == null) {
473             return null;
474         }
475         return urlencode(content, charset != null ? Charset.forName(charset) :
476             Consts.UTF_8, URLENCODER, true);
477     }
478 
479     /**
480      * Encode/escape www-url-form-encoded content.
481      * <p>
482      * Uses the {@link #URLENCODER} set of characters, rather than
483      * the {@link #UNRSERVED} set; this is for compatibilty with previous
484      * releases, URLEncoder.encode() and most browsers.
485      * 
486      * @param content the content to encode, will convert space to '+'
487      * @param charset the charset to use
488      * @return encoded string
489      */
490     private static String encodeFormFields (final String content, final Charset charset) {
491         if (content == null) {
492             return null;
493         }
494         return urlencode(content, charset != null ? charset : Consts.UTF_8, URLENCODER, true);
495     }
496 
497     /**
498      * Encode a String using the {@link #USERINFO} set of characters.
499      * <p>
500      * Used by URIBuilder to encode the userinfo segment.
501      * 
502      * @param content the string to encode, does not convert space to '+'
503      * @param charset the charset to use
504      * @return the encoded string
505      */
506     static String encUserInfo(final String content, final Charset charset) {
507         return urlencode(content, charset, USERINFO, false);
508     }
509 
510     /**
511      * Encode a String using the {@link #FRAGMENT} set of characters.
512      * <p>
513      * Used by URIBuilder to encode the userinfo segment.
514      * 
515      * @param content the string to encode, does not convert space to '+'
516      * @param charset the charset to use
517      * @return the encoded string
518      */
519     static String encFragment(final String content, final Charset charset) {
520         return urlencode(content, charset, FRAGMENT, false);
521     }
522 
523     /**
524      * Encode a String using the {@link #PATHSAFE} set of characters.
525      * <p>
526      * Used by URIBuilder to encode path segments.
527      * 
528      * @param content the string to encode, does not convert space to '+'
529      * @param charset the charset to use
530      * @return the encoded string
531      */
532     static String encPath(final String content, final Charset charset) {
533         return urlencode(content, charset, PATHSAFE, false);
534     }
535 
536 }