View Javadoc
1   /*
2    * ====================================================================
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *   http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing,
14   * software distributed under the License is distributed on an
15   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16   * KIND, either express or implied.  See the License for the
17   * specific language governing permissions and limitations
18   * under the License.
19   * ====================================================================
20   *
21   * This software consists of voluntary contributions made by many
22   * individuals on behalf of the Apache Software Foundation.  For more
23   * information on the Apache Software Foundation, please see
24   * <http://www.apache.org/>.
25   *
26   */
27  
28  package org.apache.hc.core5.net;
29  
30  import java.net.URI;
31  import java.nio.ByteBuffer;
32  import java.nio.CharBuffer;
33  import java.nio.charset.Charset;
34  import java.nio.charset.StandardCharsets;
35  import java.util.ArrayList;
36  import java.util.BitSet;
37  import java.util.List;
38  
39  import org.apache.hc.core5.http.NameValuePair;
40  import org.apache.hc.core5.http.message.BasicNameValuePair;
41  import org.apache.hc.core5.http.message.ParserCursor;
42  import org.apache.hc.core5.http.message.TokenParser;
43  import org.apache.hc.core5.util.Args;
44  
45  /**
46   * A collection of utilities for encoding URLs.
47   *
48   * @since 4.0
49   */
50  public class URLEncodedUtils {
51  
52      /**
53       * The default HTML form content type.
54       */
55      public static final String CONTENT_TYPE = "application/x-www-form-urlencoded";
56  
57      private static final char QP_SEP_A = '&';
58      private static final char QP_SEP_S = ';';
59      private static final String NAME_VALUE_SEPARATOR = "=";
60  
61      /**
62       * Returns a list of {@link NameValuePair NameValuePairs} as built from the URI's query portion. For example, a URI
63       * of {@code http://example.org/path/to/file?a=1&b=2&c=3} would return a list of three NameValuePairs, one for a=1,
64       * one for b=2, and one for c=3. By convention, {@code '&'} and {@code ';'} are accepted as parameter separators.
65       * <p>
66       * This is typically useful while parsing an HTTP PUT.
67       *
68       * This API is currently only used for testing.
69       *
70       * @param uri
71       *        URI to parse
72       * @param charset
73       *        Charset to use while parsing the query
74       * @return a list of {@link NameValuePair} as built from the URI's query portion.
75       *
76       * @since 4.5
77       */
78      public static List <NameValuePair> parse(final URI uri, final Charset charset) {
79          Args.notNull(uri, "URI");
80          final String query = uri.getRawQuery();
81          if (query != null && !query.isEmpty()) {
82              return parse(query, charset);
83          }
84          return createEmptyList();
85      }
86  
87      /**
88       * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using the given character
89       * encoding. By convention, {@code '&'} and {@code ';'} are accepted as parameter separators.
90       *
91       * @param s
92       *            text to parse.
93       * @param charset
94       *            Encoding to use when decoding the parameters.
95       * @return a list of {@link NameValuePair} as built from the URI's query portion.
96       *
97       * @since 4.2
98       */
99      public static List<NameValuePair> parse(final CharSequence s, final Charset charset) {
100         if (s == null) {
101             return createEmptyList();
102         }
103         return parse(s, charset, QP_SEP_A, QP_SEP_S);
104     }
105 
106     /**
107      * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using
108      * the given character encoding.
109      *
110      * @param s
111      *            text to parse.
112      * @param charset
113      *            Encoding to use when decoding the parameters.
114      * @param separators
115      *            element separators.
116      * @return a list of {@link NameValuePair} as built from the URI's query portion.
117      *
118      * @since 4.4
119      */
120     public static List<NameValuePair> parse(
121             final CharSequence s, final Charset charset, final char... separators) {
122         Args.notNull(s, "Char array buffer");
123         final TokenParser tokenParser = TokenParser.INSTANCE;
124         final BitSet delimSet = new BitSet();
125         for (final char separator: separators) {
126             delimSet.set(separator);
127         }
128         final ParserCursor cursor = new ParserCursor(0, s.length());
129         final List<NameValuePair> list = new ArrayList<>();
130         while (!cursor.atEnd()) {
131             delimSet.set('=');
132             final String name = tokenParser.parseToken(s, cursor, delimSet);
133             String value = null;
134             if (!cursor.atEnd()) {
135                 final int delim = s.charAt(cursor.getPos());
136                 cursor.updatePos(cursor.getPos() + 1);
137                 if (delim == '=') {
138                     delimSet.clear('=');
139                     value = tokenParser.parseToken(s, cursor, delimSet);
140                     if (!cursor.atEnd()) {
141                         cursor.updatePos(cursor.getPos() + 1);
142                     }
143                 }
144             }
145             if (!name.isEmpty()) {
146                 list.add(new BasicNameValuePair(
147                         decodeFormFields(name, charset),
148                         decodeFormFields(value, charset)));
149             }
150         }
151         return list;
152     }
153 
154     /**
155      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
156      * list of parameters in an HTTP PUT or HTTP POST.
157      *
158      * @param parameters  The parameters to include.
159      * @param charset The encoding to use.
160      * @return An {@code application/x-www-form-urlencoded} string
161      *
162      * @since 4.2
163      */
164     public static String format(
165             final Iterable<? extends NameValuePair> parameters,
166             final Charset charset) {
167         return format(parameters, QP_SEP_A, charset);
168     }
169 
170     /**
171      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
172      * list of parameters in an HTTP PUT or HTTP POST.
173      *
174      * @param parameters  The parameters to include.
175      * @param parameterSeparator The parameter separator, by convention, {@code '&'} or {@code ';'}.
176      * @param charset The encoding to use.
177      * @return An {@code application/x-www-form-urlencoded} string
178      *
179      * @since 4.3
180      */
181     public static String format(
182             final Iterable<? extends NameValuePair> parameters,
183             final char parameterSeparator,
184             final Charset charset) {
185         Args.notNull(parameters, "Parameters");
186         final StringBuilder result = new StringBuilder();
187         for (final NameValuePair parameter : parameters) {
188             final String encodedName = encodeFormFields(parameter.getName(), charset);
189             final String encodedValue = encodeFormFields(parameter.getValue(), charset);
190             if (result.length() > 0) {
191                 result.append(parameterSeparator);
192             }
193             result.append(encodedName);
194             if (encodedValue != null) {
195                 result.append(NAME_VALUE_SEPARATOR);
196                 result.append(encodedValue);
197             }
198         }
199         return result.toString();
200     }
201 
202     /**
203      * Unreserved characters, i.e. alphanumeric, plus: {@code _ - ! . ~ ' ( ) *}
204      * <p>
205      *  This list is the same as the {@code unreserved} list in
206      *  <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
207      */
208     private static final BitSet UNRESERVED   = new BitSet(256);
209     /**
210      * Punctuation characters: , ; : $ & + =
211      * <p>
212      * These are the additional characters allowed by userinfo.
213      */
214     private static final BitSet PUNCT        = new BitSet(256);
215     /** Characters which are safe to use in userinfo,
216      * i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation */
217     private static final BitSet USERINFO     = new BitSet(256);
218     /** Characters which are safe to use in a path,
219      * i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation plus / @ */
220     private static final BitSet PATHSAFE     = new BitSet(256);
221     /** Characters which are safe to use in a query or a fragment,
222      * i.e. {@link #RESERVED} plus {@link #UNRESERVED} */
223     private static final BitSet URIC     = new BitSet(256);
224 
225     /**
226      * Reserved characters, i.e. {@code ;/?:@&=+$,[]}
227      * <p>
228      *  This list is the same as the {@code reserved} list in
229      *  <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
230      *  as augmented by
231      *  <a href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>
232      */
233     private static final BitSet RESERVED     = new BitSet(256);
234 
235 
236     /**
237      * Safe characters for x-www-form-urlencoded data, as per java.net.URLEncoder and browser behaviour,
238      * i.e. alphanumeric plus {@code "-", "_", ".", "*"}
239      */
240     private static final BitSet URLENCODER   = new BitSet(256);
241 
242     static {
243         // unreserved chars
244         // alpha characters
245         for (int i = 'a'; i <= 'z'; i++) {
246             UNRESERVED.set(i);
247         }
248         for (int i = 'A'; i <= 'Z'; i++) {
249             UNRESERVED.set(i);
250         }
251         // numeric characters
252         for (int i = '0'; i <= '9'; i++) {
253             UNRESERVED.set(i);
254         }
255         UNRESERVED.set('_'); // these are the charactes of the "mark" list
256         UNRESERVED.set('-');
257         UNRESERVED.set('.');
258         UNRESERVED.set('*');
259         URLENCODER.or(UNRESERVED); // skip remaining unreserved characters
260         UNRESERVED.set('!');
261         UNRESERVED.set('~');
262         UNRESERVED.set('\'');
263         UNRESERVED.set('(');
264         UNRESERVED.set(')');
265         // punct chars
266         PUNCT.set(',');
267         PUNCT.set(';');
268         PUNCT.set(':');
269         PUNCT.set('$');
270         PUNCT.set('&');
271         PUNCT.set('+');
272         PUNCT.set('=');
273         // Safe for userinfo
274         USERINFO.or(UNRESERVED);
275         USERINFO.or(PUNCT);
276 
277         // URL path safe
278         PATHSAFE.or(UNRESERVED);
279         PATHSAFE.set('/'); // segment separator
280         PATHSAFE.set(';'); // param separator
281         PATHSAFE.set(':'); // rest as per list in 2396, i.e. : @ & = + $ ,
282         PATHSAFE.set('@');
283         PATHSAFE.set('&');
284         PATHSAFE.set('=');
285         PATHSAFE.set('+');
286         PATHSAFE.set('$');
287         PATHSAFE.set(',');
288 
289         RESERVED.set(';');
290         RESERVED.set('/');
291         RESERVED.set('?');
292         RESERVED.set(':');
293         RESERVED.set('@');
294         RESERVED.set('&');
295         RESERVED.set('=');
296         RESERVED.set('+');
297         RESERVED.set('$');
298         RESERVED.set(',');
299         RESERVED.set('['); // added by RFC 2732
300         RESERVED.set(']'); // added by RFC 2732
301 
302         URIC.or(RESERVED);
303         URIC.or(UNRESERVED);
304     }
305 
306     private static final int RADIX = 16;
307 
308     private static List<NameValuePair> createEmptyList() {
309         return new ArrayList<>(0);
310     }
311 
312     private static String urlEncode(
313             final String content,
314             final Charset charset,
315             final BitSet safechars,
316             final boolean blankAsPlus) {
317         if (content == null) {
318             return null;
319         }
320         final StringBuilder buf = new StringBuilder();
321         final ByteBuffer bb = charset.encode(content);
322         while (bb.hasRemaining()) {
323             final int b = bb.get() & 0xff;
324             if (safechars.get(b)) {
325                 buf.append((char) b);
326             } else if (blankAsPlus && b == ' ') {
327                 buf.append('+');
328             } else {
329                 buf.append("%");
330                 final char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, RADIX));
331                 final char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, RADIX));
332                 buf.append(hex1);
333                 buf.append(hex2);
334             }
335         }
336         return buf.toString();
337     }
338 
339     /**
340      * Decode/unescape a portion of a URL, to use with the query part ensure {@code plusAsBlank} is true.
341      *
342      * @param content the portion to decode
343      * @param charset the charset to use
344      * @param plusAsBlank if {@code true}, then convert '+' to space (e.g. for www-url-form-encoded content), otherwise leave as is.
345      * @return encoded string
346      */
347     private static String urlDecode(
348             final String content,
349             final Charset charset,
350             final boolean plusAsBlank) {
351         if (content == null) {
352             return null;
353         }
354         final ByteBuffer bb = ByteBuffer.allocate(content.length());
355         final CharBuffer cb = CharBuffer.wrap(content);
356         while (cb.hasRemaining()) {
357             final char c = cb.get();
358             if (c == '%' && cb.remaining() >= 2) {
359                 final char uc = cb.get();
360                 final char lc = cb.get();
361                 final int u = Character.digit(uc, 16);
362                 final int l = Character.digit(lc, 16);
363                 if (u != -1 && l != -1) {
364                     bb.put((byte) ((u << 4) + l));
365                 } else {
366                     bb.put((byte) '%');
367                     bb.put((byte) uc);
368                     bb.put((byte) lc);
369                 }
370             } else if (plusAsBlank && c == '+') {
371                 bb.put((byte) ' ');
372             } else {
373                 bb.put((byte) c);
374             }
375         }
376         bb.flip();
377         return charset.decode(bb).toString();
378     }
379 
380     /**
381      * Decode/unescape www-url-form-encoded content.
382      *
383      * @param content the content to decode, will decode '+' as space
384      * @param charset the charset to use
385      * @return encoded string
386      */
387     private static String decodeFormFields (final String content, final Charset charset) {
388         if (content == null) {
389             return null;
390         }
391         return urlDecode(content, charset != null ? charset : StandardCharsets.UTF_8, true);
392     }
393 
394     /**
395      * Encode/escape www-url-form-encoded content.
396      * <p>
397      * Uses the {@link #URLENCODER} set of characters, rather than
398      * the {@link #UNRESERVED} set; this is for compatibilty with previous
399      * releases, URLEncoder.encode() and most browsers.
400      *
401      * @param content the content to encode, will convert space to '+'
402      * @param charset the charset to use
403      * @return encoded string
404      */
405     private static String encodeFormFields (final String content, final Charset charset) {
406         if (content == null) {
407             return null;
408         }
409         return urlEncode(content, charset != null ? charset : StandardCharsets.UTF_8, URLENCODER, true);
410     }
411 
412     /**
413      * Encode a String using the {@link #USERINFO} set of characters.
414      * <p>
415      * Used by URIBuilder to encode the userinfo segment.
416      *
417      * @param content the string to encode, does not convert space to '+'
418      * @param charset the charset to use
419      * @return the encoded string
420      */
421     static String encUserInfo(final String content, final Charset charset) {
422         return urlEncode(content, charset, USERINFO, false);
423     }
424 
425     /**
426      * Encode a String using the {@link #URIC} set of characters.
427      * <p>
428      * Used by URIBuilder to encode the query and fragment segments.
429      *
430      * @param content the string to encode, does not convert space to '+'
431      * @param charset the charset to use
432      * @return the encoded string
433      */
434     static String encUric(final String content, final Charset charset) {
435         return urlEncode(content, charset, URIC, false);
436     }
437 
438     /**
439      * Encode a String using the {@link #PATHSAFE} set of characters.
440      * <p>
441      * Used by URIBuilder to encode path segments.
442      *
443      * @param content the string to encode, does not convert space to '+'
444      * @param charset the charset to use
445      * @return the encoded string
446      */
447     static String encPath(final String content, final Charset charset) {
448         return urlEncode(content, charset, PATHSAFE, false);
449     }
450 
451 }