View Javadoc
1   /*
2    * ====================================================================
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *   http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing,
14   * software distributed under the License is distributed on an
15   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16   * KIND, either express or implied.  See the License for the
17   * specific language governing permissions and limitations
18   * under the License.
19   * ====================================================================
20   *
21   * This software consists of voluntary contributions made by many
22   * individuals on behalf of the Apache Software Foundation.  For more
23   * information on the Apache Software Foundation, please see
24   * <http://www.apache.org/>.
25   *
26   */
27  
28  package org.apache.hc.core5.net;
29  
30  import java.net.URI;
31  import java.nio.ByteBuffer;
32  import java.nio.CharBuffer;
33  import java.nio.charset.Charset;
34  import java.nio.charset.StandardCharsets;
35  import java.util.ArrayList;
36  import java.util.BitSet;
37  import java.util.List;
38  
39  import org.apache.hc.core5.http.NameValuePair;
40  import org.apache.hc.core5.http.message.BasicNameValuePair;
41  import org.apache.hc.core5.http.message.ParserCursor;
42  import org.apache.hc.core5.http.message.TokenParser;
43  import org.apache.hc.core5.util.Args;
44  import org.apache.hc.core5.util.CharArrayBuffer;
45  
46  /**
47   * A collection of utilities for encoding URLs.
48   *
49   * @since 4.0
50   */
51  public class URLEncodedUtils {
52  
53      /**
54       * The default HTML form content type.
55       */
56      public static final String CONTENT_TYPE = "application/x-www-form-urlencoded";
57  
58      private static final char QP_SEP_A = '&';
59      private static final char QP_SEP_S = ';';
60      private static final String NAME_VALUE_SEPARATOR = "=";
61  
62      /**
63       * Returns a list of {@link NameValuePair NameValuePairs} as built from the URI's query portion. For example, a URI
64       * of {@code http://example.org/path/to/file?a=1&b=2&c=3} would return a list of three NameValuePairs, one for a=1,
65       * one for b=2, and one for c=3. By convention, {@code '&'} and {@code ';'} are accepted as parameter separators.
66       * <p>
67       * This is typically useful while parsing an HTTP PUT.
68       *
69       * This API is currently only used for testing.
70       *
71       * @param uri
72       *        URI to parse
73       * @param charset
74       *        Charset to use while parsing the query
75       * @return a list of {@link NameValuePair} as built from the URI's query portion.
76       *
77       * @since 4.5
78       */
79      public static List <NameValuePair> parse(final URI uri, final Charset charset) {
80          Args.notNull(uri, "URI");
81          final String query = uri.getRawQuery();
82          if (query != null && !query.isEmpty()) {
83              return parse(query, charset);
84          }
85          return createEmptyList();
86      }
87  
88      /**
89       * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using the given character
90       * encoding. By convention, {@code '&'} and {@code ';'} are accepted as parameter separators.
91       *
92       * @param s
93       *            text to parse.
94       * @param charset
95       *            Encoding to use when decoding the parameters.
96       * @return a list of {@link NameValuePair} as built from the URI's query portion.
97       *
98       * @since 4.2
99       */
100     public static List<NameValuePair> parse(final String s, final Charset charset) {
101         if (s == null) {
102             return createEmptyList();
103         }
104         final CharArrayBuffer buffer = new CharArrayBuffer(s.length());
105         buffer.append(s);
106         return parse(buffer, charset, QP_SEP_A, QP_SEP_S);
107     }
108 
109     /**
110      * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using the given character
111      * encoding.
112      *
113      * @param s
114      *            text to parse.
115      * @param charset
116      *            Encoding to use when decoding the parameters.
117      * @param separators
118      *            element separators.
119      * @return a list of {@link NameValuePair} as built from the URI's query portion.
120      *
121      * @since 4.3
122      */
123     public static List<NameValuePair> parse(final String s, final Charset charset, final char... separators) {
124         if (s == null) {
125             return createEmptyList();
126         }
127         final CharArrayBuffer buffer = new CharArrayBuffer(s.length());
128         buffer.append(s);
129         return parse(buffer, charset, separators);
130     }
131 
132     /**
133      * Returns a list of {@link NameValuePair NameValuePairs} as parsed from the given string using
134      * the given character encoding.
135      *
136      * @param buf
137      *            text to parse.
138      * @param charset
139      *            Encoding to use when decoding the parameters.
140      * @param separators
141      *            element separators.
142      * @return a list of {@link NameValuePair} as built from the URI's query portion.
143      *
144      * @since 4.4
145      */
146     public static List<NameValuePair> parse(
147             final CharArrayBuffer buf, final Charset charset, final char... separators) {
148         Args.notNull(buf, "Char array buffer");
149         final TokenParser tokenParser = TokenParser.INSTANCE;
150         final BitSet delimSet = new BitSet();
151         for (final char separator: separators) {
152             delimSet.set(separator);
153         }
154         final ParserCursor cursor = new ParserCursor(0, buf.length());
155         final List<NameValuePair> list = new ArrayList<>();
156         while (!cursor.atEnd()) {
157             delimSet.set('=');
158             final String name = tokenParser.parseToken(buf, cursor, delimSet);
159             String value = null;
160             if (!cursor.atEnd()) {
161                 final int delim = buf.charAt(cursor.getPos());
162                 cursor.updatePos(cursor.getPos() + 1);
163                 if (delim == '=') {
164                     delimSet.clear('=');
165                     value = tokenParser.parseValue(buf, cursor, delimSet);
166                     if (!cursor.atEnd()) {
167                         cursor.updatePos(cursor.getPos() + 1);
168                     }
169                 }
170             }
171             if (!name.isEmpty()) {
172                 list.add(new BasicNameValuePair(
173                         decodeFormFields(name, charset),
174                         decodeFormFields(value, charset)));
175             }
176         }
177         return list;
178     }
179 
180     /**
181      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
182      * list of parameters in an HTTP PUT or HTTP POST.
183      *
184      * @param parameters  The parameters to include.
185      * @param charset The encoding to use.
186      * @return An {@code application/x-www-form-urlencoded} string
187      *
188      * @since 4.2
189      */
190     public static String format(
191             final Iterable<? extends NameValuePair> parameters,
192             final Charset charset) {
193         return format(parameters, QP_SEP_A, charset);
194     }
195 
196     /**
197      * Returns a String that is suitable for use as an {@code application/x-www-form-urlencoded}
198      * list of parameters in an HTTP PUT or HTTP POST.
199      *
200      * @param parameters  The parameters to include.
201      * @param parameterSeparator The parameter separator, by convention, {@code '&'} or {@code ';'}.
202      * @param charset The encoding to use.
203      * @return An {@code application/x-www-form-urlencoded} string
204      *
205      * @since 4.3
206      */
207     public static String format(
208             final Iterable<? extends NameValuePair> parameters,
209             final char parameterSeparator,
210             final Charset charset) {
211         Args.notNull(parameters, "Parameters");
212         final StringBuilder result = new StringBuilder();
213         for (final NameValuePair parameter : parameters) {
214             final String encodedName = encodeFormFields(parameter.getName(), charset);
215             final String encodedValue = encodeFormFields(parameter.getValue(), charset);
216             if (result.length() > 0) {
217                 result.append(parameterSeparator);
218             }
219             result.append(encodedName);
220             if (encodedValue != null) {
221                 result.append(NAME_VALUE_SEPARATOR);
222                 result.append(encodedValue);
223             }
224         }
225         return result.toString();
226     }
227 
228     /**
229      * Unreserved characters, i.e. alphanumeric, plus: {@code _ - ! . ~ ' ( ) *}
230      * <p>
231      *  This list is the same as the {@code unreserved} list in
232      *  <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
233      */
234     private static final BitSet UNRESERVED   = new BitSet(256);
235     /**
236      * Punctuation characters: , ; : $ & + =
237      * <p>
238      * These are the additional characters allowed by userinfo.
239      */
240     private static final BitSet PUNCT        = new BitSet(256);
241     /** Characters which are safe to use in userinfo,
242      * i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation */
243     private static final BitSet USERINFO     = new BitSet(256);
244     /** Characters which are safe to use in a path,
245      * i.e. {@link #UNRESERVED} plus {@link #PUNCT}uation plus / @ */
246     private static final BitSet PATHSAFE     = new BitSet(256);
247     /** Characters which are safe to use in a query or a fragment,
248      * i.e. {@link #RESERVED} plus {@link #UNRESERVED} */
249     private static final BitSet URIC     = new BitSet(256);
250 
251     /**
252      * Reserved characters, i.e. {@code ;/?:@&=+$,[]}
253      * <p>
254      *  This list is the same as the {@code reserved} list in
255      *  <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
256      *  as augmented by
257      *  <a href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>
258      */
259     private static final BitSet RESERVED     = new BitSet(256);
260 
261 
262     /**
263      * Safe characters for x-www-form-urlencoded data, as per java.net.URLEncoder and browser behaviour,
264      * i.e. alphanumeric plus {@code "-", "_", ".", "*"}
265      */
266     private static final BitSet URLENCODER   = new BitSet(256);
267 
268     static {
269         // unreserved chars
270         // alpha characters
271         for (int i = 'a'; i <= 'z'; i++) {
272             UNRESERVED.set(i);
273         }
274         for (int i = 'A'; i <= 'Z'; i++) {
275             UNRESERVED.set(i);
276         }
277         // numeric characters
278         for (int i = '0'; i <= '9'; i++) {
279             UNRESERVED.set(i);
280         }
281         UNRESERVED.set('_'); // these are the charactes of the "mark" list
282         UNRESERVED.set('-');
283         UNRESERVED.set('.');
284         UNRESERVED.set('*');
285         URLENCODER.or(UNRESERVED); // skip remaining unreserved characters
286         UNRESERVED.set('!');
287         UNRESERVED.set('~');
288         UNRESERVED.set('\'');
289         UNRESERVED.set('(');
290         UNRESERVED.set(')');
291         // punct chars
292         PUNCT.set(',');
293         PUNCT.set(';');
294         PUNCT.set(':');
295         PUNCT.set('$');
296         PUNCT.set('&');
297         PUNCT.set('+');
298         PUNCT.set('=');
299         // Safe for userinfo
300         USERINFO.or(UNRESERVED);
301         USERINFO.or(PUNCT);
302 
303         // URL path safe
304         PATHSAFE.or(UNRESERVED);
305         PATHSAFE.set('/'); // segment separator
306         PATHSAFE.set(';'); // param separator
307         PATHSAFE.set(':'); // rest as per list in 2396, i.e. : @ & = + $ ,
308         PATHSAFE.set('@');
309         PATHSAFE.set('&');
310         PATHSAFE.set('=');
311         PATHSAFE.set('+');
312         PATHSAFE.set('$');
313         PATHSAFE.set(',');
314 
315         RESERVED.set(';');
316         RESERVED.set('/');
317         RESERVED.set('?');
318         RESERVED.set(':');
319         RESERVED.set('@');
320         RESERVED.set('&');
321         RESERVED.set('=');
322         RESERVED.set('+');
323         RESERVED.set('$');
324         RESERVED.set(',');
325         RESERVED.set('['); // added by RFC 2732
326         RESERVED.set(']'); // added by RFC 2732
327 
328         URIC.or(RESERVED);
329         URIC.or(UNRESERVED);
330     }
331 
332     private static final int RADIX = 16;
333 
334     private static List<NameValuePair> createEmptyList() {
335         return new ArrayList<>(0);
336     }
337 
338     private static String urlEncode(
339             final String content,
340             final Charset charset,
341             final BitSet safechars,
342             final boolean blankAsPlus) {
343         if (content == null) {
344             return null;
345         }
346         final StringBuilder buf = new StringBuilder();
347         final ByteBuffer bb = charset.encode(content);
348         while (bb.hasRemaining()) {
349             final int b = bb.get() & 0xff;
350             if (safechars.get(b)) {
351                 buf.append((char) b);
352             } else if (blankAsPlus && b == ' ') {
353                 buf.append('+');
354             } else {
355                 buf.append("%");
356                 final char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, RADIX));
357                 final char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, RADIX));
358                 buf.append(hex1);
359                 buf.append(hex2);
360             }
361         }
362         return buf.toString();
363     }
364 
365     /**
366      * Decode/unescape a portion of a URL, to use with the query part ensure {@code plusAsBlank} is true.
367      *
368      * @param content the portion to decode
369      * @param charset the charset to use
370      * @param plusAsBlank if {@code true}, then convert '+' to space (e.g. for www-url-form-encoded content), otherwise leave as is.
371      * @return encoded string
372      */
373     private static String urlDecode(
374             final String content,
375             final Charset charset,
376             final boolean plusAsBlank) {
377         if (content == null) {
378             return null;
379         }
380         final ByteBuffer bb = ByteBuffer.allocate(content.length());
381         final CharBuffer cb = CharBuffer.wrap(content);
382         while (cb.hasRemaining()) {
383             final char c = cb.get();
384             if (c == '%' && cb.remaining() >= 2) {
385                 final char uc = cb.get();
386                 final char lc = cb.get();
387                 final int u = Character.digit(uc, 16);
388                 final int l = Character.digit(lc, 16);
389                 if (u != -1 && l != -1) {
390                     bb.put((byte) ((u << 4) + l));
391                 } else {
392                     bb.put((byte) '%');
393                     bb.put((byte) uc);
394                     bb.put((byte) lc);
395                 }
396             } else if (plusAsBlank && c == '+') {
397                 bb.put((byte) ' ');
398             } else {
399                 bb.put((byte) c);
400             }
401         }
402         bb.flip();
403         return charset.decode(bb).toString();
404     }
405 
406     /**
407      * Decode/unescape www-url-form-encoded content.
408      *
409      * @param content the content to decode, will decode '+' as space
410      * @param charset the charset to use
411      * @return encoded string
412      */
413     private static String decodeFormFields (final String content, final Charset charset) {
414         if (content == null) {
415             return null;
416         }
417         return urlDecode(content, charset != null ? charset : StandardCharsets.UTF_8, true);
418     }
419 
420     /**
421      * Encode/escape www-url-form-encoded content.
422      * <p>
423      * Uses the {@link #URLENCODER} set of characters, rather than
424      * the {@link #UNRESERVED} set; this is for compatibilty with previous
425      * releases, URLEncoder.encode() and most browsers.
426      *
427      * @param content the content to encode, will convert space to '+'
428      * @param charset the charset to use
429      * @return encoded string
430      */
431     private static String encodeFormFields (final String content, final Charset charset) {
432         if (content == null) {
433             return null;
434         }
435         return urlEncode(content, charset != null ? charset : StandardCharsets.UTF_8, URLENCODER, true);
436     }
437 
438     /**
439      * Encode a String using the {@link #USERINFO} set of characters.
440      * <p>
441      * Used by URIBuilder to encode the userinfo segment.
442      *
443      * @param content the string to encode, does not convert space to '+'
444      * @param charset the charset to use
445      * @return the encoded string
446      */
447     static String encUserInfo(final String content, final Charset charset) {
448         return urlEncode(content, charset, USERINFO, false);
449     }
450 
451     /**
452      * Encode a String using the {@link #URIC} set of characters.
453      * <p>
454      * Used by URIBuilder to encode the query and fragment segments.
455      *
456      * @param content the string to encode, does not convert space to '+'
457      * @param charset the charset to use
458      * @return the encoded string
459      */
460     static String encUric(final String content, final Charset charset) {
461         return urlEncode(content, charset, URIC, false);
462     }
463 
464     /**
465      * Encode a String using the {@link #PATHSAFE} set of characters.
466      * <p>
467      * Used by URIBuilder to encode path segments.
468      *
469      * @param content the string to encode, does not convert space to '+'
470      * @param charset the charset to use
471      * @return the encoded string
472      */
473     static String encPath(final String content, final Charset charset) {
474         return urlEncode(content, charset, PATHSAFE, false);
475     }
476 
477 }