View Javadoc

1   /*
2    * ====================================================================
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *   http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing,
14   * software distributed under the License is distributed on an
15   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16   * KIND, either express or implied.  See the License for the
17   * specific language governing permissions and limitations
18   * under the License.
19   * ====================================================================
20   *
21   * This software consists of voluntary contributions made by many
22   * individuals on behalf of the Apache Software Foundation.  For more
23   * information on the Apache Software Foundation, please see
24   * <http://www.apache.org/>.
25   *
26   */
27  
28  package org.apache.http.message;
29  
30  import java.util.NoSuchElementException;
31  
32  import org.apache.http.HeaderIterator;
33  import org.apache.http.ParseException;
34  import org.apache.http.TokenIterator;
35  import org.apache.http.annotation.NotThreadSafe;
36  import org.apache.http.util.Args;
37  
38  /**
39   * Basic implementation of a {@link TokenIterator}.
40   * This implementation parses <tt>#token<tt> sequences as
41   * defined by RFC 2616, section 2.
42   * It extends that definition somewhat beyond US-ASCII.
43   *
44   * @since 4.0
45   */
46  @NotThreadSafe
47  public class BasicTokenIterator implements TokenIterator {
48  
49      /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */
50      // the order of the characters here is adjusted to put the
51      // most likely candidates at the beginning of the collection
52      public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t";
53  
54  
55      /** The iterator from which to obtain the next header. */
56      protected final HeaderIterator headerIt;
57  
58      /**
59       * The value of the current header.
60       * This is the header value that includes {@link #currentToken}.
61       * Undefined if the iteration is over.
62       */
63      protected String currentHeader;
64  
65      /**
66       * The token to be returned by the next call to {@link #currentToken}.
67       * <code>null</code> if the iteration is over.
68       */
69      protected String currentToken;
70  
71      /**
72       * The position after {@link #currentToken} in {@link #currentHeader}.
73       * Undefined if the iteration is over.
74       */
75      protected int searchPos;
76  
77  
78      /**
79       * Creates a new instance of {@link BasicTokenIterator}.
80       *
81       * @param headerIterator    the iterator for the headers to tokenize
82       */
83      public BasicTokenIterator(final HeaderIterator headerIterator) {
84          super();
85          this.headerIt = Args.notNull(headerIterator, "Header iterator");
86          this.searchPos = findNext(-1);
87      }
88  
89  
90      // non-javadoc, see interface TokenIterator
91      public boolean hasNext() {
92          return (this.currentToken != null);
93      }
94  
95  
96      /**
97       * Obtains the next token from this iteration.
98       *
99       * @return  the next token in this iteration
100      *
101      * @throws NoSuchElementException   if the iteration is already over
102      * @throws ParseException   if an invalid header value is encountered
103      */
104     public String nextToken()
105         throws NoSuchElementException, ParseException {
106 
107         if (this.currentToken == null) {
108             throw new NoSuchElementException("Iteration already finished.");
109         }
110 
111         final String result = this.currentToken;
112         // updates currentToken, may trigger ParseException:
113         this.searchPos = findNext(this.searchPos);
114 
115         return result;
116     }
117 
118 
119     /**
120      * Returns the next token.
121      * Same as {@link #nextToken}, but with generic return type.
122      *
123      * @return  the next token in this iteration
124      *
125      * @throws NoSuchElementException   if there are no more tokens
126      * @throws ParseException   if an invalid header value is encountered
127      */
128     public final Object next()
129         throws NoSuchElementException, ParseException {
130         return nextToken();
131     }
132 
133 
134     /**
135      * Removing tokens is not supported.
136      *
137      * @throws UnsupportedOperationException    always
138      */
139     public final void remove()
140         throws UnsupportedOperationException {
141 
142         throw new UnsupportedOperationException
143             ("Removing tokens is not supported.");
144     }
145 
146 
147     /**
148      * Determines the next token.
149      * If found, the token is stored in {@link #currentToken}.
150      * The return value indicates the position after the token
151      * in {@link #currentHeader}. If necessary, the next header
152      * will be obtained from {@link #headerIt}.
153      * If not found, {@link #currentToken} is set to <code>null</code>.
154      *
155      * @param from      the position in the current header at which to
156      *                  start the search, -1 to search in the first header
157      *
158      * @return  the position after the found token in the current header, or
159      *          negative if there was no next token
160      *
161      * @throws ParseException   if an invalid header value is encountered
162      */
163     protected int findNext(int from)
164         throws ParseException {
165 
166         if (from < 0) {
167             // called from the constructor, initialize the first header
168             if (!this.headerIt.hasNext()) {
169                 return -1;
170             }
171             this.currentHeader = this.headerIt.nextHeader().getValue();
172             from = 0;
173         } else {
174             // called after a token, make sure there is a separator
175             from = findTokenSeparator(from);
176         }
177 
178         final int start = findTokenStart(from);
179         if (start < 0) {
180             this.currentToken = null;
181             return -1; // nothing found
182         }
183 
184         final int end = findTokenEnd(start);
185         this.currentToken = createToken(this.currentHeader, start, end);
186         return end;
187     }
188 
189 
190     /**
191      * Creates a new token to be returned.
192      * Called from {@link #findNext findNext} after the token is identified.
193      * The default implementation simply calls
194      * {@link java.lang.String#substring String.substring}.
195      * <br/>
196      * If header values are significantly longer than tokens, and some
197      * tokens are permanently referenced by the application, there can
198      * be problems with garbage collection. A substring will hold a
199      * reference to the full characters of the original string and
200      * therefore occupies more memory than might be expected.
201      * To avoid this, override this method and create a new string
202      * instead of a substring.
203      *
204      * @param value     the full header value from which to create a token
205      * @param start     the index of the first token character
206      * @param end       the index after the last token character
207      *
208      * @return  a string representing the token identified by the arguments
209      */
210     protected String createToken(final String value, final int start, final int end) {
211         return value.substring(start, end);
212     }
213 
214 
215     /**
216      * Determines the starting position of the next token.
217      * This method will iterate over headers if necessary.
218      *
219      * @param from      the position in the current header at which to
220      *                  start the search
221      *
222      * @return  the position of the token start in the current header,
223      *          negative if no token start could be found
224      */
225     protected int findTokenStart(int from) {
226         Args.notNegative(from, "Search position");
227         boolean found = false;
228         while (!found && (this.currentHeader != null)) {
229 
230             final int to = this.currentHeader.length();
231             while (!found && (from < to)) {
232 
233                 final char ch = this.currentHeader.charAt(from);
234                 if (isTokenSeparator(ch) || isWhitespace(ch)) {
235                     // whitspace and token separators are skipped
236                     from++;
237                 } else if (isTokenChar(this.currentHeader.charAt(from))) {
238                     // found the start of a token
239                     found = true;
240                 } else {
241                     throw new ParseException
242                         ("Invalid character before token (pos " + from +
243                          "): " + this.currentHeader);
244                 }
245             }
246             if (!found) {
247                 if (this.headerIt.hasNext()) {
248                     this.currentHeader = this.headerIt.nextHeader().getValue();
249                     from = 0;
250                 } else {
251                     this.currentHeader = null;
252                 }
253             }
254         } // while headers
255 
256         return found ? from : -1;
257     }
258 
259 
260     /**
261      * Determines the position of the next token separator.
262      * Because of multi-header joining rules, the end of a
263      * header value is a token separator. This method does
264      * therefore not need to iterate over headers.
265      *
266      * @param from      the position in the current header at which to
267      *                  start the search
268      *
269      * @return  the position of a token separator in the current header,
270      *          or at the end
271      *
272      * @throws ParseException
273      *         if a new token is found before a token separator.
274      *         RFC 2616, section 2.1 explicitly requires a comma between
275      *         tokens for <tt>#</tt>.
276      */
277     protected int findTokenSeparator(int from) {
278         Args.notNegative(from, "Search position");
279         boolean found = false;
280         final int to = this.currentHeader.length();
281         while (!found && (from < to)) {
282             final char ch = this.currentHeader.charAt(from);
283             if (isTokenSeparator(ch)) {
284                 found = true;
285             } else if (isWhitespace(ch)) {
286                 from++;
287             } else if (isTokenChar(ch)) {
288                 throw new ParseException
289                     ("Tokens without separator (pos " + from +
290                      "): " + this.currentHeader);
291             } else {
292                 throw new ParseException
293                     ("Invalid character after token (pos " + from +
294                      "): " + this.currentHeader);
295             }
296         }
297 
298         return from;
299     }
300 
301 
302     /**
303      * Determines the ending position of the current token.
304      * This method will not leave the current header value,
305      * since the end of the header value is a token boundary.
306      *
307      * @param from      the position of the first character of the token
308      *
309      * @return  the position after the last character of the token.
310      *          The behavior is undefined if <code>from</code> does not
311      *          point to a token character in the current header value.
312      */
313     protected int findTokenEnd(final int from) {
314         Args.notNegative(from, "Search position");
315         final int to = this.currentHeader.length();
316         int end = from+1;
317         while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) {
318             end++;
319         }
320 
321         return end;
322     }
323 
324 
325     /**
326      * Checks whether a character is a token separator.
327      * RFC 2616, section 2.1 defines comma as the separator for
328      * <tt>#token</tt> sequences. The end of a header value will
329      * also separate tokens, but that is not a character check.
330      *
331      * @param ch        the character to check
332      *
333      * @return  <code>true</code> if the character is a token separator,
334      *          <code>false</code> otherwise
335      */
336     protected boolean isTokenSeparator(final char ch) {
337         return (ch == ',');
338     }
339 
340 
341     /**
342      * Checks whether a character is a whitespace character.
343      * RFC 2616, section 2.2 defines space and horizontal tab as whitespace.
344      * The optional preceeding line break is irrelevant, since header
345      * continuation is handled transparently when parsing messages.
346      *
347      * @param ch        the character to check
348      *
349      * @return  <code>true</code> if the character is whitespace,
350      *          <code>false</code> otherwise
351      */
352     protected boolean isWhitespace(final char ch) {
353 
354         // we do not use Character.isWhitspace(ch) here, since that allows
355         // many control characters which are not whitespace as per RFC 2616
356         return ((ch == '\t') || Character.isSpaceChar(ch));
357     }
358 
359 
360     /**
361      * Checks whether a character is a valid token character.
362      * Whitespace, control characters, and HTTP separators are not
363      * valid token characters. The HTTP specification (RFC 2616, section 2.2)
364      * defines tokens only for the US-ASCII character set, this
365      * method extends the definition to other character sets.
366      *
367      * @param ch        the character to check
368      *
369      * @return  <code>true</code> if the character is a valid token start,
370      *          <code>false</code> otherwise
371      */
372     protected boolean isTokenChar(final char ch) {
373 
374         // common sense extension of ALPHA + DIGIT
375         if (Character.isLetterOrDigit(ch)) {
376             return true;
377         }
378 
379         // common sense extension of CTL
380         if (Character.isISOControl(ch)) {
381             return false;
382         }
383 
384         // no common sense extension for this
385         if (isHttpSeparator(ch)) {
386             return false;
387         }
388 
389         // RFC 2616, section 2.2 defines a token character as
390         // "any CHAR except CTLs or separators". The controls
391         // and separators are included in the checks above.
392         // This will yield unexpected results for Unicode format characters.
393         // If that is a problem, overwrite isHttpSeparator(char) to filter
394         // out the false positives.
395         return true;
396     }
397 
398 
399     /**
400      * Checks whether a character is an HTTP separator.
401      * The implementation in this class checks only for the HTTP separators
402      * defined in RFC 2616, section 2.2. If you need to detect other
403      * separators beyond the US-ASCII character set, override this method.
404      *
405      * @param ch        the character to check
406      *
407      * @return  <code>true</code> if the character is an HTTP separator
408      */
409     protected boolean isHttpSeparator(final char ch) {
410         return (HTTP_SEPARATORS.indexOf(ch) >= 0);
411     }
412 
413 
414 } // class BasicTokenIterator
415