View Javadoc

1   /*
2    * ====================================================================
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *   http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing,
14   * software distributed under the License is distributed on an
15   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16   * KIND, either express or implied.  See the License for the
17   * specific language governing permissions and limitations
18   * under the License.
19   * ====================================================================
20   *
21   * This software consists of voluntary contributions made by many
22   * individuals on behalf of the Apache Software Foundation.  For more
23   * information on the Apache Software Foundation, please see
24   * <http://www.apache.org/>.
25   *
26   */
27  
28  package org.apache.http.message;
29  
30  import java.util.NoSuchElementException;
31  
32  import org.apache.http.HeaderIterator;
33  import org.apache.http.ParseException;
34  import org.apache.http.TokenIterator;
35  import org.apache.http.annotation.NotThreadSafe;
36  
37  /**
38   * Basic implementation of a {@link TokenIterator}.
39   * This implementation parses <tt>#token<tt> sequences as
40   * defined by RFC 2616, section 2.
41   * It extends that definition somewhat beyond US-ASCII.
42   *
43   * @since 4.0
44   */
45  @NotThreadSafe
46  public class BasicTokenIterator implements TokenIterator {
47  
48      /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */
49      // the order of the characters here is adjusted to put the
50      // most likely candidates at the beginning of the collection
51      public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t";
52  
53  
54      /** The iterator from which to obtain the next header. */
55      protected final HeaderIterator headerIt;
56  
57      /**
58       * The value of the current header.
59       * This is the header value that includes {@link #currentToken}.
60       * Undefined if the iteration is over.
61       */
62      protected String currentHeader;
63  
64      /**
65       * The token to be returned by the next call to {@link #currentToken}.
66       * <code>null</code> if the iteration is over.
67       */
68      protected String currentToken;
69  
70      /**
71       * The position after {@link #currentToken} in {@link #currentHeader}.
72       * Undefined if the iteration is over.
73       */
74      protected int searchPos;
75  
76  
77      /**
78       * Creates a new instance of {@link BasicTokenIterator}.
79       *
80       * @param headerIterator    the iterator for the headers to tokenize
81       */
82      public BasicTokenIterator(final HeaderIterator headerIterator) {
83          if (headerIterator == null) {
84              throw new IllegalArgumentException
85                  ("Header iterator must not be null.");
86          }
87  
88          this.headerIt = headerIterator;
89          this.searchPos = findNext(-1);
90      }
91  
92  
93      // non-javadoc, see interface TokenIterator
94      public boolean hasNext() {
95          return (this.currentToken != null);
96      }
97  
98  
99      /**
100      * Obtains the next token from this iteration.
101      *
102      * @return  the next token in this iteration
103      *
104      * @throws NoSuchElementException   if the iteration is already over
105      * @throws ParseException   if an invalid header value is encountered
106      */
107     public String nextToken()
108         throws NoSuchElementException, ParseException {
109 
110         if (this.currentToken == null) {
111             throw new NoSuchElementException("Iteration already finished.");
112         }
113 
114         final String result = this.currentToken;
115         // updates currentToken, may trigger ParseException:
116         this.searchPos = findNext(this.searchPos);
117 
118         return result;
119     }
120 
121 
122     /**
123      * Returns the next token.
124      * Same as {@link #nextToken}, but with generic return type.
125      *
126      * @return  the next token in this iteration
127      *
128      * @throws NoSuchElementException   if there are no more tokens
129      * @throws ParseException   if an invalid header value is encountered
130      */
131     public final Object next()
132         throws NoSuchElementException, ParseException {
133         return nextToken();
134     }
135 
136 
137     /**
138      * Removing tokens is not supported.
139      *
140      * @throws UnsupportedOperationException    always
141      */
142     public final void remove()
143         throws UnsupportedOperationException {
144 
145         throw new UnsupportedOperationException
146             ("Removing tokens is not supported.");
147     }
148 
149 
150     /**
151      * Determines the next token.
152      * If found, the token is stored in {@link #currentToken}.
153      * The return value indicates the position after the token
154      * in {@link #currentHeader}. If necessary, the next header
155      * will be obtained from {@link #headerIt}.
156      * If not found, {@link #currentToken} is set to <code>null</code>.
157      *
158      * @param from      the position in the current header at which to
159      *                  start the search, -1 to search in the first header
160      *
161      * @return  the position after the found token in the current header, or
162      *          negative if there was no next token
163      *
164      * @throws ParseException   if an invalid header value is encountered
165      */
166     protected int findNext(int from)
167         throws ParseException {
168 
169         if (from < 0) {
170             // called from the constructor, initialize the first header
171             if (!this.headerIt.hasNext()) {
172                 return -1;
173             }
174             this.currentHeader = this.headerIt.nextHeader().getValue();
175             from = 0;
176         } else {
177             // called after a token, make sure there is a separator
178             from = findTokenSeparator(from);
179         }
180 
181         int start = findTokenStart(from);
182         if (start < 0) {
183             this.currentToken = null;
184             return -1; // nothing found
185         }
186 
187         int end = findTokenEnd(start);
188         this.currentToken = createToken(this.currentHeader, start, end);
189         return end;
190     }
191 
192 
193     /**
194      * Creates a new token to be returned.
195      * Called from {@link #findNext findNext} after the token is identified.
196      * The default implementation simply calls
197      * {@link java.lang.String#substring String.substring}.
198      * <br/>
199      * If header values are significantly longer than tokens, and some
200      * tokens are permanently referenced by the application, there can
201      * be problems with garbage collection. A substring will hold a
202      * reference to the full characters of the original string and
203      * therefore occupies more memory than might be expected.
204      * To avoid this, override this method and create a new string
205      * instead of a substring.
206      *
207      * @param value     the full header value from which to create a token
208      * @param start     the index of the first token character
209      * @param end       the index after the last token character
210      *
211      * @return  a string representing the token identified by the arguments
212      */
213     protected String createToken(String value, int start, int end) {
214         return value.substring(start, end);
215     }
216 
217 
218     /**
219      * Determines the starting position of the next token.
220      * This method will iterate over headers if necessary.
221      *
222      * @param from      the position in the current header at which to
223      *                  start the search
224      *
225      * @return  the position of the token start in the current header,
226      *          negative if no token start could be found
227      */
228     protected int findTokenStart(int from) {
229         if (from < 0) {
230             throw new IllegalArgumentException
231                 ("Search position must not be negative: " + from);
232         }
233 
234         boolean found = false;
235         while (!found && (this.currentHeader != null)) {
236 
237             final int to = this.currentHeader.length();
238             while (!found && (from < to)) {
239 
240                 final char ch = this.currentHeader.charAt(from);
241                 if (isTokenSeparator(ch) || isWhitespace(ch)) {
242                     // whitspace and token separators are skipped
243                     from++;
244                 } else if (isTokenChar(this.currentHeader.charAt(from))) {
245                     // found the start of a token
246                     found = true;
247                 } else {
248                     throw new ParseException
249                         ("Invalid character before token (pos " + from +
250                          "): " + this.currentHeader);
251                 }
252             }
253             if (!found) {
254                 if (this.headerIt.hasNext()) {
255                     this.currentHeader = this.headerIt.nextHeader().getValue();
256                     from = 0;
257                 } else {
258                     this.currentHeader = null;
259                 }
260             }
261         } // while headers
262 
263         return found ? from : -1;
264     }
265 
266 
267     /**
268      * Determines the position of the next token separator.
269      * Because of multi-header joining rules, the end of a
270      * header value is a token separator. This method does
271      * therefore not need to iterate over headers.
272      *
273      * @param from      the position in the current header at which to
274      *                  start the search
275      *
276      * @return  the position of a token separator in the current header,
277      *          or at the end
278      *
279      * @throws ParseException
280      *         if a new token is found before a token separator.
281      *         RFC 2616, section 2.1 explicitly requires a comma between
282      *         tokens for <tt>#</tt>.
283      */
284     protected int findTokenSeparator(int from) {
285         if (from < 0) {
286             throw new IllegalArgumentException
287                 ("Search position must not be negative: " + from);
288         }
289 
290         boolean found = false;
291         final int to = this.currentHeader.length();
292         while (!found && (from < to)) {
293             final char ch = this.currentHeader.charAt(from);
294             if (isTokenSeparator(ch)) {
295                 found = true;
296             } else if (isWhitespace(ch)) {
297                 from++;
298             } else if (isTokenChar(ch)) {
299                 throw new ParseException
300                     ("Tokens without separator (pos " + from +
301                      "): " + this.currentHeader);
302             } else {
303                 throw new ParseException
304                     ("Invalid character after token (pos " + from +
305                      "): " + this.currentHeader);
306             }
307         }
308 
309         return from;
310     }
311 
312 
313     /**
314      * Determines the ending position of the current token.
315      * This method will not leave the current header value,
316      * since the end of the header value is a token boundary.
317      *
318      * @param from      the position of the first character of the token
319      *
320      * @return  the position after the last character of the token.
321      *          The behavior is undefined if <code>from</code> does not
322      *          point to a token character in the current header value.
323      */
324     protected int findTokenEnd(int from) {
325         if (from < 0) {
326             throw new IllegalArgumentException
327                 ("Token start position must not be negative: " + from);
328         }
329 
330         final int to = this.currentHeader.length();
331         int end = from+1;
332         while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) {
333             end++;
334         }
335 
336         return end;
337     }
338 
339 
340     /**
341      * Checks whether a character is a token separator.
342      * RFC 2616, section 2.1 defines comma as the separator for
343      * <tt>#token</tt> sequences. The end of a header value will
344      * also separate tokens, but that is not a character check.
345      *
346      * @param ch        the character to check
347      *
348      * @return  <code>true</code> if the character is a token separator,
349      *          <code>false</code> otherwise
350      */
351     protected boolean isTokenSeparator(char ch) {
352         return (ch == ',');
353     }
354 
355 
356     /**
357      * Checks whether a character is a whitespace character.
358      * RFC 2616, section 2.2 defines space and horizontal tab as whitespace.
359      * The optional preceeding line break is irrelevant, since header
360      * continuation is handled transparently when parsing messages.
361      *
362      * @param ch        the character to check
363      *
364      * @return  <code>true</code> if the character is whitespace,
365      *          <code>false</code> otherwise
366      */
367     protected boolean isWhitespace(char ch) {
368 
369         // we do not use Character.isWhitspace(ch) here, since that allows
370         // many control characters which are not whitespace as per RFC 2616
371         return ((ch == '\t') || Character.isSpaceChar(ch));
372     }
373 
374 
375     /**
376      * Checks whether a character is a valid token character.
377      * Whitespace, control characters, and HTTP separators are not
378      * valid token characters. The HTTP specification (RFC 2616, section 2.2)
379      * defines tokens only for the US-ASCII character set, this
380      * method extends the definition to other character sets.
381      *
382      * @param ch        the character to check
383      *
384      * @return  <code>true</code> if the character is a valid token start,
385      *          <code>false</code> otherwise
386      */
387     protected boolean isTokenChar(char ch) {
388 
389         // common sense extension of ALPHA + DIGIT
390         if (Character.isLetterOrDigit(ch))
391             return true;
392 
393         // common sense extension of CTL
394         if (Character.isISOControl(ch))
395             return false;
396 
397         // no common sense extension for this
398         if (isHttpSeparator(ch))
399             return false;
400 
401         // RFC 2616, section 2.2 defines a token character as
402         // "any CHAR except CTLs or separators". The controls
403         // and separators are included in the checks above.
404         // This will yield unexpected results for Unicode format characters.
405         // If that is a problem, overwrite isHttpSeparator(char) to filter
406         // out the false positives.
407         return true;
408     }
409 
410 
411     /**
412      * Checks whether a character is an HTTP separator.
413      * The implementation in this class checks only for the HTTP separators
414      * defined in RFC 2616, section 2.2. If you need to detect other
415      * separators beyond the US-ASCII character set, override this method.
416      *
417      * @param ch        the character to check
418      *
419      * @return  <code>true</code> if the character is an HTTP separator
420      */
421     protected boolean isHttpSeparator(char ch) {
422         return (HTTP_SEPARATORS.indexOf(ch) >= 0);
423     }
424 
425 
426 } // class BasicTokenIterator
427