View Javadoc

1   /*
2    * ====================================================================
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *   http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing,
14   * software distributed under the License is distributed on an
15   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16   * KIND, either express or implied.  See the License for the
17   * specific language governing permissions and limitations
18   * under the License.
19   * ====================================================================
20   *
21   * This software consists of voluntary contributions made by many
22   * individuals on behalf of the Apache Software Foundation.  For more
23   * information on the Apache Software Foundation, please see
24   * <http://www.apache.org/>.
25   *
26   */
27  
28  package org.apache.http.message;
29  
30  import java.util.NoSuchElementException;
31  
32  import org.apache.http.HeaderIterator;
33  import org.apache.http.ParseException;
34  import org.apache.http.TokenIterator;
35  import org.apache.http.annotation.NotThreadSafe;
36  import org.apache.http.util.Args;
37  
38  /**
39   * Basic implementation of a {@link TokenIterator}.
40   * This implementation parses <tt>#token<tt> sequences as
41   * defined by RFC 2616, section 2.
42   * It extends that definition somewhat beyond US-ASCII.
43   *
44   * @since 4.0
45   */
46  @NotThreadSafe
47  public class BasicTokenIterator implements TokenIterator {
48  
49      /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */
50      // the order of the characters here is adjusted to put the
51      // most likely candidates at the beginning of the collection
52      public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t";
53  
54  
55      /** The iterator from which to obtain the next header. */
56      protected final HeaderIterator headerIt;
57  
58      /**
59       * The value of the current header.
60       * This is the header value that includes {@link #currentToken}.
61       * Undefined if the iteration is over.
62       */
63      protected String currentHeader;
64  
65      /**
66       * The token to be returned by the next call to {@link #nextToken()}.
67       * <code>null</code> if the iteration is over.
68       */
69      protected String currentToken;
70  
71      /**
72       * The position after {@link #currentToken} in {@link #currentHeader}.
73       * Undefined if the iteration is over.
74       */
75      protected int searchPos;
76  
77  
78      /**
79       * Creates a new instance of {@link BasicTokenIterator}.
80       *
81       * @param headerIterator    the iterator for the headers to tokenize
82       */
83      public BasicTokenIterator(final HeaderIterator headerIterator) {
84          super();
85          this.headerIt = Args.notNull(headerIterator, "Header iterator");
86          this.searchPos = findNext(-1);
87      }
88  
89  
90      // non-javadoc, see interface TokenIterator
91      @Override
92      public boolean hasNext() {
93          return (this.currentToken != null);
94      }
95  
96  
97      /**
98       * Obtains the next token from this iteration.
99       *
100      * @return  the next token in this iteration
101      *
102      * @throws NoSuchElementException   if the iteration is already over
103      * @throws ParseException   if an invalid header value is encountered
104      */
105     @Override
106     public String nextToken()
107         throws NoSuchElementException, ParseException {
108 
109         if (this.currentToken == null) {
110             throw new NoSuchElementException("Iteration already finished.");
111         }
112 
113         final String result = this.currentToken;
114         // updates currentToken, may trigger ParseException:
115         this.searchPos = findNext(this.searchPos);
116 
117         return result;
118     }
119 
120 
121     /**
122      * Returns the next token.
123      * Same as {@link #nextToken}, but with generic return type.
124      *
125      * @return  the next token in this iteration
126      *
127      * @throws NoSuchElementException   if there are no more tokens
128      * @throws ParseException   if an invalid header value is encountered
129      */
130     @Override
131     public final Object next()
132         throws NoSuchElementException, ParseException {
133         return nextToken();
134     }
135 
136 
137     /**
138      * Removing tokens is not supported.
139      *
140      * @throws UnsupportedOperationException    always
141      */
142     @Override
143     public final void remove()
144         throws UnsupportedOperationException {
145 
146         throw new UnsupportedOperationException
147             ("Removing tokens is not supported.");
148     }
149 
150 
151     /**
152      * Determines the next token.
153      * If found, the token is stored in {@link #currentToken}.
154      * The return value indicates the position after the token
155      * in {@link #currentHeader}. If necessary, the next header
156      * will be obtained from {@link #headerIt}.
157      * If not found, {@link #currentToken} is set to <code>null</code>.
158      *
159      * @param pos       the position in the current header at which to
160      *                  start the search, -1 to search in the first header
161      *
162      * @return  the position after the found token in the current header, or
163      *          negative if there was no next token
164      *
165      * @throws ParseException   if an invalid header value is encountered
166      */
167     protected int findNext(final int pos) throws ParseException {
168         int from = pos;
169         if (from < 0) {
170             // called from the constructor, initialize the first header
171             if (!this.headerIt.hasNext()) {
172                 return -1;
173             }
174             this.currentHeader = this.headerIt.nextHeader().getValue();
175             from = 0;
176         } else {
177             // called after a token, make sure there is a separator
178             from = findTokenSeparator(from);
179         }
180 
181         final int start = findTokenStart(from);
182         if (start < 0) {
183             this.currentToken = null;
184             return -1; // nothing found
185         }
186 
187         final int end = findTokenEnd(start);
188         this.currentToken = createToken(this.currentHeader, start, end);
189         return end;
190     }
191 
192 
193     /**
194      * Creates a new token to be returned.
195      * Called from {@link #findNext findNext} after the token is identified.
196      * The default implementation simply calls
197      * {@link java.lang.String#substring String.substring}.
198      * <br/>
199      * If header values are significantly longer than tokens, and some
200      * tokens are permanently referenced by the application, there can
201      * be problems with garbage collection. A substring will hold a
202      * reference to the full characters of the original string and
203      * therefore occupies more memory than might be expected.
204      * To avoid this, override this method and create a new string
205      * instead of a substring.
206      *
207      * @param value     the full header value from which to create a token
208      * @param start     the index of the first token character
209      * @param end       the index after the last token character
210      *
211      * @return  a string representing the token identified by the arguments
212      */
213     protected String createToken(final String value, final int start, final int end) {
214         return value.substring(start, end);
215     }
216 
217 
218     /**
219      * Determines the starting position of the next token.
220      * This method will iterate over headers if necessary.
221      *
222      * @param pos       the position in the current header at which to
223      *                  start the search
224      *
225      * @return  the position of the token start in the current header,
226      *          negative if no token start could be found
227      */
228     protected int findTokenStart(final int pos) {
229         int from = Args.notNegative(pos, "Search position");
230         boolean found = false;
231         while (!found && (this.currentHeader != null)) {
232 
233             final int to = this.currentHeader.length();
234             while (!found && (from < to)) {
235 
236                 final char ch = this.currentHeader.charAt(from);
237                 if (isTokenSeparator(ch) || isWhitespace(ch)) {
238                     // whitspace and token separators are skipped
239                     from++;
240                 } else if (isTokenChar(this.currentHeader.charAt(from))) {
241                     // found the start of a token
242                     found = true;
243                 } else {
244                     throw new ParseException
245                         ("Invalid character before token (pos " + from +
246                          "): " + this.currentHeader);
247                 }
248             }
249             if (!found) {
250                 if (this.headerIt.hasNext()) {
251                     this.currentHeader = this.headerIt.nextHeader().getValue();
252                     from = 0;
253                 } else {
254                     this.currentHeader = null;
255                 }
256             }
257         } // while headers
258 
259         return found ? from : -1;
260     }
261 
262 
263     /**
264      * Determines the position of the next token separator.
265      * Because of multi-header joining rules, the end of a
266      * header value is a token separator. This method does
267      * therefore not need to iterate over headers.
268      *
269      * @param pos       the position in the current header at which to
270      *                  start the search
271      *
272      * @return  the position of a token separator in the current header,
273      *          or at the end
274      *
275      * @throws ParseException
276      *         if a new token is found before a token separator.
277      *         RFC 2616, section 2.1 explicitly requires a comma between
278      *         tokens for <tt>#</tt>.
279      */
280     protected int findTokenSeparator(final int pos) {
281         int from = Args.notNegative(pos, "Search position");
282         boolean found = false;
283         final int to = this.currentHeader.length();
284         while (!found && (from < to)) {
285             final char ch = this.currentHeader.charAt(from);
286             if (isTokenSeparator(ch)) {
287                 found = true;
288             } else if (isWhitespace(ch)) {
289                 from++;
290             } else if (isTokenChar(ch)) {
291                 throw new ParseException
292                     ("Tokens without separator (pos " + from +
293                      "): " + this.currentHeader);
294             } else {
295                 throw new ParseException
296                     ("Invalid character after token (pos " + from +
297                      "): " + this.currentHeader);
298             }
299         }
300 
301         return from;
302     }
303 
304 
305     /**
306      * Determines the ending position of the current token.
307      * This method will not leave the current header value,
308      * since the end of the header value is a token boundary.
309      *
310      * @param from      the position of the first character of the token
311      *
312      * @return  the position after the last character of the token.
313      *          The behavior is undefined if <code>from</code> does not
314      *          point to a token character in the current header value.
315      */
316     protected int findTokenEnd(final int from) {
317         Args.notNegative(from, "Search position");
318         final int to = this.currentHeader.length();
319         int end = from+1;
320         while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) {
321             end++;
322         }
323 
324         return end;
325     }
326 
327 
328     /**
329      * Checks whether a character is a token separator.
330      * RFC 2616, section 2.1 defines comma as the separator for
331      * <tt>#token</tt> sequences. The end of a header value will
332      * also separate tokens, but that is not a character check.
333      *
334      * @param ch        the character to check
335      *
336      * @return  <code>true</code> if the character is a token separator,
337      *          <code>false</code> otherwise
338      */
339     protected boolean isTokenSeparator(final char ch) {
340         return (ch == ',');
341     }
342 
343 
344     /**
345      * Checks whether a character is a whitespace character.
346      * RFC 2616, section 2.2 defines space and horizontal tab as whitespace.
347      * The optional preceeding line break is irrelevant, since header
348      * continuation is handled transparently when parsing messages.
349      *
350      * @param ch        the character to check
351      *
352      * @return  <code>true</code> if the character is whitespace,
353      *          <code>false</code> otherwise
354      */
355     protected boolean isWhitespace(final char ch) {
356 
357         // we do not use Character.isWhitspace(ch) here, since that allows
358         // many control characters which are not whitespace as per RFC 2616
359         return ((ch == '\t') || Character.isSpaceChar(ch));
360     }
361 
362 
363     /**
364      * Checks whether a character is a valid token character.
365      * Whitespace, control characters, and HTTP separators are not
366      * valid token characters. The HTTP specification (RFC 2616, section 2.2)
367      * defines tokens only for the US-ASCII character set, this
368      * method extends the definition to other character sets.
369      *
370      * @param ch        the character to check
371      *
372      * @return  <code>true</code> if the character is a valid token start,
373      *          <code>false</code> otherwise
374      */
375     protected boolean isTokenChar(final char ch) {
376 
377         // common sense extension of ALPHA + DIGIT
378         if (Character.isLetterOrDigit(ch)) {
379             return true;
380         }
381 
382         // common sense extension of CTL
383         if (Character.isISOControl(ch)) {
384             return false;
385         }
386 
387         // no common sense extension for this
388         if (isHttpSeparator(ch)) {
389             return false;
390         }
391 
392         // RFC 2616, section 2.2 defines a token character as
393         // "any CHAR except CTLs or separators". The controls
394         // and separators are included in the checks above.
395         // This will yield unexpected results for Unicode format characters.
396         // If that is a problem, overwrite isHttpSeparator(char) to filter
397         // out the false positives.
398         return true;
399     }
400 
401 
402     /**
403      * Checks whether a character is an HTTP separator.
404      * The implementation in this class checks only for the HTTP separators
405      * defined in RFC 2616, section 2.2. If you need to detect other
406      * separators beyond the US-ASCII character set, override this method.
407      *
408      * @param ch        the character to check
409      *
410      * @return  <code>true</code> if the character is an HTTP separator
411      */
412     protected boolean isHttpSeparator(final char ch) {
413         return (HTTP_SEPARATORS.indexOf(ch) >= 0);
414     }
415 
416 
417 } // class BasicTokenIterator
418