View Javadoc

1   /*
2    * ====================================================================
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *   http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing,
14   * software distributed under the License is distributed on an
15   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16   * KIND, either express or implied.  See the License for the
17   * specific language governing permissions and limitations
18   * under the License.
19   * ====================================================================
20   *
21   * This software consists of voluntary contributions made by many
22   * individuals on behalf of the Apache Software Foundation.  For more
23   * information on the Apache Software Foundation, please see
24   * <http://www.apache.org/>.
25   *
26   */
27  
28  package org.apache.http.message;
29  
30  import java.util.NoSuchElementException;
31  
32  import org.apache.http.HeaderIterator;
33  import org.apache.http.ParseException;
34  import org.apache.http.TokenIterator;
35  import org.apache.http.annotation.NotThreadSafe;
36  import org.apache.http.util.Args;
37  
38  /**
39   * Basic implementation of a {@link TokenIterator}.
40   * This implementation parses {@code #token} sequences as
41   * defined by RFC 2616, section 2.
42   * It extends that definition somewhat beyond US-ASCII.
43   *
44   * @since 4.0
45   */
46  @NotThreadSafe
47  public class BasicTokenIterator implements TokenIterator {
48  
49      /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */
50      // the order of the characters here is adjusted to put the
51      // most likely candidates at the beginning of the collection
52      public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t";
53  
54  
55      /** The iterator from which to obtain the next header. */
56      protected final HeaderIterator headerIt;
57  
58      /**
59       * The value of the current header.
60       * This is the header value that includes {@link #currentToken}.
61       * Undefined if the iteration is over.
62       */
63      protected String currentHeader;
64  
65      /**
66       * The token to be returned by the next call to {@link #nextToken()}.
67       * {@code null} if the iteration is over.
68       */
69      protected String currentToken;
70  
71      /**
72       * The position after {@link #currentToken} in {@link #currentHeader}.
73       * Undefined if the iteration is over.
74       */
75      protected int searchPos;
76  
77  
78      /**
79       * Creates a new instance of {@link BasicTokenIterator}.
80       *
81       * @param headerIterator    the iterator for the headers to tokenize
82       */
83      public BasicTokenIterator(final HeaderIterator headerIterator) {
84          super();
85          this.headerIt = Args.notNull(headerIterator, "Header iterator");
86          this.searchPos = findNext(-1);
87      }
88  
89  
90      // non-javadoc, see interface TokenIterator
91      @Override
92      public boolean hasNext() {
93          return (this.currentToken != null);
94      }
95  
96  
97      /**
98       * Obtains the next token from this iteration.
99       *
100      * @return  the next token in this iteration
101      *
102      * @throws NoSuchElementException   if the iteration is already over
103      * @throws ParseException   if an invalid header value is encountered
104      */
105     @Override
106     public String nextToken()
107         throws NoSuchElementException, ParseException {
108 
109         if (this.currentToken == null) {
110             throw new NoSuchElementException("Iteration already finished.");
111         }
112 
113         final String result = this.currentToken;
114         // updates currentToken, may trigger ParseException:
115         this.searchPos = findNext(this.searchPos);
116 
117         return result;
118     }
119 
120 
121     /**
122      * Returns the next token.
123      * Same as {@link #nextToken}, but with generic return type.
124      *
125      * @return  the next token in this iteration
126      *
127      * @throws NoSuchElementException   if there are no more tokens
128      * @throws ParseException   if an invalid header value is encountered
129      */
130     @Override
131     public final Object next()
132         throws NoSuchElementException, ParseException {
133         return nextToken();
134     }
135 
136 
137     /**
138      * Removing tokens is not supported.
139      *
140      * @throws UnsupportedOperationException    always
141      */
142     @Override
143     public final void remove()
144         throws UnsupportedOperationException {
145 
146         throw new UnsupportedOperationException
147             ("Removing tokens is not supported.");
148     }
149 
150 
151     /**
152      * Determines the next token.
153      * If found, the token is stored in {@link #currentToken}.
154      * The return value indicates the position after the token
155      * in {@link #currentHeader}. If necessary, the next header
156      * will be obtained from {@link #headerIt}.
157      * If not found, {@link #currentToken} is set to {@code null}.
158      *
159      * @param pos       the position in the current header at which to
160      *                  start the search, -1 to search in the first header
161      *
162      * @return  the position after the found token in the current header, or
163      *          negative if there was no next token
164      *
165      * @throws ParseException   if an invalid header value is encountered
166      */
167     protected int findNext(final int pos) throws ParseException {
168         int from = pos;
169         if (from < 0) {
170             // called from the constructor, initialize the first header
171             if (!this.headerIt.hasNext()) {
172                 return -1;
173             }
174             this.currentHeader = this.headerIt.nextHeader().getValue();
175             from = 0;
176         } else {
177             // called after a token, make sure there is a separator
178             from = findTokenSeparator(from);
179         }
180 
181         final int start = findTokenStart(from);
182         if (start < 0) {
183             this.currentToken = null;
184             return -1; // nothing found
185         }
186 
187         final int end = findTokenEnd(start);
188         this.currentToken = createToken(this.currentHeader, start, end);
189         return end;
190     }
191 
192 
193     /**
194      * Creates a new token to be returned.
195      * Called from {@link #findNext findNext} after the token is identified.
196      * The default implementation simply calls
197      * {@link java.lang.String#substring String.substring}.
198      * <p>
199      * If header values are significantly longer than tokens, and some
200      * tokens are permanently referenced by the application, there can
201      * be problems with garbage collection. A substring will hold a
202      * reference to the full characters of the original string and
203      * therefore occupies more memory than might be expected.
204      * To avoid this, override this method and create a new string
205      * instead of a substring.
206      * </p>
207      *
208      * @param value     the full header value from which to create a token
209      * @param start     the index of the first token character
210      * @param end       the index after the last token character
211      *
212      * @return  a string representing the token identified by the arguments
213      */
214     protected String createToken(final String value, final int start, final int end) {
215         return value.substring(start, end);
216     }
217 
218 
219     /**
220      * Determines the starting position of the next token.
221      * This method will iterate over headers if necessary.
222      *
223      * @param pos       the position in the current header at which to
224      *                  start the search
225      *
226      * @return  the position of the token start in the current header,
227      *          negative if no token start could be found
228      */
229     protected int findTokenStart(final int pos) {
230         int from = Args.notNegative(pos, "Search position");
231         boolean found = false;
232         while (!found && (this.currentHeader != null)) {
233 
234             final int to = this.currentHeader.length();
235             while (!found && (from < to)) {
236 
237                 final char ch = this.currentHeader.charAt(from);
238                 if (isTokenSeparator(ch) || isWhitespace(ch)) {
239                     // whitspace and token separators are skipped
240                     from++;
241                 } else if (isTokenChar(this.currentHeader.charAt(from))) {
242                     // found the start of a token
243                     found = true;
244                 } else {
245                     throw new ParseException
246                         ("Invalid character before token (pos " + from +
247                          "): " + this.currentHeader);
248                 }
249             }
250             if (!found) {
251                 if (this.headerIt.hasNext()) {
252                     this.currentHeader = this.headerIt.nextHeader().getValue();
253                     from = 0;
254                 } else {
255                     this.currentHeader = null;
256                 }
257             }
258         } // while headers
259 
260         return found ? from : -1;
261     }
262 
263 
264     /**
265      * Determines the position of the next token separator.
266      * Because of multi-header joining rules, the end of a
267      * header value is a token separator. This method does
268      * therefore not need to iterate over headers.
269      *
270      * @param pos       the position in the current header at which to
271      *                  start the search
272      *
273      * @return  the position of a token separator in the current header,
274      *          or at the end
275      *
276      * @throws ParseException
277      *         if a new token is found before a token separator.
278      *         RFC 2616, section 2.1 explicitly requires a comma between
279      *         tokens for {@code #}.
280      */
281     protected int findTokenSeparator(final int pos) {
282         int from = Args.notNegative(pos, "Search position");
283         boolean found = false;
284         final int to = this.currentHeader.length();
285         while (!found && (from < to)) {
286             final char ch = this.currentHeader.charAt(from);
287             if (isTokenSeparator(ch)) {
288                 found = true;
289             } else if (isWhitespace(ch)) {
290                 from++;
291             } else if (isTokenChar(ch)) {
292                 throw new ParseException
293                     ("Tokens without separator (pos " + from +
294                      "): " + this.currentHeader);
295             } else {
296                 throw new ParseException
297                     ("Invalid character after token (pos " + from +
298                      "): " + this.currentHeader);
299             }
300         }
301 
302         return from;
303     }
304 
305 
306     /**
307      * Determines the ending position of the current token.
308      * This method will not leave the current header value,
309      * since the end of the header value is a token boundary.
310      *
311      * @param from      the position of the first character of the token
312      *
313      * @return  the position after the last character of the token.
314      *          The behavior is undefined if {@code from} does not
315      *          point to a token character in the current header value.
316      */
317     protected int findTokenEnd(final int from) {
318         Args.notNegative(from, "Search position");
319         final int to = this.currentHeader.length();
320         int end = from+1;
321         while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) {
322             end++;
323         }
324 
325         return end;
326     }
327 
328 
329     /**
330      * Checks whether a character is a token separator.
331      * RFC 2616, section 2.1 defines comma as the separator for
332      * {@code #token} sequences. The end of a header value will
333      * also separate tokens, but that is not a character check.
334      *
335      * @param ch        the character to check
336      *
337      * @return  {@code true} if the character is a token separator,
338      *          {@code false} otherwise
339      */
340     protected boolean isTokenSeparator(final char ch) {
341         return (ch == ',');
342     }
343 
344 
345     /**
346      * Checks whether a character is a whitespace character.
347      * RFC 2616, section 2.2 defines space and horizontal tab as whitespace.
348      * The optional preceeding line break is irrelevant, since header
349      * continuation is handled transparently when parsing messages.
350      *
351      * @param ch        the character to check
352      *
353      * @return  {@code true} if the character is whitespace,
354      *          {@code false} otherwise
355      */
356     protected boolean isWhitespace(final char ch) {
357 
358         // we do not use Character.isWhitspace(ch) here, since that allows
359         // many control characters which are not whitespace as per RFC 2616
360         return ((ch == '\t') || Character.isSpaceChar(ch));
361     }
362 
363 
364     /**
365      * Checks whether a character is a valid token character.
366      * Whitespace, control characters, and HTTP separators are not
367      * valid token characters. The HTTP specification (RFC 2616, section 2.2)
368      * defines tokens only for the US-ASCII character set, this
369      * method extends the definition to other character sets.
370      *
371      * @param ch        the character to check
372      *
373      * @return  {@code true} if the character is a valid token start,
374      *          {@code false} otherwise
375      */
376     protected boolean isTokenChar(final char ch) {
377 
378         // common sense extension of ALPHA + DIGIT
379         if (Character.isLetterOrDigit(ch)) {
380             return true;
381         }
382 
383         // common sense extension of CTL
384         if (Character.isISOControl(ch)) {
385             return false;
386         }
387 
388         // no common sense extension for this
389         if (isHttpSeparator(ch)) {
390             return false;
391         }
392 
393         // RFC 2616, section 2.2 defines a token character as
394         // "any CHAR except CTLs or separators". The controls
395         // and separators are included in the checks above.
396         // This will yield unexpected results for Unicode format characters.
397         // If that is a problem, overwrite isHttpSeparator(char) to filter
398         // out the false positives.
399         return true;
400     }
401 
402 
403     /**
404      * Checks whether a character is an HTTP separator.
405      * The implementation in this class checks only for the HTTP separators
406      * defined in RFC 2616, section 2.2. If you need to detect other
407      * separators beyond the US-ASCII character set, override this method.
408      *
409      * @param ch        the character to check
410      *
411      * @return  {@code true} if the character is an HTTP separator
412      */
413     protected boolean isHttpSeparator(final char ch) {
414         return (HTTP_SEPARATORS.indexOf(ch) >= 0);
415     }
416 
417 
418 } // class BasicTokenIterator
419