View Javadoc

1   /*
2    * ====================================================================
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *   http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing,
14   * software distributed under the License is distributed on an
15   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16   * KIND, either express or implied.  See the License for the
17   * specific language governing permissions and limitations
18   * under the License.
19   * ====================================================================
20   *
21   * This software consists of voluntary contributions made by many
22   * individuals on behalf of the Apache Software Foundation.  For more
23   * information on the Apache Software Foundation, please see
24   * <http://www.apache.org/>.
25   *
26   */
27  
28  package org.apache.http.message;
29  
30  import java.util.NoSuchElementException;
31  
32  import org.apache.http.HeaderIterator;
33  import org.apache.http.ParseException;
34  import org.apache.http.TokenIterator;
35  import org.apache.http.annotation.NotThreadSafe;
36  import org.apache.http.util.Args;
37  
38  /**
39   * Basic implementation of a {@link TokenIterator}.
40   * This implementation parses <tt>#token<tt> sequences as
41   * defined by RFC 2616, section 2.
42   * It extends that definition somewhat beyond US-ASCII.
43   *
44   * @since 4.0
45   */
46  @NotThreadSafe
47  public class BasicTokenIterator implements TokenIterator {
48  
49      /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */
50      // the order of the characters here is adjusted to put the
51      // most likely candidates at the beginning of the collection
52      public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t";
53  
54  
55      /** The iterator from which to obtain the next header. */
56      protected final HeaderIterator headerIt;
57  
58      /**
59       * The value of the current header.
60       * This is the header value that includes {@link #currentToken}.
61       * Undefined if the iteration is over.
62       */
63      protected String currentHeader;
64  
65      /**
66       * The token to be returned by the next call to {@link #nextToken()}.
67       * <code>null</code> if the iteration is over.
68       */
69      protected String currentToken;
70  
71      /**
72       * The position after {@link #currentToken} in {@link #currentHeader}.
73       * Undefined if the iteration is over.
74       */
75      protected int searchPos;
76  
77  
78      /**
79       * Creates a new instance of {@link BasicTokenIterator}.
80       *
81       * @param headerIterator    the iterator for the headers to tokenize
82       */
83      public BasicTokenIterator(final HeaderIterator headerIterator) {
84          super();
85          this.headerIt = Args.notNull(headerIterator, "Header iterator");
86          this.searchPos = findNext(-1);
87      }
88  
89  
90      // non-javadoc, see interface TokenIterator
91      public boolean hasNext() {
92          return (this.currentToken != null);
93      }
94  
95  
96      /**
97       * Obtains the next token from this iteration.
98       *
99       * @return  the next token in this iteration
100      *
101      * @throws NoSuchElementException   if the iteration is already over
102      * @throws ParseException   if an invalid header value is encountered
103      */
104     public String nextToken()
105         throws NoSuchElementException, ParseException {
106 
107         if (this.currentToken == null) {
108             throw new NoSuchElementException("Iteration already finished.");
109         }
110 
111         final String result = this.currentToken;
112         // updates currentToken, may trigger ParseException:
113         this.searchPos = findNext(this.searchPos);
114 
115         return result;
116     }
117 
118 
119     /**
120      * Returns the next token.
121      * Same as {@link #nextToken}, but with generic return type.
122      *
123      * @return  the next token in this iteration
124      *
125      * @throws NoSuchElementException   if there are no more tokens
126      * @throws ParseException   if an invalid header value is encountered
127      */
128     public final Object next()
129         throws NoSuchElementException, ParseException {
130         return nextToken();
131     }
132 
133 
134     /**
135      * Removing tokens is not supported.
136      *
137      * @throws UnsupportedOperationException    always
138      */
139     public final void remove()
140         throws UnsupportedOperationException {
141 
142         throw new UnsupportedOperationException
143             ("Removing tokens is not supported.");
144     }
145 
146 
147     /**
148      * Determines the next token.
149      * If found, the token is stored in {@link #currentToken}.
150      * The return value indicates the position after the token
151      * in {@link #currentHeader}. If necessary, the next header
152      * will be obtained from {@link #headerIt}.
153      * If not found, {@link #currentToken} is set to <code>null</code>.
154      *
155      * @param pos       the position in the current header at which to
156      *                  start the search, -1 to search in the first header
157      *
158      * @return  the position after the found token in the current header, or
159      *          negative if there was no next token
160      *
161      * @throws ParseException   if an invalid header value is encountered
162      */
163     protected int findNext(final int pos) throws ParseException {
164         int from = pos;
165         if (from < 0) {
166             // called from the constructor, initialize the first header
167             if (!this.headerIt.hasNext()) {
168                 return -1;
169             }
170             this.currentHeader = this.headerIt.nextHeader().getValue();
171             from = 0;
172         } else {
173             // called after a token, make sure there is a separator
174             from = findTokenSeparator(from);
175         }
176 
177         final int start = findTokenStart(from);
178         if (start < 0) {
179             this.currentToken = null;
180             return -1; // nothing found
181         }
182 
183         final int end = findTokenEnd(start);
184         this.currentToken = createToken(this.currentHeader, start, end);
185         return end;
186     }
187 
188 
189     /**
190      * Creates a new token to be returned.
191      * Called from {@link #findNext findNext} after the token is identified.
192      * The default implementation simply calls
193      * {@link java.lang.String#substring String.substring}.
194      * <br/>
195      * If header values are significantly longer than tokens, and some
196      * tokens are permanently referenced by the application, there can
197      * be problems with garbage collection. A substring will hold a
198      * reference to the full characters of the original string and
199      * therefore occupies more memory than might be expected.
200      * To avoid this, override this method and create a new string
201      * instead of a substring.
202      *
203      * @param value     the full header value from which to create a token
204      * @param start     the index of the first token character
205      * @param end       the index after the last token character
206      *
207      * @return  a string representing the token identified by the arguments
208      */
209     protected String createToken(final String value, final int start, final int end) {
210         return value.substring(start, end);
211     }
212 
213 
214     /**
215      * Determines the starting position of the next token.
216      * This method will iterate over headers if necessary.
217      *
218      * @param pos       the position in the current header at which to
219      *                  start the search
220      *
221      * @return  the position of the token start in the current header,
222      *          negative if no token start could be found
223      */
224     protected int findTokenStart(final int pos) {
225         int from = Args.notNegative(pos, "Search position");
226         boolean found = false;
227         while (!found && (this.currentHeader != null)) {
228 
229             final int to = this.currentHeader.length();
230             while (!found && (from < to)) {
231 
232                 final char ch = this.currentHeader.charAt(from);
233                 if (isTokenSeparator(ch) || isWhitespace(ch)) {
234                     // whitspace and token separators are skipped
235                     from++;
236                 } else if (isTokenChar(this.currentHeader.charAt(from))) {
237                     // found the start of a token
238                     found = true;
239                 } else {
240                     throw new ParseException
241                         ("Invalid character before token (pos " + from +
242                          "): " + this.currentHeader);
243                 }
244             }
245             if (!found) {
246                 if (this.headerIt.hasNext()) {
247                     this.currentHeader = this.headerIt.nextHeader().getValue();
248                     from = 0;
249                 } else {
250                     this.currentHeader = null;
251                 }
252             }
253         } // while headers
254 
255         return found ? from : -1;
256     }
257 
258 
259     /**
260      * Determines the position of the next token separator.
261      * Because of multi-header joining rules, the end of a
262      * header value is a token separator. This method does
263      * therefore not need to iterate over headers.
264      *
265      * @param pos       the position in the current header at which to
266      *                  start the search
267      *
268      * @return  the position of a token separator in the current header,
269      *          or at the end
270      *
271      * @throws ParseException
272      *         if a new token is found before a token separator.
273      *         RFC 2616, section 2.1 explicitly requires a comma between
274      *         tokens for <tt>#</tt>.
275      */
276     protected int findTokenSeparator(final int pos) {
277         int from = Args.notNegative(pos, "Search position");
278         boolean found = false;
279         final int to = this.currentHeader.length();
280         while (!found && (from < to)) {
281             final char ch = this.currentHeader.charAt(from);
282             if (isTokenSeparator(ch)) {
283                 found = true;
284             } else if (isWhitespace(ch)) {
285                 from++;
286             } else if (isTokenChar(ch)) {
287                 throw new ParseException
288                     ("Tokens without separator (pos " + from +
289                      "): " + this.currentHeader);
290             } else {
291                 throw new ParseException
292                     ("Invalid character after token (pos " + from +
293                      "): " + this.currentHeader);
294             }
295         }
296 
297         return from;
298     }
299 
300 
301     /**
302      * Determines the ending position of the current token.
303      * This method will not leave the current header value,
304      * since the end of the header value is a token boundary.
305      *
306      * @param from      the position of the first character of the token
307      *
308      * @return  the position after the last character of the token.
309      *          The behavior is undefined if <code>from</code> does not
310      *          point to a token character in the current header value.
311      */
312     protected int findTokenEnd(final int from) {
313         Args.notNegative(from, "Search position");
314         final int to = this.currentHeader.length();
315         int end = from+1;
316         while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) {
317             end++;
318         }
319 
320         return end;
321     }
322 
323 
324     /**
325      * Checks whether a character is a token separator.
326      * RFC 2616, section 2.1 defines comma as the separator for
327      * <tt>#token</tt> sequences. The end of a header value will
328      * also separate tokens, but that is not a character check.
329      *
330      * @param ch        the character to check
331      *
332      * @return  <code>true</code> if the character is a token separator,
333      *          <code>false</code> otherwise
334      */
335     protected boolean isTokenSeparator(final char ch) {
336         return (ch == ',');
337     }
338 
339 
340     /**
341      * Checks whether a character is a whitespace character.
342      * RFC 2616, section 2.2 defines space and horizontal tab as whitespace.
343      * The optional preceeding line break is irrelevant, since header
344      * continuation is handled transparently when parsing messages.
345      *
346      * @param ch        the character to check
347      *
348      * @return  <code>true</code> if the character is whitespace,
349      *          <code>false</code> otherwise
350      */
351     protected boolean isWhitespace(final char ch) {
352 
353         // we do not use Character.isWhitspace(ch) here, since that allows
354         // many control characters which are not whitespace as per RFC 2616
355         return ((ch == '\t') || Character.isSpaceChar(ch));
356     }
357 
358 
359     /**
360      * Checks whether a character is a valid token character.
361      * Whitespace, control characters, and HTTP separators are not
362      * valid token characters. The HTTP specification (RFC 2616, section 2.2)
363      * defines tokens only for the US-ASCII character set, this
364      * method extends the definition to other character sets.
365      *
366      * @param ch        the character to check
367      *
368      * @return  <code>true</code> if the character is a valid token start,
369      *          <code>false</code> otherwise
370      */
371     protected boolean isTokenChar(final char ch) {
372 
373         // common sense extension of ALPHA + DIGIT
374         if (Character.isLetterOrDigit(ch)) {
375             return true;
376         }
377 
378         // common sense extension of CTL
379         if (Character.isISOControl(ch)) {
380             return false;
381         }
382 
383         // no common sense extension for this
384         if (isHttpSeparator(ch)) {
385             return false;
386         }
387 
388         // RFC 2616, section 2.2 defines a token character as
389         // "any CHAR except CTLs or separators". The controls
390         // and separators are included in the checks above.
391         // This will yield unexpected results for Unicode format characters.
392         // If that is a problem, overwrite isHttpSeparator(char) to filter
393         // out the false positives.
394         return true;
395     }
396 
397 
398     /**
399      * Checks whether a character is an HTTP separator.
400      * The implementation in this class checks only for the HTTP separators
401      * defined in RFC 2616, section 2.2. If you need to detect other
402      * separators beyond the US-ASCII character set, override this method.
403      *
404      * @param ch        the character to check
405      *
406      * @return  <code>true</code> if the character is an HTTP separator
407      */
408     protected boolean isHttpSeparator(final char ch) {
409         return (HTTP_SEPARATORS.indexOf(ch) >= 0);
410     }
411 
412 
413 } // class BasicTokenIterator
414