1 /*
2 * ====================================================================
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 * ====================================================================
20 *
21 * This software consists of voluntary contributions made by many
22 * individuals on behalf of the Apache Software Foundation. For more
23 * information on the Apache Software Foundation, please see
24 * <http://www.apache.org/>.
25 *
26 */
27
28 package org.apache.http.message;
29
30 import java.util.NoSuchElementException;
31
32 import org.apache.http.HeaderIterator;
33 import org.apache.http.ParseException;
34 import org.apache.http.TokenIterator;
35 import org.apache.http.annotation.NotThreadSafe;
36 import org.apache.http.util.Args;
37
38 /**
39 * Basic implementation of a {@link TokenIterator}.
40 * This implementation parses <tt>#token<tt> sequences as
41 * defined by RFC 2616, section 2.
42 * It extends that definition somewhat beyond US-ASCII.
43 *
44 * @since 4.0
45 */
46 @NotThreadSafe
47 public class BasicTokenIterator implements TokenIterator {
48
49 /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */
50 // the order of the characters here is adjusted to put the
51 // most likely candidates at the beginning of the collection
52 public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t";
53
54
55 /** The iterator from which to obtain the next header. */
56 protected final HeaderIterator headerIt;
57
58 /**
59 * The value of the current header.
60 * This is the header value that includes {@link #currentToken}.
61 * Undefined if the iteration is over.
62 */
63 protected String currentHeader;
64
65 /**
66 * The token to be returned by the next call to {@link #currentToken}.
67 * <code>null</code> if the iteration is over.
68 */
69 protected String currentToken;
70
71 /**
72 * The position after {@link #currentToken} in {@link #currentHeader}.
73 * Undefined if the iteration is over.
74 */
75 protected int searchPos;
76
77
78 /**
79 * Creates a new instance of {@link BasicTokenIterator}.
80 *
81 * @param headerIterator the iterator for the headers to tokenize
82 */
83 public BasicTokenIterator(final HeaderIterator headerIterator) {
84 super();
85 this.headerIt = Args.notNull(headerIterator, "Header iterator");
86 this.searchPos = findNext(-1);
87 }
88
89
90 // non-javadoc, see interface TokenIterator
91 public boolean hasNext() {
92 return (this.currentToken != null);
93 }
94
95
96 /**
97 * Obtains the next token from this iteration.
98 *
99 * @return the next token in this iteration
100 *
101 * @throws NoSuchElementException if the iteration is already over
102 * @throws ParseException if an invalid header value is encountered
103 */
104 public String nextToken()
105 throws NoSuchElementException, ParseException {
106
107 if (this.currentToken == null) {
108 throw new NoSuchElementException("Iteration already finished.");
109 }
110
111 final String result = this.currentToken;
112 // updates currentToken, may trigger ParseException:
113 this.searchPos = findNext(this.searchPos);
114
115 return result;
116 }
117
118
119 /**
120 * Returns the next token.
121 * Same as {@link #nextToken}, but with generic return type.
122 *
123 * @return the next token in this iteration
124 *
125 * @throws NoSuchElementException if there are no more tokens
126 * @throws ParseException if an invalid header value is encountered
127 */
128 public final Object next()
129 throws NoSuchElementException, ParseException {
130 return nextToken();
131 }
132
133
134 /**
135 * Removing tokens is not supported.
136 *
137 * @throws UnsupportedOperationException always
138 */
139 public final void remove()
140 throws UnsupportedOperationException {
141
142 throw new UnsupportedOperationException
143 ("Removing tokens is not supported.");
144 }
145
146
147 /**
148 * Determines the next token.
149 * If found, the token is stored in {@link #currentToken}.
150 * The return value indicates the position after the token
151 * in {@link #currentHeader}. If necessary, the next header
152 * will be obtained from {@link #headerIt}.
153 * If not found, {@link #currentToken} is set to <code>null</code>.
154 *
155 * @param from the position in the current header at which to
156 * start the search, -1 to search in the first header
157 *
158 * @return the position after the found token in the current header, or
159 * negative if there was no next token
160 *
161 * @throws ParseException if an invalid header value is encountered
162 */
163 protected int findNext(int from)
164 throws ParseException {
165
166 if (from < 0) {
167 // called from the constructor, initialize the first header
168 if (!this.headerIt.hasNext()) {
169 return -1;
170 }
171 this.currentHeader = this.headerIt.nextHeader().getValue();
172 from = 0;
173 } else {
174 // called after a token, make sure there is a separator
175 from = findTokenSeparator(from);
176 }
177
178 final int start = findTokenStart(from);
179 if (start < 0) {
180 this.currentToken = null;
181 return -1; // nothing found
182 }
183
184 final int end = findTokenEnd(start);
185 this.currentToken = createToken(this.currentHeader, start, end);
186 return end;
187 }
188
189
190 /**
191 * Creates a new token to be returned.
192 * Called from {@link #findNext findNext} after the token is identified.
193 * The default implementation simply calls
194 * {@link java.lang.String#substring String.substring}.
195 * <br/>
196 * If header values are significantly longer than tokens, and some
197 * tokens are permanently referenced by the application, there can
198 * be problems with garbage collection. A substring will hold a
199 * reference to the full characters of the original string and
200 * therefore occupies more memory than might be expected.
201 * To avoid this, override this method and create a new string
202 * instead of a substring.
203 *
204 * @param value the full header value from which to create a token
205 * @param start the index of the first token character
206 * @param end the index after the last token character
207 *
208 * @return a string representing the token identified by the arguments
209 */
210 protected String createToken(final String value, final int start, final int end) {
211 return value.substring(start, end);
212 }
213
214
215 /**
216 * Determines the starting position of the next token.
217 * This method will iterate over headers if necessary.
218 *
219 * @param from the position in the current header at which to
220 * start the search
221 *
222 * @return the position of the token start in the current header,
223 * negative if no token start could be found
224 */
225 protected int findTokenStart(int from) {
226 Args.notNegative(from, "Search position");
227 boolean found = false;
228 while (!found && (this.currentHeader != null)) {
229
230 final int to = this.currentHeader.length();
231 while (!found && (from < to)) {
232
233 final char ch = this.currentHeader.charAt(from);
234 if (isTokenSeparator(ch) || isWhitespace(ch)) {
235 // whitspace and token separators are skipped
236 from++;
237 } else if (isTokenChar(this.currentHeader.charAt(from))) {
238 // found the start of a token
239 found = true;
240 } else {
241 throw new ParseException
242 ("Invalid character before token (pos " + from +
243 "): " + this.currentHeader);
244 }
245 }
246 if (!found) {
247 if (this.headerIt.hasNext()) {
248 this.currentHeader = this.headerIt.nextHeader().getValue();
249 from = 0;
250 } else {
251 this.currentHeader = null;
252 }
253 }
254 } // while headers
255
256 return found ? from : -1;
257 }
258
259
260 /**
261 * Determines the position of the next token separator.
262 * Because of multi-header joining rules, the end of a
263 * header value is a token separator. This method does
264 * therefore not need to iterate over headers.
265 *
266 * @param from the position in the current header at which to
267 * start the search
268 *
269 * @return the position of a token separator in the current header,
270 * or at the end
271 *
272 * @throws ParseException
273 * if a new token is found before a token separator.
274 * RFC 2616, section 2.1 explicitly requires a comma between
275 * tokens for <tt>#</tt>.
276 */
277 protected int findTokenSeparator(int from) {
278 Args.notNegative(from, "Search position");
279 boolean found = false;
280 final int to = this.currentHeader.length();
281 while (!found && (from < to)) {
282 final char ch = this.currentHeader.charAt(from);
283 if (isTokenSeparator(ch)) {
284 found = true;
285 } else if (isWhitespace(ch)) {
286 from++;
287 } else if (isTokenChar(ch)) {
288 throw new ParseException
289 ("Tokens without separator (pos " + from +
290 "): " + this.currentHeader);
291 } else {
292 throw new ParseException
293 ("Invalid character after token (pos " + from +
294 "): " + this.currentHeader);
295 }
296 }
297
298 return from;
299 }
300
301
302 /**
303 * Determines the ending position of the current token.
304 * This method will not leave the current header value,
305 * since the end of the header value is a token boundary.
306 *
307 * @param from the position of the first character of the token
308 *
309 * @return the position after the last character of the token.
310 * The behavior is undefined if <code>from</code> does not
311 * point to a token character in the current header value.
312 */
313 protected int findTokenEnd(final int from) {
314 Args.notNegative(from, "Search position");
315 final int to = this.currentHeader.length();
316 int end = from+1;
317 while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) {
318 end++;
319 }
320
321 return end;
322 }
323
324
325 /**
326 * Checks whether a character is a token separator.
327 * RFC 2616, section 2.1 defines comma as the separator for
328 * <tt>#token</tt> sequences. The end of a header value will
329 * also separate tokens, but that is not a character check.
330 *
331 * @param ch the character to check
332 *
333 * @return <code>true</code> if the character is a token separator,
334 * <code>false</code> otherwise
335 */
336 protected boolean isTokenSeparator(final char ch) {
337 return (ch == ',');
338 }
339
340
341 /**
342 * Checks whether a character is a whitespace character.
343 * RFC 2616, section 2.2 defines space and horizontal tab as whitespace.
344 * The optional preceeding line break is irrelevant, since header
345 * continuation is handled transparently when parsing messages.
346 *
347 * @param ch the character to check
348 *
349 * @return <code>true</code> if the character is whitespace,
350 * <code>false</code> otherwise
351 */
352 protected boolean isWhitespace(final char ch) {
353
354 // we do not use Character.isWhitspace(ch) here, since that allows
355 // many control characters which are not whitespace as per RFC 2616
356 return ((ch == '\t') || Character.isSpaceChar(ch));
357 }
358
359
360 /**
361 * Checks whether a character is a valid token character.
362 * Whitespace, control characters, and HTTP separators are not
363 * valid token characters. The HTTP specification (RFC 2616, section 2.2)
364 * defines tokens only for the US-ASCII character set, this
365 * method extends the definition to other character sets.
366 *
367 * @param ch the character to check
368 *
369 * @return <code>true</code> if the character is a valid token start,
370 * <code>false</code> otherwise
371 */
372 protected boolean isTokenChar(final char ch) {
373
374 // common sense extension of ALPHA + DIGIT
375 if (Character.isLetterOrDigit(ch)) {
376 return true;
377 }
378
379 // common sense extension of CTL
380 if (Character.isISOControl(ch)) {
381 return false;
382 }
383
384 // no common sense extension for this
385 if (isHttpSeparator(ch)) {
386 return false;
387 }
388
389 // RFC 2616, section 2.2 defines a token character as
390 // "any CHAR except CTLs or separators". The controls
391 // and separators are included in the checks above.
392 // This will yield unexpected results for Unicode format characters.
393 // If that is a problem, overwrite isHttpSeparator(char) to filter
394 // out the false positives.
395 return true;
396 }
397
398
399 /**
400 * Checks whether a character is an HTTP separator.
401 * The implementation in this class checks only for the HTTP separators
402 * defined in RFC 2616, section 2.2. If you need to detect other
403 * separators beyond the US-ASCII character set, override this method.
404 *
405 * @param ch the character to check
406 *
407 * @return <code>true</code> if the character is an HTTP separator
408 */
409 protected boolean isHttpSeparator(final char ch) {
410 return (HTTP_SEPARATORS.indexOf(ch) >= 0);
411 }
412
413
414 } // class BasicTokenIterator
415