/*
 * ====================================================================
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 *
 */

package org.apache.http.message;

import java.util.NoSuchElementException;

import org.apache.http.HeaderIterator;
import org.apache.http.ParseException;
import org.apache.http.TokenIterator;
import org.apache.http.util.Args;

/**
 * Basic implementation of a {@link TokenIterator}.
 * This implementation parses {@code #token} sequences as
 * defined by RFC 2616, section 2.
 * It extends that definition somewhat beyond US-ASCII.
 * <p>
 * The iterator always looks one token ahead: the token that the next call
 * to {@link #nextToken()} will return is already stored in
 * {@link #currentToken}, which is why {@link #hasNext()} never has to parse.
 * </p>
 *
 * @since 4.0
 */
public class BasicTokenIterator implements TokenIterator {

    /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */
    // the order of the characters here is adjusted to put the
    // most likely candidates at the beginning of the collection
    public static final String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t";


    /** The iterator from which to obtain the next header. */
    protected final HeaderIterator headerIt;

    /**
     * The value of the current header.
     * This is the header value that includes {@link #currentToken}.
     * Undefined if the iteration is over.
     */
    protected String currentHeader;

    /**
     * The token to be returned by the next call to {@link #nextToken()}.
     * {@code null} if the iteration is over.
     */
    protected String currentToken;

    /**
     * The position after {@link #currentToken} in {@link #currentHeader}.
     * Undefined if the iteration is over.
     */
    protected int searchPos;


    /**
     * Creates a new instance of {@link BasicTokenIterator}.
     * The first token, if any, is located immediately.
     *
     * @param headerIterator    the iterator for the headers to tokenize
     *
     * @throws ParseException   if an invalid header value is encountered
     *                          while looking for the first token
     */
    public BasicTokenIterator(final HeaderIterator headerIterator) {
        super();
        this.headerIt = Args.notNull(headerIterator, "Header iterator");
        // -1 tells findNext to fetch the first header before searching
        this.searchPos = findNext(-1);
    }


    // non-javadoc, see interface TokenIterator
    @Override
    public boolean hasNext() {
        return (this.currentToken != null);
    }


    /**
     * Obtains the next token from this iteration.
     *
     * @return  the next token in this iteration
     *
     * @throws NoSuchElementException   if the iteration is already over
     * @throws ParseException   if an invalid header value is encountered
     */
    @Override
    public String nextToken()
        throws NoSuchElementException, ParseException {

        if (this.currentToken == null) {
            throw new NoSuchElementException("Iteration already finished.");
        }

        final String result = this.currentToken;
        // look ahead: updates currentToken, may trigger ParseException
        this.searchPos = findNext(this.searchPos);

        return result;
    }


    /**
     * Returns the next token.
     * Same as {@link #nextToken}, but with generic return type.
     *
     * @return  the next token in this iteration
     *
     * @throws NoSuchElementException   if there are no more tokens
     * @throws ParseException   if an invalid header value is encountered
     */
    @Override
    public final Object next()
        throws NoSuchElementException, ParseException {
        return nextToken();
    }


    /**
     * Removing tokens is not supported.
     *
     * @throws UnsupportedOperationException    always
     */
    @Override
    public final void remove()
        throws UnsupportedOperationException {

        throw new UnsupportedOperationException
            ("Removing tokens is not supported.");
    }


    /**
     * Determines the next token.
     * If found, the token is stored in {@link #currentToken}.
     * The return value indicates the position after the token
     * in {@link #currentHeader}. If necessary, the next header
     * will be obtained from {@link #headerIt}.
     * If not found, {@link #currentToken} is set to {@code null}.
     *
     * @param pos       the position in the current header at which to
     *                  start the search, -1 to search in the first header
     *
     * @return  the position after the found token in the current header, or
     *          negative if there was no next token
     *
     * @throws ParseException   if an invalid header value is encountered
     */
    protected int findNext(final int pos) throws ParseException {
        int from = pos;
        if (from < 0) {
            // called from the constructor, initialize the first header
            if (!this.headerIt.hasNext()) {
                // honor the documented contract: no token means null
                this.currentToken = null;
                return -1;
            }
            this.currentHeader = this.headerIt.nextHeader().getValue();
            from = 0;
        } else {
            // called after a token, make sure there is a separator
            from = findTokenSeparator(from);
        }

        final int start = findTokenStart(from);
        if (start < 0) {
            this.currentToken = null;
            return -1; // nothing found
        }

        final int end = findTokenEnd(start);
        this.currentToken = createToken(this.currentHeader, start, end);
        return end;
    }


    /**
     * Creates a new token to be returned.
     * Called from {@link #findNext findNext} after the token is identified.
     * The default implementation simply calls
     * {@link java.lang.String#substring String.substring}.
     * <p>
     * If header values are significantly longer than tokens, and some
     * tokens are permanently referenced by the application, there can
     * be problems with garbage collection. A substring will hold a
     * reference to the full characters of the original string and
     * therefore occupies more memory than might be expected.
     * To avoid this, override this method and create a new string
     * instead of a substring.
     * </p>
     *
     * @param value     the full header value from which to create a token
     * @param start     the index of the first token character
     * @param end       the index after the last token character
     *
     * @return  a string representing the token identified by the arguments
     */
    protected String createToken(final String value, final int start, final int end) {
        return value.substring(start, end);
    }


    /**
     * Determines the starting position of the next token.
     * This method will iterate over headers if necessary.
     * Headers without a value ({@code null}) contain no tokens and are
     * skipped; the search ends only when {@link #headerIt} is exhausted,
     * in which case {@link #currentHeader} is set to {@code null}.
     *
     * @param pos       the position in the current header at which to
     *                  start the search
     *
     * @return  the position of the token start in the current header,
     *          negative if no token start could be found
     *
     * @throws ParseException   if a character that is neither whitespace,
     *                          a token separator, nor a token character
     *                          is found before the next token
     */
    protected int findTokenStart(final int pos) {
        int from = Args.notNegative(pos, "Search position");
        while (true) {
            final String header = this.currentHeader;
            // a null value is a header without tokens, not the end of
            // the iteration: fall through and fetch the next header
            if (header != null) {
                final int to = header.length();
                while (from < to) {
                    final char ch = header.charAt(from);
                    if (isTokenSeparator(ch) || isWhitespace(ch)) {
                        // whitespace and token separators are skipped
                        from++;
                    } else if (isTokenChar(ch)) {
                        // found the start of a token
                        return from;
                    } else {
                        throw new ParseException
                            ("Invalid character before token (pos " + from +
                             "): " + this.currentHeader);
                    }
                }
            }

            // current header exhausted (or without value), try the next one
            if (!this.headerIt.hasNext()) {
                this.currentHeader = null;
                return -1;
            }
            this.currentHeader = this.headerIt.nextHeader().getValue();
            from = 0;
        }
    }


    /**
     * Determines the position of the next token separator.
     * Because of multi-header joining rules, the end of a
     * header value is a token separator. This method does
     * therefore not need to iterate over headers.
     *
     * @param pos       the position in the current header at which to
     *                  start the search
     *
     * @return  the position of a token separator in the current header,
     *          or at the end
     *
     * @throws ParseException
     *         if a new token is found before a token separator.
     *         RFC 2616, section 2.1 explicitly requires a comma between
     *         tokens for {@code #}.
     */
    protected int findTokenSeparator(final int pos) {
        int from = Args.notNegative(pos, "Search position");
        final int to = this.currentHeader.length();
        while (from < to) {
            final char ch = this.currentHeader.charAt(from);
            if (isTokenSeparator(ch)) {
                // stop on the separator itself, it is skipped later
                break;
            } else if (isWhitespace(ch)) {
                from++;
            } else if (isTokenChar(ch)) {
                throw new ParseException
                    ("Tokens without separator (pos " + from +
                     "): " + this.currentHeader);
            } else {
                throw new ParseException
                    ("Invalid character after token (pos " + from +
                     "): " + this.currentHeader);
            }
        }

        return from;
    }


    /**
     * Determines the ending position of the current token.
     * This method will not leave the current header value,
     * since the end of the header value is a token boundary.
     *
     * @param from      the position of the first character of the token
     *
     * @return  the position after the last character of the token.
     *          The behavior is undefined if {@code from} does not
     *          point to a token character in the current header value.
     */
    protected int findTokenEnd(final int from) {
        Args.notNegative(from, "Search position");
        final int to = this.currentHeader.length();
        // the character at 'from' is assumed to be a token character
        int end = from + 1;
        while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) {
            end++;
        }

        return end;
    }


    /**
     * Checks whether a character is a token separator.
     * RFC 2616, section 2.1 defines comma as the separator for
     * {@code #token} sequences. The end of a header value will
     * also separate tokens, but that is not a character check.
     *
     * @param ch        the character to check
     *
     * @return  {@code true} if the character is a token separator,
     *          {@code false} otherwise
     */
    protected boolean isTokenSeparator(final char ch) {
        return (ch == ',');
    }


    /**
     * Checks whether a character is a whitespace character.
     * RFC 2616, section 2.2 defines space and horizontal tab as whitespace.
     * The optional preceding line break is irrelevant, since header
     * continuation is handled transparently when parsing messages.
     *
     * @param ch        the character to check
     *
     * @return  {@code true} if the character is whitespace,
     *          {@code false} otherwise
     */
    protected boolean isWhitespace(final char ch) {

        // we do not use Character.isWhitespace(ch) here, since that allows
        // many control characters which are not whitespace as per RFC 2616
        return ((ch == '\t') || Character.isSpaceChar(ch));
    }


    /**
     * Checks whether a character is a valid token character.
     * Whitespace, control characters, and HTTP separators are not
     * valid token characters. The HTTP specification (RFC 2616, section 2.2)
     * defines tokens only for the US-ASCII character set, this
     * method extends the definition to other character sets.
     *
     * @param ch        the character to check
     *
     * @return  {@code true} if the character is a valid token start,
     *          {@code false} otherwise
     */
    protected boolean isTokenChar(final char ch) {

        // common sense extension of ALPHA + DIGIT
        if (Character.isLetterOrDigit(ch)) {
            return true;
        }

        // common sense extension of CTL
        if (Character.isISOControl(ch)) {
            return false;
        }

        // no common sense extension for this
        if (isHttpSeparator(ch)) {
            return false;
        }

        // RFC 2616, section 2.2 defines a token character as
        // "any CHAR except CTLs or separators". The controls
        // and separators are included in the checks above.
        // This will yield unexpected results for Unicode format characters.
        // If that is a problem, overwrite isHttpSeparator(char) to filter
        // out the false positives.
        return true;
    }


    /**
     * Checks whether a character is an HTTP separator.
     * The implementation in this class checks only for the HTTP separators
     * defined in RFC 2616, section 2.2. If you need to detect other
     * separators beyond the US-ASCII character set, override this method.
     *
     * @param ch        the character to check
     *
     * @return  {@code true} if the character is an HTTP separator
     */
    protected boolean isHttpSeparator(final char ch) {
        return (HTTP_SEPARATORS.indexOf(ch) >= 0);
    }


} // class BasicTokenIterator