View Javadoc
1   /*
2    * ====================================================================
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *   http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing,
14   * software distributed under the License is distributed on an
15   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16   * KIND, either express or implied.  See the License for the
17   * specific language governing permissions and limitations
18   * under the License.
19   * ====================================================================
20   *
21   * This software consists of voluntary contributions made by many
22   * individuals on behalf of the Apache Software Foundation.  For more
23   * information on the Apache Software Foundation, please see
24   * <http://www.apache.org/>.
25   *
26   */
27  
28  package org.apache.hc.core5.util;
29  
30  import java.util.BitSet;
31  
32  import org.apache.hc.core5.annotation.Contract;
33  import org.apache.hc.core5.annotation.ThreadingBehavior;
34  
35  /**
36   * Tokenizer that can be used as a foundation for more complex parsing routines.
37   * Methods of this class are designed to produce near zero intermediate garbage
38   * and make no intermediate copies of input data.
39   * <p>
40   * This class is immutable and thread safe.
41   *
42   * @since 5.1
43   */
44  @Contract(threading = ThreadingBehavior.IMMUTABLE)
45  public class Tokenizer {
46  
47      public static class Cursor {
48  
49          private final int lowerBound;
50          private final int upperBound;
51          private int pos;
52  
53          public Cursor(final int lowerBound, final int upperBound) {
54              super();
55              Args.notNegative(lowerBound, "lowerBound");
56              Args.check(lowerBound <= upperBound, "lowerBound cannot be greater than upperBound");
57              this.lowerBound = lowerBound;
58              this.upperBound = upperBound;
59              this.pos = lowerBound;
60          }
61  
62          public int getLowerBound() {
63              return this.lowerBound;
64          }
65  
66          public int getUpperBound() {
67              return this.upperBound;
68          }
69  
70          public int getPos() {
71              return this.pos;
72          }
73  
74          public void updatePos(final int pos) {
75              Args.check(pos >= this.lowerBound, "pos: %s < lowerBound: %s", pos, this.lowerBound);
76              Args.check(pos <= this.upperBound, "pos: %s > upperBound: %s", pos, this.upperBound);
77              this.pos = pos;
78          }
79  
80          public boolean atEnd() {
81              return this.pos >= this.upperBound;
82          }
83  
84          @Override
85          public String toString() {
86              final StringBuilder buffer = new StringBuilder();
87              buffer.append('[');
88              buffer.append(this.lowerBound);
89              buffer.append('>');
90              buffer.append(this.pos);
91              buffer.append('>');
92              buffer.append(this.upperBound);
93              buffer.append(']');
94              return buffer.toString();
95          }
96  
97      }
98  
99      /**
100      * @deprecated Do not use.
101      */
102     @Deprecated
103     public static BitSet INIT_BITSET(final int ... b) {
104         final BitSet bitset = new BitSet();
105         for (final int aB : b) {
106             bitset.set(aB);
107         }
108         return bitset;
109     }
110 
111     /** Double quote */
112     public static final char DQUOTE = '\"';
113 
114     /** Backward slash / escape character */
115     public static final char ESCAPE = '\\';
116 
117     public static final int CR = 13; // <US-ASCII CR, carriage return (13)>
118     public static final int LF = 10; // <US-ASCII LF, linefeed (10)>
119     public static final int SP = 32; // <US-ASCII SP, space (32)>
120     public static final int HT = 9;  // <US-ASCII HT, horizontal-tab (9)>
121 
122     public static boolean isWhitespace(final char ch) {
123         return ch == SP || ch == HT || ch == CR || ch == LF;
124     }
125 
126     /**
127      * Represents a predicate whether the given character is a delimiter.
128      *
129      * @since 5.3
130      */
131     @FunctionalInterface
132     public interface Delimiter {
133 
134         boolean test(char ch);
135 
136     }
137 
138     /**
139      * @since 5.3
140      */
141     public static Delimiter delimiters(final BitSet delimiters) {
142         return delimiters::get;
143     }
144 
145     /**
146      * @since 5.3
147      */
148     public static Delimiter delimiters(final char... delimiters) {
149         return ch -> {
150             for (final char delimiter : delimiters) {
151                 if (delimiter == ch) {
152                     return true;
153                 }
154             }
155             return false;
156         };
157     }
158 
159     /**
160      * @since 5.3
161      */
162     public static Delimiter delimiters(final char delimiter) {
163         return ch -> ch == delimiter;
164     }
165 
166     /**
167      * @since 5.3
168      */
169     public static Delimiter delimiters(final char delimiter1, final char delimiter2) {
170         return ch -> ch == delimiter1 || ch == delimiter2;
171     }
172 
173     /**
174      * @since 5.3
175      */
176     public static Delimiter delimiters(final char delimiter1, final char delimiter2, final char delimiter3) {
177         return ch -> ch == delimiter1 || ch == delimiter2 || ch == delimiter3;
178     }
179 
180     public static final Tokenizer INSTANCE = new Tokenizer();
181 
182     /**
183      * Extracts from the sequence of chars a token terminated with any of the given delimiters
184      * or a whitespace characters.
185      *
186      * @param buf buffer with the sequence of chars to be parsed
187      * @param cursor defines the bounds and current position of the buffer
188      * @param delimiterPredicate delimiter predicate. Can be {@code null} if the token
189      *  is not delimited by any character.
190      */
191     public String parseContent(final CharSequence buf, final Cursor cursor, final Delimiter delimiterPredicate) {
192         Args.notNull(buf, "Char sequence");
193         Args.notNull(cursor, "Parser cursor");
194         final StringBuilder dst = new StringBuilder();
195         copyContent(buf, cursor, delimiterPredicate, dst);
196         return dst.toString();
197     }
198 
199     /**
200      * @deprecated use {@link #parseContent(CharSequence, Cursor, Delimiter)}
201      */
202     @Deprecated
203     public String parseContent(final CharSequence buf, final Cursor cursor, final BitSet bitSet) {
204         return parseContent(buf, cursor, bitSet != null ? bitSet::get : null);
205     }
206 
207     /**
208      * Extracts from the sequence of chars a token terminated with any of the given delimiters
209      * discarding semantically insignificant whitespace characters.
210      *
211      * @param buf buffer with the sequence of chars to be parsed
212      * @param cursor defines the bounds and current position of the buffer
213      * @param delimiterPredicate delimiter predicate. Can be {@code null} if the token
214      *  is not delimited by any character.
215      */
216     public String parseToken(final CharSequence buf, final Cursor cursor, final Delimiter delimiterPredicate) {
217         Args.notNull(buf, "Char sequence");
218         Args.notNull(cursor, "Parser cursor");
219         final StringBuilder dst = new StringBuilder();
220         boolean whitespace = false;
221         while (!cursor.atEnd()) {
222             final char current = buf.charAt(cursor.getPos());
223             if (delimiterPredicate != null && delimiterPredicate.test(current)) {
224                 break;
225             } else if (isWhitespace(current)) {
226                 skipWhiteSpace(buf, cursor);
227                 whitespace = true;
228             } else {
229                 if (whitespace && dst.length() > 0) {
230                     dst.append(' ');
231                 }
232                 copyContent(buf, cursor, delimiterPredicate, dst);
233                 whitespace = false;
234             }
235         }
236         return dst.toString();
237     }
238 
239     /**
240      * @deprecated use {@link #parseToken(CharSequence, Cursor, Delimiter)}
241      */
242     @Deprecated
243     public String parseToken(final CharSequence buf, final Cursor cursor, final BitSet bitSet) {
244         return parseToken(buf, cursor, bitSet != null ? bitSet::get : null);
245     }
246 
247     /**
248      * Extracts from the sequence of chars a value which can be enclosed in quote marks and
249      * terminated with any of the given delimiters discarding semantically insignificant
250      * whitespace characters.
251      *
252      * @param buf buffer with the sequence of chars to be parsed
253      * @param cursor defines the bounds and current position of the buffer
254      * @param delimiterPredicate delimiter predicate. Can be {@code null} if the token
255      *  is not delimited by any character.
256      */
257     public String parseValue(final CharSequence buf, final Cursor cursor, final Delimiter delimiterPredicate) {
258         Args.notNull(buf, "Char sequence");
259         Args.notNull(cursor, "Parser cursor");
260         final StringBuilder dst = new StringBuilder();
261         boolean whitespace = false;
262         while (!cursor.atEnd()) {
263             final char current = buf.charAt(cursor.getPos());
264             if (delimiterPredicate != null && delimiterPredicate.test(current)) {
265                 break;
266             } else if (isWhitespace(current)) {
267                 skipWhiteSpace(buf, cursor);
268                 whitespace = true;
269             } else if (current == DQUOTE) {
270                 if (whitespace && dst.length() > 0) {
271                     dst.append(' ');
272                 }
273                 copyQuotedContent(buf, cursor, dst);
274                 whitespace = false;
275             } else {
276                 if (whitespace && dst.length() > 0) {
277                     dst.append(' ');
278                 }
279                 copyUnquotedContent(buf, cursor, delimiterPredicate, dst);
280                 whitespace = false;
281             }
282         }
283         return dst.toString();
284     }
285 
286     /**
287      * @deprecated use {@link #parseValue(CharSequence, Cursor, Delimiter)}
288      */
289     @Deprecated
290     public String parseValue(final CharSequence buf, final Cursor cursor, final BitSet bitSet) {
291         return parseValue(buf, cursor, bitSet != null ? bitSet::get : null);
292     }
293 
294     /**
295      * Skips semantically insignificant whitespace characters and moves the cursor to the closest
296      * non-whitespace character.
297      *
298      * @param buf buffer with the sequence of chars to be parsed
299      * @param cursor defines the bounds and current position of the buffer
300      */
301     public void skipWhiteSpace(final CharSequence buf, final Cursor cursor) {
302         Args.notNull(buf, "Char sequence");
303         Args.notNull(cursor, "Parser cursor");
304         int pos = cursor.getPos();
305         final int indexFrom = cursor.getPos();
306         final int indexTo = cursor.getUpperBound();
307         for (int i = indexFrom; i < indexTo; i++) {
308             final char current = buf.charAt(i);
309             if (!isWhitespace(current)) {
310                 break;
311             }
312             pos++;
313         }
314         cursor.updatePos(pos);
315     }
316 
317     /**
318      * Transfers content into the destination buffer until a whitespace character or any of
319      * the given delimiters is encountered.
320      *
321      * @param buf buffer with the sequence of chars to be parsed
322      * @param cursor defines the bounds and current position of the buffer
323      * @param delimiterPredicate delimiter predicate. Can be {@code null} if the token
324      *  is delimited by a whitespace only.
325      * @param dst destination buffer
326      */
327     public void copyContent(final CharSequence buf, final Cursor cursor, final Delimiter delimiterPredicate,
328                             final StringBuilder dst) {
329         Args.notNull(buf, "Char sequence");
330         Args.notNull(cursor, "Parser cursor");
331         Args.notNull(dst, "String builder");
332         int pos = cursor.getPos();
333         final int indexFrom = cursor.getPos();
334         final int indexTo = cursor.getUpperBound();
335         for (int i = indexFrom; i < indexTo; i++) {
336             final char current = buf.charAt(i);
337             if ((delimiterPredicate != null && delimiterPredicate.test(current)) || isWhitespace(current)) {
338                 break;
339             }
340             pos++;
341             dst.append(current);
342         }
343         cursor.updatePos(pos);
344     }
345 
346     /**
347      * @deprecated Use {@link #copyContent(CharSequence, Cursor, Delimiter, StringBuilder)}
348      */
349     @Deprecated
350     public void copyContent(final CharSequence buf, final Cursor cursor, final BitSet bitSet,
351                             final StringBuilder dst) {
352         copyContent(buf, cursor, bitSet != null ? bitSet::get : null, dst);
353     }
354 
355     /**
356      * Transfers content into the destination buffer until a whitespace character,  a quote,
357      * or any of the given delimiters is encountered.
358      *
359      * @param buf buffer with the sequence of chars to be parsed
360      * @param cursor defines the bounds and current position of the buffer
361      * @param delimiterPredicate delimiter predicate. Can be {@code null} if the token
362      *  is delimited by a whitespace or a quote only.
363      * @param dst destination buffer
364      */
365     public void copyUnquotedContent(final CharSequence buf, final Cursor cursor,
366             final Delimiter delimiterPredicate, final StringBuilder dst) {
367         Args.notNull(buf, "Char sequence");
368         Args.notNull(cursor, "Parser cursor");
369         Args.notNull(dst, "String builder");
370         int pos = cursor.getPos();
371         final int indexFrom = cursor.getPos();
372         final int indexTo = cursor.getUpperBound();
373         for (int i = indexFrom; i < indexTo; i++) {
374             final char current = buf.charAt(i);
375             if ((delimiterPredicate != null && delimiterPredicate.test(current))
376                     || isWhitespace(current) || current == DQUOTE) {
377                 break;
378             }
379             pos++;
380             dst.append(current);
381         }
382         cursor.updatePos(pos);
383     }
384 
385     /**
386      * @deprecated Use {@link #copyUnquotedContent(CharSequence, Cursor, Delimiter, StringBuilder)}
387      */
388     @Deprecated
389     public void copyUnquotedContent(final CharSequence buf, final Cursor cursor,
390                                     final BitSet bitSet, final StringBuilder dst) {
391         copyUnquotedContent(buf, cursor, bitSet != null ? bitSet::get : null, dst);
392     }
393 
394     /**
395      * Transfers content enclosed with quote marks into the destination buffer.
396      *
397      * @param buf buffer with the sequence of chars to be parsed
398      * @param cursor defines the bounds and current position of the buffer
399      * @param dst destination buffer
400      */
401     public void copyQuotedContent(final CharSequence buf, final Cursor cursor,
402             final StringBuilder dst) {
403         Args.notNull(buf, "Char sequence");
404         Args.notNull(cursor, "Parser cursor");
405         Args.notNull(dst, "String builder");
406         if (cursor.atEnd()) {
407             return;
408         }
409         int pos = cursor.getPos();
410         int indexFrom = cursor.getPos();
411         final int indexTo = cursor.getUpperBound();
412         char current = buf.charAt(pos);
413         if (current != DQUOTE) {
414             return;
415         }
416         pos++;
417         indexFrom++;
418         boolean escaped = false;
419         for (int i = indexFrom; i < indexTo; i++, pos++) {
420             current = buf.charAt(i);
421             if (escaped) {
422                 if (current != DQUOTE && current != ESCAPE) {
423                     dst.append(ESCAPE);
424                 }
425                 dst.append(current);
426                 escaped = false;
427             } else {
428                 if (current == DQUOTE) {
429                     pos++;
430                     break;
431                 }
432                 if (current == ESCAPE) {
433                     escaped = true;
434                 } else if (current != CR && current != LF) {
435                     dst.append(current);
436                 }
437             }
438         }
439         cursor.updatePos(pos);
440     }
441 
442 }