/*
 * ====================================================================
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 *
 */
27  
28  package org.apache.hc.core5.util;
29  
30  import java.util.BitSet;
31  
32  import org.apache.hc.core5.annotation.Contract;
33  import org.apache.hc.core5.annotation.ThreadingBehavior;
34  
35  /**
36   * Tokenizer that can be used as a foundation for more complex parsing routines.
37   * Methods of this class are designed to produce near zero intermediate garbage
38   * and make no intermediate copies of input data.
39   * <p>
40   * This class is immutable and thread safe.
41   *
42   * @since 5.1
43   */
44  @Contract(threading = ThreadingBehavior.IMMUTABLE)
45  public class Tokenizer {
46  
47      public static class Cursor {
48  
49          private final int lowerBound;
50          private final int upperBound;
51          private int pos;
52  
53          public Cursor(final int lowerBound, final int upperBound) {
54              super();
55              Args.notNegative(lowerBound, "lowerBound");
56              Args.check(lowerBound <= upperBound, "lowerBound cannot be greater than upperBound");
57              this.lowerBound = lowerBound;
58              this.upperBound = upperBound;
59              this.pos = lowerBound;
60          }
61  
62          public int getLowerBound() {
63              return this.lowerBound;
64          }
65  
66          public int getUpperBound() {
67              return this.upperBound;
68          }
69  
70          public int getPos() {
71              return this.pos;
72          }
73  
74          public void updatePos(final int pos) {
75              if (pos < this.lowerBound) {
76                  throw new IndexOutOfBoundsException("pos: "+pos+" < lowerBound: "+this.lowerBound);
77              }
78              if (pos > this.upperBound) {
79                  throw new IndexOutOfBoundsException("pos: "+pos+" > upperBound: "+this.upperBound);
80              }
81              this.pos = pos;
82          }
83  
84          public boolean atEnd() {
85              return this.pos >= this.upperBound;
86          }
87  
88          @Override
89          public String toString() {
90              final StringBuilder buffer = new StringBuilder();
91              buffer.append('[');
92              buffer.append(this.lowerBound);
93              buffer.append('>');
94              buffer.append(this.pos);
95              buffer.append('>');
96              buffer.append(this.upperBound);
97              buffer.append(']');
98              return buffer.toString();
99          }
100 
101     }
102 
103     public static BitSet INIT_BITSET(final int ... b) {
104         final BitSet bitset = new BitSet();
105         for (final int aB : b) {
106             bitset.set(aB);
107         }
108         return bitset;
109     }
110 
111     /** Double quote */
112     public static final char DQUOTE = '\"';
113 
114     /** Backward slash / escape character */
115     public static final char ESCAPE = '\\';
116 
117     public static final int CR = 13; // <US-ASCII CR, carriage return (13)>
118     public static final int LF = 10; // <US-ASCII LF, linefeed (10)>
119     public static final int SP = 32; // <US-ASCII SP, space (32)>
120     public static final int HT = 9;  // <US-ASCII HT, horizontal-tab (9)>
121 
122     public static boolean isWhitespace(final char ch) {
123         return ch == SP || ch == HT || ch == CR || ch == LF;
124     }
125 
126     public static final Tokenizerizer.html#Tokenizer">Tokenizer INSTANCE = new Tokenizer();
127 
128     /**
129      * Extracts from the sequence of chars a token terminated with any of the given delimiters
130      * or a whitespace characters.
131      *
132      * @param buf buffer with the sequence of chars to be parsed
133      * @param cursor defines the bounds and current position of the buffer
134      * @param delimiters set of delimiting characters. Can be {@code null} if the token
135      *  is not delimited by any character.
136      */
137     public String parseContent(final CharSequence buf, final Cursor cursor, final BitSet delimiters) {
138         Args.notNull(buf, "Char sequence");
139         Args.notNull(cursor, "Parser cursor");
140         final StringBuilder dst = new StringBuilder();
141         copyContent(buf, cursor, delimiters, dst);
142         return dst.toString();
143     }
144 
145     /**
146      * Extracts from the sequence of chars a token terminated with any of the given delimiters
147      * discarding semantically insignificant whitespace characters.
148      *
149      * @param buf buffer with the sequence of chars to be parsed
150      * @param cursor defines the bounds and current position of the buffer
151      * @param delimiters set of delimiting characters. Can be {@code null} if the token
152      *  is not delimited by any character.
153      */
154     public String parseToken(final CharSequence buf, final Cursor cursor, final BitSet delimiters) {
155         Args.notNull(buf, "Char sequence");
156         Args.notNull(cursor, "Parser cursor");
157         final StringBuilder dst = new StringBuilder();
158         boolean whitespace = false;
159         while (!cursor.atEnd()) {
160             final char current = buf.charAt(cursor.getPos());
161             if (delimiters != null && delimiters.get(current)) {
162                 break;
163             } else if (isWhitespace(current)) {
164                 skipWhiteSpace(buf, cursor);
165                 whitespace = true;
166             } else {
167                 if (whitespace && dst.length() > 0) {
168                     dst.append(' ');
169                 }
170                 copyContent(buf, cursor, delimiters, dst);
171                 whitespace = false;
172             }
173         }
174         return dst.toString();
175     }
176 
177     /**
178      * Extracts from the sequence of chars a value which can be enclosed in quote marks and
179      * terminated with any of the given delimiters discarding semantically insignificant
180      * whitespace characters.
181      *
182      * @param buf buffer with the sequence of chars to be parsed
183      * @param cursor defines the bounds and current position of the buffer
184      * @param delimiters set of delimiting characters. Can be {@code null} if the value
185      *  is not delimited by any character.
186      */
187     public String parseValue(final CharSequence buf, final Cursor cursor, final BitSet delimiters) {
188         Args.notNull(buf, "Char sequence");
189         Args.notNull(cursor, "Parser cursor");
190         final StringBuilder dst = new StringBuilder();
191         boolean whitespace = false;
192         while (!cursor.atEnd()) {
193             final char current = buf.charAt(cursor.getPos());
194             if (delimiters != null && delimiters.get(current)) {
195                 break;
196             } else if (isWhitespace(current)) {
197                 skipWhiteSpace(buf, cursor);
198                 whitespace = true;
199             } else if (current == DQUOTE) {
200                 if (whitespace && dst.length() > 0) {
201                     dst.append(' ');
202                 }
203                 copyQuotedContent(buf, cursor, dst);
204                 whitespace = false;
205             } else {
206                 if (whitespace && dst.length() > 0) {
207                     dst.append(' ');
208                 }
209                 copyUnquotedContent(buf, cursor, delimiters, dst);
210                 whitespace = false;
211             }
212         }
213         return dst.toString();
214     }
215 
216     /**
217      * Skips semantically insignificant whitespace characters and moves the cursor to the closest
218      * non-whitespace character.
219      *
220      * @param buf buffer with the sequence of chars to be parsed
221      * @param cursor defines the bounds and current position of the buffer
222      */
223     public void skipWhiteSpace(final CharSequence buf, final Cursor cursor) {
224         Args.notNull(buf, "Char sequence");
225         Args.notNull(cursor, "Parser cursor");
226         int pos = cursor.getPos();
227         final int indexFrom = cursor.getPos();
228         final int indexTo = cursor.getUpperBound();
229         for (int i = indexFrom; i < indexTo; i++) {
230             final char current = buf.charAt(i);
231             if (!isWhitespace(current)) {
232                 break;
233             }
234             pos++;
235         }
236         cursor.updatePos(pos);
237     }
238 
239     /**
240      * Transfers content into the destination buffer until a whitespace character or any of
241      * the given delimiters is encountered.
242      *
243      * @param buf buffer with the sequence of chars to be parsed
244      * @param cursor defines the bounds and current position of the buffer
245      * @param delimiters set of delimiting characters. Can be {@code null} if the value
246      *  is delimited by a whitespace only.
247      * @param dst destination buffer
248      */
249     public void copyContent(final CharSequence buf, final Cursor cursor, final BitSet delimiters,
250                             final StringBuilder dst) {
251         Args.notNull(buf, "Char sequence");
252         Args.notNull(cursor, "Parser cursor");
253         Args.notNull(dst, "String builder");
254         int pos = cursor.getPos();
255         final int indexFrom = cursor.getPos();
256         final int indexTo = cursor.getUpperBound();
257         for (int i = indexFrom; i < indexTo; i++) {
258             final char current = buf.charAt(i);
259             if ((delimiters != null && delimiters.get(current)) || isWhitespace(current)) {
260                 break;
261             }
262             pos++;
263             dst.append(current);
264         }
265         cursor.updatePos(pos);
266     }
267 
268     /**
269      * Transfers content into the destination buffer until a whitespace character,  a quote,
270      * or any of the given delimiters is encountered.
271      *
272      * @param buf buffer with the sequence of chars to be parsed
273      * @param cursor defines the bounds and current position of the buffer
274      * @param delimiters set of delimiting characters. Can be {@code null} if the value
275      *  is delimited by a whitespace or a quote only.
276      * @param dst destination buffer
277      */
278     public void copyUnquotedContent(final CharSequence buf, final Cursor cursor,
279             final BitSet delimiters, final StringBuilder dst) {
280         Args.notNull(buf, "Char sequence");
281         Args.notNull(cursor, "Parser cursor");
282         Args.notNull(dst, "String builder");
283         int pos = cursor.getPos();
284         final int indexFrom = cursor.getPos();
285         final int indexTo = cursor.getUpperBound();
286         for (int i = indexFrom; i < indexTo; i++) {
287             final char current = buf.charAt(i);
288             if ((delimiters != null && delimiters.get(current))
289                     || isWhitespace(current) || current == DQUOTE) {
290                 break;
291             }
292             pos++;
293             dst.append(current);
294         }
295         cursor.updatePos(pos);
296     }
297 
298     /**
299      * Transfers content enclosed with quote marks into the destination buffer.
300      *
301      * @param buf buffer with the sequence of chars to be parsed
302      * @param cursor defines the bounds and current position of the buffer
303      * @param dst destination buffer
304      */
305     public void copyQuotedContent(final CharSequence buf, final Cursor cursor,
306             final StringBuilder dst) {
307         Args.notNull(buf, "Char sequence");
308         Args.notNull(cursor, "Parser cursor");
309         Args.notNull(dst, "String builder");
310         if (cursor.atEnd()) {
311             return;
312         }
313         int pos = cursor.getPos();
314         int indexFrom = cursor.getPos();
315         final int indexTo = cursor.getUpperBound();
316         char current = buf.charAt(pos);
317         if (current != DQUOTE) {
318             return;
319         }
320         pos++;
321         indexFrom++;
322         boolean escaped = false;
323         for (int i = indexFrom; i < indexTo; i++, pos++) {
324             current = buf.charAt(i);
325             if (escaped) {
326                 if (current != DQUOTE && current != ESCAPE) {
327                     dst.append(ESCAPE);
328                 }
329                 dst.append(current);
330                 escaped = false;
331             } else {
332                 if (current == DQUOTE) {
333                     pos++;
334                     break;
335                 }
336                 if (current == ESCAPE) {
337                     escaped = true;
338                 } else if (current != CR && current != LF) {
339                     dst.append(current);
340                 }
341             }
342         }
343         cursor.updatePos(pos);
344     }
345 
346 }