001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.lang.text;
018    
019    import java.util.ArrayList;
020    import java.util.Collections;
021    import java.util.List;
022    import java.util.ListIterator;
023    import java.util.NoSuchElementException;
024    
025    /**
 * Tokenizes a string based on delimiters (separators)
027     * and supporting quoting and ignored character concepts.
028     * <p>
029     * This class can split a String into many smaller strings. It aims
030     * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
031     * however it offers much more control and flexibility including implementing
032     * the <code>ListIterator</code> interface. By default, it is set up
033     * like <code>StringTokenizer</code>.
034     * <p>
035     * The input String is split into a number of <i>tokens</i>.
036     * Each token is separated from the next String by a <i>delimiter</i>.
037     * One or more delimiter characters must be specified.
038     * <p>
039     * Each token may be surrounded by quotes.
040     * The <i>quote</i> matcher specifies the quote character(s).
041     * A quote may be escaped within a quoted section by duplicating itself.
042     * <p>
043     * Between each token and the delimiter are potentially characters that need trimming.
044     * The <i>trimmer</i> matcher specifies these characters.
045     * One usage might be to trim whitespace characters.
046     * <p>
047     * At any point outside the quotes there might potentially be invalid characters.
048     * The <i>ignored</i> matcher specifies these characters to be removed.
049     * One usage might be to remove new line characters.
050     * <p>
051     * Empty tokens may be removed or returned as null.
052     * <pre>
053     * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
054     * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
055     * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
056     * </pre>
057     * <p>
058     *
059     * This tokenizer has the following properties and options:
060     *
061     * <table>
062     *  <tr>
063     *   <th>Property</th><th>Type</th><th>Default</th>
064     *  </tr>
065     *  <tr>
066     *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
067     *  </tr>
068     *  <tr>
069     *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
070     *  </tr>
071     *  <tr>
072     *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
073     *  </tr>
074     *  <tr>
075     *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
076     *  </tr>
077     *  <tr>
078     *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
079     *  </tr>
080     * </table>
081     *
082     * @author Matthew Inger
083     * @author Stephen Colebourne
084     * @author Gary D. Gregory
085     * @since 2.2
086     * @version $Id: StrTokenizer.java 592077 2007-11-05 16:47:10Z mbenson $
087     */
088    public class StrTokenizer implements ListIterator, Cloneable {
089    
090        private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
091        private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
092        static {
093            CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
094            CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
095            CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
096            CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
097            CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
098            CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
099            CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
100    
101            TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
102            TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
103            TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
104            TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
105            TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
106            TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
107            TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
108        }
109    
110        /** The text to work on. */
111        private char chars[];
112        /** The parsed tokens */
113        private String tokens[];
114        /** The current iteration position */
115        private int tokenPos;
116    
117        /** The delimiter matcher */
118        private StrMatcher delimMatcher = StrMatcher.splitMatcher();
119        /** The quote matcher */
120        private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
121        /** The ignored matcher */
122        private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
123        /** The trimmer matcher */
124        private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
125    
126        /** Whether to return empty tokens as null */
127        private boolean emptyAsNull = false;
128        /** Whether to ignore empty tokens */
129        private boolean ignoreEmptyTokens = true;
130    
131        //-----------------------------------------------------------------------
132    
133        /**
134         * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
135         * 
136         * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
137         */
138        private static StrTokenizer getCSVClone() {
139            return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
140        }
141    
142        /**
143         * Gets a new tokenizer instance which parses Comma Separated Value strings
144         * initializing it with the given input.  The default for CSV processing
145         * will be trim whitespace from both ends (which can be overridden with
146         * the setTrimmer method).
147         * <p>
148         * You must call a "reset" method to set the string which you want to parse.
149         * @return a new tokenizer instance which parses Comma Separated Value strings
150         */
151        public static StrTokenizer getCSVInstance() {
152            return getCSVClone();
153        }
154    
155        /**
156         * Gets a new tokenizer instance which parses Comma Separated Value strings
157         * initializing it with the given input.  The default for CSV processing
158         * will be trim whitespace from both ends (which can be overridden with
159         * the setTrimmer method).
160         *
161         * @param input  the text to parse
162         * @return a new tokenizer instance which parses Comma Separated Value strings
163         */
164        public static StrTokenizer getCSVInstance(String input) {
165            StrTokenizer tok = getCSVClone();
166            tok.reset(input);
167            return tok;
168        }
169    
170        /**
171         * Gets a new tokenizer instance which parses Comma Separated Value strings
172         * initializing it with the given input.  The default for CSV processing
173         * will be trim whitespace from both ends (which can be overridden with
174         * the setTrimmer method).
175         *
176         * @param input  the text to parse
177         * @return a new tokenizer instance which parses Comma Separated Value strings
178         */
179        public static StrTokenizer getCSVInstance(char[] input) {
180            StrTokenizer tok = getCSVClone();
181            tok.reset(input);
182            return tok;
183        }
184    
185        /**
186         * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
187         * 
188         * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
189         */
190        private static StrTokenizer getTSVClone() {
191            return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
192        }
193    
194    
195        /**
196         * Gets a new tokenizer instance which parses Tab Separated Value strings.
197         * The default for CSV processing will be trim whitespace from both ends
198         * (which can be overridden with the setTrimmer method).
199         * <p>
200         * You must call a "reset" method to set the string which you want to parse.
201         * @return a new tokenizer instance which parses Tab Separated Value strings.
202         */
203        public static StrTokenizer getTSVInstance() {
204            return getTSVClone();
205        }
206    
207        /**
208         * Gets a new tokenizer instance which parses Tab Separated Value strings.
209         * The default for CSV processing will be trim whitespace from both ends
210         * (which can be overridden with the setTrimmer method).
211         * @param input  the string to parse
212         * @return a new tokenizer instance which parses Tab Separated Value strings.
213         */
214        public static StrTokenizer getTSVInstance(String input) {
215            StrTokenizer tok = getTSVClone();
216            tok.reset(input);
217            return tok;
218        }
219    
220        /**
221         * Gets a new tokenizer instance which parses Tab Separated Value strings.
222         * The default for CSV processing will be trim whitespace from both ends
223         * (which can be overridden with the setTrimmer method).
224         * @param input  the string to parse
225         * @return a new tokenizer instance which parses Tab Separated Value strings.
226         */
227        public static StrTokenizer getTSVInstance(char[] input) {
228            StrTokenizer tok = getTSVClone();
229            tok.reset(input);
230            return tok;
231        }
232    
233        //-----------------------------------------------------------------------
234        /**
235         * Constructs a tokenizer splitting on space, tab, newline and formfeed
236         * as per StringTokenizer, but with no text to tokenize.
237         * <p>
238         * This constructor is normally used with {@link #reset(String)}.
239         */
240        public StrTokenizer() {
241            super();
242            this.chars = null;
243        }
244    
245        /**
246         * Constructs a tokenizer splitting on space, tab, newline and formfeed
247         * as per StringTokenizer.
248         *
249         * @param input  the string which is to be parsed
250         */
251        public StrTokenizer(String input) {
252            super();
253            if (input != null) {
254                chars = input.toCharArray();
255            } else {
256                chars = null;
257            }
258        }
259    
260        /**
261         * Constructs a tokenizer splitting on the specified delimiter character.
262         *
263         * @param input  the string which is to be parsed
264         * @param delim  the field delimiter character
265         */
266        public StrTokenizer(String input, char delim) {
267            this(input);
268            setDelimiterChar(delim);
269        }
270    
271        /**
272         * Constructs a tokenizer splitting on the specified delimiter string.
273         *
274         * @param input  the string which is to be parsed
275         * @param delim  the field delimiter string
276         */
277        public StrTokenizer(String input, String delim) {
278            this(input);
279            setDelimiterString(delim);
280        }
281    
282        /**
283         * Constructs a tokenizer splitting using the specified delimiter matcher.
284         *
285         * @param input  the string which is to be parsed
286         * @param delim  the field delimiter matcher
287         */
288        public StrTokenizer(String input, StrMatcher delim) {
289            this(input);
290            setDelimiterMatcher(delim);
291        }
292    
293        /**
294         * Constructs a tokenizer splitting on the specified delimiter character
295         * and handling quotes using the specified quote character.
296         *
297         * @param input  the string which is to be parsed
298         * @param delim  the field delimiter character
299         * @param quote  the field quoted string character
300         */
301        public StrTokenizer(String input, char delim, char quote) {
302            this(input, delim);
303            setQuoteChar(quote);
304        }
305    
306        /**
307         * Constructs a tokenizer splitting using the specified delimiter matcher
308         * and handling quotes using the specified quote matcher.
309         *
310         * @param input  the string which is to be parsed
311         * @param delim  the field delimiter matcher
312         * @param quote  the field quoted string matcher
313         */
314        public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
315            this(input, delim);
316            setQuoteMatcher(quote);
317        }
318    
319        /**
320         * Constructs a tokenizer splitting on space, tab, newline and formfeed
321         * as per StringTokenizer.
322         * <p>
323         * The input character array is not cloned, and must not be altered after
324         * passing in to this method.
325         *
326         * @param input  the string which is to be parsed, not cloned
327         */
328        public StrTokenizer(char[] input) {
329            super();
330            this.chars = input;
331        }
332    
333        /**
334         * Constructs a tokenizer splitting on the specified character.
335         * <p>
336         * The input character array is not cloned, and must not be altered after
337         * passing in to this method.
338         *
339         * @param input  the string which is to be parsed, not cloned
340         * @param delim the field delimiter character
341         */
342        public StrTokenizer(char[] input, char delim) {
343            this(input);
344            setDelimiterChar(delim);
345        }
346    
347        /**
348         * Constructs a tokenizer splitting on the specified string.
349         * <p>
350         * The input character array is not cloned, and must not be altered after
351         * passing in to this method.
352         *
353         * @param input  the string which is to be parsed, not cloned
354         * @param delim the field delimiter string
355         */
356        public StrTokenizer(char[] input, String delim) {
357            this(input);
358            setDelimiterString(delim);
359        }
360    
361        /**
362         * Constructs a tokenizer splitting using the specified delimiter matcher.
363         * <p>
364         * The input character array is not cloned, and must not be altered after
365         * passing in to this method.
366         *
367         * @param input  the string which is to be parsed, not cloned
368         * @param delim  the field delimiter matcher
369         */
370        public StrTokenizer(char[] input, StrMatcher delim) {
371            this(input);
372            setDelimiterMatcher(delim);
373        }
374    
375        /**
376         * Constructs a tokenizer splitting on the specified delimiter character
377         * and handling quotes using the specified quote character.
378         * <p>
379         * The input character array is not cloned, and must not be altered after
380         * passing in to this method.
381         *
382         * @param input  the string which is to be parsed, not cloned
383         * @param delim  the field delimiter character
384         * @param quote  the field quoted string character
385         */
386        public StrTokenizer(char[] input, char delim, char quote) {
387            this(input, delim);
388            setQuoteChar(quote);
389        }
390    
391        /**
392         * Constructs a tokenizer splitting using the specified delimiter matcher
393         * and handling quotes using the specified quote matcher.
394         * <p>
395         * The input character array is not cloned, and must not be altered after
396         * passing in to this method.
397         *
398         * @param input  the string which is to be parsed, not cloned
399         * @param delim  the field delimiter character
400         * @param quote  the field quoted string character
401         */
402        public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
403            this(input, delim);
404            setQuoteMatcher(quote);
405        }
406    
407        // API
408        //-----------------------------------------------------------------------
409        /**
410         * Gets the number of tokens found in the String.
411         *
412         * @return the number of matched tokens
413         */
414        public int size() {
415            checkTokenized();
416            return tokens.length;
417        }
418    
419        /**
420         * Gets the next token from the String.
421         *
422         * @return the next sequential token, or null when no more tokens are found
423         */
424        public String nextToken() {
425            if (hasNext()) {
426                return tokens[tokenPos++];
427            }
428            return null;
429        }
430    
431        /**
432         * Gets the previous token from the String.
433         *
434         * @return the previous sequential token, or null when no more tokens are found
435         */
436        public String previousToken() {
437            if (hasPrevious()) {
438                return tokens[--tokenPos];
439            }
440            return null;
441        }
442    
443        /**
444         * Gets a copy of the full token list as an independent modifiable array.
445         *
446         * @return the tokens as a String array
447         */
448        public String[] getTokenArray() {
449            checkTokenized();
450            return (String[]) tokens.clone();
451        }
452    
453        /**
454         * Gets a copy of the full token list as an independent modifiable list.
455         *
456         * @return the tokens as a String array
457         */
458        public List getTokenList() {
459            checkTokenized();
460            List list = new ArrayList(tokens.length);
461            for (int i = 0; i < tokens.length; i++) {
462                list.add(tokens[i]);
463            }
464            return list;
465        }
466    
467        /**
468         * Resets this tokenizer, forgetting all parsing and iteration already completed.
469         * <p>
470         * This method allows the same tokenizer to be reused for the same String.
471         *
472         * @return this, to enable chaining
473         */
474        public StrTokenizer reset() {
475            tokenPos = 0;
476            tokens = null;
477            return this;
478        }
479    
480        /**
481         * Reset this tokenizer, giving it a new input string to parse.
482         * In this manner you can re-use a tokenizer with the same settings
483         * on multiple input lines.
484         *
485         * @param input  the new string to tokenize, null sets no text to parse
486         * @return this, to enable chaining
487         */
488        public StrTokenizer reset(String input) {
489            reset();
490            if (input != null) {
491                this.chars = input.toCharArray();
492            } else {
493                this.chars = null;
494            }
495            return this;
496        }
497    
498        /**
499         * Reset this tokenizer, giving it a new input string to parse.
500         * In this manner you can re-use a tokenizer with the same settings
501         * on multiple input lines.
502         * <p>
503         * The input character array is not cloned, and must not be altered after
504         * passing in to this method.
505         *
506         * @param input  the new character array to tokenize, not cloned, null sets no text to parse
507         * @return this, to enable chaining
508         */
509        public StrTokenizer reset(char[] input) {
510            reset();
511            this.chars = input;
512            return this;
513        }
514    
515        // ListIterator
516        //-----------------------------------------------------------------------
517        /**
518         * Checks whether there are any more tokens.
519         *
520         * @return true if there are more tokens
521         */
522        public boolean hasNext() {
523            checkTokenized();
524            return tokenPos < tokens.length;
525        }
526    
527        /**
528         * Gets the next token. This method is equivalent to {@link #nextToken()}.
529         *
530         * @return the next String token
531         */
532        public Object next() {
533            if (hasNext()) {
534                return tokens[tokenPos++];
535            }
536            throw new NoSuchElementException();
537        }
538    
539        /**
540         * Gets the index of the next token to return.
541         *
542         * @return the next token index
543         */
544        public int nextIndex() {
545            return tokenPos;
546        }
547    
548        /**
549         * Checks whether there are any previous tokens that can be iterated to.
550         *
551         * @return true if there are previous tokens
552         */
553        public boolean hasPrevious() {
554            checkTokenized();
555            return tokenPos > 0;
556        }
557    
558        /**
559         * Gets the token previous to the last returned token.
560         *
561         * @return the previous token
562         */
563        public Object previous() {
564            if (hasPrevious()) {
565                return tokens[--tokenPos];
566            }
567            throw new NoSuchElementException();
568        }
569    
570        /**
571         * Gets the index of the previous token.
572         *
573         * @return the previous token index
574         */
575        public int previousIndex() {
576            return tokenPos - 1;
577        }
578    
579        /**
580         * Unsupported ListIterator operation.
581         *
582         * @throws UnsupportedOperationException always
583         */
584        public void remove() {
585            throw new UnsupportedOperationException("remove() is unsupported");
586        }
587    
588        /**
589         * Unsupported ListIterator operation.
590         * @param obj this parameter ignored.
591         * @throws UnsupportedOperationException always
592         */
593        public void set(Object obj) {
594            throw new UnsupportedOperationException("set() is unsupported");
595        }
596    
597        /**
598         * Unsupported ListIterator operation.
599         * @param obj this parameter ignored.
600         * @throws UnsupportedOperationException always
601         */
602        public void add(Object obj) {
603            throw new UnsupportedOperationException("add() is unsupported");
604        }
605    
606        // Implementation
607        //-----------------------------------------------------------------------
608        /**
609         * Checks if tokenization has been done, and if not then do it.
610         */
611        private void checkTokenized() {
612            if (tokens == null) {
613                if (chars == null) {
614                    // still call tokenize as subclass may do some work
615                    List split = tokenize(null, 0, 0);
616                    tokens = (String[]) split.toArray(new String[split.size()]);
617                } else {
618                    List split = tokenize(chars, 0, chars.length);
619                    tokens = (String[]) split.toArray(new String[split.size()]);
620                }
621            }
622        }
623    
624        /**
625         * Internal method to performs the tokenization.
626         * <p>
627         * Most users of this class do not need to call this method. This method
628         * will be called automatically by other (public) methods when required.
629         * <p>
630         * This method exists to allow subclasses to add code before or after the
631         * tokenization. For example, a subclass could alter the character array,
632         * offset or count to be parsed, or call the tokenizer multiple times on
633         * multiple strings. It is also be possible to filter the results.
634         * <p>
635         * <code>StrTokenizer</code> will always pass a zero offset and a count
636         * equal to the length of the array to this method, however a subclass
637         * may pass other values, or even an entirely different array.
638         * 
639         * @param chars  the character array being tokenized, may be null
640         * @param offset  the start position within the character array, must be valid
641         * @param count  the number of characters to tokenize, must be valid
642         * @return the modifiable list of String tokens, unmodifiable if null array or zero count
643         */
644        protected List tokenize(char[] chars, int offset, int count) {
645            if (chars == null || count == 0) {
646                return Collections.EMPTY_LIST;
647            }
648            StrBuilder buf = new StrBuilder();
649            List tokens = new ArrayList();
650            int pos = offset;
651            
652            // loop around the entire buffer
653            while (pos >= 0 && pos < count) {
654                // find next token
655                pos = readNextToken(chars, pos, count, buf, tokens);
656                
657                // handle case where end of string is a delimiter
658                if (pos >= count) {
659                    addToken(tokens, "");
660                }
661            }
662            return tokens;
663        }
664    
665        /**
666         * Adds a token to a list, paying attention to the parameters we've set.
667         *
668         * @param list  the list to add to
669         * @param tok  the token to add
670         */
671        private void addToken(List list, String tok) {
672            if (tok == null || tok.length() == 0) {
673                if (isIgnoreEmptyTokens()) {
674                    return;
675                }
676                if (isEmptyTokenAsNull()) {
677                    tok = null;
678                }
679            }
680            list.add(tok);
681        }
682    
683        /**
684         * Reads character by character through the String to get the next token.
685         *
686         * @param chars  the character array being tokenized
687         * @param start  the first character of field
688         * @param len  the length of the character array being tokenized
689         * @param workArea  a temporary work area
690         * @param tokens  the list of parsed tokens
691         * @return the starting position of the next field (the character
692         *  immediately after the delimiter), or -1 if end of string found
693         */
694        private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List tokens) {
695            // skip all leading whitespace, unless it is the
696            // field delimiter or the quote character
697            while (start < len) {
698                int removeLen = Math.max(
699                        getIgnoredMatcher().isMatch(chars, start, start, len),
700                        getTrimmerMatcher().isMatch(chars, start, start, len));
701                if (removeLen == 0 ||
702                    getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
703                    getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
704                    break;
705                }
706                start += removeLen;
707            }
708            
709            // handle reaching end
710            if (start >= len) {
711                addToken(tokens, "");
712                return -1;
713            }
714            
715            // handle empty token
716            int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
717            if (delimLen > 0) {
718                addToken(tokens, "");
719                return start + delimLen;
720            }
721            
722            // handle found token
723            int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
724            if (quoteLen > 0) {
725                return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
726            }
727            return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
728        }
729    
730        /**
731         * Reads a possibly quoted string token.
732         *
733         * @param chars  the character array being tokenized
734         * @param start  the first character of field
735         * @param len  the length of the character array being tokenized
736         * @param workArea  a temporary work area
737         * @param tokens  the list of parsed tokens
738         * @param quoteStart  the start position of the matched quote, 0 if no quoting
739         * @param quoteLen  the length of the matched quote, 0 if no quoting
740         * @return the starting position of the next field (the character
741         *  immediately after the delimiter, or if end of string found,
742         *  then the length of string
743         */
744        private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea, 
745                                   List tokens, int quoteStart, int quoteLen) 
746        {
747            // Loop until we've found the end of the quoted
748            // string or the end of the input
749            workArea.clear();
750            int pos = start;
751            boolean quoting = (quoteLen > 0);
752            int trimStart = 0;
753            
754            while (pos < len) {
755                // quoting mode can occur several times throughout a string
756                // we must switch between quoting and non-quoting until we
757                // encounter a non-quoted delimiter, or end of string
758                if (quoting) {
759                    // In quoting mode
760                    
761                    // If we've found a quote character, see if it's
762                    // followed by a second quote.  If so, then we need
763                    // to actually put the quote character into the token
764                    // rather than end the token.
765                    if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
766                        if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
767                            // matched pair of quotes, thus an escaped quote
768                            workArea.append(chars, pos, quoteLen);
769                            pos += (quoteLen * 2);
770                            trimStart = workArea.size();
771                            continue;
772                        }
773                        
774                        // end of quoting
775                        quoting = false;
776                        pos += quoteLen;
777                        continue;
778                    }
779                    
780                    // copy regular character from inside quotes
781                    workArea.append(chars[pos++]);
782                    trimStart = workArea.size();
783                    
784                } else {
785                    // Not in quoting mode
786                    
787                    // check for delimiter, and thus end of token
788                    int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
789                    if (delimLen > 0) {
790                        // return condition when end of token found
791                        addToken(tokens, workArea.substring(0, trimStart));
792                        return pos + delimLen;
793                    }
794                    
795                    // check for quote, and thus back into quoting mode
796                    if (quoteLen > 0) {
797                        if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
798                            quoting = true;
799                            pos += quoteLen;
800                            continue;
801                        }
802                    }
803                    
804                    // check for ignored (outside quotes), and ignore
805                    int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
806                    if (ignoredLen > 0) {
807                        pos += ignoredLen;
808                        continue;
809                    }
810                    
811                    // check for trimmed character
812                    // don't yet know if its at the end, so copy to workArea
813                    // use trimStart to keep track of trim at the end
814                    int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
815                    if (trimmedLen > 0) {
816                        workArea.append(chars, pos, trimmedLen);
817                        pos += trimmedLen;
818                        continue;
819                    }
820                    
821                    // copy regular character from outside quotes
822                    workArea.append(chars[pos++]);
823                    trimStart = workArea.size();
824                }
825            }
826            
827            // return condition when end of string found
828            addToken(tokens, workArea.substring(0, trimStart));
829            return -1;
830        }
831    
832        /**
833         * Checks if the characters at the index specified match the quote
834         * already matched in readNextToken().
835         *
836         * @param chars  the character array being tokenized
837         * @param pos  the position to check for a quote
838         * @param len  the length of the character array being tokenized
839         * @param quoteStart  the start position of the matched quote, 0 if no quoting
840         * @param quoteLen  the length of the matched quote, 0 if no quoting
841         * @return true if a quote is matched
842         */
843        private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
844            for (int i = 0; i < quoteLen; i++) {
845                if ((pos + i) >= len || chars[pos + i] != chars[quoteStart + i]) {
846                    return false;
847                }
848            }
849            return true;
850        }
851    
852        // Delimiter
853        //-----------------------------------------------------------------------
854        /**
855         * Gets the field delimiter matcher.
856         *
857         * @return the delimiter matcher in use
858         */
859        public StrMatcher getDelimiterMatcher() {
860            return this.delimMatcher;
861        }
862    
863        /**
864         * Sets the field delimiter matcher.
865         * <p>
     * The delimiter is used to separate one token from another.
867         *
868         * @param delim  the delimiter matcher to use
869         * @return this, to enable chaining
870         */
871        public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
872            if (delim == null) {
873                this.delimMatcher = StrMatcher.noneMatcher();
874            } else {
875                this.delimMatcher = delim;
876            }
877            return this;
878        }
879    
880        /**
881         * Sets the field delimiter character.
882         *
883         * @param delim  the delimiter character to use
884         * @return this, to enable chaining
885         */
886        public StrTokenizer setDelimiterChar(char delim) {
887            return setDelimiterMatcher(StrMatcher.charMatcher(delim));
888        }
889    
890        /**
891         * Sets the field delimiter string.
892         *
893         * @param delim  the delimiter string to use
894         * @return this, to enable chaining
895         */
896        public StrTokenizer setDelimiterString(String delim) {
897            return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
898        }
899    
900        // Quote
901        //-----------------------------------------------------------------------
902        /**
903         * Gets the quote matcher currently in use.
904         * <p>
905         * The quote character is used to wrap data between the tokens.
906         * This enables delimiters to be entered as data.
907         * The default value is '"' (double quote).
908         *
909         * @return the quote matcher in use
910         */
911        public StrMatcher getQuoteMatcher() {
912            return quoteMatcher;
913        }
914    
915        /**
916         * Set the quote matcher to use.
917         * <p>
918         * The quote character is used to wrap data between the tokens.
919         * This enables delimiters to be entered as data.
920         *
921         * @param quote  the quote matcher to use, null ignored
922         * @return this, to enable chaining
923         */
924        public StrTokenizer setQuoteMatcher(StrMatcher quote) {
925            if (quote != null) {
926                this.quoteMatcher = quote;
927            }
928            return this;
929        }
930    
931        /**
932         * Sets the quote character to use.
933         * <p>
934         * The quote character is used to wrap data between the tokens.
935         * This enables delimiters to be entered as data.
936         *
937         * @param quote  the quote character to use
938         * @return this, to enable chaining
939         */
940        public StrTokenizer setQuoteChar(char quote) {
941            return setQuoteMatcher(StrMatcher.charMatcher(quote));
942        }
943    
944        // Ignored
945        //-----------------------------------------------------------------------
946        /**
947         * Gets the ignored character matcher.
948         * <p>
949         * These characters are ignored when parsing the String, unless they are
950         * within a quoted region.
951         * The default value is not to ignore anything.
952         *
953         * @return the ignored matcher in use
954         */
955        public StrMatcher getIgnoredMatcher() {
956            return ignoredMatcher;
957        }
958    
959        /**
960         * Set the matcher for characters to ignore.
961         * <p>
962         * These characters are ignored when parsing the String, unless they are
963         * within a quoted region.
964         *
965         * @param ignored  the ignored matcher to use, null ignored
966         * @return this, to enable chaining
967         */
968        public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
969            if (ignored != null) {
970                this.ignoredMatcher = ignored;
971            }
972            return this;
973        }
974    
975        /**
976         * Set the character to ignore.
977         * <p>
978         * This character is ignored when parsing the String, unless it is
979         * within a quoted region.
980         *
981         * @param ignored  the ignored character to use
982         * @return this, to enable chaining
983         */
984        public StrTokenizer setIgnoredChar(char ignored) {
985            return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
986        }
987    
988        // Trimmer
989        //-----------------------------------------------------------------------
990        /**
991         * Gets the trimmer character matcher.
992         * <p>
993         * These characters are trimmed off on each side of the delimiter
994         * until the token or quote is found.
995         * The default value is not to trim anything.
996         *
997         * @return the trimmer matcher in use
998         */
999        public StrMatcher getTrimmerMatcher() {
1000            return trimmerMatcher;
1001        }
1002    
1003        /**
1004         * Sets the matcher for characters to trim.
1005         * <p>
1006         * These characters are trimmed off on each side of the delimiter
1007         * until the token or quote is found.
1008         *
1009         * @param trimmer  the trimmer matcher to use, null ignored
1010         * @return this, to enable chaining
1011         */
1012        public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
1013            if (trimmer != null) {
1014                this.trimmerMatcher = trimmer;
1015            }
1016            return this;
1017        }
1018    
1019        //-----------------------------------------------------------------------
1020        /**
1021         * Gets whether the tokenizer currently returns empty tokens as null.
1022         * The default for this property is false.
1023         *
1024         * @return true if empty tokens are returned as null
1025         */
1026        public boolean isEmptyTokenAsNull() {
1027            return this.emptyAsNull;
1028        }
1029    
1030        /**
1031         * Sets whether the tokenizer should return empty tokens as null.
1032         * The default for this property is false.
1033         *
1034         * @param emptyAsNull  whether empty tokens are returned as null
1035         * @return this, to enable chaining
1036         */
1037        public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
1038            this.emptyAsNull = emptyAsNull;
1039            return this;
1040        }
1041    
1042        //-----------------------------------------------------------------------
1043        /**
1044         * Gets whether the tokenizer currently ignores empty tokens.
1045         * The default for this property is true.
1046         *
1047         * @return true if empty tokens are not returned
1048         */
1049        public boolean isIgnoreEmptyTokens() {
1050            return ignoreEmptyTokens;
1051        }
1052    
1053        /**
1054         * Sets whether the tokenizer should ignore and not return empty tokens.
1055         * The default for this property is true.
1056         *
1057         * @param ignoreEmptyTokens  whether empty tokens are not returned
1058         * @return this, to enable chaining
1059         */
1060        public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
1061            this.ignoreEmptyTokens = ignoreEmptyTokens;
1062            return this;
1063        }
1064    
1065        //-----------------------------------------------------------------------
1066        /**
1067         * Gets the String content that the tokenizer is parsing.
1068         *
1069         * @return the string content being parsed
1070         */
1071        public String getContent() {
1072            if (chars == null) {
1073                return null;
1074            }
1075            return new String(chars);
1076        }
1077    
1078        //-----------------------------------------------------------------------
1079        /**
1080         * Creates a new instance of this Tokenizer. The new instance is reset so
1081         * that it will be at the start of the token list.
1082         * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
1083         * 
1084         * @return a new instance of this Tokenizer which has been reset.
1085         */
1086        public Object clone() {
1087            try {
1088                return cloneReset();
1089            } catch (CloneNotSupportedException ex) {
1090                return null;
1091            }
1092        }
1093    
1094        /**
1095         * Creates a new instance of this Tokenizer. The new instance is reset so that
1096         * it will be at the start of the token list.
1097         * 
1098         * @return a new instance of this Tokenizer which has been reset.
1099         * @throws CloneNotSupportedException if there is a problem cloning
1100         */
1101        Object cloneReset() throws CloneNotSupportedException {
1102            // this method exists to enable 100% test coverage
1103            StrTokenizer cloned = (StrTokenizer) super.clone();
1104            if (cloned.chars != null) {
1105                cloned.chars = (char[]) cloned.chars.clone();
1106            }
1107            cloned.reset();
1108            return cloned;
1109        }
1110    
1111        //-----------------------------------------------------------------------
1112        /**
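     * Gets a string representation of this tokenizer: the token list
     * prefixed with <code>StrTokenizer</code>, or a fixed placeholder
     * if the input has not been tokenized yet.
     *
     * @return a descriptive string for this tokenizer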
1116         */
1117        public String toString() {
1118            if (tokens == null) {
1119                return "StrTokenizer[not tokenized yet]";
1120            }
1121            return "StrTokenizer" + getTokenList();
1122        }
1123    
1124    }