001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     * 
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     * 
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.lang;
018    
import java.io.Serializable;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
025    
026    /**
027     * <p>A set of characters.</p>
028     *
029     * <p>Instances are immutable, but instances of subclasses may not be.</p>
030     *
031     * @author Stephen Colebourne
032     * @author Phil Steitz
033     * @author Pete Gieser
034     * @author Gary Gregory
035     * @since 1.0
036     * @version $Id: CharSet.java 618884 2008-02-06 04:37:17Z bayard $
037     */
038    public class CharSet implements Serializable {
039    
040        /**
041         * Required for serialization support. Lang version 2.0. 
042         * 
043         * @see java.io.Serializable
044         */
045        private static final long serialVersionUID = 5947847346149275958L;
046    
047        /** 
048         * A CharSet defining no characters. 
049         * @since 2.0
050         */
051        public static final CharSet EMPTY = new CharSet((String) null);
052    
053        /** 
054         * A CharSet defining ASCII alphabetic characters "a-zA-Z".
055         * @since 2.0
056         */
057        public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z");
058    
059        /** 
060         * A CharSet defining ASCII alphabetic characters "a-z".
061         * @since 2.0
062         */
063        public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z");
064    
065        /** 
066         * A CharSet defining ASCII alphabetic characters "A-Z".
067         * @since 2.0
068         */
069        public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z");
070    
071        /** 
072         * A CharSet defining ASCII alphabetic characters "0-9".
073         * @since 2.0
074         */
075        public static final CharSet ASCII_NUMERIC = new CharSet("0-9");
076    
077        /**
078         * A Map of the common cases used in the factory.
079         * Subclasses can add more common patterns if desired.
080         * @since 2.0
081         */
082        protected static final Map COMMON = new HashMap();
083        
084        static {
085            COMMON.put(null, EMPTY);
086            COMMON.put("", EMPTY);
087            COMMON.put("a-zA-Z", ASCII_ALPHA);
088            COMMON.put("A-Za-z", ASCII_ALPHA);
089            COMMON.put("a-z", ASCII_ALPHA_LOWER);
090            COMMON.put("A-Z", ASCII_ALPHA_UPPER);
091            COMMON.put("0-9", ASCII_NUMERIC);
092        }
093    
094        /** The set of CharRange objects. */
095        private Set set = new HashSet();
096    
097        //-----------------------------------------------------------------------
098        /**
099         * <p>Factory method to create a new CharSet using a special syntax.</p>
100         *
101         * <ul>
102         *  <li><code>null</code> or empty string ("")
103         * - set containing no characters</li>
104         *  <li>Single character, such as "a"
105         *  - set containing just that character</li>
106         *  <li>Multi character, such as "a-e"
107         *  - set containing characters from one character to the other</li>
108         *  <li>Negated, such as "^a" or "^a-e"
109         *  - set containing all characters except those defined</li>
110         *  <li>Combinations, such as "abe-g"
111         *  - set containing all the characters from the individual sets</li>
112         * </ul>
113         *
114         * <p>The matching order is:</p>
115         * <ol>
116         *  <li>Negated multi character range, such as "^a-e"
117         *  <li>Ordinary multi character range, such as "a-e"
118         *  <li>Negated single character, such as "^a"
119         *  <li>Ordinary single character, such as "a"
120         * </ol>
121         * <p>Matching works left to right. Once a match is found the
122         * search starts again from the next character.</p>
123         *
124         * <p>If the same range is defined twice using the same syntax, only
125         * one range will be kept.
126         * Thus, "a-ca-c" creates only one range of "a-c".</p>
127         *
128         * <p>If the start and end of a range are in the wrong order,
129         * they are reversed. Thus "a-e" is the same as "e-a".
130         * As a result, "a-ee-a" would create only one range,
131         * as the "a-e" and "e-a" are the same.</p>
132         *
133         * <p>The set of characters represented is the union of the specified ranges.</p>
134         *
135         * <p>All CharSet objects returned by this method will be immutable.</p>
136         *
137         * @param setStr  the String describing the set, may be null
138         * @return a CharSet instance
139         * @since 2.0
140         */
141        public static CharSet getInstance(String setStr) {
142            Object set = COMMON.get(setStr);
143            if (set != null) {
144                return (CharSet) set;
145            }
146            return new CharSet(setStr);
147        }
148    
149        /**
150         * <p>Constructs a new CharSet using the set syntax.
151         * Each string is merged in with the set.</p>
152         *
153         * @param setStrs  Strings to merge into the initial set, may be null
154         * @return a CharSet instance
155         * @since 2.4
156         */
157        public static CharSet getInstance(String[] setStrs) {
158            if (setStrs == null) {
159                return null;
160            }
161            return new CharSet(setStrs); 
162        }
163    
164        //-----------------------------------------------------------------------
165        /**
166         * <p>Constructs a new CharSet using the set syntax.</p>
167         *
168         * @param setStr  the String describing the set, may be null
169         * @since 2.0
170         */
171        protected CharSet(String setStr) {
172            super();
173            add(setStr);
174        }
175    
176        /**
177         * <p>Constructs a new CharSet using the set syntax.
178         * Each string is merged in with the set.</p>
179         *
180         * @param set  Strings to merge into the initial set
181         * @throws NullPointerException if set is <code>null</code>
182         */
183        protected CharSet(String[] set) {
184            super();
185            int sz = set.length;
186            for (int i = 0; i < sz; i++) {
187                add(set[i]);
188            }
189        }
190    
191        //-----------------------------------------------------------------------
192        /**
193         * <p>Add a set definition string to the <code>CharSet</code>.</p>
194         *
195         * @param str  set definition string
196         */
197        protected void add(String str) {
198            if (str == null) {
199                return;
200            }
201    
202            int len = str.length();
203            int pos = 0;
204            while (pos < len) {
205                int remainder = (len - pos);
206                if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
207                    // negated range
208                    set.add(new CharRange(str.charAt(pos + 1), str.charAt(pos + 3), true));
209                    pos += 4;
210                } else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
211                    // range
212                    set.add(new CharRange(str.charAt(pos), str.charAt(pos + 2)));
213                    pos += 3;
214                } else if (remainder >= 2 && str.charAt(pos) == '^') {
215                    // negated char
216                    set.add(new CharRange(str.charAt(pos + 1), true));
217                    pos += 2;
218                } else {
219                    // char
220                    set.add(new CharRange(str.charAt(pos)));
221                    pos += 1;
222                }
223            }
224        }
225    
226        //-----------------------------------------------------------------------
227        /**
228         * <p>Gets the internal set as an array of CharRange objects.</p>
229         *
230         * @return an array of immutable CharRange objects
231         * @since 2.0
232         */
233        public CharRange[] getCharRanges() {
234            return (CharRange[]) set.toArray(new CharRange[set.size()]);
235        }
236    
237        //-----------------------------------------------------------------------
238        /**
239         * <p>Does the <code>CharSet</code> contain the specified
240         * character <code>ch</code>.</p>
241         *
242         * @param ch  the character to check for
243         * @return <code>true</code> if the set contains the characters
244         */
245        public boolean contains(char ch) {
246            for (Iterator it = set.iterator(); it.hasNext();) {
247                CharRange range = (CharRange) it.next();
248                if (range.contains(ch)) {
249                    return true;
250                }
251            }
252            return false;
253        }
254    
255        // Basics
256        //-----------------------------------------------------------------------
257        /**
258         * <p>Compares two CharSet objects, returning true if they represent
259         * exactly the same set of characters defined in the same way.</p>
260         *
261         * <p>The two sets <code>abc</code> and <code>a-c</code> are <i>not</i>
262         * equal according to this method.</p>
263         *
264         * @param obj  the object to compare to
265         * @return true if equal
266         * @since 2.0
267         */
268        public boolean equals(Object obj) {
269            if (obj == this) {
270                return true;
271            }
272            if (obj instanceof CharSet == false) {
273                return false;
274            }
275            CharSet other = (CharSet) obj;
276            return set.equals(other.set);
277        }
278    
279        /**
280         * <p>Gets a hashCode compatible with the equals method.</p>
281         *
282         * @return a suitable hashCode
283         * @since 2.0
284         */
285        public int hashCode() {
286            return 89 + set.hashCode();
287        }
288    
289        /**
290         * <p>Gets a string representation of the set.</p>
291         *
292         * @return string representation of the set
293         */
294        public String toString() {
295            return set.toString();
296        }
297    
298    }