aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/org/apache/commons/lang3/CharSet.java
blob: a6b7bcbb55beac1959cfbffe81e47222ab6ba00e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3;

import java.io.Serializable;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.stream.Stream;

/**
 * A set of characters.
 *
 * <p>Instances are immutable, but instances of subclasses may not be.</p>
 *
 * <p>#ThreadSafe#</p>
 * @since 1.0
 */
public class CharSet implements Serializable {

    /**
     * Required for serialization support. Lang version 2.0.
     *
     * @see java.io.Serializable
     */
    private static final long serialVersionUID = 5947847346149275958L;

    /**
     * A CharSet defining no characters.
     * @since 2.0
     */
    public static final CharSet EMPTY = new CharSet((String) null);

    /**
     * A CharSet defining ASCII alphabetic characters "a-zA-Z".
     * @since 2.0
     */
    public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z");

    /**
     * A CharSet defining ASCII alphabetic characters "a-z".
     * @since 2.0
     */
    public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z");

    /**
     * A CharSet defining ASCII alphabetic characters "A-Z".
     * @since 2.0
     */
    public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z");

    /**
     * A CharSet defining ASCII alphabetic characters "0-9".
     * @since 2.0
     */
    public static final CharSet ASCII_NUMERIC = new CharSet("0-9");

    /**
     * A Map of the common cases used in the factory.
     * Subclasses can add more common patterns if desired
     * @since 2.0
     */
    protected static final Map<String, CharSet> COMMON = Collections.synchronizedMap(new HashMap<>());

    static {
        COMMON.put(null, EMPTY);
        COMMON.put(StringUtils.EMPTY, EMPTY);
        COMMON.put("a-zA-Z", ASCII_ALPHA);
        COMMON.put("A-Za-z", ASCII_ALPHA);
        COMMON.put("a-z", ASCII_ALPHA_LOWER);
        COMMON.put("A-Z", ASCII_ALPHA_UPPER);
        COMMON.put("0-9", ASCII_NUMERIC);
    }

    /** The set of CharRange objects. */
    private final Set<CharRange> set = Collections.synchronizedSet(new HashSet<>());

    /**
     * Factory method to create a new CharSet using a special syntax.
     *
     * <ul>
     *  <li>{@code null} or empty string ("")
     * - set containing no characters</li>
     *  <li>Single character, such as "a"
     *  - set containing just that character</li>
     *  <li>Multi character, such as "a-e"
     *  - set containing characters from one character to the other</li>
     *  <li>Negated, such as "^a" or "^a-e"
     *  - set containing all characters except those defined</li>
     *  <li>Combinations, such as "abe-g"
     *  - set containing all the characters from the individual sets</li>
     * </ul>
     *
     * <p>The matching order is:</p>
     * <ol>
     *  <li>Negated multi character range, such as "^a-e"
     *  <li>Ordinary multi character range, such as "a-e"
     *  <li>Negated single character, such as "^a"
     *  <li>Ordinary single character, such as "a"
     * </ol>
     *
     * <p>Matching works left to right. Once a match is found the
     * search starts again from the next character.</p>
     *
     * <p>If the same range is defined twice using the same syntax, only
     * one range will be kept.
     * Thus, "a-ca-c" creates only one range of "a-c".</p>
     *
     * <p>If the start and end of a range are in the wrong order,
     * they are reversed. Thus "a-e" is the same as "e-a".
     * As a result, "a-ee-a" would create only one range,
     * as the "a-e" and "e-a" are the same.</p>
     *
     * <p>The set of characters represented is the union of the specified ranges.</p>
     *
     * <p>There are two ways to add a literal negation character ({@code ^}):</p>
     * <ul>
     *     <li>As the last character in a string, e.g. {@code CharSet.getInstance("a-z^")}</li>
     *     <li>As a separate element, e.g. {@code CharSet.getInstance("^", "a-z")}</li>
     * </ul>
     *
     * <p>Examples using the negation character:</p>
     * <pre>
     *     CharSet.getInstance("^a-c").contains('a') = false
     *     CharSet.getInstance("^a-c").contains('d') = true
     *     CharSet.getInstance("^^a-c").contains('a') = true // (only '^' is negated)
     *     CharSet.getInstance("^^a-c").contains('^') = false
     *     CharSet.getInstance("^a-cd-f").contains('d') = true
     *     CharSet.getInstance("a-c^").contains('^') = true
     *     CharSet.getInstance("^", "a-c").contains('^') = true
     * </pre>
     *
     * <p>All CharSet objects returned by this method will be immutable.</p>
     *
     * @param setStrs  Strings to merge into the set, may be null
     * @return a CharSet instance
     * @since 2.4
     */
    public static CharSet getInstance(final String... setStrs) {
        if (setStrs == null) {
            return null;
        }
        if (setStrs.length == 1) {
            final CharSet common = COMMON.get(setStrs[0]);
            if (common != null) {
                return common;
            }
        }
        return new CharSet(setStrs);
    }

    /**
     * Constructs a new CharSet using the set syntax.
     * Each string is merged in with the set.
     *
     * @param set  Strings to merge into the initial set
     * @throws NullPointerException if set is {@code null}
     */
    protected CharSet(final String... set) {
        Stream.of(set).forEach(this::add);
    }

    /**
     * Add a set definition string to the {@link CharSet}.
     *
     * @param str  set definition string
     */
    protected void add(final String str) {
        if (str == null) {
            return;
        }

        final int len = str.length();
        int pos = 0;
        while (pos < len) {
            final int remainder = len - pos;
            if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
                // negated range
                set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3)));
                pos += 4;
            } else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
                // range
                set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2)));
                pos += 3;
            } else if (remainder >= 2 && str.charAt(pos) == '^') {
                // negated char
                set.add(CharRange.isNot(str.charAt(pos + 1)));
                pos += 2;
            } else {
                // char
                set.add(CharRange.is(str.charAt(pos)));
                pos += 1;
            }
        }
    }

    /**
     * Gets the internal set as an array of CharRange objects.
     *
     * @return an array of immutable CharRange objects
     * @since 2.0
     */
// NOTE: This is no longer public as CharRange is no longer a public class.
//       It may be replaced when CharSet moves to Range.
    /*public*/ CharRange[] getCharRanges() {
        return set.toArray(CharRange.EMPTY_ARRAY);
    }

    /**
     * Does the {@link CharSet} contain the specified
     * character {@code ch}.
     *
     * @param ch  the character to check for
     * @return {@code true} if the set contains the characters
     */
    public boolean contains(final char ch) {
        synchronized (set) {
            return set.stream().anyMatch(range -> range.contains(ch));
        }
    }

    // Basics
    /**
     * Compares two {@link CharSet} objects, returning true if they represent
     * exactly the same set of characters defined in the same way.
     *
     * <p>The two sets {@code abc} and {@code a-c} are <i>not</i>
     * equal according to this method.</p>
     *
     * @param obj  the object to compare to
     * @return true if equal
     * @since 2.0
     */
    @Override
    public boolean equals(final Object obj) {
        if (obj == this) {
            return true;
        }
        if (!(obj instanceof CharSet)) {
            return false;
        }
        final CharSet other = (CharSet) obj;
        return set.equals(other.set);
    }

    /**
     * Gets a hash code compatible with the equals method.
     *
     * @return a suitable hash code
     * @since 2.0
     */
    @Override
    public int hashCode() {
        return 89 + set.hashCode();
    }

    /**
     * Gets a string representation of the set.
     *
     * @return string representation of the set
     */
    @Override
    public String toString() {
        return set.toString();
    }

}