summaryrefslogtreecommitdiff
path: root/icu4j/main/translit/src/main/java/com/ibm/icu/text/AnyTransliterator.java
blob: 87e98af7fdf56b100877bff13f3cf06eb1ea5012 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*****************************************************************
* Copyright (c) 2002-2014, International Business Machines Corporation
* and others.  All Rights Reserved.
*****************************************************************
* Date        Name        Description
* 06/06/2002  aliu        Creation.
*****************************************************************
*/
package com.ibm.icu.text;

import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.MissingResourceException;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import com.ibm.icu.lang.UScript;
/**
 * A transliterator that translates multiple input scripts to a single
 * output script.  It is named Any-T or Any-T/V, where T is the target
 * and V is the optional variant.  The target T is a script.
 *
 * <p>An AnyTransliterator partitions text into runs of the same
 * script, together with adjacent COMMON or INHERITED characters.
 * After determining the script of each run, it transliterates from
 * that script to the given target/variant.  It does so by
 * instantiating a transliterator from the source script to the
 * target/variant.  If a run consists only of the target script,
 * COMMON, or INHERITED characters, then the run is not changed.
 *
 * <p>At startup, all possible AnyTransliterators are registered with
 * the system, as determined by examining the registered script
 * transliterators.
 *
 * @since ICU 2.2
 * @author Alan Liu
 */
class AnyTransliterator extends Transliterator {

    //------------------------------------------------------------
    // Constants

    static final char TARGET_SEP = '-';
    static final char VARIANT_SEP = '/';
    static final String ANY = "Any";
    static final String NULL_ID = "Null";
    static final String LATIN_PIVOT = "-Latin;Latin-";

    /**
     * Cache mapping UScriptCode values to Transliterator*.
     */
    private ConcurrentHashMap<Integer, Transliterator> cache;

    /**
     * The target or target/variant string.
     */
    private String target;

    /**
     * The target script code.  Never USCRIPT_INVALID_CODE.
     */
    private int targetScript;

    /**
     * Lazily initialize a special Transliterator for handling width characters.
     */
    private static class WidthFix {
        private static final String ID = "[[:dt=Nar:][:dt=Wide:]] nfkd";

        static final Transliterator INSTANCE = Transliterator.getInstance(ID);
    }

    /**
     * Implements {@link Transliterator#handleTransliterate}.
     */
    @Override
    protected void handleTransliterate(Replaceable text,
                                       Position pos, boolean isIncremental) {
        int allStart = pos.start;
        int allLimit = pos.limit;

        ScriptRunIterator it =
            new ScriptRunIterator(text, pos.contextStart, pos.contextLimit);

        while (it.next()) {
            // Ignore runs in the ante context
            if (it.limit <= allStart) continue;

            // Try to instantiate transliterator from it.scriptCode to
            // our target or target/variant
            Transliterator t = getTransliterator(it.scriptCode);

            if (t == null) {
                // We have no transliterator.  Do nothing, but keep
                // pos.start up to date.
                pos.start = it.limit;
                continue;
            }

            // If the run end is before the transliteration limit, do
            // a non-incremental transliteration.  Otherwise do an
            // incremental one.
            boolean incremental = isIncremental && (it.limit >= allLimit);

            pos.start = Math.max(allStart, it.start);
            pos.limit = Math.min(allLimit, it.limit);
            int limit = pos.limit;
            t.filteredTransliterate(text, pos, incremental);
            int delta = pos.limit - limit;
            allLimit += delta;
            it.adjustLimit(delta);

            // We're done if we enter the post context
            if (it.limit >= allLimit) break;
        }

        // Restore limit.  pos.start is fine where the last transliterator
        // left it, or at the end of the last run.
        pos.limit = allLimit;
    }

    /**
     * Private constructor
     * @param id the ID of the form S-T or S-T/V, where T is theTarget
     * and V is theVariant.  Must not be empty.
     * @param theTarget the target name.  Must not be empty, and must
     * name a script corresponding to theTargetScript.
     * @param theVariant the variant name, or the empty string if
     * there is no variant
     * @param theTargetScript the script code corresponding to
     * theTarget.
     */
    private AnyTransliterator(String id,
                              String theTarget,
                              String theVariant,
                              int theTargetScript) {
        super(id, null);
        targetScript = theTargetScript;
        cache = new ConcurrentHashMap<Integer, Transliterator>();

        target = theTarget;
        if (theVariant.length() > 0) {
            target = theTarget + VARIANT_SEP + theVariant;
        }
    }

    /**
     * @param id the ID of the form S-T or S-T/V, where T is theTarget
     * and V is theVariant.  Must not be empty.
     * @param filter The Unicode filter.
     * @param target2 the target name.
     * @param targetScript2 the script code corresponding to theTarget.
     * @param widthFix2 Not used. This parameter is deprecated.
     * @param cache2 The Map object for cache.
     */
    public AnyTransliterator(String id, UnicodeFilter filter, String target2,
            int targetScript2, Transliterator widthFix2, ConcurrentHashMap<Integer, Transliterator> cache2) {
        super(id, filter);
        targetScript = targetScript2;
        cache = cache2;
        target = target2;
    }

    /**
     * Returns a transliterator from the given source to our target or
     * target/variant.  Returns NULL if the source is the same as our
     * target script, or if the source is USCRIPT_INVALID_CODE.
     * Caches the result and returns the same transliterator the next
     * time.  The caller does NOT own the result and must not delete
     * it.
     */
    private Transliterator getTransliterator(int source) {
        if (source == targetScript || source == UScript.INVALID_CODE) {
            if (isWide(targetScript)) {
                return null;
            } else {
                return WidthFix.INSTANCE;
            }
        }

        Integer key = Integer.valueOf(source);
        Transliterator t = cache.get(key);
        if (t == null) {
            String sourceName = UScript.getName(source);
            String id = sourceName + TARGET_SEP + target;

            try {
                t = Transliterator.getInstance(id, FORWARD);
            } catch (RuntimeException e) { }
            if (t == null) {

                // Try to pivot around Latin, our most common script
                id = sourceName + LATIN_PIVOT + target;
                try {
                    t = Transliterator.getInstance(id, FORWARD);
                } catch (RuntimeException e) { }
            }

            if (t != null) {
                if (!isWide(targetScript)) {
                    List<Transliterator> v = new ArrayList<Transliterator>();
                    v.add(WidthFix.INSTANCE);
                    v.add(t);
                    t = new CompoundTransliterator(v);
                }
                Transliterator prevCachedT = cache.putIfAbsent(key, t);
                if (prevCachedT != null) {
                    t = prevCachedT;
                }
            } else if (!isWide(targetScript)) {
                return WidthFix.INSTANCE;
            }
        }

        return t;
    }

    /**
     * @param targetScript2
     * @return
     */
    private boolean isWide(int script) {
        return script == UScript.BOPOMOFO || script == UScript.HAN || script == UScript.HANGUL || script == UScript.HIRAGANA || script == UScript.KATAKANA;
    }

    /**
     * Registers standard transliterators with the system.  Called by
     * Transliterator during initialization.  Scan all current targets
     * and register those that are scripts T as Any-T/V.
     */
    static void register() {

        HashMap<String, Set<String>> seen = new HashMap<String, Set<String>>(); // old code used set, but was dependent on order

        for (Enumeration<String> s = Transliterator.getAvailableSources(); s.hasMoreElements(); ) {
            String source = s.nextElement();

            // Ignore the "Any" source
            if (source.equalsIgnoreCase(ANY)) continue;

            for (Enumeration<String> t = Transliterator.getAvailableTargets(source);
                 t.hasMoreElements(); ) {
                String target = t.nextElement();

                // Get the script code for the target.  If not a script, ignore.
                int targetScript = scriptNameToCode(target);
                if (targetScript == UScript.INVALID_CODE) {
                    continue;
                }

                Set<String> seenVariants = seen.get(target);
                if (seenVariants == null) {
                    seen.put(target, seenVariants = new HashSet<String>());
                }

                for (Enumeration<String> v = Transliterator.getAvailableVariants(source, target);
                     v.hasMoreElements(); ) {
                    String variant = v.nextElement();

                    // Only process each target/variant pair once
                    if (seenVariants.contains(variant)) {
                        continue;
                    }
                    seenVariants.add(variant);

                    String id;
                    id = TransliteratorIDParser.STVtoID(ANY, target, variant);
                    AnyTransliterator trans = new AnyTransliterator(id, target, variant,
                                                                    targetScript);
                    Transliterator.registerInstance(trans);
                    Transliterator.registerSpecialInverse(target, NULL_ID, false);
                }
            }
        }
    }

    /**
     * Return the script code for a given name, or
     * UScript.INVALID_CODE if not found.
     */
    private static int scriptNameToCode(String name) {
        try{
            int[] codes = UScript.getCode(name);
            return codes != null ? codes[0] : UScript.INVALID_CODE;
        }catch( MissingResourceException e){
            ///CLOVER:OFF
            return UScript.INVALID_CODE;
            ///CLOVER:ON
        }
    }

    //------------------------------------------------------------
    // ScriptRunIterator

    /**
     * Returns a series of ranges corresponding to scripts. They will be
     * of the form:
     *
     * ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
     * |            |          - first run (start, limit)
     *          |           |  - second run (start, limit)
     *
     * That is, the runs will overlap. The reason for this is so that a
     * transliterator can consider common characters both before and after
     * the scripts.
     */
    private static class ScriptRunIterator {

        private Replaceable text;
        private int textStart;
        private int textLimit;

        /**
         * The code of the current run, valid after next() returns.  May
         * be UScript.INVALID_CODE if and only if the entire text is
         * COMMON/INHERITED.
         */
        public int scriptCode;

        /**
         * The start of the run, inclusive, valid after next() returns.
         */
        public int start;

        /**
         * The end of the run, exclusive, valid after next() returns.
         */
        public int limit;

        /**
         * Constructs a run iterator over the given text from start
         * (inclusive) to limit (exclusive).
         */
        public ScriptRunIterator(Replaceable text, int start, int limit) {
            this.text = text;
            this.textStart = start;
            this.textLimit = limit;
            this.limit = start;
        }


        /**
         * Returns true if there are any more runs.  true is always
         * returned at least once.  Upon return, the caller should
         * examine scriptCode, start, and limit.
         */
        public boolean next() {
            int ch;
            int s;

            scriptCode = UScript.INVALID_CODE; // don't know script yet
            start = limit;

            // Are we done?
            if (start == textLimit) {
                return false;
            }

            // Move start back to include adjacent COMMON or INHERITED
            // characters
            while (start > textStart) {
                ch = text.char32At(start - 1); // look back
                s = UScript.getScript(ch);
                if (s == UScript.COMMON || s == UScript.INHERITED) {
                    --start;
                } else {
                    break;
                }
            }

            // Move limit ahead to include COMMON, INHERITED, and characters
            // of the current script.
            while (limit < textLimit) {
                ch = text.char32At(limit); // look ahead
                s = UScript.getScript(ch);
                if (s != UScript.COMMON && s != UScript.INHERITED) {
                    if (scriptCode == UScript.INVALID_CODE) {
                        scriptCode = s;
                    } else if (s != scriptCode) {
                        break;
                    }
                }
                ++limit;
            }

            // Return true even if the entire text is COMMON / INHERITED, in
            // which case scriptCode will be UScript.INVALID_CODE.
            return true;
        }

        /**
         * Adjusts internal indices for a change in the limit index of the
         * given delta.  A positive delta means the limit has increased.
         */
        public void adjustLimit(int delta) {
            limit += delta;
            textLimit += delta;
        }
    }

    /**
     * Temporary hack for registry problem. Needs to be replaced by better architecture.
     */
    public Transliterator safeClone() {
        UnicodeFilter filter = getFilter();
        if (filter != null && filter instanceof UnicodeSet) {
            filter = new UnicodeSet((UnicodeSet)filter);
        }
        return new AnyTransliterator(getID(), filter, target, targetScript, null, cache);
    }

    /* (non-Javadoc)
     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
     */
    @Override
    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
        UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
        // Assume that it can modify any character to any other character
        sourceSet.addAll(myFilter);
        if (myFilter.size() != 0) {
            targetSet.addAll(0, 0x10FFFF);
        }
    }
}