libs/minikin/WordBreaker.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328

/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "WordBreaker.h"

#include <list>
#include <map>

#include <unicode/ubrk.h>
#include <unicode/uchar.h>
#include <unicode/utf16.h>

#include "minikin/Emoji.h"
#include "minikin/Hyphenator.h"

#include "Locale.h"
#include "MinikinInternal.h"

namespace minikin {

namespace {
/**
 * Opens a new ICU line-break iterator for the given locale and line-break options.
 *
 * The string produced by Locale::getStringWithLineBreakOption() is converted to an ICU locale ID
 * with uloc_forLanguageTag(). If that conversion fails, the root locale ("") is used instead, so
 * that a malformed or over-long tag degrades to default line breaking rather than to a null
 * iterator.
 *
 * @param locale      locale whose line-break rules are requested
 * @param lbStyle     line-break strictness option encoded into the locale string
 * @param lbWordStyle line-break word option encoded into the locale string
 * @return the iterator returned by ubrk_open(); the caller takes ownership. May still be nullptr
 *         if ubrk_open() itself fails.
 */
static UBreakIterator* createNewIterator(const Locale& locale, LineBreakStyle lbStyle,
                                         LineBreakWordStyle lbWordStyle) {
    UErrorCode status = U_ZERO_ERROR;
    char localeID[ULOC_FULLNAME_CAPACITY] = {};
    uloc_forLanguageTag(locale.getStringWithLineBreakOption(lbStyle, lbWordStyle).c_str(), localeID,
                        ULOC_FULLNAME_CAPACITY, nullptr, &status);
    if (status > U_ZERO_ERROR) {  // Same condition U_FAILURE(status) tests; warnings are negative.
        // ICU entry points return immediately when handed a failed status, so without this reset
        // ubrk_open() below would unconditionally return nullptr. Discard whatever was written to
        // the buffer and fall back to the root locale.
        localeID[0] = '\0';
        status = U_ZERO_ERROR;
    }
    // TODO: handle failure status of ubrk_open()
    return ubrk_open(UBreakIteratorType::UBRK_LINE, localeID, nullptr, 0, &status);
}
}  // namespace

/**
 * Hands out a line breaker matching the requested locale and line-break options.
 *
 * The pool is searched, under mMutex, for a slot with the same locale identifier, style and word
 * style. A matching slot is removed from the pool and returned; otherwise a brand new iterator is
 * created and wrapped in a fresh slot.
 */
ICULineBreakerPool::Slot ICULineBreakerPoolImpl::acquire(const Locale& locale,
                                                         LineBreakStyle lbStyle,
                                                         LineBreakWordStyle lbWordStyle) {
    const uint64_t id = locale.getIdentifier();
    std::lock_guard<std::mutex> lock(mMutex);

    auto candidate = mPool.begin();
    while (candidate != mPool.end()) {
        const bool sameKey = candidate->localeId == id && candidate->lbStyle == lbStyle &&
                             candidate->lbWordStyle == lbWordStyle;
        if (sameKey) {
            // Take the slot out of the pool; the caller owns it until release().
            Slot reused = std::move(*candidate);
            mPool.erase(candidate);
            return reused;
        }
        candidate++;
    }

    // Nothing suitable is pooled, so build a new breaker for this key.
    return {id, lbStyle, lbWordStyle,
            IcuUbrkUniquePtr(createNewIterator(locale, lbStyle, lbWordStyle))};
}

/**
 * Returns a slot to the pool.
 *
 * A slot whose breaker is null has already been released and is ignored. If the pool already
 * holds MAX_POOL_SIZE entries, the slot is destroyed instead of being stored.
 */
void ICULineBreakerPoolImpl::release(ICULineBreakerPool::Slot&& slot) {
    if (slot.breaker.get() != nullptr) {
        std::lock_guard<std::mutex> lock(mMutex);
        if (mPool.size() < MAX_POOL_SIZE) {
            mPool.push_front(std::move(slot));
        } else {
            // No room left: take ownership locally so the breaker is destroyed when this
            // variable goes out of scope.
            Slot discarded = std::move(slot);
        }
    }
}

// Default construction uses the ICULineBreakerPoolImpl singleton; all member initialization is
// delegated to the pool-taking constructor.
WordBreaker::WordBreaker() : WordBreaker(&ICULineBreakerPoolImpl::getInstance()) {}

// Constructs a WordBreaker that acquires and releases its ICU breaker through the given pool.
// mUText starts out empty (null) with utext_close as its deleter; setText() fills it in.
WordBreaker::WordBreaker(ICULineBreakerPool* pool) : mPool(pool), mUText(nullptr, &utext_close) {}

/**
 * Switches to a breaker for the given locale and line-break options and computes the next break
 * after `from`.
 *
 * Does nothing (and returns the current break) when no text has been set. While inside an email
 * address or URL the `from` argument is ignored and the existing scan state is kept.
 *
 * @return the current break offset after the call
 */
ssize_t WordBreaker::followingWithLocale(const Locale& locale, LineBreakStyle lbStyle,
                                         LineBreakWordStyle lbWordStyle, size_t from) {
    if (mUText) {
        // NOTE(review): any slot previously held in mIcuBreaker is overwritten here rather than
        // handed back through mPool->release(); confirm this is intended.
        mIcuBreaker = mPool->acquire(locale, lbStyle, lbWordStyle);
        UErrorCode status = U_ZERO_ERROR;
        MINIKIN_ASSERT(mText != nullptr, "setText must be called first");
        // TODO: handle failure status
        ubrk_setUText(mIcuBreaker.breaker.get(), mUText.get(), &status);
        if (!mInEmailOrUrl) {
            mCurrent = mLast = mScanOffset = from;
            next();
        }
        // Otherwise mCurrent, mLast and mScanOffset are deliberately left alone to preserve the
        // email/URL context. Email/URL detection cannot restart from an arbitrary position, so
        // following() is not supported in general; keeping the old context is sufficient for
        // LineBreaker, which only wants the next break point re-calculated with the new locale.
    }
    return mCurrent;
}

/**
 * Points the breaker at a new UTF-16 buffer and resets all iteration state.
 *
 * @param data the text; the pointer is stored, not copied
 * @param size length of `data` in 16-bit code units
 */
void WordBreaker::setText(const uint16_t* data, size_t size) {
    // Remember the raw buffer for our own scanning.
    mText = data;
    mTextSize = size;

    // Start over from the beginning, outside any email/URL context.
    mLast = mCurrent = mScanOffset = 0;
    mInEmailOrUrl = false;

    // Wrap the same buffer in a UText for ICU.
    UErrorCode status = U_ZERO_ERROR;
    mUText.reset(utext_openUChars(nullptr, reinterpret_cast<const UChar*>(data), size, &status));
}

// Returns the offset of the most recently computed break (mCurrent), as set by setText(),
// followingWithLocale() or next().
ssize_t WordBreaker::current() const {
    return mCurrent;
}

/**
 * Decide whether the break ICU proposed at offset i of buf is acceptable as a line break.
 * Plain ICU offers some line break opportunities that we do not want, so this applies our
 * customizations on top of the ICU behavior.
 *
 * @param buf    the UTF-16 text
 * @param bufEnd length of buf in code units
 * @param i      candidate break offset, or UBRK_DONE
 * @return true if the break may be used, false if the caller should move on to the next one
 **/
static bool isValidBreak(const uint16_t* buf, size_t bufEnd, int32_t i) {
    const size_t breakPos = static_cast<size_t>(i);
    if (i == UBRK_DONE || breakPos == bufEnd) {
        // Running off the end of the text always counts as a break.
        return true;
    }

    // Code point immediately before the candidate break.
    uint32_t before;
    size_t beforeOffset = breakPos;
    U16_PREV(buf, 0, beforeOffset, before);

    // Hard and soft hyphens are left to automatic hyphenation, so never break right after one.
    const bool afterHyphen =
            Hyphenator::isLineBreakingHyphen(before) || before == CHAR_SOFT_HYPHEN;
    if (afterHyphen) {
        return false;
    }

    // Myanmar kinzi sequences are formed by <consonant, ASAT, VIRAMA, consonant>. This works
    // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. Rather
    // than inspecting the surrounding text, refuse to break after any Myanmar virama: it is a pure
    // stacker, so no line break could be imagined there.
    const bool afterMyanmarVirama = (before == 0x1039);  // MYANMAR SIGN VIRAMA
    if (afterMyanmarVirama) {
        return false;
    }

    // Code point immediately after the candidate break.
    uint32_t after;
    size_t afterOffset = breakPos;
    U16_NEXT(buf, afterOffset, bufEnd, after);

    // Rule LB8 for Emoji ZWJ sequences. Done here because our emoji data may be fresher than
    // ICU's.
    if (before == CHAR_ZWJ && isEmoji(after)) {
        return false;
    }

    // Rule LB30b. We need to do this ourselves since our emoji data may be fresher than ICU's.
    if (!isEmojiModifier(after)) {
        return true;
    }
    if (before == 0xFE0F && beforeOffset > 0) {
        // Look through an emoji variation selector to the code point it follows.
        U16_PREV(buf, 0, beforeOffset, before);
    }
    return !isEmojiBase(before);
}

// Advances the ICU iterator to the first break after mCurrent that passes isValidBreak(),
// skipping the ICU break opportunities we reject. Handles both repositioning the iterator and our
// modifications to ICU's behavior.
int32_t WordBreaker::iteratorNext() {
    auto* const breaker = mIcuBreaker.breaker.get();
    int32_t candidate = ubrk_following(breaker, mCurrent);
    while (true) {
        if (isValidBreak(mText, mTextSize, candidate)) {
            return candidate;
        }
        candidate = ubrk_next(breaker);
    }
}

// Returns true for the characters after which the Chicago Manual of Style recommends breaking in
// URLs and email addresses: ':', '=' and '&'.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}

// Returns true for the characters before which the Chicago Manual of Style recommends breaking in
// URLs and email addresses.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}

// States of the forward scan in WordBreaker::detectEmailOrUrl().
// Keep SAW_COLON, SAW_COLON_SLASH and SAW_COLON_SLASH_SLASH consecutive and in this order: the
// scanner treats each one as the direct successor of the previous one.
enum ScanState {
    START,                  // nothing significant seen yet (or a partial "://" was abandoned)
    SAW_AT,                 // an '@' was seen: the run is treated as an email address
    SAW_COLON,              // a ':' was seen
    SAW_COLON_SLASH,        // ":/" was seen
    SAW_COLON_SLASH_SLASH,  // "://" was seen: the run is treated as a URL
};

/**
 * Scans forward from mLast over a run of printable, non-space ASCII and decides whether that run
 * is an email address (contains '@') or a URL (contains "://").
 *
 * Updates mInEmailOrUrl with the verdict and mScanOffset with the end of the scanned run. Nothing
 * happens while mLast is still inside the previously scanned run.
 */
void WordBreaker::detectEmailOrUrl() {
    if (mLast < mScanOffset) {
        return;  // Still within the region covered by the last scan.
    }

    ScanState state = START;
    size_t i = mLast;
    for (; i < mTextSize; i++) {
        const uint16_t c = mText[i];
        // Only ASCII above space and up to '~' is scanned; anything else ends the run.
        if (c <= ' ' || c > 0x007E) {
            break;
        }
        switch (state) {
            case START:
                if (c == '@') {
                    state = SAW_AT;
                } else if (c == ':') {
                    state = SAW_COLON;
                }
                break;
            case SAW_COLON:
                state = (c == '/') ? SAW_COLON_SLASH : START;
                break;
            case SAW_COLON_SLASH:
                state = (c == '/') ? SAW_COLON_SLASH_SLASH : START;
                break;
            case SAW_AT:
            case SAW_COLON_SLASH_SLASH:
                // Verdict already reached; keep consuming the rest of the run.
                break;
        }
    }

    const bool detected = (state == SAW_AT || state == SAW_COLON_SLASH_SLASH);
    if (detected && !ubrk_isBoundary(mIcuBreaker.breaker.get(), i)) {
        // Combining marks or similar trailing the URL or email address are treated as part of it,
        // so extend the run to the next actual boundary.
        i = ubrk_following(mIcuBreaker.breaker.get(), i);
    }
    mInEmailOrUrl = detected;
    mScanOffset = i;
}

/**
 * Finds the next break inside an email address or URL, following the special rules of the
 * Chicago Manual of Style (16th ed.).
 *
 * Scanning starts right after mLast and stops at mScanOffset at the latest.
 *
 * @return the offset of the next break; mScanOffset if no earlier break applies
 */
ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
    uint16_t prevChar = mText[mLast];
    ssize_t pos = mLast + 1;
    while (pos < mScanOffset) {
        if (breakAfter(prevChar)) {
            break;
        }
        // A double slash is followed by a break.
        if (prevChar == '/' && pos >= mLast + 2 && mText[pos - 2] == '/') {
            break;
        }
        const uint16_t curChar = mText[pos];
        // A hyphen is never followed by a break, so the "break before" rules are skipped after it.
        if (prevChar != '-') {
            if (breakBefore(curChar)) {
                break;
            }
            // A single slash is preceded by a break; the first slash of "//" is not.
            const bool opensDoubleSlash = pos + 1 < mScanOffset && mText[pos + 1] == '/';
            if (curChar == '/' && prevChar != '/' && !opensDoubleSlash) {
                break;
            }
        }
        prevChar = curChar;
        pos++;
    }
    return pos;
}

/**
 * Advances to the next break.
 *
 * The previous break becomes mLast. Email/URL detection is refreshed first; inside an email
 * address or URL the special-case rules pick the break, otherwise the (filtered) ICU iterator
 * does.
 *
 * @return the new current break offset
 */
ssize_t WordBreaker::next() {
    mLast = mCurrent;
    detectEmailOrUrl();
    mCurrent = mInEmailOrUrl ? findNextBreakInEmailOrUrl()
                             : static_cast<ssize_t>(iteratorNext());  // business as usual
    return mCurrent;
}

ssize_t WordBreaker::wordStart() const {
    if (mInEmailOrUrl) {
        return mLast;
    }
    ssize_t result = mLast;
    while (result < mCurrent) {
        UChar32 c;
        ssize_t ix = result;
        U16_NEXT(mText, ix, mCurrent, c);
        const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
        // strip leading punctuation, defined as OP and QU line breaking classes,
        // see UAX #14
        if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
            break;
        }
        result = ix;
    }
    return result;
}

ssize_t WordBreaker::wordEnd() const {
    if (mInEmailOrUrl) {
        return mLast;
    }
    ssize_t result = mCurrent;
    while (result > mLast) {
        UChar32 c;
        ssize_t ix = result;
        U16_PREV(mText, mLast, ix, c);
        const int32_t gc_mask = U_GET_GC_MASK(c);
        // strip trailing spaces, punctuation and control characters
        if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK | U_GC_CC_MASK)) == 0) {
            break;
        }
        result = ix;
    }
    return result;
}

// Returns 1 when the current break lies strictly inside a detected email address or URL (before
// the end of the scanned run), and 0 for every other break.
int WordBreaker::breakBadness() const {
    if (mInEmailOrUrl && mCurrent < mScanOffset) {
        return 1;
    }
    return 0;
}

// Ends work on the current text: forgets the text pointer, closes the UText wrapper and hands the
// ICU breaker slot back to the pool. setText() must be called again before further use.
void WordBreaker::finish() {
    mText = nullptr;
    // Resetting the unique_ptr invokes its deleter (utext_close) on the wrapped UText, if any.
    mUText.reset();
    // release() ignores a slot whose breaker is null, so this is safe even if no breaker was
    // ever acquired.
    mPool->release(std::move(mIcuBreaker));
}

}  // namespace minikin