summaryrefslogtreecommitdiff
path: root/include/minikin/Hyphenator.h
blob: 75629739e94552cc36c53bf8e2e80a9420889876 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * An implementation of Liang's hyphenation algorithm.
 */

#ifndef MINIKIN_HYPHENATOR_H
#define MINIKIN_HYPHENATOR_H

#include <string>
#include <vector>

#include "minikin/Characters.h"
#include "minikin/U16StringPiece.h"

namespace minikin {

class Hyphenator;

// Registers the hyphenator under the given locale string.
// This doesn't take ownership of the hyphenator, but we don't need to care about the ownership:
// in Android, the Hyphenator is allocated in Zygote and never gets released.
void addHyphenator(const std::string& localeStr, const Hyphenator* hyphenator);
// NOTE(review): presumably makes fromLocaleStr resolve to whatever hyphenator is registered for
// toLocaleStr; the implementation is not visible from this header -- confirm.
void addHyphenatorAlias(const std::string& fromLocaleStr, const std::string& toLocaleStr);

enum class HyphenationType : uint8_t {
    // Caution: code in several places implicitly assumes that DONT_BREAK has the value 0.

    // No line break at this position.
    DONT_BREAK = 0,
    // Line break with a regular hyphen inserted.
    BREAK_AND_INSERT_HYPHEN = 1,
    // Line break with an Armenian hyphen (U+058A) inserted.
    BREAK_AND_INSERT_ARMENIAN_HYPHEN = 2,
    // Line break with a maqaf (the Hebrew hyphen, U+05BE) inserted.
    BREAK_AND_INSERT_MAQAF = 3,
    // Line break with a Canadian Syllabics hyphen (U+1400) inserted.
    BREAK_AND_INSERT_UCAS_HYPHEN = 4,
    // Line break without inserting anything. Applies when a hyphen is already present, or when
    // the script does not use a hyphen (e.g. Malayalam).
    BREAK_AND_DONT_INSERT_HYPHEN = 5,
    // Line break where the last code unit is replaced with a hyphen. Needed for Catalan "l·l",
    // which hyphenates as "l-/l".
    BREAK_AND_REPLACE_WITH_HYPHEN = 6,
    // Line break where the hyphen (the last character) is repeated at the start of the next
    // line. Needed for Polish (where "czerwono-niebieska" should hyphenate as
    // "czerwono-/-niebieska") and for Slovenian.
    BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE = 7,
    // Line break with a ZWJ and a hyphen inserted on the first line and a ZWJ on the second
    // line. Used in Arabic script, mostly for writing systems of Central Asia; this is the
    // default behavior when a soft hyphen is used in Arabic script.
    BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8,
};

// The hyphen edit represents an edit to the string when a word is hyphenated.
// The most common hyphen edit is adding a "-" at the end of a syllable, but nonstandard hyphenation
// allows for more choices.
// A hyphen edit has two parts: one applied at the beginning of the string/line and one applied at
// the end.
enum class EndHyphenEdit : uint8_t {
    // Invariant used by isInsertion(EndHyphenEdit): every enumerator that inserts characters has
    // a value greater than or equal to INSERT_HYPHEN; the enumerators below it insert nothing.
    NO_EDIT = 0b000,
    REPLACE_WITH_HYPHEN = 0b001,

    INSERT_HYPHEN = 0b010,
    INSERT_ARMENIAN_HYPHEN = 0b011,
    INSERT_MAQAF = 0b100,
    INSERT_UCAS_HYPHEN = 0b101,
    INSERT_ZWJ_AND_HYPHEN = 0b110,
};

// The start-of-line half of a hyphen edit.
enum class StartHyphenEdit : uint8_t {
    NO_EDIT = 0b00,

    INSERT_HYPHEN = 0b01,
    INSERT_ZWJ = 0b10,
};

// A packed hyphen edit: the EndHyphenEdit lives in the bits selected by MASK_END_OF_LINE and the
// StartHyphenEdit in the bits selected by MASK_START_OF_LINE.
typedef uint8_t HyphenEdit;
// Bit position of the StartHyphenEdit field inside a HyphenEdit.
constexpr uint8_t START_BITS_SHIFT = 3;
// The following two masks must be kept in sync with the definitions in the Java code at:
// frameworks/base/graphics/java/android/graphics/Paint.java
constexpr uint8_t MASK_END_OF_LINE = 0b00111;
constexpr uint8_t MASK_START_OF_LINE = 0b11000;

// Compile-time checks that the enum ranges, the masks and the shift agree with each other.
static_assert((MASK_END_OF_LINE & MASK_START_OF_LINE) == 0,
              "start and end hyphen edit bit fields must not overlap");
static_assert(static_cast<uint8_t>(EndHyphenEdit::INSERT_ZWJ_AND_HYPHEN) <= MASK_END_OF_LINE,
              "EndHyphenEdit must fit in MASK_END_OF_LINE");
static_assert(static_cast<uint8_t>(StartHyphenEdit::INSERT_ZWJ) <=
                      (MASK_START_OF_LINE >> START_BITS_SHIFT),
              "StartHyphenEdit must fit in MASK_START_OF_LINE");

// Combines a start-of-line edit and an end-of-line edit into one packed HyphenEdit.
inline HyphenEdit packHyphenEdit(StartHyphenEdit start, EndHyphenEdit end) {
    // The operands are promoted to int by the shift and the OR; narrow back explicitly.
    return static_cast<HyphenEdit>((static_cast<uint8_t>(start) << START_BITS_SHIFT) |
                                   static_cast<uint8_t>(end));
}

// Extracts the end-of-line edit from a packed HyphenEdit. Bits outside MASK_END_OF_LINE are
// ignored.
inline EndHyphenEdit endHyphenEdit(HyphenEdit hyphenEdit) {
    return static_cast<EndHyphenEdit>(hyphenEdit & MASK_END_OF_LINE);
}

// Extracts the start-of-line edit from a packed HyphenEdit. Bits outside MASK_START_OF_LINE are
// ignored, mirroring endHyphenEdit(); without the mask, stray high bits of the 8-bit value would
// leak into the result and produce a value that is not a StartHyphenEdit enumerator.
inline StartHyphenEdit startHyphenEdit(HyphenEdit hyphenEdit) {
    return static_cast<StartHyphenEdit>((hyphenEdit & MASK_START_OF_LINE) >> START_BITS_SHIFT);
}

// True only for REPLACE_WITH_HYPHEN; every other end-of-line edit yields false.
inline bool isReplacement(EndHyphenEdit hyph) {
    switch (hyph) {
        case EndHyphenEdit::REPLACE_WITH_HYPHEN:
            return true;
        default:
            return false;
    }
}

// Every start-of-line edit other than NO_EDIT counts as an insertion.
inline bool isInsertion(StartHyphenEdit hyph) {
    const bool leavesLineUntouched = (hyph == StartHyphenEdit::NO_EDIT);
    return !leavesLineUntouched;
}

// An end-of-line edit is an insertion when its numeric value is at least that of INSERT_HYPHEN;
// the comparison is done on the underlying uint8_t values.
inline bool isInsertion(EndHyphenEdit hyph) {
    constexpr uint8_t firstInsertingValue = static_cast<uint8_t>(EndHyphenEdit::INSERT_HYPHEN);
    const uint8_t rawValue = static_cast<uint8_t>(hyph);
    return rawValue >= firstInsertingValue;
}

// Yields the element count of a built-in array. The count is deduced from the array's type, so
// the array contents are never read and the call is usable in constant expressions.
template <typename Elem, size_t count>
constexpr size_t ARRAY_SIZE(const Elem (&)[count]) {
    return count;
}
// Code point sequences inserted by the various hyphen edits. The CHAR_* constants are not defined
// in this file (presumably they come from minikin/Characters.h -- confirm).
constexpr uint32_t HYPHEN_STR_ZWJ[] = {CHAR_ZWJ};
constexpr uint32_t HYPHEN_STR_HYPHEN[] = {CHAR_HYPHEN};
constexpr uint32_t HYPHEN_STR_ARMENIAN_HYPHEN[] = {CHAR_ARMENIAN_HYPHEN};
constexpr uint32_t HYPHEN_STR_MAQAF[] = {CHAR_MAQAF};
constexpr uint32_t HYPHEN_STR_UCAS_HYPHEN[] = {CHAR_UCAS_HYPHEN};
// Two code points: the ZWJ comes first, then the hyphen.
constexpr uint32_t HYPHEN_STR_ZWJ_AND_HYPHEN[] = {CHAR_ZWJ, CHAR_HYPHEN};
// Result for edits that add no text: a null pointer with length zero.
constexpr std::pair<const uint32_t*, size_t> EMPTY_HYPHEN_STR(nullptr, 0);
// Pairs one of the HYPHEN_STR_* arrays with its element count. Helper for the getHyphenString
// overloads that follow; it is #undef'd right after them so it does not leak out of this header.
#define MAKE_HYPHEN_STR(chars) std::make_pair((chars), ARRAY_SIZE(chars))

// Returns the (pointer, length) pair of code points that the given start-of-line edit inserts.
// Edits that insert nothing, including NO_EDIT and any unrecognized value, yield
// EMPTY_HYPHEN_STR.
inline std::pair<const uint32_t*, size_t> getHyphenString(StartHyphenEdit hyph) {
    switch (hyph) {
        case StartHyphenEdit::INSERT_HYPHEN:
            return MAKE_HYPHEN_STR(HYPHEN_STR_HYPHEN);
        case StartHyphenEdit::INSERT_ZWJ:
            return MAKE_HYPHEN_STR(HYPHEN_STR_ZWJ);
        case StartHyphenEdit::NO_EDIT:
        default:
            return EMPTY_HYPHEN_STR;
    }
}

// Returns the (pointer, length) pair of code points that the given end-of-line edit writes.
// NO_EDIT and any unrecognized value yield EMPTY_HYPHEN_STR.
inline std::pair<const uint32_t*, size_t> getHyphenString(EndHyphenEdit hyph) {
    // A replacement writes the same text as a plain hyphen insertion.
    if (hyph == EndHyphenEdit::INSERT_HYPHEN || hyph == EndHyphenEdit::REPLACE_WITH_HYPHEN) {
        return MAKE_HYPHEN_STR(HYPHEN_STR_HYPHEN);
    }
    if (hyph == EndHyphenEdit::INSERT_ARMENIAN_HYPHEN) {
        return MAKE_HYPHEN_STR(HYPHEN_STR_ARMENIAN_HYPHEN);
    }
    if (hyph == EndHyphenEdit::INSERT_MAQAF) {
        return MAKE_HYPHEN_STR(HYPHEN_STR_MAQAF);
    }
    if (hyph == EndHyphenEdit::INSERT_UCAS_HYPHEN) {
        return MAKE_HYPHEN_STR(HYPHEN_STR_UCAS_HYPHEN);
    }
    if (hyph == EndHyphenEdit::INSERT_ZWJ_AND_HYPHEN) {
        return MAKE_HYPHEN_STR(HYPHEN_STR_ZWJ_AND_HYPHEN);
    }
    return EMPTY_HYPHEN_STR;
}
#undef MAKE_HYPHEN_STR

// Conversions from a HyphenationType to the two halves of a hyphen edit. Only the declarations
// are visible in this header.
// NOTE(review): presumably editForThisLine gives the edit for the end of the line being broken
// and editForNextLine the edit for the start of the following line -- confirm against the
// definitions.
EndHyphenEdit editForThisLine(HyphenationType type);
StartHyphenEdit editForNextLine(HyphenationType type);

// hyb file header; implementation details are in the .cpp file
struct Header;

class Hyphenator {
public:
    // Compute the hyphenation of a word, storing the hyphenation in result vector. Each entry in
    // the vector is a "hyphenation type" for a potential hyphenation that can be applied at the
    // corresponding code unit offset in the word.
    //
    // out must have at least the length of the word capacity.
    //
    // Example: word is "hyphen", result is the following, corresponding to "hy-phen":
    // [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK, DONT_BREAK, DONT_BREAK]
    void hyphenate(const U16StringPiece& word, HyphenationType* out) const;

    // Compute the hyphenation of a word.
    //
    // out will be resized to word length.
    void hyphenate(const U16StringPiece& word, std::vector<HyphenationType>* out) const {
        out->resize(word.size());
        return hyphenate(word, out->data());
    }

    // Returns true if the codepoint is like U+2010 HYPHEN in line breaking and usage: a character
    // immediately after which line breaks are allowed, but words containing it should not be
    // automatically hyphenated.
    static bool isLineBreakingHyphen(uint32_t cp);

    // pattern data is in binary format, as described in doc/hyb_file_format.md. Note:
    // the caller is responsible for ensuring that the lifetime of the pattern data is
    // at least as long as the Hyphenator object.

    // This class doesn't copy or take ownership of patternData. Caller must keep the data valid
    // until this instance is deleted.
    // Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens.
    static Hyphenator* loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
                                  const std::string& locale);

private:
    enum class HyphenationLocale : uint8_t {
        OTHER = 0,
        CATALAN = 1,
        POLISH = 2,
        SLOVENIAN = 3,
    };

    // Use Hyphenator::loadBinary instead.
    Hyphenator(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
               HyphenationLocale hyphenLocale);

    // apply various hyphenation rules including hard and soft hyphens, ignoring patterns
    void hyphenateWithNoPatterns(const U16StringPiece& word, HyphenationType* out) const;

    // Try looking up word in alphabet table, return DONT_BREAK if any code units fail to map.
    // Otherwise, returns BREAK_AND_INSERT_HYPHEN, BREAK_AND_INSERT_ARMENIAN_HYPHEN, or
    // BREAK_AND_DONT_INSERT_HYPHEN based on the the script of the characters seen.
    // Note that this method writes len+2 entries into alpha_codes (including start and stop)
    HyphenationType alphabetLookup(uint16_t* alpha_codes, const U16StringPiece& word) const;

    // calculate hyphenation from patterns, assuming alphabet lookup has already been done
    void hyphenateFromCodes(const uint16_t* codes, size_t len, HyphenationType hyphenValue,
                            HyphenationType* out) const;

    // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so
    // that temporary buffers can be stack-allocated without waste, which is a slightly
    // different use case. It measures UTF-16 code units.
    static const size_t MAX_HYPHENATED_SIZE = 64;

    const uint8_t* mPatternData;
    const size_t mMinPrefix, mMinSuffix;
    const HyphenationLocale mHyphenationLocale;

    // accessors for binary data
    const Header* getHeader() const { return reinterpret_cast<const Header*>(mPatternData); }
};

}  // namespace minikin

#endif  // MINIKIN_HYPHENATOR_H