diff options
Diffstat (limited to 'src/include/fst/icu.h')
-rw-r--r-- | src/include/fst/icu.h | 135 |
1 files changed, 74 insertions, 61 deletions
diff --git a/src/include/fst/icu.h b/src/include/fst/icu.h index 6b74c2e..3947716 100644 --- a/src/include/fst/icu.h +++ b/src/include/fst/icu.h @@ -13,88 +13,101 @@ // limitations under the License. // // Copyright 2005-2010 Google, Inc. -// Author: roubert@google.com (Fredrik Roubert) - -// Wrapper class for UErrorCode, with conversion operators for direct use in -// ICU C and C++ APIs. -// -// Features: -// - The constructor initializes the internal UErrorCode to U_ZERO_ERROR, -// removing one common source of errors. -// - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking -// UErrorCode& (reference), via conversion operators. -// - Automatic checking for success when it goes out of scope. On failure, -// the destructor will FSTERROR() an error message. -// -// Most of ICU will handle errors gracefully and provide sensible fallbacks. -// Using IcuErrorCode, it is therefore possible to write very compact code -// that does sensible things on failure and provides logging for debugging. +// Author: sorenj@google.com (Jeffrey Sorensen) +// roubert@google.com (Fredrik Roubert) // -// Example: -// -// IcuErrorCode icuerrorcode; -// return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL; +// This library implements an unrestricted Thompson/Pike UTF-8 parser and +// serializer. UTF-8 is a restricted subset of this byte stream encoding. See +// http://en.wikipedia.org/wiki/UTF-8 for a good description of the encoding +// details. #ifndef FST_LIB_ICU_H_ #define FST_LIB_ICU_H_ -#include <unicode/errorcode.h> -#include <unicode/unistr.h> -#include <unicode/ustring.h> -#include <unicode/utf8.h> - -class IcuErrorCode : public icu::ErrorCode { - public: - IcuErrorCode() {} - virtual ~IcuErrorCode() { if (isFailure()) handleFailure(); } - - // Redefine 'errorName()' in order to be compatible with ICU version 4.2 - const char* errorName() const { - return u_errorName(errorCode); - } - - protected: - virtual void handleFailure() const { - FSTERROR() << errorName(); -} - - private: - DISALLOW_COPY_AND_ASSIGN(IcuErrorCode); -}; +#include <iostream> +#include <fstream> +#include <sstream> namespace fst { template <class Label> bool UTF8StringToLabels(const string &str, vector<Label> *labels) { - const char *c_str = str.c_str(); - int32_t length = str.size(); - UChar32 c; - for (int32_t i = 0; i < length; /* no update */) { - U8_NEXT(c_str, i, length, c); - if (c < 0) { - LOG(ERROR) << "UTF8StringToLabels: Invalid character found: " << c; - return false; + const char *data = str.data(); + size_t length = str.size(); + for (int i = 0; i < length; /* no update */) { + int c = data[i++] & 0xff; + if ((c & 0x80) == 0) { + labels->push_back(c); + } else { + if ((c & 0xc0) == 0x80) { + LOG(ERROR) << "UTF8StringToLabels: continuation byte as lead byte"; + return false; + } + int count = (c >= 0xc0) + (c >= 0xe0) + (c >= 0xf0) + (c >= 0xf8) + + (c >= 0xfc); + int code = c & ((1 << (6 - count)) - 1); + while (count != 0) { + if (i == length) { + LOG(ERROR) << "UTF8StringToLabels: truncated utf-8 byte sequence"; + return false; + } + char cb = data[i++]; + if ((cb & 0xc0) != 0x80) { + LOG(ERROR) << "UTF8StringToLabels: missing/invalid continuation byte"; + return false; + } + code = (code << 6) | (cb & 0x3f); + count--; + } + if (code < 0) { + // This should not be able to happen. + LOG(ERROR) << "UTF8StringToLabels: Invalid character found: " << c; + return false; + } + labels->push_back(code); } - labels->push_back(c); } return true; } template <class Label> bool LabelsToUTF8String(const vector<Label> &labels, string *str) { - icu::UnicodeString u_str; - char c_str[5]; + ostringstream ostr; for (size_t i = 0; i < labels.size(); ++i) { - u_str.setTo(labels[i]); - IcuErrorCode error; - u_strToUTF8(c_str, 5, NULL, u_str.getTerminatedBuffer(), -1, error); - if (error.isFailure()) { - LOG(ERROR) << "LabelsToUTF8String: Bad encoding: " - << error.errorName(); + int32_t code = labels[i]; + if (code < 0) { + LOG(ERROR) << "LabelsToUTF8String: Invalid character found: " << code; return false; + } else if (code < 0x80) { + ostr << static_cast<char>(code); + } else if (code < 0x800) { + ostr << static_cast<char>((code >> 6) | 0xc0); + ostr << static_cast<char>((code & 0x3f) | 0x80); + } else if (code < 0x10000) { + ostr << static_cast<char>((code >> 12) | 0xe0); + ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80); + ostr << static_cast<char>((code & 0x3f) | 0x80); + } else if (code < 0x200000) { + ostr << static_cast<char>((code >> 18) | 0xf0); + ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80); + ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80); + ostr << static_cast<char>((code & 0x3f) | 0x80); + } else if (code < 0x4000000) { + ostr << static_cast<char>((code >> 24) | 0xf8); + ostr << static_cast<char>(((code >> 18) & 0x3f) | 0x80); + ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80); + ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80); + ostr << static_cast<char>((code & 0x3f) | 0x80); + } else { + ostr << static_cast<char>((code >> 30) | 0xfc); + ostr << static_cast<char>(((code >> 24) & 0x3f) | 0x80); + ostr << static_cast<char>(((code >> 18) & 0x3f) | 0x80); + ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80); + ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80); + ostr << static_cast<char>((code & 0x3f) | 0x80); } - *str += c_str; } + *str = ostr.str(); return true; } |