aboutsummaryrefslogtreecommitdiff
path: root/src/include/fst/icu.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/include/fst/icu.h')
-rw-r--r--src/include/fst/icu.h135
1 files changed, 74 insertions, 61 deletions
diff --git a/src/include/fst/icu.h b/src/include/fst/icu.h
index 6b74c2e..3947716 100644
--- a/src/include/fst/icu.h
+++ b/src/include/fst/icu.h
@@ -13,88 +13,101 @@
// limitations under the License.
//
// Copyright 2005-2010 Google, Inc.
-// Author: roubert@google.com (Fredrik Roubert)
-
-// Wrapper class for UErrorCode, with conversion operators for direct use in
-// ICU C and C++ APIs.
-//
-// Features:
-// - The constructor initializes the internal UErrorCode to U_ZERO_ERROR,
-// removing one common source of errors.
-// - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking
-// UErrorCode& (reference), via conversion operators.
-// - Automatic checking for success when it goes out of scope. On failure,
-// the destructor will FSTERROR() an error message.
-//
-// Most of ICU will handle errors gracefully and provide sensible fallbacks.
-// Using IcuErrorCode, it is therefore possible to write very compact code
-// that does sensible things on failure and provides logging for debugging.
+// Author: sorenj@google.com (Jeffrey Sorensen)
+// roubert@google.com (Fredrik Roubert)
//
-// Example:
-//
-// IcuErrorCode icuerrorcode;
-// return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL;
+// This library implements an unrestricted Thompson/Pike UTF-8 parser and
+// serializer. UTF-8 is a restricted subset of this byte stream encoding. See
+// http://en.wikipedia.org/wiki/UTF-8 for a good description of the encoding
+// details.
#ifndef FST_LIB_ICU_H_
#define FST_LIB_ICU_H_
-#include <unicode/errorcode.h>
-#include <unicode/unistr.h>
-#include <unicode/ustring.h>
-#include <unicode/utf8.h>
-
-class IcuErrorCode : public icu::ErrorCode {
- public:
- IcuErrorCode() {}
- virtual ~IcuErrorCode() { if (isFailure()) handleFailure(); }
-
- // Redefine 'errorName()' in order to be compatible with ICU version 4.2
- const char* errorName() const {
- return u_errorName(errorCode);
- }
-
- protected:
- virtual void handleFailure() const {
- FSTERROR() << errorName();
-}
-
- private:
- DISALLOW_COPY_AND_ASSIGN(IcuErrorCode);
-};
+#include <iostream>
+#include <fstream>
+#include <sstream>
namespace fst {
template <class Label>
bool UTF8StringToLabels(const string &str, vector<Label> *labels) {
- const char *c_str = str.c_str();
- int32_t length = str.size();
- UChar32 c;
- for (int32_t i = 0; i < length; /* no update */) {
- U8_NEXT(c_str, i, length, c);
- if (c < 0) {
- LOG(ERROR) << "UTF8StringToLabels: Invalid character found: " << c;
- return false;
+ const char *data = str.data();
+ size_t length = str.size();
+ for (int i = 0; i < length; /* no update */) {
+ int c = data[i++] & 0xff;
+ if ((c & 0x80) == 0) {
+ labels->push_back(c);
+ } else {
+ if ((c & 0xc0) == 0x80) {
+ LOG(ERROR) << "UTF8StringToLabels: continuation byte as lead byte";
+ return false;
+ }
+ int count = (c >= 0xc0) + (c >= 0xe0) + (c >= 0xf0) + (c >= 0xf8) +
+ (c >= 0xfc);
+ int code = c & ((1 << (6 - count)) - 1);
+ while (count != 0) {
+ if (i == length) {
+ LOG(ERROR) << "UTF8StringToLabels: truncated utf-8 byte sequence";
+ return false;
+ }
+ char cb = data[i++];
+ if ((cb & 0xc0) != 0x80) {
+ LOG(ERROR) << "UTF8StringToLabels: missing/invalid continuation byte";
+ return false;
+ }
+ code = (code << 6) | (cb & 0x3f);
+ count--;
+ }
+ if (code < 0) {
+ // This should not be able to happen.
+ LOG(ERROR) << "UTF8StringToLabels: Invalid character found: " << c;
+ return false;
+ }
+ labels->push_back(code);
}
- labels->push_back(c);
}
return true;
}
template <class Label>
bool LabelsToUTF8String(const vector<Label> &labels, string *str) {
- icu::UnicodeString u_str;
- char c_str[5];
+ ostringstream ostr;
for (size_t i = 0; i < labels.size(); ++i) {
- u_str.setTo(labels[i]);
- IcuErrorCode error;
- u_strToUTF8(c_str, 5, NULL, u_str.getTerminatedBuffer(), -1, error);
- if (error.isFailure()) {
- LOG(ERROR) << "LabelsToUTF8String: Bad encoding: "
- << error.errorName();
+ int32_t code = labels[i];
+ if (code < 0) {
+ LOG(ERROR) << "LabelsToUTF8String: Invalid character found: " << code;
return false;
+ } else if (code < 0x80) {
+ ostr << static_cast<char>(code);
+ } else if (code < 0x800) {
+ ostr << static_cast<char>((code >> 6) | 0xc0);
+ ostr << static_cast<char>((code & 0x3f) | 0x80);
+ } else if (code < 0x10000) {
+ ostr << static_cast<char>((code >> 12) | 0xe0);
+ ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80);
+ ostr << static_cast<char>((code & 0x3f) | 0x80);
+ } else if (code < 0x200000) {
+ ostr << static_cast<char>((code >> 18) | 0xf0);
+ ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80);
+ ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80);
+ ostr << static_cast<char>((code & 0x3f) | 0x80);
+ } else if (code < 0x4000000) {
+ ostr << static_cast<char>((code >> 24) | 0xf8);
+ ostr << static_cast<char>(((code >> 18) & 0x3f) | 0x80);
+ ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80);
+ ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80);
+ ostr << static_cast<char>((code & 0x3f) | 0x80);
+ } else {
+ ostr << static_cast<char>((code >> 30) | 0xfc);
+ ostr << static_cast<char>(((code >> 24) & 0x3f) | 0x80);
+ ostr << static_cast<char>(((code >> 18) & 0x3f) | 0x80);
+ ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80);
+ ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80);
+ ostr << static_cast<char>((code & 0x3f) | 0x80);
}
- *str += c_str;
}
+ *str = ostr.str();
return true;
}