author    Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2022-06-15 21:39:32 +0000
committer Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2022-06-15 21:39:32 +0000
commit    854cabe58fe83993ab608b428c6a97c5565dcb0c (patch)
tree      c0a00b9b4d52ff3dfeb50f5d894bad2d71389b00
parent    9c6c6103b62bb8941c2bd711f0e6cb47b6f10b2e (diff)
parent    98f9e8aacdf9898e4ff093385365a233d25bf24f (diff)
download  icing-aml_tz3_314012010.tar.gz
Change-Id: I3aff676785fcc7c7da09269c5fb50e4461fbdea1
-rw-r--r--  Android.bp | 7
-rw-r--r--  CMakeLists.txt | 6
-rw-r--r--  OWNERS | 3
-rw-r--r--  TEST_MAPPING | 7
-rw-r--r--  build.gradle | 11
-rw-r--r--  icing/file/file-backed-bitmap.cc | 10
-rw-r--r--  icing/file/file-backed-proto-log.h | 358
-rw-r--r--  icing/file/file-backed-proto-log_benchmark.cc | 251
-rw-r--r--  icing/file/file-backed-proto-log_test.cc | 573
-rw-r--r--  icing/file/file-backed-proto.h | 13
-rw-r--r--  icing/file/file-backed-vector.h | 8
-rw-r--r--  icing/file/file-backed-vector_test.cc | 78
-rw-r--r--  icing/file/filesystem.cc | 2
-rw-r--r--  icing/file/filesystem.h | 11
-rw-r--r--  icing/file/memory-mapped-file.cc | 18
-rw-r--r--  icing/file/portable-file-backed-proto-log.h | 230
-rw-r--r--  icing/file/portable-file-backed-proto-log_benchmark.cc | 100
-rw-r--r--  icing/file/portable-file-backed-proto-log_test.cc | 5
-rw-r--r--  icing/helpers/icu/icu-data-file-helper.cc (renamed from icing/testing/icu-data-file-helper.cc) | 2
-rw-r--r--  icing/helpers/icu/icu-data-file-helper.h (renamed from icing/testing/icu-data-file-helper.h) | 6
-rw-r--r--  icing/icing-search-engine-with-icu-file_test.cc | 10
-rw-r--r--  icing/icing-search-engine.cc | 417
-rw-r--r--  icing/icing-search-engine.h | 44
-rw-r--r--  icing/icing-search-engine_benchmark.cc | 61
-rw-r--r--  icing/icing-search-engine_fuzz_test.cc | 12
-rw-r--r--  icing/icing-search-engine_test.cc | 1391
-rw-r--r--  icing/index/index-processor.cc | 82
-rw-r--r--  icing/index/index-processor.h | 31
-rw-r--r--  icing/index/index-processor_benchmark.cc | 34
-rw-r--r--  icing/index/index-processor_test.cc | 294
-rw-r--r--  icing/index/index.cc | 115
-rw-r--r--  icing/index/index.h | 28
-rw-r--r--  icing/index/index_test.cc | 405
-rw-r--r--  icing/index/iterator/doc-hit-info-iterator-and.cc | 4
-rw-r--r--  icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc | 10
-rw-r--r--  icing/index/lite/doc-hit-info-iterator-term-lite.cc | 14
-rw-r--r--  icing/index/lite/doc-hit-info-iterator-term-lite.h | 5
-rw-r--r--  icing/index/lite/lite-index.cc | 52
-rw-r--r--  icing/index/lite/lite-index.h | 12
-rw-r--r--  icing/index/lite/lite-index_test.cc | 110
-rw-r--r--  icing/index/main/doc-hit-info-iterator-term-main.cc | 5
-rw-r--r--  icing/index/main/flash-index-storage.cc | 4
-rw-r--r--  icing/index/main/flash-index-storage.h | 1
-rw-r--r--  icing/index/main/flash-index-storage_test.cc | 2
-rw-r--r--  icing/index/main/index-block.cc | 3
-rw-r--r--  icing/index/main/index-block.h | 2
-rw-r--r--  icing/index/main/main-index.cc | 96
-rw-r--r--  icing/index/main/main-index.h | 18
-rw-r--r--  icing/index/main/main-index_test.cc | 28
-rw-r--r--  icing/index/main/posting-list-free.h | 4
-rw-r--r--  icing/index/main/posting-list-used.h | 2
-rw-r--r--  icing/jni/icing-search-engine-jni.cc | 37
-rw-r--r--  icing/legacy/core/icing-core-types.h | 3
-rw-r--r--  icing/legacy/core/icing-string-util.cc | 9
-rw-r--r--  icing/legacy/core/icing-string-util.h | 5
-rw-r--r--  icing/legacy/core/icing-timer.h | 3
-rw-r--r--  icing/legacy/index/icing-array-storage.cc | 2
-rw-r--r--  icing/legacy/index/icing-array-storage.h | 3
-rw-r--r--  icing/legacy/index/icing-bit-util.h | 5
-rw-r--r--  icing/legacy/index/icing-dynamic-trie.cc | 25
-rw-r--r--  icing/legacy/index/icing-dynamic-trie.h | 12
-rw-r--r--  icing/legacy/index/icing-filesystem.cc | 2
-rw-r--r--  icing/legacy/index/icing-filesystem.h | 5
-rw-r--r--  icing/legacy/index/icing-flash-bitmap.h | 4
-rw-r--r--  icing/legacy/index/icing-mmapper.cc | 5
-rw-r--r--  icing/legacy/index/icing-mock-filesystem.h | 9
-rw-r--r--  icing/legacy/index/icing-storage-file.cc | 2
-rw-r--r--  icing/portable/endian.h | 42
-rw-r--r--  icing/portable/gzip_stream.cc | 313
-rw-r--r--  icing/portable/gzip_stream.h | 181
-rw-r--r--  icing/query/query-processor.cc | 2
-rw-r--r--  icing/query/query-processor_benchmark.cc | 2
-rw-r--r--  icing/query/query-processor_test.cc | 87
-rw-r--r--  icing/query/suggestion-processor.cc | 96
-rw-r--r--  icing/query/suggestion-processor.h | 68
-rw-r--r--  icing/query/suggestion-processor_test.cc | 326
-rw-r--r--  icing/result/result-retriever_test.cc | 20
-rw-r--r--  icing/result/result-state-manager_test.cc | 4
-rw-r--r--  icing/result/result-state_test.cc | 4
-rw-r--r--  icing/result/snippet-retriever.cc | 151
-rw-r--r--  icing/result/snippet-retriever_test.cc | 203
-rw-r--r--  icing/schema/schema-store.cc | 192
-rw-r--r--  icing/schema/schema-store.h | 94
-rw-r--r--  icing/schema/schema-store_test.cc | 522
-rw-r--r--  icing/schema/schema-util.cc | 90
-rw-r--r--  icing/schema/schema-util.h | 30
-rw-r--r--  icing/schema/schema-util_test.cc | 65
-rw-r--r--  icing/schema/section.h | 5
-rw-r--r--  icing/scoring/bm25f-calculator.cc | 51
-rw-r--r--  icing/scoring/bm25f-calculator.h | 32
-rw-r--r--  icing/scoring/ranker.cc | 98
-rw-r--r--  icing/scoring/ranker.h | 13
-rw-r--r--  icing/scoring/score-and-rank_benchmark.cc | 125
-rw-r--r--  icing/scoring/scorer.cc | 19
-rw-r--r--  icing/scoring/scorer.h | 4
-rw-r--r--  icing/scoring/scorer_test.cc | 199
-rw-r--r--  icing/scoring/scoring-processor.cc | 9
-rw-r--r--  icing/scoring/scoring-processor.h | 4
-rw-r--r--  icing/scoring/scoring-processor_test.cc | 460
-rw-r--r--  icing/scoring/section-weights.cc | 151
-rw-r--r--  icing/scoring/section-weights.h | 95
-rw-r--r--  icing/scoring/section-weights_test.cc | 443
-rw-r--r--  icing/store/document-log-creator.cc | 19
-rw-r--r--  icing/store/document-log-creator.h | 12
-rw-r--r--  icing/store/document-store.cc | 152
-rw-r--r--  icing/store/document-store.h | 41
-rw-r--r--  icing/store/document-store_benchmark.cc | 79
-rw-r--r--  icing/store/document-store_test.cc | 242
-rw-r--r--  icing/store/namespace-checker-impl.h | 51
-rw-r--r--  icing/store/namespace-checker.h | 42
-rw-r--r--  icing/testing/always-true-namespace-checker-impl.h | 34
-rw-r--r--  icing/testing/common-matchers.h | 69
-rw-r--r--  icing/testing/random-string.cc | 54
-rw-r--r--  icing/testing/random-string.h | 5
-rw-r--r--  icing/testing/random-string_test.cc | 54
-rw-r--r--  icing/testing/snippet-helpers.cc | 10
-rw-r--r--  icing/testing/snippet-helpers.h | 4
-rw-r--r--  icing/tokenization/combined-tokenizer_test.cc | 232
-rw-r--r--  icing/tokenization/icu/icu-language-segmenter.cc | 38
-rw-r--r--  icing/tokenization/icu/icu-language-segmenter_test.cc | 59
-rw-r--r--  icing/tokenization/language-segmenter-iterator_test.cc | 2
-rw-r--r--  icing/tokenization/language-segmenter_benchmark.cc | 2
-rw-r--r--  icing/tokenization/plain-tokenizer.cc | 16
-rw-r--r--  icing/tokenization/plain-tokenizer_test.cc | 272
-rw-r--r--  icing/tokenization/raw-query-tokenizer.cc | 109
-rw-r--r--  icing/tokenization/raw-query-tokenizer_test.cc | 581
-rw-r--r--  icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc | 2
-rw-r--r--  icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc | 32
-rw-r--r--  icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc | 60
-rw-r--r--  icing/tokenization/token.h | 5
-rw-r--r--  icing/tokenization/tokenizer-factory.cc | 3
-rw-r--r--  icing/tokenization/tokenizer.h | 20
-rw-r--r--  icing/tokenization/verbatim-tokenizer.cc | 139
-rw-r--r--  icing/tokenization/verbatim-tokenizer.h | 41
-rw-r--r--  icing/tokenization/verbatim-tokenizer_test.cc | 209
-rw-r--r--  icing/transform/icu/icu-normalizer.cc | 121
-rw-r--r--  icing/transform/icu/icu-normalizer.h | 18
-rw-r--r--  icing/transform/icu/icu-normalizer_benchmark.cc | 120
-rw-r--r--  icing/transform/icu/icu-normalizer_test.cc | 100
-rw-r--r--  icing/transform/map/map-normalizer.cc | 112
-rw-r--r--  icing/transform/map/map-normalizer.h | 12
-rw-r--r--  icing/transform/map/map-normalizer_benchmark.cc | 98
-rw-r--r--  icing/transform/map/map-normalizer_test.cc | 99
-rw-r--r--  icing/transform/map/normalization-map.cc | 26
-rw-r--r--  icing/transform/map/normalization-map.h | 2
-rw-r--r--  icing/transform/normalizer.h | 12
-rw-r--r--  icing/transform/simple/none-normalizer-factory.cc | 53
-rw-r--r--  icing/transform/simple/none-normalizer.h | 51
-rw-r--r--  icing/transform/simple/none-normalizer_test.cc | 74
-rw-r--r--  icing/util/character-iterator.cc | 118
-rw-r--r--  icing/util/character-iterator.h | 12
-rw-r--r--  icing/util/character-iterator_test.cc | 266
-rw-r--r--  icing/util/document-validator_test.cc | 21
-rw-r--r--  icing/util/i18n-utils.cc | 2
-rw-r--r--  icing/util/i18n-utils.h | 3
-rw-r--r--  java/Android.bp | 1
-rw-r--r--  java/src/com/google/android/icing/IcingSearchEngine.java | 25
-rw-r--r--  java/tests/instrumentation/src/androidx/appsearch/smoketest/AppSearchSmokeTest.java (renamed from java/tests/instrumentation/src/androidx/appsearch/smoketest/AndroidXSmokeTest.java) | 22
-rw-r--r--  java/tests/instrumentation/src/androidx/appsearch/smoketest/TestDocument.java | 25
-rw-r--r--  java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java | 66
-rw-r--r--  proto/Android.bp | 1
-rw-r--r--  proto/icing/proto/debug.proto | 127
-rw-r--r--  proto/icing/proto/document.proto | 19
-rw-r--r--  proto/icing/proto/initialize.proto | 15
-rw-r--r--  proto/icing/proto/logging.proto | 53
-rw-r--r--  proto/icing/proto/schema.proto | 25
-rw-r--r--  proto/icing/proto/scoring.proto | 40
-rw-r--r--  proto/icing/proto/search.proto | 101
-rw-r--r--  synced_AOSP_CL_number.txt | 2
169 files changed, 3670 insertions, 10188 deletions
diff --git a/Android.bp b/Android.bp
index 909e3ed..dda6614 100644
--- a/Android.bp
+++ b/Android.bp
@@ -82,13 +82,14 @@ cc_library_shared {
"libutf",
],
shared_libs: [
- "libicu",
+ "libandroidicu",
"liblog",
- "libprotobuf-cpp-lite",
+ // TODO(b/147509515): We only need the full version for GzipStream. If we can remove
+ // that dependency, then we can just use libprotobuf-cpp-lite
+ "libprotobuf-cpp-full",
"libz",
],
version_script: "icing/jni.lds",
- min_sdk_version: "Tiramisu",
}
// TODO(cassiewang): Add build rules and a TEST_MAPPING for cc_tests
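
The swap from libprotobuf-cpp-lite to libprotobuf-cpp-full above exists only because this change routes log compression through protobuf's GzipOutputStream/GzipInputStream, which are not part of the lite runtime (the TODO in the hunk notes the same). A minimal standalone sketch of that compression path, mirroring the pattern WriteProto uses further down; CompressProto is a hypothetical helper for illustration, not code from this change:

#include <string>
#include <google/protobuf/io/gzip_stream.h>
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
#include <google/protobuf/message_lite.h>

// Serialize `proto` through a ZLIB-format gzip stream into `out`. Close()
// must be part of the success check: it flushes the final compressed block.
bool CompressProto(const google::protobuf::MessageLite& proto,
                   std::string* out) {
  google::protobuf::io::StringOutputStream proto_stream(out);
  google::protobuf::io::GzipOutputStream::Options options;
  options.format = google::protobuf::io::GzipOutputStream::ZLIB;
  options.compression_level = 3;  // kDeflateCompressionLevel in the log below
  google::protobuf::io::GzipOutputStream compressing_stream(&proto_stream,
                                                            options);
  return proto.SerializeToZeroCopyStream(&compressing_stream) &&
         compressing_stream.Close();
}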
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8c8e439..01ee8eb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,8 +14,6 @@
cmake_minimum_required(VERSION 3.10.2)
-project(icing)
-
add_definitions("-DICING_REVERSE_JNI_SEGMENTATION=1")
set(VERSION_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/icing/jni.lds")
set(CMAKE_SHARED_LINKER_FLAGS
@@ -76,7 +74,7 @@ foreach(FILE ${Icing_PROTO_FILES})
"${Icing_PROTO_GEN_DIR}/${FILE_NOEXT}.pb.h"
COMMAND ${Protobuf_PROTOC_PATH}
--proto_path "${CMAKE_CURRENT_SOURCE_DIR}/proto"
- --cpp_out "lite:${Icing_PROTO_GEN_DIR}"
+ --cpp_out ${Icing_PROTO_GEN_DIR}
${FILE}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/proto/${FILE}
@@ -129,4 +127,4 @@ target_include_directories(icing PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_include_directories(icing PRIVATE ${Icing_PROTO_GEN_DIR})
target_include_directories(icing PRIVATE "${Protobuf_SOURCE_DIR}/src")
target_include_directories(icing PRIVATE "${ICU_SOURCE_DIR}/include")
-target_link_libraries(icing protobuf::libprotobuf-lite libandroidicu log z)
+target_link_libraries(icing protobuf::libprotobuf libandroidicu log)
diff --git a/OWNERS b/OWNERS
deleted file mode 100644
index 6ec1a95..0000000
--- a/OWNERS
+++ /dev/null
@@ -1,3 +0,0 @@
-adorokhine@google.com
-tjbarron@google.com
-dsaadati@google.com
diff --git a/TEST_MAPPING b/TEST_MAPPING
index baef43b..37cb5fc 100644
--- a/TEST_MAPPING
+++ b/TEST_MAPPING
@@ -4,14 +4,9 @@
"name": "IcingSearchEngineTest"
}
],
- "hwasan-postsubmit": [
- {
- "name": "IcingSearchEngineTest"
- }
- ],
"imports": [
{
- "path": "packages/modules/AppSearch"
+ "path": "frameworks/base/apex/appsearch/service/java/com/android/server/appsearch"
}
]
}
diff --git a/build.gradle b/build.gradle
index 5b5f3a6..882a929 100644
--- a/build.gradle
+++ b/build.gradle
@@ -15,6 +15,7 @@
*/
import static androidx.build.SupportConfig.*
+import static androidx.build.dependencies.DependenciesKt.*
buildscript {
dependencies {
@@ -56,14 +57,14 @@ dependencies {
implementation('com.google.protobuf:protobuf-javalite:3.10.0')
- androidTestImplementation(libs.testCore)
- androidTestImplementation(libs.testRules)
- androidTestImplementation(libs.truth)
+ androidTestImplementation(ANDROIDX_TEST_CORE)
+ androidTestImplementation(ANDROIDX_TEST_RULES)
+ androidTestImplementation(TRUTH)
}
protobuf {
protoc {
- artifact = libs.protobufCompiler.get()
+ artifact = 'com.google.protobuf:protoc:3.10.0'
}
generateProtoTasks {
@@ -92,7 +93,7 @@ android.libraryVariants.all { variant ->
// only renames the java classes. Remove them here since they are unused.
// Expand the jar and remove any .proto files.
from(zipTree(configurations.detachedConfiguration(
- dependencies.create(libs.protobufLite.get())).getSingleFile())) {
+ dependencies.create(PROTOBUF_LITE)).getSingleFile())) {
exclude("**/*.proto")
}
diff --git a/icing/file/file-backed-bitmap.cc b/icing/file/file-backed-bitmap.cc
index eec7668..f1e568c 100644
--- a/icing/file/file-backed-bitmap.cc
+++ b/icing/file/file-backed-bitmap.cc
@@ -50,7 +50,7 @@ FileBackedBitmap::Create(const Filesystem* filesystem,
auto bitmap = std::unique_ptr<FileBackedBitmap>(
new FileBackedBitmap(filesystem, file_path, mmap_strategy));
- // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = bitmap->Initialize();
if (!status.ok()) {
@@ -122,7 +122,7 @@ libtextclassifier3::Status FileBackedBitmap::FileBackedBitmap::Initialize() {
<< " of size: " << file_size;
}
- // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = mmapper_->Remap(0, file_size);
if (!status.ok()) {
@@ -198,7 +198,7 @@ int FileBackedBitmap::NumBits() const {
libtextclassifier3::Status FileBackedBitmap::Set(int bit_index,
bool bit_value) {
if (bit_index >= NumBits()) {
- // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = GrowTo(bit_index);
if (!status.ok()) {
@@ -261,7 +261,7 @@ libtextclassifier3::Status FileBackedBitmap::GrowTo(int new_num_bits) {
file_path_.c_str(), new_file_size));
}
- // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size);
if (!status.ok()) {
@@ -281,7 +281,7 @@ libtextclassifier3::Status FileBackedBitmap::TruncateTo(int new_num_bits) {
}
const size_t new_file_size = FileSizeForBits(new_num_bits);
- // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size);
if (!status.ok()) {
diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h
index 686b4fb..b2b37e8 100644
--- a/icing/file/file-backed-proto-log.h
+++ b/icing/file/file-backed-proto-log.h
@@ -14,14 +14,16 @@
// File-backed log of protos with append-only writes and position based reads.
//
-// The implementation in this file is deprecated and replaced by
-// portable-file-backed-proto-log.h.
+// There should only be one instance of a FileBackedProtoLog of the same file at
+// a time; using multiple instances at the same time may lead to undefined
+// behavior.
//
-// This deprecated implementation has been made read-only for the purposes of
-// migration; writing and erasing this format of log is no longer supported and
-// the methods to accomplish this have been removed.
+// The entire checksum is computed on initialization to verify the contents are
+// valid. On failure, the log will be truncated to the last verified state when
+// PersistToDisk() was called. If the log cannot successfully restore the last
+// state due to disk corruption or some other inconsistency, then the entire log
+// will be lost.
//
-// The details of this format follow below:
// Each proto written to the file will have a metadata written just before it.
// The metadata consists of
// {
@@ -29,24 +31,45 @@
// 3 bytes of the proto size
// n bytes of the proto itself
// }
+//
+// Example usage:
+// ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+// FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path_,
+// options));
+// auto proto_log = create_result.proto_log;
+//
+// Document document;
+// document.set_namespace("com.google.android.example");
+// document.set_uri("www.google.com");
+//
+// int64_t document_offset = proto_log->WriteProto(document);
+// Document same_document = proto_log->ReadProto(document_offset);
+// proto_log->PersistToDisk();
+//
// TODO(b/136514769): Add versioning to the header and a UpgradeToVersion
// migration method.
+
#ifndef ICING_FILE_FILE_BACKED_PROTO_LOG_H_
#define ICING_FILE_FILE_BACKED_PROTO_LOG_H_
+#include <cstddef>
#include <cstdint>
+#include <cstring>
#include <memory>
#include <string>
#include <string_view>
+#include <utility>
+#include <vector>
+#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include <google/protobuf/io/gzip_stream.h>
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"
#include "icing/legacy/core/icing-string-util.h"
-#include "icing/portable/gzip_stream.h"
#include "icing/portable/platform.h"
#include "icing/portable/zlib.h"
#include "icing/util/crc32.h"
@@ -89,6 +112,10 @@ class FileBackedProtoLog {
// Header stored at the beginning of the file before the rest of the log
// contents. Stores metadata on the log.
+ //
+ // TODO(b/139375388): Migrate the Header struct to a proto. This makes
+ // migrations easier since we don't need to worry about differences in
+ // struct padding (which would affect the checksum) or in endianness.
struct Header {
static constexpr int32_t kMagic = 0xf4c6f67a;
@@ -168,6 +195,20 @@ class FileBackedProtoLog {
FileBackedProtoLog(const FileBackedProtoLog&) = delete;
FileBackedProtoLog& operator=(const FileBackedProtoLog&) = delete;
+ // This will update the checksum of the log as well.
+ ~FileBackedProtoLog();
+
+ // Writes the serialized proto to the underlying file. Writes are applied
+ // directly to the underlying file. Users do not need to sync the file after
+ // writing.
+ //
+ // Returns:
+ // Offset of the newly appended proto in file on success
+ // INVALID_ARGUMENT if proto is too large, as decided by
+ // Options.max_proto_size
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto);
+
// Reads out a proto located at file_offset from the file.
//
// Returns:
@@ -177,6 +218,31 @@ class FileBackedProtoLog {
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
+ // Erases the data of a proto located at file_offset from the file.
+ //
+ // Returns:
+ // OK on success
+ // OUT_OF_RANGE_ERROR if file_offset exceeds file size
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::Status EraseProto(int64_t file_offset);
+
+ // Calculates and returns the disk usage in bytes. Rounds up to the nearest
+ // block size.
+ //
+ // Returns:
+ // Disk usage on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+
+ // Returns the file size of all the elements held in the log. File size is in
+ // bytes. This excludes the size of any internal metadata of the log, e.g. the
+ // log's header.
+ //
+ // Returns:
+ // File size on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
+
// An iterator helping to find offsets of all the protos in file.
// Example usage:
//
@@ -215,6 +281,72 @@ class FileBackedProtoLog {
// behaviors could happen.
Iterator GetIterator();
+ // Persists all changes since initialization or the last call to
+ // PersistToDisk(). Any changes that aren't persisted may be lost if the
+ // system fails to close safely.
+ //
+ // Example use case:
+ //
+ // Document document;
+ // document.set_namespace("com.google.android.example");
+ // document.set_uri("www.google.com");
+ //
+ // {
+ // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+ // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
+ // options));
+ // auto proto_log = std::move(create_result.proto_log);
+ //
+ // int64_t document_offset = proto_log->WriteProto(document);
+ //
+ // // We lose the document here since it wasn't persisted.
+ // // *SYSTEM CRASH*
+ // }
+ //
+ // {
+ // // Can still successfully create after a crash since the log can
+ // // rewind/truncate to recover into a previously good state
+ // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+ // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
+ // options));
+ // auto proto_log = std::move(create_result.proto_log);
+ //
+ // // Lost the proto since we didn't PersistToDisk before the crash
+ // proto_log->ReadProto(document_offset); // INVALID_ARGUMENT error
+ //
+ // int64_t document_offset = proto_log->WriteProto(document);
+ //
+ // // Persisted this time, so we should be ok.
+ // ICING_ASSERT_OK(proto_log->PersistToDisk());
+ // }
+ //
+ // {
+ // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+ // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
+ // options));
+ // auto proto_log = std::move(create_result.proto_log);
+ //
+ // // SUCCESS
+ // Document same_document = proto_log->ReadProto(document_offset);
+ // }
+ //
+ // NOTE: Since all protos are already written to the file directly, this
+ // just updates the checksum and rewind position. Without these updates,
+ // future initializations will truncate the file and discard unpersisted
+ // changes.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::Status PersistToDisk();
+
+ // Calculates the checksum of the log contents. Excludes the header content.
+ //
+ // Returns:
+ // Crc of the log content
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
+
private:
// Object can only be instantiated via the ::Create factory.
FileBackedProtoLog(const Filesystem* filesystem, const std::string& file_path,
@@ -292,6 +424,9 @@ class FileBackedProtoLog {
static_assert(kMaxProtoSize <= 0x00FFFFFF,
"kMaxProtoSize doesn't fit in 3 bytes");
+ // Level of compression, BEST_SPEED = 1, BEST_COMPRESSION = 9
+ static constexpr int kDeflateCompressionLevel = 3;
+
// Chunks of the file to mmap at a time, so we don't mmap the entire file.
// Only used on 32-bit devices
static constexpr int kMmapChunkSize = 4 * 1024 * 1024; // 4MiB
@@ -303,6 +438,9 @@ class FileBackedProtoLog {
};
template <typename ProtoT>
+constexpr uint8_t FileBackedProtoLog<ProtoT>::kProtoMagic;
+
+template <typename ProtoT>
FileBackedProtoLog<ProtoT>::FileBackedProtoLog(const Filesystem* filesystem,
const std::string& file_path,
std::unique_ptr<Header> header)
@@ -313,6 +451,15 @@ FileBackedProtoLog<ProtoT>::FileBackedProtoLog(const Filesystem* filesystem,
}
template <typename ProtoT>
+FileBackedProtoLog<ProtoT>::~FileBackedProtoLog() {
+ if (!PersistToDisk().ok()) {
+ ICING_LOG(WARNING)
+ << "Error persisting to disk during destruction of FileBackedProtoLog: "
+ << file_path_;
+ }
+}
+
+template <typename ProtoT>
libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
FileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
const std::string& file_path,
@@ -541,6 +688,79 @@ libtextclassifier3::StatusOr<Crc32> FileBackedProtoLog<ProtoT>::ComputeChecksum(
}
template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::WriteProto(
+ const ProtoT& proto) {
+ int64_t proto_size = proto.ByteSizeLong();
+ int32_t metadata;
+ int metadata_size = sizeof(metadata);
+ int64_t current_position = filesystem_->GetCurrentPosition(fd_.get());
+
+ if (proto_size > header_->max_proto_size) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "proto_size, %lld, was too large to write. Max is %d",
+ static_cast<long long>(proto_size), header_->max_proto_size));
+ }
+
+ // At this point, we've guaranteed that proto_size is under kMaxProtoSize
+ // (see ::Create), so we can safely store it in an int.
+ int final_size = 0;
+
+ std::string proto_str;
+ google::protobuf::io::StringOutputStream proto_stream(&proto_str);
+
+ if (header_->compress) {
+ google::protobuf::io::GzipOutputStream::Options options;
+ options.format = google::protobuf::io::GzipOutputStream::ZLIB;
+ options.compression_level = kDeflateCompressionLevel;
+
+ google::protobuf::io::GzipOutputStream compressing_stream(&proto_stream,
+ options);
+
+ bool success = proto.SerializeToZeroCopyStream(&compressing_stream) &&
+ compressing_stream.Close();
+
+ if (!success) {
+ return absl_ports::InternalError("Error compressing proto.");
+ }
+
+ final_size = proto_str.size();
+
+ // In case the compressed proto is larger than the original proto, we also
+ // can't write it.
+ if (final_size > header_->max_proto_size) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Compressed proto size, %d, was greater than "
+ "max_proto_size, %d",
+ final_size, header_->max_proto_size));
+ }
+ } else {
+ // Serialize the proto directly into the write buffer at an offset of the
+ // metadata.
+ proto.SerializeToZeroCopyStream(&proto_stream);
+ final_size = proto_str.size();
+ }
+
+ // 1st byte for magic, next 3 bytes for proto size.
+ metadata = (kProtoMagic << 24) | final_size;
+
+ // Actually write the metadata; this has to be done after we know the
+ // possibly compressed proto size.
+ if (!filesystem_->Write(fd_.get(), &metadata, metadata_size)) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to write proto metadata to: ", file_path_));
+ }
+
+ // Write the serialized proto
+ if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to write proto to: ", file_path_));
+ }
+
+ return current_position;
+}
+
+template <typename ProtoT>
libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
int64_t file_offset) const {
int64_t file_size = filesystem_->GetFileSize(fd_.get());
@@ -576,7 +796,7 @@ libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
// Deserialize proto
ProtoT proto;
if (header_->compress) {
- protobuf_ports::GzipInputStream decompress_stream(&proto_stream);
+ google::protobuf::io::GzipInputStream decompress_stream(&proto_stream);
proto.ParseFromZeroCopyStream(&decompress_stream);
} else {
proto.ParseFromZeroCopyStream(&proto_stream);
@@ -586,6 +806,83 @@ libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
}
template <typename ProtoT>
+libtextclassifier3::Status FileBackedProtoLog<ProtoT>::EraseProto(
+ int64_t file_offset) {
+ int64_t file_size = filesystem_->GetFileSize(fd_.get());
+ if (file_offset >= file_size) {
+ // file_size points to the next byte to write at, so subtract one to get
+ // the inclusive, actual size of file.
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Trying to erase data at a location, %lld, "
+ "out of range of the file size, %lld",
+ static_cast<long long>(file_offset),
+ static_cast<long long>(file_size - 1)));
+ }
+
+ MemoryMappedFile mmapped_file(
+ *filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
+
+ // Read out the metadata
+ ICING_ASSIGN_OR_RETURN(
+ int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
+
+ ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata),
+ GetProtoSize(metadata)));
+
+ // We need to update the crc checksum if the erased area is before the
+ // rewind position.
+ if (file_offset + sizeof(metadata) < header_->rewind_offset) {
+ // We need to calculate [original string xor 0s].
+ // The xored string is the same as the original string because 0 xor 0 =
+ // 0, 1 xor 0 = 1.
+ const std::string_view xored_str(mmapped_file.region(),
+ mmapped_file.region_size());
+
+ Crc32 crc(header_->log_checksum);
+ ICING_ASSIGN_OR_RETURN(
+ uint32_t new_crc,
+ crc.UpdateWithXor(
+ xored_str,
+ /*full_data_size=*/header_->rewind_offset - sizeof(Header),
+ /*position=*/file_offset + sizeof(metadata) - sizeof(Header)));
+
+ header_->log_checksum = new_crc;
+ header_->header_checksum = header_->CalculateHeaderChecksum();
+
+ if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
+ sizeof(Header))) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to update header to: ", file_path_));
+ }
+ }
+
+ memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::GetDiskUsage()
+ const {
+ int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
+ if (size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError("Failed to get disk usage of proto log");
+ }
+ return size;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t>
+FileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
+ int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
+ if (total_file_size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+ "Failed to get file size of elments in the proto log");
+ }
+ return total_file_size - sizeof(Header);
+}
+
+template <typename ProtoT>
FileBackedProtoLog<ProtoT>::Iterator::Iterator(const Filesystem& filesystem,
const std::string& file_path,
int64_t initial_offset)
@@ -667,6 +964,51 @@ libtextclassifier3::StatusOr<int> FileBackedProtoLog<ProtoT>::ReadProtoMetadata(
return metadata;
}
+template <typename ProtoT>
+libtextclassifier3::Status FileBackedProtoLog<ProtoT>::PersistToDisk() {
+ int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
+ if (file_size == header_->rewind_offset) {
+ // No new protos appended, don't need to update the checksum.
+ return libtextclassifier3::Status::OK;
+ }
+
+ int64_t new_content_size = file_size - header_->rewind_offset;
+ Crc32 crc;
+ if (new_content_size < 0) {
+ // File shrunk, recalculate the entire checksum.
+ ICING_ASSIGN_OR_RETURN(
+ crc, ComputeChecksum(filesystem_, file_path_, Crc32(), sizeof(Header),
+ file_size));
+ } else {
+ // Append new changes to the existing checksum.
+ ICING_ASSIGN_OR_RETURN(
+ crc,
+ ComputeChecksum(filesystem_, file_path_, Crc32(header_->log_checksum),
+ header_->rewind_offset, file_size));
+ }
+
+ header_->log_checksum = crc.Get();
+ header_->rewind_offset = file_size;
+ header_->header_checksum = header_->CalculateHeaderChecksum();
+
+ if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
+ sizeof(Header)) ||
+ !filesystem_->DataSync(fd_.get())) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to update header to: ", file_path_));
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<Crc32>
+FileBackedProtoLog<ProtoT>::ComputeChecksum() {
+ return FileBackedProtoLog<ProtoT>::ComputeChecksum(
+ filesystem_, file_path_, Crc32(), /*start=*/sizeof(Header),
+ /*end=*/filesystem_->GetFileSize(file_path_.c_str()));
+}
+
} // namespace lib
} // namespace icing
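
For reference, the per-entry metadata that WriteProto and ReadProtoMetadata agree on above is a single 4-byte word: the high byte holds kProtoMagic and the low three bytes hold the (possibly compressed) proto size, which ::Create guarantees fits in 24 bits. A minimal sketch of that packing, with hypothetical helper names and a placeholder magic value (the real constant is private to FileBackedProtoLog):

#include <cstdint>

constexpr uint8_t kProtoMagic = 0x5C;  // placeholder value for illustration

// Pack the magic into the high byte and the size into the low 3 bytes,
// as WriteProto does with `(kProtoMagic << 24) | final_size`.
constexpr int32_t PackMetadata(uint8_t magic, int32_t proto_size) {
  return static_cast<int32_t>((static_cast<uint32_t>(magic) << 24) |
                              (static_cast<uint32_t>(proto_size) & 0x00FFFFFF));
}

// Recover the pieces, mirroring what ReadProtoMetadata validates.
constexpr uint8_t UnpackMagic(int32_t metadata) {
  return static_cast<uint8_t>(static_cast<uint32_t>(metadata) >> 24);
}
constexpr int32_t UnpackProtoSize(int32_t metadata) {
  return metadata & 0x00FFFFFF;
}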
diff --git a/icing/file/file-backed-proto-log_benchmark.cc b/icing/file/file-backed-proto-log_benchmark.cc
new file mode 100644
index 0000000..c09fd5a
--- /dev/null
+++ b/icing/file/file-backed-proto-log_benchmark.cc
@@ -0,0 +1,251 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdint>
+#include <random>
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "icing/document-builder.h"
+#include "icing/file/file-backed-proto-log.h"
+#include "icing/file/filesystem.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/proto/document.pb.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/random-string.h"
+#include "icing/testing/tmp-directory.h"
+
+// go/microbenchmarks
+//
+// To build and run on a local machine:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// icing/file:file-backed-proto-log_benchmark
+//
+// $ blaze-bin/icing/file/file-backed-proto-log_benchmark
+// --benchmarks=all
+//
+//
+// To build and run on an Android device (must be connected and rooted):
+// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+// icing/file:file-backed-proto-log_benchmark
+//
+// $ adb root
+//
+// $ adb push
+// blaze-bin/icing/file/file-backed-proto-log_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/file-backed-proto-log_benchmark
+// --benchmarks=all
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+static void BM_Write(benchmark::State& state) {
+ const Filesystem filesystem;
+ int string_length = state.range(0);
+ const std::string file_path = IcingStringUtil::StringPrintf(
+ "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log");
+ int max_proto_size = (1 << 24) - 1; // 16 MiB
+ bool compress = true;
+
+ // Make sure it doesn't already exist.
+ filesystem.DeleteFile(file_path.c_str());
+
+ auto proto_log =
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem, file_path,
+ FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size))
+ .ValueOrDie()
+ .proto_log;
+
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+ std::default_random_engine random;
+ const std::string rand_str =
+ RandomString(kAlNumAlphabet, string_length, &random);
+
+ auto document_properties = document.add_properties();
+ document_properties->set_name("string property");
+ document_properties->add_string_values(rand_str);
+
+ for (auto _ : state) {
+ testing::DoNotOptimize(proto_log->WriteProto(document));
+ }
+ state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
+ string_length);
+
+ // Cleanup after ourselves
+ filesystem.DeleteFile(file_path.c_str());
+}
+BENCHMARK(BM_Write)
+ ->Arg(1)
+ ->Arg(32)
+ ->Arg(512)
+ ->Arg(1024)
+ ->Arg(4 * 1024)
+ ->Arg(8 * 1024)
+ ->Arg(16 * 1024)
+ ->Arg(32 * 1024)
+ ->Arg(256 * 1024)
+ ->Arg(2 * 1024 * 1024)
+ ->Arg(8 * 1024 * 1024)
+ ->Arg(15 * 1024 * 1024); // We do 15MiB here since our max proto size is
+ // 16MiB, and we need some extra space for the
+ // rest of the document properties
+
+static void BM_Read(benchmark::State& state) {
+ const Filesystem filesystem;
+ int string_length = state.range(0);
+ const std::string file_path = IcingStringUtil::StringPrintf(
+ "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log");
+ int max_proto_size = (1 << 24) - 1; // 16 MiB
+ bool compress = true;
+
+ // Make sure it doesn't already exist.
+ filesystem.DeleteFile(file_path.c_str());
+
+ auto proto_log =
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem, file_path,
+ FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size))
+ .ValueOrDie()
+ .proto_log;
+
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+ std::default_random_engine random;
+ const std::string rand_str =
+ RandomString(kAlNumAlphabet, string_length, &random);
+
+ auto document_properties = document.add_properties();
+ document_properties->set_name("string property");
+ document_properties->add_string_values(rand_str);
+
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset,
+ proto_log->WriteProto(document));
+
+ for (auto _ : state) {
+ testing::DoNotOptimize(proto_log->ReadProto(write_offset));
+ }
+ state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
+ string_length);
+
+ // Cleanup after ourselves
+ filesystem.DeleteFile(file_path.c_str());
+}
+BENCHMARK(BM_Read)
+ ->Arg(1)
+ ->Arg(32)
+ ->Arg(512)
+ ->Arg(1024)
+ ->Arg(4 * 1024)
+ ->Arg(8 * 1024)
+ ->Arg(16 * 1024)
+ ->Arg(32 * 1024)
+ ->Arg(256 * 1024)
+ ->Arg(2 * 1024 * 1024)
+ ->Arg(8 * 1024 * 1024)
+ ->Arg(15 * 1024 * 1024); // We do 15MiB here since our max proto size is
+ // 16MiB, and we need some extra space for the
+ // rest of the document properties
+
+static void BM_Erase(benchmark::State& state) {
+ const Filesystem filesystem;
+ const std::string file_path = IcingStringUtil::StringPrintf(
+ "%s%s", GetTestTempDir().c_str(), "/proto.log");
+ int max_proto_size = (1 << 24) - 1; // 16 MiB
+ bool compress = true;
+
+ // Make sure it doesn't already exist.
+ filesystem.DeleteFile(file_path.c_str());
+
+ auto proto_log =
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem, file_path,
+ FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size))
+ .ValueOrDie()
+ .proto_log;
+
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+ std::default_random_engine random;
+ const std::string rand_str = RandomString(kAlNumAlphabet, /*len=*/1, &random);
+
+ auto document_properties = document.add_properties();
+ document_properties->set_name("string property");
+ document_properties->add_string_values(rand_str);
+
+ for (auto _ : state) {
+ state.PauseTiming();
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset,
+ proto_log->WriteProto(document));
+ state.ResumeTiming();
+
+ testing::DoNotOptimize(proto_log->EraseProto(write_offset));
+ }
+
+ // Cleanup after ourselves
+ filesystem.DeleteFile(file_path.c_str());
+}
+BENCHMARK(BM_Erase);
+
+static void BM_ComputeChecksum(benchmark::State& state) {
+ const Filesystem filesystem;
+ const std::string file_path = GetTestTempDir() + "/proto.log";
+ int max_proto_size = (1 << 24) - 1; // 16 MiB
+ bool compress = true;
+
+ // Make sure it doesn't already exist.
+ filesystem.DeleteFile(file_path.c_str());
+
+ auto proto_log =
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem, file_path,
+ FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size))
+ .ValueOrDie()
+ .proto_log;
+
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+ // Make each document 1KiB
+ int string_length = 1024;
+ std::default_random_engine random;
+ const std::string rand_str =
+ RandomString(kAlNumAlphabet, string_length, &random);
+
+ auto document_properties = document.add_properties();
+ document_properties->set_name("string property");
+ document_properties->add_string_values(rand_str);
+
+ int num_docs = state.range(0);
+ for (int i = 0; i < num_docs; ++i) {
+ ICING_ASSERT_OK(proto_log->WriteProto(document));
+ }
+
+ for (auto _ : state) {
+ testing::DoNotOptimize(proto_log->ComputeChecksum());
+ }
+
+ // Cleanup after ourselves
+ filesystem.DeleteFile(file_path.c_str());
+}
+BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20);
+
+} // namespace
+} // namespace lib
+} // namespace icing
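
The benchmarks above all follow the same reporting pattern: wrap the measured call in DoNotOptimize so the compiler cannot elide it, then call SetBytesProcessed with iterations() times the payload size so the framework reports throughput in bytes/second. A minimal standalone sketch of that pattern, assuming the open-source benchmark library (the file above uses the equivalent internal header):

#include <cstdint>
#include <string>

#include <benchmark/benchmark.h>

static void BM_StringCopy(benchmark::State& state) {
  const int64_t len = state.range(0);
  const std::string src(len, 'x');
  for (auto _ : state) {
    std::string dst = src;          // operation under measurement
    benchmark::DoNotOptimize(dst);  // keep the copy from being elided
  }
  // Reported as bytes/second next to the per-iteration time.
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * len);
}
BENCHMARK(BM_StringCopy)->Arg(1024)->Arg(64 * 1024);
BENCHMARK_MAIN();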
diff --git a/icing/file/file-backed-proto-log_test.cc b/icing/file/file-backed-proto-log_test.cc
index eccb0c7..d429277 100644
--- a/icing/file/file-backed-proto-log_test.cc
+++ b/icing/file/file-backed-proto-log_test.cc
@@ -19,7 +19,10 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
+#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
+#include "icing/file/mock-filesystem.h"
+#include "icing/portable/equals-proto.h"
#include "icing/proto/document.pb.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/tmp-directory.h"
@@ -29,7 +32,14 @@ namespace lib {
namespace {
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::A;
+using ::testing::Eq;
+using ::testing::Gt;
+using ::testing::Not;
using ::testing::NotNull;
+using ::testing::Pair;
+using ::testing::Return;
class FileBackedProtoLogTest : public ::testing::Test {
protected:
@@ -77,6 +87,193 @@ TEST_F(FileBackedProtoLogTest, Initialize) {
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
+TEST_F(FileBackedProtoLogTest, WriteProtoTooLarge) {
+ int max_proto_size = 1;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+ // Proto is too large for the max_proto_size_in
+ ASSERT_THAT(proto_log->WriteProto(document),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(FileBackedProtoLogTest, ReadProtoWrongKProtoMagic) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write a proto
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t file_offset,
+ proto_log->WriteProto(document));
+
+ // 4 bytes of metadata that deliberately don't match the kProtoMagic
+ // specified in file-backed-proto-log.h
+ uint32_t wrong_magic = 0x7E000000;
+
+ // Sanity check that we opened the file correctly
+ int fd = filesystem_.OpenForWrite(file_path_.c_str());
+ ASSERT_GT(fd, 0);
+
+ // Write the wrong kProtoMagic in; the magic is stored at the beginning of
+ // a proto entry.
+ filesystem_.PWrite(fd, file_offset, &wrong_magic, sizeof(wrong_magic));
+
+ ASSERT_THAT(proto_log->ReadProto(file_offset),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(FileBackedProtoLogTest, ReadWriteUncompressedProto) {
+ int last_offset;
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(
+ /*compress_in=*/false, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write the first proto
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(int written_position,
+ proto_log->WriteProto(document1));
+
+ int document1_offset = written_position;
+
+ // Check that what we read is what we wrote
+ ASSERT_THAT(proto_log->ReadProto(written_position),
+ IsOkAndHolds(EqualsProto(document1)));
+
+ // Write a second proto that's close to the max size. Leave some room for
+ // the rest of the proto properties.
+ std::string long_str(max_proto_size_ - 1024, 'a');
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .AddStringProperty("long_str", long_str)
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(written_position,
+ proto_log->WriteProto(document2));
+
+ int document2_offset = written_position;
+ last_offset = written_position;
+ ASSERT_GT(document2_offset, document1_offset);
+
+ // Check the second proto
+ ASSERT_THAT(proto_log->ReadProto(written_position),
+ IsOkAndHolds(EqualsProto(document2)));
+
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+ }
+
+ {
+ // Make a new proto_log with the same file_path, and make sure we
+ // can still write to the same underlying file.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(
+ /*compress_in=*/false, max_proto_size_)));
+ auto recreated_proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write a third proto
+ DocumentProto document3 =
+ DocumentBuilder().SetKey("namespace3", "uri3").Build();
+
+ ASSERT_THAT(recreated_proto_log->WriteProto(document3),
+ IsOkAndHolds(Gt(last_offset)));
+ }
+}
+
+TEST_F(FileBackedProtoLogTest, ReadWriteCompressedProto) {
+ int last_offset;
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(
+ /*compress_in=*/true, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write the first proto
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(int written_position,
+ proto_log->WriteProto(document1));
+
+ int document1_offset = written_position;
+
+ // Check that what we read is what we wrote
+ ASSERT_THAT(proto_log->ReadProto(written_position),
+ IsOkAndHolds(EqualsProto(document1)));
+
+ // Write a second proto that's close to the max size. Leave some room for
+ // the rest of the proto properties.
+ std::string long_str(max_proto_size_ - 1024, 'a');
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .AddStringProperty("long_str", long_str)
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(written_position,
+ proto_log->WriteProto(document2));
+
+ int document2_offset = written_position;
+ last_offset = written_position;
+ ASSERT_GT(document2_offset, document1_offset);
+
+ // Check the second proto
+ ASSERT_THAT(proto_log->ReadProto(written_position),
+ IsOkAndHolds(EqualsProto(document2)));
+
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+ }
+
+ {
+ // Make a new proto_log with the same file_path, and make sure we
+ // can still write to the same underlying file.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(
+ /*compress_in=*/true, max_proto_size_)));
+ auto recreated_proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write a third proto
+ DocumentProto document3 =
+ DocumentBuilder().SetKey("namespace3", "uri3").Build();
+
+ ASSERT_THAT(recreated_proto_log->WriteProto(document3),
+ IsOkAndHolds(Gt(last_offset)));
+ }
+}
+
TEST_F(FileBackedProtoLogTest, CorruptHeader) {
{
ICING_ASSERT_OK_AND_ASSIGN(
@@ -106,6 +303,382 @@ TEST_F(FileBackedProtoLogTest, CorruptHeader) {
}
}
+TEST_F(FileBackedProtoLogTest, CorruptContent) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ EXPECT_FALSE(create_result.has_data_loss());
+
+ DocumentProto document =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+ // Write and persist a document.
+ ICING_ASSERT_OK_AND_ASSIGN(int document_offset,
+ proto_log->WriteProto(document));
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+ // "Corrupt" the content written in the log.
+ document.set_uri("invalid");
+ std::string serialized_document = document.SerializeAsString();
+ filesystem_.PWrite(file_path_.c_str(), document_offset,
+ serialized_document.data(), serialized_document.size());
+ }
+
+ {
+ // We can recover, but we have data loss.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_TRUE(create_result.has_data_loss());
+ ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE));
+
+ // Lost everything in the log since the rewind position doesn't help if
+ // there's been data corruption within the persisted region
+ ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()),
+ sizeof(FileBackedProtoLog<DocumentProto>::Header));
+ }
+}
+
+TEST_F(FileBackedProtoLogTest, PersistToDisk) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace2", "uri2").Build();
+ int document1_offset, document2_offset;
+ int log_size;
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write and persist the first proto
+ ICING_ASSERT_OK_AND_ASSIGN(document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+ // Write, but don't explicitly persist the second proto
+ ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
+ proto_log->WriteProto(document2));
+
+ // Check that what we read is what we wrote
+ ASSERT_THAT(proto_log->ReadProto(document1_offset),
+ IsOkAndHolds(EqualsProto(document1)));
+ ASSERT_THAT(proto_log->ReadProto(document2_offset),
+ IsOkAndHolds(EqualsProto(document2)));
+
+ log_size = filesystem_.GetFileSize(file_path_.c_str());
+ ASSERT_GT(log_size, 0);
+ }
+
+ {
+ // The header rewind position and checksum aren't updated in this "system
+ // crash" scenario.
+
+ std::string bad_proto =
+ "some incomplete proto that we didn't finish writing before the system "
+ "crashed";
+ filesystem_.PWrite(file_path_.c_str(), log_size, bad_proto.data(),
+ bad_proto.size());
+
+ // Double check that we actually wrote something to the underlying file
+ ASSERT_GT(filesystem_.GetFileSize(file_path_.c_str()), log_size);
+ }
+
+ {
+ // We can recover, but we have data loss
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_TRUE(create_result.has_data_loss());
+ ASSERT_THAT(create_result.data_loss, Eq(DataLoss::PARTIAL));
+
+ // Check that everything was persisted across instances
+ ASSERT_THAT(proto_log->ReadProto(document1_offset),
+ IsOkAndHolds(EqualsProto(document1)));
+ ASSERT_THAT(proto_log->ReadProto(document2_offset),
+ IsOkAndHolds(EqualsProto(document2)));
+
+ // We correctly rewound to the last good state.
+ ASSERT_EQ(log_size, filesystem_.GetFileSize(file_path_.c_str()));
+ }
+}
+
+TEST_F(FileBackedProtoLogTest, Iterator) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "uri2").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ {
+ // Empty iterator
+ auto iterator = proto_log->GetIterator();
+ ASSERT_THAT(iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ }
+
+ {
+ // Iterates through some documents
+ ICING_ASSERT_OK(proto_log->WriteProto(document1));
+ ICING_ASSERT_OK(proto_log->WriteProto(document2));
+ auto iterator = proto_log->GetIterator();
+ // 1st proto
+ ICING_ASSERT_OK(iterator.Advance());
+ ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()),
+ IsOkAndHolds(EqualsProto(document1)));
+ // 2nd proto
+ ICING_ASSERT_OK(iterator.Advance());
+ ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()),
+ IsOkAndHolds(EqualsProto(document2)));
+ // Tries to advance
+ ASSERT_THAT(iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ }
+
+ {
+ // Iterator with bad filesystem
+ MockFilesystem mock_filesystem;
+ ON_CALL(mock_filesystem, GetFileSize(A<const char *>()))
+ .WillByDefault(Return(Filesystem::kBadFileSize));
+ FileBackedProtoLog<DocumentProto>::Iterator bad_iterator(
+ mock_filesystem, file_path_, /*initial_offset=*/0);
+ ASSERT_THAT(bad_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ }
+}
+
+TEST_F(FileBackedProtoLogTest, ComputeChecksum) {
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+ Crc32 checksum;
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ ICING_EXPECT_OK(proto_log->WriteProto(document));
+
+ ICING_ASSERT_OK_AND_ASSIGN(checksum, proto_log->ComputeChecksum());
+
+ // Calling it twice with no changes should get us the same checksum
+ EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Checksum should be consistent across instances
+ EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
+
+ // PersistToDisk shouldn't affect the checksum value
+ ICING_EXPECT_OK(proto_log->PersistToDisk());
+ EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
+
+ // Check that modifying the log leads to a different checksum
+ ICING_EXPECT_OK(proto_log->WriteProto(document));
+ EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Not(Eq(checksum))));
+ }
+}
+
+TEST_F(FileBackedProtoLogTest, EraseProtoShouldSetZero) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Writes and erases proto
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ // Checks if the erased area is set to 0.
+ int64_t file_size = filesystem_.GetFileSize(file_path_.c_str());
+ MemoryMappedFile mmapped_file(filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_ONLY);
+
+ // document1_offset + sizeof(int) is the start byte of the proto where
+ // sizeof(int) is the size of the proto metadata.
+ mmapped_file.Remap(document1_offset + sizeof(int), file_size - 1);
+ for (size_t i = 0; i < mmapped_file.region_size(); ++i) {
+ ASSERT_THAT(mmapped_file.region()[i], Eq(0));
+ }
+}
+
+TEST_F(FileBackedProtoLogTest, EraseProtoShouldReturnNotFound) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "uri2").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Writes 2 protos
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document2_offset,
+ proto_log->WriteProto(document2));
+
+ // Erases the first proto
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ // The first proto has been erased.
+ ASSERT_THAT(proto_log->ReadProto(document1_offset),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ // The second proto should be returned.
+ ASSERT_THAT(proto_log->ReadProto(document2_offset),
+ IsOkAndHolds(EqualsProto(document2)));
+}
+
+TEST_F(FileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "uri2").Build();
+ DocumentProto document3 =
+ DocumentBuilder().SetKey("namespace", "uri3").Build();
+ DocumentProto document4 =
+ DocumentBuilder().SetKey("namespace", "uri4").Build();
+
+ int64_t document2_offset;
+ int64_t document3_offset;
+
+ {
+ // Erase data after the rewind position. This won't update the checksum
+ // immediately.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Writes 3 protos
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
+ proto_log->WriteProto(document2));
+ ICING_ASSERT_OK_AND_ASSIGN(document3_offset,
+ proto_log->WriteProto(document3));
+
+ // Erases the 1st proto, checksum won't be updated immediately because the
+ // rewind position is 0.
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(2293202502))));
+ } // New checksum is updated in destructor.
+
+ {
+ // Erase data before the rewind position. This will update the checksum
+ // immediately.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Erases the 2nd proto that is now before the rewind position. Checksum is
+ // updated.
+ ICING_ASSERT_OK(proto_log->EraseProto(document2_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(639634028))));
+ }
+
+ {
+ // Append data and erase data before the rewind position. This will update
+ // the checksum twice: in EraseProto() and destructor.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Append a new document which is after the rewind position.
+ ICING_ASSERT_OK(proto_log->WriteProto(document4));
+
+ // Erases the 3rd proto that is now before the rewind position. Checksum is
+ // updated.
+ ICING_ASSERT_OK(proto_log->EraseProto(document3_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(1990198693))));
+ } // Checksum is updated with the newly appended document.
+
+ {
+ // A successful creation means that the checksum matches.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ EXPECT_FALSE(create_result.has_data_loss());
+ }
+}
+
} // namespace
} // namespace lib
} // namespace icing
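Taken together, these tests pin down the checksum lifecycle of FileBackedProtoLog: writes change the checksum, PersistToDisk() does not, and the value is stable across instances. A condensed sketch of that contract, using the Create/Options API exercised above (error handling elided; the document and size values are placeholders):

    // Sketch only; mirrors the test API above.
    auto create_result =
        FileBackedProtoLog<DocumentProto>::Create(
            &filesystem, file_path,
            FileBackedProtoLog<DocumentProto>::Options(/*compress=*/true,
                                                       /*max_proto_size=*/1 << 20))
            .ValueOrDie();
    auto proto_log = std::move(create_result.proto_log);

    ICING_ASSERT_OK(proto_log->WriteProto(document));     // checksum changes
    Crc32 checksum = proto_log->ComputeChecksum().ValueOrDie();
    ICING_ASSERT_OK(proto_log->PersistToDisk());          // checksum unchanged
    // Re-creating the log over the same file yields the same checksum and
    // create_result.has_data_loss() == false.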
diff --git a/icing/file/file-backed-proto.h b/icing/file/file-backed-proto.h
index d7d9bad..15a1953 100644
--- a/icing/file/file-backed-proto.h
+++ b/icing/file/file-backed-proto.h
@@ -63,17 +63,6 @@ class FileBackedProto {
// file_path : Must be a path within a directory that already exists.
FileBackedProto(const Filesystem& filesystem, std::string_view file_path);
- // Reset the internal file_path for the file backed proto.
- // Example use:
- // auto file_backed_proto1 = *FileBackedProto<Proto>::Create(...);
- // auto file_backed_proto2 = *FileBackedProto<Proto>::Create(...);
- // filesystem.SwapFiles(file1, file2);
- // file_backed_proto1.SetSwappedFilepath(file2);
- // file_backed_proto2.SetSwappedFilepath(file1);
- void SetSwappedFilepath(std::string_view swapped_to_file_path) {
- file_path_ = swapped_to_file_path;
- }
-
// Returns a reference to the proto read from the file. It
// internally caches the read proto so that future calls are fast.
//
@@ -110,7 +99,7 @@ class FileBackedProto {
mutable absl_ports::shared_mutex mutex_;
const Filesystem* const filesystem_;
- std::string file_path_;
+ const std::string file_path_;
mutable std::unique_ptr<ProtoT> cached_proto_ ICING_GUARDED_BY(mutex_);
};
diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h
index 7e42e32..0989935 100644
--- a/icing/file/file-backed-vector.h
+++ b/icing/file/file-backed-vector.h
@@ -56,9 +56,10 @@
#ifndef ICING_FILE_FILE_BACKED_VECTOR_H_
#define ICING_FILE_FILE_BACKED_VECTOR_H_
+#include <inttypes.h>
+#include <stdint.h>
#include <sys/mman.h>
-#include <cinttypes>
#include <cstdint>
#include <memory>
#include <string>
@@ -586,11 +587,8 @@ libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary(
}
int64_t current_file_size = filesystem_->GetFileSize(file_path_.c_str());
- if (current_file_size == Filesystem::kBadFileSize) {
- return absl_ports::InternalError("Unable to retrieve file size.");
- }
-
int64_t least_file_size_needed = sizeof(Header) + num_elements * sizeof(T);
+
if (least_file_size_needed <= current_file_size) {
// Our underlying file can hold the target num_elements because we've grown
// before.
diff --git a/icing/file/file-backed-vector_test.cc b/icing/file/file-backed-vector_test.cc
index ed94fa5..b05ce2d 100644
--- a/icing/file/file-backed-vector_test.cc
+++ b/icing/file/file-backed-vector_test.cc
@@ -14,30 +14,26 @@
#include "icing/file/file-backed-vector.h"
-#include <unistd.h>
+#include <errno.h>
#include <algorithm>
-#include <cerrno>
#include <cstdint>
#include <memory>
#include <string_view>
#include <vector>
-#include "knowledge/cerebra/sense/text_classifier/lib3/utils/base/status.h"
-#include "testing/base/public/gmock.h"
-#include "testing/base/public/gunit.h"
-#include "third_party/icing/file/filesystem.h"
-#include "third_party/icing/file/memory-mapped-file.h"
-#include "third_party/icing/file/mock-filesystem.h"
-#include "third_party/icing/testing/common-matchers.h"
-#include "third_party/icing/testing/tmp-directory.h"
-#include "third_party/icing/util/crc32.h"
-#include "third_party/icing/util/logging.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/crc32.h"
+#include "icing/util/logging.h"
using ::testing::Eq;
using ::testing::IsTrue;
using ::testing::Pointee;
-using ::testing::Return;
namespace icing {
namespace lib {
@@ -78,8 +74,6 @@ class FileBackedVectorTest : public testing::Test {
return std::string_view(vector->array() + idx, expected_len);
}
- const Filesystem& filesystem() const { return filesystem_; }
-
Filesystem filesystem_;
std::string file_path_;
int fd_;
@@ -644,60 +638,6 @@ TEST_F(FileBackedVectorTest, InitNormalSucceeds) {
}
}
-TEST_F(FileBackedVectorTest, RemapFailureStillValidInstance) {
- auto mock_filesystem = std::make_unique<MockFilesystem>();
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<FileBackedVector<int>> vector,
- FileBackedVector<int>::Create(
- *mock_filesystem, file_path_,
- MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
-
- // 1. Write data to just before the first block resize. Running the test
- // locally has determined that we'll first resize at 65531st entry.
- constexpr int kResizingIndex = 16378;
- for (int i = 0; i < kResizingIndex; ++i) {
- ICING_ASSERT_OK(vector->Set(i, 7));
- }
-
- // 2. The next Set call should cause a resize and a remap. Make that remap
- // fail.
- int num_calls = 0;
- auto open_lambda = [this, &num_calls](const char* file_name){
- if (++num_calls == 2) {
- return -1;
- }
- return this->filesystem().OpenForWrite(file_name);
- };
- ON_CALL(*mock_filesystem, OpenForWrite(_)).WillByDefault(open_lambda);
- EXPECT_THAT(vector->Set(kResizingIndex, 7),
- StatusIs(libtextclassifier3::StatusCode::INTERNAL));
-
- // 3. We should still be able to call set correctly for earlier regions.
- ICING_EXPECT_OK(vector->Set(kResizingIndex / 2, 9));
- EXPECT_THAT(vector->Get(kResizingIndex / 2), IsOkAndHolds(Pointee(Eq(9))));
-}
-
-TEST_F(FileBackedVectorTest, BadFileSizeDuringGrowReturnsError) {
- auto mock_filesystem = std::make_unique<MockFilesystem>();
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<FileBackedVector<int>> vector,
- FileBackedVector<int>::Create(
- *mock_filesystem, file_path_,
- MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
-
- // At first, the vector is empty and has no mapping established. The first Set
- // call will cause a Grow.
- // During Grow, we will attempt to check the underlying file size to see if
- // growing is actually necessary. Return an error on the call to GetFileSize.
- ON_CALL(*mock_filesystem, GetFileSize(A<const char*>()))
- .WillByDefault(Return(Filesystem::kBadFileSize));
-
- // We should fail gracefully and return an INTERNAL error to indicate that
- // there was an issue retrieving the file size.
- EXPECT_THAT(vector->Set(0, 7),
- StatusIs(libtextclassifier3::StatusCode::INTERNAL));
-}
-
} // namespace
} // namespace lib
diff --git a/icing/file/filesystem.cc b/icing/file/filesystem.cc
index 82b8d98..0655cb9 100644
--- a/icing/file/filesystem.cc
+++ b/icing/file/filesystem.cc
@@ -16,6 +16,7 @@
#include <dirent.h>
#include <dlfcn.h>
+#include <errno.h>
#include <fcntl.h>
#include <fnmatch.h>
#include <pthread.h>
@@ -25,7 +26,6 @@
#include <unistd.h>
#include <algorithm>
-#include <cerrno>
#include <cstdint>
#include <unordered_set>
diff --git a/icing/file/filesystem.h b/icing/file/filesystem.h
index dd2c5d1..6bed8e6 100644
--- a/icing/file/filesystem.h
+++ b/icing/file/filesystem.h
@@ -17,9 +17,11 @@
#ifndef ICING_FILE_FILESYSTEM_H_
#define ICING_FILE_FILESYSTEM_H_
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
#include <cstdint>
-#include <cstdio>
-#include <cstring>
#include <memory>
#include <string>
#include <unordered_set>
@@ -233,11 +235,6 @@ class Filesystem {
// Increments to_increment by size if size is valid, or sets to_increment
// to kBadFileSize if either size or to_increment is kBadFileSize.
static void IncrementByOrSetInvalid(int64_t size, int64_t* to_increment);
-
- // Return -1 if file_size is invalid. Otherwise, return file_size.
- static int64_t SanitizeFileSize(int64_t file_size) {
- return (file_size != kBadFileSize) ? file_size : -1;
- }
};
// LINT.ThenChange(//depot/google3/icing/file/mock-filesystem.h)
diff --git a/icing/file/memory-mapped-file.cc b/icing/file/memory-mapped-file.cc
index 9ff3adb..bda01f2 100644
--- a/icing/file/memory-mapped-file.cc
+++ b/icing/file/memory-mapped-file.cc
@@ -70,10 +70,10 @@ void MemoryMappedFile::MemoryMappedFile::Unmap() {
libtextclassifier3::Status MemoryMappedFile::Remap(size_t file_offset,
size_t mmap_size) {
- if (mmap_size == 0) {
- // First unmap any previously mmapped region.
- Unmap();
+ // First unmap any previously mmapped region.
+ Unmap();
+ if (mmap_size == 0) {
// Nothing more to do.
return libtextclassifier3::Status::OK;
}
@@ -118,19 +118,15 @@ libtextclassifier3::Status MemoryMappedFile::Remap(size_t file_offset,
"Unable to open file meant to be mmapped: ", file_path_));
}
- void* mmap_result = mmap(nullptr, adjusted_mmap_size, protection_flags,
- mmap_flags, fd.get(), aligned_offset);
+ mmap_result_ = mmap(nullptr, adjusted_mmap_size, protection_flags, mmap_flags,
+ fd.get(), aligned_offset);
- if (mmap_result == MAP_FAILED) {
+ if (mmap_result_ == MAP_FAILED) {
+ mmap_result_ = nullptr;
return absl_ports::InternalError(absl_ports::StrCat(
"Failed to mmap region due to error: ", strerror(errno)));
}
- // Now we know that we have successfully created a new mapping. We can free
- // the old one and switch to the new one.
- Unmap();
-
- mmap_result_ = mmap_result;
file_offset_ = file_offset;
region_ = reinterpret_cast<char*>(mmap_result_) + alignment_adjustment;
region_size_ = mmap_size;
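The reordering above changes Remap()'s failure semantics: the previous mapping is released before the new mmap() is attempted, so a failed Remap() leaves the object unmapped (mmap_result_ is reset to nullptr) instead of still exposing the old region. A self-contained sketch of the unmap-first ordering (function and parameters hypothetical):

    #include <sys/mman.h>
    #include <cstddef>

    // Unmap-first remap: on failure the caller is left with no mapping at
    // all, which is simpler to reason about than keeping a stale region.
    void* RemapUnmapFirst(void* old_region, size_t old_size, int fd,
                          size_t new_size) {
      if (old_region != nullptr) {
        munmap(old_region, old_size);  // drop the old mapping unconditionally
      }
      void* result = mmap(/*addr=*/nullptr, new_size, PROT_READ, MAP_PRIVATE,
                          fd, /*offset=*/0);
      return result == MAP_FAILED ? nullptr : result;
    }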
diff --git a/icing/file/portable-file-backed-proto-log.h b/icing/file/portable-file-backed-proto-log.h
index 409ab96..825b763 100644
--- a/icing/file/portable-file-backed-proto-log.h
+++ b/icing/file/portable-file-backed-proto-log.h
@@ -64,6 +64,7 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include <google/protobuf/io/gzip_stream.h>
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
@@ -71,7 +72,6 @@
#include "icing/file/memory-mapped-file.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/portable/endian.h"
-#include "icing/portable/gzip_stream.h"
#include "icing/portable/platform.h"
#include "icing/portable/zlib.h"
#include "icing/util/bit-util.h"
@@ -124,8 +124,6 @@ class PortableFileBackedProtoLog {
public:
static constexpr int32_t kMagic = 0xf4c6f67a;
- // We should go directly from 0 to 2 the next time we have to change the
- // format.
static constexpr int32_t kFileFormatVersion = 0;
uint32_t CalculateHeaderChecksum() const {
@@ -143,57 +141,49 @@ class PortableFileBackedProtoLog {
return crc.Get();
}
- int32_t GetMagic() const { return GNetworkToHostL(magic_nbytes_); }
+ int32_t GetMagic() const { return gntohl(magic_nbytes_); }
- void SetMagic(int32_t magic_in) {
- magic_nbytes_ = GHostToNetworkL(magic_in);
- }
+ void SetMagic(int32_t magic_in) { magic_nbytes_ = ghtonl(magic_in); }
int32_t GetFileFormatVersion() const {
- return GNetworkToHostL(file_format_version_nbytes_);
+ return gntohl(file_format_version_nbytes_);
}
void SetFileFormatVersion(int32_t file_format_version_in) {
- file_format_version_nbytes_ = GHostToNetworkL(file_format_version_in);
+ file_format_version_nbytes_ = ghtonl(file_format_version_in);
}
- int32_t GetMaxProtoSize() const {
- return GNetworkToHostL(max_proto_size_nbytes_);
- }
+ int32_t GetMaxProtoSize() const { return gntohl(max_proto_size_nbytes_); }
void SetMaxProtoSize(int32_t max_proto_size_in) {
- max_proto_size_nbytes_ = GHostToNetworkL(max_proto_size_in);
+ max_proto_size_nbytes_ = ghtonl(max_proto_size_in);
}
- int32_t GetLogChecksum() const {
- return GNetworkToHostL(log_checksum_nbytes_);
- }
+ int32_t GetLogChecksum() const { return gntohl(log_checksum_nbytes_); }
void SetLogChecksum(int32_t log_checksum_in) {
- log_checksum_nbytes_ = GHostToNetworkL(log_checksum_in);
+ log_checksum_nbytes_ = ghtonl(log_checksum_in);
}
- int64_t GetRewindOffset() const {
- return GNetworkToHostLL(rewind_offset_nbytes_);
- }
+ int64_t GetRewindOffset() const { return gntohll(rewind_offset_nbytes_); }
void SetRewindOffset(int64_t rewind_offset_in) {
- rewind_offset_nbytes_ = GHostToNetworkLL(rewind_offset_in);
+ rewind_offset_nbytes_ = ghtonll(rewind_offset_in);
}
int32_t GetHeaderChecksum() const {
- return GNetworkToHostL(header_checksum_nbytes_);
+ return gntohl(header_checksum_nbytes_);
}
void SetHeaderChecksum(int32_t header_checksum_in) {
- header_checksum_nbytes_ = GHostToNetworkL(header_checksum_in);
+ header_checksum_nbytes_ = ghtonl(header_checksum_in);
}
bool GetCompressFlag() const { return GetFlag(kCompressBit); }
void SetCompressFlag(bool compress) { SetFlag(kCompressBit, compress); }
- bool GetDirtyFlag() const { return GetFlag(kDirtyBit); }
+ bool GetDirtyFlag() { return GetFlag(kDirtyBit); }
void SetDirtyFlag(bool dirty) { SetFlag(kDirtyBit, dirty); }
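Every multi-byte header field above is stored in network byte order and converted at the accessor boundary, which keeps the on-disk format identical across architectures. A sketch of the pattern with a hypothetical field (using the standard htonl/ntohl; the log itself goes through its ghtonl/gntohl ports):

    #include <cstdint>
    #include <arpa/inet.h>

    struct PortableField {
      uint32_t value_nbytes_ = 0;                           // big-endian on disk
      int32_t Get() const { return ntohl(value_nbytes_); }  // disk -> host
      void Set(int32_t v) { value_nbytes_ = htonl(v); }     // host -> disk
    };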
@@ -219,7 +209,7 @@ class PortableFileBackedProtoLog {
// Holds the magic as a quick sanity check against file corruption.
//
// Field is in network-byte order.
- int32_t magic_nbytes_ = GHostToNetworkL(kMagic);
+ int32_t magic_nbytes_ = ghtonl(kMagic);
// Must be at the beginning after kMagic. Contains the crc checksum of
// the following fields.
@@ -233,7 +223,7 @@ class PortableFileBackedProtoLog {
// valid instead of throwing away the entire log.
//
// Field is in network-byte order.
- int64_t rewind_offset_nbytes_ = GHostToNetworkLL(kHeaderReservedBytes);
+ int64_t rewind_offset_nbytes_ = ghtonll(kHeaderReservedBytes);
// Version number tracking how we serialize the file to disk. If we change
// how/what we write to disk, this version should be updated and this class
@@ -284,7 +274,7 @@ class PortableFileBackedProtoLog {
// before updating our checksum.
bool recalculated_checksum = false;
- bool has_data_loss() const {
+ bool has_data_loss() {
return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
}
};
@@ -378,7 +368,8 @@ class PortableFileBackedProtoLog {
// }
class Iterator {
public:
- Iterator(const Filesystem& filesystem, int fd, int64_t initial_offset);
+ Iterator(const Filesystem& filesystem, const std::string& file_path,
+ int64_t initial_offset);
// Advances to the position of next proto whether it has been erased or not.
//
@@ -394,12 +385,11 @@ class PortableFileBackedProtoLog {
private:
static constexpr int64_t kInvalidOffset = -1;
// Used to read proto metadata
+ MemoryMappedFile mmapped_file_;
// Offset of first proto
- const Filesystem* const filesystem_;
int64_t initial_offset_;
int64_t current_offset_;
int64_t file_size_;
- int fd_;
};
// Returns an iterator of the current proto log. The caller needs to keep the
@@ -515,7 +505,7 @@ class PortableFileBackedProtoLog {
const Filesystem* filesystem, const std::string& file_path,
Crc32 initial_crc, int64_t start, int64_t end);
- // Reads out the metadata of a proto located at file_offset from the fd.
+ // Reads out the metadata of a proto located at file_offset from the file.
// Metadata will be returned in host byte order.
//
// Returns:
@@ -523,8 +513,7 @@ class PortableFileBackedProtoLog {
// OUT_OF_RANGE_ERROR if file_offset exceeds file_size
// INTERNAL_ERROR if the metadata is invalid or any IO errors happen
static libtextclassifier3::StatusOr<int32_t> ReadProtoMetadata(
- const Filesystem* const filesystem, int fd, int64_t file_offset,
- int64_t file_size);
+ MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size);
// Writes metadata of a proto to the fd. Takes in host byte order
// metadata and converts it into portable metadata before writing.
@@ -579,6 +568,9 @@ class PortableFileBackedProtoLog {
};
template <typename ProtoT>
+constexpr uint8_t PortableFileBackedProtoLog<ProtoT>::kProtoMagic;
+
+template <typename ProtoT>
PortableFileBackedProtoLog<ProtoT>::PortableFileBackedProtoLog(
const Filesystem* filesystem, const std::string& file_path,
std::unique_ptr<Header> header)
@@ -733,7 +725,7 @@ PortableFileBackedProtoLog<ProtoT>::InitializeExistingFile(
return absl_ports::InternalError(IcingStringUtil::StringPrintf(
"Failed to truncate '%s' to size %lld", file_path.data(),
static_cast<long long>(header->GetRewindOffset())));
- }
+ };
data_loss = DataLoss::PARTIAL;
}
@@ -889,11 +881,12 @@ PortableFileBackedProtoLog<ProtoT>::WriteProto(const ProtoT& proto) {
google::protobuf::io::StringOutputStream proto_stream(&proto_str);
if (header_->GetCompressFlag()) {
- protobuf_ports::GzipOutputStream::Options options;
- options.format = protobuf_ports::GzipOutputStream::ZLIB;
+ google::protobuf::io::GzipOutputStream::Options options;
+ options.format = google::protobuf::io::GzipOutputStream::ZLIB;
options.compression_level = kDeflateCompressionLevel;
- protobuf_ports::GzipOutputStream compressing_stream(&proto_stream, options);
+ google::protobuf::io::GzipOutputStream compressing_stream(&proto_stream,
+ options);
bool success = proto.SerializeToZeroCopyStream(&compressing_stream) &&
compressing_stream.Close();
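With the compress flag set, WriteProto() serializes through protobuf's GzipOutputStream in ZLIB format (zlib-wrapped deflate), as shown above. A self-contained sketch of that write path (the compression level parameter stands in for kDeflateCompressionLevel):

    #include <string>
    #include <google/protobuf/io/gzip_stream.h>
    #include <google/protobuf/io/zero_copy_stream_impl_lite.h>

    // Compress `proto` into `out`; returns false if serialization or the
    // final flush fails. Mirrors the stream setup in WriteProto above.
    template <typename ProtoT>
    bool CompressProto(const ProtoT& proto, int compression_level,
                       std::string* out) {
      google::protobuf::io::StringOutputStream proto_stream(out);
      google::protobuf::io::GzipOutputStream::Options options;
      options.format = google::protobuf::io::GzipOutputStream::ZLIB;
      options.compression_level = compression_level;
      google::protobuf::io::GzipOutputStream compressing_stream(&proto_stream,
                                                                options);
      // Close() flushes buffered zlib output; both steps must succeed.
      return proto.SerializeToZeroCopyStream(&compressing_stream) &&
             compressing_stream.Close();
    }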
@@ -940,42 +933,40 @@ template <typename ProtoT>
libtextclassifier3::StatusOr<ProtoT>
PortableFileBackedProtoLog<ProtoT>::ReadProto(int64_t file_offset) const {
int64_t file_size = filesystem_->GetFileSize(fd_.get());
- // Read out the metadata
- if (file_size == Filesystem::kBadFileSize) {
- return absl_ports::OutOfRangeError("Unable to correctly read size.");
+ MemoryMappedFile mmapped_file(*filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_ONLY);
+ if (file_offset >= file_size) {
+ // file_size points to the next byte to write at, so subtract one to get
+ // the inclusive, actual size of the file.
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
+ "out of range of the file size, %lld",
+ static_cast<long long>(file_offset),
+ static_cast<long long>(file_size - 1)));
}
+
+ // Read out the metadata
ICING_ASSIGN_OR_RETURN(
int32_t metadata,
- ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size));
+ ReadProtoMetadata(&mmapped_file, file_offset, file_size));
// Copy out however many bytes it says the proto is
int stored_size = GetProtoSize(metadata);
- file_offset += sizeof(metadata);
- // Read the compressed proto out.
- if (file_offset + stored_size > file_size) {
- return absl_ports::OutOfRangeError(
- IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
- "out of range of the file size, %lld",
- static_cast<long long>(file_offset),
- static_cast<long long>(file_size - 1)));
- }
- auto buf = std::make_unique<char[]>(stored_size);
- if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
- return absl_ports::InternalError("");
- }
+ ICING_RETURN_IF_ERROR(
+ mmapped_file.Remap(file_offset + sizeof(metadata), stored_size));
- if (IsEmptyBuffer(buf.get(), stored_size)) {
+ if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) {
return absl_ports::NotFoundError("The proto data has been erased.");
}
- google::protobuf::io::ArrayInputStream proto_stream(buf.get(),
- stored_size);
+ google::protobuf::io::ArrayInputStream proto_stream(
+ mmapped_file.mutable_region(), stored_size);
// Deserialize proto
ProtoT proto;
if (header_->GetCompressFlag()) {
- protobuf_ports::GzipInputStream decompress_stream(&proto_stream);
+ google::protobuf::io::GzipInputStream decompress_stream(&proto_stream);
proto.ParseFromZeroCopyStream(&decompress_stream);
} else {
proto.ParseFromZeroCopyStream(&proto_stream);
@@ -988,29 +979,33 @@ template <typename ProtoT>
libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
int64_t file_offset) {
int64_t file_size = filesystem_->GetFileSize(fd_.get());
- if (file_size == Filesystem::kBadFileSize) {
- return absl_ports::OutOfRangeError("Unable to correctly read size.");
+ if (file_offset >= file_size) {
+ // file_size points to the next byte to write at, so subtract one to get
+ // the inclusive, actual size of the file.
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Trying to erase data at a location, %lld, "
+ "out of range of the file size, %lld",
+ static_cast<long long>(file_offset),
+ static_cast<long long>(file_size - 1)));
}
+ MemoryMappedFile mmapped_file(
+ *filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
+
+ // Read out the metadata
ICING_ASSIGN_OR_RETURN(
int32_t metadata,
- ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size));
- // Copy out however many bytes it says the proto is
- int stored_size = GetProtoSize(metadata);
- file_offset += sizeof(metadata);
- if (file_offset + stored_size > file_size) {
- return absl_ports::OutOfRangeError(
- IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
- "out of range of the file size, %lld",
- static_cast<long long>(file_offset),
- static_cast<long long>(file_size - 1)));
- }
- auto buf = std::make_unique<char[]>(stored_size);
+ ReadProtoMetadata(&mmapped_file, file_offset, file_size));
+
+ ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata),
+ GetProtoSize(metadata)));
// We need to update the crc checksum if the erased area is before the
// rewind position.
int32_t new_crc;
- if (file_offset < header_->GetRewindOffset()) {
+ int64_t erased_proto_offset = file_offset + sizeof(metadata);
+ if (erased_proto_offset < header_->GetRewindOffset()) {
// Set to "dirty" before we start writing anything.
header_->SetDirtyFlag(true);
header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
@@ -1023,30 +1018,24 @@ libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
// We need to calculate [original string xor 0s].
// The xored string is the same as the original string because 0 xor 0 =
// 0, 1 xor 0 = 1.
- // Read the compressed proto out.
- if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
- return absl_ports::InternalError("");
- }
- const std::string_view xored_str(buf.get(), stored_size);
+ const std::string_view xored_str(mmapped_file.region(),
+ mmapped_file.region_size());
Crc32 crc(header_->GetLogChecksum());
ICING_ASSIGN_OR_RETURN(
- new_crc,
- crc.UpdateWithXor(xored_str,
- /*full_data_size=*/header_->GetRewindOffset() -
- kHeaderReservedBytes,
- /*position=*/file_offset - kHeaderReservedBytes));
+ new_crc, crc.UpdateWithXor(
+ xored_str,
+ /*full_data_size=*/header_->GetRewindOffset() -
+ kHeaderReservedBytes,
+ /*position=*/erased_proto_offset - kHeaderReservedBytes));
}
// Clear the region.
- memset(buf.get(), '\0', stored_size);
- if (!filesystem_->PWrite(fd_.get(), file_offset, buf.get(), stored_size)) {
- return absl_ports::InternalError("");
- }
+ memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
// If we cleared something in our checksummed area, we should update our
// checksum and reset our dirty bit.
- if (file_offset < header_->GetRewindOffset()) {
+ if (erased_proto_offset < header_->GetRewindOffset()) {
header_->SetDirtyFlag(false);
header_->SetLogChecksum(new_crc);
header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
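The XOR step above works because the log checksum can be patched in place: when bytes at a known position change from old to new, updating the CRC with old XOR new produces the checksum of the modified stream without rescanning the file. Erasing writes zeros, and x ^ 0 == x, so the patch string is simply the original bytes. In sketch form, following the calls in the diff (UpdateWithXor as used above, from icing/util/crc32.h):

    // Patch the cached checksum for a region about to be zeroed.
    Crc32 crc(header_->GetLogChecksum());
    ICING_ASSIGN_OR_RETURN(
        int32_t new_crc,
        crc.UpdateWithXor(
            /*xored_str=*/std::string_view(region, region_size),
            /*full_data_size=*/header_->GetRewindOffset() - kHeaderReservedBytes,
            /*position=*/erased_proto_offset - kHeaderReservedBytes));
    header_->SetLogChecksum(new_crc);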
@@ -1084,12 +1073,13 @@ PortableFileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
template <typename ProtoT>
PortableFileBackedProtoLog<ProtoT>::Iterator::Iterator(
- const Filesystem& filesystem, int fd, int64_t initial_offset)
- : filesystem_(&filesystem),
+ const Filesystem& filesystem, const std::string& file_path,
+ int64_t initial_offset)
+ : mmapped_file_(filesystem, file_path,
+ MemoryMappedFile::Strategy::READ_ONLY),
initial_offset_(initial_offset),
current_offset_(kInvalidOffset),
- fd_(fd) {
- file_size_ = filesystem_->GetFileSize(fd_);
+ file_size_(filesystem.GetFileSize(file_path.c_str())) {
if (file_size_ == Filesystem::kBadFileSize) {
// Fails all Advance() calls
file_size_ = 0;
@@ -1106,7 +1096,7 @@ PortableFileBackedProtoLog<ProtoT>::Iterator::Advance() {
// Jumps to the next proto position
ICING_ASSIGN_OR_RETURN(
int32_t metadata,
- ReadProtoMetadata(filesystem_, fd_, current_offset_, file_size_));
+ ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
}
@@ -1128,15 +1118,14 @@ int64_t PortableFileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
template <typename ProtoT>
typename PortableFileBackedProtoLog<ProtoT>::Iterator
PortableFileBackedProtoLog<ProtoT>::GetIterator() {
- return Iterator(*filesystem_, fd_.get(),
+ return Iterator(*filesystem_, file_path_,
/*initial_offset=*/kHeaderReservedBytes);
}
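The iterator keeps its original contract; only the backing changed from a raw fd to a read-only MemoryMappedFile. A sketch of the usage pattern (this assumes Advance() returns a non-OK status, e.g. OUT_OF_RANGE, once the end of the log is reached):

    auto iterator = proto_log->GetIterator();
    while (iterator.Advance().ok()) {
      int64_t offset = iterator.GetOffset();
      // ReadProto(offset) yields the proto at this record, or NOT_FOUND if
      // it has been erased.
    }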
template <typename ProtoT>
libtextclassifier3::StatusOr<int32_t>
PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
- const Filesystem* const filesystem, int fd, int64_t file_offset,
- int64_t file_size) {
+ MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) {
// Checks file_offset
if (file_offset >= file_size) {
return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
@@ -1154,12 +1143,12 @@ PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
static_cast<long long>(file_size)));
}
- if (!filesystem->PRead(fd, &portable_metadata, metadata_size, file_offset)) {
- return absl_ports::InternalError("");
- }
+ // Reads metadata
+ ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size));
+ memcpy(&portable_metadata, mmapped_file->region(), metadata_size);
// Need to switch it back to host order endianness after reading from disk.
- int32_t host_order_metadata = GNetworkToHostL(portable_metadata);
+ int32_t host_order_metadata = gntohl(portable_metadata);
// Checks magic number
uint8_t stored_k_proto_magic = GetProtoMagic(host_order_metadata);
@@ -1177,7 +1166,7 @@ libtextclassifier3::Status
PortableFileBackedProtoLog<ProtoT>::WriteProtoMetadata(
const Filesystem* filesystem, int fd, int32_t host_order_metadata) {
// Convert it into portable endian format before writing to disk
- int32_t portable_metadata = GHostToNetworkL(host_order_metadata);
+ int32_t portable_metadata = ghtonl(host_order_metadata);
int portable_metadata_size = sizeof(portable_metadata);
// Write metadata
@@ -1197,7 +1186,21 @@ libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::PersistToDisk() {
return libtextclassifier3::Status::OK;
}
- ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
+ int64_t new_content_size = file_size - header_->GetRewindOffset();
+ Crc32 crc;
+ if (new_content_size < 0) {
+ // File shrunk, recalculate the entire checksum.
+ ICING_ASSIGN_OR_RETURN(
+ crc,
+ ComputeChecksum(filesystem_, file_path_, Crc32(),
+ /*start=*/kHeaderReservedBytes, /*end=*/file_size));
+ } else {
+ // Append new changes to the existing checksum.
+ ICING_ASSIGN_OR_RETURN(
+ crc, ComputeChecksum(filesystem_, file_path_,
+ Crc32(header_->GetLogChecksum()),
+ header_->GetRewindOffset(), file_size));
+ }
header_->SetLogChecksum(crc.Get());
header_->SetRewindOffset(file_size);
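PersistToDisk() can therefore update the cached checksum incrementally: the rewind offset records how many bytes the cached CRC already covers, so in the common append-only case only the new tail is hashed, and a shrunken file forces a full rescan from the end of the header. Schematically (status plumbing elided; names as in the diff):

    int64_t covered = header_->GetRewindOffset();
    Crc32 crc;
    if (file_size < covered) {
      // File shrank: recompute over [kHeaderReservedBytes, file_size).
      ICING_ASSIGN_OR_RETURN(
          crc, ComputeChecksum(filesystem_, file_path_, Crc32(),
                               /*start=*/kHeaderReservedBytes,
                               /*end=*/file_size));
    } else {
      // Append-only: extend the cached CRC over [covered, file_size).
      ICING_ASSIGN_OR_RETURN(
          crc, ComputeChecksum(filesystem_, file_path_,
                               Crc32(header_->GetLogChecksum()),
                               /*start=*/covered, /*end=*/file_size));
    }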
@@ -1216,26 +1219,9 @@ libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::PersistToDisk() {
template <typename ProtoT>
libtextclassifier3::StatusOr<Crc32>
PortableFileBackedProtoLog<ProtoT>::ComputeChecksum() {
- int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
- int64_t new_content_size = file_size - header_->GetRewindOffset();
- Crc32 crc;
- if (new_content_size == 0) {
- // No new protos appended, return cached checksum
- return Crc32(header_->GetLogChecksum());
- } else if (new_content_size < 0) {
- // File shrunk, recalculate the entire checksum.
- ICING_ASSIGN_OR_RETURN(
- crc,
- ComputeChecksum(filesystem_, file_path_, Crc32(),
- /*start=*/kHeaderReservedBytes, /*end=*/file_size));
- } else {
- // Append new changes to the existing checksum.
- ICING_ASSIGN_OR_RETURN(
- crc, ComputeChecksum(
- filesystem_, file_path_, Crc32(header_->GetLogChecksum()),
- /*start=*/header_->GetRewindOffset(), /*end=*/file_size));
- }
- return crc;
+ return PortableFileBackedProtoLog<ProtoT>::ComputeChecksum(
+ filesystem_, file_path_, Crc32(), /*start=*/kHeaderReservedBytes,
+ /*end=*/filesystem_->GetFileSize(file_path_.c_str()));
}
} // namespace lib
diff --git a/icing/file/portable-file-backed-proto-log_benchmark.cc b/icing/file/portable-file-backed-proto-log_benchmark.cc
index 80a8011..04ccab0 100644
--- a/icing/file/portable-file-backed-proto-log_benchmark.cc
+++ b/icing/file/portable-file-backed-proto-log_benchmark.cc
@@ -55,7 +55,7 @@ namespace lib {
namespace {
-void BM_Write(benchmark::State& state) {
+static void BM_Write(benchmark::State& state) {
const Filesystem filesystem;
int string_length = state.range(0);
const std::string file_path = IcingStringUtil::StringPrintf(
@@ -108,7 +108,7 @@ BENCHMARK(BM_Write)
// 16MiB, and we need some extra space for the
// rest of the document properties
-void BM_Read(benchmark::State& state) {
+static void BM_Read(benchmark::State& state) {
const Filesystem filesystem;
int string_length = state.range(0);
const std::string file_path = IcingStringUtil::StringPrintf(
@@ -164,7 +164,7 @@ BENCHMARK(BM_Read)
// 16MiB, and we need some extra space for the
// rest of the document properties
//
-void BM_Erase(benchmark::State& state) {
+static void BM_Erase(benchmark::State& state) {
const Filesystem filesystem;
const std::string file_path = IcingStringUtil::StringPrintf(
"%s%s", GetTestTempDir().c_str(), "/proto.log");
@@ -204,7 +204,7 @@ void BM_Erase(benchmark::State& state) {
}
BENCHMARK(BM_Erase);
-void BM_ComputeChecksum(benchmark::State& state) {
+static void BM_ComputeChecksum(benchmark::State& state) {
const Filesystem filesystem;
const std::string file_path = GetTestTempDir() + "/proto.log";
int max_proto_size = (1 << 24) - 1; // 16 MiB
@@ -246,98 +246,6 @@ void BM_ComputeChecksum(benchmark::State& state) {
}
BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20);
-void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) {
- const Filesystem filesystem;
- const std::string file_path = GetTestTempDir() + "/proto.log";
- int max_proto_size = (1 << 24) - 1; // 16 MiB
- bool compress = true;
-
- // Make sure it doesn't already exist.
- filesystem.DeleteFile(file_path.c_str());
-
- auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create(
- &filesystem, file_path,
- PortableFileBackedProtoLog<DocumentProto>::Options(
- compress, max_proto_size))
- .ValueOrDie()
- .proto_log;
-
- DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
- // Make the document 1KiB
- int string_length = 1024;
- std::default_random_engine random;
- const std::string rand_str =
- RandomString(kAlNumAlphabet, string_length, &random);
-
- auto document_properties = document.add_properties();
- document_properties->set_name("string property");
- document_properties->add_string_values(rand_str);
-
- // Write some content and persist. This should update our cached checksum to
- // include the document.
- ICING_ASSERT_OK(proto_log->WriteProto(document));
- ICING_ASSERT_OK(proto_log->PersistToDisk());
-
- // This ComputeChecksum call shouldn't need to do any computation since we can
- // reuse our cached checksum.
- for (auto _ : state) {
- testing::DoNotOptimize(proto_log->ComputeChecksum());
- }
-
- // Cleanup after ourselves
- filesystem.DeleteFile(file_path.c_str());
-}
-BENCHMARK(BM_ComputeChecksumWithCachedChecksum);
-
-void BM_ComputeChecksumOnlyForTail(benchmark::State& state) {
- const Filesystem filesystem;
- const std::string file_path = GetTestTempDir() + "/proto.log";
- int max_proto_size = (1 << 24) - 1; // 16 MiB
- bool compress = true;
-
- // Make sure it doesn't already exist.
- filesystem.DeleteFile(file_path.c_str());
-
- auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create(
- &filesystem, file_path,
- PortableFileBackedProtoLog<DocumentProto>::Options(
- compress, max_proto_size))
- .ValueOrDie()
- .proto_log;
-
- DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
- // Make the document 1KiB
- int string_length = 1024;
- std::default_random_engine random;
- const std::string rand_str =
- RandomString(kAlNumAlphabet, string_length, &random);
-
- auto document_properties = document.add_properties();
- document_properties->set_name("string property");
- document_properties->add_string_values(rand_str);
-
- // Write some content and persist. This should update our cached checksum to
- // include the document.
- ICING_ASSERT_OK(proto_log->WriteProto(document));
- ICING_ASSERT_OK(proto_log->PersistToDisk());
-
- // Write another proto into the tail, but it's not included in our cached
- // checksum since we didn't call persist.
- ICING_ASSERT_OK(proto_log->WriteProto(document));
-
- // ComputeChecksum should be calculating the checksum of the tail and adding
- // it to the cached checksum we have.
- for (auto _ : state) {
- testing::DoNotOptimize(proto_log->ComputeChecksum());
- }
-
- // Cleanup after ourselves
- filesystem.DeleteFile(file_path.c_str());
-}
-BENCHMARK(BM_ComputeChecksumOnlyForTail);
-
} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/file/portable-file-backed-proto-log_test.cc b/icing/file/portable-file-backed-proto-log_test.cc
index 795271a..b5fee4b 100644
--- a/icing/file/portable-file-backed-proto-log_test.cc
+++ b/icing/file/portable-file-backed-proto-log_test.cc
@@ -851,12 +851,11 @@ TEST_F(PortableFileBackedProtoLogTest, Iterator) {
{
// Iterator with bad filesystem
- ScopedFd sfd(filesystem_.OpenForRead(file_path_.c_str()));
MockFilesystem mock_filesystem;
- ON_CALL(mock_filesystem, GetFileSize(A<int>()))
+ ON_CALL(mock_filesystem, GetFileSize(A<const char*>()))
.WillByDefault(Return(Filesystem::kBadFileSize));
PortableFileBackedProtoLog<DocumentProto>::Iterator bad_iterator(
- mock_filesystem, sfd.get(), /*initial_offset=*/0);
+ mock_filesystem, file_path_, /*initial_offset=*/0);
ASSERT_THAT(bad_iterator.Advance(),
StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
}
diff --git a/icing/testing/icu-data-file-helper.cc b/icing/helpers/icu/icu-data-file-helper.cc
index aaeb738..6607c40 100644
--- a/icing/testing/icu-data-file-helper.cc
+++ b/icing/helpers/icu/icu-data-file-helper.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/testing/icu-data-file-helper.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include <sys/mman.h>
diff --git a/icing/testing/icu-data-file-helper.h b/icing/helpers/icu/icu-data-file-helper.h
index d0276e7..90f5bc7 100644
--- a/icing/testing/icu-data-file-helper.h
+++ b/icing/helpers/icu/icu-data-file-helper.h
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_TESTING_ICU_DATA_FILE_HELPER
-#define ICING_TESTING_ICU_DATA_FILE_HELPER
+#ifndef ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
+#define ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
#include "icing/text_classifier/lib3/utils/base/status.h"
@@ -40,4 +40,4 @@ libtextclassifier3::Status SetUpICUDataFile(
} // namespace lib
} // namespace icing
-#endif // ICING_TESTING_ICU_DATA_FILE_HELPER
+#endif // ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
diff --git a/icing/icing-search-engine-with-icu-file_test.cc b/icing/icing-search-engine-with-icu-file_test.cc
index 1012b47..48e81e5 100644
--- a/icing/icing-search-engine-with-icu-file_test.cc
+++ b/icing/icing-search-engine-with-icu-file_test.cc
@@ -37,13 +37,13 @@ namespace {
using ::icing::lib::portable_equals_proto::EqualsProto;
using ::testing::Eq;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
- PropertyConfigProto::Cardinality::REQUIRED;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED =
+ PropertyConfigProto_Cardinality_Code_REQUIRED;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
+ StringIndexingConfig_TokenizerType_Code_PLAIN;
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
+constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
std::string GetTestBaseDir() {
return GetTestTempDir() + "/icing_with_icu_files";
diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc
index 952ba21..20a6bb9 100644
--- a/icing/icing-search-engine.cc
+++ b/icing/icing-search-engine.cc
@@ -18,7 +18,6 @@
#include <memory>
#include <string>
#include <string_view>
-#include <unordered_map>
#include <utility>
#include <vector>
@@ -36,7 +35,6 @@
#include "icing/index/index.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/legacy/index/icing-filesystem.h"
-#include "icing/portable/endian.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
#include "icing/proto/internal/optimize.pb.h"
@@ -48,7 +46,6 @@
#include "icing/proto/search.pb.h"
#include "icing/proto/status.pb.h"
#include "icing/query/query-processor.h"
-#include "icing/query/suggestion-processor.h"
#include "icing/result/projection-tree.h"
#include "icing/result/projector.h"
#include "icing/result/result-retriever.h"
@@ -60,7 +57,6 @@
#include "icing/scoring/scoring-processor.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
-#include "icing/store/namespace-checker-impl.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer-factory.h"
@@ -81,30 +77,19 @@ constexpr std::string_view kDocumentSubfolderName = "document_dir";
constexpr std::string_view kIndexSubfolderName = "index_dir";
constexpr std::string_view kSchemaSubfolderName = "schema_dir";
constexpr std::string_view kSetSchemaMarkerFilename = "set_schema_marker";
-constexpr std::string_view kInitMarkerFilename = "init_marker";
constexpr std::string_view kOptimizeStatusFilename = "optimize_status";
-// The maximum number of unsuccessful initialization attempts from the current
-// state that we will tolerate before deleting all data and starting from a
-// fresh state.
-constexpr int kMaxUnsuccessfulInitAttempts = 5;
-
-// A pair that holds namespace and type.
-struct NamespaceTypePair {
- std::string namespace_;
- std::string type;
-
- bool operator==(const NamespaceTypePair& other) const {
- return namespace_ == other.namespace_ && type == other.type;
- }
-};
-
-struct NamespaceTypePairHasher {
- std::size_t operator()(const NamespaceTypePair& pair) const {
- return std::hash<std::string>()(pair.namespace_) ^
- std::hash<std::string>()(pair.type);
+libtextclassifier3::Status ValidateOptions(
+ const IcingSearchEngineOptions& options) {
+ // These options are only used in IndexProcessor, which won't be created
+ // until the first Put call. They must be checked here so that any
+ // errors can be surfaced in Initialize.
+ if (options.max_tokens_per_doc() <= 0) {
+ return absl_ports::InvalidArgumentError(
+ "Options::max_tokens_per_doc must be greater than zero.");
}
-};
+ return libtextclassifier3::Status::OK;
+}
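Validating options during initialization means a bad configuration is reported once, up front, rather than at the first Put call. A sketch of what a caller would observe (directory and values hypothetical):

    IcingSearchEngineOptions options;
    options.set_base_dir("/tmp/icing");   // hypothetical directory
    options.set_max_tokens_per_doc(0);    // invalid: must be > 0
    IcingSearchEngine icing(options);
    InitializeResultProto result = icing.Initialize();
    // result.status().code() is INVALID_ARGUMENT, surfaced by
    // ValidateOptions() rather than by the first Put().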
libtextclassifier3::Status ValidateResultSpec(
const ResultSpecProto& result_spec) {
@@ -142,29 +127,14 @@ libtextclassifier3::Status ValidateSearchSpec(
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status ValidateSuggestionSpec(
- const SuggestionSpecProto& suggestion_spec,
- const PerformanceConfiguration& configuration) {
- if (suggestion_spec.prefix().empty()) {
- return absl_ports::InvalidArgumentError(
- absl_ports::StrCat("SuggestionSpecProto.prefix is empty!"));
- }
- if (suggestion_spec.scoring_spec().scoring_match_type() ==
- TermMatchType::UNKNOWN) {
- return absl_ports::InvalidArgumentError(
- absl_ports::StrCat("SuggestionSpecProto.term_match_type is unknown!"));
- }
- if (suggestion_spec.num_to_return() <= 0) {
- return absl_ports::InvalidArgumentError(absl_ports::StrCat(
- "SuggestionSpecProto.num_to_return must be positive."));
- }
- if (suggestion_spec.prefix().size() > configuration.max_query_length) {
- return absl_ports::InvalidArgumentError(
- absl_ports::StrCat("SuggestionSpecProto.prefix is longer than the "
- "maximum allowed prefix length: ",
- std::to_string(configuration.max_query_length)));
- }
- return libtextclassifier3::Status::OK;
+IndexProcessor::Options CreateIndexProcessorOptions(
+ const IcingSearchEngineOptions& options) {
+ IndexProcessor::Options index_processor_options;
+ index_processor_options.max_tokens_per_document =
+ options.max_tokens_per_doc();
+ index_processor_options.token_limit_behavior =
+ IndexProcessor::Options::TokenLimitBehavior::kSuppressError;
+ return index_processor_options;
}
// Document store files are in a standalone subfolder for easier file
@@ -194,15 +164,10 @@ std::string MakeIndexDirectoryPath(const std::string& base_dir) {
std::string MakeSchemaDirectoryPath(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kSchemaSubfolderName);
}
-
std::string MakeSetSchemaMarkerFilePath(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kSetSchemaMarkerFilename);
}
-std::string MakeInitMarkerFilePath(const std::string& base_dir) {
- return absl_ports::StrCat(base_dir, "/", kInitMarkerFilename);
-}
-
void TransformStatus(const libtextclassifier3::Status& internal_status,
StatusProto* status_proto) {
StatusProto::Code code;
@@ -273,28 +238,6 @@ void TransformStatus(const libtextclassifier3::Status& internal_status,
status_proto->set_message(internal_status.error_message());
}
-libtextclassifier3::Status RetrieveAndAddDocumentInfo(
- const DocumentStore* document_store, DeleteByQueryResultProto& result_proto,
- std::unordered_map<NamespaceTypePair,
- DeleteByQueryResultProto::DocumentGroupInfo*,
- NamespaceTypePairHasher>& info_map,
- DocumentId document_id) {
- ICING_ASSIGN_OR_RETURN(DocumentProto document,
- document_store->Get(document_id));
- NamespaceTypePair key = {document.namespace_(), document.schema()};
- auto iter = info_map.find(key);
- if (iter == info_map.end()) {
- auto entry = result_proto.add_deleted_documents();
- entry->set_namespace_(std::move(document.namespace_()));
- entry->set_schema(std::move(document.schema()));
- entry->add_uris(std::move(document.uri()));
- info_map[key] = entry;
- } else {
- iter->second->add_uris(std::move(document.uri()));
- }
- return libtextclassifier3::Status::OK;
-}
-
} // namespace
IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options,
@@ -333,66 +276,6 @@ InitializeResultProto IcingSearchEngine::Initialize() {
return InternalInitialize();
}
-void IcingSearchEngine::ResetMembers() {
- schema_store_.reset();
- document_store_.reset();
- language_segmenter_.reset();
- normalizer_.reset();
- index_.reset();
-}
-
-libtextclassifier3::Status IcingSearchEngine::CheckInitMarkerFile(
- InitializeStatsProto* initialize_stats) {
- // Check to see if the marker file exists and if we've already passed our max
- // number of init attempts.
- std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir());
- bool file_exists = filesystem_->FileExists(marker_filepath.c_str());
- int network_init_attempts = 0;
- int host_init_attempts = 0;
-
- // Read the number of previous failed init attempts from the file. If it
- // fails, then just assume the value is zero (the most likely reason for
- // failure would be non-existence because the last init was successful
- // anyways).
- ScopedFd marker_file_fd(filesystem_->OpenForWrite(marker_filepath.c_str()));
- libtextclassifier3::Status status;
- if (file_exists &&
- filesystem_->PRead(marker_file_fd.get(), &network_init_attempts,
- sizeof(network_init_attempts), /*offset=*/0)) {
- host_init_attempts = GNetworkToHostL(network_init_attempts);
- if (host_init_attempts > kMaxUnsuccessfulInitAttempts) {
- // We've tried and failed to init too many times. We need to throw
- // everything out and start from scratch.
- ResetMembers();
- if (!filesystem_->DeleteDirectoryRecursively(
- options_.base_dir().c_str())) {
- return absl_ports::InternalError("Failed to delete icing base dir!");
- }
- status = absl_ports::DataLossError(
- "Encountered failed initialization limit. Cleared all data.");
- host_init_attempts = 0;
- }
- }
-
- // Use network_init_attempts here because we might have set host_init_attempts
- // to 0 if it exceeded the max threshold.
- initialize_stats->set_num_previous_init_failures(
- GNetworkToHostL(network_init_attempts));
-
- ++host_init_attempts;
- network_init_attempts = GHostToNetworkL(host_init_attempts);
- // Write the updated number of attempts before we get started.
- if (!filesystem_->PWrite(marker_file_fd.get(), /*offset=*/0,
- &network_init_attempts,
- sizeof(network_init_attempts)) ||
- !filesystem_->DataSync(marker_file_fd.get())) {
- return absl_ports::InternalError(
- "Failed to write and sync init marker file");
- }
-
- return status;
-}
-
InitializeResultProto IcingSearchEngine::InternalInitialize() {
ICING_VLOG(1) << "Initializing IcingSearchEngine in dir: "
<< options_.base_dir();
@@ -413,17 +296,9 @@ InitializeResultProto IcingSearchEngine::InternalInitialize() {
return result_proto;
}
- // Now go ahead and try to initialize.
libtextclassifier3::Status status = InitializeMembers(initialize_stats);
if (status.ok() || absl_ports::IsDataLoss(status)) {
- // We successfully initialized. We should delete the init marker file to
- // indicate a successful init.
- std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir());
- if (!filesystem_->DeleteFile(marker_filepath.c_str())) {
- status = absl_ports::InternalError("Failed to delete init marker file!");
- } else {
- initialized_ = true;
- }
+ initialized_ = true;
}
TransformStatus(status, result_status);
initialize_stats->set_latency_ms(initialize_timer->GetElapsedMilliseconds());
@@ -433,20 +308,7 @@ InitializeResultProto IcingSearchEngine::InternalInitialize() {
libtextclassifier3::Status IcingSearchEngine::InitializeMembers(
InitializeStatsProto* initialize_stats) {
ICING_RETURN_ERROR_IF_NULL(initialize_stats);
-
- // Make sure the base directory exists
- if (!filesystem_->CreateDirectoryRecursively(options_.base_dir().c_str())) {
- return absl_ports::InternalError(absl_ports::StrCat(
- "Could not create directory: ", options_.base_dir()));
- }
-
- // Check to see if the marker file exists and if we've already passed our max
- // number of init attempts.
- libtextclassifier3::Status status = CheckInitMarkerFile(initialize_stats);
- if (!status.ok() && !absl_ports::IsDataLoss(status)) {
- return status;
- }
-
+ ICING_RETURN_IF_ERROR(InitializeOptions());
ICING_RETURN_IF_ERROR(InitializeSchemaStore(initialize_stats));
// TODO(b/156383798) : Resolve how to specify the locale.
@@ -460,7 +322,7 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers(
std::string marker_filepath =
MakeSetSchemaMarkerFilePath(options_.base_dir());
- libtextclassifier3::Status index_init_status;
+ libtextclassifier3::Status status;
if (absl_ports::IsNotFound(schema_store_->GetSchema().status())) {
// The schema was either lost or never set before. Wipe out the doc store
// and index directories and initialize them from scratch.
@@ -474,15 +336,14 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers(
}
ICING_RETURN_IF_ERROR(InitializeDocumentStore(
/*force_recovery_and_revalidate_documents=*/false, initialize_stats));
- index_init_status = InitializeIndex(initialize_stats);
- if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
- return index_init_status;
- }
+ status = InitializeIndex(initialize_stats);
} else if (filesystem_->FileExists(marker_filepath.c_str())) {
// If the marker file is still around then something wonky happened when we
// last tried to set the schema.
ICING_RETURN_IF_ERROR(InitializeDocumentStore(
/*force_recovery_and_revalidate_documents=*/true, initialize_stats));
+ initialize_stats->set_document_store_recovery_cause(
+ InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
// We're going to need to build the index from scratch. So just delete its
// files now.
@@ -499,12 +360,12 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers(
std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer();
IndexRestorationResult restore_result = RestoreIndexIfNeeded();
- index_init_status = std::move(restore_result.status);
+ status = std::move(restore_result.status);
// DATA_LOSS means that we have successfully initialized and re-added
// content to the index. Some indexed content was lost, but otherwise the
// index is in a valid state and can be queried.
- if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
- return index_init_status;
+ if (!status.ok() && !absl_ports::IsDataLoss(status)) {
+ return status;
}
// Delete the marker file to indicate that everything is now in sync with
@@ -518,22 +379,30 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers(
} else {
ICING_RETURN_IF_ERROR(InitializeDocumentStore(
/*force_recovery_and_revalidate_documents=*/false, initialize_stats));
- index_init_status = InitializeIndex(initialize_stats);
- if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
- return index_init_status;
+ status = InitializeIndex(initialize_stats);
+ if (!status.ok() && !absl_ports::IsDataLoss(status)) {
+ return status;
}
}
- if (status.ok()) {
- status = index_init_status;
- }
-
result_state_manager_ = std::make_unique<ResultStateManager>(
performance_configuration_.max_num_total_hits, *document_store_);
return status;
}
+libtextclassifier3::Status IcingSearchEngine::InitializeOptions() {
+ ICING_RETURN_IF_ERROR(ValidateOptions(options_));
+
+ // Make sure the base directory exists
+ if (!filesystem_->CreateDirectoryRecursively(options_.base_dir().c_str())) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Could not create directory: ", options_.base_dir()));
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore(
InitializeStatsProto* initialize_stats) {
ICING_RETURN_ERROR_IF_NULL(initialize_stats);
@@ -633,18 +502,15 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
StatusProto* result_status = result_proto.mutable_status();
absl_ports::unique_lock l(&mutex_);
- std::unique_ptr<Timer> timer = clock_->GetNewTimer();
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
- result_proto.set_latency_ms(timer->GetElapsedMilliseconds());
return result_proto;
}
auto lost_previous_schema_or = LostPreviousSchema();
if (!lost_previous_schema_or.ok()) {
TransformStatus(lost_previous_schema_or.status(), result_status);
- result_proto.set_latency_ms(timer->GetElapsedMilliseconds());
return result_proto;
}
bool lost_previous_schema = lost_previous_schema_or.ValueOrDie();
@@ -662,11 +528,10 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
std::move(new_schema), ignore_errors_and_delete_documents);
if (!set_schema_result_or.ok()) {
TransformStatus(set_schema_result_or.status(), result_status);
- result_proto.set_latency_ms(timer->GetElapsedMilliseconds());
return result_proto;
}
- SchemaStore::SetSchemaResult set_schema_result =
- std::move(set_schema_result_or).ValueOrDie();
+ const SchemaStore::SetSchemaResult set_schema_result =
+ set_schema_result_or.ValueOrDie();
for (const std::string& deleted_type :
set_schema_result.schema_types_deleted_by_name) {
@@ -678,25 +543,6 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
result_proto.add_incompatible_schema_types(incompatible_type);
}
- for (const std::string& new_type :
- set_schema_result.schema_types_new_by_name) {
- result_proto.add_new_schema_types(std::move(new_type));
- }
-
- for (const std::string& compatible_type :
- set_schema_result.schema_types_changed_fully_compatible_by_name) {
- result_proto.add_fully_compatible_changed_schema_types(
- std::move(compatible_type));
- }
-
- bool index_incompatible =
- !set_schema_result.schema_types_index_incompatible_by_name.empty();
- for (const std::string& index_incompatible_type :
- set_schema_result.schema_types_index_incompatible_by_name) {
- result_proto.add_index_incompatible_changed_schema_types(
- std::move(index_incompatible_type));
- }
-
libtextclassifier3::Status status;
if (set_schema_result.success) {
if (lost_previous_schema) {
@@ -705,7 +551,6 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
status = document_store_->UpdateSchemaStore(schema_store_.get());
if (!status.ok()) {
TransformStatus(status, result_status);
- result_proto.set_latency_ms(timer->GetElapsedMilliseconds());
return result_proto;
}
} else if (!set_schema_result.old_schema_type_ids_changed.empty() ||
@@ -715,17 +560,15 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
set_schema_result);
if (!status.ok()) {
TransformStatus(status, result_status);
- result_proto.set_latency_ms(timer->GetElapsedMilliseconds());
return result_proto;
}
}
- if (lost_previous_schema || index_incompatible) {
+ if (lost_previous_schema || set_schema_result.index_incompatible) {
// Clears all index files
status = index_->Reset();
if (!status.ok()) {
TransformStatus(status, result_status);
- result_proto.set_latency_ms(timer->GetElapsedMilliseconds());
return result_proto;
}
@@ -736,7 +579,6 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
if (!restore_result.status.ok() &&
!absl_ports::IsDataLoss(restore_result.status)) {
TransformStatus(status, result_status);
- result_proto.set_latency_ms(timer->GetElapsedMilliseconds());
return result_proto;
}
}
@@ -747,7 +589,6 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
result_status->set_message("Schema is incompatible.");
}
- result_proto.set_latency_ms(timer->GetElapsedMilliseconds());
return result_proto;
}
@@ -841,8 +682,9 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
}
DocumentId document_id = document_id_or.ValueOrDie();
- auto index_processor_or =
- IndexProcessor::Create(normalizer_.get(), index_.get(), clock_.get());
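+ // Build the IndexProcessor's options from the engine-wide options (which
+ // now carry limits such as max_tokens_per_doc) before creating it.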
+ auto index_processor_or = IndexProcessor::Create(
+ normalizer_.get(), index_.get(), CreateIndexProcessorOptions(options_),
+ clock_.get());
if (!index_processor_or.ok()) {
TransformStatus(index_processor_or.status(), result_status);
put_document_stats->set_latency_ms(put_timer->GetElapsedMilliseconds());
@@ -853,17 +695,6 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
auto status = index_processor->IndexDocument(tokenized_document, document_id,
put_document_stats);
- if (!status.ok()) {
- // If we encountered a failure while indexing this document, then mark it as
- // deleted.
- libtextclassifier3::Status delete_status =
- document_store_->Delete(document_id);
- if (!delete_status.ok()) {
- // This is pretty dire (and, hopefully, unlikely). We can't roll back the
- // document that we just added. Wipeout the whole index.
- ResetInternal();
- }
- }
TransformStatus(status, result_status);
put_document_stats->set_latency_ms(put_timer->GetElapsedMilliseconds());
@@ -972,7 +803,7 @@ DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space,
delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SINGLE);
std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
- // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = document_store_->Delete(name_space, uri);
if (!status.ok()) {
@@ -1006,7 +837,7 @@ DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace(
delete_stats->set_delete_type(DeleteStatsProto::DeleteType::NAMESPACE);
std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
- // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
DocumentStore::DeleteByGroupResult doc_store_result =
document_store_->DeleteByNamespace(name_space);
@@ -1040,7 +871,7 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SCHEMA_TYPE);
std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
- // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
DocumentStore::DeleteByGroupResult doc_store_result =
document_store_->DeleteBySchemaType(schema_type);
@@ -1058,7 +889,7 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
}
DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
- const SearchSpecProto& search_spec, bool return_deleted_document_info) {
+ const SearchSpecProto& search_spec) {
ICING_VLOG(1) << "Deleting documents for query " << search_spec.query()
<< " from doc store";
@@ -1072,13 +903,9 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
return result_proto;
}
- DeleteByQueryStatsProto* delete_stats =
- result_proto.mutable_delete_by_query_stats();
- delete_stats->set_query_length(search_spec.query().length());
- delete_stats->set_num_namespaces_filtered(
- search_spec.namespace_filters_size());
- delete_stats->set_num_schema_types_filtered(
- search_spec.schema_type_filters_size());
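+ // Deletions by query are recorded in the generic DeleteStatsProto, tagged
+ // with the QUERY delete type.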
+ DeleteStatsProto* delete_stats = result_proto.mutable_delete_stats();
+ delete_stats->set_delete_type(DeleteStatsProto::DeleteType::QUERY);
+
std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
libtextclassifier3::Status status =
@@ -1088,7 +915,6 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
return result_proto;
}
- std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
// Gets unordered results from query processor
auto query_processor_or = QueryProcessor::Create(
index_.get(), language_segmenter_.get(), normalizer_.get(),
@@ -1107,32 +933,14 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
}
QueryProcessor::QueryResults query_results =
std::move(query_results_or).ValueOrDie();
- delete_stats->set_parse_query_latency_ms(
- component_timer->GetElapsedMilliseconds());
ICING_VLOG(2) << "Deleting the docs that matched the query.";
int num_deleted = 0;
- // A map used to group deleted documents.
- // From the (namespace, type) pair to a list of uris.
- std::unordered_map<NamespaceTypePair,
- DeleteByQueryResultProto::DocumentGroupInfo*,
- NamespaceTypePairHasher>
- deleted_info_map;
- component_timer = clock_->GetNewTimer();
while (query_results.root_iterator->Advance().ok()) {
ICING_VLOG(3) << "Deleting doc "
<< query_results.root_iterator->doc_hit_info().document_id();
++num_deleted;
- if (return_deleted_document_info) {
- status = RetrieveAndAddDocumentInfo(
- document_store_.get(), result_proto, deleted_info_map,
- query_results.root_iterator->doc_hit_info().document_id());
- if (!status.ok()) {
- TransformStatus(status, result_status);
- return result_proto;
- }
- }
status = document_store_->Delete(
query_results.root_iterator->doc_hit_info().document_id());
if (!status.ok()) {
@@ -1140,13 +948,6 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
return result_proto;
}
}
- delete_stats->set_document_removal_latency_ms(
- component_timer->GetElapsedMilliseconds());
- int term_count = 0;
- for (const auto& section_and_terms : query_results.query_terms) {
- term_count += section_and_terms.second.size();
- }
- delete_stats->set_num_terms(term_count);
if (num_deleted > 0) {
result_proto.mutable_status()->set_code(StatusProto::OK);
@@ -1201,8 +1002,12 @@ OptimizeResultProto IcingSearchEngine::Optimize() {
std::unique_ptr<Timer> optimize_timer = clock_->GetNewTimer();
OptimizeStatsProto* optimize_stats = result_proto.mutable_optimize_stats();
int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
- optimize_stats->set_storage_size_before(
- Filesystem::SanitizeFileSize(before_size));
+ if (before_size != Filesystem::kBadFileSize) {
+ optimize_stats->set_storage_size_before(before_size);
+ } else {
+ // Set -1 as a sentinel value when failures occur.
+ optimize_stats->set_storage_size_before(-1);
+ }
// Flushes data to disk before doing optimization
auto status = InternalPersistToDisk(PersistType::FULL);
@@ -1279,8 +1084,12 @@ OptimizeResultProto IcingSearchEngine::Optimize() {
optimize_status_file.Write(std::move(optimize_status));
int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
- optimize_stats->set_storage_size_after(
- Filesystem::SanitizeFileSize(after_size));
+ if (after_size != Filesystem::kBadFileSize) {
+ optimize_stats->set_storage_size_after(after_size);
+ } else {
+ // Set -1 as a sentinel value when failures occur.
+ optimize_stats->set_storage_size_after(-1);
+ }
optimize_stats->set_latency_ms(optimize_timer->GetElapsedMilliseconds());
TransformStatus(optimization_status, result_status);
@@ -1362,8 +1171,11 @@ StorageInfoResultProto IcingSearchEngine::GetStorageInfo() {
}
int64_t index_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
- result.mutable_storage_info()->set_total_storage_size(
- Filesystem::SanitizeFileSize(index_size));
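+ // GetDiskUsage() returns kBadFileSize on failure; report -1 as a sentinel
+ // instead of propagating the bad value.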
+ if (index_size != Filesystem::kBadFileSize) {
+ result.mutable_storage_info()->set_total_storage_size(index_size);
+ } else {
+ result.mutable_storage_info()->set_total_storage_size(-1);
+ }
*result.mutable_storage_info()->mutable_document_storage_info() =
document_store_->GetStorageInfo();
*result.mutable_storage_info()->mutable_schema_store_storage_info() =
@@ -1453,8 +1265,8 @@ SearchResultProto IcingSearchEngine::Search(
component_timer = clock_->GetNewTimer();
// Scores but does not rank the results.
libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>>
- scoring_processor_or = ScoringProcessor::Create(
- scoring_spec, document_store_.get(), schema_store_.get());
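+ // Scoring here only needs the document store; no schema store is passed.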
+ scoring_processor_or =
+ ScoringProcessor::Create(scoring_spec, document_store_.get());
if (!scoring_processor_or.ok()) {
TransformStatus(scoring_processor_or.status(), result_status);
return result_proto;
@@ -1765,8 +1577,9 @@ IcingSearchEngine::RestoreIndexIfNeeded() {
return {libtextclassifier3::Status::OK, false};
}
- auto index_processor_or =
- IndexProcessor::Create(normalizer_.get(), index_.get(), clock_.get());
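+ // As in Put(), thread the options-derived limits into the IndexProcessor.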
+ auto index_processor_or = IndexProcessor::Create(
+ normalizer_.get(), index_.get(), CreateIndexProcessorOptions(options_),
+ clock_.get());
if (!index_processor_or.ok()) {
return {index_processor_or.status(), true};
}
@@ -1844,18 +1657,22 @@ libtextclassifier3::StatusOr<bool> IcingSearchEngine::LostPreviousSchema() {
}
ResetResultProto IcingSearchEngine::Reset() {
- absl_ports::unique_lock l(&mutex_);
- return ResetInternal();
-}
-
-ResetResultProto IcingSearchEngine::ResetInternal() {
ICING_VLOG(1) << "Resetting IcingSearchEngine";
ResetResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
+ absl_ports::unique_lock l(&mutex_);
+
initialized_ = false;
- ResetMembers();
+
+ // Resets member variables
+ schema_store_.reset();
+ document_store_.reset();
+ language_segmenter_.reset();
+ normalizer_.reset();
+ index_.reset();
+
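+ // With the members torn down, the underlying files can be deleted safely.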
if (!filesystem_->DeleteDirectoryRecursively(options_.base_dir().c_str())) {
result_status->set_code(StatusProto::INTERNAL);
return result_proto;
@@ -1881,65 +1698,5 @@ ResetResultProto IcingSearchEngine::ResetInternal() {
return result_proto;
}
-SuggestionResponse IcingSearchEngine::SearchSuggestions(
- const SuggestionSpecProto& suggestion_spec) {
- // TODO(b/146008613) Explore ideas to make this function read-only.
- absl_ports::unique_lock l(&mutex_);
- SuggestionResponse response;
- StatusProto* response_status = response.mutable_status();
- if (!initialized_) {
- response_status->set_code(StatusProto::FAILED_PRECONDITION);
- response_status->set_message("IcingSearchEngine has not been initialized!");
- return response;
- }
-
- libtextclassifier3::Status status =
- ValidateSuggestionSpec(suggestion_spec, performance_configuration_);
- if (!status.ok()) {
- TransformStatus(status, response_status);
- return response;
- }
-
- // Create the suggestion processor.
- auto suggestion_processor_or = SuggestionProcessor::Create(
- index_.get(), language_segmenter_.get(), normalizer_.get());
- if (!suggestion_processor_or.ok()) {
- TransformStatus(suggestion_processor_or.status(), response_status);
- return response;
- }
- std::unique_ptr<SuggestionProcessor> suggestion_processor =
- std::move(suggestion_processor_or).ValueOrDie();
-
- std::unordered_set<NamespaceId> namespace_ids;
- namespace_ids.reserve(suggestion_spec.namespace_filters_size());
- for (std::string_view name_space : suggestion_spec.namespace_filters()) {
- auto namespace_id_or = document_store_->GetNamespaceId(name_space);
- if (!namespace_id_or.ok()) {
- continue;
- }
- namespace_ids.insert(namespace_id_or.ValueOrDie());
- }
-
- // Run suggestion based on given SuggestionSpec.
- NamespaceCheckerImpl namespace_checker_impl(document_store_.get(),
- std::move(namespace_ids));
- libtextclassifier3::StatusOr<std::vector<TermMetadata>> terms_or =
- suggestion_processor->QuerySuggestions(suggestion_spec,
- &namespace_checker_impl);
- if (!terms_or.ok()) {
- TransformStatus(terms_or.status(), response_status);
- return response;
- }
-
- // Convert vector<TermMetaData> into final SuggestionResponse proto.
- for (TermMetadata& term : terms_or.ValueOrDie()) {
- SuggestionResponse::Suggestion suggestion;
- suggestion.set_query(std::move(term.content));
- response.mutable_suggestions()->Add(std::move(suggestion));
- }
- response_status->set_code(StatusProto::OK);
- return response;
-}
-
} // namespace lib
} // namespace icing
diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h
index ff9c7fb..855401f 100644
--- a/icing/icing-search-engine.h
+++ b/icing/icing-search-engine.h
@@ -280,9 +280,8 @@ class IcingSearchEngine {
// NOT_FOUND if the query doesn't match any documents
// FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
// INTERNAL_ERROR on IO error
- DeleteByQueryResultProto DeleteByQuery(
- const SearchSpecProto& search_spec,
- bool return_deleted_document_info = false) ICING_LOCKS_EXCLUDED(mutex_);
+ DeleteByQueryResultProto DeleteByQuery(const SearchSpecProto& search_spec)
+ ICING_LOCKS_EXCLUDED(mutex_);
// Retrieves, scores, ranks, and returns the results according to the specs.
// Results can be empty. If there are multiple pages of results,
@@ -303,17 +302,6 @@ class IcingSearchEngine {
const ResultSpecProto& result_spec)
ICING_LOCKS_EXCLUDED(mutex_);
- // Retrieves, scores, ranks and returns the suggested query string according
- // to the specs. Results can be empty.
- //
- // Returns a SuggestionResponse with status:
- // OK with results on success
- // INVALID_ARGUMENT if any of specs is invalid
- // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
- // INTERNAL_ERROR on any other errors
- SuggestionResponse SearchSuggestions(
- const SuggestionSpecProto& suggestion_spec) ICING_LOCKS_EXCLUDED(mutex_);
-
// Fetches the next page of results of a previously executed query. Results
// can be empty if the next-page token is invalid. Invalid next-page tokens are
// tokens that are either zero or were previously passed to
@@ -464,25 +452,6 @@ class IcingSearchEngine {
// Pointer to JNI class references
const std::unique_ptr<const JniCache> jni_cache_;
- // Resets all members that are created during Initialize.
- void ResetMembers() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-
- // Resets all members that are created during Initialize, deletes all
- // underlying files and initializes a fresh index.
- ResetResultProto ResetInternal() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-
- // Checks for the existence of the init marker file. If the failed init count
- // exceeds kMaxUnsuccessfulInitAttempts, all data is deleted and the index is
- // initialized from scratch. The updated count (original failed init count + 1
- // ) is written to the marker file.
- //
- // RETURNS
- // OK on success
- // INTERNAL if an IO error occurs while trying to update the marker file.
- libtextclassifier3::Status CheckInitMarkerFile(
- InitializeStatsProto* initialize_stats)
- ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-
// Helper method to do the actual work to persist data to disk. We need this
// separate method so that other public methods don't need to call
// PersistToDisk(). Public methods calling each other may cause deadlock
@@ -508,6 +477,15 @@ class IcingSearchEngine {
InitializeStatsProto* initialize_stats)
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ // Do any validation/setup required for the given IcingSearchEngineOptions.
+ //
+ // Returns:
+ // OK on success
+ // INVALID_ARGUMENT if options has invalid values
+ // INTERNAL on I/O error
+ libtextclassifier3::Status InitializeOptions()
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
// Do any initialization/recovery necessary to create a SchemaStore instance.
//
// Returns:
diff --git a/icing/icing-search-engine_benchmark.cc b/icing/icing-search-engine_benchmark.cc
index 5e610d5..ba9aed1 100644
--- a/icing/icing-search-engine_benchmark.cc
+++ b/icing/icing-search-engine_benchmark.cc
@@ -43,6 +43,7 @@
#include "icing/testing/common-matchers.h"
#include "icing/testing/document-generator.h"
#include "icing/testing/random-string.h"
+#include "icing/testing/recorder-test-utils.h"
#include "icing/testing/schema-generator.h"
#include "icing/testing/tmp-directory.h"
@@ -177,12 +178,12 @@ class DestructibleDirectory {
};
std::vector<DocumentProto> GenerateRandomDocuments(
- EvenDistributionTypeSelector* type_selector, int num_docs,
- const std::vector<std::string>& language) {
+ EvenDistributionTypeSelector* type_selector, int num_docs) {
std::vector<std::string> namespaces = CreateNamespaces(kAvgNumNamespaces);
EvenDistributionNamespaceSelector namespace_selector(namespaces);
std::default_random_engine random;
+ std::vector<std::string> language = CreateLanguages(kLanguageSize, &random);
UniformDistributionLanguageTokenGenerator<std::default_random_engine>
token_generator(language, &random);
@@ -226,9 +227,8 @@ void BM_IndexLatency(benchmark::State& state) {
ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
int num_docs = state.range(0);
- std::vector<std::string> language = CreateLanguages(kLanguageSize, &random);
const std::vector<DocumentProto> random_docs =
- GenerateRandomDocuments(&type_selector, num_docs, language);
+ GenerateRandomDocuments(&type_selector, num_docs);
Timer timer;
for (const DocumentProto& doc : random_docs) {
ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk());
@@ -271,56 +271,6 @@ BENCHMARK(BM_IndexLatency)
->ArgPair(1 << 15, 10)
->ArgPair(1 << 17, 10);
-void BM_QueryLatency(benchmark::State& state) {
- // Initialize the filesystem
- std::string test_dir = GetTestTempDir() + "/icing/benchmark";
- Filesystem filesystem;
- DestructibleDirectory ddir(filesystem, test_dir);
-
- // Create the schema.
- std::default_random_engine random;
- int num_types = kAvgNumNamespaces * kAvgNumTypes;
- ExactStringPropertyGenerator property_generator;
- SchemaGenerator<ExactStringPropertyGenerator> schema_generator(
- /*num_properties=*/state.range(1), &property_generator);
- SchemaProto schema = schema_generator.GenerateSchema(num_types);
- EvenDistributionTypeSelector type_selector(schema);
-
- // Create the index.
- IcingSearchEngineOptions options;
- options.set_base_dir(test_dir);
- options.set_index_merge_size(kIcingFullIndexSize);
- std::unique_ptr<IcingSearchEngine> icing =
- std::make_unique<IcingSearchEngine>(options);
-
- ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
- ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
-
- int num_docs = state.range(0);
- std::vector<std::string> language = CreateLanguages(kLanguageSize, &random);
- const std::vector<DocumentProto> random_docs =
- GenerateRandomDocuments(&type_selector, num_docs, language);
- for (const DocumentProto& doc : random_docs) {
- ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk());
- }
-
- SearchSpecProto search_spec = CreateSearchSpec(
- language.at(0), std::vector<std::string>(), TermMatchType::PREFIX);
- ResultSpecProto result_spec = CreateResultSpec(1000000, 1000000, 1000000);
- ScoringSpecProto scoring_spec =
- CreateScoringSpec(ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
- for (auto _ : state) {
- SearchResultProto results = icing->Search(
- search_spec, ScoringSpecProto::default_instance(), result_spec);
- }
-}
-BENCHMARK(BM_QueryLatency)
- // Arguments: num_indexed_documents, num_sections
- ->ArgPair(32, 2)
- ->ArgPair(128, 2)
- ->ArgPair(1 << 10, 2)
- ->ArgPair(1 << 13, 2);
-
void BM_IndexThroughput(benchmark::State& state) {
// Initialize the filesystem
std::string test_dir = GetTestTempDir() + "/icing/benchmark";
@@ -347,9 +297,8 @@ void BM_IndexThroughput(benchmark::State& state) {
ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
int num_docs = state.range(0);
- std::vector<std::string> language = CreateLanguages(kLanguageSize, &random);
const std::vector<DocumentProto> random_docs =
- GenerateRandomDocuments(&type_selector, num_docs, language);
+ GenerateRandomDocuments(&type_selector, num_docs);
for (auto s : state) {
for (const DocumentProto& doc : random_docs) {
ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk());
diff --git a/icing/icing-search-engine_fuzz_test.cc b/icing/icing-search-engine_fuzz_test.cc
index bf486da..2d07e37 100644
--- a/icing/icing-search-engine_fuzz_test.cc
+++ b/icing/icing-search-engine_fuzz_test.cc
@@ -18,12 +18,12 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/document-builder.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/icing-search-engine.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/schema-builder.h"
-#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
@@ -31,13 +31,13 @@ namespace icing {
namespace lib {
namespace {
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
- PropertyConfigProto::Cardinality::REQUIRED;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED =
+ PropertyConfigProto_Cardinality_Code_REQUIRED;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
+ StringIndexingConfig_TokenizerType_Code_PLAIN;
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
+constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
IcingSearchEngineOptions Setup() {
IcingSearchEngineOptions icing_options;
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index 13e77b8..4c15827 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -27,8 +27,8 @@
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
#include "icing/file/mock-filesystem.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/legacy/index/icing-mock-filesystem.h"
-#include "icing/portable/endian.h"
#include "icing/portable/equals-proto.h"
#include "icing/portable/platform.h"
#include "icing/proto/document.pb.h"
@@ -45,7 +45,6 @@
#include "icing/store/document-log-creator.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
-#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/random-string.h"
#include "icing/testing/snippet-helpers.h"
@@ -90,24 +89,21 @@ constexpr std::string_view kIpsumText =
"vehicula posuere vitae, convallis eu lorem. Donec semper augue eu nibh "
"placerat semper.";
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
- PropertyConfigProto::Cardinality::REQUIRED;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
- PropertyConfigProto::Cardinality::REPEATED;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto_Cardinality_Code_OPTIONAL;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED =
+ PropertyConfigProto_Cardinality_Code_REQUIRED;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED =
+ PropertyConfigProto_Cardinality_Code_REPEATED;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_NONE =
- StringIndexingConfig::TokenizerType::NONE;
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
+ StringIndexingConfig_TokenizerType_Code_PLAIN;
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_NONE =
+ StringIndexingConfig_TokenizerType_Code_NONE;
-#ifndef ICING_JNI_TEST
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-#endif // !ICING_JNI_TEST
-
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
-constexpr TermMatchType::Code MATCH_NONE = TermMatchType::UNKNOWN;
+constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
+constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
+constexpr TermMatchType_Code MATCH_NONE = TermMatchType_Code_UNKNOWN;
PortableFileBackedProtoLog<DocumentWrapper>::Header ReadDocumentLogHeader(
Filesystem filesystem, const std::string& file_path) {
@@ -362,6 +358,36 @@ TEST_F(IcingSearchEngineTest, GoodIndexMergeSizeReturnsOk) {
EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
}
+TEST_F(IcingSearchEngineTest,
+ NegativeMaxTokensPerDocSizeReturnsInvalidArgument) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_max_tokens_per_doc(-1);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineTest, ZeroMaxTokensPerDocSizeReturnsInvalidArgument) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_max_tokens_per_doc(0);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineTest, GoodMaxTokensPerDocSizeReturnsOk) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ // INT_MAX is valid - it just means that we shouldn't limit the number of
+ // tokens per document. It would be pretty inconceivable that anyone would
+ // produce such a document - the text being indexed alone would take up at
+ // least ~4.3 GiB! - and the document would be rejected before indexing
+ // for exceeding max_document_size, but there's no reason to explicitly
+ // bar it.
+ options.set_max_tokens_per_doc(std::numeric_limits<int32_t>::max());
+ IcingSearchEngine icing(options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+}
+
TEST_F(IcingSearchEngineTest, NegativeMaxTokenLenReturnsInvalidArgument) {
IcingSearchEngineOptions options = GetDefaultIcingOptions();
options.set_max_token_length(-1);
@@ -478,217 +504,6 @@ TEST_F(IcingSearchEngineTest, FailToCreateDocStore) {
HasSubstr("Could not create directory"));
}
-TEST_F(IcingSearchEngineTest, InitMarkerFilePreviousFailuresAtThreshold) {
- Filesystem filesystem;
- DocumentProto email1 =
- CreateEmailDocument("namespace", "uri1", 100, "subject1", "body1");
- email1.set_creation_timestamp_ms(10000);
- DocumentProto email2 =
- CreateEmailDocument("namespace", "uri2", 50, "subject2", "body2");
- email2.set_creation_timestamp_ms(10000);
-
- {
- // Create an index with a few documents.
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
- InitializeResultProto init_result = icing.Initialize();
- ASSERT_THAT(init_result.status(), ProtoIsOk());
- ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
- Eq(0));
- ASSERT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk());
- }
-
- // Write an init marker file with 5 previously failed attempts.
- std::string marker_filepath = GetTestBaseDir() + "/init_marker";
-
- {
- ScopedFd marker_file_fd(filesystem.OpenForWrite(marker_filepath.c_str()));
- int network_init_attempts = GHostToNetworkL(5);
- // Write the updated number of attempts before we get started.
- ASSERT_TRUE(filesystem.PWrite(marker_file_fd.get(), 0,
- &network_init_attempts,
- sizeof(network_init_attempts)));
- ASSERT_TRUE(filesystem.DataSync(marker_file_fd.get()));
- }
-
- {
- // Create the index again and verify that initialization succeeds and no
- // data is thrown out.
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
- InitializeResultProto init_result = icing.Initialize();
- ASSERT_THAT(init_result.status(), ProtoIsOk());
- ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
- Eq(5));
- EXPECT_THAT(
- icing.Get("namespace", "uri1", GetResultSpecProto::default_instance())
- .document(),
- EqualsProto(email1));
- EXPECT_THAT(
- icing.Get("namespace", "uri2", GetResultSpecProto::default_instance())
- .document(),
- EqualsProto(email2));
- }
-
- // The successful init should have thrown out the marker file.
- ASSERT_FALSE(filesystem.FileExists(marker_filepath.c_str()));
-}
-
-TEST_F(IcingSearchEngineTest, InitMarkerFilePreviousFailuresBeyondThreshold) {
- Filesystem filesystem;
- DocumentProto email1 =
- CreateEmailDocument("namespace", "uri1", 100, "subject1", "body1");
- DocumentProto email2 =
- CreateEmailDocument("namespace", "uri2", 50, "subject2", "body2");
-
- {
- // Create an index with a few documents.
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
- InitializeResultProto init_result = icing.Initialize();
- ASSERT_THAT(init_result.status(), ProtoIsOk());
- ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
- Eq(0));
- ASSERT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk());
- }
-
- // Write an init marker file with 6 previously failed attempts.
- std::string marker_filepath = GetTestBaseDir() + "/init_marker";
-
- {
- ScopedFd marker_file_fd(filesystem.OpenForWrite(marker_filepath.c_str()));
- int network_init_attempts = GHostToNetworkL(6);
- // Write the updated number of attempts before we get started.
- ASSERT_TRUE(filesystem.PWrite(marker_file_fd.get(), 0,
- &network_init_attempts,
- sizeof(network_init_attempts)));
- ASSERT_TRUE(filesystem.DataSync(marker_file_fd.get()));
- }
-
- {
- // Create the index again and verify that initialization succeeds and all
- // data is thrown out.
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
- InitializeResultProto init_result = icing.Initialize();
- ASSERT_THAT(init_result.status(),
- ProtoStatusIs(StatusProto::WARNING_DATA_LOSS));
- ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
- Eq(6));
- EXPECT_THAT(
- icing.Get("namespace", "uri1", GetResultSpecProto::default_instance())
- .status(),
- ProtoStatusIs(StatusProto::NOT_FOUND));
- EXPECT_THAT(
- icing.Get("namespace", "uri2", GetResultSpecProto::default_instance())
- .status(),
- ProtoStatusIs(StatusProto::NOT_FOUND));
- }
-
- // The successful init should have thrown out the marker file.
- ASSERT_FALSE(filesystem.FileExists(marker_filepath.c_str()));
-}
-
-TEST_F(IcingSearchEngineTest, SuccessiveInitFailuresIncrementsInitMarker) {
- Filesystem filesystem;
- DocumentProto email1 =
- CreateEmailDocument("namespace", "uri1", 100, "subject1", "body1");
- DocumentProto email2 =
- CreateEmailDocument("namespace", "uri2", 50, "subject2", "body2");
-
- {
- // 1. Create an index with a few documents.
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
- InitializeResultProto init_result = icing.Initialize();
- ASSERT_THAT(init_result.status(), ProtoIsOk());
- ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
- Eq(0));
- ASSERT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk());
- }
-
- {
- // 2. Create an index that will encounter an IO failure when trying to
- // create the document log.
- IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
-
- auto mock_filesystem = std::make_unique<MockFilesystem>();
- std::string document_log_filepath =
- icing_options.base_dir() + "/document_dir/document_log_v1";
- auto get_filesize_lambda = [this,
- &document_log_filepath](const char* filename) {
- if (strncmp(document_log_filepath.c_str(), filename,
- document_log_filepath.length()) == 0) {
- return Filesystem::kBadFileSize;
- }
- return this->filesystem()->GetFileSize(filename);
- };
- ON_CALL(*mock_filesystem, GetFileSize(A<const char*>()))
- .WillByDefault(get_filesize_lambda);
-
- TestIcingSearchEngine icing(icing_options, std::move(mock_filesystem),
- std::make_unique<IcingFilesystem>(),
- std::make_unique<FakeClock>(),
- GetTestJniCache());
-
- // Fail to initialize six times in a row.
- InitializeResultProto init_result = icing.Initialize();
- ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL));
- ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
- Eq(0));
-
- init_result = icing.Initialize();
- ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL));
- ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
- Eq(1));
-
- init_result = icing.Initialize();
- ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL));
- ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
- Eq(2));
-
- init_result = icing.Initialize();
- ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL));
- ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
- Eq(3));
-
- init_result = icing.Initialize();
- ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL));
- ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
- Eq(4));
-
- init_result = icing.Initialize();
- ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL));
- ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
- Eq(5));
- }
-
- {
- // 3. Create the index again and verify that initialization succeeds and all
- // data is thrown out.
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
- InitializeResultProto init_result = icing.Initialize();
- ASSERT_THAT(init_result.status(),
- ProtoStatusIs(StatusProto::WARNING_DATA_LOSS));
- ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
- Eq(6));
-
- EXPECT_THAT(
- icing.Get("namespace", "uri1", GetResultSpecProto::default_instance())
- .status(),
- ProtoStatusIs(StatusProto::NOT_FOUND));
- EXPECT_THAT(
- icing.Get("namespace", "uri2", GetResultSpecProto::default_instance())
- .status(),
- ProtoStatusIs(StatusProto::NOT_FOUND));
- }
-
- // The successful init should have thrown out the marker file.
- std::string marker_filepath = GetTestBaseDir() + "/init_marker";
- ASSERT_FALSE(filesystem.FileExists(marker_filepath.c_str()));
-}
-
TEST_F(IcingSearchEngineTest,
CircularReferenceCreateSectionManagerReturnsInvalidArgument) {
// Create a type config with a circular reference.
@@ -765,7 +580,8 @@ TEST_F(IcingSearchEngineTest, FailToWriteSchema) {
auto mock_filesystem = std::make_unique<MockFilesystem>();
// This fails FileBackedProto::Write()
- ON_CALL(*mock_filesystem, OpenForWrite(HasSubstr("schema.pb")))
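+ // Match the exact schema file path so that only the schema write fails.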
+ ON_CALL(*mock_filesystem,
+ OpenForWrite(Eq(icing_options.base_dir() + "/schema_dir/schema.pb")))
.WillByDefault(Return(-1));
TestIcingSearchEngine icing(icing_options, std::move(mock_filesystem),
@@ -922,13 +738,7 @@ TEST_F(IcingSearchEngineTest, SetSchemaCompatibleVersionUpdateSucceeds) {
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- SetSchemaResultProto set_schema_result = icing.SetSchema(schema);
- // Ignore latency numbers. They're covered elsewhere.
- set_schema_result.clear_latency_ms();
- SetSchemaResultProto expected_set_schema_result;
- expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
- expected_set_schema_result.mutable_new_schema_types()->Add("Email");
- EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
EXPECT_THAT(icing.GetSchema().schema().types(0).version(), Eq(1));
}
@@ -946,20 +756,12 @@ TEST_F(IcingSearchEngineTest, SetSchemaCompatibleVersionUpdateSucceeds) {
property->set_property_name("title");
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property = type->add_properties();
property->set_property_name("body");
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
// 3. SetSchema should succeed and the version number should be updated.
- SetSchemaResultProto set_schema_result = icing.SetSchema(schema, true);
- // Ignore latency numbers. They're covered elsewhere.
- set_schema_result.clear_latency_ms();
- SetSchemaResultProto expected_set_schema_result;
- expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
- expected_set_schema_result.mutable_fully_compatible_changed_schema_types()
- ->Add("Email");
- EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+ EXPECT_THAT(icing.SetSchema(schema, true).status(), ProtoIsOk());
EXPECT_THAT(icing.GetSchema().schema().types(0).version(), Eq(2));
}
@@ -1145,12 +947,7 @@ TEST_F(IcingSearchEngineTest,
}
TEST_F(IcingSearchEngineTest, SetSchema) {
- auto fake_clock = std::make_unique<FakeClock>();
- fake_clock->SetTimerElapsedMilliseconds(1000);
- TestIcingSearchEngine icing(GetDefaultIcingOptions(),
- std::make_unique<Filesystem>(),
- std::make_unique<IcingFilesystem>(),
- std::move(fake_clock), GetTestJniCache());
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
auto message_document = CreateMessageDocument("namespace", "uri");
@@ -1179,31 +976,26 @@ TEST_F(IcingSearchEngineTest, SetSchema) {
empty_type->set_schema_type("");
// Make sure we can't set invalid schemas
- SetSchemaResultProto set_schema_result = icing.SetSchema(invalid_schema);
- EXPECT_THAT(set_schema_result.status(),
+ EXPECT_THAT(icing.SetSchema(invalid_schema).status(),
ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
- EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000));
// Can add a document of a set schema
- set_schema_result = icing.SetSchema(schema_with_message);
- EXPECT_THAT(set_schema_result.status(), ProtoStatusIs(StatusProto::OK));
- EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000));
+ EXPECT_THAT(icing.SetSchema(schema_with_message).status(), ProtoIsOk());
EXPECT_THAT(icing.Put(message_document).status(), ProtoIsOk());
// Schema with Email doesn't have Message, so it would result in
// incompatible data
- set_schema_result = icing.SetSchema(schema_with_email);
- EXPECT_THAT(set_schema_result.status(),
+ EXPECT_THAT(icing.SetSchema(schema_with_email).status(),
ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
- EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000));
// Can expand the set of schema types and add a document of a new
// schema type
- set_schema_result = icing.SetSchema(schema_with_email_and_message);
- EXPECT_THAT(set_schema_result.status(), ProtoStatusIs(StatusProto::OK));
- EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000));
-
+ EXPECT_THAT(icing.SetSchema(SchemaProto(schema_with_email_and_message))
+ .status()
+ .code(),
+ Eq(StatusProto::OK));
EXPECT_THAT(icing.Put(message_document).status(), ProtoIsOk());
+
// Can't add a document whose schema isn't set
auto photo_document = DocumentBuilder()
.SetKey("namespace", "uri")
@@ -1217,7 +1009,7 @@ TEST_F(IcingSearchEngineTest, SetSchema) {
}
TEST_F(IcingSearchEngineTest,
- SetSchemaNewIndexedPropertyTriggersIndexRestorationAndReturnsOk) {
+ SetSchemaTriggersIndexRestorationAndReturnsOk) {
IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
@@ -1226,15 +1018,8 @@ TEST_F(IcingSearchEngineTest,
->mutable_properties(0)
->clear_string_indexing_config();
- SetSchemaResultProto set_schema_result =
- icing.SetSchema(schema_with_no_indexed_property);
- // Ignore latency numbers. They're covered elsewhere.
- set_schema_result.clear_latency_ms();
- SetSchemaResultProto expected_set_schema_result;
- expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
- expected_set_schema_result.mutable_new_schema_types()->Add("Message");
- EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
-
+ EXPECT_THAT(icing.SetSchema(schema_with_no_indexed_property).status(),
+ ProtoIsOk());
// Nothing will be indexed and Search() won't return anything.
EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(),
ProtoIsOk());
@@ -1255,14 +1040,8 @@ TEST_F(IcingSearchEngineTest,
SchemaProto schema_with_indexed_property = CreateMessageSchema();
// Index restoration should be triggered here because the new schema requires
// more properties to be indexed.
- set_schema_result = icing.SetSchema(schema_with_indexed_property);
- // Ignore latency numbers. They're covered elsewhere.
- set_schema_result.clear_latency_ms();
- expected_set_schema_result = SetSchemaResultProto();
- expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
- expected_set_schema_result.mutable_index_incompatible_changed_schema_types()
- ->Add("Message");
- EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+ EXPECT_THAT(icing.SetSchema(schema_with_indexed_property).status(),
+ ProtoIsOk());
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
@@ -1306,12 +1085,8 @@ TEST_F(IcingSearchEngineTest,
.Build();
SetSchemaResultProto set_schema_result = icing.SetSchema(nested_schema);
- // Ignore latency numbers. They're covered elsewhere.
- set_schema_result.clear_latency_ms();
SetSchemaResultProto expected_set_schema_result;
expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
- expected_set_schema_result.mutable_new_schema_types()->Add("Email");
- expected_set_schema_result.mutable_new_schema_types()->Add("Person");
EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
DocumentProto document =
@@ -1378,12 +1153,8 @@ TEST_F(IcingSearchEngineTest,
.Build();
set_schema_result = icing.SetSchema(no_nested_schema);
- // Ignore latency numbers. They're covered elsewhere.
- set_schema_result.clear_latency_ms();
expected_set_schema_result = SetSchemaResultProto();
expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
- expected_set_schema_result.mutable_index_incompatible_changed_schema_types()
- ->Add("Email");
EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
// document shouldn't match a query for 'Bill' in either 'sender.name' or
@@ -1426,10 +1197,7 @@ TEST_F(IcingSearchEngineTest,
SetSchemaResultProto set_schema_result =
icing.SetSchema(email_with_body_schema);
- // Ignore latency numbers. They're covered elsewhere.
- set_schema_result.clear_latency_ms();
SetSchemaResultProto expected_set_schema_result;
- expected_set_schema_result.mutable_new_schema_types()->Add("Email");
expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
@@ -1475,12 +1243,8 @@ TEST_F(IcingSearchEngineTest,
set_schema_result = icing.SetSchema(
email_no_body_schema, /*ignore_errors_and_delete_documents=*/true);
- // Ignore latency numbers. They're covered elsewhere.
- set_schema_result.clear_latency_ms();
expected_set_schema_result = SetSchemaResultProto();
expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email");
- expected_set_schema_result.mutable_index_incompatible_changed_schema_types()
- ->Add("Email");
expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
@@ -1518,10 +1282,7 @@ TEST_F(
SetSchemaResultProto set_schema_result =
icing.SetSchema(email_with_body_schema);
- // Ignore latency numbers. They're covered elsewhere.
- set_schema_result.clear_latency_ms();
SetSchemaResultProto expected_set_schema_result;
- expected_set_schema_result.mutable_new_schema_types()->Add("Email");
expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
@@ -1575,12 +1336,8 @@ TEST_F(
set_schema_result = icing.SetSchema(
email_no_body_schema, /*ignore_errors_and_delete_documents=*/true);
- // Ignore latency numbers. They're covered elsewhere.
- set_schema_result.clear_latency_ms();
expected_set_schema_result = SetSchemaResultProto();
expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email");
- expected_set_schema_result.mutable_index_incompatible_changed_schema_types()
- ->Add("Email");
expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
@@ -1628,11 +1385,7 @@ TEST_F(IcingSearchEngineTest, ForceSetSchemaIncompatibleNestedDocsAreDeleted) {
.Build();
SetSchemaResultProto set_schema_result = icing.SetSchema(nested_schema);
- // Ignore latency numbers. They're covered elsewhere.
- set_schema_result.clear_latency_ms();
SetSchemaResultProto expected_set_schema_result;
- expected_set_schema_result.mutable_new_schema_types()->Add("Email");
- expected_set_schema_result.mutable_new_schema_types()->Add("Person");
expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
@@ -1685,15 +1438,9 @@ TEST_F(IcingSearchEngineTest, ForceSetSchemaIncompatibleNestedDocsAreDeleted) {
set_schema_result = icing.SetSchema(
nested_schema, /*ignore_errors_and_delete_documents=*/true);
- // Ignore latency numbers. They're covered elsewhere.
- set_schema_result.clear_latency_ms();
expected_set_schema_result = SetSchemaResultProto();
expected_set_schema_result.mutable_incompatible_schema_types()->Add("Person");
expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email");
- expected_set_schema_result.mutable_index_incompatible_changed_schema_types()
- ->Add("Email");
- expected_set_schema_result.mutable_index_incompatible_changed_schema_types()
- ->Add("Person");
expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
@@ -1752,10 +1499,6 @@ TEST_F(IcingSearchEngineTest, SetSchemaRevalidatesDocumentsAndReturnsOk) {
property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
// Can't set the schema since it's incompatible
- SetSchemaResultProto set_schema_result =
- icing.SetSchema(schema_with_required_subject);
- // Ignore latency numbers. They're covered elsewhere.
- set_schema_result.clear_latency_ms();
SetSchemaResultProto expected_set_schema_result_proto;
expected_set_schema_result_proto.mutable_status()->set_code(
StatusProto::FAILED_PRECONDITION);
@@ -1763,17 +1506,15 @@ TEST_F(IcingSearchEngineTest, SetSchemaRevalidatesDocumentsAndReturnsOk) {
"Schema is incompatible.");
expected_set_schema_result_proto.add_incompatible_schema_types("email");
- EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result_proto));
+ EXPECT_THAT(icing.SetSchema(schema_with_required_subject),
+ EqualsProto(expected_set_schema_result_proto));
// Force set it
- set_schema_result =
- icing.SetSchema(schema_with_required_subject,
- /*ignore_errors_and_delete_documents=*/true);
- // Ignore latency numbers. They're covered elsewhere.
- set_schema_result.clear_latency_ms();
expected_set_schema_result_proto.mutable_status()->set_code(StatusProto::OK);
expected_set_schema_result_proto.mutable_status()->clear_message();
- EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result_proto));
+ EXPECT_THAT(icing.SetSchema(schema_with_required_subject,
+ /*ignore_errors_and_delete_documents=*/true),
+ EqualsProto(expected_set_schema_result_proto));
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
@@ -1830,25 +1571,19 @@ TEST_F(IcingSearchEngineTest, SetSchemaDeletesDocumentsAndReturnsOk) {
type->set_schema_type("email");
// Can't set the schema since it's incompatible
- SetSchemaResultProto set_schema_result = icing.SetSchema(new_schema);
- // Ignore latency numbers. They're covered elsewhere.
- set_schema_result.clear_latency_ms();
SetSchemaResultProto expected_result;
expected_result.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION);
expected_result.mutable_status()->set_message("Schema is incompatible.");
expected_result.add_deleted_schema_types("message");
- EXPECT_THAT(set_schema_result, EqualsProto(expected_result));
+ EXPECT_THAT(icing.SetSchema(new_schema), EqualsProto(expected_result));
// Force set it
- set_schema_result =
- icing.SetSchema(new_schema,
- /*ignore_errors_and_delete_documents=*/true);
- // Ignore latency numbers. They're covered elsewhere.
- set_schema_result.clear_latency_ms();
expected_result.mutable_status()->set_code(StatusProto::OK);
expected_result.mutable_status()->clear_message();
- EXPECT_THAT(set_schema_result, EqualsProto(expected_result));
+ EXPECT_THAT(icing.SetSchema(new_schema,
+ /*ignore_errors_and_delete_documents=*/true),
+ EqualsProto(expected_result));
// "email" document is still there
GetResultProto expected_get_result_proto;
@@ -2167,7 +1902,7 @@ TEST_F(IcingSearchEngineTest, SearchReturnsValidResults) {
search_spec.set_query("message");
ResultSpecProto result_spec;
- result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(1);
result_spec.mutable_snippet_spec()->set_num_to_snippet(1);
@@ -2585,7 +2320,7 @@ TEST_F(IcingSearchEngineTest, ShouldReturnMultiplePagesWithSnippets) {
ResultSpecProto result_spec;
result_spec.set_num_per_page(2);
- result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(1);
result_spec.mutable_snippet_spec()->set_num_to_snippet(3);
@@ -2992,17 +2727,13 @@ TEST_F(IcingSearchEngineTest, OptimizationFailureUninitializesIcing) {
};
ON_CALL(*mock_filesystem, CreateDirectoryRecursively)
.WillByDefault(create_dir_lambda);
-
auto swap_lambda = [&just_swapped_files](const char* first_dir,
const char* second_dir) {
just_swapped_files = true;
return false;
};
- IcingSearchEngineOptions options = GetDefaultIcingOptions();
- ON_CALL(*mock_filesystem, SwapFiles(HasSubstr("document_dir_optimize_tmp"),
- HasSubstr("document_dir")))
- .WillByDefault(swap_lambda);
- TestIcingSearchEngine icing(options, std::move(mock_filesystem),
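+ // Fail every SwapFiles call rather than only the document-store swap.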
+ ON_CALL(*mock_filesystem, SwapFiles).WillByDefault(swap_lambda);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
std::make_unique<IcingFilesystem>(),
std::make_unique<FakeClock>(), GetTestJniCache());
@@ -3455,16 +3186,11 @@ TEST_F(IcingSearchEngineTest, DeleteByQuery) {
search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
DeleteByQueryResultProto result_proto = icing.DeleteByQuery(search_spec);
EXPECT_THAT(result_proto.status(), ProtoIsOk());
- DeleteByQueryStatsProto exp_stats;
+ DeleteStatsProto exp_stats;
+ exp_stats.set_delete_type(DeleteStatsProto::DeleteType::QUERY);
exp_stats.set_latency_ms(7);
exp_stats.set_num_documents_deleted(1);
- exp_stats.set_query_length(search_spec.query().length());
- exp_stats.set_num_terms(1);
- exp_stats.set_num_namespaces_filtered(0);
- exp_stats.set_num_schema_types_filtered(0);
- exp_stats.set_parse_query_latency_ms(7);
- exp_stats.set_document_removal_latency_ms(7);
- EXPECT_THAT(result_proto.delete_by_query_stats(), EqualsProto(exp_stats));
+ EXPECT_THAT(result_proto.delete_stats(), EqualsProto(exp_stats));
expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
expected_get_result_proto.mutable_status()->set_message(
@@ -3496,105 +3222,6 @@ TEST_F(IcingSearchEngineTest, DeleteByQuery) {
expected_search_result_proto));
}
-TEST_F(IcingSearchEngineTest, DeleteByQueryReturnInfo) {
- DocumentProto document1 =
- DocumentBuilder()
- .SetKey("namespace1", "uri1")
- .SetSchema("Message")
- .AddStringProperty("body", "message body1")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
- DocumentProto document2 =
- DocumentBuilder()
- .SetKey("namespace2", "uri2")
- .SetSchema("Message")
- .AddStringProperty("body", "message body2")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
- DocumentProto document3 =
- DocumentBuilder()
- .SetKey("namespace2", "uri3")
- .SetSchema("Message")
- .AddStringProperty("body", "message body3")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
-
- auto fake_clock = std::make_unique<FakeClock>();
- fake_clock->SetTimerElapsedMilliseconds(7);
- TestIcingSearchEngine icing(GetDefaultIcingOptions(),
- std::make_unique<Filesystem>(),
- std::make_unique<IcingFilesystem>(),
- std::move(fake_clock), GetTestJniCache());
- ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
-
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = document1;
- EXPECT_THAT(
- icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()),
- EqualsProto(expected_get_result_proto));
-
- *expected_get_result_proto.mutable_document() = document2;
- EXPECT_THAT(
- icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()),
- EqualsProto(expected_get_result_proto));
-
- *expected_get_result_proto.mutable_document() = document3;
- EXPECT_THAT(
- icing.Get("namespace2", "uri3", GetResultSpecProto::default_instance()),
- EqualsProto(expected_get_result_proto));
-
- // Delete all docs to test the information is correctly grouped.
- SearchSpecProto search_spec;
- search_spec.set_query("message");
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
- DeleteByQueryResultProto result_proto =
- icing.DeleteByQuery(search_spec, true);
- EXPECT_THAT(result_proto.status(), ProtoIsOk());
- DeleteByQueryStatsProto exp_stats;
- exp_stats.set_latency_ms(7);
- exp_stats.set_num_documents_deleted(3);
- exp_stats.set_query_length(search_spec.query().length());
- exp_stats.set_num_terms(1);
- exp_stats.set_num_namespaces_filtered(0);
- exp_stats.set_num_schema_types_filtered(0);
- exp_stats.set_parse_query_latency_ms(7);
- exp_stats.set_document_removal_latency_ms(7);
- EXPECT_THAT(result_proto.delete_by_query_stats(), EqualsProto(exp_stats));
-
- // Check that DeleteByQuery can return information for deleted documents.
- DeleteByQueryResultProto::DocumentGroupInfo info1, info2;
- info1.set_namespace_("namespace1");
- info1.set_schema("Message");
- info1.add_uris("uri1");
- info2.set_namespace_("namespace2");
- info2.set_schema("Message");
- info2.add_uris("uri3");
- info2.add_uris("uri2");
- EXPECT_THAT(result_proto.deleted_documents(),
- UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2)));
-
- EXPECT_THAT(
- icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance())
- .status()
- .code(),
- Eq(StatusProto::NOT_FOUND));
- EXPECT_THAT(
- icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance())
- .status()
- .code(),
- Eq(StatusProto::NOT_FOUND));
- EXPECT_THAT(
- icing.Get("namespace2", "uri3", GetResultSpecProto::default_instance())
- .status()
- .code(),
- Eq(StatusProto::NOT_FOUND));
-}
-
TEST_F(IcingSearchEngineTest, DeleteByQueryNotFound) {
DocumentProto document1 =
DocumentBuilder()
@@ -3755,8 +3382,7 @@ TEST_F(IcingSearchEngineTest, IcingShouldWorkFineIfOptimizationIsAborted) {
// fails. This will fail IcingSearchEngine::OptimizeDocumentStore() and make
// it return ABORTED_ERROR.
auto mock_filesystem = std::make_unique<MockFilesystem>();
- ON_CALL(*mock_filesystem,
- DeleteDirectoryRecursively(HasSubstr("_optimize_tmp")))
+ ON_CALL(*mock_filesystem, DeleteDirectoryRecursively)
.WillByDefault(Return(false));
TestIcingSearchEngine icing(GetDefaultIcingOptions(),
@@ -3803,8 +3429,7 @@ TEST_F(IcingSearchEngineTest,
// Creates a mock filesystem in which SwapFiles() always fails and deletes the
// directories. This will fail IcingSearchEngine::OptimizeDocumentStore().
auto mock_filesystem = std::make_unique<MockFilesystem>();
- ON_CALL(*mock_filesystem, SwapFiles(HasSubstr("document_dir_optimize_tmp"),
- HasSubstr("document_dir")))
+ ON_CALL(*mock_filesystem, SwapFiles)
.WillByDefault([this](const char* one, const char* two) {
filesystem()->DeleteDirectoryRecursively(one);
filesystem()->DeleteDirectoryRecursively(two);
@@ -3875,8 +3500,7 @@ TEST_F(IcingSearchEngineTest, OptimizationShouldRecoverIfDataFilesAreMissing) {
// Creates a mock filesystem in which SwapFiles() always fails and empties the
// directories. This will fail IcingSearchEngine::OptimizeDocumentStore().
auto mock_filesystem = std::make_unique<MockFilesystem>();
- ON_CALL(*mock_filesystem, SwapFiles(HasSubstr("document_dir_optimize_tmp"),
- HasSubstr("document_dir")))
+ ON_CALL(*mock_filesystem, SwapFiles)
.WillByDefault([this](const char* one, const char* two) {
filesystem()->DeleteDirectoryRecursively(one);
filesystem()->CreateDirectoryRecursively(one);
@@ -5807,230 +5431,74 @@ TEST_F(IcingSearchEngineTest, SetSchemaCanDetectPreviousSchemaWasLost) {
EqualsSearchResultIgnoreStatsAndScores(empty_result));
}
-TEST_F(IcingSearchEngineTest, ImplicitPersistToDiskFullSavesEverything) {
- DocumentProto document = CreateMessageDocument("namespace", "uri");
+TEST_F(IcingSearchEngineTest, PersistToDisk) {
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() =
+ CreateMessageDocument("namespace", "uri");
+
{
IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
- EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
- } // Destructing calls PersistToDisk(FULL)
-
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
-
- // There should be no recovery since everything should be saved properly.
- InitializeResultProto init_result = icing.Initialize();
- EXPECT_THAT(init_result.status(), ProtoIsOk());
- EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
- Eq(InitializeStatsProto::NO_DATA_LOSS));
- EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
- Eq(InitializeStatsProto::NONE));
- EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
- Eq(InitializeStatsProto::NONE));
- EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
- Eq(InitializeStatsProto::NONE));
-
- // Schema is still intact.
- GetSchemaResultProto expected_get_schema_result_proto;
- expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema();
+ EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(),
+ ProtoIsOk());
- EXPECT_THAT(icing.GetSchema(), EqualsProto(expected_get_schema_result_proto));
+ // Persisting shouldn't change any state; the document should remain retrievable.
+ EXPECT_THAT(icing.PersistToDisk(PersistType::FULL).status(), ProtoIsOk());
- // Documents are still intact.
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = document;
+ EXPECT_THAT(
+ icing.Get("namespace", "uri", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+ } // Destructing persists as well
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
EXPECT_THAT(
icing.Get("namespace", "uri", GetResultSpecProto::default_instance()),
EqualsProto(expected_get_result_proto));
-
- // Index is still intact.
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("message"); // Content in the Message document.
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document;
-
- SearchResultProto actual_results =
- icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance());
- EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
- expected_search_result_proto));
}
-TEST_F(IcingSearchEngineTest, ExplicitPersistToDiskFullSavesEverything) {
- DocumentProto document = CreateMessageDocument("namespace", "uri");
-
- // Add schema and documents to our first icing1 instance.
+TEST_F(IcingSearchEngineTest, NoPersistToDiskLiteDoesntPersistPut) {
IcingSearchEngine icing1(GetDefaultIcingOptions(), GetTestJniCache());
EXPECT_THAT(icing1.Initialize().status(), ProtoIsOk());
EXPECT_THAT(icing1.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
- EXPECT_THAT(icing1.Put(document).status(), ProtoIsOk());
- EXPECT_THAT(icing1.PersistToDisk(PersistType::FULL).status(), ProtoIsOk());
-
- // Initialize a second icing2 instance, which should have its own memory
- // space. If data from icing1 isn't being persisted to the files, then icing2
- // won't be able to see those changes.
- IcingSearchEngine icing2(GetDefaultIcingOptions(), GetTestJniCache());
-
- // There should be no recovery since everything should be saved properly.
- InitializeResultProto init_result = icing2.Initialize();
- EXPECT_THAT(init_result.status(), ProtoIsOk());
- EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
- Eq(InitializeStatsProto::NO_DATA_LOSS));
- EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
- Eq(InitializeStatsProto::NONE));
- EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
- Eq(InitializeStatsProto::NONE));
- EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
- Eq(InitializeStatsProto::NONE));
-
- // Schema is still intact.
- GetSchemaResultProto expected_get_schema_result_proto;
- expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema();
-
- EXPECT_THAT(icing2.GetSchema(),
- EqualsProto(expected_get_schema_result_proto));
-
- // Documents are still intact.
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = document;
-
- EXPECT_THAT(
- icing2.Get("namespace", "uri", GetResultSpecProto::default_instance()),
- EqualsProto(expected_get_result_proto));
-
- // Index is still intact.
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("message"); // Content in the Message document.
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document;
-
- SearchResultProto actual_results =
- icing2.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance());
- EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
- expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, NoPersistToDiskLosesAllDocumentsAndIndex) {
- IcingSearchEngine icing1(GetDefaultIcingOptions(), GetTestJniCache());
- EXPECT_THAT(icing1.Initialize().status(), ProtoIsOk());
- EXPECT_THAT(icing1.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
- DocumentProto document = CreateMessageDocument("namespace", "uri");
- EXPECT_THAT(icing1.Put(document).status(), ProtoIsOk());
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri");
+ EXPECT_THAT(icing1.Put(document1).status(), ProtoIsOk());
EXPECT_THAT(
icing1.Get("namespace", "uri", GetResultSpecProto::default_instance())
.document(),
- EqualsProto(document));
-
- // It's intentional that no PersistToDisk call is made before initializing a
- // second instance of icing.
+ EqualsProto(document1));
IcingSearchEngine icing2(GetDefaultIcingOptions(), GetTestJniCache());
- InitializeResultProto init_result = icing2.Initialize();
- EXPECT_THAT(init_result.status(), ProtoIsOk());
- EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
- Eq(InitializeStatsProto::PARTIAL_LOSS));
- EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
- Eq(InitializeStatsProto::DATA_LOSS));
- EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
- Eq(InitializeStatsProto::NONE));
- EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
- Eq(InitializeStatsProto::NONE));
-
+ EXPECT_THAT(icing2.Initialize().status(), ProtoIsOk());
// The document shouldn't be found because we forgot to call
// PersistToDisk(LITE)!
EXPECT_THAT(
icing2.Get("namespace", "uri", GetResultSpecProto::default_instance())
.status(),
ProtoStatusIs(StatusProto::NOT_FOUND));
-
- // Searching also shouldn't get us anything because the index wasn't
- // recovered.
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("message"); // Content in the Message document.
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
-
- SearchResultProto actual_results =
- icing2.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance());
- EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
- expected_search_result_proto));
}
-TEST_F(IcingSearchEngineTest, PersistToDiskLiteSavesGroundTruth) {
- DocumentProto document = CreateMessageDocument("namespace", "uri");
-
+TEST_F(IcingSearchEngineTest, PersistToDiskLitePersistsPut) {
IcingSearchEngine icing1(GetDefaultIcingOptions(), GetTestJniCache());
EXPECT_THAT(icing1.Initialize().status(), ProtoIsOk());
EXPECT_THAT(icing1.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
- EXPECT_THAT(icing1.Put(document).status(), ProtoIsOk());
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri");
+ EXPECT_THAT(icing1.Put(document1).status(), ProtoIsOk());
EXPECT_THAT(icing1.PersistToDisk(PersistType::LITE).status(), ProtoIsOk());
EXPECT_THAT(
icing1.Get("namespace", "uri", GetResultSpecProto::default_instance())
.document(),
- EqualsProto(document));
+ EqualsProto(document1));
IcingSearchEngine icing2(GetDefaultIcingOptions(), GetTestJniCache());
- InitializeResultProto init_result = icing2.Initialize();
- EXPECT_THAT(init_result.status(), ProtoIsOk());
- EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
- Eq(InitializeStatsProto::NO_DATA_LOSS));
- EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
- Eq(InitializeStatsProto::NONE));
-
- // A checksum mismatch gets reported as an IO error. The document store and
- // index didn't have their derived files included in the checksum previously,
- // so reinitializing will trigger a checksum mismatch.
- EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
- Eq(InitializeStatsProto::IO_ERROR));
- EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
- Eq(InitializeStatsProto::IO_ERROR));
-
- // Schema is still intact.
- GetSchemaResultProto expected_get_schema_result_proto;
- expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema();
-
- EXPECT_THAT(icing2.GetSchema(),
- EqualsProto(expected_get_schema_result_proto));
-
+ EXPECT_THAT(icing2.Initialize().status(), ProtoIsOk());
// The document should be found because we called PersistToDisk(LITE)!
EXPECT_THAT(
icing2.Get("namespace", "uri", GetResultSpecProto::default_instance())
.document(),
- EqualsProto(document));
-
- // Recovered index is still intact.
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("message"); // Content in the Message document.
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document;
-
- SearchResultProto actual_results =
- icing2.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance());
- EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
- expected_search_result_proto));
+ EqualsProto(document1));
}
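// A sketch of the persistence contract these tests exercise (assumes the same
// helpers as above; not part of this change): LITE flushes the document ground
// truth after each Put, while FULL (also run by the destructor) saves
// everything.
EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
EXPECT_THAT(icing.PersistToDisk(PersistType::LITE).status(), ProtoIsOk());
// ...later, at a checkpoint or before shutdown:
EXPECT_THAT(icing.PersistToDisk(PersistType::FULL).status(), ProtoIsOk());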
TEST_F(IcingSearchEngineTest, ResetOk) {
@@ -6123,7 +5591,7 @@ TEST_F(IcingSearchEngineTest, SnippetNormalization) {
search_spec.set_query("mdi Zürich");
ResultSpecProto result_spec;
- result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(2);
result_spec.mutable_snippet_spec()->set_num_to_snippet(2);
@@ -6186,7 +5654,7 @@ TEST_F(IcingSearchEngineTest, SnippetNormalizationPrefix) {
search_spec.set_query("md Zür");
ResultSpecProto result_spec;
- result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(2);
result_spec.mutable_snippet_spec()->set_num_to_snippet(2);
@@ -6241,7 +5709,7 @@ TEST_F(IcingSearchEngineTest, SnippetSectionRestrict) {
search_spec.set_query("body:Zür");
ResultSpecProto result_spec;
- result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(10);
result_spec.mutable_snippet_spec()->set_num_to_snippet(10);
@@ -7514,6 +6982,10 @@ TEST_F(IcingSearchEngineTest, PutDocumentShouldLogIndexingStats) {
// No merge should happen.
EXPECT_THAT(put_result_proto.put_document_stats().index_merge_latency_ms(),
Eq(0));
+ // The number of tokens should not exceed the configured max.
+ EXPECT_FALSE(put_result_proto.put_document_stats()
+ .tokenization_stats()
+ .exceeded_max_token_num());
// The input document has 2 tokens.
EXPECT_THAT(put_result_proto.put_document_stats()
.tokenization_stats()
@@ -7521,6 +6993,33 @@ TEST_F(IcingSearchEngineTest, PutDocumentShouldLogIndexingStats) {
Eq(2));
}
+TEST_F(IcingSearchEngineTest, PutDocumentShouldLogWhetherNumTokensExceeds) {
+ // Create a document with 2 tokens.
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .Build();
+
+ // Create an icing instance with max_tokens_per_doc = 1.
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+ icing_options.set_max_tokens_per_doc(1);
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ PutResultProto put_result_proto = icing.Put(document);
+ EXPECT_THAT(put_result_proto.status(), ProtoIsOk());
+ // The number of tokens (2) exceeds the max allowed value (1).
+ EXPECT_TRUE(put_result_proto.put_document_stats()
+ .tokenization_stats()
+ .exceeded_max_token_num());
+ EXPECT_THAT(put_result_proto.put_document_stats()
+ .tokenization_stats()
+ .num_tokens_indexed(),
+ Eq(1));
+}
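// Hypothetical caller code (an illustration, not part of this change)
// surfacing the stat the test above verifies; assumes `icing` and `document`
// as set up above.
PutResultProto result = icing.Put(document);
if (result.put_document_stats().tokenization_stats().exceeded_max_token_num()) {
  ICING_LOG(WARNING) << "Token limit hit; only "
                     << result.put_document_stats()
                            .tokenization_stats()
                            .num_tokens_indexed()
                     << " tokens were indexed.";
}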
+
TEST_F(IcingSearchEngineTest, PutDocumentShouldLogIndexMergeLatency) {
DocumentProto document1 = DocumentBuilder()
.SetKey("icing", "fake_type/1")
@@ -7769,7 +7268,7 @@ TEST_F(IcingSearchEngineTest, QueryStatsProtoTest) {
ResultSpecProto result_spec;
result_spec.set_num_per_page(2);
- result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(1);
result_spec.mutable_snippet_spec()->set_num_to_snippet(3);
@@ -7980,7 +7479,7 @@ TEST_F(IcingSearchEngineTest, SnippetErrorTest) {
ResultSpecProto result_spec;
result_spec.mutable_snippet_spec()->set_num_to_snippet(2);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(3);
- result_spec.mutable_snippet_spec()->set_max_window_utf32_length(4);
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(4);
SearchResultProto search_results =
icing.Search(search_spec, scoring_spec, result_spec);
@@ -8088,599 +7587,6 @@ TEST_F(IcingSearchEngineTest, CJKSnippetTest) {
EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
}
-TEST_F(IcingSearchEngineTest, InvalidToEmptyQueryTest) {
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
- ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
-
- // String: "Luca Brasi sleeps with the 🐟🐟🐟."
- // ^ ^ ^ ^ ^ ^ ^ ^ ^
- // UTF8 idx: 0 5 11 18 23 27 31 35 39
- // UTF16 idx: 0 5 11 18 23 27 29 31 33
- // Breaks into segments: "Luca", "Brasi", "sleeps", "with", "the", "🐟", "🐟"
- // and "🐟".
- constexpr std::string_view kSicilianMessage =
- "Luca Brasi sleeps with the 🐟🐟🐟.";
- DocumentProto document = DocumentBuilder()
- .SetKey("namespace", "uri1")
- .SetSchema("Message")
- .AddStringProperty("body", kSicilianMessage)
- .Build();
- ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
- DocumentProto document_two =
- DocumentBuilder()
- .SetKey("namespace", "uri2")
- .SetSchema("Message")
- .AddStringProperty("body", "Some other content.")
- .Build();
- ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
-
- // Each of these queries normalizes to an empty query, which matches all docs.
- SearchSpecProto search_spec;
- search_spec.set_query("?");
- search_spec.set_term_match_type(MATCH_PREFIX);
- ScoringSpecProto scoring_spec;
- ResultSpecProto result_spec;
-
- // Search and make sure that both documents are returned
- SearchResultProto search_results =
- icing.Search(search_spec, scoring_spec, result_spec);
- EXPECT_THAT(search_results.status(), ProtoIsOk());
- EXPECT_THAT(search_results.results(), SizeIs(2));
-
- search_spec.set_query("。");
- search_results = icing.Search(search_spec, scoring_spec, result_spec);
- EXPECT_THAT(search_results.status(), ProtoIsOk());
- EXPECT_THAT(search_results.results(), SizeIs(2));
-
- search_spec.set_query("-");
- search_results = icing.Search(search_spec, scoring_spec, result_spec);
- EXPECT_THAT(search_results.status(), ProtoIsOk());
- EXPECT_THAT(search_results.results(), SizeIs(2));
-
- search_spec.set_query(":");
- search_results = icing.Search(search_spec, scoring_spec, result_spec);
- EXPECT_THAT(search_results.status(), ProtoIsOk());
- EXPECT_THAT(search_results.results(), SizeIs(2));
-
- search_spec.set_query("OR");
- search_results = icing.Search(search_spec, scoring_spec, result_spec);
- EXPECT_THAT(search_results.status(), ProtoIsOk());
- EXPECT_THAT(search_results.results(), SizeIs(2));
-
- search_spec.set_query(" ");
- search_results = icing.Search(search_spec, scoring_spec, result_spec);
- EXPECT_THAT(search_results.status(), ProtoIsOk());
- EXPECT_THAT(search_results.results(), SizeIs(2));
-}
-
-TEST_F(IcingSearchEngineTest, EmojiSnippetTest) {
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
- ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
-
- // String: "Luca Brasi sleeps with the 🐟🐟🐟."
- // ^ ^ ^ ^ ^ ^ ^ ^ ^
- // UTF8 idx: 0 5 11 18 23 27 31 35 39
- // UTF16 idx: 0 5 11 18 23 27 29 31 33
- // Breaks into segments: "Luca", "Brasi", "sleeps", "with", "the", "🐟", "🐟"
- // and "🐟".
- constexpr std::string_view kSicilianMessage =
- "Luca Brasi sleeps with the 🐟🐟🐟.";
- DocumentProto document = DocumentBuilder()
- .SetKey("namespace", "uri1")
- .SetSchema("Message")
- .AddStringProperty("body", kSicilianMessage)
- .Build();
- ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
- DocumentProto document_two =
- DocumentBuilder()
- .SetKey("namespace", "uri2")
- .SetSchema("Message")
- .AddStringProperty("body", "Some other content.")
- .Build();
- ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
-
- // Search and request snippet matching but no windowing.
- SearchSpecProto search_spec;
- search_spec.set_query("🐟");
- search_spec.set_term_match_type(MATCH_PREFIX);
-
- ResultSpecProto result_spec;
- result_spec.mutable_snippet_spec()->set_num_to_snippet(1);
- result_spec.mutable_snippet_spec()->set_num_matches_per_property(1);
-
- // Search and make sure that we got a single successful result
- SearchResultProto search_results = icing.Search(
- search_spec, ScoringSpecProto::default_instance(), result_spec);
- ASSERT_THAT(search_results.status(), ProtoIsOk());
- ASSERT_THAT(search_results.results(), SizeIs(1));
- const SearchResultProto::ResultProto* result = &search_results.results(0);
- EXPECT_THAT(result->document().uri(), Eq("uri1"));
-
- // Ensure that one and only one property was matched and it was "body"
- ASSERT_THAT(result->snippet().entries(), SizeIs(1));
- const SnippetProto::EntryProto* entry = &result->snippet().entries(0);
- EXPECT_THAT(entry->property_name(), Eq("body"));
-
- // Get the content for "subject" and see what the match is.
- std::string_view content = GetString(&result->document(), "body");
- ASSERT_THAT(content, Eq(kSicilianMessage));
-
- // Ensure that there is one and only one match within "subject"
- ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
- const SnippetMatchProto& match_proto = entry->snippet_matches(0);
-
- EXPECT_THAT(match_proto.exact_match_byte_position(), Eq(27));
- EXPECT_THAT(match_proto.exact_match_byte_length(), Eq(4));
- std::string_view match =
- content.substr(match_proto.exact_match_byte_position(),
- match_proto.exact_match_byte_length());
- ASSERT_THAT(match, Eq("🐟"));
-
- // Ensure that the utf-16 values are also as expected
- EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(27));
- EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
-}
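// Note: "🐟" (U+1F41F) encodes as 4 UTF-8 bytes but as a surrogate pair, i.e.
// 2 UTF-16 code units, which is why the byte length (4) and the UTF-16 length
// (2) checked above differ.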
-
-TEST_F(IcingSearchEngineTest, PutDocumentIndexFailureDeletion) {
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
- ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
-
- // Testing has shown that adding ~600,000 terms generated this way will
- // fill up the hit buffer.
- std::vector<std::string> terms = GenerateUniqueTerms(600000);
- std::string content = absl_ports::StrJoin(terms, " ");
- DocumentProto document = DocumentBuilder()
- .SetKey("namespace", "uri1")
- .SetSchema("Message")
- .AddStringProperty("body", "foo " + content)
- .Build();
- // We failed to fully add the document to the index. This means that we should
- // reject the document from Icing entirely.
- ASSERT_THAT(icing.Put(document).status(),
- ProtoStatusIs(StatusProto::OUT_OF_SPACE));
-
- // Make sure that the document isn't searchable.
- SearchSpecProto search_spec;
- search_spec.set_query("foo");
- search_spec.set_term_match_type(MATCH_PREFIX);
-
- SearchResultProto search_results =
- icing.Search(search_spec, ScoringSpecProto::default_instance(),
- ResultSpecProto::default_instance());
- ASSERT_THAT(search_results.status(), ProtoIsOk());
- ASSERT_THAT(search_results.results(), IsEmpty());
-
- // Make sure that the document isn't retrievable.
- GetResultProto get_result =
- icing.Get("namespace", "uri1", GetResultSpecProto::default_instance());
- ASSERT_THAT(get_result.status(), ProtoStatusIs(StatusProto::NOT_FOUND));
-}
-
-TEST_F(IcingSearchEngineTest, SearchSuggestionsTest) {
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
- ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
- ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
- ProtoIsOk());
-
- // Creates and inserts 6 documents, indexing 6 occurrences of termSix, 5 of
- // termFive, 4 of termFour, 3 of termThree, 2 of termTwo, and 1 of termOne.
- DocumentProto document1 =
- DocumentBuilder()
- .SetKey("namespace", "uri1")
- .SetSchema("Email")
- .SetCreationTimestampMs(10)
- .AddStringProperty(
- "subject", "termOne termTwo termThree termFour termFive termSix")
- .Build();
- DocumentProto document2 =
- DocumentBuilder()
- .SetKey("namespace", "uri2")
- .SetSchema("Email")
- .SetCreationTimestampMs(10)
- .AddStringProperty("subject",
- "termTwo termThree termFour termFive termSix")
- .Build();
- DocumentProto document3 =
- DocumentBuilder()
- .SetKey("namespace", "uri3")
- .SetSchema("Email")
- .SetCreationTimestampMs(10)
- .AddStringProperty("subject", "termThree termFour termFive termSix")
- .Build();
- DocumentProto document4 =
- DocumentBuilder()
- .SetKey("namespace", "uri4")
- .SetSchema("Email")
- .SetCreationTimestampMs(10)
- .AddStringProperty("subject", "termFour termFive termSix")
- .Build();
- DocumentProto document5 =
- DocumentBuilder()
- .SetKey("namespace", "uri5")
- .SetSchema("Email")
- .SetCreationTimestampMs(10)
- .AddStringProperty("subject", "termFive termSix")
- .Build();
- DocumentProto document6 = DocumentBuilder()
- .SetKey("namespace", "uri6")
- .SetSchema("Email")
- .SetCreationTimestampMs(10)
- .AddStringProperty("subject", "termSix")
- .Build();
- ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(document4).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(document5).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(document6).status(), ProtoIsOk());
-
- SuggestionSpecProto suggestion_spec;
- suggestion_spec.set_prefix("t");
- suggestion_spec.set_num_to_return(10);
- suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
- TermMatchType::PREFIX);
-
- // Query all suggestions, and they will be ranked.
- SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
- ASSERT_THAT(response.status(), ProtoIsOk());
- ASSERT_THAT(response.suggestions().at(0).query(), "termsix");
- ASSERT_THAT(response.suggestions().at(1).query(), "termfive");
- ASSERT_THAT(response.suggestions().at(2).query(), "termfour");
- ASSERT_THAT(response.suggestions().at(3).query(), "termthree");
- ASSERT_THAT(response.suggestions().at(4).query(), "termtwo");
- ASSERT_THAT(response.suggestions().at(5).query(), "termone");
-
- // Query first three suggestions, and they will be ranked.
- suggestion_spec.set_num_to_return(3);
- response = icing.SearchSuggestions(suggestion_spec);
- ASSERT_THAT(response.status(), ProtoIsOk());
- ASSERT_THAT(response.suggestions().at(0).query(), "termsix");
- ASSERT_THAT(response.suggestions().at(1).query(), "termfive");
- ASSERT_THAT(response.suggestions().at(2).query(), "termfour");
-}
-
-TEST_F(IcingSearchEngineTest,
- SearchSuggestionsTest_ShouldReturnInOneNamespace) {
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
- ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
- ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
- ProtoIsOk());
-
- DocumentProto document1 = DocumentBuilder()
- .SetKey("namespace1", "uri1")
- .SetSchema("Email")
- .SetCreationTimestampMs(10)
- .AddStringProperty("subject", "foo fool")
- .Build();
- DocumentProto document2 = DocumentBuilder()
- .SetKey("namespace2", "uri2")
- .SetSchema("Email")
- .SetCreationTimestampMs(10)
- .AddStringProperty("subject", "fool")
- .Build();
- ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
-
- SuggestionResponse::Suggestion suggestionFoo;
- suggestionFoo.set_query("foo");
- SuggestionResponse::Suggestion suggestionFool;
- suggestionFool.set_query("fool");
-
- // namespace1 has 2 results.
- SuggestionSpecProto suggestion_spec;
- suggestion_spec.set_prefix("f");
- suggestion_spec.add_namespace_filters("namespace1");
- suggestion_spec.set_num_to_return(10);
- suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
- TermMatchType::PREFIX);
-
- SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
- ASSERT_THAT(response.status(), ProtoIsOk());
- ASSERT_THAT(response.suggestions(),
- UnorderedElementsAre(EqualsProto(suggestionFoo),
- EqualsProto(suggestionFool)));
-}
-
-TEST_F(IcingSearchEngineTest,
- SearchSuggestionsTest_ShouldReturnInMultipleNamespace) {
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
- ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
- ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
- ProtoIsOk());
-
- DocumentProto document1 = DocumentBuilder()
- .SetKey("namespace1", "uri1")
- .SetSchema("Email")
- .SetCreationTimestampMs(10)
- .AddStringProperty("subject", "fo")
- .Build();
- DocumentProto document2 = DocumentBuilder()
- .SetKey("namespace2", "uri2")
- .SetSchema("Email")
- .SetCreationTimestampMs(10)
- .AddStringProperty("subject", "foo")
- .Build();
- DocumentProto document3 = DocumentBuilder()
- .SetKey("namespace3", "uri3")
- .SetSchema("Email")
- .SetCreationTimestampMs(10)
- .AddStringProperty("subject", "fool")
- .Build();
- ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
-
- SuggestionResponse::Suggestion suggestionFoo;
- suggestionFoo.set_query("foo");
- SuggestionResponse::Suggestion suggestionFool;
- suggestionFool.set_query("fool");
-
- // namespace2 and namespace3 have 2 results.
- SuggestionSpecProto suggestion_spec;
- suggestion_spec.set_prefix("f");
- suggestion_spec.add_namespace_filters("namespace2");
- suggestion_spec.add_namespace_filters("namespace3");
- suggestion_spec.set_num_to_return(10);
- suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
- TermMatchType::PREFIX);
-
- SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
- ASSERT_THAT(response.status(), ProtoIsOk());
- ASSERT_THAT(response.suggestions(),
- UnorderedElementsAre(EqualsProto(suggestionFoo),
- EqualsProto(suggestionFool)));
-}
-
-TEST_F(IcingSearchEngineTest,
- SearchSuggestionsTest_OtherNamespaceDontContributeToHitCount) {
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
- ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
- ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
- ProtoIsOk());
-
- // Index 4 documents:
- // namespace1 has 2 hits for termone;
- // namespace2 has 2 hits for termtwo and 1 hit for termone.
- DocumentProto document1 = DocumentBuilder()
- .SetKey("namespace1", "uri1")
- .SetSchema("Email")
- .SetCreationTimestampMs(10)
- .AddStringProperty("subject", "termone")
- .Build();
- DocumentProto document2 = DocumentBuilder()
- .SetKey("namespace1", "uri2")
- .SetSchema("Email")
- .SetCreationTimestampMs(10)
- .AddStringProperty("subject", "termone")
- .Build();
- DocumentProto document3 = DocumentBuilder()
- .SetKey("namespace2", "uri2")
- .SetSchema("Email")
- .SetCreationTimestampMs(10)
- .AddStringProperty("subject", "termone termtwo")
- .Build();
- DocumentProto document4 = DocumentBuilder()
- .SetKey("namespace2", "uri3")
- .SetSchema("Email")
- .SetCreationTimestampMs(10)
- .AddStringProperty("subject", "termtwo")
- .Build();
- ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(document4).status(), ProtoIsOk());
-
- SuggestionResponse::Suggestion suggestionTermOne;
- suggestionTermOne.set_query("termone");
- SuggestionResponse::Suggestion suggestionTermTwo;
- suggestionTermTwo.set_query("termtwo");
-
- // Only search suggestions for namespace2. The correct order should be
- // {"termtwo", "termone"}. If we didn't filter out namespace1 when
- // calculating our score, it would be {"termone", "termtwo"}.
- SuggestionSpecProto suggestion_spec;
- suggestion_spec.set_prefix("t");
- suggestion_spec.add_namespace_filters("namespace2");
- suggestion_spec.set_num_to_return(10);
- suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
- TermMatchType::PREFIX);
-
- SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
- ASSERT_THAT(response.status(), ProtoIsOk());
- ASSERT_THAT(response.suggestions(),
- ElementsAre(EqualsProto(suggestionTermTwo),
- EqualsProto(suggestionTermOne)));
-}
-
-TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_DeletionTest) {
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
- ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
- ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
- ProtoIsOk());
-
- DocumentProto document1 = DocumentBuilder()
- .SetKey("namespace1", "uri1")
- .SetSchema("Email")
- .SetCreationTimestampMs(10)
- .AddStringProperty("subject", "fool")
- .Build();
- DocumentProto document2 = DocumentBuilder()
- .SetKey("namespace2", "uri2")
- .SetSchema("Email")
- .SetCreationTimestampMs(10)
- .AddStringProperty("subject", "fool")
- .Build();
- ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
-
- SuggestionResponse::Suggestion suggestionFool;
- suggestionFool.set_query("fool");
-
- // namespace1 has this suggestion
- SuggestionSpecProto suggestion_spec;
- suggestion_spec.set_prefix("f");
- suggestion_spec.add_namespace_filters("namespace1");
- suggestion_spec.set_num_to_return(10);
- suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
- TermMatchType::PREFIX);
-
- SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
- ASSERT_THAT(response.status(), ProtoIsOk());
- ASSERT_THAT(response.suggestions(),
- UnorderedElementsAre(EqualsProto(suggestionFool)));
-
- // namespace2 has this suggestion
- suggestion_spec.clear_namespace_filters();
- suggestion_spec.add_namespace_filters("namespace2");
- response = icing.SearchSuggestions(suggestion_spec);
- ASSERT_THAT(response.status(), ProtoIsOk());
- ASSERT_THAT(response.suggestions(),
- UnorderedElementsAre(EqualsProto(suggestionFool)));
-
- // delete document from namespace 1
- EXPECT_THAT(icing.Delete("namespace1", "uri1").status(), ProtoIsOk());
-
- // Now namespace1 will return empty
- suggestion_spec.clear_namespace_filters();
- suggestion_spec.add_namespace_filters("namespace1");
- response = icing.SearchSuggestions(suggestion_spec);
- ASSERT_THAT(response.status(), ProtoIsOk());
- ASSERT_THAT(response.suggestions(), IsEmpty());
-
- // namespace2 still has this suggestion, which proves that namespace1 can't
- // find it because we filtered it out, not because it doesn't exist.
- suggestion_spec.add_namespace_filters("namespace2");
- response = icing.SearchSuggestions(suggestion_spec);
- ASSERT_THAT(response.status(), ProtoIsOk());
- ASSERT_THAT(response.suggestions(),
- UnorderedElementsAre(EqualsProto(suggestionFool)));
-}
-
-TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_ExpiredTest) {
- DocumentProto document1 = DocumentBuilder()
- .SetKey("namespace1", "uri1")
- .SetSchema("Email")
- .SetCreationTimestampMs(100)
- .SetTtlMs(500)
- .AddStringProperty("subject", "fool")
- .Build();
- DocumentProto document2 = DocumentBuilder()
- .SetKey("namespace2", "uri2")
- .SetSchema("Email")
- .SetCreationTimestampMs(100)
- .SetTtlMs(1000)
- .AddStringProperty("subject", "fool")
- .Build();
- {
- auto fake_clock = std::make_unique<FakeClock>();
- fake_clock->SetSystemTimeMilliseconds(400);
-
- TestIcingSearchEngine icing(GetDefaultIcingOptions(),
- std::make_unique<Filesystem>(),
- std::make_unique<IcingFilesystem>(),
- std::move(fake_clock), GetTestJniCache());
- EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
- ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
- ProtoIsOk());
-
- ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
- ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
-
- SuggestionResponse::Suggestion suggestionFool;
- suggestionFool.set_query("fool");
-
- // namespace1 has this suggestion
- SuggestionSpecProto suggestion_spec;
- suggestion_spec.set_prefix("f");
- suggestion_spec.add_namespace_filters("namespace1");
- suggestion_spec.set_num_to_return(10);
- suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
- TermMatchType::PREFIX);
-
- SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
- ASSERT_THAT(response.status(), ProtoIsOk());
- ASSERT_THAT(response.suggestions(),
- UnorderedElementsAre(EqualsProto(suggestionFool)));
-
- // namespace2 has this suggestion
- suggestion_spec.clear_namespace_filters();
- suggestion_spec.add_namespace_filters("namespace2");
- response = icing.SearchSuggestions(suggestion_spec);
- ASSERT_THAT(response.status(), ProtoIsOk());
- ASSERT_THAT(response.suggestions(),
- UnorderedElementsAre(EqualsProto(suggestionFool)));
- }
- // We reinitialize here so we can feed in a fake clock with a later time.
- {
- // Time needs to be past document1 creation time (100) + ttl (500) for it
- // to count as "expired". document2 is not expired since its ttl is 1000.
- auto fake_clock = std::make_unique<FakeClock>();
- fake_clock->SetSystemTimeMilliseconds(800);
-
- TestIcingSearchEngine icing(GetDefaultIcingOptions(),
- std::make_unique<Filesystem>(),
- std::make_unique<IcingFilesystem>(),
- std::move(fake_clock), GetTestJniCache());
- ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
-
- SuggestionSpecProto suggestion_spec;
- suggestion_spec.set_prefix("f");
- suggestion_spec.add_namespace_filters("namespace1");
- suggestion_spec.set_num_to_return(10);
- suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
- TermMatchType::PREFIX);
-
- // Now namespace1 will return empty
- suggestion_spec.clear_namespace_filters();
- suggestion_spec.add_namespace_filters("namespace1");
- SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
- ASSERT_THAT(response.status(), ProtoIsOk());
- ASSERT_THAT(response.suggestions(), IsEmpty());
-
- // namespace2 still has this suggestion
- SuggestionResponse::Suggestion suggestionFool;
- suggestionFool.set_query("fool");
-
- suggestion_spec.add_namespace_filters("namespace2");
- response = icing.SearchSuggestions(suggestion_spec);
- ASSERT_THAT(response.status(), ProtoIsOk());
- ASSERT_THAT(response.suggestions(),
- UnorderedElementsAre(EqualsProto(suggestionFool)));
- }
-}
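// The expiry checks above reduce to creation time + TTL compared against the
// fake clock's "now":
//   document1: 100 + 500  = 600  <= 800  -> expired, filtered out.
//   document2: 100 + 1000 = 1100 >  800  -> still returned.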
-
-TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_emptyPrefix) {
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
- ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
-
- SuggestionSpecProto suggestion_spec;
- suggestion_spec.set_prefix("");
- suggestion_spec.set_num_to_return(10);
- suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
- TermMatchType::PREFIX);
-
- ASSERT_THAT(icing.SearchSuggestions(suggestion_spec).status(),
- ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
-}
-
-TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_NonPositiveNumToReturn) {
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
- ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
-
- SuggestionSpecProto suggestion_spec;
- suggestion_spec.set_prefix("prefix");
- suggestion_spec.set_num_to_return(0);
- suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
- TermMatchType::PREFIX);
-
- ASSERT_THAT(icing.SearchSuggestions(suggestion_spec).status(),
- ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
-}
-
-#ifndef ICING_JNI_TEST
// We skip this test case when we're running in a jni_test since the data files
// will be stored in the android-instrumented storage location, rather than the
// normal cc_library runfiles directory. To get that storage location, it's
@@ -8690,6 +7596,12 @@ TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_NonPositiveNumToReturn) {
// this native side yet, we're just going to disable this. The functionality is
// already well-tested across 4 different emulated OS's so we're not losing much
// test coverage here.
+#ifndef ICING_JNI_TEST
+// Disable backwards compat test. This test is enabled in google3, but disabled
+// in jetpack/framework because we didn't want to keep the binary testdata files
+// in our repo.
+#define DISABLE_BACKWARDS_COMPAT_TEST
+#ifndef DISABLE_BACKWARDS_COMPAT_TEST
TEST_F(IcingSearchEngineTest, MigrateToPortableFileBackedProtoLog) {
// Copy the testdata files into our IcingSearchEngine directory
std::string dir_without_portable_log;
@@ -8729,7 +7641,7 @@ TEST_F(IcingSearchEngineTest, MigrateToPortableFileBackedProtoLog) {
EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
Eq(InitializeStatsProto::NO_DATA_LOSS));
EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
- Eq(InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT));
+ Eq(InitializeStatsProto::NONE));
EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
Eq(InitializeStatsProto::NONE));
EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
@@ -8843,6 +7755,7 @@ TEST_F(IcingSearchEngineTest, MigrateToPortableFileBackedProtoLog) {
EXPECT_THAT(actual_results,
EqualsSearchResultIgnoreStatsAndScores(expected_document3));
}
+#endif // DISABLE_BACKWARDS_COMPAT_TEST
#endif // !ICING_JNI_TEST
} // namespace
diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc
index 207c033..6d8632f 100644
--- a/icing/index/index-processor.cc
+++ b/icing/index/index-processor.cc
@@ -43,13 +43,14 @@ namespace lib {
libtextclassifier3::StatusOr<std::unique_ptr<IndexProcessor>>
IndexProcessor::Create(const Normalizer* normalizer, Index* index,
+ const IndexProcessor::Options& options,
const Clock* clock) {
ICING_RETURN_ERROR_IF_NULL(normalizer);
ICING_RETURN_ERROR_IF_NULL(index);
ICING_RETURN_ERROR_IF_NULL(clock);
return std::unique_ptr<IndexProcessor>(
- new IndexProcessor(normalizer, index, clock));
+ new IndexProcessor(normalizer, index, options, clock));
}
libtextclassifier3::Status IndexProcessor::IndexDocument(
@@ -65,48 +66,53 @@ libtextclassifier3::Status IndexProcessor::IndexDocument(
}
index_->set_last_added_document_id(document_id);
uint32_t num_tokens = 0;
- libtextclassifier3::Status status;
+ libtextclassifier3::Status overall_status;
for (const TokenizedSection& section : tokenized_document.sections()) {
// TODO(b/152934343): pass real namespace ids in
Index::Editor editor =
index_->Edit(document_id, section.metadata.id,
section.metadata.term_match_type, /*namespace_id=*/0);
for (std::string_view token : section.token_sequence) {
- ++num_tokens;
-
- switch (section.metadata.tokenizer) {
- case StringIndexingConfig::TokenizerType::VERBATIM:
- // data() is safe to use here because a token created from the
- // VERBATIM tokenizer is the entire string value. The character at
- // data() + token.length() is guaranteed to be a null char.
- status = editor.BufferTerm(token.data());
- break;
- case StringIndexingConfig::TokenizerType::NONE:
- ICING_LOG(WARNING)
- << "Unexpected TokenizerType::NONE found when indexing document.";
- [[fallthrough]];
- case StringIndexingConfig::TokenizerType::PLAIN:
- std::string normalized_term = normalizer_.NormalizeTerm(token);
- status = editor.BufferTerm(normalized_term.c_str());
+ if (++num_tokens > options_.max_tokens_per_document) {
+ // Index all tokens buffered so far.
+ editor.IndexAllBufferedTerms();
+ if (put_document_stats != nullptr) {
+ put_document_stats->mutable_tokenization_stats()
+ ->set_exceeded_max_token_num(true);
+ put_document_stats->mutable_tokenization_stats()
+ ->set_num_tokens_indexed(options_.max_tokens_per_document);
+ }
+ switch (options_.token_limit_behavior) {
+ case Options::TokenLimitBehavior::kReturnError:
+ return absl_ports::ResourceExhaustedError(
+ "Max number of tokens reached!");
+ case Options::TokenLimitBehavior::kSuppressError:
+ return overall_status;
+ }
}
-
- if (!status.ok()) {
- // We've encountered a failure. Bail out. We'll mark this doc as deleted
- // and signal a failure to the client.
- ICING_LOG(WARNING) << "Failed to buffer term in lite lexicon due to: "
- << status.error_message();
- break;
+ std::string term = normalizer_.NormalizeTerm(token);
+ // Add this term to the hit buffer. Even if adding this hit fails, we keep
+ // trying to add more hits, because it's possible that future hits could
+ // still be added successfully. For instance, if the lexicon is full, we
+ // might fail to add a hit for a new term but should still be able to add
+ // hits for terms that are already in the index.
+ auto status = editor.BufferTerm(term.c_str());
+ if (overall_status.ok() && !status.ok()) {
+ // If we've succeeded in adding everything so far, set overall_status to
+ // represent this new failure. If we've already failed, there's no need to
+ // update the status; we're already going to return a resource-exhausted
+ // error.
+ overall_status = status;
}
}
- if (!status.ok()) {
- break;
- }
// Add all the seen terms to the index with their term frequency.
- status = editor.IndexAllBufferedTerms();
- if (!status.ok()) {
- ICING_LOG(WARNING) << "Failed to add hits in lite index due to: "
- << status.error_message();
- break;
+ auto status = editor.IndexAllBufferedTerms();
+ if (overall_status.ok() && !status.ok()) {
+ // If we've succeeded so far, set overall_status to represent this new
+ // failure. If we've already failed, there's no need to update the status;
+ // we're already going to return a resource-exhausted error.
+ overall_status = status;
}
}
@@ -117,11 +123,9 @@ libtextclassifier3::Status IndexProcessor::IndexDocument(
num_tokens);
}
- // If we're either successful or we've hit resource exhausted, then attempt a
- // merge.
- if ((status.ok() || absl_ports::IsResourceExhausted(status)) &&
- index_->WantsMerge()) {
- ICING_LOG(ERROR) << "Merging the index at docid " << document_id << ".";
+ // Merge if necessary.
+ if (overall_status.ok() && index_->WantsMerge()) {
+ ICING_VLOG(1) << "Merging the index at docid " << document_id << ".";
std::unique_ptr<Timer> merge_timer = clock_.GetNewTimer();
libtextclassifier3::Status merge_status = index_->Merge();
@@ -146,7 +150,7 @@ libtextclassifier3::Status IndexProcessor::IndexDocument(
}
}
- return status;
+ return overall_status;
}
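// The error handling introduced above is a first-failure-wins accumulation:
// record the first error but keep indexing, so one failed hit doesn't drop
// terms that could still be added. Distilled as a sketch (`steps` and `Run`
// are hypothetical stand-ins, not part of the patch):
//   libtextclassifier3::Status overall_status;  // default-constructed as OK
//   for (const auto& step : steps) {
//     libtextclassifier3::Status status = step.Run();
//     if (overall_status.ok() && !status.ok()) {
//       overall_status = status;  // remember only the first failure
//     }
//   }
//   return overall_status;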
} // namespace lib
diff --git a/icing/index/index-processor.h b/icing/index/index-processor.h
index 269e41c..6b07c98 100644
--- a/icing/index/index-processor.h
+++ b/icing/index/index-processor.h
@@ -32,6 +32,23 @@ namespace lib {
class IndexProcessor {
public:
+ struct Options {
+ int32_t max_tokens_per_document;
+
+ // Indicates how a document exceeding max_tokens_per_document should be
+ // handled.
+ enum class TokenLimitBehavior {
+ // When set, the first max_tokens_per_document tokens will be indexed. If the
+ // token count exceeds max_tokens_per_document, a ResourceExhausted error
+ // will be returned.
+ kReturnError,
+ // When set, the first max_tokens_per_document tokens will be indexed. If the
+ // token count exceeds max_tokens_per_document, OK will be returned.
+ kSuppressError,
+ };
+ TokenLimitBehavior token_limit_behavior;
+ };
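// For illustration (values are arbitrary; a sketch, not part of this change):
//   IndexProcessor::Options options;
//   options.max_tokens_per_document = 10000;
//   options.token_limit_behavior =
//       IndexProcessor::Options::TokenLimitBehavior::kSuppressError;
//   auto processor = IndexProcessor::Create(normalizer, index, options,
//                                           clock).ValueOrDie();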
+
// Factory function to create an IndexProcessor which does not take ownership
// of any input components, and all pointers must refer to valid objects that
// outlive the created IndexProcessor instance.
@@ -40,7 +57,8 @@ class IndexProcessor {
// An IndexProcessor on success
// FAILED_PRECONDITION if any of the pointers is null.
static libtextclassifier3::StatusOr<std::unique_ptr<IndexProcessor>> Create(
- const Normalizer* normalizer, Index* index, const Clock* clock);
+ const Normalizer* normalizer, Index* index, const Options& options,
+ const Clock* clock);
// Add tokenized document to the index, associated with document_id. If the
// number of tokens in the document exceeds max_tokens_per_document, then only
@@ -66,11 +84,18 @@ class IndexProcessor {
PutDocumentStatsProto* put_document_stats = nullptr);
private:
- IndexProcessor(const Normalizer* normalizer, Index* index, const Clock* clock)
- : normalizer_(*normalizer), index_(index), clock_(*clock) {}
+ IndexProcessor(const Normalizer* normalizer, Index* index,
+ const Options& options, const Clock* clock)
+ : normalizer_(*normalizer),
+ index_(index),
+ options_(options),
+ clock_(*clock) {}
+
+ std::string NormalizeToken(const Token& token);
const Normalizer& normalizer_;
Index* const index_;
+ const Options options_;
const Clock& clock_;
};
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
index 1aad7d0..afeac4d 100644
--- a/icing/index/index-processor_benchmark.cc
+++ b/icing/index/index-processor_benchmark.cc
@@ -16,6 +16,7 @@
#include "gmock/gmock.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/index-processor.h"
#include "icing/index/index.h"
#include "icing/legacy/core/icing-string-util.h"
@@ -23,7 +24,6 @@
#include "icing/schema/schema-util.h"
#include "icing/schema/section-manager.h"
#include "icing/testing/common-matchers.h"
-#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
@@ -168,6 +168,17 @@ void CleanUp(const Filesystem& filesystem, const std::string& index_dir) {
filesystem.DeleteDirectoryRecursively(index_dir.c_str());
}
+std::unique_ptr<IndexProcessor> CreateIndexProcessor(
+ const Normalizer* normalizer, Index* index, const Clock* clock) {
+ IndexProcessor::Options processor_options{};
+ processor_options.max_tokens_per_document = 1024 * 1024 * 10;
+ processor_options.token_limit_behavior =
+ IndexProcessor::Options::TokenLimitBehavior::kReturnError;
+
+ return IndexProcessor::Create(normalizer, index, processor_options, clock)
+ .ValueOrDie();
+}
+
void BM_IndexDocumentWithOneProperty(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
@@ -189,9 +200,9 @@ void BM_IndexDocumentWithOneProperty(benchmark::State& state) {
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
Clock clock;
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock);
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<IndexProcessor> index_processor,
- IndexProcessor::Create(normalizer.get(), index.get(), &clock));
+ std::unique_ptr<IndexProcessor> index_processor =
+ CreateIndexProcessor(normalizer.get(), index.get(), &clock);
+
DocumentProto input_document = CreateDocumentWithOneProperty(state.range(0));
TokenizedDocument tokenized_document(std::move(
TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
@@ -243,9 +254,8 @@ void BM_IndexDocumentWithTenProperties(benchmark::State& state) {
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
Clock clock;
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock);
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<IndexProcessor> index_processor,
- IndexProcessor::Create(normalizer.get(), index.get(), &clock));
+ std::unique_ptr<IndexProcessor> index_processor =
+ CreateIndexProcessor(normalizer.get(), index.get(), &clock);
DocumentProto input_document =
CreateDocumentWithTenProperties(state.range(0));
@@ -299,9 +309,8 @@ void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) {
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
Clock clock;
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock);
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<IndexProcessor> index_processor,
- IndexProcessor::Create(normalizer.get(), index.get(), &clock));
+ std::unique_ptr<IndexProcessor> index_processor =
+ CreateIndexProcessor(normalizer.get(), index.get(), &clock);
DocumentProto input_document =
CreateDocumentWithDiacriticLetters(state.range(0));
@@ -355,9 +364,8 @@ void BM_IndexDocumentWithHiragana(benchmark::State& state) {
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
Clock clock;
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock);
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<IndexProcessor> index_processor,
- IndexProcessor::Create(normalizer.get(), index.get(), &clock));
+ std::unique_ptr<IndexProcessor> index_processor =
+ CreateIndexProcessor(normalizer.get(), index.get(), &clock);
DocumentProto input_document = CreateDocumentWithHiragana(state.range(0));
TokenizedDocument tokenized_document(std::move(
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index 7746688..8a6a9f5 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -27,9 +27,9 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/str_cat.h"
-#include "icing/absl_ports/str_join.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/index.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
@@ -48,8 +48,6 @@
#include "icing/store/document-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
-#include "icing/testing/icu-data-file-helper.h"
-#include "icing/testing/random-string.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
@@ -90,8 +88,6 @@ constexpr std::string_view kRepeatedProperty = "repeated";
constexpr std::string_view kSubProperty = "submessage";
constexpr std::string_view kNestedType = "NestedType";
constexpr std::string_view kNestedProperty = "nested";
-constexpr std::string_view kExactVerbatimProperty = "verbatimExact";
-constexpr std::string_view kPrefixedVerbatimProperty = "verbatimPrefixed";
constexpr DocumentId kDocumentId0 = 0;
constexpr DocumentId kDocumentId1 = 1;
@@ -100,8 +96,6 @@ constexpr SectionId kExactSectionId = 0;
constexpr SectionId kPrefixedSectionId = 1;
constexpr SectionId kRepeatedSectionId = 2;
constexpr SectionId kNestedSectionId = 3;
-constexpr SectionId kExactVerbatimSectionId = 4;
-constexpr SectionId kPrefixedVerbatimSectionId = 5;
using Cardinality = PropertyConfigProto::Cardinality;
using DataType = PropertyConfigProto::DataType;
@@ -110,23 +104,21 @@ using ::testing::Eq;
using ::testing::IsEmpty;
using ::testing::Test;
-constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
- PropertyConfigProto::DataType::STRING;
-constexpr PropertyConfigProto::DataType::Code TYPE_BYTES =
- PropertyConfigProto::DataType::BYTES;
+constexpr PropertyConfigProto_DataType_Code TYPE_STRING =
+ PropertyConfigProto_DataType_Code_STRING;
+constexpr PropertyConfigProto_DataType_Code TYPE_BYTES =
+ PropertyConfigProto_DataType_Code_BYTES;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
- PropertyConfigProto::Cardinality::REPEATED;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto_Cardinality_Code_OPTIONAL;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED =
+ PropertyConfigProto_Cardinality_Code_REPEATED;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM =
- StringIndexingConfig::TokenizerType::VERBATIM;
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
+ StringIndexingConfig_TokenizerType_Code_PLAIN;
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
+constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
+constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
class IndexProcessorTest : public Test {
protected:
@@ -153,12 +145,9 @@ class IndexProcessorTest : public Test {
normalizer_factory::Create(
/*max_term_byte_size=*/std::numeric_limits<int32_t>::max()));
- std::string schema_store_dir = GetTestTempDir() + "/schema_store";
- ASSERT_TRUE(
- filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()));
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
+ SchemaStore::Create(&filesystem_, GetTestTempDir(), &fake_clock_));
SchemaProto schema =
SchemaBuilder()
.AddType(
@@ -189,16 +178,6 @@ class IndexProcessorTest : public Test {
.SetCardinality(CARDINALITY_REPEATED))
.AddProperty(
PropertyConfigBuilder()
- .SetName(kExactVerbatimProperty)
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_VERBATIM)
- .SetCardinality(CARDINALITY_REPEATED))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName(kPrefixedVerbatimProperty)
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_VERBATIM)
- .SetCardinality(CARDINALITY_REPEATED))
- .AddProperty(
- PropertyConfigBuilder()
.SetName(kSubProperty)
.SetDataTypeDocument(
kNestedType, /*index_nested_properties=*/true)
@@ -214,9 +193,15 @@ class IndexProcessorTest : public Test {
.Build();
ICING_ASSERT_OK(schema_store_->SetSchema(schema));
+ IndexProcessor::Options processor_options;
+ processor_options.max_tokens_per_document = 1000;
+ processor_options.token_limit_behavior =
+ IndexProcessor::Options::TokenLimitBehavior::kReturnError;
+
ICING_ASSERT_OK_AND_ASSIGN(
index_processor_,
- IndexProcessor::Create(normalizer_.get(), index_.get(), &fake_clock_));
+ IndexProcessor::Create(normalizer_.get(), index_.get(),
+ processor_options, &fake_clock_));
mock_icing_filesystem_ = std::make_unique<IcingMockFilesystem>();
}
@@ -247,12 +232,17 @@ std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
}
TEST_F(IndexProcessorTest, CreationWithNullPointerShouldFail) {
+ IndexProcessor::Options processor_options;
+ processor_options.max_tokens_per_document = 1000;
+ processor_options.token_limit_behavior =
+ IndexProcessor::Options::TokenLimitBehavior::kReturnError;
+
EXPECT_THAT(IndexProcessor::Create(/*normalizer=*/nullptr, index_.get(),
- &fake_clock_),
+ processor_options, &fake_clock_),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
EXPECT_THAT(IndexProcessor::Create(normalizer_.get(), /*index=*/nullptr,
- &fake_clock_),
+ processor_options, &fake_clock_),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
@@ -444,68 +434,103 @@ TEST_F(IndexProcessorTest, DocWithRepeatedProperty) {
kDocumentId0, std::vector<SectionId>{kRepeatedSectionId})));
}
-// TODO(b/196771754) This test is disabled on Android because it takes too long
-// to generate all of the unique terms and the test times out. Try storing these
-// unique terms in a file that the test can read from.
-#ifndef __ANDROID__
+TEST_F(IndexProcessorTest, TooManyTokensReturnError) {
+ // Only allow the first four tokens ("hello", "world", "good", "night") to be
+ // indexed.
+ IndexProcessor::Options options;
+ options.max_tokens_per_document = 4;
+ options.token_limit_behavior =
+ IndexProcessor::Options::TokenLimitBehavior::kReturnError;
-TEST_F(IndexProcessorTest, HitBufferExhaustedTest) {
- // Testing has shown that adding ~600,000 hits will fill up the hit buffer.
- std::vector<std::string> unique_terms_ = GenerateUniqueTerms(200000);
- std::string content = absl_ports::StrJoin(unique_terms_, " ");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index_processor_, IndexProcessor::Create(normalizer_.get(), index_.get(),
+ options, &fake_clock_));
DocumentProto document =
DocumentBuilder()
.SetKey("icing", "fake_type/1")
.SetSchema(std::string(kFakeType))
- .AddStringProperty(std::string(kExactProperty), content)
- .AddStringProperty(std::string(kPrefixedProperty), content)
- .AddStringProperty(std::string(kRepeatedProperty), content)
+ .AddStringProperty(std::string(kExactProperty), "hello world")
+ .AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
.Build();
ICING_ASSERT_OK_AND_ASSIGN(
TokenizedDocument tokenized_document,
TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
document));
EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
- StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED,
- testing::HasSubstr("Hit buffer is full!")));
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ // "night" should have been indexed.
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("night", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kPrefixedSectionId})));
+
+ // "moon" should not have been.
+ ICING_ASSERT_OK_AND_ASSIGN(itr,
+ index_->GetIterator("moon", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
}
-TEST_F(IndexProcessorTest, LexiconExhaustedTest) {
- // Testing has shown that adding ~300,000 terms generated this way will
- // fill up the lexicon.
- std::vector<std::string> unique_terms_ = GenerateUniqueTerms(300000);
- std::string content = absl_ports::StrJoin(unique_terms_, " ");
+TEST_F(IndexProcessorTest, TooManyTokensSuppressError) {
+ // Only allow the first four tokens ("hello", "world", "good", "night") to be
+ // indexed.
+ IndexProcessor::Options options;
+ options.max_tokens_per_document = 4;
+ options.token_limit_behavior =
+ IndexProcessor::Options::TokenLimitBehavior::kSuppressError;
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index_processor_, IndexProcessor::Create(normalizer_.get(), index_.get(),
+ options, &fake_clock_));
DocumentProto document =
DocumentBuilder()
.SetKey("icing", "fake_type/1")
.SetSchema(std::string(kFakeType))
- .AddStringProperty(std::string(kExactProperty), content)
+ .AddStringProperty(std::string(kExactProperty), "hello world")
+ .AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
.Build();
ICING_ASSERT_OK_AND_ASSIGN(
TokenizedDocument tokenized_document,
TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
document));
EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
- StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED,
- testing::HasSubstr("Unable to add term")));
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
-}
-#endif // __ANDROID__
+ // "night" should have been indexed.
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("night", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kPrefixedSectionId})));
+
+ // "moon" should not have been.
+ ICING_ASSERT_OK_AND_ASSIGN(itr,
+ index_->GetIterator("moon", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+}
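
The two tests above pin down the new token-limit contract: with max_tokens_per_document = 4, only "hello", "world", "good", and "night" are indexed, and the two behaviors differ only in whether the overflow is reported. The processor's internals are not part of this diff, so the following is a hypothetical sketch of how such a limit could be applied; the Options struct, TokenLimitBehavior enum, and ApplyTokenLimit helper are illustrative stand-ins, not the real IndexProcessor code.

    #include <string>
    #include <vector>

    // Illustrative stand-ins for the real IndexProcessor types.
    enum class TokenLimitBehavior { kReturnError, kSuppressError };

    struct Options {
      int max_tokens_per_document = 1000;
      TokenLimitBehavior token_limit_behavior = TokenLimitBehavior::kReturnError;
    };

    // Truncates tokens to the configured limit. Returns false only when the
    // limit was exceeded and kReturnError is configured; in both behaviors
    // the first max_tokens_per_document tokens are kept and indexed.
    bool ApplyTokenLimit(const Options& options,
                         std::vector<std::string>* tokens) {
      if (tokens->size() <=
          static_cast<size_t>(options.max_tokens_per_document)) {
        return true;  // Under the limit; nothing to truncate.
      }
      tokens->resize(options.max_tokens_per_document);
      return options.token_limit_behavior == TokenLimitBehavior::kSuppressError;
    }

Under this reading, both tests index the same four tokens; kReturnError additionally surfaces RESOURCE_EXHAUSTED to the caller, which is exactly what the expectations above assert.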
TEST_F(IndexProcessorTest, TooLongTokens) {
  // Only allow tokens of length four, truncating "hello", "world" and
// "night".
+ IndexProcessor::Options options;
+ options.max_tokens_per_document = 1000;
+
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Normalizer> normalizer,
normalizer_factory::Create(
/*max_term_byte_size=*/4));
ICING_ASSERT_OK_AND_ASSIGN(
- index_processor_,
- IndexProcessor::Create(normalizer.get(), index_.get(), &fake_clock_));
+ index_processor_, IndexProcessor::Create(normalizer.get(), index_.get(),
+ options, &fake_clock_));
DocumentProto document =
DocumentBuilder()
@@ -667,6 +692,16 @@ TEST_F(IndexProcessorTest, NonAsciiIndexing) {
lang_segmenter_,
language_segmenter_factory::Create(std::move(segmenter_options)));
+ IndexProcessor::Options processor_options;
+ processor_options.max_tokens_per_document = 1000;
+ processor_options.token_limit_behavior =
+ IndexProcessor::Options::TokenLimitBehavior::kReturnError;
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index_processor_,
+ IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options,
+ &fake_clock_));
+
DocumentProto document =
DocumentBuilder()
.SetKey("icing", "fake_type/1")
@@ -692,13 +727,23 @@ TEST_F(IndexProcessorTest, NonAsciiIndexing) {
TEST_F(IndexProcessorTest,
LexiconFullIndexesSmallerTokensReturnsResourceExhausted) {
+ IndexProcessor::Options processor_options;
+ processor_options.max_tokens_per_document = 1000;
+ processor_options.token_limit_behavior =
+ IndexProcessor::Options::TokenLimitBehavior::kReturnError;
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index_processor_,
+ IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options,
+ &fake_clock_));
+
// This is the maximum token length that an empty lexicon constructed for a
// lite index with merge size of 1MiB can support.
constexpr int kMaxTokenLength = 16777217;
// Create a string "ppppppp..." with a length that is too large to fit into
// the lexicon.
std::string enormous_string(kMaxTokenLength + 1, 'p');
- DocumentProto document_one =
+ DocumentProto document =
DocumentBuilder()
.SetKey("icing", "fake_type/1")
.SetSchema(std::string(kFakeType))
@@ -709,10 +754,24 @@ TEST_F(IndexProcessorTest,
ICING_ASSERT_OK_AND_ASSIGN(
TokenizedDocument tokenized_document,
TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
- document_one));
+ document));
EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kExactSectionId})));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr,
+ index_->GetIterator("baz", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kPrefixedSectionId})));
}
TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) {
@@ -736,9 +795,15 @@ TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) {
ICING_ASSERT_OK_AND_ASSIGN(
index_, Index::Create(options, &filesystem_, &icing_filesystem_));
+ IndexProcessor::Options processor_options;
+ processor_options.max_tokens_per_document = 1000;
+ processor_options.token_limit_behavior =
+ IndexProcessor::Options::TokenLimitBehavior::kReturnError;
+
ICING_ASSERT_OK_AND_ASSIGN(
index_processor_,
- IndexProcessor::Create(normalizer_.get(), index_.get(), &fake_clock_));
+ IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options,
+ &fake_clock_));
DocumentId doc_id = 0;
  // We have determined experimentally that indexing 3373 documents with this
// will cause the LiteIndex to fill up. Further indexing will fail unless the
@@ -792,9 +857,15 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) {
index_,
Index::Create(options, &filesystem_, mock_icing_filesystem_.get()));
+ IndexProcessor::Options processor_options;
+ processor_options.max_tokens_per_document = 1000;
+ processor_options.token_limit_behavior =
+ IndexProcessor::Options::TokenLimitBehavior::kReturnError;
+
ICING_ASSERT_OK_AND_ASSIGN(
index_processor_,
- IndexProcessor::Create(normalizer_.get(), index_.get(), &fake_clock_));
+ IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options,
+ &fake_clock_));
// 3. Index one document. This should fit in the LiteIndex without requiring a
// merge.
@@ -816,95 +887,6 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) {
EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
}
-TEST_F(IndexProcessorTest, ExactVerbatimProperty) {
- DocumentProto document =
- DocumentBuilder()
- .SetKey("icing", "fake_type/1")
- .SetSchema(std::string(kFakeType))
- .AddStringProperty(std::string(kExactVerbatimProperty),
- "Hello, world!")
- .Build();
- ICING_ASSERT_OK_AND_ASSIGN(
- TokenizedDocument tokenized_document,
- TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
- document));
- EXPECT_THAT(tokenized_document.num_tokens(), 1);
-
- EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
- IsOk());
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("Hello, world!", kSectionIdMaskAll,
- TermMatchType::EXACT_ONLY));
- std::vector<DocHitInfo> hits = GetHits(std::move(itr));
- std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{
- {kExactVerbatimSectionId, 1}};
-
- EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
- kDocumentId0, expectedMap)));
-}
-
-TEST_F(IndexProcessorTest, PrefixVerbatimProperty) {
- DocumentProto document =
- DocumentBuilder()
- .SetKey("icing", "fake_type/1")
- .SetSchema(std::string(kFakeType))
- .AddStringProperty(std::string(kPrefixedVerbatimProperty),
- "Hello, world!")
- .Build();
- ICING_ASSERT_OK_AND_ASSIGN(
- TokenizedDocument tokenized_document,
- TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
- document));
- EXPECT_THAT(tokenized_document.num_tokens(), 1);
-
- EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
- IsOk());
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
-
- // We expect to match the document we indexed as "Hello, w" is a prefix
- // of "Hello, world!"
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("Hello, w", kSectionIdMaskAll,
- TermMatchType::PREFIX));
- std::vector<DocHitInfo> hits = GetHits(std::move(itr));
- std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{
- {kPrefixedVerbatimSectionId, 1}};
-
- EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
- kDocumentId0, expectedMap)));
-}
-
-TEST_F(IndexProcessorTest, VerbatimPropertyDoesntMatchSubToken) {
- DocumentProto document =
- DocumentBuilder()
- .SetKey("icing", "fake_type/1")
- .SetSchema(std::string(kFakeType))
- .AddStringProperty(std::string(kPrefixedVerbatimProperty),
- "Hello, world!")
- .Build();
- ICING_ASSERT_OK_AND_ASSIGN(
- TokenizedDocument tokenized_document,
- TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
- document));
- EXPECT_THAT(tokenized_document.num_tokens(), 1);
-
- EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
- IsOk());
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("world", kSectionIdMaskAll, TermMatchType::PREFIX));
- std::vector<DocHitInfo> hits = GetHits(std::move(itr));
-
- // We should not have hits for term "world" as the index processor should
- // create a sole token "Hello, world! for the document.
- EXPECT_THAT(hits, IsEmpty());
-}
-
} // namespace
} // namespace lib
diff --git a/icing/index/index.cc b/icing/index/index.cc
index 02ba699..db59ad2 100644
--- a/icing/index/index.cc
+++ b/icing/index/index.cc
@@ -36,7 +36,6 @@
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
-#include "icing/scoring/ranker.h"
#include "icing/store/document-id.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
@@ -71,25 +70,39 @@ IcingDynamicTrie::Options GetMainLexiconOptions() {
return IcingDynamicTrie::Options();
}
-enum class MergeAction { kTakeLiteTerm, kTakeMainTerm, kMergeTerms };
+// Helper function to check if a term is in the given namespaces.
+// TODO(tjbarron): Implement a method PropertyReadersAll.HasAnyProperty().
+bool IsTermInNamespaces(
+ const IcingDynamicTrie::PropertyReadersAll& property_reader,
+ uint32_t value_index, const std::vector<NamespaceId>& namespace_ids) {
+ if (namespace_ids.empty()) {
+ return true;
+ }
+ for (NamespaceId namespace_id : namespace_ids) {
+ if (property_reader.HasProperty(GetNamespacePropertyId(namespace_id),
+ value_index)) {
+ return true;
+ }
+ }
+
+ return false;
+}
-// Merge the TermMetadata from lite index and main index. If the term exists in
-// both index, sum up its hit count and push it to the term heap.
-// The heap is a min-heap. So that we can avoid some push operation but the time
-// complexity is O(NlgK) which N is total number of term and K is num_to_return.
-std::vector<TermMetadata> MergeAndRankTermMetadatas(
+enum class MergeAction { kTakeLiteTerm, kTakeMainTerm, kMergeTerms };
+std::vector<TermMetadata> MergeTermMetadatas(
std::vector<TermMetadata> lite_term_metadata_list,
std::vector<TermMetadata> main_term_metadata_list, int num_to_return) {
- std::vector<TermMetadata> merged_term_metadata_heap;
- merged_term_metadata_heap.reserve(
+ std::vector<TermMetadata> merged_term_metadata_list;
+ merged_term_metadata_list.reserve(
std::min(lite_term_metadata_list.size() + main_term_metadata_list.size(),
static_cast<size_t>(num_to_return)));
auto lite_term_itr = lite_term_metadata_list.begin();
auto main_term_itr = main_term_metadata_list.begin();
MergeAction merge_action;
- while (lite_term_itr != lite_term_metadata_list.end() ||
- main_term_itr != main_term_metadata_list.end()) {
+  while (merged_term_metadata_list.size() <
+             static_cast<size_t>(num_to_return) &&
+ (lite_term_itr != lite_term_metadata_list.end() ||
+ main_term_itr != main_term_metadata_list.end())) {
// Get pointers to the next metadatas in each group, if available
// Determine how to merge.
if (main_term_itr == main_term_metadata_list.end()) {
@@ -106,32 +119,23 @@ std::vector<TermMetadata> MergeAndRankTermMetadatas(
}
switch (merge_action) {
case MergeAction::kTakeLiteTerm:
- PushToTermHeap(std::move(*lite_term_itr), num_to_return,
- merged_term_metadata_heap);
+ merged_term_metadata_list.push_back(std::move(*lite_term_itr));
++lite_term_itr;
break;
case MergeAction::kTakeMainTerm:
- PushToTermHeap(std::move(*main_term_itr), num_to_return,
- merged_term_metadata_heap);
+ merged_term_metadata_list.push_back(std::move(*main_term_itr));
++main_term_itr;
break;
case MergeAction::kMergeTerms:
int total_est_hit_count =
lite_term_itr->hit_count + main_term_itr->hit_count;
- PushToTermHeap(TermMetadata(std::move(lite_term_itr->content),
- total_est_hit_count),
- num_to_return, merged_term_metadata_heap);
+ merged_term_metadata_list.emplace_back(
+ std::move(lite_term_itr->content), total_est_hit_count);
++lite_term_itr;
++main_term_itr;
break;
}
}
- // Reverse the list since we pop them from a min heap and we need to return in
- // decreasing order.
- std::vector<TermMetadata> merged_term_metadata_list =
- PopAllTermsFromHeap(merged_term_metadata_heap);
- std::reverse(merged_term_metadata_list.begin(),
- merged_term_metadata_list.end());
return merged_term_metadata_list;
}
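
The rewrite above swaps the heap-based ranked merge (O(N log K) via PushToTermHeap plus a final reversal) for a bounded in-order merge that walks both lists with two iterators and stops once num_to_return entries have been collected; this also explains the new documentation in index.h below stating that results are no longer ranked by hit count. Here is a self-contained sketch of the same two-pointer pattern, assuming both inputs arrive in the same lexicographic order that trie iteration produces, and using plain pairs in place of TermMetadata:

    #include <string>
    #include <utility>
    #include <vector>

    // Illustrative two-pointer merge over two term lists assumed to share
    // the same (e.g. lexicographic) order; mirrors MergeTermMetadatas above
    // with std::pair<term, hit_count> standing in for TermMetadata.
    std::vector<std::pair<std::string, int>> MergeBounded(
        const std::vector<std::pair<std::string, int>>& lite,
        const std::vector<std::pair<std::string, int>>& main_list,
        size_t num_to_return) {
      std::vector<std::pair<std::string, int>> merged;
      size_t i = 0, j = 0;
      while (merged.size() < num_to_return &&
             (i < lite.size() || j < main_list.size())) {
        if (j == main_list.size() ||
            (i < lite.size() && lite[i].first < main_list[j].first)) {
          merged.push_back(lite[i++]);       // Lite-only term.
        } else if (i == lite.size() || main_list[j].first < lite[i].first) {
          merged.push_back(main_list[j++]);  // Main-only term.
        } else {
          // Same term in both indices: sum the estimated hit counts.
          merged.emplace_back(lite[i].first,
                              lite[i].second + main_list[j].second);
          ++i;
          ++j;
        }
      }
      return merged;
    }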
@@ -210,56 +214,77 @@ Index::GetIterator(const std::string& term, SectionIdMask section_id_mask,
libtextclassifier3::StatusOr<std::vector<TermMetadata>>
Index::FindLiteTermsByPrefix(const std::string& prefix,
- const NamespaceChecker* namespace_checker) {
+ const std::vector<NamespaceId>& namespace_ids,
+ int num_to_return) {
// Finds all the terms that start with the given prefix in the lexicon.
IcingDynamicTrie::Iterator term_iterator(lite_index_->lexicon(),
prefix.c_str());
+ // A property reader to help check if a term has some property.
+ IcingDynamicTrie::PropertyReadersAll property_reader(lite_index_->lexicon());
+
std::vector<TermMetadata> term_metadata_list;
- while (term_iterator.IsValid()) {
+  while (term_iterator.IsValid() &&
+         term_metadata_list.size() < static_cast<size_t>(num_to_return)) {
uint32_t term_value_index = term_iterator.GetValueIndex();
+ // Skips the terms that don't exist in the given namespaces. We won't skip
+ // any terms if namespace_ids is empty.
+ if (!IsTermInNamespaces(property_reader, term_value_index, namespace_ids)) {
+ term_iterator.Advance();
+ continue;
+ }
+
ICING_ASSIGN_OR_RETURN(
uint32_t term_id,
term_id_codec_->EncodeTvi(term_value_index, TviType::LITE),
absl_ports::InternalError("Failed to access terms in lexicon."));
- ICING_ASSIGN_OR_RETURN(int hit_count,
- lite_index_->CountHits(term_id, namespace_checker));
- if (hit_count > 0) {
- // There is at least one document in the given namespace has this term.
- term_metadata_list.push_back(
- TermMetadata(term_iterator.GetKey(), hit_count));
- }
+
+ term_metadata_list.emplace_back(term_iterator.GetKey(),
+ lite_index_->CountHits(term_id));
term_iterator.Advance();
}
+ if (term_iterator.IsValid()) {
+ // We exited the loop above because we hit the num_to_return limit.
+ ICING_LOG(WARNING) << "Ran into limit of " << num_to_return
+ << " retrieving suggestions for " << prefix
+ << ". Some suggestions may not be returned and others "
+ "may be misranked.";
+ }
return term_metadata_list;
}
libtextclassifier3::StatusOr<std::vector<TermMetadata>>
-Index::FindTermsByPrefix(const std::string& prefix, int num_to_return,
- TermMatchType::Code term_match_type,
- const NamespaceChecker* namespace_checker) {
+Index::FindTermsByPrefix(const std::string& prefix,
+ const std::vector<NamespaceId>& namespace_ids,
+ int num_to_return) {
std::vector<TermMetadata> term_metadata_list;
if (num_to_return <= 0) {
return term_metadata_list;
}
+
// Get results from the LiteIndex.
- ICING_ASSIGN_OR_RETURN(std::vector<TermMetadata> lite_term_metadata_list,
- FindLiteTermsByPrefix(prefix, namespace_checker));
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<TermMetadata> lite_term_metadata_list,
+ FindLiteTermsByPrefix(prefix, namespace_ids, num_to_return));
+
// Append results from the MainIndex.
- ICING_ASSIGN_OR_RETURN(std::vector<TermMetadata> main_term_metadata_list,
- main_index_->FindTermsByPrefix(prefix, term_match_type,
- namespace_checker));
- return MergeAndRankTermMetadatas(std::move(lite_term_metadata_list),
- std::move(main_term_metadata_list),
- num_to_return);
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<TermMetadata> main_term_metadata_list,
+ main_index_->FindTermsByPrefix(prefix, namespace_ids, num_to_return));
+
+ return MergeTermMetadatas(std::move(lite_term_metadata_list),
+ std::move(main_term_metadata_list), num_to_return);
}
IndexStorageInfoProto Index::GetStorageInfo() const {
IndexStorageInfoProto storage_info;
int64_t directory_size = filesystem_->GetDiskUsage(options_.base_dir.c_str());
- storage_info.set_index_size(Filesystem::SanitizeFileSize(directory_size));
+ if (directory_size != Filesystem::kBadFileSize) {
+ storage_info.set_index_size(directory_size);
+ } else {
+ storage_info.set_index_size(-1);
+ }
storage_info = lite_index_->GetStorageInfo(std::move(storage_info));
return main_index_->GetStorageInfo(std::move(storage_info));
}
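
This hunk (and the matching one in lite-index.cc further down) inlines what the removed Filesystem::SanitizeFileSize call used to do: a size query that fails yields a sentinel, and the storage proto records -1 so readers can tell "unknown" apart from a real size. A minimal sketch of that helper follows; the sentinel value is an assumption here, the real constant being Filesystem::kBadFileSize in icing/file/filesystem.h.

    #include <cstdint>
    #include <limits>

    // Assumed sentinel for a failed size query (stand-in for
    // Filesystem::kBadFileSize).
    constexpr int64_t kBadFileSize = std::numeric_limits<int64_t>::max();

    // Maps a failed size query to -1 for storage-info protos, mirroring the
    // if/else blocks added in this patch.
    int64_t SanitizeFileSize(int64_t size) {
      return size != kBadFileSize ? size : -1;
    }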
diff --git a/icing/index/index.h b/icing/index/index.h
index 5c53349..eab5be8 100644
--- a/icing/index/index.h
+++ b/icing/index/index.h
@@ -32,12 +32,10 @@
#include "icing/index/term-id-codec.h"
#include "icing/index/term-metadata.h"
#include "icing/legacy/index/icing-filesystem.h"
-#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
-#include "icing/store/namespace-checker.h"
#include "icing/store/namespace-id.h"
#include "icing/util/crc32.h"
@@ -144,14 +142,9 @@ class Index {
// index.
// verbosity > 0, more detailed debug information including raw postings
// lists.
- IndexDebugInfoProto GetDebugInfo(int verbosity) const {
- IndexDebugInfoProto debug_info;
- *debug_info.mutable_index_storage_info() = GetStorageInfo();
- *debug_info.mutable_lite_index_info() =
- lite_index_->GetDebugInfo(verbosity);
- *debug_info.mutable_main_index_info() =
- main_index_->GetDebugInfo(verbosity);
- return debug_info;
+ void GetDebugInfo(int verbosity, std::string* out) const {
+ lite_index_->GetDebugInfo(verbosity, out);
+ main_index_->GetDebugInfo(verbosity, out);
}
  // Returns the byte size of all the elements held in the index. This
@@ -188,17 +181,17 @@ class Index {
TermMatchType::Code term_match_type);
// Finds terms with the given prefix in the given namespaces. If
- // 'namespace_ids' is empty, returns results from all the namespaces. Results
- // are sorted in decreasing order of hit count. Number of results are no more
- // than 'num_to_return'.
+ // 'namespace_ids' is empty, returns results from all the namespaces. The
+ // input prefix must be normalized, otherwise inaccurate results may be
+  // returned. Results are not ranked and are returned in their original
+  // order. The number of results is no more than 'num_to_return'.
//
// Returns:
// A list of TermMetadata on success
// INTERNAL_ERROR if failed to access term data.
libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix(
- const std::string& prefix, int num_to_return,
- TermMatchType::Code term_match_type,
- const NamespaceChecker* namespace_checker);
+ const std::string& prefix, const std::vector<NamespaceId>& namespace_ids,
+ int num_to_return);
// A class that can be used to add hits to the index.
//
@@ -274,7 +267,8 @@ class Index {
filesystem_(filesystem) {}
libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindLiteTermsByPrefix(
- const std::string& prefix, const NamespaceChecker* namespace_checker);
+ const std::string& prefix, const std::vector<NamespaceId>& namespace_ids,
+ int num_to_return);
std::unique_ptr<LiteIndex> lite_index_;
std::unique_ptr<MainIndex> main_index_;
diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc
index 8355c01..16593ef 100644
--- a/icing/index/index_test.cc
+++ b/icing/index/index_test.cc
@@ -31,12 +31,10 @@
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-mock-filesystem.h"
-#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
-#include "icing/testing/always-true-namespace-checker-impl.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/random-string.h"
#include "icing/testing/tmp-directory.h"
@@ -90,10 +88,18 @@ constexpr DocumentId kDocumentId4 = 4;
constexpr DocumentId kDocumentId5 = 5;
constexpr DocumentId kDocumentId6 = 6;
constexpr DocumentId kDocumentId7 = 7;
-constexpr DocumentId kDocumentId8 = 8;
constexpr SectionId kSectionId2 = 2;
constexpr SectionId kSectionId3 = 3;
+// The value returned by IndexBlock::ApproximateFullPostingListHitsForBlock(
+// GetBlockSize(),
+// GetPostingListIndexBits(posting_list_utils::min_posting_list_size()));
+constexpr int kMinSizePlApproxHits = 3;
+// The value returned by IndexBlock::ApproximateFullPostingListHitsForBlock(
+// GetBlockSize(),
+// GetPostingListIndexBits(2 * posting_list_utils::min_posting_list_size()));
+constexpr int kSecondSmallestPlApproxHits = 7;
+
std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
std::vector<DocHitInfo> infos;
while (iterator->Advance().ok()) {
@@ -909,306 +915,217 @@ TEST_F(IndexTest, InvalidHitBufferSize) {
TEST_F(IndexTest, FindTermByPrefixShouldReturnEmpty) {
Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
TermMatchType::PREFIX, /*namespace_id=*/0);
- AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*num_to_return=*/0,
- TermMatchType::PREFIX, &impl),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0},
+ /*num_to_return=*/0),
IsOkAndHolds(IsEmpty()));
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo",
- /*num_to_return=*/-1,
- TermMatchType::PREFIX, &impl),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0},
+ /*num_to_return=*/-1),
IsOkAndHolds(IsEmpty()));
ICING_ASSERT_OK(index_->Merge());
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo",
- /*num_to_return=*/0,
- TermMatchType::PREFIX, &impl),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0},
+ /*num_to_return=*/0),
IsOkAndHolds(IsEmpty()));
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo",
- /*num_to_return=*/-1,
- TermMatchType::PREFIX, &impl),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0},
+ /*num_to_return=*/-1),
IsOkAndHolds(IsEmpty()));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectResult) {
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
- AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
// "b" should only match "bar" but not "foo".
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b",
- /*num_to_return=*/10,
- TermMatchType::PREFIX, &impl),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", /*namespace_ids=*/{0},
+ /*num_to_return=*/10),
IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("bar", 1))));
ICING_ASSERT_OK(index_->Merge());
// "b" should only match "bar" but not "foo".
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b",
- /*num_to_return=*/10,
- TermMatchType::PREFIX, &impl),
- IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("bar", 1))));
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", /*namespace_ids=*/{0},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(
+ EqualsTermMetadata("bar", kMinSizePlApproxHits))));
}
TEST_F(IndexTest, FindTermByPrefixShouldRespectNumToReturn) {
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
- AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit.BufferTerm("fo"), IsOk());
EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
// We have 3 results but only 2 should be returned.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
- /*num_to_return=*/2,
- TermMatchType::PREFIX, &impl),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
+ /*num_to_return=*/2),
IsOkAndHolds(SizeIs(2)));
ICING_ASSERT_OK(index_->Merge());
// We have 3 results but only 2 should be returned.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
- /*num_to_return=*/2,
- TermMatchType::PREFIX, &impl),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
+ /*num_to_return=*/2),
IsOkAndHolds(SizeIs(2)));
}
-TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) {
+TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInOneNamespace) {
Index::Editor edit1 =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit1.BufferTerm("fo"), IsOk());
+ EXPECT_THAT(edit1.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
Index::Editor edit2 =
index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/1);
- EXPECT_THAT(edit2.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit2.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk());
- Index::Editor edit3 =
- index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/2);
- EXPECT_THAT(edit3.BufferTerm("fool"), IsOk());
- EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk());
-
- // Should return "fo", "foo" and "fool" across all namespaces.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
- /*num_to_return=*/10,
- TermMatchType::PREFIX, &impl),
- IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1),
- EqualsTermMetadata("fool", 1))));
+  // The namespace with id 0 has 2 results.
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1),
+ EqualsTermMetadata("foo", 1))));
+  // The namespace with id 1 has 1 result.
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fool", 1))));
ICING_ASSERT_OK(index_->Merge());
- // Should return "fo", "foo" and "fool" across all namespaces.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
- /*num_to_return=*/10,
- TermMatchType::PREFIX, &impl),
+  // The namespace with id 0 has 2 results.
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
+ /*num_to_return=*/10),
IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1),
- EqualsTermMetadata("fool", 1))));
+ EqualsTermMetadata("fo", kMinSizePlApproxHits),
+ EqualsTermMetadata("foo", kMinSizePlApproxHits))));
+  // The namespace with id 1 has 1 result.
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(
+ EqualsTermMetadata("fool", kMinSizePlApproxHits))));
}
-TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) {
+TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInMultipleNamespaces) {
Index::Editor edit1 =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- AlwaysTrueNamespaceCheckerImpl impl;
- EXPECT_THAT(edit1.BufferTerm("foo"), IsOk());
- EXPECT_THAT(edit1.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit1.BufferTerm("fo"), IsOk());
EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
Index::Editor edit2 =
index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/0);
- EXPECT_THAT(edit2.BufferTerm("fool"), IsOk());
+ /*namespace_id=*/1);
+ EXPECT_THAT(edit2.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk());
- // 'foo' has 1 hit, 'fool' has 2 hits.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
- /*num_to_return=*/10,
- TermMatchType::PREFIX, &impl),
- IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2),
- EqualsTermMetadata("foo", 1))));
+ Index::Editor edit3 =
+ index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/2);
+ EXPECT_THAT(edit3.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk());
+
+ // Should return "foo" and "fool" which are in namespaces with ids 1 and 2.
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1, 2},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 1))));
ICING_ASSERT_OK(index_->Merge());
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
- /*num_to_return=*/10,
- TermMatchType::PREFIX, &impl),
- IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2),
- EqualsTermMetadata("foo", 1))));
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1, 2},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(
+ EqualsTermMetadata("foo", kMinSizePlApproxHits),
+ EqualsTermMetadata("fool", kMinSizePlApproxHits))));
}
-TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) {
- // Push 6 term-six, 5 term-five, 4 term-four, 3 term-three, 2 term-two and one
- // term-one into lite index.
+TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) {
Index::Editor edit1 =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- AlwaysTrueNamespaceCheckerImpl impl;
- EXPECT_THAT(edit1.BufferTerm("term-one"), IsOk());
- EXPECT_THAT(edit1.BufferTerm("term-two"), IsOk());
- EXPECT_THAT(edit1.BufferTerm("term-three"), IsOk());
- EXPECT_THAT(edit1.BufferTerm("term-four"), IsOk());
- EXPECT_THAT(edit1.BufferTerm("term-five"), IsOk());
- EXPECT_THAT(edit1.BufferTerm("term-six"), IsOk());
+ EXPECT_THAT(edit1.BufferTerm("fo"), IsOk());
EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
Index::Editor edit2 =
- index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/0);
- EXPECT_THAT(edit2.BufferTerm("term-two"), IsOk());
- EXPECT_THAT(edit2.BufferTerm("term-three"), IsOk());
- EXPECT_THAT(edit2.BufferTerm("term-four"), IsOk());
- EXPECT_THAT(edit2.BufferTerm("term-five"), IsOk());
- EXPECT_THAT(edit2.BufferTerm("term-six"), IsOk());
+ index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/1);
+ EXPECT_THAT(edit2.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk());
Index::Editor edit3 =
- index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/0);
- EXPECT_THAT(edit3.BufferTerm("term-three"), IsOk());
- EXPECT_THAT(edit3.BufferTerm("term-four"), IsOk());
- EXPECT_THAT(edit3.BufferTerm("term-five"), IsOk());
- EXPECT_THAT(edit3.BufferTerm("term-six"), IsOk());
+ index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/2);
+ EXPECT_THAT(edit3.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk());
- Index::Editor edit4 =
- index_->Edit(kDocumentId4, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/0);
- EXPECT_THAT(edit4.BufferTerm("term-four"), IsOk());
- EXPECT_THAT(edit4.BufferTerm("term-five"), IsOk());
- EXPECT_THAT(edit4.BufferTerm("term-six"), IsOk());
- EXPECT_THAT(edit4.IndexAllBufferedTerms(), IsOk());
-
- Index::Editor edit5 =
- index_->Edit(kDocumentId5, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/0);
- EXPECT_THAT(edit5.BufferTerm("term-five"), IsOk());
- EXPECT_THAT(edit5.BufferTerm("term-six"), IsOk());
- EXPECT_THAT(edit5.IndexAllBufferedTerms(), IsOk());
-
- Index::Editor edit6 =
- index_->Edit(kDocumentId6, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/0);
- EXPECT_THAT(edit6.BufferTerm("term-six"), IsOk());
- EXPECT_THAT(edit6.IndexAllBufferedTerms(), IsOk());
-
- // verify the order in lite index is correct.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t",
- /*num_to_return=*/10,
- TermMatchType::PREFIX, &impl),
- IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-six", 6),
- EqualsTermMetadata("term-five", 5),
- EqualsTermMetadata("term-four", 4),
- EqualsTermMetadata("term-three", 3),
- EqualsTermMetadata("term-two", 2),
- EqualsTermMetadata("term-one", 1))));
+ // Should return "fo", "foo" and "fool" across all namespaces.
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(
+ EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 1))));
ICING_ASSERT_OK(index_->Merge());
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t",
- /*num_to_return=*/10,
- TermMatchType::PREFIX, &impl),
- IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-six", 6),
- EqualsTermMetadata("term-five", 5),
- EqualsTermMetadata("term-four", 4),
- EqualsTermMetadata("term-three", 3),
- EqualsTermMetadata("term-two", 2),
- EqualsTermMetadata("term-one", 1))));
-
- // keep push terms to the lite index. We will add 2 document to term-five,
- // term-three and term-one. The output order should be 5-6-3-4-1-2.
- Index::Editor edit7 =
- index_->Edit(kDocumentId7, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/0);
- EXPECT_THAT(edit7.BufferTerm("term-one"), IsOk());
- EXPECT_THAT(edit7.BufferTerm("term-three"), IsOk());
- EXPECT_THAT(edit7.BufferTerm("term-five"), IsOk());
- EXPECT_THAT(edit7.IndexAllBufferedTerms(), IsOk());
-
- Index::Editor edit8 =
- index_->Edit(kDocumentId8, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/0);
- EXPECT_THAT(edit8.BufferTerm("term-one"), IsOk());
- EXPECT_THAT(edit8.BufferTerm("term-three"), IsOk());
- EXPECT_THAT(edit8.BufferTerm("term-five"), IsOk());
- EXPECT_THAT(edit8.IndexAllBufferedTerms(), IsOk());
-
- // verify the combination of lite index and main index is in correct order.
- EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"t", /*num_to_return=*/10,
- TermMatchType::PREFIX, &impl),
- IsOkAndHolds(ElementsAre(
- EqualsTermMetadata("term-five", 7), EqualsTermMetadata("term-six", 6),
- EqualsTermMetadata("term-three", 5),
- EqualsTermMetadata("term-four", 4), EqualsTermMetadata("term-one", 3),
- EqualsTermMetadata("term-two", 2))));
-
- // Get the first three terms.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t",
- /*num_to_return=*/3,
- TermMatchType::PREFIX, &impl),
- IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-five", 7),
- EqualsTermMetadata("term-six", 6),
- EqualsTermMetadata("term-three", 5))));
+ // Should return "fo", "foo" and "fool" across all namespaces.
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(
+ EqualsTermMetadata("fo", kMinSizePlApproxHits),
+ EqualsTermMetadata("foo", kMinSizePlApproxHits),
+ EqualsTermMetadata("fool", kMinSizePlApproxHits))));
}
-TEST_F(IndexTest, FindTermByPrefix_InTermMatchTypePrefix_ShouldReturnInOrder) {
+TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) {
Index::Editor edit1 =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX,
+ index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- AlwaysTrueNamespaceCheckerImpl impl;
- EXPECT_THAT(edit1.BufferTerm("fo"), IsOk());
+ EXPECT_THAT(edit1.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit1.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
Index::Editor edit2 =
- index_->Edit(kDocumentId2, kSectionId2, TermMatchType::PREFIX,
+ index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- EXPECT_THAT(edit2.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit2.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk());
- Index::Editor edit3 =
- index_->Edit(kDocumentId3, kSectionId2, TermMatchType::PREFIX,
- /*namespace_id=*/0);
- EXPECT_THAT(edit3.BufferTerm("fool"), IsOk());
- EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk());
+ // 'foo' has 1 hit, 'fool' has 2 hits.
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 2))));
ICING_ASSERT_OK(index_->Merge());
- // verify the order in pls is correct
- // "fo" { {doc0, exact_hit}, {doc1, prefix_hit}, {doc2, prefix_hit} }
- // "foo" { {doc1, exact_hit}, {doc2, prefix_hit} }
- // "fool" { {doc2, exact_hit} }
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
- /*num_to_return=*/10,
- TermMatchType::PREFIX, &impl),
- IsOkAndHolds(ElementsAre(EqualsTermMetadata("fo", 3),
- EqualsTermMetadata("foo", 2),
- EqualsTermMetadata("fool", 1))));
- // Find by exact only, all terms should be equally.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10,
- TermMatchType::EXACT_ONLY, &impl),
+
+ // foo's one hit should fit on a min-sized pl, fool's two hits should also fit
+ // on a min-sized pl.
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
+ /*num_to_return=*/10),
IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1),
- EqualsTermMetadata("fool", 1))));
+ EqualsTermMetadata("foo", kMinSizePlApproxHits),
+ EqualsTermMetadata("fool", kMinSizePlApproxHits))));
}
-TEST_F(IndexTest, FindTermByPrefixShouldReturnHitCountForMain) {
+TEST_F(IndexTest, FindTermByPrefixShouldReturnApproximateHitCountForMain) {
Index::Editor edit =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
@@ -1243,26 +1160,26 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnHitCountForMain) {
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
// 'foo' has 1 hit, 'fool' has 8 hits.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
- /*num_to_return=*/10,
- TermMatchType::PREFIX, &impl),
- IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 8),
- EqualsTermMetadata("foo", 1))));
-
- ICING_ASSERT_OK(index_->Merge());
-
EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10,
- TermMatchType::PREFIX, &impl),
+ index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
+ /*num_to_return=*/10),
IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
EqualsTermMetadata("fool", 8))));
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ // foo's hits should fit on a single pl. fool's hits will need two pls.
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(
+ EqualsTermMetadata("foo", kMinSizePlApproxHits),
+ EqualsTermMetadata("fool", kSecondSmallestPlApproxHits))));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnCombinedHitCount) {
Index::Editor edit =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
@@ -1274,18 +1191,19 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnCombinedHitCount) {
EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10,
- TermMatchType::PREFIX, &impl),
- IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2),
- EqualsTermMetadata("foo", 1))));
+ // 'foo' has 1 hit in the main index, 'fool' has 1 hit in the main index and
+ // 1 hit in the lite index.
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(
+ EqualsTermMetadata("foo", kMinSizePlApproxHits),
+ EqualsTermMetadata("fool", kMinSizePlApproxHits + 1))));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsFromBothIndices) {
Index::Editor edit =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- AlwaysTrueNamespaceCheckerImpl impl;
-
EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
@@ -1297,11 +1215,11 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsFromBothIndices) {
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
// 'foo' has 1 hit in the main index, 'fool' has 1 hit in the lite index.
- EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10,
- TermMatchType::PREFIX, &impl),
- IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
- EqualsTermMetadata("fool", 1))));
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(
+ EqualsTermMetadata("foo", kMinSizePlApproxHits),
+ EqualsTermMetadata("fool", 1))));
}
TEST_F(IndexTest, GetElementsSize) {
@@ -1395,14 +1313,12 @@ TEST_F(IndexTest, GetDebugInfo) {
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX,
/*namespace_id=*/0);
- index_->set_last_added_document_id(kDocumentId1);
ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
ICING_ASSERT_OK(index_->Merge());
edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- index_->set_last_added_document_id(kDocumentId2);
ASSERT_THAT(edit.BufferTerm("footer"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX,
@@ -1410,45 +1326,40 @@ TEST_F(IndexTest, GetDebugInfo) {
ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- IndexDebugInfoProto out0 = index_->GetDebugInfo(/*verbosity=*/0);
- EXPECT_FALSE(out0.main_index_info().has_flash_index_storage_info());
- EXPECT_THAT(out0.main_index_info().last_added_document_id(),
- Eq(kDocumentId1));
- EXPECT_THAT(out0.lite_index_info().curr_size(), Eq(2));
- EXPECT_THAT(out0.lite_index_info().last_added_document_id(),
- Eq(kDocumentId2));
+ std::string out0;
+ index_->GetDebugInfo(/*verbosity=*/0, &out0);
+ EXPECT_THAT(out0, Not(IsEmpty()));
- IndexDebugInfoProto out1 = index_->GetDebugInfo(/*verbosity=*/1);
- EXPECT_THAT(out1.main_index_info().flash_index_storage_info(),
- Not(IsEmpty()));
+ std::string out1;
+ index_->GetDebugInfo(/*verbosity=*/1, &out1);
+ EXPECT_THAT(out1, SizeIs(Gt(out0.size())));
// Add one more doc to the lite index. Debug strings should change.
edit = index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- index_->set_last_added_document_id(kDocumentId3);
ASSERT_THAT(edit.BufferTerm("far"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- IndexDebugInfoProto out2 = index_->GetDebugInfo(/*verbosity=*/0);
- EXPECT_THAT(out2.lite_index_info().curr_size(), Eq(3));
- EXPECT_THAT(out2.lite_index_info().last_added_document_id(),
- Eq(kDocumentId3));
+ std::string out2;
+ index_->GetDebugInfo(/*verbosity=*/0, &out2);
+ EXPECT_THAT(out2, Ne(out0));
+
+ std::string out3;
+ index_->GetDebugInfo(/*verbosity=*/1, &out3);
+ EXPECT_THAT(out3, Ne(out1));
  // Merge into the main index. Debug strings should change again.
ICING_ASSERT_OK(index_->Merge());
- IndexDebugInfoProto out3 = index_->GetDebugInfo(/*verbosity=*/0);
- EXPECT_TRUE(out3.has_index_storage_info());
- EXPECT_THAT(out3.main_index_info().lexicon_info(), Not(IsEmpty()));
- EXPECT_THAT(out3.main_index_info().last_added_document_id(),
- Eq(kDocumentId3));
- EXPECT_THAT(out3.lite_index_info().curr_size(), Eq(0));
- EXPECT_THAT(out3.lite_index_info().hit_buffer_size(), Gt(0));
- EXPECT_THAT(out3.lite_index_info().last_added_document_id(),
- Eq(kInvalidDocumentId));
- EXPECT_THAT(out3.lite_index_info().searchable_end(), Eq(0));
- EXPECT_THAT(out3.lite_index_info().index_crc(), Gt(0));
- EXPECT_THAT(out3.lite_index_info().lexicon_info(), Not(IsEmpty()));
+ std::string out4;
+ index_->GetDebugInfo(/*verbosity=*/0, &out4);
+ EXPECT_THAT(out4, Ne(out0));
+ EXPECT_THAT(out4, Ne(out2));
+
+ std::string out5;
+ index_->GetDebugInfo(/*verbosity=*/1, &out5);
+ EXPECT_THAT(out5, Ne(out1));
+ EXPECT_THAT(out5, Ne(out3));
}
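
The reworked test reflects the API change in index.h above: GetDebugInfo no longer returns a structured IndexDebugInfoProto, it appends human-readable text to a caller-owned string. A minimal usage sketch under that signature, with `index` assumed to be a fully initialized icing::lib::Index:

    #include <string>

    // Usage sketch for the string-based debug API introduced above.
    std::string DumpIndexDebugInfo(const Index& index) {
      std::string brief;
      index.GetDebugInfo(/*verbosity=*/0, &brief);  // Sizes only.

      std::string detailed;
      index.GetDebugInfo(/*verbosity=*/1, &detailed);  // Adds lexicon stats.

      // Higher verbosity only appends more detail, so the detailed dump
      // should never be shorter than the brief one; the reworked test above
      // checks exactly that with SizeIs(Gt(out0.size())).
      return brief + "\n" + detailed;
    }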
TEST_F(IndexTest, BackfillingMultipleTermsSucceeds) {
diff --git a/icing/index/iterator/doc-hit-info-iterator-and.cc b/icing/index/iterator/doc-hit-info-iterator-and.cc
index 543e9ef..66f87bd 100644
--- a/icing/index/iterator/doc-hit-info-iterator-and.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-and.cc
@@ -14,7 +14,8 @@
#include "icing/index/iterator/doc-hit-info-iterator-and.h"
-#include <cstddef>
+#include <stddef.h>
+
#include <cstdint>
#include <memory>
#include <string>
@@ -161,7 +162,6 @@ libtextclassifier3::Status DocHitInfoIteratorAndNary::Advance() {
DocumentId unused;
ICING_ASSIGN_OR_RETURN(
unused, AdvanceTo(iterator.get(), potential_document_id));
- (void)unused; // Silence unused warning.
}
if (iterator->doc_hit_info().document_id() == potential_document_id) {
diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
index 7c6d924..43a846b 100644
--- a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
@@ -48,13 +48,13 @@ using ::testing::ElementsAreArray;
using ::testing::Eq;
using ::testing::IsEmpty;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto_Cardinality_Code_OPTIONAL;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
+ StringIndexingConfig_TokenizerType_Code_PLAIN;
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
+constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test {
protected:
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
index f215d63..d535d7f 100644
--- a/icing/index/lite/doc-hit-info-iterator-term-lite.cc
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
@@ -45,13 +45,8 @@ libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() {
if (cached_hits_idx_ == -1) {
libtextclassifier3::Status status = RetrieveMoreHits();
if (!status.ok()) {
- if (!absl_ports::IsNotFound(status)) {
- // NOT_FOUND is expected to happen (not every term will be in the main
- // index!). Other errors are worth logging.
- ICING_LOG(ERROR)
- << "Encountered unexpected failure while retrieving hits "
- << status.error_message();
- }
+ ICING_LOG(ERROR) << "Failed to retrieve more hits "
+ << status.error_message();
return absl_ports::ResourceExhaustedError(
"No more DocHitInfos in iterator");
}
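
With the NOT_FOUND special case gone, every RetrieveMoreHits() failure is now logged before the iterator reports exhaustion. The surrounding Advance() follows a lazy-batch shape worth calling out: hits are fetched into cached_hits_ on the first call, and later calls just step through the cache. A simplified, hypothetical illustration of that shape (not the real DocHitInfoIterator interface):

    #include <utility>
    #include <vector>

    // Simplified lazy-batch iterator: the batch is fetched on the first
    // Advance(); subsequent calls walk the cache.
    class BatchIterator {
     public:
      explicit BatchIterator(std::vector<int> source)
          : source_(std::move(source)) {}

      bool Advance() {
        if (idx_ == -1) {
          cached_ = std::move(source_);  // Stand-in for RetrieveMoreHits().
          idx_ = 0;
        } else {
          ++idx_;
        }
        return idx_ < static_cast<int>(cached_.size());
      }

      int current() const { return cached_[idx_]; }

     private:
      std::vector<int> source_;
      std::vector<int> cached_;
      int idx_ = -1;
    };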
@@ -77,8 +72,7 @@ libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() {
ICING_ASSIGN_OR_RETURN(uint32_t term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
lite_index_->AppendHits(term_id, section_restrict_mask_,
- /*only_from_prefix_sections=*/false,
- /*namespace_checker=*/nullptr, &cached_hits_);
+ /*only_from_prefix_sections=*/false, &cached_hits_);
cached_hits_idx_ = 0;
return libtextclassifier3::Status::OK;
}
@@ -101,7 +95,7 @@ DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() {
term_id_codec_->EncodeTvi(it.GetValueIndex(), TviType::LITE));
lite_index_->AppendHits(term_id, section_restrict_mask_,
/*only_from_prefix_sections=*/!exact_match,
- /*namespace_checker=*/nullptr, &cached_hits_);
+ &cached_hits_);
++terms_matched;
}
if (terms_matched > 1) {
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.h b/icing/index/lite/doc-hit-info-iterator-term-lite.h
index 179fc93..8dbe043 100644
--- a/icing/index/lite/doc-hit-info-iterator-term-lite.h
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.h
@@ -82,11 +82,6 @@ class DocHitInfoIteratorTermLite : public DocHitInfoIterator {
protected:
// Add DocHitInfos corresponding to term_ to cached_hits_.
- //
- // Returns:
- // - OK, on success
- // - NOT_FOUND if no term matching term_ was found in the lexicon.
- // - INVALID_ARGUMENT if unable to properly encode the termid
virtual libtextclassifier3::Status RetrieveMoreHits() = 0;
const std::string term_;
diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc
index a5c6baf..fb23934 100644
--- a/icing/index/lite/lite-index.cc
+++ b/icing/index/lite/lite-index.cc
@@ -14,11 +14,12 @@
#include "icing/index/lite/lite-index.h"
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
#include <sys/mman.h>
#include <algorithm>
-#include <cinttypes>
-#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
@@ -336,12 +337,9 @@ libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId(
int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
bool only_from_prefix_sections,
- const NamespaceChecker* namespace_checker,
std::vector<DocHitInfo>* hits_out) {
int count = 0;
DocumentId last_document_id = kInvalidDocumentId;
- // Record whether the last document belongs to the given namespaces.
- bool last_document_in_namespace = false;
for (uint32_t idx = Seek(term_id); idx < header_->cur_size(); idx++) {
TermIdHitPair term_id_hit_pair(
hit_buffer_.array_cast<TermIdHitPair>()[idx]);
@@ -358,31 +356,22 @@ int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
}
DocumentId document_id = hit.document_id();
if (document_id != last_document_id) {
- last_document_id = document_id;
- last_document_in_namespace =
- namespace_checker == nullptr ||
- namespace_checker->BelongsToTargetNamespaces(document_id);
- if (!last_document_in_namespace) {
- // The document is removed or expired or not belongs to target
- // namespaces.
- continue;
- }
++count;
if (hits_out != nullptr) {
hits_out->push_back(DocHitInfo(document_id));
}
+ last_document_id = document_id;
}
- if (hits_out != nullptr && last_document_in_namespace) {
+ if (hits_out != nullptr) {
hits_out->back().UpdateSection(hit.section_id(), hit.term_frequency());
}
}
return count;
}
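
After the namespace checker is dropped, AppendHits keeps its other invariant: hits for one document are adjacent in the buffer, so a new DocHitInfo is pushed only when the document id changes, and every hit then folds its section into that entry via UpdateSection. A simplified sketch of that grouping, with stand-in types for Hit and DocHitInfo (section ids are assumed to fit in a 64-bit mask):

    #include <cstdint>
    #include <vector>

    struct SimpleHit {       // Stand-in for Hit.
      uint32_t document_id;
      uint32_t section_id;
    };

    struct DocEntry {        // Stand-in for DocHitInfo.
      uint32_t document_id;
      uint64_t section_mask = 0;
    };

    // Emits one entry per document, assuming all hits of a document are
    // adjacent in the input, as they are in the hit buffer above.
    std::vector<DocEntry> GroupHitsByDocument(
        const std::vector<SimpleHit>& hits) {
      std::vector<DocEntry> out;
      constexpr uint32_t kInvalidDocumentId = ~0u;  // Stand-in sentinel.
      uint32_t last_document_id = kInvalidDocumentId;
      for (const SimpleHit& hit : hits) {
        if (hit.document_id != last_document_id) {
          out.push_back(DocEntry{hit.document_id});  // New document entry.
          last_document_id = hit.document_id;
        }
        // Equivalent of UpdateSection(): fold this hit's section in.
        out.back().section_mask |= (uint64_t{1} << hit.section_id);
      }
      return out;
    }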
-libtextclassifier3::StatusOr<int> LiteIndex::CountHits(
- uint32_t term_id, const NamespaceChecker* namespace_checker) {
+int LiteIndex::CountHits(uint32_t term_id) {
return AppendHits(term_id, kSectionIdMaskAll,
- /*only_from_prefix_sections=*/false, namespace_checker,
+ /*only_from_prefix_sections=*/false,
/*hits_out=*/nullptr);
}
@@ -391,16 +380,15 @@ bool LiteIndex::is_full() const {
lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction));
}
-IndexDebugInfoProto::LiteIndexDebugInfoProto LiteIndex::GetDebugInfo(
- int verbosity) {
- IndexDebugInfoProto::LiteIndexDebugInfoProto res;
- res.set_curr_size(header_->cur_size());
- res.set_hit_buffer_size(options_.hit_buffer_size);
- res.set_last_added_document_id(header_->last_added_docid());
- res.set_searchable_end(header_->searchable_end());
- res.set_index_crc(ComputeChecksum().Get());
- lexicon_.GetDebugInfo(verbosity, res.mutable_lexicon_info());
- return res;
+void LiteIndex::GetDebugInfo(int verbosity, std::string* out) const {
+ absl_ports::StrAppend(
+ out, IcingStringUtil::StringPrintf("Lite Index\nHit buffer %u/%u\n",
+ header_->cur_size(),
+ options_.hit_buffer_size));
+
+ // Lexicon.
+ out->append("Lexicon stats:\n");
+ lexicon_.GetDebugInfo(verbosity, out);
}
libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const {
@@ -421,8 +409,12 @@ IndexStorageInfoProto LiteIndex::GetStorageInfo(
IndexStorageInfoProto storage_info) const {
int64_t header_and_hit_buffer_file_size =
filesystem_->GetFileSize(hit_buffer_fd_.get());
- storage_info.set_lite_index_hit_buffer_size(
- IcingFilesystem::SanitizeFileSize(header_and_hit_buffer_file_size));
+ if (header_and_hit_buffer_file_size != Filesystem::kBadFileSize) {
+ storage_info.set_lite_index_hit_buffer_size(
+ header_and_hit_buffer_file_size);
+ } else {
+ storage_info.set_lite_index_hit_buffer_size(-1);
+ }
int64_t lexicon_disk_usage = lexicon_.GetElementsSize();
if (lexicon_disk_usage != Filesystem::kBadFileSize) {
storage_info.set_lite_index_lexicon_size(lexicon_disk_usage);
diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h
index 378fc94..b134aba 100644
--- a/icing/index/lite/lite-index.h
+++ b/icing/index/lite/lite-index.h
@@ -37,12 +37,10 @@
#include "icing/legacy/index/icing-lite-index-header.h"
#include "icing/legacy/index/icing-lite-index-options.h"
#include "icing/legacy/index/icing-mmapper.h"
-#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
-#include "icing/store/namespace-checker.h"
#include "icing/store/namespace-id.h"
#include "icing/util/bit-util.h"
#include "icing/util/crc32.h"
@@ -142,19 +140,13 @@ class LiteIndex {
// skipping hits in non-prefix sections if only_from_prefix_sections is true,
// to hits_out. If hits_out is nullptr, no hits will be added.
//
- // Only those hits which belongs to the given namespaces will be counted and
- // appended. A nullptr namespace checker will disable this check.
- //
// Returns the number of hits that would be added to hits_out.
int AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
bool only_from_prefix_sections,
- const NamespaceChecker* namespace_checker,
std::vector<DocHitInfo>* hits_out);
// Returns the hit count of the term.
- // Only those hits which belongs to the given namespaces will be counted.
- libtextclassifier3::StatusOr<int> CountHits(
- uint32_t term_id, const NamespaceChecker* namespace_checker);
+ int CountHits(uint32_t term_id);
// Check if buffer has reached its capacity.
bool is_full() const;
@@ -242,7 +234,7 @@ class LiteIndex {
// Returns debug information for the index in out.
// verbosity <= 0, simplest debug information - size of lexicon, hit buffer
// verbosity > 0, more detailed debug information from the lexicon.
- IndexDebugInfoProto::LiteIndexDebugInfoProto GetDebugInfo(int verbosity);
+ void GetDebugInfo(int verbosity, std::string* out) const;
// Returns the byte size of all the elements held in the index. This excludes
// the size of any internal metadata of the index, e.g. the index's header.
diff --git a/icing/index/lite/lite-index_test.cc b/icing/index/lite/lite-index_test.cc
deleted file mode 100644
index 825f830..0000000
--- a/icing/index/lite/lite-index_test.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/index/lite/lite-index.h"
-
-#include <vector>
-
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include "icing/index/term-id-codec.h"
-#include "icing/legacy/index/icing-mock-filesystem.h"
-#include "icing/schema/section.h"
-#include "icing/store/namespace-checker.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/testing/tmp-directory.h"
-
-namespace icing {
-namespace lib {
-
-namespace {
-
-using ::testing::Eq;
-using ::testing::IsEmpty;
-using ::testing::SizeIs;
-
-class AlwaysFalseNamespaceCheckerImpl : public NamespaceChecker {
- public:
- bool BelongsToTargetNamespaces(DocumentId document_id) const override {
- return false;
- }
-};
-
-class LiteIndexTest : public testing::Test {
- protected:
- void SetUp() override {
- index_dir_ = GetTestTempDir() + "/test_dir";
- ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str()));
-
- std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
- LiteIndex::Options options(lite_index_file_name,
- /*hit_buffer_want_merge_bytes=*/1024 * 1024);
- ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
- LiteIndex::Create(options, &icing_filesystem_));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- term_id_codec_,
- TermIdCodec::Create(
- IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
- IcingDynamicTrie::max_value_index(options.lexicon_options)));
- }
-
- void TearDown() override {
- ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str()));
- }
-
- std::string index_dir_;
- Filesystem filesystem_;
- IcingFilesystem icing_filesystem_;
- std::unique_ptr<LiteIndex> lite_index_;
- std::unique_ptr<TermIdCodec> term_id_codec_;
-};
-
-constexpr NamespaceId kNamespace0 = 0;
-
-TEST_F(LiteIndexTest, LiteIndexAppendHits) {
- ICING_ASSERT_OK_AND_ASSIGN(
- uint32_t tvi,
- lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
- ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
- term_id_codec_->EncodeTvi(tvi, TviType::LITE));
- Hit doc_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
- Hit doc_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
- ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0));
- ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit1));
-
- std::vector<DocHitInfo> hits1;
- lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll,
- /*only_from_prefix_sections=*/false,
- /*namespace_checker=*/nullptr, &hits1);
- EXPECT_THAT(hits1, SizeIs(1));
- EXPECT_THAT(hits1.back().document_id(), Eq(0));
- // Check that the hits are coming from section 0 and section 1.
- EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11));
-
- std::vector<DocHitInfo> hits2;
- AlwaysFalseNamespaceCheckerImpl always_false_namespace_checker;
- lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll,
- /*only_from_prefix_sections=*/false,
- &always_false_namespace_checker, &hits2);
- // Check that no hits are returned because they get skipped by the namespace
- // checker.
- EXPECT_THAT(hits2, IsEmpty());
-}
-
-} // namespace
-} // namespace lib
-} // namespace icing
diff --git a/icing/index/main/doc-hit-info-iterator-term-main.cc b/icing/index/main/doc-hit-info-iterator-term-main.cc
index 98bc18e..5553c1e 100644
--- a/icing/index/main/doc-hit-info-iterator-term-main.cc
+++ b/icing/index/main/doc-hit-info-iterator-term-main.cc
@@ -57,9 +57,8 @@ libtextclassifier3::Status DocHitInfoIteratorTermMain::Advance() {
if (!absl_ports::IsNotFound(status)) {
// NOT_FOUND is expected to happen (not every term will be in the main
// index!). Other errors are worth logging.
- ICING_LOG(ERROR)
- << "Encountered unexpected failure while retrieving hits "
- << status.error_message();
+ ICING_LOG(ERROR) << "Failed to retrieve more hits "
+ << status.error_message();
}
return absl_ports::ResourceExhaustedError(
"No more DocHitInfos in iterator");
diff --git a/icing/index/main/flash-index-storage.cc b/icing/index/main/flash-index-storage.cc
index 3c52375..f125b6d 100644
--- a/icing/index/main/flash-index-storage.cc
+++ b/icing/index/main/flash-index-storage.cc
@@ -14,11 +14,11 @@
#include "icing/index/main/flash-index-storage.h"
+#include <errno.h>
+#include <inttypes.h>
#include <sys/types.h>
#include <algorithm>
-#include <cerrno>
-#include <cinttypes>
#include <cstdint>
#include <memory>
#include <unordered_set>
diff --git a/icing/index/main/flash-index-storage.h b/icing/index/main/flash-index-storage.h
index 6c6fbb8..8d5b50b 100644
--- a/icing/index/main/flash-index-storage.h
+++ b/icing/index/main/flash-index-storage.h
@@ -159,7 +159,6 @@ class FlashIndexStorage {
libtextclassifier3::Status Reset();
- // TODO(b/222349894) Convert the string output to a protocol buffer instead.
void GetDebugInfo(int verbosity, std::string* out) const;
private:
diff --git a/icing/index/main/flash-index-storage_test.cc b/icing/index/main/flash-index-storage_test.cc
index 25fcaad..7e15524 100644
--- a/icing/index/main/flash-index-storage_test.cc
+++ b/icing/index/main/flash-index-storage_test.cc
@@ -14,10 +14,10 @@
#include "icing/index/main/flash-index-storage.h"
+#include <stdlib.h>
#include <unistd.h>
#include <algorithm>
-#include <cstdlib>
#include <limits>
#include <utility>
#include <vector>
diff --git a/icing/index/main/index-block.cc b/icing/index/main/index-block.cc
index c6ab345..4590d06 100644
--- a/icing/index/main/index-block.cc
+++ b/icing/index/main/index-block.cc
@@ -14,8 +14,9 @@
#include "icing/index/main/index-block.h"
+#include <inttypes.h>
+
#include <algorithm>
-#include <cinttypes>
#include <limits>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
diff --git a/icing/index/main/index-block.h b/icing/index/main/index-block.h
index 5d75a2a..edf9a79 100644
--- a/icing/index/main/index-block.h
+++ b/icing/index/main/index-block.h
@@ -15,10 +15,10 @@
#ifndef ICING_INDEX_MAIN_INDEX_BLOCK_H_
#define ICING_INDEX_MAIN_INDEX_BLOCK_H_
+#include <string.h>
#include <sys/mman.h>
#include <algorithm>
-#include <cstring>
#include <limits>
#include <memory>
#include <string>
diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc
index 2d6007b..8ae6b27 100644
--- a/icing/index/main/main-index.cc
+++ b/icing/index/main/main-index.cc
@@ -133,10 +133,18 @@ libtextclassifier3::StatusOr<int64_t> MainIndex::GetElementsSize() const {
IndexStorageInfoProto MainIndex::GetStorageInfo(
IndexStorageInfoProto storage_info) const {
- storage_info.set_main_index_lexicon_size(
- IcingFilesystem::SanitizeFileSize(main_lexicon_->GetElementsSize()));
- storage_info.set_main_index_storage_size(
- Filesystem::SanitizeFileSize(flash_index_storage_->GetElementsSize()));
+ int64_t lexicon_elt_size = main_lexicon_->GetElementsSize();
+ if (lexicon_elt_size != IcingFilesystem::kBadFileSize) {
+ storage_info.set_main_index_lexicon_size(lexicon_elt_size);
+ } else {
+ storage_info.set_main_index_lexicon_size(-1);
+ }
+ int64_t index_elt_size = flash_index_storage_->GetElementsSize();
+ if (index_elt_size != Filesystem::kBadFileSize) {
+ storage_info.set_main_index_storage_size(index_elt_size);
+ } else {
+ storage_info.set_main_index_storage_size(-1);
+ }
storage_info.set_main_index_block_size(flash_index_storage_->block_size());
storage_info.set_num_blocks(flash_index_storage_->num_blocks());
storage_info.set_min_free_fraction(flash_index_storage_->min_free_fraction());
@@ -178,7 +186,7 @@ MainIndex::GetAccessorForPrefixTerm(const std::string& prefix) {
if (!exact && !hits_in_prefix_section.HasProperty(main_itr.GetValueIndex())) {
// Found it, but it doesn't have prefix hits. Exit early. No need to
// retrieve the posting list because there's nothing there for us.
- return absl_ports::NotFoundError("The term doesn't have any prefix hits.");
+ return libtextclassifier3::Status::OK;
}
PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
memcpy(&posting_list_id, main_itr.GetValue(), sizeof(posting_list_id));
@@ -209,48 +217,46 @@ bool IsTermInNamespaces(
libtextclassifier3::StatusOr<std::vector<TermMetadata>>
MainIndex::FindTermsByPrefix(const std::string& prefix,
- TermMatchType::Code term_match_type,
- const NamespaceChecker* namespace_checker) {
+ const std::vector<NamespaceId>& namespace_ids,
+ int num_to_return) {
// Finds all the terms that start with the given prefix in the lexicon.
IcingDynamicTrie::Iterator term_iterator(*main_lexicon_, prefix.c_str());
+ // A property reader to help check if a term has some property.
+ IcingDynamicTrie::PropertyReadersAll property_reader(*main_lexicon_);
+
std::vector<TermMetadata> term_metadata_list;
- while (term_iterator.IsValid()) {
- int count = 0;
- DocumentId last_document_id = kInvalidDocumentId;
+ while (term_iterator.IsValid() && term_metadata_list.size() < num_to_return) {
+ uint32_t term_value_index = term_iterator.GetValueIndex();
+ // Skips the terms that don't exist in the given namespaces. We won't skip
+ // any terms if namespace_ids is empty.
+ if (!IsTermInNamespaces(property_reader, term_value_index, namespace_ids)) {
+ term_iterator.Advance();
+ continue;
+ }
PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
memcpy(&posting_list_id, term_iterator.GetValue(), sizeof(posting_list_id));
- ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor,
- PostingListAccessor::CreateFromExisting(
- flash_index_storage_.get(), posting_list_id));
- ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
- pl_accessor.GetNextHitsBatch());
- for (const Hit& hit : hits) {
- DocumentId document_id = hit.document_id();
- if (document_id != last_document_id) {
- last_document_id = document_id;
- if (term_match_type == TermMatchType::EXACT_ONLY &&
- hit.is_prefix_hit()) {
- continue;
- }
- if (!namespace_checker->BelongsToTargetNamespaces(document_id)) {
- // The document is removed or expired or not belongs to target
- // namespaces.
- continue;
- }
- // TODO(b/152934343) Add search type in SuggestionSpec to ask user to
- // input search type, prefix or exact. And make different score strategy
- // base on that.
- ++count;
- }
- }
- if (count > 0) {
- term_metadata_list.push_back(TermMetadata(term_iterator.GetKey(), count));
- }
+ // Getting the actual hit count would require reading the entire posting
+ // list chain. We take an approximation to avoid all of those IO ops.
+ // Because we are not reading the posting lists, it is impossible to
+ // differentiate between single max-size posting lists and chains of
+ // max-size posting lists. We assume that the impact on scoring is not
+ // significant.
+ int approx_hit_count = IndexBlock::ApproximateFullPostingListHitsForBlock(
+ flash_index_storage_->block_size(),
+ posting_list_id.posting_list_index_bits());
+ term_metadata_list.emplace_back(term_iterator.GetKey(), approx_hit_count);
term_iterator.Advance();
}
+ if (term_iterator.IsValid()) {
+ // We exited the loop above because we hit the num_to_return limit.
+ ICING_LOG(WARNING) << "Ran into limit of " << num_to_return
+ << " while retrieving suggestions for " << prefix
+ << ". Some suggestions may not be returned and others "
+ "may be misranked.";
+ }
return term_metadata_list;
}
@@ -607,22 +613,16 @@ libtextclassifier3::Status MainIndex::AddPrefixBackfillHits(
return libtextclassifier3::Status::OK;
}
-IndexDebugInfoProto::MainIndexDebugInfoProto MainIndex::GetDebugInfo(
- int verbosity) const {
- IndexDebugInfoProto::MainIndexDebugInfoProto res;
-
+void MainIndex::GetDebugInfo(int verbosity, std::string* out) const {
// Lexicon.
- main_lexicon_->GetDebugInfo(verbosity, res.mutable_lexicon_info());
-
- res.set_last_added_document_id(last_added_document_id());
+ out->append("Main Lexicon stats:\n");
+ main_lexicon_->GetDebugInfo(verbosity, out);
if (verbosity <= 0) {
- return res;
+ return;
}
- flash_index_storage_->GetDebugInfo(verbosity,
- res.mutable_flash_index_storage_info());
- return res;
+ flash_index_storage_->GetDebugInfo(verbosity, out);
}
} // namespace lib
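The approximation above avoids reading posting list chains. A rough sketch of what a block-size-based estimate can look like (an assumption for illustration; the actual IndexBlock::ApproximateFullPostingListHitsForBlock implementation is not part of this diff):

    // Hypothetical estimate: a block of block_size bytes split into
    // 2^posting_list_index_bits equal posting lists, each full of Hits.
    int ApproximateHits(uint32_t block_size, uint32_t posting_list_index_bits) {
      uint32_t posting_list_bytes = block_size >> posting_list_index_bits;
      return posting_list_bytes / sizeof(Hit);
    }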
diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h
index abb0418..43635ca 100644
--- a/icing/index/main/main-index.h
+++ b/icing/index/main/main-index.h
@@ -27,9 +27,7 @@
#include "icing/index/term-metadata.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
-#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
-#include "icing/store/namespace-checker.h"
#include "icing/store/namespace-id.h"
#include "icing/util/status-macros.h"
@@ -73,17 +71,18 @@ class MainIndex {
// Finds terms with the given prefix in the given namespaces. If
// 'namespace_ids' is empty, returns results from all the namespaces. The
// input prefix must be normalized, otherwise inaccurate results may be
- // returned. If term_match_type is EXACT, only exact hit will be counted and
- // it is PREFIX, both prefix and exact hits will be counted. Results are not
- // sorted specifically and are in lexigraphical order. Number of results are
- // no more than 'num_to_return'.
+ // returned. Results are not sorted specifically and are in lexicographical
+ // order. The number of results is no more than 'num_to_return'.
+ //
+ // The hit count returned with each TermMetadata is an approximation based on
+ // the posting list size.
//
// Returns:
// A list of TermMetadata on success
// INTERNAL_ERROR if failed to access term data.
libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix(
- const std::string& prefix, TermMatchType::Code term_match_type,
- const NamespaceChecker* namespace_checker);
+ const std::string& prefix, const std::vector<NamespaceId>& namespace_ids,
+ int num_to_return);
struct LexiconMergeOutputs {
// Maps from main_lexicon tvi for new branching point to the main_lexicon
@@ -186,8 +185,7 @@ class MainIndex {
// verbosity <= 0, simplest debug information - just the lexicon
// verbosity > 0, more detailed debug information including raw postings
// lists.
- IndexDebugInfoProto::MainIndexDebugInfoProto GetDebugInfo(
- int verbosity) const;
+ void GetDebugInfo(int verbosity, std::string* out) const;
private:
libtextclassifier3::Status Init(const std::string& index_directory,
diff --git a/icing/index/main/main-index_test.cc b/icing/index/main/main-index_test.cc
index fa83d68..74139be 100644
--- a/icing/index/main/main-index_test.cc
+++ b/icing/index/main/main-index_test.cc
@@ -162,34 +162,6 @@ TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsValidAccessor) {
EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"), IsOk());
}
-TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsNotFound) {
- // 1. Index one doc in the Lite Index:
- // - Doc0 {"foot" is_in_prefix_section=false}
- ICING_ASSERT_OK_AND_ASSIGN(
- uint32_t tvi,
- lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0));
- ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
- term_id_codec_->EncodeTvi(tvi, TviType::LITE));
-
- Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
- ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
-
- // 2. Create the main index. It should have no entries in its lexicon.
- std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<MainIndex> main_index,
- MainIndex::Create(main_index_file_name, &filesystem_,
- &icing_filesystem_));
-
- // 3. Merge the index. The main index should return not found when we search
- // prefix contain "foo".
- ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
- // GetAccessorForPrefixTerm should return a valid accessor for "foo".
- EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-}
-
TEST_F(MainIndexTest, MainIndexGetAccessorForExactTermNotFound) {
// Create the main index. It should have no entries in its lexicon.
std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
diff --git a/icing/index/main/posting-list-free.h b/icing/index/main/posting-list-free.h
index 75b99d7..4b27401 100644
--- a/icing/index/main/posting-list-free.h
+++ b/icing/index/main/posting-list-free.h
@@ -15,10 +15,10 @@
#ifndef ICING_INDEX_MAIN_POSTING_LIST_FREE_H_
#define ICING_INDEX_MAIN_POSTING_LIST_FREE_H_
+#include <string.h>
#include <sys/mman.h>
#include <cstdint>
-#include <cstring>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
@@ -115,7 +115,7 @@ class PostingListFree {
// bytes which will store the next posting list index, the rest are unused and
// can be anything.
uint8_t *posting_list_buffer_;
- [[maybe_unused]] uint32_t size_in_bytes_;
+ uint32_t size_in_bytes_;
static_assert(sizeof(PostingListIndex) <=
posting_list_utils::min_posting_list_size(),
diff --git a/icing/index/main/posting-list-used.h b/icing/index/main/posting-list-used.h
index 8944034..1b2e24e 100644
--- a/icing/index/main/posting-list-used.h
+++ b/icing/index/main/posting-list-used.h
@@ -15,10 +15,10 @@
#ifndef ICING_INDEX_MAIN_POSTING_LIST_USED_H_
#define ICING_INDEX_MAIN_POSTING_LIST_USED_H_
+#include <string.h>
#include <sys/mman.h>
#include <algorithm>
-#include <cstring>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
diff --git a/icing/jni/icing-search-engine-jni.cc b/icing/jni/icing-search-engine-jni.cc
index bcc35e6..ea2bcf7 100644
--- a/icing/jni/icing-search-engine-jni.cc
+++ b/icing/jni/icing-search-engine-jni.cc
@@ -166,7 +166,6 @@ Java_com_google_android_icing_IcingSearchEngine_nativeGetSchemaType(
env->GetStringUTFChars(schema_type, /*isCopy=*/nullptr);
icing::lib::GetSchemaTypeResultProto get_schema_type_result_proto =
icing->GetSchemaType(native_schema_type);
- env->ReleaseStringUTFChars(schema_type, native_schema_type);
return SerializeProtoToJniByteArray(env, get_schema_type_result_proto);
}
@@ -193,20 +192,19 @@ JNIEXPORT jbyteArray JNICALL
Java_com_google_android_icing_IcingSearchEngine_nativeGet(
JNIEnv* env, jclass clazz, jobject object, jstring name_space, jstring uri,
jbyteArray result_spec_bytes) {
- icing::lib::GetResultSpecProto get_result_spec;
- if (!ParseProtoFromJniByteArray(env, result_spec_bytes, &get_result_spec)) {
- ICING_LOG(ERROR) << "Failed to parse GetResultSpecProto in nativeGet";
- return nullptr;
- }
icing::lib::IcingSearchEngine* icing =
GetIcingSearchEnginePointer(env, object);
+
const char* native_name_space =
env->GetStringUTFChars(name_space, /*isCopy=*/nullptr);
const char* native_uri = env->GetStringUTFChars(uri, /*isCopy=*/nullptr);
+ icing::lib::GetResultSpecProto get_result_spec;
+ if (!ParseProtoFromJniByteArray(env, result_spec_bytes, &get_result_spec)) {
+ ICING_LOG(ERROR) << "Failed to parse GetResultSpecProto in nativeGet";
+ return nullptr;
+ }
icing::lib::GetResultProto get_result_proto =
icing->Get(native_name_space, native_uri, get_result_spec);
- env->ReleaseStringUTFChars(uri, native_uri);
- env->ReleaseStringUTFChars(name_space, native_name_space);
return SerializeProtoToJniByteArray(env, get_result_proto);
}
@@ -308,8 +306,6 @@ Java_com_google_android_icing_IcingSearchEngine_nativeDelete(
const char* native_uri = env->GetStringUTFChars(uri, /*isCopy=*/nullptr);
icing::lib::DeleteResultProto delete_result_proto =
icing->Delete(native_name_space, native_uri);
- env->ReleaseStringUTFChars(uri, native_uri);
- env->ReleaseStringUTFChars(name_space, native_name_space);
return SerializeProtoToJniByteArray(env, delete_result_proto);
}
@@ -324,7 +320,6 @@ Java_com_google_android_icing_IcingSearchEngine_nativeDeleteByNamespace(
env->GetStringUTFChars(name_space, /*isCopy=*/nullptr);
icing::lib::DeleteByNamespaceResultProto delete_by_namespace_result_proto =
icing->DeleteByNamespace(native_name_space);
- env->ReleaseStringUTFChars(name_space, native_name_space);
return SerializeProtoToJniByteArray(env, delete_by_namespace_result_proto);
}
@@ -339,7 +334,6 @@ Java_com_google_android_icing_IcingSearchEngine_nativeDeleteBySchemaType(
env->GetStringUTFChars(schema_type, /*isCopy=*/nullptr);
icing::lib::DeleteBySchemaTypeResultProto delete_by_schema_type_result_proto =
icing->DeleteBySchemaType(native_schema_type);
- env->ReleaseStringUTFChars(schema_type, native_schema_type);
return SerializeProtoToJniByteArray(env, delete_by_schema_type_result_proto);
}
@@ -426,23 +420,4 @@ Java_com_google_android_icing_IcingSearchEngine_nativeReset(
return SerializeProtoToJniByteArray(env, reset_result_proto);
}
-JNIEXPORT jbyteArray JNICALL
-Java_com_google_android_icing_IcingSearchEngine_nativeSearchSuggestions(
- JNIEnv* env, jclass clazz, jobject object,
- jbyteArray suggestion_spec_bytes) {
- icing::lib::IcingSearchEngine* icing =
- GetIcingSearchEnginePointer(env, object);
-
- icing::lib::SuggestionSpecProto suggestion_spec_proto;
- if (!ParseProtoFromJniByteArray(env, suggestion_spec_bytes,
- &suggestion_spec_proto)) {
- ICING_LOG(ERROR) << "Failed to parse SuggestionSpecProto in nativeSearch";
- return nullptr;
- }
- icing::lib::SuggestionResponse suggestionResponse =
- icing->SearchSuggestions(suggestion_spec_proto);
-
- return SerializeProtoToJniByteArray(env, suggestionResponse);
-}
-
} // extern "C"
diff --git a/icing/legacy/core/icing-core-types.h b/icing/legacy/core/icing-core-types.h
index 7db8408..cc12663 100644
--- a/icing/legacy/core/icing-core-types.h
+++ b/icing/legacy/core/icing-core-types.h
@@ -21,8 +21,9 @@
#ifndef ICING_LEGACY_CORE_ICING_CORE_TYPES_H_
#define ICING_LEGACY_CORE_ICING_CORE_TYPES_H_
+#include <stdint.h>
+
#include <cstddef> // size_t not defined implicitly for all platforms.
-#include <cstdint>
#include <vector>
#include "icing/legacy/core/icing-compat.h"
diff --git a/icing/legacy/core/icing-string-util.cc b/icing/legacy/core/icing-string-util.cc
index ed06e03..2eb64ac 100644
--- a/icing/legacy/core/icing-string-util.cc
+++ b/icing/legacy/core/icing-string-util.cc
@@ -13,11 +13,12 @@
// limitations under the License.
#include "icing/legacy/core/icing-string-util.h"
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
#include <algorithm>
-#include <cstdarg>
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
#include <string>
#include "icing/legacy/portable/icing-zlib.h"
diff --git a/icing/legacy/core/icing-string-util.h b/icing/legacy/core/icing-string-util.h
index e5e4941..767e581 100644
--- a/icing/legacy/core/icing-string-util.h
+++ b/icing/legacy/core/icing-string-util.h
@@ -15,8 +15,9 @@
#ifndef ICING_LEGACY_CORE_ICING_STRING_UTIL_H_
#define ICING_LEGACY_CORE_ICING_STRING_UTIL_H_
-#include <cstdarg>
-#include <cstdint>
+#include <stdarg.h>
+#include <stdint.h>
+
#include <string>
#include "icing/legacy/core/icing-compat.h"
diff --git a/icing/legacy/core/icing-timer.h b/icing/legacy/core/icing-timer.h
index af38912..49ba9ad 100644
--- a/icing/legacy/core/icing-timer.h
+++ b/icing/legacy/core/icing-timer.h
@@ -16,8 +16,7 @@
#define ICING_LEGACY_CORE_ICING_TIMER_H_
#include <sys/time.h>
-
-#include <ctime>
+#include <time.h>
namespace icing {
namespace lib {
diff --git a/icing/legacy/index/icing-array-storage.cc b/icing/legacy/index/icing-array-storage.cc
index 4d2ef67..b462135 100644
--- a/icing/legacy/index/icing-array-storage.cc
+++ b/icing/legacy/index/icing-array-storage.cc
@@ -14,10 +14,10 @@
#include "icing/legacy/index/icing-array-storage.h"
+#include <inttypes.h>
#include <sys/mman.h>
#include <algorithm>
-#include <cinttypes>
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/core/icing-timer.h"
diff --git a/icing/legacy/index/icing-array-storage.h b/icing/legacy/index/icing-array-storage.h
index 0d93172..fad0565 100644
--- a/icing/legacy/index/icing-array-storage.h
+++ b/icing/legacy/index/icing-array-storage.h
@@ -20,7 +20,8 @@
#ifndef ICING_LEGACY_INDEX_ICING_ARRAY_STORAGE_H_
#define ICING_LEGACY_INDEX_ICING_ARRAY_STORAGE_H_
-#include <cstdint>
+#include <stdint.h>
+
#include <string>
#include <vector>
diff --git a/icing/legacy/index/icing-bit-util.h b/icing/legacy/index/icing-bit-util.h
index d0c3f50..3273a68 100644
--- a/icing/legacy/index/icing-bit-util.h
+++ b/icing/legacy/index/icing-bit-util.h
@@ -20,8 +20,9 @@
#ifndef ICING_LEGACY_INDEX_ICING_BIT_UTIL_H_
#define ICING_LEGACY_INDEX_ICING_BIT_UTIL_H_
-#include <cstdint>
-#include <cstdio>
+#include <stdint.h>
+#include <stdio.h>
+
#include <limits>
#include <vector>
diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc
index 77876c4..29843ba 100644
--- a/icing/legacy/index/icing-dynamic-trie.cc
+++ b/icing/legacy/index/icing-dynamic-trie.cc
@@ -62,16 +62,15 @@
#include "icing/legacy/index/icing-dynamic-trie.h"
+#include <errno.h>
#include <fcntl.h>
+#include <inttypes.h>
+#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <algorithm>
-#include <cerrno>
-#include <cinttypes>
-#include <cstdint>
-#include <cstring>
#include <memory>
#include <utility>
@@ -398,8 +397,6 @@ class IcingDynamicTrie::IcingDynamicTrieStorage {
// storage.
IcingScopedFd array_fds_[NUM_ARRAY_TYPES];
std::vector<IcingArrayStorage> array_storage_;
-
- // Legacy file system. Switch to use the new Filesystem class instead.
const IcingFilesystem *filesystem_;
};
@@ -1367,12 +1364,10 @@ uint32_t IcingDynamicTrie::size() const {
return storage_->hdr().num_keys();
}
-void IcingDynamicTrie::CollectStatsRecursive(const Node &node, Stats *stats,
- uint32_t depth) const {
+void IcingDynamicTrie::CollectStatsRecursive(const Node &node,
+ Stats *stats) const {
if (node.is_leaf()) {
stats->num_leaves++;
- stats->sum_depth += depth;
- stats->max_depth = max(stats->max_depth, depth);
const char *suffix = storage_->GetSuffix(node.next_index());
stats->suffixes_used += strlen(suffix) + 1 + value_size();
if (!suffix[0]) {
@@ -1384,16 +1379,13 @@ void IcingDynamicTrie::CollectStatsRecursive(const Node &node, Stats *stats,
for (; i < (1U << node.log2_num_children()); i++) {
const Next &next = *storage_->GetNext(node.next_index(), i);
if (next.node_index() == kInvalidNodeIndex) break;
- CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats,
- depth + 1);
+ CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats);
}
// At least one valid node in each next array
if (i == 0) {
ICING_LOG(FATAL) << "No valid node in 'next' array";
}
- stats->sum_children += i;
- stats->max_children = max(stats->max_children, i);
stats->child_counts[i - 1]++;
stats->wasted[node.log2_num_children()] +=
@@ -1475,12 +1467,9 @@ std::string IcingDynamicTrie::Stats::DumpStats(int verbosity) const {
"Wasted total: %u\n"
"Num intermediates %u num leaves %u "
"suffixes used %u null %u\n"
- "avg and max children for intermediates: %.3f, %u\n"
- "avg and max depth for leaves: %.3f, %u\n"
"Total next frag: %.3f%%\n",
total_wasted, num_intermediates, num_leaves, suffixes_used,
- null_suffixes, 1. * sum_children / num_intermediates, max_children,
- 1. * sum_depth / num_leaves, max_depth,
+ null_suffixes,
100. * math_util::SafeDivide((total_free + total_wasted), num_nexts));
}
IcingStringUtil::SStringAppendF(
diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h
index 013b926..7fe290b 100644
--- a/icing/legacy/index/icing-dynamic-trie.h
+++ b/icing/legacy/index/icing-dynamic-trie.h
@@ -35,7 +35,8 @@
#ifndef ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_
#define ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_
-#include <cstdint>
+#include <stdint.h>
+
#include <memory>
#include <string>
#include <unordered_map>
@@ -152,13 +153,8 @@ class IcingDynamicTrie : public IIcingStorage {
uint32_t max_nodes;
// Count of intermediate nodes.
uint32_t num_intermediates;
- // Total and maximum number of children of intermediate nodes.
- uint32_t sum_children, max_children;
-
// Count of leaf nodes.
uint32_t num_leaves;
- // Total and maximum depth of leaf nodes.
- uint32_t sum_depth, max_depth;
// Next stats
@@ -191,7 +187,6 @@ class IcingDynamicTrie : public IIcingStorage {
uint32_t dirty_pages_nexts;
uint32_t dirty_pages_suffixes;
- // TODO(b/222349894) Convert the string output to a protocol buffer instead.
std::string DumpStats(int verbosity) const;
};
@@ -607,8 +602,7 @@ class IcingDynamicTrie : public IIcingStorage {
static const uint32_t kInvalidSuffixIndex;
// Stats helpers.
- void CollectStatsRecursive(const Node &node, Stats *stats,
- uint32_t depth = 0) const;
+ void CollectStatsRecursive(const Node &node, Stats *stats) const;
// Helpers for Find and Insert.
const Next *GetNextByChar(const Node *node, uint8_t key_char) const;
diff --git a/icing/legacy/index/icing-filesystem.cc b/icing/legacy/index/icing-filesystem.cc
index 4f5e571..90e9146 100644
--- a/icing/legacy/index/icing-filesystem.cc
+++ b/icing/legacy/index/icing-filesystem.cc
@@ -16,6 +16,7 @@
#include <dirent.h>
#include <dlfcn.h>
+#include <errno.h>
#include <fcntl.h>
#include <fnmatch.h>
#include <pthread.h>
@@ -26,7 +27,6 @@
#include <unistd.h>
#include <algorithm>
-#include <cerrno>
#include <unordered_set>
#include "icing/absl_ports/str_cat.h"
diff --git a/icing/legacy/index/icing-filesystem.h b/icing/legacy/index/icing-filesystem.h
index ce75a82..f645632 100644
--- a/icing/legacy/index/icing-filesystem.h
+++ b/icing/legacy/index/icing-filesystem.h
@@ -224,11 +224,6 @@ class IcingFilesystem {
// Increments to_increment by size if size is valid, or sets to_increment
// to kBadFileSize if either size or to_increment is kBadFileSize.
static void IncrementByOrSetInvalid(uint64_t size, uint64_t *to_increment);
-
- // Return -1 if file_size is invalid. Otherwise, return file_size.
- static int64_t SanitizeFileSize(int64_t file_size) {
- return (file_size != kBadFileSize) ? file_size : -1;
- }
};
} // namespace lib
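With SanitizeFileSize removed, call sites spell the check out, as in the GetStorageInfo hunks above. The two forms are equivalent; a minimal sketch (the proto field name here is hypothetical):

    int64_t size = filesystem.GetFileSize(fd);
    // Was: storage_info.set_some_size(IcingFilesystem::SanitizeFileSize(size));
    storage_info.set_some_size(size != IcingFilesystem::kBadFileSize ? size : -1);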
diff --git a/icing/legacy/index/icing-flash-bitmap.h b/icing/legacy/index/icing-flash-bitmap.h
index 6bb9591..3b3521a 100644
--- a/icing/legacy/index/icing-flash-bitmap.h
+++ b/icing/legacy/index/icing-flash-bitmap.h
@@ -37,7 +37,8 @@
#ifndef ICING_LEGACY_INDEX_ICING_FLASH_BITMAP_H_
#define ICING_LEGACY_INDEX_ICING_FLASH_BITMAP_H_
-#include <cstdint>
+#include <stdint.h>
+
#include <memory>
#include <string>
@@ -138,7 +139,6 @@ class IcingFlashBitmap {
// Upgrade for version 18.
bool UpgradeTo18();
- // Legacy file system. Switch to use the new Filesystem class instead.
const IcingFilesystem *const filesystem_;
std::string filename_;
OpenType open_type_;
diff --git a/icing/legacy/index/icing-mmapper.cc b/icing/legacy/index/icing-mmapper.cc
index 7946c82..737335c 100644
--- a/icing/legacy/index/icing-mmapper.cc
+++ b/icing/legacy/index/icing-mmapper.cc
@@ -17,11 +17,10 @@
//
#include "icing/legacy/index/icing-mmapper.h"
+#include <errno.h>
+#include <string.h>
#include <sys/mman.h>
-#include <cerrno>
-#include <cstring>
-
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/util/logging.h"
diff --git a/icing/legacy/index/icing-mock-filesystem.h b/icing/legacy/index/icing-mock-filesystem.h
index 122ee7b..75ac62f 100644
--- a/icing/legacy/index/icing-mock-filesystem.h
+++ b/icing/legacy/index/icing-mock-filesystem.h
@@ -15,15 +15,16 @@
#ifndef ICING_LEGACY_INDEX_ICING_MOCK_FILESYSTEM_H_
#define ICING_LEGACY_INDEX_ICING_MOCK_FILESYSTEM_H_
-#include <cstdint>
-#include <cstdio>
-#include <cstring>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
#include <memory>
#include <string>
#include <vector>
-#include "gmock/gmock.h"
#include "icing/legacy/index/icing-filesystem.h"
+#include "gmock/gmock.h"
namespace icing {
namespace lib {
diff --git a/icing/legacy/index/icing-storage-file.cc b/icing/legacy/index/icing-storage-file.cc
index 35a4418..b27ec67 100644
--- a/icing/legacy/index/icing-storage-file.cc
+++ b/icing/legacy/index/icing-storage-file.cc
@@ -14,9 +14,9 @@
#include "icing/legacy/index/icing-storage-file.h"
+#include <inttypes.h>
#include <unistd.h>
-#include <cinttypes>
#include <string>
#include "icing/legacy/core/icing-compat.h"
diff --git a/icing/portable/endian.h b/icing/portable/endian.h
index ecebb15..42f6c02 100644
--- a/icing/portable/endian.h
+++ b/icing/portable/endian.h
@@ -12,12 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//
-// Utility functions that depend on bytesex. We define versions of htonll and
-// ntohll (HostToNetworkLL and NetworkToHostLL in our naming), as well as
-// "Google" versions of all the standards: ghtonl, ghtons, and so on
-// (GHostToNetworkL, GHostToNetworkS, etc in our naming). These functions do
-// exactly the same as their standard variants, but don't require including the
-// dangerous netinet/in.h.
+// Utility functions that depend on bytesex. We define htonll and ntohll,
+// as well as "Google" versions of all the standards: ghtonl, ghtons, and
+// so on. These functions do exactly the same as their standard variants,
+// but don't require including the dangerous netinet/in.h.
#ifndef ICING_PORTABLE_ENDIAN_H_
#define ICING_PORTABLE_ENDIAN_H_
@@ -77,7 +75,7 @@
// The following guarantees declaration of the byte swap functions
#ifdef COMPILER_MSVC
-#include <cstdlib> // NOLINT(build/include)
+#include <stdlib.h> // NOLINT(build/include)
#define bswap_16(x) _byteswap_ushort(x)
#define bswap_32(x) _byteswap_ulong(x)
@@ -172,37 +170,37 @@ inline uint16 gbswap_16(uint16 host_int) { return bswap_16(host_int); }
// correctly handle the (rather involved) definitions of bswap_32.
// gcc guarantees that inline functions are as fast as macros, so
// this isn't a performance hit.
-inline uint16_t GHostToNetworkS(uint16_t x) { return gbswap_16(x); }
-inline uint32_t GHostToNetworkL(uint32_t x) { return gbswap_32(x); }
-inline uint64_t GHostToNetworkLL(uint64_t x) { return gbswap_64(x); }
+inline uint16_t ghtons(uint16_t x) { return gbswap_16(x); }
+inline uint32_t ghtonl(uint32_t x) { return gbswap_32(x); }
+inline uint64_t ghtonll(uint64_t x) { return gbswap_64(x); }
#elif defined IS_BIG_ENDIAN
// These definitions are simpler on big-endian machines
// These are functions instead of macros to avoid self-assignment warnings
// on calls such as "i = ghtnol(i);". This also provides type checking.
-inline uint16 GHostToNetworkS(uint16 x) { return x; }
-inline uint32 GHostToNetworkL(uint32 x) { return x; }
-inline uint64 GHostToNetworkLL(uint64 x) { return x; }
+inline uint16 ghtons(uint16 x) { return x; }
+inline uint32 ghtonl(uint32 x) { return x; }
+inline uint64 ghtonll(uint64 x) { return x; }
#else // bytesex
#error \
"Unsupported bytesex: Either IS_BIG_ENDIAN or IS_LITTLE_ENDIAN must be defined" // NOLINT
#endif // bytesex
-#ifndef HostToNetworkLL
+#ifndef htonll
// With the rise of 64-bit, some systems are beginning to define this.
-#define HostToNetworkLL(x) GHostToNetworkLL(x)
-#endif // HostToNetworkLL
+#define htonll(x) ghtonll(x)
+#endif // htonll
// ntoh* and hton* are the same thing for any size and bytesex,
// since the function is an involution, i.e., its own inverse.
-inline uint16_t GNetworkToHostS(uint16_t x) { return GHostToNetworkS(x); }
-inline uint32_t GNetworkToHostL(uint32_t x) { return GHostToNetworkL(x); }
-inline uint64_t GNetworkToHostLL(uint64_t x) { return GHostToNetworkLL(x); }
+inline uint16_t gntohs(uint16_t x) { return ghtons(x); }
+inline uint32_t gntohl(uint32_t x) { return ghtonl(x); }
+inline uint64_t gntohll(uint64_t x) { return ghtonll(x); }
-#ifndef NetworkToHostLL
-#define NetworkToHostLL(x) GHostToNetworkLL(x)
-#endif // NetworkToHostLL
+#ifndef ntohll
+#define ntohll(x) htonll(x)
+#endif // ntohll
#endif // ICING_PORTABLE_ENDIAN_H_
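A small usage sketch of the restored g-prefixed helpers (illustrative only; values assumed):

    #include <stdint.h>
    uint32_t host_value = 0x11223344;
    uint32_t wire = ghtonl(host_value);  // big-endian byte order on any host
    uint32_t back = gntohl(wire);        // round-trips: byte swap is an involution
    uint64_t wire64 = htonll(UINT64_C(1) << 40);  // 64-bit variant via the macro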
diff --git a/icing/portable/gzip_stream.cc b/icing/portable/gzip_stream.cc
deleted file mode 100644
index f00a993..0000000
--- a/icing/portable/gzip_stream.cc
+++ /dev/null
@@ -1,313 +0,0 @@
-// Copyright (C) 2009 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// This file contains the implementation of classes GzipInputStream and
-// GzipOutputStream. It is forked from protobuf because these classes are only
-// provided in libprotobuf-full but we would like to link libicing against the
-// smaller libprotobuf-lite instead.
-
-#include "icing/portable/gzip_stream.h"
-#include "icing/util/logging.h"
-
-namespace icing {
-namespace lib {
-namespace protobuf_ports {
-
-static const int kDefaultBufferSize = 65536;
-
-GzipInputStream::GzipInputStream(ZeroCopyInputStream* sub_stream, Format format,
- int buffer_size)
- : format_(format), sub_stream_(sub_stream), zerror_(Z_OK), byte_count_(0) {
- zcontext_.state = Z_NULL;
- zcontext_.zalloc = Z_NULL;
- zcontext_.zfree = Z_NULL;
- zcontext_.opaque = Z_NULL;
- zcontext_.total_out = 0;
- zcontext_.next_in = NULL;
- zcontext_.avail_in = 0;
- zcontext_.total_in = 0;
- zcontext_.msg = NULL;
- if (buffer_size == -1) {
- output_buffer_length_ = kDefaultBufferSize;
- } else {
- output_buffer_length_ = buffer_size;
- }
- output_buffer_ = operator new(output_buffer_length_);
- zcontext_.next_out = static_cast<Bytef*>(output_buffer_);
- zcontext_.avail_out = output_buffer_length_;
- output_position_ = output_buffer_;
-}
-GzipInputStream::~GzipInputStream() {
- operator delete(output_buffer_);
- zerror_ = inflateEnd(&zcontext_);
-}
-
-static inline int internalInflateInit2(z_stream* zcontext,
- GzipInputStream::Format format) {
- int windowBitsFormat = 0;
- switch (format) {
- case GzipInputStream::GZIP:
- windowBitsFormat = 16;
- break;
- case GzipInputStream::AUTO:
- windowBitsFormat = 32;
- break;
- case GzipInputStream::ZLIB:
- windowBitsFormat = 0;
- break;
- }
- return inflateInit2(zcontext, /* windowBits */ 15 | windowBitsFormat);
-}
-
-int GzipInputStream::Inflate(int flush) {
- if ((zerror_ == Z_OK) && (zcontext_.avail_out == 0)) {
- // previous inflate filled output buffer. don't change input params yet.
- } else if (zcontext_.avail_in == 0) {
- const void* in;
- int in_size;
- bool first = zcontext_.next_in == NULL;
- bool ok = sub_stream_->Next(&in, &in_size);
- if (!ok) {
- zcontext_.next_out = NULL;
- zcontext_.avail_out = 0;
- return Z_STREAM_END;
- }
- zcontext_.next_in = static_cast<Bytef*>(const_cast<void*>(in));
- zcontext_.avail_in = in_size;
- if (first) {
- int error = internalInflateInit2(&zcontext_, format_);
- if (error != Z_OK) {
- return error;
- }
- }
- }
- zcontext_.next_out = static_cast<Bytef*>(output_buffer_);
- zcontext_.avail_out = output_buffer_length_;
- output_position_ = output_buffer_;
- int error = inflate(&zcontext_, flush);
- return error;
-}
-
-void GzipInputStream::DoNextOutput(const void** data, int* size) {
- *data = output_position_;
- *size = ((uintptr_t)zcontext_.next_out) - ((uintptr_t)output_position_);
- output_position_ = zcontext_.next_out;
-}
-
-// implements ZeroCopyInputStream ----------------------------------
-bool GzipInputStream::Next(const void** data, int* size) {
- bool ok = (zerror_ == Z_OK) || (zerror_ == Z_STREAM_END) ||
- (zerror_ == Z_BUF_ERROR);
- if ((!ok) || (zcontext_.next_out == NULL)) {
- return false;
- }
- if (zcontext_.next_out != output_position_) {
- DoNextOutput(data, size);
- return true;
- }
- if (zerror_ == Z_STREAM_END) {
- if (zcontext_.next_out != NULL) {
- // sub_stream_ may have concatenated streams to follow
- zerror_ = inflateEnd(&zcontext_);
- byte_count_ += zcontext_.total_out;
- if (zerror_ != Z_OK) {
- return false;
- }
- zerror_ = internalInflateInit2(&zcontext_, format_);
- if (zerror_ != Z_OK) {
- return false;
- }
- } else {
- *data = NULL;
- *size = 0;
- return false;
- }
- }
- zerror_ = Inflate(Z_NO_FLUSH);
- if ((zerror_ == Z_STREAM_END) && (zcontext_.next_out == NULL)) {
- // The underlying stream's Next returned false inside Inflate.
- return false;
- }
- ok = (zerror_ == Z_OK) || (zerror_ == Z_STREAM_END) ||
- (zerror_ == Z_BUF_ERROR);
- if (!ok) {
- return false;
- }
- DoNextOutput(data, size);
- return true;
-}
-void GzipInputStream::BackUp(int count) {
- output_position_ = reinterpret_cast<void*>(
- reinterpret_cast<uintptr_t>(output_position_) - count);
-}
-bool GzipInputStream::Skip(int count) {
- const void* data;
- int size = 0;
- bool ok = Next(&data, &size);
- while (ok && (size < count)) {
- count -= size;
- ok = Next(&data, &size);
- }
- if (size > count) {
- BackUp(size - count);
- }
- return ok;
-}
-int64_t GzipInputStream::ByteCount() const {
- int64_t ret = byte_count_ + zcontext_.total_out;
- if (zcontext_.next_out != NULL && output_position_ != NULL) {
- ret += reinterpret_cast<uintptr_t>(zcontext_.next_out) -
- reinterpret_cast<uintptr_t>(output_position_);
- }
- return ret;
-}
-
-// =========================================================================
-
-GzipOutputStream::Options::Options()
- : format(GZIP),
- buffer_size(kDefaultBufferSize),
- compression_level(Z_DEFAULT_COMPRESSION),
- compression_strategy(Z_DEFAULT_STRATEGY) {}
-
-GzipOutputStream::GzipOutputStream(ZeroCopyOutputStream* sub_stream) {
- Init(sub_stream, Options());
-}
-
-GzipOutputStream::GzipOutputStream(ZeroCopyOutputStream* sub_stream,
- const Options& options) {
- Init(sub_stream, options);
-}
-
-void GzipOutputStream::Init(ZeroCopyOutputStream* sub_stream,
- const Options& options) {
- sub_stream_ = sub_stream;
- sub_data_ = NULL;
- sub_data_size_ = 0;
-
- input_buffer_length_ = options.buffer_size;
- input_buffer_ = operator new(input_buffer_length_);
-
- zcontext_.zalloc = Z_NULL;
- zcontext_.zfree = Z_NULL;
- zcontext_.opaque = Z_NULL;
- zcontext_.next_out = NULL;
- zcontext_.avail_out = 0;
- zcontext_.total_out = 0;
- zcontext_.next_in = NULL;
- zcontext_.avail_in = 0;
- zcontext_.total_in = 0;
- zcontext_.msg = NULL;
- // default to GZIP format
- int windowBitsFormat = 16;
- if (options.format == ZLIB) {
- windowBitsFormat = 0;
- }
- zerror_ =
- deflateInit2(&zcontext_, options.compression_level, Z_DEFLATED,
- /* windowBits */ 15 | windowBitsFormat,
- /* memLevel (default) */ 8, options.compression_strategy);
-}
-
-GzipOutputStream::~GzipOutputStream() {
- Close();
- operator delete(input_buffer_);
-}
-
-// private
-int GzipOutputStream::Deflate(int flush) {
- int error = Z_OK;
- do {
- if ((sub_data_ == NULL) || (zcontext_.avail_out == 0)) {
- bool ok = sub_stream_->Next(&sub_data_, &sub_data_size_);
- if (!ok) {
- sub_data_ = NULL;
- sub_data_size_ = 0;
- return Z_BUF_ERROR;
- }
- if (sub_data_size_ <= 0) {
- ICING_LOG(FATAL) << "Failed to advance underlying stream";
- }
- zcontext_.next_out = static_cast<Bytef*>(sub_data_);
- zcontext_.avail_out = sub_data_size_;
- }
- error = deflate(&zcontext_, flush);
- } while (error == Z_OK && zcontext_.avail_out == 0);
- if ((flush == Z_FULL_FLUSH) || (flush == Z_FINISH)) {
- // Notify lower layer of data.
- sub_stream_->BackUp(zcontext_.avail_out);
- // We don't own the buffer anymore.
- sub_data_ = NULL;
- sub_data_size_ = 0;
- }
- return error;
-}
-
-// implements ZeroCopyOutputStream ---------------------------------
-bool GzipOutputStream::Next(void** data, int* size) {
- if ((zerror_ != Z_OK) && (zerror_ != Z_BUF_ERROR)) {
- return false;
- }
- if (zcontext_.avail_in != 0) {
- zerror_ = Deflate(Z_NO_FLUSH);
- if (zerror_ != Z_OK) {
- return false;
- }
- }
- if (zcontext_.avail_in == 0) {
- // all input was consumed. reset the buffer.
- zcontext_.next_in = static_cast<Bytef*>(input_buffer_);
- zcontext_.avail_in = input_buffer_length_;
- *data = input_buffer_;
- *size = input_buffer_length_;
- } else {
- // The loop in Deflate should consume all avail_in
- ICING_LOG(ERROR) << "Deflate left bytes unconsumed";
- }
- return true;
-}
-void GzipOutputStream::BackUp(int count) {
- if (zcontext_.avail_in < static_cast<uInt>(count)) {
- ICING_LOG(FATAL) << "Not enough data to back up " << count << " bytes";
- }
- zcontext_.avail_in -= count;
-}
-int64_t GzipOutputStream::ByteCount() const {
- return zcontext_.total_in + zcontext_.avail_in;
-}
-
-bool GzipOutputStream::Flush() {
- zerror_ = Deflate(Z_FULL_FLUSH);
- // Return true if the flush succeeded or if it was a no-op.
- return (zerror_ == Z_OK) ||
- (zerror_ == Z_BUF_ERROR && zcontext_.avail_in == 0 &&
- zcontext_.avail_out != 0);
-}
-
-bool GzipOutputStream::Close() {
- if ((zerror_ != Z_OK) && (zerror_ != Z_BUF_ERROR)) {
- return false;
- }
- do {
- zerror_ = Deflate(Z_FINISH);
- } while (zerror_ == Z_OK);
- zerror_ = deflateEnd(&zcontext_);
- bool ok = zerror_ == Z_OK;
- zerror_ = Z_STREAM_END;
- return ok;
-}
-
-} // namespace protobuf_ports
-} // namespace lib
-} // namespace icing
diff --git a/icing/portable/gzip_stream.h b/icing/portable/gzip_stream.h
deleted file mode 100644
index 602093f..0000000
--- a/icing/portable/gzip_stream.h
+++ /dev/null
@@ -1,181 +0,0 @@
-// Copyright (C) 2009 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// This file contains the definition for classes GzipInputStream and
-// GzipOutputStream. It is forked from protobuf because these classes are only
-// provided in libprotobuf-full but we would like to link libicing against the
-// smaller libprotobuf-lite instead.
-//
-// GzipInputStream decompresses data from an underlying
-// ZeroCopyInputStream and provides the decompressed data as a
-// ZeroCopyInputStream.
-//
-// GzipOutputStream is an ZeroCopyOutputStream that compresses data to
-// an underlying ZeroCopyOutputStream.
-
-#ifndef GOOGLE3_ICING_PORTABLE_GZIP_STREAM_H_
-#define GOOGLE3_ICING_PORTABLE_GZIP_STREAM_H_
-
-#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
-#include "icing/portable/zlib.h"
-
-namespace icing {
-namespace lib {
-namespace protobuf_ports {
-
-// A ZeroCopyInputStream that reads compressed data through zlib
-class GzipInputStream : public google::protobuf::io::ZeroCopyInputStream {
- public:
- // Format key for constructor
- enum Format {
- // zlib will autodetect gzip header or deflate stream
- AUTO = 0,
-
- // GZIP streams have some extra header data for file attributes.
- GZIP = 1,
-
- // Simpler zlib stream format.
- ZLIB = 2,
- };
-
- // buffer_size and format may be -1 for default of 64kB and GZIP format
- explicit GzipInputStream(
- google::protobuf::io::ZeroCopyInputStream* sub_stream,
- Format format = AUTO, int buffer_size = -1);
- virtual ~GzipInputStream();
-
- // Return last error message or NULL if no error.
- inline const char* ZlibErrorMessage() const { return zcontext_.msg; }
- inline int ZlibErrorCode() const { return zerror_; }
-
- // implements ZeroCopyInputStream ----------------------------------
- bool Next(const void** data, int* size) override;
- void BackUp(int count) override;
- bool Skip(int count) override;
- int64_t ByteCount() const override;
-
- private:
- Format format_;
-
- google::protobuf::io::ZeroCopyInputStream* sub_stream_;
-
- z_stream zcontext_;
- int zerror_;
-
- void* output_buffer_;
- void* output_position_;
- size_t output_buffer_length_;
- int64_t byte_count_;
-
- int Inflate(int flush);
- void DoNextOutput(const void** data, int* size);
-};
-
-class GzipOutputStream : public google::protobuf::io::ZeroCopyOutputStream {
- public:
- // Format key for constructor
- enum Format {
- // GZIP streams have some extra header data for file attributes.
- GZIP = 1,
-
- // Simpler zlib stream format.
- ZLIB = 2,
- };
-
- struct Options {
- // Defaults to GZIP.
- Format format;
-
- // What size buffer to use internally. Defaults to 64kB.
- int buffer_size;
-
- // A number between 0 and 9, where 0 is no compression and 9 is best
- // compression. Defaults to Z_DEFAULT_COMPRESSION (see zlib.h).
- int compression_level;
-
- // Defaults to Z_DEFAULT_STRATEGY. Can also be set to Z_FILTERED,
- // Z_HUFFMAN_ONLY, or Z_RLE. See the documentation for deflateInit2 in
- // zlib.h for definitions of these constants.
- int compression_strategy;
-
- Options(); // Initializes with default values.
- };
-
- // Create a GzipOutputStream with default options.
- explicit GzipOutputStream(
- google::protobuf::io::ZeroCopyOutputStream* sub_stream);
-
- // Create a GzipOutputStream with the given options.
- GzipOutputStream(
- google::protobuf::io::ZeroCopyOutputStream* sub_stream,
- const Options& options);
-
- virtual ~GzipOutputStream();
-
- // Return last error message or NULL if no error.
- inline const char* ZlibErrorMessage() const { return zcontext_.msg; }
- inline int ZlibErrorCode() const { return zerror_; }
-
- // Flushes data written so far to zipped data in the underlying stream.
- // It is the caller's responsibility to flush the underlying stream if
- // necessary.
- // Compression may be less efficient stopping and starting around flushes.
- // Returns true if no error.
- //
- // Please ensure that block size is > 6. Here is an excerpt from the zlib
- // doc that explains why:
- //
- // In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that avail_out
- // is greater than six to avoid repeated flush markers due to
- // avail_out == 0 on return.
- bool Flush();
-
- // Writes out all data and closes the gzip stream.
- // It is the caller's responsibility to close the underlying stream if
- // necessary.
- // Returns true if no error.
- bool Close();
-
- // implements ZeroCopyOutputStream ---------------------------------
- bool Next(void** data, int* size) override;
- void BackUp(int count) override;
- int64_t ByteCount() const override;
-
- private:
- google::protobuf::io::ZeroCopyOutputStream* sub_stream_;
- // Result from calling Next() on sub_stream_
- void* sub_data_;
- int sub_data_size_;
-
- z_stream zcontext_;
- int zerror_;
- void* input_buffer_;
- size_t input_buffer_length_;
-
- // Shared constructor code.
- void Init(
- google::protobuf::io::ZeroCopyOutputStream* sub_stream,
- const Options& options);
-
- // Do some compression.
- // Takes zlib flush mode.
- // Returns zlib error code.
- int Deflate(int flush);
-};
-
-} // namespace protobuf_ports
-} // namespace lib
-} // namespace icing
-
-#endif // GOOGLE3_ICING_PORTABLE_GZIP_STREAM_H_
diff --git a/icing/query/query-processor.cc b/icing/query/query-processor.cc
index 36c76db..1f937fd 100644
--- a/icing/query/query-processor.cc
+++ b/icing/query/query-processor.cc
@@ -182,7 +182,7 @@ QueryProcessor::ParseRawQuery(const SearchSpecProto& search_spec) {
const Token& token = tokens.at(i);
std::unique_ptr<DocHitInfoIterator> result_iterator;
- // TODO(b/202076890): Handle negation tokens
+ // TODO(cassiewang): Handle negation tokens
switch (token.type) {
case Token::Type::QUERY_LEFT_PARENTHESES: {
frames.emplace(ParserStateFrame());
diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc
index e48fe78..bdd40aa 100644
--- a/icing/query/query-processor_benchmark.cc
+++ b/icing/query/query-processor_benchmark.cc
@@ -16,6 +16,7 @@
#include "gmock/gmock.h"
#include "third_party/absl/flags/flag.h"
#include "icing/document-builder.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/index.h"
#include "icing/proto/term.pb.h"
#include "icing/query/query-processor.h"
@@ -23,7 +24,6 @@
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/testing/common-matchers.h"
-#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc
index eaa0efc..daeb479 100644
--- a/icing/query/query-processor_test.cc
+++ b/icing/query/query-processor_test.cc
@@ -23,6 +23,7 @@
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/index.h"
#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
@@ -39,7 +40,6 @@
#include "icing/store/document-store.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
-#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
@@ -61,30 +61,28 @@ using ::testing::SizeIs;
using ::testing::Test;
using ::testing::UnorderedElementsAre;
-constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
- PropertyConfigProto::DataType::STRING;
+constexpr PropertyConfigProto_DataType_Code TYPE_STRING =
+ PropertyConfigProto_DataType_Code_STRING;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto_Cardinality_Code_OPTIONAL;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
+ StringIndexingConfig_TokenizerType_Code_PLAIN;
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
+constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
class QueryProcessorTest : public Test {
protected:
QueryProcessorTest()
: test_dir_(GetTestTempDir() + "/icing"),
store_dir_(test_dir_ + "/store"),
- schema_store_dir_(test_dir_ + "/schema_store"),
index_dir_(test_dir_ + "/index") {}
void SetUp() override {
filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
filesystem_.CreateDirectoryRecursively(index_dir_.c_str());
filesystem_.CreateDirectoryRecursively(store_dir_.c_str());
- filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
// If we've specified using the reverse-JNI method for segmentation (i.e.
@@ -131,7 +129,6 @@ class QueryProcessorTest : public Test {
Filesystem filesystem_;
const std::string test_dir_;
const std::string store_dir_;
- const std::string schema_store_dir_;
std::unique_ptr<Index> index_;
std::unique_ptr<LanguageSegmenter> language_segmenter_;
std::unique_ptr<Normalizer> normalizer_;
@@ -179,7 +176,7 @@ TEST_F(QueryProcessorTest, EmptyGroupMatchAllDocuments) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -230,7 +227,7 @@ TEST_F(QueryProcessorTest, EmptyQueryMatchAllDocuments) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -281,7 +278,7 @@ TEST_F(QueryProcessorTest, QueryTermNormalized) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -357,7 +354,7 @@ TEST_F(QueryProcessorTest, OneTermPrefixMatch) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -425,7 +422,7 @@ TEST_F(QueryProcessorTest, OneTermExactMatch) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -493,7 +490,7 @@ TEST_F(QueryProcessorTest, AndSameTermExactMatch) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -563,7 +560,7 @@ TEST_F(QueryProcessorTest, AndTwoTermExactMatch) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -638,7 +635,7 @@ TEST_F(QueryProcessorTest, AndSameTermPrefixMatch) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -708,7 +705,7 @@ TEST_F(QueryProcessorTest, AndTwoTermPrefixMatch) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -784,7 +781,7 @@ TEST_F(QueryProcessorTest, AndTwoTermPrefixAndExactMatch) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -860,7 +857,7 @@ TEST_F(QueryProcessorTest, OrTwoTermExactMatch) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -949,7 +946,7 @@ TEST_F(QueryProcessorTest, OrTwoTermPrefixMatch) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -1037,7 +1034,7 @@ TEST_F(QueryProcessorTest, OrTwoTermPrefixAndExactMatch) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -1124,7 +1121,7 @@ TEST_F(QueryProcessorTest, CombinedAndOrTerms) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -1310,7 +1307,7 @@ TEST_F(QueryProcessorTest, OneGroup) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -1386,7 +1383,7 @@ TEST_F(QueryProcessorTest, TwoGroups) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -1464,7 +1461,7 @@ TEST_F(QueryProcessorTest, ManyLevelNestedGrouping) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -1540,7 +1537,7 @@ TEST_F(QueryProcessorTest, OneLevelNestedGrouping) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -1617,7 +1614,7 @@ TEST_F(QueryProcessorTest, ExcludeTerm) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -1682,7 +1679,7 @@ TEST_F(QueryProcessorTest, ExcludeNonexistentTerm) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -1745,7 +1742,7 @@ TEST_F(QueryProcessorTest, ExcludeAnd) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -1835,7 +1832,7 @@ TEST_F(QueryProcessorTest, ExcludeOr) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -1931,7 +1928,7 @@ TEST_F(QueryProcessorTest, DeletedFilter) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -2005,7 +2002,7 @@ TEST_F(QueryProcessorTest, NamespaceFilter) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -2081,7 +2078,7 @@ TEST_F(QueryProcessorTest, SchemaTypeFilter) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -2158,7 +2155,7 @@ TEST_F(QueryProcessorTest, SectionFilterForOneDocument) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -2240,7 +2237,7 @@ TEST_F(QueryProcessorTest, SectionFilterAcrossSchemaTypes) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -2323,7 +2320,7 @@ TEST_F(QueryProcessorTest, SectionFilterWithinSchemaType) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -2407,7 +2404,7 @@ TEST_F(QueryProcessorTest, SectionFilterRespectsDifferentSectionIds) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -2480,7 +2477,7 @@ TEST_F(QueryProcessorTest, NonexistentSectionFilterReturnsEmptyResults) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -2547,7 +2544,7 @@ TEST_F(QueryProcessorTest, UnindexedSectionFilterReturnsEmptyResults) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -2617,7 +2614,7 @@ TEST_F(QueryProcessorTest, SectionFilterTermAndUnrestrictedTerm) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -2692,7 +2689,7 @@ TEST_F(QueryProcessorTest, DocumentBeforeTtlNotFilteredOut) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
// Arbitrary value, just has to be less than the document's creation
@@ -2751,7 +2748,7 @@ TEST_F(QueryProcessorTest, DocumentPastTtlFilteredOut) {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
// Arbitrary value, just has to be greater than the document's creation
diff --git a/icing/query/suggestion-processor.cc b/icing/query/suggestion-processor.cc
deleted file mode 100644
index cfa53f6..0000000
--- a/icing/query/suggestion-processor.cc
+++ /dev/null
@@ -1,96 +0,0 @@
-// Copyright (C) 2021 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/query/suggestion-processor.h"
-
-#include "icing/tokenization/tokenizer-factory.h"
-#include "icing/tokenization/tokenizer.h"
-#include "icing/transform/normalizer.h"
-
-namespace icing {
-namespace lib {
-
-libtextclassifier3::StatusOr<std::unique_ptr<SuggestionProcessor>>
-SuggestionProcessor::Create(Index* index,
- const LanguageSegmenter* language_segmenter,
- const Normalizer* normalizer) {
- ICING_RETURN_ERROR_IF_NULL(index);
- ICING_RETURN_ERROR_IF_NULL(language_segmenter);
-
- return std::unique_ptr<SuggestionProcessor>(
- new SuggestionProcessor(index, language_segmenter, normalizer));
-}
-
-libtextclassifier3::StatusOr<std::vector<TermMetadata>>
-SuggestionProcessor::QuerySuggestions(
- const icing::lib::SuggestionSpecProto& suggestion_spec,
- const NamespaceChecker* namespace_checker) {
- // We use the query tokenizer to tokenize the given prefix, and we only use
- // the last token as the suggestion prefix.
- ICING_ASSIGN_OR_RETURN(
- std::unique_ptr<Tokenizer> tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- StringIndexingConfig::TokenizerType::PLAIN, &language_segmenter_));
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
- tokenizer->Tokenize(suggestion_spec.prefix()));
-
- // If there are previous tokens, they are prepended to the suggestion,
- // separated by spaces.
- std::string last_token;
- int token_start_pos = 0;
- while (iterator->Advance()) {
- Token token = iterator->GetToken();
- last_token = token.text;
- token_start_pos = token.text.data() - suggestion_spec.prefix().c_str();
- }
-
- // If the position of the last token is not the end of the prefix, it means
- // there are operator tokens after it that were ignored by the tokenizer.
- bool is_last_token = token_start_pos + last_token.length() >=
- suggestion_spec.prefix().length();
-
- if (!is_last_token || last_token.empty()) {
- // We don't have a valid last token, return early.
- return std::vector<TermMetadata>();
- }
-
- std::string query_prefix =
- suggestion_spec.prefix().substr(0, token_start_pos);
- // Run suggestion based on given SuggestionSpec.
- // Normalize token text to lowercase since all tokens in the lexicon are
- // lowercase.
- ICING_ASSIGN_OR_RETURN(
- std::vector<TermMetadata> terms,
- index_.FindTermsByPrefix(
- normalizer_.NormalizeTerm(last_token),
- suggestion_spec.num_to_return(),
- suggestion_spec.scoring_spec().scoring_match_type(),
- namespace_checker));
-
- for (TermMetadata& term : terms) {
- term.content = query_prefix + term.content;
- }
- return terms;
-}
-
-SuggestionProcessor::SuggestionProcessor(
- Index* index, const LanguageSegmenter* language_segmenter,
- const Normalizer* normalizer)
- : index_(*index),
- language_segmenter_(*language_segmenter),
- normalizer_(*normalizer) {}
-
-} // namespace lib
-} // namespace icing
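For context on the file deleted above: QuerySuggestions tokenizes the whole prefix, keeps only the final token as the lookup key, and prepends everything before it to each expanded term. A minimal standalone sketch of that split, using a hypothetical whitespace-based helper (the real code used the query tokenizer, so operators were handled differently):

    #include <string>
    #include <utility>

    // Hypothetical helper, not part of Icing: split "bar baz f" into the
    // kept text "bar baz " and the last token "f" used for prefix lookup.
    std::pair<std::string, std::string> SplitSuggestionPrefix(
        const std::string& prefix) {
      size_t pos = prefix.find_last_of(' ');
      if (pos == std::string::npos) return {"", prefix};
      return {prefix.substr(0, pos + 1), prefix.substr(pos + 1)};
    }

Each returned TermMetadata.content then becomes the kept text plus the expanded term, which is why PrependedPrefixTokenTest in the deleted test file below expects the full leading text back.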
diff --git a/icing/query/suggestion-processor.h b/icing/query/suggestion-processor.h
deleted file mode 100644
index 088863e..0000000
--- a/icing/query/suggestion-processor.h
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright (C) 2021 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_QUERY_SUGGESTION_PROCESSOR_H_
-#define ICING_QUERY_SUGGESTION_PROCESSOR_H_
-
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/index/index.h"
-#include "icing/proto/search.pb.h"
-#include "icing/tokenization/language-segmenter.h"
-#include "icing/transform/normalizer.h"
-
-namespace icing {
-namespace lib {
-
-// Processes SuggestionSpecProtos and retrieves TermMetadata that satisfy the
-// prefix and its restrictions. This also performs ranking, and returns
-// TermMetadata ordered by their hit count.
-class SuggestionProcessor {
- public:
- // Factory function to create a SuggestionProcessor which does not take
- // ownership of any input components, and all pointers must refer to valid
- // objects that outlive the created SuggestionProcessor instance.
- //
- // Returns:
- // A SuggestionProcessor on success
- // FAILED_PRECONDITION if any of the pointers is null.
- static libtextclassifier3::StatusOr<std::unique_ptr<SuggestionProcessor>>
- Create(Index* index, const LanguageSegmenter* language_segmenter,
- const Normalizer* normalizer);
-
- // Query suggestions based on the given SuggestionSpecProto.
- //
- // Returns:
- // On success,
- // - A vector of TermMetadata for the matching suggested terms
- // INTERNAL_ERROR on all other errors
- libtextclassifier3::StatusOr<std::vector<TermMetadata>> QuerySuggestions(
- const SuggestionSpecProto& suggestion_spec,
- const NamespaceChecker* namespace_checker);
-
- private:
- explicit SuggestionProcessor(Index* index,
- const LanguageSegmenter* language_segmenter,
- const Normalizer* normalizer);
-
- // Not const because we could modify/sort the TermMetadata buffer in the lite
- // index.
- Index& index_;
- const LanguageSegmenter& language_segmenter_;
- const Normalizer& normalizer_;
-};
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_QUERY_SUGGESTION_PROCESSOR_H_
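A sketch of the call pattern implied by the deleted header, with error handling trimmed; index, segmenter, normalizer, and checker are assumed to be valid, caller-owned objects:

    ICING_ASSIGN_OR_RETURN(
        std::unique_ptr<SuggestionProcessor> processor,
        SuggestionProcessor::Create(index, segmenter, normalizer));

    SuggestionSpecProto spec;
    spec.set_prefix("fo");
    spec.set_num_to_return(5);
    ICING_ASSIGN_OR_RETURN(
        std::vector<TermMetadata> terms,
        processor->QuerySuggestions(spec, checker));
    // terms are ordered by hit count, best suggestions first.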
diff --git a/icing/query/suggestion-processor_test.cc b/icing/query/suggestion-processor_test.cc
deleted file mode 100644
index ba4c90a..0000000
--- a/icing/query/suggestion-processor_test.cc
+++ /dev/null
@@ -1,326 +0,0 @@
-// Copyright (C) 2021 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/query/suggestion-processor.h"
-
-#include "gmock/gmock.h"
-#include "icing/store/document-store.h"
-#include "icing/testing/always-true-namespace-checker-impl.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/testing/fake-clock.h"
-#include "icing/testing/icu-data-file-helper.h"
-#include "icing/testing/jni-test-helpers.h"
-#include "icing/testing/test-data.h"
-#include "icing/testing/tmp-directory.h"
-#include "icing/tokenization/language-segmenter-factory.h"
-#include "icing/transform/normalizer-factory.h"
-#include "unicode/uloc.h"
-
-namespace icing {
-namespace lib {
-
-namespace {
-
-using ::testing::IsEmpty;
-using ::testing::Test;
-
-class SuggestionProcessorTest : public Test {
- protected:
- SuggestionProcessorTest()
- : test_dir_(GetTestTempDir() + "/icing"),
- store_dir_(test_dir_ + "/store"),
- index_dir_(test_dir_ + "/index") {}
-
- void SetUp() override {
- filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
- filesystem_.CreateDirectoryRecursively(index_dir_.c_str());
- filesystem_.CreateDirectoryRecursively(store_dir_.c_str());
-
- if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
- // If we've specified using the reverse-JNI method for segmentation (i.e.
- // not ICU), then we won't have the ICU data file included to set up.
- // Technically, we could choose to use reverse-JNI for segmentation AND
- // include an ICU data file, but that seems unlikely and our current BUILD
- // setup doesn't do this.
- ICING_ASSERT_OK(
- // File generated via icu_data_file rule in //icing/BUILD.
- icu_data_file_helper::SetUpICUDataFile(
- GetTestFilePath("icing/icu.dat")));
- }
-
- Index::Options options(index_dir_,
- /*index_merge_size=*/1024 * 1024);
- ICING_ASSERT_OK_AND_ASSIGN(
- index_, Index::Create(options, &filesystem_, &icing_filesystem_));
-
- language_segmenter_factory::SegmenterOptions segmenter_options(
- ULOC_US, jni_cache_.get());
- ICING_ASSERT_OK_AND_ASSIGN(
- language_segmenter_,
- language_segmenter_factory::Create(segmenter_options));
-
- ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
- /*max_term_byte_size=*/1000));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentStore::CreateResult create_result,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
- }
-
- libtextclassifier3::Status AddTokenToIndex(
- DocumentId document_id, SectionId section_id,
- TermMatchType::Code term_match_type, const std::string& token) {
- Index::Editor editor = index_->Edit(document_id, section_id,
- term_match_type, /*namespace_id=*/0);
- auto status = editor.BufferTerm(token.c_str());
- return status.ok() ? editor.IndexAllBufferedTerms() : status;
- }
-
- void TearDown() override {
- filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
- }
-
- Filesystem filesystem_;
- const std::string test_dir_;
- const std::string store_dir_;
- std::unique_ptr<Index> index_;
- std::unique_ptr<LanguageSegmenter> language_segmenter_;
- std::unique_ptr<Normalizer> normalizer_;
- std::unique_ptr<SchemaStore> schema_store_;
- std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache();
- FakeClock fake_clock_;
-
- private:
- IcingFilesystem icing_filesystem_;
- const std::string index_dir_;
-};
-
-constexpr DocumentId kDocumentId0 = 0;
-constexpr SectionId kSectionId2 = 2;
-
-TEST_F(SuggestionProcessorTest, PrependedPrefixTokenTest) {
- ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2,
- TermMatchType::EXACT_ONLY, "foo"),
- IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SuggestionProcessor> suggestion_processor,
- SuggestionProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get()));
-
- SuggestionSpecProto suggestion_spec;
- suggestion_spec.set_prefix(
- "prefix token should be prepended to the suggestion f");
- suggestion_spec.set_num_to_return(10);
-
- AlwaysTrueNamespaceCheckerImpl impl;
- ICING_ASSERT_OK_AND_ASSIGN(
- std::vector<TermMetadata> terms,
- suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
- EXPECT_THAT(terms.at(0).content,
- "prefix token should be prepended to the suggestion foo");
-}
-
-TEST_F(SuggestionProcessorTest, NonExistentPrefixTest) {
- ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2,
- TermMatchType::EXACT_ONLY, "foo"),
- IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SuggestionProcessor> suggestion_processor,
- SuggestionProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get()));
-
- SuggestionSpecProto suggestion_spec;
- suggestion_spec.set_prefix("nonExistTerm");
- suggestion_spec.set_num_to_return(10);
-
- AlwaysTrueNamespaceCheckerImpl impl;
- ICING_ASSERT_OK_AND_ASSIGN(
- std::vector<TermMetadata> terms,
- suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
-
- EXPECT_THAT(terms, IsEmpty());
-}
-
-TEST_F(SuggestionProcessorTest, PrefixTrailingSpaceTest) {
- ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2,
- TermMatchType::EXACT_ONLY, "foo"),
- IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SuggestionProcessor> suggestion_processor,
- SuggestionProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get()));
-
- SuggestionSpecProto suggestion_spec;
- suggestion_spec.set_prefix("f ");
- suggestion_spec.set_num_to_return(10);
-
- AlwaysTrueNamespaceCheckerImpl impl;
- ICING_ASSERT_OK_AND_ASSIGN(
- std::vector<TermMetadata> terms,
- suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
-
- EXPECT_THAT(terms, IsEmpty());
-}
-
-TEST_F(SuggestionProcessorTest, NormalizePrefixTest) {
- ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2,
- TermMatchType::EXACT_ONLY, "foo"),
- IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SuggestionProcessor> suggestion_processor,
- SuggestionProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get()));
-
- SuggestionSpecProto suggestion_spec;
- suggestion_spec.set_prefix("F");
- suggestion_spec.set_num_to_return(10);
-
- AlwaysTrueNamespaceCheckerImpl impl;
- ICING_ASSERT_OK_AND_ASSIGN(
- std::vector<TermMetadata> terms,
- suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
- EXPECT_THAT(terms.at(0).content, "foo");
-
- suggestion_spec.set_prefix("fO");
- ICING_ASSERT_OK_AND_ASSIGN(
- terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
- EXPECT_THAT(terms.at(0).content, "foo");
-
- suggestion_spec.set_prefix("Fo");
- ICING_ASSERT_OK_AND_ASSIGN(
- terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
- EXPECT_THAT(terms.at(0).content, "foo");
-
- suggestion_spec.set_prefix("FO");
- ICING_ASSERT_OK_AND_ASSIGN(
- terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
- EXPECT_THAT(terms.at(0).content, "foo");
-}
-
-TEST_F(SuggestionProcessorTest, OrOperatorPrefixTest) {
- ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2,
- TermMatchType::EXACT_ONLY, "foo"),
- IsOk());
- ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2,
- TermMatchType::EXACT_ONLY, "original"),
- IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SuggestionProcessor> suggestion_processor,
- SuggestionProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get()));
-
- SuggestionSpecProto suggestion_spec;
- suggestion_spec.set_prefix("f OR");
- suggestion_spec.set_num_to_return(10);
-
- AlwaysTrueNamespaceCheckerImpl impl;
- ICING_ASSERT_OK_AND_ASSIGN(
- std::vector<TermMetadata> terms,
- suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
-
- // The last operator token is used to query the suggestion.
- EXPECT_THAT(terms.at(0).content, "f original");
-}
-
-TEST_F(SuggestionProcessorTest, ParenthesesOperatorPrefixTest) {
- ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2,
- TermMatchType::EXACT_ONLY, "foo"),
- IsOk());
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SuggestionProcessor> suggestion_processor,
- SuggestionProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get()));
-
- SuggestionSpecProto suggestion_spec;
- suggestion_spec.set_prefix("{f}");
- suggestion_spec.set_num_to_return(10);
-
- AlwaysTrueNamespaceCheckerImpl impl;
- ICING_ASSERT_OK_AND_ASSIGN(
- std::vector<TermMetadata> terms,
- suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
- EXPECT_THAT(terms, IsEmpty());
-
- suggestion_spec.set_prefix("[f]");
- ICING_ASSERT_OK_AND_ASSIGN(
- terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
- EXPECT_THAT(terms, IsEmpty());
-
- suggestion_spec.set_prefix("(f)");
- ICING_ASSERT_OK_AND_ASSIGN(
- terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
- EXPECT_THAT(terms, IsEmpty());
-}
-
-TEST_F(SuggestionProcessorTest, OtherSpecialPrefixTest) {
- ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2,
- TermMatchType::EXACT_ONLY, "foo"),
- IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SuggestionProcessor> suggestion_processor,
- SuggestionProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get()));
-
- SuggestionSpecProto suggestion_spec;
- suggestion_spec.set_prefix("f:");
- suggestion_spec.set_num_to_return(10);
-
- AlwaysTrueNamespaceCheckerImpl impl;
- ICING_ASSERT_OK_AND_ASSIGN(
- std::vector<TermMetadata> terms,
- suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
- EXPECT_THAT(terms, IsEmpty());
-
- suggestion_spec.set_prefix("f-");
- ICING_ASSERT_OK_AND_ASSIGN(
- terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
- EXPECT_THAT(terms, IsEmpty());
-}
-
-TEST_F(SuggestionProcessorTest, InvalidPrefixTest) {
- ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2,
- TermMatchType::EXACT_ONLY, "original"),
- IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SuggestionProcessor> suggestion_processor,
- SuggestionProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get()));
-
- SuggestionSpecProto suggestion_spec;
- suggestion_spec.set_prefix("OR OR - :");
- suggestion_spec.set_num_to_return(10);
-
- AlwaysTrueNamespaceCheckerImpl impl;
- ICING_ASSERT_OK_AND_ASSIGN(
- std::vector<TermMetadata> terms,
- suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
- EXPECT_THAT(terms, IsEmpty());
-}
-
-} // namespace
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc
index 0d812e4..1c9684d 100644
--- a/icing/result/result-retriever_test.cc
+++ b/icing/result/result-retriever_test.cc
@@ -22,6 +22,7 @@
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/mock-filesystem.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/equals-proto.h"
#include "icing/portable/platform.h"
#include "icing/proto/document.pb.h"
@@ -35,7 +36,6 @@
#include "icing/store/document-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
-#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/snippet-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
@@ -55,14 +55,14 @@ using ::testing::IsEmpty;
using ::testing::Return;
using ::testing::SizeIs;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto_Cardinality_Code_OPTIONAL;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
+ StringIndexingConfig_TokenizerType_Code_PLAIN;
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
+constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
+constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
class ResultRetrieverTest : public testing::Test {
protected:
@@ -160,7 +160,7 @@ ResultSpecProto::SnippetSpecProto CreateSnippetSpec() {
ResultSpecProto::SnippetSpecProto snippet_spec;
snippet_spec.set_num_to_snippet(std::numeric_limits<int>::max());
snippet_spec.set_num_matches_per_property(std::numeric_limits<int>::max());
- snippet_spec.set_max_window_utf32_length(1024);
+ snippet_spec.set_max_window_bytes(1024);
return snippet_spec;
}
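The field rename above (max_window_utf32_length back to max_window_bytes) recurs through the rest of this change; every snippet-spec call site moves with it. A minimal configuration sketch against the reverted field:

    ResultSpecProto::SnippetSpecProto snippet_spec;
    snippet_spec.set_num_to_snippet(3);            // snippet at most 3 documents
    snippet_spec.set_num_matches_per_property(2);  // report 2 matches per property
    snippet_spec.set_max_window_bytes(64);         // window size, in bytes again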
@@ -362,8 +362,8 @@ TEST_F(ResultRetrieverTest, NotIgnoreErrors) {
TEST_F(ResultRetrieverTest, IOErrorShouldReturnInternalError) {
MockFilesystem mock_filesystem;
- ON_CALL(mock_filesystem, PRead(A<int>(), A<void*>(), A<size_t>(), A<off_t>()))
- .WillByDefault(Return(false));
+ ON_CALL(mock_filesystem, OpenForRead(_)).WillByDefault(Return(false));
+
ICING_ASSERT_OK_AND_ASSIGN(
DocumentStore::CreateResult create_result,
DocumentStore::Create(&mock_filesystem, test_dir_, &fake_clock_,
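The failure injection above moves from PRead to OpenForRead. As a reminder of the gMock pattern involved: ON_CALL installs a default action without asserting that the method is called, and the `_` matcher accepts any argument, so every OpenForRead call on this mock fails unless a later EXPECT_CALL overrides it. A generic sketch:

    using ::testing::_;
    using ::testing::Return;

    MockFilesystem mock_filesystem;
    // Default action only; no call-count expectation is set.
    ON_CALL(mock_filesystem, OpenForRead(_)).WillByDefault(Return(false));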
diff --git a/icing/result/result-state-manager_test.cc b/icing/result/result-state-manager_test.cc
index 8a9005d..32e45aa 100644
--- a/icing/result/result-state-manager_test.cc
+++ b/icing/result/result-state-manager_test.cc
@@ -849,7 +849,7 @@ TEST_F(ResultStateManagerTest, ShouldGetSnippetContext) {
ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1);
result_spec.mutable_snippet_spec()->set_num_to_snippet(5);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
- result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5);
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(5);
SearchSpecProto search_spec;
search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
@@ -884,7 +884,7 @@ TEST_F(ResultStateManagerTest, ShouldGetDefaultSnippetContext) {
// 0 indicates no snippeting
result_spec.mutable_snippet_spec()->set_num_to_snippet(0);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(0);
- result_spec.mutable_snippet_spec()->set_max_window_utf32_length(0);
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(0);
SearchSpecProto search_spec;
search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
diff --git a/icing/result/result-state_test.cc b/icing/result/result-state_test.cc
index d92fcfa..f2121a5 100644
--- a/icing/result/result-state_test.cc
+++ b/icing/result/result-state_test.cc
@@ -143,7 +143,7 @@ TEST_F(ResultStateTest, ShouldReturnSnippetContextAccordingToSpecs) {
ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
result_spec.mutable_snippet_spec()->set_num_to_snippet(5);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
- result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5);
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(5);
SectionRestrictQueryTermsMap query_terms_map;
query_terms_map.emplace("term1", std::unordered_set<std::string>());
@@ -178,7 +178,7 @@ TEST_F(ResultStateTest, NoSnippetingShouldReturnNull) {
// stored.
result_spec.mutable_snippet_spec()->set_num_to_snippet(0);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
- result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5);
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(5);
SectionRestrictQueryTermsMap query_terms_map;
query_terms_map.emplace("term1", std::unordered_set<std::string>());
diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc
index bd1524e..2a138ec 100644
--- a/icing/result/snippet-retriever.cc
+++ b/icing/result/snippet-retriever.cc
@@ -41,7 +41,6 @@
#include "icing/transform/normalizer.h"
#include "icing/util/character-iterator.h"
#include "icing/util/i18n-utils.h"
-#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
namespace icing {
@@ -76,81 +75,10 @@ inline std::string AddIndexToPath(int values_size, int index,
kRBracket);
}
-// Returns a string of the normalized text of the input Token. Normalization
-// is applied based on the Token's type.
-std::string NormalizeToken(const Normalizer& normalizer, const Token& token) {
- switch (token.type) {
- case Token::Type::REGULAR:
- return normalizer.NormalizeTerm(token.text);
- case Token::Type::VERBATIM:
- return std::string(token.text);
- case Token::Type::QUERY_EXCLUSION:
- [[fallthrough]];
- case Token::Type::QUERY_LEFT_PARENTHESES:
- [[fallthrough]];
- case Token::Type::QUERY_RIGHT_PARENTHESES:
- [[fallthrough]];
- case Token::Type::QUERY_OR:
- [[fallthrough]];
- case Token::Type::QUERY_PROPERTY:
- [[fallthrough]];
- case Token::Type::INVALID:
- ICING_LOG(WARNING) << "Unable to normalize token of type: "
- << static_cast<int>(token.type);
- return std::string(token.text);
- }
-}
-
-// Returns a CharacterIterator for token's text, advancing one past the last
-// matching character from the query term.
-CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token,
- const std::string& match_query_term) {
- switch (token.type) {
- case Token::Type::VERBATIM: {
- // VERBATIM tokens are not normalized. This means the non-normalized
- // matched query term must be either equal to or a prefix of the token's
- // text. Therefore, the match must end at the end of the matched query
- // term.
- CharacterIterator verbatim_match_end =
- CharacterIterator(token.text, 0, 0, 0);
- verbatim_match_end.AdvanceToUtf8(match_query_term.length());
- return verbatim_match_end;
- }
- case Token::Type::QUERY_EXCLUSION:
- [[fallthrough]];
- case Token::Type::QUERY_LEFT_PARENTHESES:
- [[fallthrough]];
- case Token::Type::QUERY_RIGHT_PARENTHESES:
- [[fallthrough]];
- case Token::Type::QUERY_OR:
- [[fallthrough]];
- case Token::Type::QUERY_PROPERTY:
- [[fallthrough]];
- case Token::Type::INVALID:
- ICING_LOG(WARNING)
- << "Unexpected Token type " << static_cast<int>(token.type)
- << " found when finding match end of query term and token.";
- [[fallthrough]];
- case Token::Type::REGULAR:
- return normalizer.FindNormalizedMatchEndPosition(token.text,
- match_query_term);
- }
-}
-
class TokenMatcher {
public:
virtual ~TokenMatcher() = default;
-
- // Returns a CharacterIterator pointing just past the end of the substring in
- // token.text that matches a query term. Note that the utf* indices will be
- // in relation to token.text's start.
- //
- // If there is no match, then it will construct a CharacterIterator with all
- // of its indices set to -1.
- //
- // Ex. With an exact matcher, query terms=["foo","bar"] and token.text="bar",
- // Matches will return a CharacterIterator(u8:3, u16:3, u32:3).
- virtual CharacterIterator Matches(Token token) const = 0;
+ virtual bool Matches(Token token) const = 0;
};
class TokenMatcherExact : public TokenMatcher {
@@ -163,18 +91,10 @@ class TokenMatcherExact : public TokenMatcher {
restricted_query_terms_(restricted_query_terms),
normalizer_(normalizer) {}
- CharacterIterator Matches(Token token) const override {
- std::string s = NormalizeToken(normalizer_, token);
- auto itr = unrestricted_query_terms_.find(s);
- if (itr == unrestricted_query_terms_.end()) {
- itr = restricted_query_terms_.find(s);
- }
- if (itr != unrestricted_query_terms_.end() &&
- itr != restricted_query_terms_.end()) {
- return FindMatchEnd(normalizer_, token, *itr);
- }
-
- return CharacterIterator(token.text, -1, -1, -1);
+ bool Matches(Token token) const override {
+ std::string s = normalizer_.NormalizeTerm(token.text);
+ return (unrestricted_query_terms_.count(s) > 0) ||
+ (restricted_query_terms_.count(s) > 0);
}
private:
@@ -193,21 +113,22 @@ class TokenMatcherPrefix : public TokenMatcher {
restricted_query_terms_(restricted_query_terms),
normalizer_(normalizer) {}
- CharacterIterator Matches(Token token) const override {
- std::string s = NormalizeToken(normalizer_, token);
- for (const std::string& query_term : unrestricted_query_terms_) {
- if (query_term.length() <= s.length() &&
- s.compare(0, query_term.length(), query_term) == 0) {
- return FindMatchEnd(normalizer_, token, query_term);
- }
- }
- for (const std::string& query_term : restricted_query_terms_) {
- if (query_term.length() <= s.length() &&
- s.compare(0, query_term.length(), query_term) == 0) {
- return FindMatchEnd(normalizer_, token, query_term);
- }
+ bool Matches(Token token) const override {
+ std::string s = normalizer_.NormalizeTerm(token.text);
+ if (std::any_of(unrestricted_query_terms_.begin(),
+ unrestricted_query_terms_.end(),
+ [&s](const std::string& term) {
+ return term.length() <= s.length() &&
+ s.compare(0, term.length(), term) == 0;
+ })) {
+ return true;
}
- return CharacterIterator(token.text, -1, -1, -1);
+ return std::any_of(restricted_query_terms_.begin(),
+ restricted_query_terms_.end(),
+ [&s](const std::string& term) {
+ return term.length() <= s.length() &&
+ s.compare(0, term.length(), term) == 0;
+ });
}
private:
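The restored matchers reduce token matching to a boolean test over the normalized token text; the prefix variant is a bounded compare against each query term. A standalone sketch of that predicate:

    #include <algorithm>
    #include <string>
    #include <unordered_set>

    // True when any term in `terms` is a prefix of the normalized token `s`.
    bool IsPrefixMatch(const std::unordered_set<std::string>& terms,
                       const std::string& s) {
      return std::any_of(terms.begin(), terms.end(),
                         [&s](const std::string& term) {
                           return term.length() <= s.length() &&
                                  s.compare(0, term.length(), term) == 0;
                         });
    }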
@@ -245,7 +166,7 @@ libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowStart(
const ResultSpecProto::SnippetSpecProto& snippet_spec,
std::string_view value, int window_start_min_exclusive_utf32,
Tokenizer::Iterator* iterator) {
- if (!iterator->ResetToTokenStartingAfter(window_start_min_exclusive_utf32)) {
+ if (!iterator->ResetToTokenAfter(window_start_min_exclusive_utf32)) {
return absl_ports::InternalError(
"Couldn't reset tokenizer to determine snippet window!");
}
@@ -280,7 +201,7 @@ libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowEnd(
const ResultSpecProto::SnippetSpecProto& snippet_spec,
std::string_view value, int window_end_max_exclusive_utf32,
Tokenizer::Iterator* iterator) {
- if (!iterator->ResetToTokenEndingBefore(window_end_max_exclusive_utf32)) {
+ if (!iterator->ResetToTokenBefore(window_end_max_exclusive_utf32)) {
return absl_ports::InternalError(
"Couldn't reset tokenizer to determine snippet window!");
}
@@ -344,9 +265,9 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
int match_len_utf32 = end_itr.utf32_index() - match_pos_utf32;
int match_mid_utf32 = match_pos_utf32 + match_len_utf32 / 2;
int window_start_min_exclusive_utf32 =
- (match_mid_utf32 - snippet_spec.max_window_utf32_length() / 2) - 1;
+ (match_mid_utf32 - snippet_spec.max_window_bytes() / 2) - 1;
int window_end_max_exclusive_utf32 =
- match_mid_utf32 + (snippet_spec.max_window_utf32_length() + 1) / 2;
+ match_mid_utf32 + (snippet_spec.max_window_bytes() + 1) / 2;
snippet_match.set_exact_match_byte_position(start_itr.utf8_index());
snippet_match.set_exact_match_utf16_position(start_itr.utf16_index());
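To make the window arithmetic above concrete, a worked example:

    // Assume a match at utf32 positions [10, 14) (len 4, so mid = 12) and
    // max_window_bytes = 10:
    //   window_start_min_exclusive = (12 - 10 / 2) - 1 = 6
    //   window_end_max_exclusive   = 12 + (10 + 1) / 2 = 17
    // The exclusive bounds (6, 17) admit positions 7..16, i.e. exactly 10
    // characters centered on the match, so the window never exceeds the
    // configured maximum.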
@@ -357,7 +278,7 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
// Only include a window if it will at least contain the matched text.
// Otherwise, it would just be an empty string anyway.
- if (snippet_spec.max_window_utf32_length() >= match_len_utf32) {
+ if (snippet_spec.max_window_bytes() >= match_len_utf32) {
// Find the beginning of the window.
ICING_ASSIGN_OR_RETURN(
CharacterIterator window_start,
@@ -398,13 +319,8 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
// DetermineWindowStart/End may change the position of the iterator. So,
// reset the iterator back to the original position.
- bool success = false;
- if (match_pos_utf32 > 0) {
- success = iterator->ResetToTokenStartingAfter(match_pos_utf32 - 1);
- } else {
- success = iterator->ResetToStart();
- }
-
+ bool success = (match_pos_utf32 > 0)
+                    ? iterator->ResetToTokenAfter(match_pos_utf32 - 1)
+                    : iterator->ResetToStart();
if (!success) {
return absl_ports::InternalError(
"Couldn't reset tokenizer to determine snippet window!");
@@ -448,10 +364,7 @@ void GetEntriesFromProperty(const PropertyProto* current_property,
CharacterIterator char_iterator(value);
while (iterator->Advance()) {
Token token = iterator->GetToken();
- CharacterIterator submatch_end = matcher->Matches(token);
- // If the token matched a query term, then submatch_end will point to an
- // actual position within token.text.
- if (submatch_end.utf8_index() != -1) {
+ if (matcher->Matches(token)) {
if (!char_iterator.AdvanceToUtf8(token.text.data() - value.data())) {
// We can't get the char_iterator to a valid position, so there's no
// way for us to provide valid utf-16 indices. There's nothing more we
@@ -480,15 +393,7 @@ void GetEntriesFromProperty(const PropertyProto* current_property,
}
}
SnippetMatchProto match = std::move(match_or).ValueOrDie();
- // submatch_end refers to a position *within* token.text.
- // This, conveniently enough, means that index that submatch_end points
- // to is the length of the submatch (because the submatch starts at 0 in
- // token.text).
- match.set_submatch_byte_length(submatch_end.utf8_index());
- match.set_submatch_utf16_length(submatch_end.utf16_index());
- // Add the values for the submatch.
snippet_entry.mutable_snippet_matches()->Add(std::move(match));
-
if (--match_options->max_matches_remaining <= 0) {
*snippet_proto->add_entries() = std::move(snippet_entry);
return;
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index 0de2295..e7988ae 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -22,6 +22,7 @@
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/mock-filesystem.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/equals-proto.h"
#include "icing/portable/platform.h"
#include "icing/proto/document.pb.h"
@@ -36,14 +37,12 @@
#include "icing/store/key-mapper.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
-#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/snippet-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
-#include "icing/transform/map/map-normalizer.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
#include "unicode/uloc.h"
@@ -58,18 +57,16 @@ using ::testing::Eq;
using ::testing::IsEmpty;
using ::testing::SizeIs;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
- PropertyConfigProto::Cardinality::REPEATED;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto_Cardinality_Code_OPTIONAL;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED =
+ PropertyConfigProto_Cardinality_Code_REPEATED;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM =
- StringIndexingConfig::TokenizerType::VERBATIM;
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
+ StringIndexingConfig_TokenizerType_Code_PLAIN;
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
+constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
+constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
std::vector<std::string_view> GetPropertyPaths(const SnippetProto& snippet) {
std::vector<std::string_view> paths;
@@ -133,7 +130,7 @@ class SnippetRetrieverTest : public testing::Test {
snippet_spec_.set_num_to_snippet(std::numeric_limits<int32_t>::max());
snippet_spec_.set_num_matches_per_property(
std::numeric_limits<int32_t>::max());
- snippet_spec_.set_max_window_utf32_length(64);
+ snippet_spec_.set_max_window_bytes(64);
}
void TearDown() override {
@@ -180,7 +177,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeSmallerThanMatch) {
// Window starts at the beginning of "three" and ends in the middle of
// "three". len=4, orig_window= "thre"
- snippet_spec_.set_max_window_utf32_length(4);
+ snippet_spec_.set_max_window_bytes(4);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -206,7 +203,7 @@ TEST_F(SnippetRetrieverTest,
// Window starts at the beginning of "three" and at the exact end of
// "three". len=5, orig_window= "three"
- snippet_spec_.set_max_window_utf32_length(5);
+ snippet_spec_.set_max_window_bytes(5);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -232,7 +229,7 @@ TEST_F(SnippetRetrieverTest,
// Window starts at the beginning of "four" and at the exact end of
// "four". len=4, orig_window= "four"
- snippet_spec_.set_max_window_utf32_length(4);
+ snippet_spec_.set_max_window_bytes(4);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -264,7 +261,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) {
// 1. untrimmed, no-shifting window will be (2,17).
// 2. trimmed, no-shifting window [4,13) "two three"
// 3. trimmed, shifted window [4,18) "two three four"
- snippet_spec_.set_max_window_utf32_length(14);
+ snippet_spec_.set_max_window_bytes(14);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -297,7 +294,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) {
// 1. untrimmed, no-shifting window will be (1,18).
// 2. trimmed, no-shifting window [4,18) "two three four"
// 3. trimmed, shifted window [4,20) "two three four.."
- snippet_spec_.set_max_window_utf32_length(16);
+ snippet_spec_.set_max_window_bytes(16);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -323,7 +320,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) {
// Window ends in the middle of all the punctuation and window starts at 0.
// len=20, orig_window="one two three four.."
- snippet_spec_.set_max_window_utf32_length(20);
+ snippet_spec_.set_max_window_bytes(20);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -351,7 +348,7 @@ TEST_F(SnippetRetrieverTest,
// Window ends in the middle of all the punctuation and window starts at 0.
// len=26, orig_window="pside down in Australia¿"
- snippet_spec_.set_max_window_utf32_length(24);
+ snippet_spec_.set_max_window_bytes(24);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -379,7 +376,7 @@ TEST_F(SnippetRetrieverTest,
// Window ends in the middle of all the punctuation and window starts at 0.
// len=26, orig_window="upside down in Australia¿ "
- snippet_spec_.set_max_window_utf32_length(26);
+ snippet_spec_.set_max_window_bytes(26);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -412,7 +409,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) {
// 1. untrimmed, no-shifting window will be (-2,21).
// 2. trimmed, no-shifting window [0,21) "one two three four..."
// 3. trimmed, shifted window [0,22) "one two three four...."
- snippet_spec_.set_max_window_utf32_length(22);
+ snippet_spec_.set_max_window_bytes(22);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -438,7 +435,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) {
// Window ends before "five" but after all the punctuation
// len=26, orig_window="one two three four.... "
- snippet_spec_.set_max_window_utf32_length(26);
+ snippet_spec_.set_max_window_bytes(26);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -471,7 +468,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) {
// 1. untrimmed, no-shifting window will be (-7,26).
// 2. trimmed, no-shifting window [0,26) "one two three four...."
// 3. trimmed, shifted window [0,27) "one two three four.... five"
- snippet_spec_.set_max_window_utf32_length(32);
+ snippet_spec_.set_max_window_bytes(32);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -497,7 +494,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) {
// Max window size equals the size of the value.
// len=34, orig_window="one two three four.... five"
- snippet_spec_.set_max_window_utf32_length(34);
+ snippet_spec_.set_max_window_bytes(34);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -523,7 +520,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) {
// Max window size exceeds the size of the value.
// len=36, orig_window="one two three four.... five"
- snippet_spec_.set_max_window_utf32_length(36);
+ snippet_spec_.set_max_window_bytes(36);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -557,7 +554,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStart) {
// 1. untrimmed, no-shifting window will be (-10,19).
// 2. trimmed, no-shifting window [0,19) "one two three four."
// 3. trimmed, shifted window [0,27) "one two three four.... five"
- snippet_spec_.set_max_window_utf32_length(28);
+ snippet_spec_.set_max_window_bytes(28);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -591,7 +588,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEnd) {
// 1. untrimmed, no-shifting window will be (10,39).
// 2. trimmed, no-shifting window [14,31) "four.... five six"
// 3. trimmed, shifted window [4,31) "two three four.... five six"
- snippet_spec_.set_max_window_utf32_length(28);
+ snippet_spec_.set_max_window_bytes(28);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -625,7 +622,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStartShortText) {
// 1. untrimmed, no-shifting window will be (-10,19).
// 2. trimmed, no-shifting window [0, 19) "one two three four."
// 3. trimmed, shifted window [0, 22) "one two three four...."
- snippet_spec_.set_max_window_utf32_length(28);
+ snippet_spec_.set_max_window_bytes(28);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -659,7 +656,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEndShortText) {
// 1. untrimmed, no-shifting window will be (1,30).
// 2. trimmed, no-shifting window [4, 22) "two three four...."
// 3. trimmed, shifted window [0, 22) "one two three four...."
- snippet_spec_.set_max_window_utf32_length(28);
+ snippet_spec_.set_max_window_bytes(28);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -693,7 +690,6 @@ TEST_F(SnippetRetrieverTest, PrefixSnippeting) {
EXPECT_THAT(GetWindows(content, snippet.entries(0)),
ElementsAre("subject foo"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("f"));
}
TEST_F(SnippetRetrieverTest, ExactSnippeting) {
@@ -723,7 +719,7 @@ TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) {
.AddStringProperty("body", "Only a fool would match this content.")
.Build();
- snippet_spec_.set_max_window_utf32_length(0);
+ snippet_spec_.set_max_window_bytes(0);
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"foo"}}};
@@ -737,7 +733,6 @@ TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) {
GetString(&document, snippet.entries(0).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre(""));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo"));
}
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) {
@@ -784,15 +779,12 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) {
"we need to begin considering our options regarding body bar."));
EXPECT_THAT(GetMatches(content, snippet.entries(0)),
ElementsAre("foo", "bar"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
- ElementsAre("foo", "bar"));
EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
content = GetString(&document, snippet.entries(1).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(1)),
ElementsAre("subject foo"));
EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo"));
}
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) {
@@ -842,8 +834,6 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) {
"we need to begin considering our options regarding body bar."));
EXPECT_THAT(GetMatches(content, snippet.entries(0)),
ElementsAre("foo", "bar"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
- ElementsAre("foo", "bar"));
}
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) {
@@ -894,16 +884,12 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) {
"Concerning the subject of foo, we need to begin considering our"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)),
ElementsAre("subject", "foo"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
- ElementsAre("subject", "foo"));
EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
content = GetString(&document, snippet.entries(1).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(1)),
ElementsAre("subject foo"));
EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("subject"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(1)),
- ElementsAre("subject"));
}
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) {
@@ -947,14 +933,12 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) {
ElementsAre(
"Concerning the subject of foo, we need to begin considering our"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo"));
EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
content = GetString(&document, snippet.entries(1).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(1)),
ElementsAre("subject foo"));
EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo"));
}
TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) {
@@ -976,7 +960,6 @@ TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) {
GetString(&document, snippet.entries(0).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("MDI team"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("MDI"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("MD"));
}
TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) {
@@ -1000,9 +983,6 @@ TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) {
EXPECT_THAT(GetWindows(content, snippet.entries(0)),
ElementsAre("Some members are in Zürich."));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("Zürich"));
-
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
- ElementsAre("Zürich"));
}
TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) {
@@ -1063,13 +1043,11 @@ TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) {
GetString(&document, snippet.entries(0).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
EXPECT_THAT(snippet.entries(1).property_name(), Eq("X[3]"));
content = GetString(&document, snippet.entries(1).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
EXPECT_THAT(GetPropertyPaths(snippet),
ElementsAre("X[1]", "X[3]", "Y[1]", "Y[3]", "Z[1]", "Z[3]"));
@@ -1166,13 +1144,11 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevel) {
GetString(&document, snippet.entries(0).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
EXPECT_THAT(snippet.entries(1).property_name(), Eq("A.X[3]"));
content = GetString(&document, snippet.entries(1).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
EXPECT_THAT(
GetPropertyPaths(snippet),
@@ -1275,13 +1251,11 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelRepeated) {
GetString(&document, snippet.entries(0).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[0].X[3]"));
content = GetString(&document, snippet.entries(1).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
EXPECT_THAT(GetPropertyPaths(snippet),
ElementsAre("A[0].X[1]", "A[0].X[3]", "A[1].X[1]", "A[1].X[3]",
@@ -1382,13 +1356,11 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelSingleValue) {
GetString(&document, snippet.entries(0).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[1].X"));
content = GetString(&document, snippet.entries(1).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
- EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
EXPECT_THAT(
GetPropertyPaths(snippet),
@@ -1432,12 +1404,10 @@ TEST_F(SnippetRetrieverTest, CJKSnippetMatchTest) {
// Ensure that the match is correct.
EXPECT_THAT(GetMatches(content, *entry), ElementsAre("走路"));
- EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("走"));
// Ensure that the utf-16 values are also as expected
EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3));
EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
- EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(1));
}
TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) {
@@ -1475,7 +1445,7 @@ TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) {
// 1. untrimmed, no-shifting window will be (0,7).
// 2. trimmed, no-shifting window [1, 6) "每天走路去".
// 3. trimmed, shifted window [0, 6) "我每天走路去"
- snippet_spec_.set_max_window_utf32_length(6);
+ snippet_spec_.set_max_window_bytes(6);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
@@ -1537,12 +1507,10 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) {
// Ensure that the match is correct.
EXPECT_THAT(GetMatches(content, *entry), ElementsAre("𐀂𐀃"));
- EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("𐀂"));
// Ensure that the utf-16 values are also as expected
EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(5));
EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(4));
- EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2));
}
TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
@@ -1574,7 +1542,7 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
// UTF8 idx: 9 22
// UTF16 idx: 5 12
// UTF32 idx: 3 7
- snippet_spec_.set_max_window_utf32_length(6);
+ snippet_spec_.set_max_window_bytes(6);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
@@ -1598,117 +1566,6 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
EXPECT_THAT(match_proto.window_utf16_length(), Eq(7));
}
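The UTF-16 expectations in the two tests above follow from surrogate-pair encoding: a supplementary-plane code point such as "𐀂" costs two UTF-16 code units, while a BMP character such as "走" costs one. A minimal sketch of that accounting (a hypothetical helper, not icing code):

    #include <string>

    int Utf16Length(const std::u32string& code_points) {
      int length = 0;
      for (char32_t cp : code_points) {
        length += (cp > 0xFFFF) ? 2 : 1;  // surrogate pair vs. single unit
      }
      return length;
    }
    // Utf16Length(U"走路") == 2 and Utf16Length(U"𐀂𐀃") == 4, matching the
    // exact_match_utf16_length values asserted above.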
-TEST_F(SnippetRetrieverTest, SnippettingVerbatimAscii) {
- SchemaProto schema =
- SchemaBuilder()
- .AddType(SchemaTypeConfigBuilder()
- .SetType("verbatimType")
- .AddProperty(PropertyConfigBuilder()
- .SetName("verbatim")
- .SetDataTypeString(MATCH_EXACT,
- TOKENIZER_VERBATIM)
- .SetCardinality(CARDINALITY_REPEATED)))
- .Build();
- ICING_ASSERT_OK(schema_store_->SetSchema(
- schema, /*ignore_errors_and_delete_documents=*/true));
- ICING_ASSERT_OK_AND_ASSIGN(
- snippet_retriever_,
- SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
- normalizer_.get()));
-
- DocumentProto document = DocumentBuilder()
- .SetKey("icing", "verbatim/1")
- .SetSchema("verbatimType")
- .AddStringProperty("verbatim", "Hello, world!")
- .Build();
-
- SectionIdMask section_mask = 0b00000001;
- SectionRestrictQueryTermsMap query_terms{{"", {"Hello, world!"}}};
-
- snippet_spec_.set_max_window_utf32_length(13);
- SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
-
-  // There should be only one snippet entry and match: the verbatim token in
-  // its entirety.
- ASSERT_THAT(snippet.entries(), SizeIs(1));
-
- const SnippetProto::EntryProto* entry = &snippet.entries(0);
- ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
- ASSERT_THAT(entry->property_name(), "verbatim");
-
- const SnippetMatchProto& match_proto = entry->snippet_matches(0);
- // We expect the match to begin at position 0, and to span the entire token
- // which contains 13 characters.
- EXPECT_THAT(match_proto.window_byte_position(), Eq(0));
- EXPECT_THAT(match_proto.window_utf16_length(), Eq(13));
-
- // We expect the submatch to begin at position 0 of the verbatim token and
- // span the length of our query term "Hello, world!", which has utf-16 length
-  // of 13. The submatch length is equal to the window length because the
-  // snippet is retrieved with an exact term match.
- EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0));
- EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(13));
-}
-
-TEST_F(SnippetRetrieverTest, SnippettingVerbatimCJK) {
- SchemaProto schema =
- SchemaBuilder()
- .AddType(SchemaTypeConfigBuilder()
- .SetType("verbatimType")
- .AddProperty(PropertyConfigBuilder()
- .SetName("verbatim")
- .SetDataTypeString(MATCH_PREFIX,
- TOKENIZER_VERBATIM)
- .SetCardinality(CARDINALITY_REPEATED)))
- .Build();
- ICING_ASSERT_OK(schema_store_->SetSchema(
- schema, /*ignore_errors_and_delete_documents=*/true));
- ICING_ASSERT_OK_AND_ASSIGN(
- snippet_retriever_,
- SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
- normalizer_.get()));
-
- // String: "我每天走路去上班。"
- // ^ ^ ^ ^^
- // UTF8 idx: 0 3 9 15 18
- // UTF16 idx: 0 1 3 5 6
- // UTF32 idx: 0 1 3 5 6
- // Breaks into segments: "我", "每天", "走路", "去", "上班"
- std::string chinese_string = "我每天走路去上班。";
- DocumentProto document = DocumentBuilder()
- .SetKey("icing", "verbatim/1")
- .SetSchema("verbatimType")
- .AddStringProperty("verbatim", chinese_string)
- .Build();
-
- SectionIdMask section_mask = 0b00000001;
- SectionRestrictQueryTermsMap query_terms{{"", {"我每"}}};
-
- snippet_spec_.set_max_window_utf32_length(9);
- SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
-
-  // There should be only one snippet entry and match: the verbatim token in
-  // its entirety.
- ASSERT_THAT(snippet.entries(), SizeIs(1));
-
- const SnippetProto::EntryProto* entry = &snippet.entries(0);
- ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
- ASSERT_THAT(entry->property_name(), "verbatim");
-
- const SnippetMatchProto& match_proto = entry->snippet_matches(0);
- // We expect the match to begin at position 0, and to span the entire token
- // which has utf-16 length of 9.
- EXPECT_THAT(match_proto.window_byte_position(), Eq(0));
- EXPECT_THAT(match_proto.window_utf16_length(), Eq(9));
-
- // We expect the submatch to begin at position 0 of the verbatim token and
- // span the length of our query term "我每", which has utf-16 length of 2.
- EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0));
- EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2));
-}
-
} // namespace
} // namespace lib
diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc
index fc50ea6..e9ba654 100644
--- a/icing/schema/schema-store.cc
+++ b/icing/schema/schema-store.cc
@@ -108,60 +108,27 @@ libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
ICING_RETURN_ERROR_IF_NULL(filesystem);
ICING_RETURN_ERROR_IF_NULL(clock);
- if (!filesystem->DirectoryExists(base_dir.c_str())) {
- return absl_ports::FailedPreconditionError(
- "Schema store base directory does not exist!");
- }
std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
new SchemaStore(filesystem, base_dir, clock));
ICING_RETURN_IF_ERROR(schema_store->Initialize(initialize_stats));
return schema_store;
}
-libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
- const Filesystem* filesystem, const std::string& base_dir,
- const Clock* clock, SchemaProto schema) {
- ICING_RETURN_ERROR_IF_NULL(filesystem);
- ICING_RETURN_ERROR_IF_NULL(clock);
-
- if (!filesystem->DirectoryExists(base_dir.c_str())) {
- return absl_ports::FailedPreconditionError(
- "Schema store base directory does not exist!");
- }
- std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
- new SchemaStore(filesystem, base_dir, clock));
- ICING_RETURN_IF_ERROR(schema_store->Initialize(std::move(schema)));
- return schema_store;
-}
-
SchemaStore::SchemaStore(const Filesystem* filesystem, std::string base_dir,
const Clock* clock)
- : filesystem_(filesystem),
+ : filesystem_(*filesystem),
base_dir_(std::move(base_dir)),
- clock_(clock),
- schema_file_(std::make_unique<FileBackedProto<SchemaProto>>(
- *filesystem, MakeSchemaFilename(base_dir_))) {}
+ clock_(*clock),
+ schema_file_(*filesystem, MakeSchemaFilename(base_dir_)) {}
SchemaStore::~SchemaStore() {
- if (has_schema_successfully_set_ && schema_file_ != nullptr &&
- schema_type_mapper_ != nullptr && section_manager_ != nullptr) {
+ if (has_schema_successfully_set_) {
if (!PersistToDisk().ok()) {
ICING_LOG(ERROR) << "Error persisting to disk in SchemaStore destructor";
}
}
}
-libtextclassifier3::Status SchemaStore::Initialize(SchemaProto new_schema) {
- if (!absl_ports::IsNotFound(GetSchema().status())) {
- return absl_ports::FailedPreconditionError(
- "Incorrectly tried to initialize schema store with a new schema, when "
- "one is already set!");
- }
- ICING_RETURN_IF_ERROR(schema_file_->Write(
- std::make_unique<SchemaProto>(std::move(new_schema))));
- return InitializeInternal(/*initialize_stats=*/nullptr);
-}
-
libtextclassifier3::Status SchemaStore::Initialize(
InitializeStatsProto* initialize_stats) {
auto schema_proto_or = GetSchema();
@@ -172,16 +139,13 @@ libtextclassifier3::Status SchemaStore::Initialize(
// Real error when trying to read the existing schema
return schema_proto_or.status();
}
- return InitializeInternal(initialize_stats);
-}
+ has_schema_successfully_set_ = true;
-libtextclassifier3::Status SchemaStore::InitializeInternal(
- InitializeStatsProto* initialize_stats) {
if (!InitializeDerivedFiles().ok()) {
ICING_VLOG(3)
<< "Couldn't find derived files or failed to initialize them, "
"regenerating derived files for SchemaStore.";
- std::unique_ptr<Timer> regenerate_timer = clock_->GetNewTimer();
+ std::unique_ptr<Timer> regenerate_timer = clock_.GetNewTimer();
if (initialize_stats != nullptr) {
initialize_stats->set_schema_store_recovery_cause(
InitializeStatsProto::IO_ERROR);
@@ -197,7 +161,6 @@ libtextclassifier3::Status SchemaStore::InitializeInternal(
initialize_stats->set_num_schema_types(type_config_map_.size());
}
- has_schema_successfully_set_ = true;
return libtextclassifier3::Status::OK;
}
@@ -209,8 +172,8 @@ libtextclassifier3::Status SchemaStore::InitializeDerivedFiles() {
}
SchemaStore::Header header;
- if (!filesystem_->Read(MakeHeaderFilename(base_dir_).c_str(), &header,
- sizeof(header))) {
+ if (!filesystem_.Read(MakeHeaderFilename(base_dir_).c_str(), &header,
+ sizeof(header))) {
return absl_ports::InternalError(
absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_)));
}
@@ -222,7 +185,7 @@ libtextclassifier3::Status SchemaStore::InitializeDerivedFiles() {
ICING_ASSIGN_OR_RETURN(
schema_type_mapper_,
- KeyMapper<SchemaTypeId>::Create(*filesystem_,
+ KeyMapper<SchemaTypeId>::Create(filesystem_,
MakeSchemaTypeMapperFilename(base_dir_),
kSchemaTypeMapperMaxSize));
@@ -273,12 +236,12 @@ libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles() {
}
bool SchemaStore::HeaderExists() {
- if (!filesystem_->FileExists(MakeHeaderFilename(base_dir_).c_str())) {
+ if (!filesystem_.FileExists(MakeHeaderFilename(base_dir_).c_str())) {
return false;
}
int64_t file_size =
- filesystem_->GetFileSize(MakeHeaderFilename(base_dir_).c_str());
+ filesystem_.GetFileSize(MakeHeaderFilename(base_dir_).c_str());
// If it's been truncated to size 0 before, we consider it to be a new file
return file_size != 0 && file_size != Filesystem::kBadFileSize;
@@ -291,11 +254,11 @@ libtextclassifier3::Status SchemaStore::UpdateHeader(const Crc32& checksum) {
header.checksum = checksum.Get();
ScopedFd scoped_fd(
- filesystem_->OpenForWrite(MakeHeaderFilename(base_dir_).c_str()));
+ filesystem_.OpenForWrite(MakeHeaderFilename(base_dir_).c_str()));
// This should overwrite the header.
if (!scoped_fd.is_valid() ||
- !filesystem_->Write(scoped_fd.get(), &header, sizeof(header)) ||
- !filesystem_->DataSync(scoped_fd.get())) {
+ !filesystem_.Write(scoped_fd.get(), &header, sizeof(header)) ||
+ !filesystem_.DataSync(scoped_fd.get())) {
return absl_ports::InternalError(absl_ports::StrCat(
"Failed to write SchemaStore header: ", MakeHeaderFilename(base_dir_)));
}
@@ -305,10 +268,10 @@ libtextclassifier3::Status SchemaStore::UpdateHeader(const Crc32& checksum) {
libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
schema_type_mapper_.reset();
- // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = KeyMapper<SchemaTypeId>::Delete(
- *filesystem_, MakeSchemaTypeMapperFilename(base_dir_));
+ filesystem_, MakeSchemaTypeMapperFilename(base_dir_));
if (!status.ok()) {
ICING_LOG(ERROR) << status.error_message()
<< "Failed to delete old schema_type mapper";
@@ -316,7 +279,7 @@ libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
}
ICING_ASSIGN_OR_RETURN(
schema_type_mapper_,
- KeyMapper<SchemaTypeId>::Create(*filesystem_,
+ KeyMapper<SchemaTypeId>::Create(filesystem_,
MakeSchemaTypeMapperFilename(base_dir_),
kSchemaTypeMapperMaxSize));
@@ -324,17 +287,17 @@ libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
}
libtextclassifier3::StatusOr<Crc32> SchemaStore::ComputeChecksum() const {
- auto schema_proto_or = GetSchema();
- if (absl_ports::IsNotFound(schema_proto_or.status())) {
- return Crc32();
+ Crc32 total_checksum;
+ if (!has_schema_successfully_set_) {
+ // Nothing to checksum
+ return total_checksum;
}
- ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, schema_proto_or);
+ ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
Crc32 schema_checksum;
schema_checksum.Append(schema_proto->SerializeAsString());
Crc32 schema_type_mapper_checksum = schema_type_mapper_->ComputeChecksum();
- Crc32 total_checksum;
total_checksum.Append(std::to_string(schema_checksum.Get()));
total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get()));
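The early return above relies on a default-constructed Crc32 standing for the checksum of no data; once a schema is set, per-component checksums are chained into the total. That chaining convention, restated as a self-contained sketch (hypothetical helper name; Append() and Get() are the Crc32 calls used in this hunk):

    #include <string>  // std::to_string

    Crc32 ChainChecksums(const Crc32& schema_checksum,
                         const Crc32& mapper_checksum) {
      // Feed each component checksum into the total, so the total changes
      // whenever any component changes.
      Crc32 total;
      total.Append(std::to_string(schema_checksum.Get()));
      total.Append(std::to_string(mapper_checksum.Get()));
      return total;
    }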
@@ -343,7 +306,7 @@ libtextclassifier3::StatusOr<Crc32> SchemaStore::ComputeChecksum() const {
libtextclassifier3::StatusOr<const SchemaProto*> SchemaStore::GetSchema()
const {
- return schema_file_->Read();
+ return schema_file_.Read();
}
// TODO(cassiewang): Consider removing this definition of SetSchema if it's not
@@ -368,9 +331,6 @@ SchemaStore::SetSchema(SchemaProto&& new_schema,
if (absl_ports::IsNotFound(schema_proto_or.status())) {
// We don't have a pre-existing schema, so anything is valid.
result.success = true;
- for (const SchemaTypeConfigProto& type_config : new_schema.types()) {
- result.schema_types_new_by_name.insert(type_config.schema_type());
- }
} else if (!schema_proto_or.ok()) {
// Real error
return schema_proto_or.status();
@@ -391,11 +351,8 @@ SchemaStore::SetSchema(SchemaProto&& new_schema,
SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
new_dependency_map);
- result.schema_types_new_by_name = std::move(schema_delta.schema_types_new);
- result.schema_types_changed_fully_compatible_by_name =
- std::move(schema_delta.schema_types_changed_fully_compatible);
- result.schema_types_index_incompatible_by_name =
- std::move(schema_delta.schema_types_index_incompatible);
+  // An incompatible index is fine; we can just reindex.
+ result.index_incompatible = schema_delta.index_incompatible;
for (const auto& schema_type : schema_delta.schema_types_deleted) {
// We currently don't support deletions, so mark this as not possible.
@@ -430,78 +387,15 @@ SchemaStore::SetSchema(SchemaProto&& new_schema,
result.success = result.success || ignore_errors_and_delete_documents;
if (result.success) {
- ICING_RETURN_IF_ERROR(ApplySchemaChange(std::move(new_schema)));
+ // Write the schema (and potentially overwrite a previous schema)
+ ICING_RETURN_IF_ERROR(
+ schema_file_.Write(std::make_unique<SchemaProto>(new_schema)));
has_schema_successfully_set_ = true;
- }
- return result;
-}
-
-libtextclassifier3::Status SchemaStore::ApplySchemaChange(
- SchemaProto new_schema) {
- // We need to ensure that we either 1) successfully set the schema and
- // update all derived data structures or 2) fail and leave the schema store
- // unchanged.
- // So, first, we create an empty temporary directory to build a new schema
- // store in.
- std::string temp_schema_store_dir_path = base_dir_ + "_temp";
- if (!filesystem_->DeleteDirectoryRecursively(
- temp_schema_store_dir_path.c_str())) {
- ICING_LOG(WARNING) << "Failed to recursively delete "
- << temp_schema_store_dir_path.c_str();
- return absl_ports::InternalError(
- "Unable to delete temp directory to prepare to build new schema "
- "store.");
- }
-
- if (!filesystem_->CreateDirectoryRecursively(
- temp_schema_store_dir_path.c_str())) {
- return absl_ports::InternalError(
- "Unable to create temp directory to build new schema store.");
- }
-
- // Then we create our new schema store with the new schema.
- auto new_schema_store_or =
- SchemaStore::Create(filesystem_, temp_schema_store_dir_path, clock_,
- std::move(new_schema));
- if (!new_schema_store_or.ok()) {
- // Attempt to clean up the temp directory.
- if (!filesystem_->DeleteDirectoryRecursively(
- temp_schema_store_dir_path.c_str())) {
- // Nothing to do here. Just log an error.
- ICING_LOG(WARNING) << "Failed to recursively delete "
- << temp_schema_store_dir_path.c_str();
- }
- return new_schema_store_or.status();
- }
- std::unique_ptr<SchemaStore> new_schema_store =
- std::move(new_schema_store_or).ValueOrDie();
-
- // Then we swap the new schema file + new derived files with the old files.
- if (!filesystem_->SwapFiles(base_dir_.c_str(),
- temp_schema_store_dir_path.c_str())) {
- // Attempt to clean up the temp directory.
- if (!filesystem_->DeleteDirectoryRecursively(
- temp_schema_store_dir_path.c_str())) {
- // Nothing to do here. Just log an error.
- ICING_LOG(WARNING) << "Failed to recursively delete "
- << temp_schema_store_dir_path.c_str();
- }
- return absl_ports::InternalError(
- "Unable to apply new schema due to failed swap!");
+ ICING_RETURN_IF_ERROR(RegenerateDerivedFiles());
}
- std::string old_base_dir = std::move(base_dir_);
- *this = std::move(*new_schema_store);
-
- // After the std::move, the filepaths saved in this instance and in the
-  // schema_file_ instance will still be the ones from temp_schema_store_dir
- // even though they now point to files that are within old_base_dir.
- // Manually set them to the correct paths.
- base_dir_ = std::move(old_base_dir);
- schema_file_->SetSwappedFilepath(MakeSchemaFilename(base_dir_));
-
- return libtextclassifier3::Status::OK;
+ return result;
}
libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
@@ -563,9 +457,12 @@ libtextclassifier3::Status SchemaStore::PersistToDisk() {
SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const {
SchemaStoreStorageInfoProto storage_info;
- int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
- storage_info.set_schema_store_size(
- Filesystem::SanitizeFileSize(directory_size));
+ int64_t directory_size = filesystem_.GetDiskUsage(base_dir_.c_str());
+ if (directory_size != Filesystem::kBadFileSize) {
+ storage_info.set_schema_store_size(directory_size);
+ } else {
+ storage_info.set_schema_store_size(-1);
+ }
ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info);
storage_info.set_num_schema_types(schema->types_size());
int total_sections = 0;
@@ -588,22 +485,5 @@ SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const {
return storage_info;
}
-libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
-SchemaStore::GetSectionMetadata(const std::string& schema_type) const {
- return section_manager_->GetMetadataList(schema_type);
-}
-
-libtextclassifier3::StatusOr<SchemaDebugInfoProto> SchemaStore::GetDebugInfo()
- const {
- SchemaDebugInfoProto debug_info;
- if (has_schema_successfully_set_) {
- ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema());
- *debug_info.mutable_schema() = *schema;
- }
- ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
- debug_info.set_crc(crc.Get());
- return debug_info;
-}
-
} // namespace lib
} // namespace icing
diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h
index 58e5477..dd1edb8 100644
--- a/icing/schema/schema-store.h
+++ b/icing/schema/schema-store.h
@@ -26,7 +26,6 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/file/file-backed-proto.h"
#include "icing/file/filesystem.h"
-#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/logging.pb.h"
#include "icing/proto/schema.pb.h"
@@ -69,6 +68,9 @@ class SchemaStore {
// to file.
bool success = false;
+ // Whether the new schema changes invalidate the index.
+ bool index_incompatible = false;
+
// SchemaTypeIds of schema types can be reassigned new SchemaTypeIds if:
// 1. Schema types are added in the middle of the SchemaProto
// 2. Schema types are removed from the middle of the SchemaProto
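With the per-type name sets gone, index_incompatible becomes the single reindex signal for callers. A hedged caller-side sketch (the function and helper below are illustrative, not part of this change):

    void OnSchemaSet(const SchemaStore::SetSchemaResult& result) {
      if (!result.success) {
        return;  // Incompatible change was rejected; the old schema remains.
      }
      if (result.index_incompatible) {
        // The schema was applied, but existing index data no longer matches
        // the indexing config, so documents must be reindexed before queries.
        ReindexAllDocuments();  // hypothetical helper
      }
    }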
@@ -98,21 +100,6 @@ class SchemaStore {
// SchemaUtil::ComputeCompatibilityDelta. Represented by the SchemaTypeId
// assigned to this SchemaTypeConfigProto in the *old* schema.
std::unordered_set<SchemaTypeId> schema_types_incompatible_by_id;
-
- // Schema types that were added in the new schema. Represented by the
- // `schema_type` field in the SchemaTypeConfigProto.
- std::unordered_set<std::string> schema_types_new_by_name;
-
- // Schema types that were changed in a way that was backwards compatible and
- // didn't invalidate the index. Represented by the `schema_type` field in
- // the SchemaTypeConfigProto.
- std::unordered_set<std::string>
- schema_types_changed_fully_compatible_by_name;
-
- // Schema types that were changed in a way that was backwards compatible,
- // but invalidated the index. Represented by the `schema_type` field in the
- // SchemaTypeConfigProto.
- std::unordered_set<std::string> schema_types_index_incompatible_by_name;
};
// Factory function to create a SchemaStore which does not take ownership
@@ -130,17 +117,17 @@ class SchemaStore {
static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create(
const Filesystem* filesystem, const std::string& base_dir,
const Clock* clock, InitializeStatsProto* initialize_stats = nullptr);
-
- SchemaStore(SchemaStore&&) = default;
- SchemaStore& operator=(SchemaStore&&) = default;
+ // Not copyable
SchemaStore(const SchemaStore&) = delete;
SchemaStore& operator=(const SchemaStore&) = delete;
// Persists and updates checksum of subcomponents.
~SchemaStore();
- // Retrieve the current schema if it exists.
+ // Retrieve the current schema if it exists. Caller does not get ownership of
+ // the schema proto and modifying the returned pointer does not affect the
+ // underlying schema proto.
//
// Returns:
// SchemaProto* if exists
@@ -247,70 +234,23 @@ class SchemaStore {
// INTERNAL_ERROR on compute error
libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const;
- // Returns:
- // - On success, the section metadata list for the specified schema type
- // - NOT_FOUND if the schema type is not present in the schema
- libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
- GetSectionMetadata(const std::string& schema_type) const;
-
// Calculates the StorageInfo for the Schema Store.
//
// If an IO error occurs while trying to calculate the value for a field, then
// that field will be set to -1.
SchemaStoreStorageInfoProto GetStorageInfo() const;
- // Get debug information for the schema store.
- //
- // Returns:
- // SchemaDebugInfoProto on success
- // INTERNAL_ERROR on IO errors, crc compute error
- libtextclassifier3::StatusOr<SchemaDebugInfoProto> GetDebugInfo() const;
-
private:
- // Factory function to create a SchemaStore and set its schema. The created
- // instance does not take ownership of any input components and all pointers
- // must refer to valid objects that outlive the created SchemaStore instance.
-  // The base_dir must already exist. No schema must have been set in base_dir
-  // prior to this.
- //
- // Returns:
- // A SchemaStore on success
- // FAILED_PRECONDITION on any null pointer input or if there has already
- // been a schema set for this path.
- // INTERNAL_ERROR on any IO errors
- static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create(
- const Filesystem* filesystem, const std::string& base_dir,
- const Clock* clock, SchemaProto schema);
-
-
// Use SchemaStore::Create instead.
explicit SchemaStore(const Filesystem* filesystem, std::string base_dir,
const Clock* clock);
- // Verifies that there is no error retrieving a previously set schema. Then
- // initializes like normal.
- //
- // Returns:
- // OK on success
- // INTERNAL_ERROR on IO error
- libtextclassifier3::Status Initialize(InitializeStatsProto* initialize_stats);
-
- // First, blindly writes new_schema to the schema_file. Then initializes like
- // normal.
- //
- // Returns:
- // OK on success
- // INTERNAL_ERROR on IO error
- // FAILED_PRECONDITION if there is already a schema set for the schema_file.
- libtextclassifier3::Status Initialize(SchemaProto new_schema);
-
// Handles initializing the SchemaStore and regenerating any data if needed.
//
// Returns:
// OK on success
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status InitializeInternal(
- InitializeStatsProto* initialize_stats);
+ libtextclassifier3::Status Initialize(InitializeStatsProto* initialize_stats);
// Creates sub-components and verifies the integrity of each sub-component.
//
@@ -346,25 +286,15 @@ class SchemaStore {
// Returns any IO errors.
libtextclassifier3::Status ResetSchemaTypeMapper();
- // Creates a new schema store with new_schema and then swaps that new schema
- // store with the existing one. This function guarantees that either: this
- // instance will be fully updated to the new schema or no changes will take
- // effect.
- //
- // Returns:
- // OK on success
- // INTERNAL on I/O error.
- libtextclassifier3::Status ApplySchemaChange(SchemaProto new_schema);
-
libtextclassifier3::Status CheckSchemaSet() const {
return has_schema_successfully_set_
? libtextclassifier3::Status::OK
: absl_ports::FailedPreconditionError("Schema not set yet.");
}
- const Filesystem* filesystem_;
- std::string base_dir_;
- const Clock* clock_;
+ const Filesystem& filesystem_;
+ const std::string base_dir_;
+ const Clock& clock_;
// Used internally to indicate whether the class has been successfully
// initialized with a valid schema. Will be false if Initialize failed or no
@@ -372,7 +302,7 @@ class SchemaStore {
bool has_schema_successfully_set_ = false;
// Cached schema
- std::unique_ptr<FileBackedProto<SchemaProto>> schema_file_;
+ FileBackedProto<SchemaProto> schema_file_;
// A hash map of (type config name -> type config), allows faster lookup of
// type config in schema. The O(1) type config access makes schema-related and
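A note on the member changes above: switching from const pointers to const references (and holding FileBackedProto by value) is what forces the deletion of the move operations earlier in this header, because reference members cannot be reseated. A generic illustration (not icing code):

    struct Holder {
      explicit Holder(const int& v) : value_(v) {}
      const int& value_;  // bound at construction; copy/move assignment are
                          // implicitly deleted since it cannot be rebound
    };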
diff --git a/icing/schema/schema-store_test.cc b/icing/schema/schema-store_test.cc
index 3fd41c4..5ef2dea 100644
--- a/icing/schema/schema-store_test.cc
+++ b/icing/schema/schema-store_test.cc
@@ -21,9 +21,7 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/str_cat.h"
-#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
-#include "icing/file/mock-filesystem.h"
#include "icing/portable/equals-proto.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/schema.pb.h"
@@ -35,7 +33,6 @@
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
#include "icing/testing/tmp-directory.h"
-#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/util/crc32.h"
namespace icing {
@@ -47,35 +44,28 @@ using ::icing::lib::portable_equals_proto::EqualsProto;
using ::testing::ElementsAre;
using ::testing::Eq;
using ::testing::Ge;
-using ::testing::Gt;
-using ::testing::HasSubstr;
using ::testing::Not;
using ::testing::Pointee;
-using ::testing::Return;
-using ::testing::SizeIs;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
- PropertyConfigProto::Cardinality::REPEATED;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto_Cardinality_Code_OPTIONAL;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED =
+ PropertyConfigProto_Cardinality_Code_REPEATED;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
+ StringIndexingConfig_TokenizerType_Code_PLAIN;
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
+constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
-constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
- PropertyConfigProto::DataType::STRING;
-constexpr PropertyConfigProto::DataType::Code TYPE_DOUBLE =
- PropertyConfigProto::DataType::DOUBLE;
+constexpr PropertyConfigProto_DataType_Code TYPE_STRING =
+ PropertyConfigProto_DataType_Code_STRING;
+constexpr PropertyConfigProto_DataType_Code TYPE_DOUBLE =
+ PropertyConfigProto_DataType_Code_DOUBLE;
class SchemaStoreTest : public ::testing::Test {
protected:
- void SetUp() override {
- temp_dir_ = GetTestTempDir() + "/icing";
- schema_store_dir_ = temp_dir_ + "/schema_store";
- filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+ SchemaStoreTest() : test_dir_(GetTestTempDir() + "/icing") {
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
schema_ =
SchemaBuilder()
@@ -89,117 +79,30 @@ class SchemaStoreTest : public ::testing::Test {
}
void TearDown() override {
-    // Check that the schema store directory is the *only* directory in
-    // temp_dir_. In other words, ensure that all temporary directories have
-    // been properly cleaned up.
- std::vector<std::string> sub_dirs;
- ASSERT_TRUE(filesystem_.ListDirectory(temp_dir_.c_str(), &sub_dirs));
- ASSERT_THAT(sub_dirs, ElementsAre("schema_store"));
-
- // Finally, clean everything up.
- ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(temp_dir_.c_str()));
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
}
- Filesystem filesystem_;
- std::string temp_dir_;
- std::string schema_store_dir_;
+ const Filesystem filesystem_;
+ const std::string test_dir_;
SchemaProto schema_;
- FakeClock fake_clock_;
+ const FakeClock fake_clock_;
};
TEST_F(SchemaStoreTest, CreationWithNullPointerShouldFail) {
EXPECT_THAT(
- SchemaStore::Create(/*filesystem=*/nullptr, schema_store_dir_, &fake_clock_),
+ SchemaStore::Create(/*filesystem=*/nullptr, test_dir_, &fake_clock_),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
-TEST_F(SchemaStoreTest, SchemaStoreMoveConstructible) {
- // Create an instance of SchemaStore.
- SchemaProto schema =
- SchemaBuilder()
- .AddType(SchemaTypeConfigBuilder().SetType("TypeA").AddProperty(
- PropertyConfigBuilder()
- .SetName("prop1")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
- .Build();
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
-
- ICING_ASSERT_OK(schema_store->SetSchema(schema));
- ICING_ASSERT_OK_AND_ASSIGN(Crc32 expected_checksum,
- schema_store->ComputeChecksum());
-
- // Move construct an instance of SchemaStore
- SchemaStore move_constructed_schema_store(std::move(*schema_store));
- EXPECT_THAT(move_constructed_schema_store.GetSchema(),
- IsOkAndHolds(Pointee(EqualsProto(schema))));
- EXPECT_THAT(move_constructed_schema_store.ComputeChecksum(),
- IsOkAndHolds(Eq(expected_checksum)));
- SectionMetadata expected_metadata(/*id_in=*/0, MATCH_EXACT, TOKENIZER_PLAIN,
- "prop1");
- EXPECT_THAT(move_constructed_schema_store.GetSectionMetadata("TypeA"),
- IsOkAndHolds(Pointee(ElementsAre(expected_metadata))));
-}
-
-TEST_F(SchemaStoreTest, SchemaStoreMoveAssignment) {
- // Create an instance of SchemaStore.
- SchemaProto schema1 =
- SchemaBuilder()
- .AddType(SchemaTypeConfigBuilder().SetType("TypeA").AddProperty(
- PropertyConfigBuilder()
- .SetName("prop1")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
- .Build();
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
-
- ICING_ASSERT_OK(schema_store->SetSchema(schema1));
- ICING_ASSERT_OK_AND_ASSIGN(Crc32 expected_checksum,
- schema_store->ComputeChecksum());
-
- // Construct another instance of SchemaStore
- SchemaProto schema2 =
- SchemaBuilder()
- .AddType(SchemaTypeConfigBuilder().SetType("TypeB").AddProperty(
- PropertyConfigBuilder()
- .SetName("prop2")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
- .Build();
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SchemaStore> move_assigned_schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
- ICING_ASSERT_OK(schema_store->SetSchema(schema2));
-
- // Move assign the first instance into the second one.
- *move_assigned_schema_store = std::move(*schema_store);
- EXPECT_THAT(move_assigned_schema_store->GetSchema(),
- IsOkAndHolds(Pointee(EqualsProto(schema1))));
- EXPECT_THAT(move_assigned_schema_store->ComputeChecksum(),
- IsOkAndHolds(Eq(expected_checksum)));
- SectionMetadata expected_metadata(/*id_in=*/0, MATCH_EXACT, TOKENIZER_PLAIN,
- "prop1");
- EXPECT_THAT(move_assigned_schema_store->GetSectionMetadata("TypeA"),
- IsOkAndHolds(Pointee(ElementsAre(expected_metadata))));
-}
-
TEST_F(SchemaStoreTest, CorruptSchemaError) {
{
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- result.schema_types_new_by_name.insert(schema_.types(0).schema_type());
EXPECT_THAT(schema_store->SetSchema(schema_),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
@@ -215,14 +118,14 @@ TEST_F(SchemaStoreTest, CorruptSchemaError) {
.AddType(SchemaTypeConfigBuilder().SetType("corrupted"))
.Build();
- const std::string schema_file = absl_ports::StrCat(schema_store_dir_, "/schema.pb");
+ const std::string schema_file = absl_ports::StrCat(test_dir_, "/schema.pb");
const std::string serialized_schema = corrupt_schema.SerializeAsString();
filesystem_.Write(schema_file.c_str(), serialized_schema.data(),
serialized_schema.size());
// If ground truth was corrupted, we won't know what to do
- EXPECT_THAT(SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_),
+ EXPECT_THAT(SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_),
StatusIs(libtextclassifier3::StatusCode::INTERNAL));
}
@@ -230,12 +133,11 @@ TEST_F(SchemaStoreTest, RecoverCorruptDerivedFileOk) {
{
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- result.schema_types_new_by_name.insert(schema_.types(0).schema_type());
EXPECT_THAT(schema_store->SetSchema(schema_),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
@@ -250,12 +152,12 @@ TEST_F(SchemaStoreTest, RecoverCorruptDerivedFileOk) {
// regenerated from ground truth
const std::string schema_type_mapper_dir =
- absl_ports::StrCat(schema_store_dir_, "/schema_type_mapper");
+ absl_ports::StrCat(test_dir_, "/schema_type_mapper");
filesystem_.DeleteDirectoryRecursively(schema_type_mapper_dir.c_str());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
// Everything looks fine, ground truth and derived data
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
@@ -268,12 +170,11 @@ TEST_F(SchemaStoreTest, RecoverBadChecksumOk) {
{
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- result.schema_types_new_by_name.insert(schema_.types(0).schema_type());
EXPECT_THAT(schema_store->SetSchema(schema_),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
@@ -287,7 +188,7 @@ TEST_F(SchemaStoreTest, RecoverBadChecksumOk) {
// the recalculated checksum on initialization. This will force a regeneration
// of derived files from ground truth.
const std::string header_file =
- absl_ports::StrCat(schema_store_dir_, "/schema_store_header");
+ absl_ports::StrCat(test_dir_, "/schema_store_header");
SchemaStore::Header header;
header.magic = SchemaStore::Header::kMagic;
header.checksum = 10; // Arbitrary garbage checksum
@@ -296,7 +197,7 @@ TEST_F(SchemaStoreTest, RecoverBadChecksumOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
// Everything looks fine, ground truth and derived data
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
@@ -308,7 +209,7 @@ TEST_F(SchemaStoreTest, RecoverBadChecksumOk) {
TEST_F(SchemaStoreTest, CreateNoPreviousSchemaOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
// The apis to retrieve information about the schema should fail gracefully.
EXPECT_THAT(store->GetSchema(),
@@ -341,16 +242,15 @@ TEST_F(SchemaStoreTest, CreateNoPreviousSchemaOk) {
TEST_F(SchemaStoreTest, CreateWithPreviousSchemaOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
SchemaStore::SetSchemaResult result;
result.success = true;
- result.schema_types_new_by_name.insert(schema_.types(0).schema_type());
EXPECT_THAT(schema_store->SetSchema(schema_),
IsOkAndHolds(EqualsSetSchemaResult(result)));
schema_store.reset();
- EXPECT_THAT(SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_),
+ EXPECT_THAT(SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_),
IsOk());
}
@@ -363,11 +263,10 @@ TEST_F(SchemaStoreTest, MultipleCreateOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
SchemaStore::SetSchemaResult result;
result.success = true;
- result.schema_types_new_by_name.insert(schema_.types(0).schema_type());
EXPECT_THAT(schema_store->SetSchema(schema_),
IsOkAndHolds(EqualsSetSchemaResult(result)));
@@ -383,7 +282,7 @@ TEST_F(SchemaStoreTest, MultipleCreateOk) {
schema_store.reset();
ICING_ASSERT_OK_AND_ASSIGN(
- schema_store, SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ schema_store, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
// Verify that our in-memory structures are ok
EXPECT_THAT(schema_store->GetSchemaTypeConfig("email"),
@@ -399,12 +298,11 @@ TEST_F(SchemaStoreTest, MultipleCreateOk) {
TEST_F(SchemaStoreTest, SetNewSchemaOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- result.schema_types_new_by_name.insert(schema_.types(0).schema_type());
EXPECT_THAT(schema_store->SetSchema(schema_),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
@@ -415,12 +313,11 @@ TEST_F(SchemaStoreTest, SetNewSchemaOk) {
TEST_F(SchemaStoreTest, SetSameSchemaOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- result.schema_types_new_by_name.insert(schema_.types(0).schema_type());
EXPECT_THAT(schema_store->SetSchema(schema_),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
@@ -428,8 +325,6 @@ TEST_F(SchemaStoreTest, SetSameSchemaOk) {
EXPECT_THAT(*actual_schema, EqualsProto(schema_));
// And one more for fun
- result = SchemaStore::SetSchemaResult();
- result.success = true;
EXPECT_THAT(schema_store->SetSchema(schema_),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
@@ -439,12 +334,11 @@ TEST_F(SchemaStoreTest, SetSameSchemaOk) {
TEST_F(SchemaStoreTest, SetIncompatibleSchemaOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- result.schema_types_new_by_name.insert(schema_.types(0).schema_type());
EXPECT_THAT(schema_store->SetSchema(schema_),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
@@ -455,7 +349,6 @@ TEST_F(SchemaStoreTest, SetIncompatibleSchemaOk) {
schema_.clear_types();
// Set the incompatible schema
- result = SchemaStore::SetSchemaResult();
result.success = false;
result.schema_types_deleted_by_name.emplace("email");
result.schema_types_deleted_by_id.emplace(0);
@@ -466,7 +359,7 @@ TEST_F(SchemaStoreTest, SetIncompatibleSchemaOk) {
TEST_F(SchemaStoreTest, SetSchemaWithAddedTypeOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
SchemaProto schema = SchemaBuilder()
.AddType(SchemaTypeConfigBuilder().SetType("email"))
@@ -475,7 +368,6 @@ TEST_F(SchemaStoreTest, SetSchemaWithAddedTypeOk) {
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- result.schema_types_new_by_name.insert("email");
EXPECT_THAT(schema_store->SetSchema(schema),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
@@ -488,9 +380,6 @@ TEST_F(SchemaStoreTest, SetSchemaWithAddedTypeOk) {
.Build();
// Set the compatible schema
- result = SchemaStore::SetSchemaResult();
- result.success = true;
- result.schema_types_new_by_name.insert("new_type");
EXPECT_THAT(schema_store->SetSchema(schema),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
@@ -500,7 +389,7 @@ TEST_F(SchemaStoreTest, SetSchemaWithAddedTypeOk) {
TEST_F(SchemaStoreTest, SetSchemaWithDeletedTypeOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
SchemaProto schema =
SchemaBuilder()
@@ -511,8 +400,6 @@ TEST_F(SchemaStoreTest, SetSchemaWithDeletedTypeOk) {
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- result.schema_types_new_by_name.insert("email");
- result.schema_types_new_by_name.insert("message");
EXPECT_THAT(schema_store->SetSchema(schema),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
@@ -558,7 +445,7 @@ TEST_F(SchemaStoreTest, SetSchemaWithDeletedTypeOk) {
TEST_F(SchemaStoreTest, SetSchemaWithReorderedTypesOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
SchemaProto schema =
SchemaBuilder()
@@ -569,8 +456,6 @@ TEST_F(SchemaStoreTest, SetSchemaWithReorderedTypesOk) {
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- result.schema_types_new_by_name.insert("email");
- result.schema_types_new_by_name.insert("message");
EXPECT_THAT(schema_store->SetSchema(schema),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
@@ -585,8 +470,6 @@ TEST_F(SchemaStoreTest, SetSchemaWithReorderedTypesOk) {
// Since we assign SchemaTypeIds based on order in the SchemaProto, this will
// cause SchemaTypeIds to change
- result = SchemaStore::SetSchemaResult();
- result.success = true;
result.old_schema_type_ids_changed.emplace(0); // Old SchemaTypeId of "email"
result.old_schema_type_ids_changed.emplace(
1); // Old SchemaTypeId of "message"
@@ -598,10 +481,10 @@ TEST_F(SchemaStoreTest, SetSchemaWithReorderedTypesOk) {
EXPECT_THAT(*actual_schema, EqualsProto(schema));
}
-TEST_F(SchemaStoreTest, IndexedPropertyChangeRequiresReindexingOk) {
+TEST_F(SchemaStoreTest, SetSchemaThatRequiresReindexingOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
SchemaProto schema =
SchemaBuilder()
@@ -616,7 +499,6 @@ TEST_F(SchemaStoreTest, IndexedPropertyChangeRequiresReindexingOk) {
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- result.schema_types_new_by_name.insert("email");
EXPECT_THAT(schema_store->SetSchema(schema),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
@@ -632,10 +514,10 @@ TEST_F(SchemaStoreTest, IndexedPropertyChangeRequiresReindexingOk) {
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
+ // With a new indexed property, we'll need to reindex
+ result.index_incompatible = true;
+
// Set the compatible schema
- result = SchemaStore::SetSchemaResult();
- result.success = true;
- result.schema_types_index_incompatible_by_name.insert("email");
EXPECT_THAT(schema_store->SetSchema(schema),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
@@ -645,7 +527,7 @@ TEST_F(SchemaStoreTest, IndexedPropertyChangeRequiresReindexingOk) {
TEST_F(SchemaStoreTest, IndexNestedDocumentsChangeRequiresReindexingOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
// Make two schemas. One that sets index_nested_properties to false and one
// that sets it to true.
@@ -682,8 +564,6 @@ TEST_F(SchemaStoreTest, IndexNestedDocumentsChangeRequiresReindexingOk) {
// Set schema with index_nested_properties=false to start.
SchemaStore::SetSchemaResult result;
result.success = true;
- result.schema_types_new_by_name.insert("email");
- result.schema_types_new_by_name.insert("person");
EXPECT_THAT(schema_store->SetSchema(no_nested_index_schema),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
@@ -694,7 +574,7 @@ TEST_F(SchemaStoreTest, IndexNestedDocumentsChangeRequiresReindexingOk) {
// 'person' is index incompatible.
result = SchemaStore::SetSchemaResult();
result.success = true;
- result.schema_types_index_incompatible_by_name.insert("person");
+ result.index_incompatible = true;
EXPECT_THAT(schema_store->SetSchema(nested_index_schema),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
@@ -704,7 +584,7 @@ TEST_F(SchemaStoreTest, IndexNestedDocumentsChangeRequiresReindexingOk) {
// to 'person' is index incompatible.
result = SchemaStore::SetSchemaResult();
result.success = true;
- result.schema_types_index_incompatible_by_name.insert("person");
+ result.index_incompatible = true;
EXPECT_THAT(schema_store->SetSchema(no_nested_index_schema),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
@@ -714,7 +594,7 @@ TEST_F(SchemaStoreTest, IndexNestedDocumentsChangeRequiresReindexingOk) {
TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
SchemaProto schema =
SchemaBuilder()
@@ -729,7 +609,6 @@ TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) {
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- result.schema_types_new_by_name.insert("email");
EXPECT_THAT(schema_store->SetSchema(schema),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
@@ -773,185 +652,10 @@ TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) {
EXPECT_THAT(*actual_schema, EqualsProto(schema));
}
-TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleNestedTypesOk) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
-
- // 1. Create a ContactPoint type with a repeated property and set that schema
- SchemaTypeConfigBuilder contact_point_repeated_label =
- SchemaTypeConfigBuilder()
- .SetType("ContactPoint")
- .AddProperty(PropertyConfigBuilder()
- .SetName("label")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REPEATED));
- SchemaProto old_schema =
- SchemaBuilder().AddType(contact_point_repeated_label).Build();
- ICING_EXPECT_OK(schema_store->SetSchema(old_schema));
- ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId old_contact_point_type_id,
- schema_store->GetSchemaTypeId("ContactPoint"));
-
- // 2. Create a type that references the ContactPoint type and make a backwards
- // incompatible change to ContactPoint
- SchemaTypeConfigBuilder contact_point_optional_label =
- SchemaTypeConfigBuilder()
- .SetType("ContactPoint")
- .AddProperty(PropertyConfigBuilder()
- .SetName("label")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL));
- SchemaTypeConfigBuilder person =
- SchemaTypeConfigBuilder().SetType("Person").AddProperty(
- PropertyConfigBuilder()
- .SetName("contactPoints")
- .SetDataTypeDocument("ContactPoint",
- /*index_nested_properties=*/true)
- .SetCardinality(CARDINALITY_REPEATED));
- SchemaProto new_schema = SchemaBuilder()
- .AddType(contact_point_optional_label)
- .AddType(person)
- .Build();
-
- // 3. SetSchema should fail with ignore_errors_and_delete_documents=false and
- // the old schema should remain
- SchemaStore::SetSchemaResult expected_result;
- expected_result.success = false;
- expected_result.schema_types_incompatible_by_name.insert("ContactPoint");
- expected_result.schema_types_incompatible_by_id.insert(
- old_contact_point_type_id);
- expected_result.schema_types_new_by_name.insert("Person");
- EXPECT_THAT(
- schema_store->SetSchema(new_schema,
- /*ignore_errors_and_delete_documents=*/false),
- IsOkAndHolds(EqualsSetSchemaResult(expected_result)));
- ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
- schema_store->GetSchema());
- EXPECT_THAT(*actual_schema, EqualsProto(old_schema));
-
- // 4. SetSchema should succeed with ignore_errors_and_delete_documents=true
- // and the new schema should be set
- expected_result.success = true;
- EXPECT_THAT(
- schema_store->SetSchema(new_schema,
- /*ignore_errors_and_delete_documents=*/true),
- IsOkAndHolds(EqualsSetSchemaResult(expected_result)));
- ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
- EXPECT_THAT(*actual_schema, EqualsProto(new_schema));
-}
-
-TEST_F(SchemaStoreTest, SetSchemaWithIndexIncompatibleNestedTypesOk) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
-
- // 1. Create a ContactPoint type with label that matches prefix and set that
- // schema
- SchemaTypeConfigBuilder contact_point_prefix_label =
- SchemaTypeConfigBuilder()
- .SetType("ContactPoint")
- .AddProperty(PropertyConfigBuilder()
- .SetName("label")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REPEATED));
- SchemaProto old_schema =
- SchemaBuilder().AddType(contact_point_prefix_label).Build();
- ICING_EXPECT_OK(schema_store->SetSchema(old_schema));
-
- // 2. Create a type that references the ContactPoint type and make an index
- // backwards incompatible change to ContactPoint
- SchemaTypeConfigBuilder contact_point_exact_label =
- SchemaTypeConfigBuilder()
- .SetType("ContactPoint")
- .AddProperty(PropertyConfigBuilder()
- .SetName("label")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REPEATED));
- SchemaTypeConfigBuilder person =
- SchemaTypeConfigBuilder().SetType("Person").AddProperty(
- PropertyConfigBuilder()
- .SetName("contactPoints")
- .SetDataTypeDocument("ContactPoint",
- /*index_nested_properties=*/true)
- .SetCardinality(CARDINALITY_REPEATED));
- SchemaProto new_schema = SchemaBuilder()
- .AddType(contact_point_exact_label)
- .AddType(person)
- .Build();
-
- // SetSchema should succeed, and only ContactPoint should be in
- // schema_types_index_incompatible_by_name.
- SchemaStore::SetSchemaResult expected_result;
- expected_result.success = true;
- expected_result.schema_types_index_incompatible_by_name.insert(
- "ContactPoint");
- expected_result.schema_types_new_by_name.insert("Person");
- EXPECT_THAT(
- schema_store->SetSchema(new_schema,
- /*ignore_errors_and_delete_documents=*/false),
- IsOkAndHolds(EqualsSetSchemaResult(expected_result)));
- ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
- schema_store->GetSchema());
- EXPECT_THAT(*actual_schema, EqualsProto(new_schema));
-}
-
-TEST_F(SchemaStoreTest, SetSchemaWithCompatibleNestedTypesOk) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
-
- // 1. Create a ContactPoint type with an optional property and set that schema
- SchemaTypeConfigBuilder contact_point_optional_label =
- SchemaTypeConfigBuilder()
- .SetType("ContactPoint")
- .AddProperty(PropertyConfigBuilder()
- .SetName("label")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL));
- SchemaProto old_schema =
- SchemaBuilder().AddType(contact_point_optional_label).Build();
- ICING_EXPECT_OK(schema_store->SetSchema(old_schema));
-
- // 2. Create a type that references the ContactPoint type and make a backwards
- // compatible change to ContactPoint
- SchemaTypeConfigBuilder contact_point_repeated_label =
- SchemaTypeConfigBuilder()
- .SetType("ContactPoint")
- .AddProperty(PropertyConfigBuilder()
- .SetName("label")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REPEATED));
- SchemaTypeConfigBuilder person =
- SchemaTypeConfigBuilder().SetType("Person").AddProperty(
- PropertyConfigBuilder()
- .SetName("contactPoints")
- .SetDataTypeDocument("ContactPoint",
- /*index_nested_properties=*/true)
- .SetCardinality(CARDINALITY_REPEATED));
- SchemaProto new_schema = SchemaBuilder()
- .AddType(contact_point_repeated_label)
- .AddType(person)
- .Build();
-
- // 3. SetSchema should succeed, and only ContactPoint should be in
- // schema_types_changed_fully_compatible_by_name.
- SchemaStore::SetSchemaResult expected_result;
- expected_result.success = true;
- expected_result.schema_types_changed_fully_compatible_by_name.insert(
- "ContactPoint");
- expected_result.schema_types_new_by_name.insert("Person");
- EXPECT_THAT(schema_store->SetSchema(
- new_schema, /*ignore_errors_and_delete_documents=*/false),
- IsOkAndHolds(EqualsSetSchemaResult(expected_result)));
- ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
- schema_store->GetSchema());
- EXPECT_THAT(*actual_schema, EqualsProto(new_schema));
-}
-
TEST_F(SchemaStoreTest, GetSchemaTypeId) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
schema_.clear_types();
@@ -967,8 +671,6 @@ TEST_F(SchemaStoreTest, GetSchemaTypeId) {
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- result.schema_types_new_by_name.insert(first_type);
- result.schema_types_new_by_name.insert(second_type);
EXPECT_THAT(schema_store->SetSchema(schema_),
IsOkAndHolds(EqualsSetSchemaResult(result)));
@@ -979,7 +681,7 @@ TEST_F(SchemaStoreTest, GetSchemaTypeId) {
TEST_F(SchemaStoreTest, ComputeChecksumDefaultOnEmptySchemaStore) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
Crc32 default_checksum;
EXPECT_THAT(schema_store->ComputeChecksum(), IsOkAndHolds(default_checksum));
@@ -988,7 +690,7 @@ TEST_F(SchemaStoreTest, ComputeChecksumDefaultOnEmptySchemaStore) {
TEST_F(SchemaStoreTest, ComputeChecksumSameBetweenCalls) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
SchemaProto foo_schema =
SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("foo")).Build();
@@ -1004,7 +706,7 @@ TEST_F(SchemaStoreTest, ComputeChecksumSameBetweenCalls) {
TEST_F(SchemaStoreTest, ComputeChecksumSameAcrossInstances) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
SchemaProto foo_schema =
SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("foo")).Build();
@@ -1017,14 +719,14 @@ TEST_F(SchemaStoreTest, ComputeChecksumSameAcrossInstances) {
schema_store.reset();
ICING_ASSERT_OK_AND_ASSIGN(
- schema_store, SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ schema_store, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
EXPECT_THAT(schema_store->ComputeChecksum(), IsOkAndHolds(checksum));
}
TEST_F(SchemaStoreTest, ComputeChecksumChangesOnModification) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
SchemaProto foo_schema =
SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("foo")).Build();
@@ -1048,7 +750,7 @@ TEST_F(SchemaStoreTest, ComputeChecksumChangesOnModification) {
TEST_F(SchemaStoreTest, PersistToDiskFineForEmptySchemaStore) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
// Persisting is fine and shouldn't affect anything
ICING_EXPECT_OK(schema_store->PersistToDisk());
@@ -1057,7 +759,7 @@ TEST_F(SchemaStoreTest, PersistToDiskFineForEmptySchemaStore) {
TEST_F(SchemaStoreTest, PersistToDiskPreservesAcrossInstances) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
SchemaProto schema =
SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("foo")).Build();
@@ -1082,7 +784,7 @@ TEST_F(SchemaStoreTest, PersistToDiskPreservesAcrossInstances) {
// And we get the same schema back on reinitialization
ICING_ASSERT_OK_AND_ASSIGN(
- schema_store, SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ schema_store, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
EXPECT_THAT(*actual_schema, EqualsProto(schema));
}
@@ -1090,7 +792,7 @@ TEST_F(SchemaStoreTest, PersistToDiskPreservesAcrossInstances) {
TEST_F(SchemaStoreTest, SchemaStoreStorageInfoProto) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
// Create a schema with two types: one simple type and one type that uses all
// 16 sections.
@@ -1127,8 +829,6 @@ TEST_F(SchemaStoreTest, SchemaStoreStorageInfoProto) {
SchemaStore::SetSchemaResult result;
result.success = true;
- result.schema_types_new_by_name.insert("email");
- result.schema_types_new_by_name.insert("fullSectionsType");
EXPECT_THAT(schema_store->SetSchema(schema),
IsOkAndHolds(EqualsSetSchemaResult(result)));
@@ -1139,114 +839,6 @@ TEST_F(SchemaStoreTest, SchemaStoreStorageInfoProto) {
EXPECT_THAT(storage_info.num_schema_types_sections_exhausted(), Eq(1));
}
-TEST_F(SchemaStoreTest, GetDebugInfo) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
-
- // Set schema
- ASSERT_THAT(
- schema_store->SetSchema(schema_),
- IsOkAndHolds(EqualsSetSchemaResult(SchemaStore::SetSchemaResult{
- .success = true,
- .schema_types_new_by_name = {schema_.types(0).schema_type()}})));
-
- // Check debug info
- ICING_ASSERT_OK_AND_ASSIGN(SchemaDebugInfoProto out,
- schema_store->GetDebugInfo());
- EXPECT_THAT(out.schema(), EqualsProto(schema_));
- EXPECT_THAT(out.crc(), Gt(0));
-}
-
-TEST_F(SchemaStoreTest, GetDebugInfoForEmptySchemaStore) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
-
- // Check debug info before setting a schema
- ICING_ASSERT_OK_AND_ASSIGN(SchemaDebugInfoProto out,
- schema_store->GetDebugInfo());
- SchemaDebugInfoProto expected_out;
- expected_out.set_crc(0);
- EXPECT_THAT(out, EqualsProto(expected_out));
-}
-
-TEST_F(SchemaStoreTest, InitializeRegenerateDerivedFilesFailure) {
- // This test covers the first point at which RegenerateDerivedFiles could fail.
- // This should simply result in SchemaStore::Create returning an INTERNAL error.
-
- {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
- SchemaProto schema = SchemaBuilder()
- .AddType(SchemaTypeConfigBuilder().SetType("Type"))
- .Build();
- ICING_ASSERT_OK(schema_store->SetSchema(std::move(schema)));
- }
-
- auto mock_filesystem = std::make_unique<MockFilesystem>();
- ON_CALL(*mock_filesystem,
- CreateDirectoryRecursively(HasSubstr("key_mapper_dir")))
- .WillByDefault(Return(false));
- {
- EXPECT_THAT(SchemaStore::Create(mock_filesystem.get(), schema_store_dir_,
- &fake_clock_),
- StatusIs(libtextclassifier3::StatusCode::INTERNAL));
- }
-}
-
-TEST_F(SchemaStoreTest, SetSchemaRegenerateDerivedFilesFailure) {
- // This test covers the second point at which RegenerateDerivedFiles could fail.
- // If handled correctly, the schema store and section manager should still be
- // in the original, valid state.
- SchemaTypeConfigProto type =
- SchemaTypeConfigBuilder()
- .SetType("Type")
- .AddProperty(PropertyConfigBuilder()
- .SetName("prop1")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .Build();
- {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
- SchemaProto schema = SchemaBuilder().AddType(type).Build();
- ICING_ASSERT_OK(schema_store->SetSchema(std::move(schema)));
- }
-
- {
- auto mock_filesystem = std::make_unique<MockFilesystem>();
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(mock_filesystem.get(), schema_store_dir_,
- &fake_clock_));
-
- ON_CALL(*mock_filesystem,
- CreateDirectoryRecursively(HasSubstr("key_mapper_dir")))
- .WillByDefault(Return(false));
- SchemaProto schema =
- SchemaBuilder()
- .AddType(type)
- .AddType(SchemaTypeConfigBuilder().SetType("Type2"))
- .Build();
- EXPECT_THAT(schema_store->SetSchema(std::move(schema)),
- StatusIs(libtextclassifier3::StatusCode::INTERNAL));
- DocumentProto document = DocumentBuilder()
- .SetSchema("Type")
- .AddStringProperty("prop1", "foo bar baz")
- .Build();
- SectionMetadata expected_metadata(/*id_in=*/0, MATCH_EXACT, TOKENIZER_PLAIN,
- "prop1");
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Section> sections,
- schema_store->ExtractSections(document));
- ASSERT_THAT(sections, SizeIs(1));
- EXPECT_THAT(sections.at(0).metadata, Eq(expected_metadata));
- EXPECT_THAT(sections.at(0).content, ElementsAre("foo bar baz"));
- }
-}
-
} // namespace
} // namespace lib
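
A note on the churn in the expected results above: the removed cases exercised per-type bookkeeping on SchemaStore::SetSchemaResult (schema_types_new_by_name, schema_types_changed_fully_compatible_by_name, schema_types_index_incompatible_by_name). After this change only the coarse fields remain, so an expected result is built as in this minimal sketch, which assumes nothing beyond the fields visible in the hunks above:

    // Build the expected result: success plus, at most, the coarse
    // index_incompatible flag; there are no longer per-type sets to fill.
    SchemaStore::SetSchemaResult expected;
    expected.success = true;
    expected.index_incompatible = true;  // only when an indexing config changed
    EXPECT_THAT(schema_store->SetSchema(no_nested_index_schema),
                IsOkAndHolds(EqualsSetSchemaResult(expected)));
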
diff --git a/icing/schema/schema-util.cc b/icing/schema/schema-util.cc
index 88b6946..cabe76d 100644
--- a/icing/schema/schema-util.cc
+++ b/icing/schema/schema-util.cc
@@ -37,20 +37,6 @@ namespace lib {
namespace {
-bool ArePropertiesEqual(const PropertyConfigProto& old_property,
- const PropertyConfigProto& new_property) {
- return old_property.property_name() == new_property.property_name() &&
- old_property.data_type() == new_property.data_type() &&
- old_property.schema_type() == new_property.schema_type() &&
- old_property.cardinality() == new_property.cardinality() &&
- old_property.string_indexing_config().term_match_type() ==
- new_property.string_indexing_config().term_match_type() &&
- old_property.string_indexing_config().tokenizer_type() ==
- new_property.string_indexing_config().tokenizer_type() &&
- old_property.document_indexing_config().index_nested_properties() ==
- new_property.document_indexing_config().index_nested_properties();
-}
-
bool IsCardinalityCompatible(const PropertyConfigProto& old_property,
const PropertyConfigProto& new_property) {
if (old_property.cardinality() < new_property.cardinality()) {
@@ -107,33 +93,6 @@ bool IsTermMatchTypeCompatible(const StringIndexingConfig& old_indexed,
old_indexed.tokenizer_type() == new_indexed.tokenizer_type();
}
-void AddIncompatibleChangeToDelta(
- std::unordered_set<std::string>& incompatible_delta,
- const SchemaTypeConfigProto& old_type_config,
- const SchemaUtil::DependencyMap& new_schema_dependency_map,
- const SchemaUtil::TypeConfigMap& old_type_config_map,
- const SchemaUtil::TypeConfigMap& new_type_config_map) {
- // If this type is incompatible, then every type that depends on it might
- // also be incompatible. Use the dependency map to mark those ones as
- // incompatible too.
- incompatible_delta.insert(old_type_config.schema_type());
- auto parent_types_itr =
- new_schema_dependency_map.find(old_type_config.schema_type());
- if (parent_types_itr != new_schema_dependency_map.end()) {
- for (std::string_view parent_type : parent_types_itr->second) {
- // The types from new_schema that depend on the current
- // old_type_config may not be present in old_schema.
- // Those types will be listed in schema_delta.schema_types_new
- // instead.
- std::string parent_type_str(parent_type);
- if (old_type_config_map.find(parent_type_str) !=
- old_type_config_map.end()) {
- incompatible_delta.insert(std::move(parent_type_str));
- }
- }
- }
-}
-
} // namespace
libtextclassifier3::Status ExpandTranstiveDependencies(
@@ -473,9 +432,9 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
const SchemaProto& old_schema, const SchemaProto& new_schema,
const DependencyMap& new_schema_dependency_map) {
SchemaDelta schema_delta;
+ schema_delta.index_incompatible = false;
- TypeConfigMap old_type_config_map, new_type_config_map;
- BuildTypeConfigMap(old_schema, &old_type_config_map);
+ TypeConfigMap new_type_config_map;
BuildTypeConfigMap(new_schema, &new_type_config_map);
// Iterate through and check each field of the old schema
@@ -504,9 +463,6 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
// If there is a different number of properties, then there must have been a
// change.
- bool has_property_changed =
- old_type_config.properties_size() !=
- new_schema_type_and_config->second.properties_size();
bool is_incompatible = false;
bool is_index_incompatible = false;
for (const auto& old_property_config : old_type_config.properties()) {
@@ -542,11 +498,6 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
const PropertyConfigProto* new_property_config =
new_property_name_and_config->second;
- if (!has_property_changed &&
- !ArePropertiesEqual(old_property_config, *new_property_config)) {
- // Finally found a property that changed.
- has_property_changed = true;
- }
if (!IsPropertyCompatible(old_property_config, *new_property_config)) {
ICING_VLOG(1) << absl_ports::StrCat(
@@ -594,33 +545,26 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
}
if (is_incompatible) {
- AddIncompatibleChangeToDelta(schema_delta.schema_types_incompatible,
- old_type_config, new_schema_dependency_map,
- old_type_config_map, new_type_config_map);
+ // If this type is incompatible, then every type that depends on it might
+ // also be incompatible. Use the dependency map to mark those ones as
+ // incompatible too.
+ schema_delta.schema_types_incompatible.insert(
+ old_type_config.schema_type());
+ auto parent_types_itr =
+ new_schema_dependency_map.find(old_type_config.schema_type());
+ if (parent_types_itr != new_schema_dependency_map.end()) {
+ schema_delta.schema_types_incompatible.reserve(
+ schema_delta.schema_types_incompatible.size() +
+ parent_types_itr->second.size());
+ schema_delta.schema_types_incompatible.insert(
+ parent_types_itr->second.begin(), parent_types_itr->second.end());
+ }
}
if (is_index_incompatible) {
- AddIncompatibleChangeToDelta(schema_delta.schema_types_index_incompatible,
- old_type_config, new_schema_dependency_map,
- old_type_config_map, new_type_config_map);
+ schema_delta.index_incompatible = true;
}
- if (!is_incompatible && !is_index_incompatible && has_property_changed) {
- schema_delta.schema_types_changed_fully_compatible.insert(
- old_type_config.schema_type());
- }
-
- // Lastly, remove this type from the map. We know that this type can't
- // come up in future iterations through the old schema types because the old
- // type config has unique types.
- new_type_config_map.erase(old_type_config.schema_type());
- }
-
- // Any types that are still present in the new_type_config_map are newly added
- // types.
- schema_delta.schema_types_new.reserve(new_type_config_map.size());
- for (auto& kvp : new_type_config_map) {
- schema_delta.schema_types_new.insert(std::move(kvp.first));
}
return schema_delta;
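
The inlined block above replaces the deleted AddIncompatibleChangeToDelta helper: when a type becomes incompatible, everything that depends on it is marked incompatible too. One difference worth noting: the helper filtered dependents against the old schema's type map (newly added dependents were reported via schema_types_new instead), while the inlined version inserts all dependents, since new types are no longer reported separately. A self-contained sketch of the propagation, using the DependencyMap layout from the alias in schema-util.h below; because the map is transitively expanded elsewhere in this file (see ExpandTranstiveDependencies), one level of insertion suffices:

    #include <string>
    #include <string_view>
    #include <unordered_map>
    #include <unordered_set>

    using DependencyMap =
        std::unordered_map<std::string_view,
                           std::unordered_set<std::string_view>>;

    // Marks `type` incompatible and propagates the mark to every type that
    // depends on it (directly or, via the expanded map, transitively).
    void MarkIncompatible(const std::string& type, const DependencyMap& deps,
                          std::unordered_set<std::string>& incompatible) {
      incompatible.insert(type);
      auto it = deps.find(type);
      if (it != deps.end()) {
        for (std::string_view dependent : it->second) {
          incompatible.emplace(dependent);
        }
      }
    }
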
diff --git a/icing/schema/schema-util.h b/icing/schema/schema-util.h
index fa80b15..abbc55d 100644
--- a/icing/schema/schema-util.h
+++ b/icing/schema/schema-util.h
@@ -41,6 +41,12 @@ class SchemaUtil {
std::unordered_set<std::string_view>>;
struct SchemaDelta {
+ // Whether an indexing config has changed, requiring the index to be
+ // regenerated. We don't list out all the types that make the index
+ // incompatible because our index isn't optimized for that. It's much easier
+ // to reset the entire index and reindex every document.
+ bool index_incompatible = false;
+
// Which schema types were present in the old schema, but were deleted from
// the new schema.
std::unordered_set<std::string> schema_types_deleted;
@@ -49,28 +55,10 @@ class SchemaUtil {
// could invalidate existing Documents of that schema type.
std::unordered_set<std::string> schema_types_incompatible;
- // Schema types that were added in the new schema. Represented by the
- // `schema_type` field in the SchemaTypeConfigProto.
- std::unordered_set<std::string> schema_types_new;
-
- // Schema types that were changed in a way that was backwards compatible and
- // didn't invalidate the index. Represented by the `schema_type` field in
- // the SchemaTypeConfigProto.
- std::unordered_set<std::string> schema_types_changed_fully_compatible;
-
- // Schema types that were changed in a way that was backwards compatible,
- // but invalidated the index. Represented by the `schema_type` field in the
- // SchemaTypeConfigProto.
- std::unordered_set<std::string> schema_types_index_incompatible;
-
bool operator==(const SchemaDelta& other) const {
- return schema_types_deleted == other.schema_types_deleted &&
- schema_types_incompatible == other.schema_types_incompatible &&
- schema_types_new == other.schema_types_new &&
- schema_types_changed_fully_compatible ==
- other.schema_types_changed_fully_compatible &&
- schema_types_index_incompatible ==
- other.schema_types_index_incompatible;
+ return index_incompatible == other.index_incompatible &&
+ schema_types_deleted == other.schema_types_deleted &&
+ schema_types_incompatible == other.schema_types_incompatible;
}
};
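
The new header comment above documents the trade-off: instead of tracking which types' indexing configs changed, one bool tells the caller to rebuild the whole index. A hypothetical consumer sketch (the handler names are illustrative, not from this repo; SchemaDelta is re-declared locally with only the fields kept above):

    #include <iostream>
    #include <string>
    #include <unordered_set>

    struct SchemaDelta {
      bool index_incompatible = false;
      std::unordered_set<std::string> schema_types_deleted;
      std::unordered_set<std::string> schema_types_incompatible;
    };

    // Illustrative stand-ins for the real handlers.
    void ResetIndexAndReindexAllDocuments() { std::cout << "rebuild index\n"; }
    void DeleteDocumentsOfType(const std::string& type) {
      std::cout << "drop documents of type " << type << "\n";
    }

    void ApplySchemaDelta(const SchemaDelta& delta) {
      if (delta.index_incompatible) {
        // Cheaper than per-type index surgery, per the comment above.
        ResetIndexAndReindexAllDocuments();
      }
      for (const std::string& type : delta.schema_types_deleted) {
        DeleteDocumentsOfType(type);
      }
      for (const std::string& type : delta.schema_types_incompatible) {
        DeleteDocumentsOfType(type);
      }
    }
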
diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc
index f28a2f8..049dd79 100644
--- a/icing/schema/schema-util_test.cc
+++ b/icing/schema/schema-util_test.cc
@@ -38,32 +38,32 @@ constexpr char kEmailType[] = "EmailMessage";
constexpr char kMessageType[] = "Text";
constexpr char kPersonType[] = "Person";
-constexpr PropertyConfigProto::DataType::Code TYPE_DOCUMENT =
- PropertyConfigProto::DataType::DOCUMENT;
-constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
- PropertyConfigProto::DataType::STRING;
-constexpr PropertyConfigProto::DataType::Code TYPE_INT =
- PropertyConfigProto::DataType::INT64;
-constexpr PropertyConfigProto::DataType::Code TYPE_DOUBLE =
- PropertyConfigProto::DataType::DOUBLE;
-
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_UNKNOWN =
- PropertyConfigProto::Cardinality::UNKNOWN;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
- PropertyConfigProto::Cardinality::REQUIRED;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
- PropertyConfigProto::Cardinality::REPEATED;
-
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_NONE =
- StringIndexingConfig::TokenizerType::NONE;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-
-constexpr TermMatchType::Code MATCH_UNKNOWN = TermMatchType::UNKNOWN;
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
+constexpr PropertyConfigProto_DataType_Code TYPE_DOCUMENT =
+ PropertyConfigProto_DataType_Code_DOCUMENT;
+constexpr PropertyConfigProto_DataType_Code TYPE_STRING =
+ PropertyConfigProto_DataType_Code_STRING;
+constexpr PropertyConfigProto_DataType_Code TYPE_INT =
+ PropertyConfigProto_DataType_Code_INT64;
+constexpr PropertyConfigProto_DataType_Code TYPE_DOUBLE =
+ PropertyConfigProto_DataType_Code_DOUBLE;
+
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_UNKNOWN =
+ PropertyConfigProto_Cardinality_Code_UNKNOWN;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED =
+ PropertyConfigProto_Cardinality_Code_REQUIRED;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto_Cardinality_Code_OPTIONAL;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED =
+ PropertyConfigProto_Cardinality_Code_REPEATED;
+
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_NONE =
+ StringIndexingConfig_TokenizerType_Code_NONE;
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
+ StringIndexingConfig_TokenizerType_Code_PLAIN;
+
+constexpr TermMatchType_Code MATCH_UNKNOWN = TermMatchType_Code_UNKNOWN;
+constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
+constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
TEST(SchemaUtilTest, DependencyGraphAlphabeticalOrder) {
// Create a schema with the following dependencies:
@@ -705,7 +705,6 @@ TEST(SchemaUtilTest, NewOptionalPropertyIsCompatible) {
.Build();
SchemaUtil::SchemaDelta schema_delta;
- schema_delta.schema_types_changed_fully_compatible.insert(kEmailType);
SchemaUtil::DependencyMap no_dependencies_map;
EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(
old_schema, new_schema_with_optional, no_dependencies_map),
@@ -818,8 +817,6 @@ TEST(SchemaUtilTest, CompatibilityOfDifferentCardinalityOk) {
// We can have the new schema be less restrictive, OPTIONAL->REPEATED;
SchemaUtil::SchemaDelta compatible_schema_delta;
- compatible_schema_delta.schema_types_changed_fully_compatible.insert(
- kEmailType);
EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(
/*old_schema=*/more_restrictive_schema,
/*new_schema=*/less_restrictive_schema, no_dependencies_map),
@@ -915,6 +912,7 @@ TEST(SchemaUtilTest, DifferentSchemaTypeIsIncompatible) {
SchemaUtil::SchemaDelta actual = SchemaUtil::ComputeCompatibilityDelta(
old_schema, new_schema, dependencies_map);
EXPECT_THAT(actual, Eq(schema_delta));
+ EXPECT_THAT(actual.index_incompatible, testing::IsFalse());
EXPECT_THAT(actual.schema_types_incompatible,
testing::ElementsAre(kEmailType));
EXPECT_THAT(actual.schema_types_deleted, testing::IsEmpty());
@@ -946,7 +944,7 @@ TEST(SchemaUtilTest, ChangingIndexedPropertiesMakesIndexIncompatible) {
.Build();
SchemaUtil::SchemaDelta schema_delta;
- schema_delta.schema_types_index_incompatible.insert(kPersonType);
+ schema_delta.index_incompatible = true;
// New schema gained a new indexed property.
SchemaUtil::DependencyMap no_dependencies_map;
@@ -993,7 +991,7 @@ TEST(SchemaUtilTest, AddingNewIndexedPropertyMakesIndexIncompatible) {
.Build();
SchemaUtil::SchemaDelta schema_delta;
- schema_delta.schema_types_index_incompatible.insert(kPersonType);
+ schema_delta.index_incompatible = true;
SchemaUtil::DependencyMap no_dependencies_map;
EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
no_dependencies_map),
@@ -1033,7 +1031,6 @@ TEST(SchemaUtilTest, AddingTypeIsCompatible) {
.Build();
SchemaUtil::SchemaDelta schema_delta;
- schema_delta.schema_types_new.insert(kEmailType);
SchemaUtil::DependencyMap no_dependencies_map;
EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
no_dependencies_map),
@@ -1112,7 +1109,7 @@ TEST(SchemaUtilTest, DeletingPropertyAndChangingProperty) {
SchemaUtil::SchemaDelta schema_delta;
schema_delta.schema_types_incompatible.emplace(kEmailType);
- schema_delta.schema_types_index_incompatible.emplace(kEmailType);
+ schema_delta.index_incompatible = true;
SchemaUtil::DependencyMap no_dependencies_map;
SchemaUtil::SchemaDelta actual = SchemaUtil::ComputeCompatibilityDelta(
old_schema, new_schema, no_dependencies_map);
@@ -1160,7 +1157,7 @@ TEST(SchemaUtilTest, IndexNestedDocumentsIndexIncompatible) {
// should make kPersonType index_incompatible. kEmailType should be
// unaffected.
SchemaUtil::SchemaDelta schema_delta;
- schema_delta.schema_types_index_incompatible.emplace(kPersonType);
+ schema_delta.index_incompatible = true;
SchemaUtil::DependencyMap dependencies_map = {{kEmailType, {kPersonType}}};
SchemaUtil::SchemaDelta actual = SchemaUtil::ComputeCompatibilityDelta(
no_nested_index_schema, nested_index_schema, dependencies_map);
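
The constant rewrites at the top of this test file swap the nested-enum spellings (e.g. PropertyConfigProto::DataType::DOCUMENT) for the underscore-joined names that protobuf's C++ generator emits at namespace scope; the underscore form exists across generator and runtime variants, which is presumably the motivation here. A hand-written stand-in showing the naming scheme (no generated headers involved; the enum values are illustrative):

    // For message PropertyConfigProto { message DataType { enum Code {...} } }
    // the C++ generator emits a namespace-scope enum whose type and constant
    // names join the nesting path with underscores:
    enum PropertyConfigProto_DataType_Code {
      PropertyConfigProto_DataType_Code_UNKNOWN = 0,
      PropertyConfigProto_DataType_Code_STRING = 1,
      PropertyConfigProto_DataType_Code_DOCUMENT = 6,
    };

    // Usable exactly as the test constants above use it:
    constexpr PropertyConfigProto_DataType_Code TYPE_STRING =
        PropertyConfigProto_DataType_Code_STRING;
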
diff --git a/icing/schema/section.h b/icing/schema/section.h
index 8b2ba55..40e623a 100644
--- a/icing/schema/section.h
+++ b/icing/schema/section.h
@@ -77,11 +77,6 @@ struct SectionMetadata {
id(id_in),
tokenizer(tokenizer),
term_match_type(term_match_type_in) {}
-
- bool operator==(const SectionMetadata& rhs) const {
- return path == rhs.path && id == rhs.id && tokenizer == rhs.tokenizer &&
- term_match_type == rhs.term_match_type;
- }
};
// Section is an icing internal concept similar to document property but with
diff --git a/icing/scoring/bm25f-calculator.cc b/icing/scoring/bm25f-calculator.cc
index 28d385e..4822d7f 100644
--- a/icing/scoring/bm25f-calculator.cc
+++ b/icing/scoring/bm25f-calculator.cc
@@ -26,7 +26,6 @@
#include "icing/store/corpus-associated-scoring-data.h"
#include "icing/store/corpus-id.h"
#include "icing/store/document-associated-score-data.h"
-#include "icing/store/document-filter-data.h"
#include "icing/store/document-id.h"
namespace icing {
@@ -43,11 +42,8 @@ constexpr float k1_ = 1.2f;
constexpr float b_ = 0.7f;
// TODO(b/158603900): add tests for Bm25fCalculator
-Bm25fCalculator::Bm25fCalculator(
- const DocumentStore* document_store,
- std::unique_ptr<SectionWeights> section_weights)
- : document_store_(document_store),
- section_weights_(std::move(section_weights)) {}
+Bm25fCalculator::Bm25fCalculator(const DocumentStore* document_store)
+ : document_store_(document_store) {}
// During initialization, Bm25fCalculator iterates through
// hit-iterators for each query term to pre-compute n(q_i) for each corpus under
@@ -125,9 +121,9 @@ float Bm25fCalculator::ComputeScore(const DocHitInfoIterator* query_it,
// Compute inverse document frequency (IDF) weight for query term in the given
// corpus, and cache it in the map.
//
-//                    N - n(q_i) + 0.5
-// IDF(q_i) = ln(1 + ------------------)
-//                      n(q_i) + 0.5
+//                     N - n(q_i) + 0.5
+// IDF(q_i) = log(1 + ------------------)
+//                       n(q_i) + 0.5
//
// where N is the number of documents in the corpus, and n(q_i) is the number
// of documents in the corpus containing the query term q_i.
@@ -153,7 +149,7 @@ float Bm25fCalculator::GetCorpusIdfWeightForTerm(std::string_view term,
uint32_t num_docs = csdata.num_docs();
uint32_t nqi = corpus_nqi_map_[corpus_term_info.value];
float idf =
- nqi != 0 ? log(1.0f + (num_docs - nqi + 0.5f) / (nqi + 0.5f)) : 0.0f;
+ nqi != 0 ? log(1.0f + (num_docs - nqi + 0.5f) / (nqi + 0.5f)) : 0.0f;
corpus_idf_map_.insert({corpus_term_info.value, idf});
ICING_VLOG(1) << IcingStringUtil::StringPrintf(
"corpus_id:%d term:%s N:%d nqi:%d idf:%f", corpus_id,
@@ -162,11 +158,6 @@ float Bm25fCalculator::GetCorpusIdfWeightForTerm(std::string_view term,
}
// Get per corpus average document length and cache the result in the map.
-// The average doc length is calculated as:
-//
-//                    total_tokens_in_corpus
-// Avg Doc Length = -------------------------
-//                    num_docs_in_corpus + 1
float Bm25fCalculator::GetCorpusAvgDocLength(CorpusId corpus_id) {
auto iter = corpus_avgdl_map_.find(corpus_id);
if (iter != corpus_avgdl_map_.end()) {
@@ -200,8 +191,8 @@ float Bm25fCalculator::ComputedNormalizedTermFrequency(
const DocumentAssociatedScoreData& data) {
uint32_t dl = data.length_in_tokens();
float avgdl = GetCorpusAvgDocLength(data.corpus_id());
- float f_q = ComputeTermFrequencyForMatchedSections(
- data.corpus_id(), term_match_info, hit_info.document_id());
+ float f_q =
+ ComputeTermFrequencyForMatchedSections(data.corpus_id(), term_match_info);
float normalized_tf =
f_q * (k1_ + 1) / (f_q + k1_ * (1 - b_ + b_ * dl / avgdl));
@@ -211,41 +202,23 @@ float Bm25fCalculator::ComputedNormalizedTermFrequency(
return normalized_tf;
}
+// Note: once we support section weights, we should update this function to
+// compute the weighted term frequency.
float Bm25fCalculator::ComputeTermFrequencyForMatchedSections(
- CorpusId corpus_id, const TermMatchInfo& term_match_info,
- DocumentId document_id) const {
+ CorpusId corpus_id, const TermMatchInfo& term_match_info) const {
float sum = 0.0f;
SectionIdMask sections = term_match_info.section_ids_mask;
- SchemaTypeId schema_type_id = GetSchemaTypeId(document_id);
-
while (sections != 0) {
SectionId section_id = __builtin_ctz(sections);
sections &= ~(1u << section_id);
Hit::TermFrequency tf = term_match_info.term_frequencies[section_id];
- double weighted_tf = tf * section_weights_->GetNormalizedSectionWeight(
- schema_type_id, section_id);
if (tf != Hit::kNoTermFrequency) {
- sum += weighted_tf;
+ sum += tf;
}
}
return sum;
}
-SchemaTypeId Bm25fCalculator::GetSchemaTypeId(DocumentId document_id) const {
- auto filter_data_or = document_store_->GetDocumentFilterData(document_id);
- if (!filter_data_or.ok()) {
- // This should never happen. The only failure case for
- // GetDocumentFilterData is if the document_id is outside of the range of
- // allocated document_ids, which shouldn't be possible since we're getting
- // this document_id from the posting lists.
- ICING_LOG(WARNING) << IcingStringUtil::StringPrintf(
- "No document filter data for document [%d]", document_id);
- return kInvalidSchemaTypeId;
- }
- DocumentFilterData data = filter_data_or.ValueOrDie();
- return data.schema_type_id();
-}
-
} // namespace lib
} // namespace icing
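
For concreteness, a self-contained numeric sketch of the two formulas used above, plugging in this file's constants (k1_ = 1.2f, b_ = 0.7f); the corpus numbers are made up for illustration:

    #include <cmath>
    #include <cstdio>

    int main() {
      const float k1 = 1.2f, b = 0.7f;
      const float N = 1000.0f;    // documents in the corpus
      const float nqi = 10.0f;    // documents containing term q_i
      const float dl = 50.0f;     // this document's length in tokens
      const float avgdl = 40.0f;  // average document length in the corpus
      const float f_q = 3.0f;     // term frequency over matched sections

      // IDF(q_i) = log(1 + (N - n(q_i) + 0.5) / (n(q_i) + 0.5))
      const float idf = std::log(1.0f + (N - nqi + 0.5f) / (nqi + 0.5f));

      // Term frequency, normalized by document length relative to the corpus.
      const float normalized_tf =
          f_q * (k1 + 1) / (f_q + k1 * (1 - b + b * dl / avgdl));

      // ~4.56 * ~1.50: rarer terms and shorter documents score higher.
      std::printf("idf=%.3f tf=%.3f contribution=%.3f\n", idf, normalized_tf,
                  idf * normalized_tf);
      return 0;
    }
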
diff --git a/icing/scoring/bm25f-calculator.h b/icing/scoring/bm25f-calculator.h
index 05009d8..91b4f24 100644
--- a/icing/scoring/bm25f-calculator.h
+++ b/icing/scoring/bm25f-calculator.h
@@ -22,7 +22,6 @@
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/legacy/index/icing-bit-util.h"
-#include "icing/scoring/section-weights.h"
#include "icing/store/corpus-id.h"
#include "icing/store/document-store.h"
@@ -63,8 +62,7 @@ namespace lib {
// see: glossary/bm25
class Bm25fCalculator {
public:
- explicit Bm25fCalculator(const DocumentStore *document_store_,
- std::unique_ptr<SectionWeights> section_weights_);
+ explicit Bm25fCalculator(const DocumentStore *document_store_);
// Precompute and cache statistics relevant to BM25F.
// Populates term_id_map_ and corpus_nqi_map_ for use while scoring other
@@ -110,43 +108,18 @@ class Bm25fCalculator {
}
};
- // Returns idf weight for the term and provided corpus.
float GetCorpusIdfWeightForTerm(std::string_view term, CorpusId corpus_id);
-
- // Returns the average document length for the corpus. The average is
- // calculated as the sum of tokens in the corpus' documents over the total
- // number of documents plus one.
float GetCorpusAvgDocLength(CorpusId corpus_id);
-
- // Returns the normalized term frequency for the term match and document hit.
- // This normalizes the term frequency by applying smoothing parameters and
- // factoring document length.
float ComputedNormalizedTermFrequency(
const TermMatchInfo &term_match_info, const DocHitInfo &hit_info,
const DocumentAssociatedScoreData &data);
-
- // Returns the weighted term frequency for the term match and document. For
- // each section the term is present, we scale the term frequency by its
- // section weight. We return the sum of the weighted term frequencies over all
- // sections.
float ComputeTermFrequencyForMatchedSections(
- CorpusId corpus_id, const TermMatchInfo &term_match_info,
- DocumentId document_id) const;
+ CorpusId corpus_id, const TermMatchInfo &term_match_info) const;
- // Returns the schema type id for the document by retrieving it from the
- // DocumentFilterData.
- SchemaTypeId GetSchemaTypeId(DocumentId document_id) const;
-
- // Clears cached scoring data and prepares the calculator for a new scoring
- // run.
void Clear();
const DocumentStore *document_store_; // Does not own.
- // Used for accessing normalized section weights when computing the weighted
- // term frequency.
- std::unique_ptr<SectionWeights> section_weights_;
-
// Map from query term to compact term ID.
// Necessary as a key to the other maps.
// The use of the string_view as key here means that the query_term_iterators
@@ -157,6 +130,7 @@ class Bm25fCalculator {
// Necessary to calculate the normalized term frequency.
// This information is cached in the DocumentStore::CorpusScoreCache
std::unordered_map<CorpusId, float> corpus_avgdl_map_;
+
// Map from <corpus ID, term ID> to number of documents containing term q_i,
// called n(q_i).
// Necessary to calculate IDF(q_i) (inverse document frequency).
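
The per-corpus maps above all follow the compute-once pattern that GetCorpusAvgDocLength shows in the .cc hunk (look up, compute on miss, insert), and Clear() wipes them between scoring runs. A generic, self-contained sketch of that pattern; ComputeAvgDocLength is a stand-in for the real DocumentStore lookup:

    #include <unordered_map>

    using CorpusId = int;  // stand-in for icing's CorpusId

    float ComputeAvgDocLength(CorpusId /*corpus_id*/) {
      return 40.0f;  // placeholder for the real DocumentStore lookup
    }

    float GetCorpusAvgDocLengthCached(
        CorpusId corpus_id, std::unordered_map<CorpusId, float>& cache) {
      auto it = cache.find(corpus_id);
      if (it != cache.end()) {
        return it->second;  // computed earlier in this scoring run
      }
      const float avgdl = ComputeAvgDocLength(corpus_id);
      cache.emplace(corpus_id, avgdl);
      return avgdl;
    }
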
diff --git a/icing/scoring/ranker.cc b/icing/scoring/ranker.cc
index 117f44c..fecee82 100644
--- a/icing/scoring/ranker.cc
+++ b/icing/scoring/ranker.cc
@@ -32,7 +32,6 @@ namespace {
// Helper function to wrap the heapify algorithm, it heapifies the target
// subtree node in place.
-// TODO(b/152934343) refactor the heapify function and make it into a class.
void Heapify(
std::vector<ScoredDocumentHit>* scored_document_hits,
int target_subtree_root_index,
@@ -72,80 +71,6 @@ void Heapify(
}
}
-// Heapify the given term vector from top to bottom. Call it after adding or
-// replacing an element at the front of the vector.
-void HeapifyTermDown(std::vector<TermMetadata>& scored_terms,
- int target_subtree_root_index) {
- int heap_size = scored_terms.size();
- if (target_subtree_root_index >= heap_size) {
- return;
- }
-
- // Initializes subtree root as the current minimum node.
- int min = target_subtree_root_index;
- // If we represent a heap in an array/vector, indices of left and right
- // children can be calculated as such.
- const int left = target_subtree_root_index * 2 + 1;
- const int right = target_subtree_root_index * 2 + 2;
-
- // If left child is smaller than current minimum.
- if (left < heap_size &&
- scored_terms.at(left).hit_count < scored_terms.at(min).hit_count) {
- min = left;
- }
-
- // If right child is smaller than current minimum.
- if (right < heap_size &&
- scored_terms.at(right).hit_count < scored_terms.at(min).hit_count) {
- min = right;
- }
-
- // If the minimum is not the subtree root, swap and continue heapifying the
- // lower level subtree.
- if (min != target_subtree_root_index) {
- std::swap(scored_terms.at(min),
- scored_terms.at(target_subtree_root_index));
- HeapifyTermDown(scored_terms, min);
- }
-}
-
-// Heapify the given term vector from bottom to top. Call it after adding an
-// element at the end of the vector.
-void HeapifyTermUp(std::vector<TermMetadata>& scored_terms,
- int target_subtree_child_index) {
- // If we represent a heap in an array/vector, indices of root can be
- // calculated as such.
- const int root = (target_subtree_child_index + 1) / 2 - 1;
-
- // If the current child is smaller than the root, swap and continue heapifying
- // the upper level subtree
- if (root >= 0 && scored_terms.at(target_subtree_child_index).hit_count <
- scored_terms.at(root).hit_count) {
- std::swap(scored_terms.at(root),
- scored_terms.at(target_subtree_child_index));
- HeapifyTermUp(scored_terms, root);
- }
-}
-
-TermMetadata PopRootTerm(std::vector<TermMetadata>& scored_terms) {
- if (scored_terms.empty()) {
- // Return an invalid TermMetadata as a sentinel value.
- return TermMetadata(/*content_in=*/"", /*hit_count_in=*/-1);
- }
-
- // Steps to extract root from heap:
- // 1. copy out root
- TermMetadata root = scored_terms.at(0);
- const size_t last_node_index = scored_terms.size() - 1;
- // 2. swap root and the last node
- std::swap(scored_terms.at(0), scored_terms.at(last_node_index));
- // 3. remove last node
- scored_terms.pop_back();
- // 4. heapify root
- HeapifyTermDown(scored_terms, /*target_subtree_root_index=*/0);
- return root;
-}
-
// Helper function to extract the root from the heap. The heap structure will be
// maintained.
//
@@ -190,19 +115,6 @@ void BuildHeapInPlace(
}
}
-void PushToTermHeap(TermMetadata term, int number_to_return,
- std::vector<TermMetadata>& scored_terms_heap) {
- if (scored_terms_heap.size() < number_to_return) {
- scored_terms_heap.push_back(std::move(term));
- // We insert at end, so we should heapify bottom up.
- HeapifyTermUp(scored_terms_heap, scored_terms_heap.size() - 1);
- } else if (scored_terms_heap.at(0).hit_count < term.hit_count) {
- scored_terms_heap.at(0) = std::move(term);
- // We insert at root, so we should heapify top down.
- HeapifyTermDown(scored_terms_heap, /*target_subtree_root_index=*/0);
- }
-}
-
std::vector<ScoredDocumentHit> PopTopResultsFromHeap(
std::vector<ScoredDocumentHit>* scored_document_hits_heap, int num_results,
const ScoredDocumentHitComparator& scored_document_hit_comparator) {
@@ -222,15 +134,5 @@ std::vector<ScoredDocumentHit> PopTopResultsFromHeap(
return scored_document_hit_result;
}
-std::vector<TermMetadata> PopAllTermsFromHeap(
- std::vector<TermMetadata>& scored_terms_heap) {
- std::vector<TermMetadata> top_term_result;
- top_term_result.reserve(scored_terms_heap.size());
- while (!scored_terms_heap.empty()) {
- top_term_result.push_back(PopRootTerm(scored_terms_heap));
- }
- return top_term_result;
-}
-
} // namespace lib
} // namespace icing
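
The deleted helpers hand-rolled a bounded min-heap for collecting the top K suggestion terms: push while under K, otherwise replace the root only when the new term beats it, so each operation is O(log K) and draining the heap yields terms in increasing hit_count order. The same technique via std::priority_queue, as a self-contained sketch (TermMetadata is a stand-in for icing's struct):

    #include <queue>
    #include <string>
    #include <utility>
    #include <vector>

    struct TermMetadata {
      std::string content;
      int hit_count;
    };

    // greater-than ordering turns std::priority_queue into a min-heap.
    struct ByHitCountGreater {
      bool operator()(const TermMetadata& a, const TermMetadata& b) const {
        return a.hit_count > b.hit_count;
      }
    };

    using TermHeap = std::priority_queue<TermMetadata, std::vector<TermMetadata>,
                                         ByHitCountGreater>;

    void PushToTermHeap(TermMetadata term, int number_to_return,
                        TermHeap& heap) {
      if (static_cast<int>(heap.size()) < number_to_return) {
        heap.push(std::move(term));  // still filling up to K
      } else if (!heap.empty() && heap.top().hit_count < term.hit_count) {
        heap.pop();                  // evict the current minimum
        heap.push(std::move(term));
      }
    }

    std::vector<TermMetadata> PopAllTermsFromHeap(TermHeap& heap) {
      std::vector<TermMetadata> out;
      out.reserve(heap.size());
      while (!heap.empty()) {
        out.push_back(heap.top());  // ascending hit_count, smallest first
        heap.pop();
      }
      return out;
    }
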
diff --git a/icing/scoring/ranker.h b/icing/scoring/ranker.h
index 81838f3..785c133 100644
--- a/icing/scoring/ranker.h
+++ b/icing/scoring/ranker.h
@@ -17,7 +17,6 @@
#include <vector>
-#include "icing/index/term-metadata.h"
#include "icing/scoring/scored-document-hit.h"
// Provides functionality to get the top N results from an unsorted vector.
@@ -40,18 +39,6 @@ std::vector<ScoredDocumentHit> PopTopResultsFromHeap(
std::vector<ScoredDocumentHit>* scored_document_hits_heap, int num_results,
const ScoredDocumentHitComparator& scored_document_hit_comparator);
-// The heap is a min-heap, so we can avoid some push operations by comparing
-// against the root term and only pushing if greater than the root. The time
-// complexity of a single push is O(lg K), where K is the number_to_return.
-// REQUIRED: scored_terms_heap is not null.
-void PushToTermHeap(TermMetadata term, int number_to_return,
- std::vector<TermMetadata>& scored_terms_heap);
-
-// Returns all terms from the given term heap. Since the heap is a min-heap,
-// the output vector will be in increasing order.
-// REQUIRED: scored_terms_heap is not null.
-std::vector<TermMetadata> PopAllTermsFromHeap(
- std::vector<TermMetadata>& scored_terms_heap);
} // namespace lib
} // namespace icing
diff --git a/icing/scoring/score-and-rank_benchmark.cc b/icing/scoring/score-and-rank_benchmark.cc
index cc1d995..e940e98 100644
--- a/icing/scoring/score-and-rank_benchmark.cc
+++ b/icing/scoring/score-and-rank_benchmark.cc
@@ -117,8 +117,7 @@ void BM_ScoreAndRankDocumentHitsByDocumentScore(benchmark::State& state) {
scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(scoring_spec, document_store.get(),
- schema_store.get()));
+ ScoringProcessor::Create(scoring_spec, document_store.get()));
int num_to_score = state.range(0);
int num_of_documents = state.range(1);
@@ -221,8 +220,7 @@ void BM_ScoreAndRankDocumentHitsByCreationTime(benchmark::State& state) {
ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(scoring_spec, document_store.get(),
- schema_store.get()));
+ ScoringProcessor::Create(scoring_spec, document_store.get()));
int num_to_score = state.range(0);
int num_of_documents = state.range(1);
@@ -324,8 +322,7 @@ void BM_ScoreAndRankDocumentHitsNoScoring(benchmark::State& state) {
scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::NONE);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(scoring_spec, document_store.get(),
- schema_store.get()));
+ ScoringProcessor::Create(scoring_spec, document_store.get()));
int num_to_score = state.range(0);
int num_of_documents = state.range(1);
@@ -393,122 +390,6 @@ BENCHMARK(BM_ScoreAndRankDocumentHitsNoScoring)
->ArgPair(10000, 18000)
->ArgPair(10000, 20000);
-void BM_ScoreAndRankDocumentHitsByRelevanceScoring(benchmark::State& state) {
- const std::string base_dir = GetTestTempDir() + "/score_and_rank_benchmark";
- const std::string document_store_dir = base_dir + "/document_store";
- const std::string schema_store_dir = base_dir + "/schema_store";
-
- // Creates file directories
- Filesystem filesystem;
- filesystem.DeleteDirectoryRecursively(base_dir.c_str());
- filesystem.CreateDirectoryRecursively(document_store_dir.c_str());
- filesystem.CreateDirectoryRecursively(schema_store_dir.c_str());
-
- Clock clock;
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem, base_dir, &clock));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentStore::CreateResult create_result,
- DocumentStore::Create(&filesystem, document_store_dir, &clock,
- schema_store.get()));
- std::unique_ptr<DocumentStore> document_store =
- std::move(create_result.document_store);
-
- ICING_ASSERT_OK(schema_store->SetSchema(CreateSchemaWithEmailType()));
-
- ScoringSpecProto scoring_spec;
- scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE);
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(scoring_spec, document_store.get(),
- schema_store.get()));
-
- int num_to_score = state.range(0);
- int num_of_documents = state.range(1);
-
- std::mt19937 random_generator;
- std::uniform_int_distribution<int> distribution(
- 1, std::numeric_limits<int>::max());
-
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
-
- // Puts documents into document store
- std::vector<DocHitInfo> doc_hit_infos;
- for (int i = 0; i < num_of_documents; i++) {
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentId document_id,
- document_store->Put(CreateEmailDocument(
- /*id=*/i, /*document_score=*/1,
- /*creation_timestamp_ms=*/1),
- /*num_tokens=*/10));
- DocHitInfo doc_hit = DocHitInfo(document_id, section_id_mask);
- // Set five matches for term "foo" for each document hit.
- doc_hit.UpdateSection(section_id, /*hit_term_frequency=*/5);
- doc_hit_infos.push_back(doc_hit);
- }
-
- ScoredDocumentHitComparator scored_document_hit_comparator(
- /*is_descending=*/true);
-
- for (auto _ : state) {
- // Creates a dummy DocHitInfoIterator with results, we need to pause the
- // timer here so that the cost of copying test data is not included.
- state.PauseTiming();
- std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
- // Create a query term iterator that assigns the document hits to term
- // "foo".
- std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
- query_term_iterators;
- query_term_iterators["foo"] =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
- state.ResumeTiming();
-
- std::vector<ScoredDocumentHit> scored_document_hits =
- scoring_processor->Score(std::move(doc_hit_info_iterator), num_to_score,
- &query_term_iterators);
-
- BuildHeapInPlace(&scored_document_hits, scored_document_hit_comparator);
- // Ranks and gets the first page, 20 is a common page size
- std::vector<ScoredDocumentHit> results =
- PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/20,
- scored_document_hit_comparator);
- }
-
- // Clean up
- document_store.reset();
- schema_store.reset();
- filesystem.DeleteDirectoryRecursively(base_dir.c_str());
-}
-BENCHMARK(BM_ScoreAndRankDocumentHitsByRelevanceScoring)
- // num_to_score, num_of_documents in document store
- ->ArgPair(1000, 30000)
- ->ArgPair(3000, 30000)
- ->ArgPair(5000, 30000)
- ->ArgPair(7000, 30000)
- ->ArgPair(9000, 30000)
- ->ArgPair(11000, 30000)
- ->ArgPair(13000, 30000)
- ->ArgPair(15000, 30000)
- ->ArgPair(17000, 30000)
- ->ArgPair(19000, 30000)
- ->ArgPair(21000, 30000)
- ->ArgPair(23000, 30000)
- ->ArgPair(25000, 30000)
- ->ArgPair(27000, 30000)
- ->ArgPair(29000, 30000)
- // Starting from this line, we're trying to see if num_of_documents affects
- // performance
- ->ArgPair(10000, 10000)
- ->ArgPair(10000, 12000)
- ->ArgPair(10000, 14000)
- ->ArgPair(10000, 16000)
- ->ArgPair(10000, 18000)
- ->ArgPair(10000, 20000);
-
} // namespace
} // namespace lib
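
The deleted relevance benchmark leaned on Google Benchmark's PauseTiming()/ResumeTiming() so that per-iteration setup (copying test data into fresh dummy iterators) stayed out of the measured region. A minimal, self-contained example of that pattern; the vector copy stands in for the iterator construction:

    #include <vector>
    #include "benchmark/benchmark.h"

    static void BM_PauseResumeExample(benchmark::State& state) {
      const std::vector<int> source(state.range(0), 42);
      for (auto _ : state) {
        state.PauseTiming();
        std::vector<int> working_copy = source;  // setup: excluded from timing
        state.ResumeTiming();
        long long sum = 0;                       // measured work begins here
        for (int v : working_copy) sum += v;
        benchmark::DoNotOptimize(sum);
      }
    }
    BENCHMARK(BM_PauseResumeExample)->Arg(1000)->Arg(100000);
    BENCHMARK_MAIN();
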
diff --git a/icing/scoring/scorer.cc b/icing/scoring/scorer.cc
index 5f33e66..a4734b4 100644
--- a/icing/scoring/scorer.cc
+++ b/icing/scoring/scorer.cc
@@ -22,7 +22,6 @@
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/proto/scoring.pb.h"
#include "icing/scoring/bm25f-calculator.h"
-#include "icing/scoring/section-weights.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
#include "icing/util/status-macros.h"
@@ -157,12 +156,11 @@ class NoScorer : public Scorer {
};
libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Scorer::Create(
- const ScoringSpecProto& scoring_spec, double default_score,
- const DocumentStore* document_store, const SchemaStore* schema_store) {
+ ScoringSpecProto::RankingStrategy::Code rank_by, double default_score,
+ const DocumentStore* document_store) {
ICING_RETURN_ERROR_IF_NULL(document_store);
- ICING_RETURN_ERROR_IF_NULL(schema_store);
- switch (scoring_spec.rank_by()) {
+ switch (rank_by) {
case ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE:
return std::make_unique<DocumentScoreScorer>(document_store,
default_score);
@@ -170,12 +168,7 @@ libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Scorer::Create(
return std::make_unique<DocumentCreationTimestampScorer>(document_store,
default_score);
case ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE: {
- ICING_ASSIGN_OR_RETURN(
- std::unique_ptr<SectionWeights> section_weights,
- SectionWeights::Create(schema_store, scoring_spec));
-
- auto bm25f_calculator = std::make_unique<Bm25fCalculator>(
- document_store, std::move(section_weights));
+ auto bm25f_calculator = std::make_unique<Bm25fCalculator>(document_store);
return std::make_unique<RelevanceScoreScorer>(std::move(bm25f_calculator),
default_score);
}
@@ -190,8 +183,8 @@ libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Scorer::Create(
case ScoringSpecProto::RankingStrategy::USAGE_TYPE2_LAST_USED_TIMESTAMP:
[[fallthrough]];
case ScoringSpecProto::RankingStrategy::USAGE_TYPE3_LAST_USED_TIMESTAMP:
- return std::make_unique<UsageScorer>(
- document_store, scoring_spec.rank_by(), default_score);
+ return std::make_unique<UsageScorer>(document_store, rank_by,
+ default_score);
case ScoringSpecProto::RankingStrategy::NONE:
return std::make_unique<NoScorer>(default_score);
}
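
Call sites adapt by pulling the strategy out of the spec themselves. A sketch of the updated call, assuming it sits inside a function that returns a StatusOr (so ICING_ASSIGN_OR_RETURN applies) and that document_store is a valid, initialized pointer:

    ICING_ASSIGN_OR_RETURN(
        std::unique_ptr<Scorer> scorer,
        Scorer::Create(scoring_spec.rank_by(),
                       /*default_score=*/0, document_store));
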
diff --git a/icing/scoring/scorer.h b/icing/scoring/scorer.h
index abdd5ca..a22db0f 100644
--- a/icing/scoring/scorer.h
+++ b/icing/scoring/scorer.h
@@ -43,8 +43,8 @@ class Scorer {
// FAILED_PRECONDITION on any null pointer input
// INVALID_ARGUMENT if fails to create an instance
static libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Create(
- const ScoringSpecProto& scoring_spec, double default_score,
- const DocumentStore* document_store, const SchemaStore* schema_store);
+ ScoringSpecProto::RankingStrategy::Code rank_by, double default_score,
+ const DocumentStore* document_store);
// Returns a non-negative score of a document. The score can be a
// document-associated score which comes from the DocumentProto directly, an
diff --git a/icing/scoring/scorer_test.cc b/icing/scoring/scorer_test.cc
index fef612d..8b89514 100644
--- a/icing/scoring/scorer_test.cc
+++ b/icing/scoring/scorer_test.cc
@@ -27,7 +27,6 @@
#include "icing/proto/scoring.pb.h"
#include "icing/schema-builder.h"
#include "icing/schema/schema-store.h"
-#include "icing/scoring/section-weights.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
#include "icing/testing/common-matchers.h"
@@ -40,11 +39,11 @@ namespace lib {
namespace {
using ::testing::Eq;
-constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
- PropertyConfigProto::DataType::STRING;
+constexpr PropertyConfigProto_DataType_Code TYPE_STRING =
+ PropertyConfigProto_DataType_Code_STRING;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
- PropertyConfigProto::Cardinality::REQUIRED;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED =
+ PropertyConfigProto_Cardinality_Code_REQUIRED;
class ScorerTest : public testing::Test {
protected:
@@ -92,8 +91,6 @@ class ScorerTest : public testing::Test {
DocumentStore* document_store() { return document_store_.get(); }
- SchemaStore* schema_store() { return schema_store_.get(); }
-
const FakeClock& fake_clock1() { return fake_clock1_; }
const FakeClock& fake_clock2() { return fake_clock2_; }
@@ -124,37 +121,17 @@ UsageReport CreateUsageReport(std::string name_space, std::string uri,
return usage_report;
}
-ScoringSpecProto CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::Code ranking_strategy) {
- ScoringSpecProto scoring_spec;
- scoring_spec.set_rank_by(ranking_strategy);
- return scoring_spec;
-}
-
-TEST_F(ScorerTest, CreationWithNullDocumentStoreShouldFail) {
- EXPECT_THAT(
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE),
- /*default_score=*/0, /*document_store=*/nullptr,
- schema_store()),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
-}
-
-TEST_F(ScorerTest, CreationWithNullSchemaStoreShouldFail) {
- EXPECT_THAT(
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE),
- /*default_score=*/0, document_store(),
- /*schema_store=*/nullptr),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+TEST_F(ScorerTest, CreationWithNullPointerShouldFail) {
+ EXPECT_THAT(Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
+ /*default_score=*/0, /*document_store=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentDoesntExist) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE),
- /*default_score=*/10, document_store(), schema_store()));
+ Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
+ /*default_score=*/10, document_store()));
// Non existent document id
DocHitInfo docHitInfo = DocHitInfo(/*document_id_in=*/1);
@@ -176,9 +153,8 @@ TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentIsDeleted) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE),
- /*default_score=*/10, document_store(), schema_store()));
+ Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
+ /*default_score=*/10, document_store()));
DocHitInfo docHitInfo = DocHitInfo(document_id);
@@ -209,9 +185,8 @@ TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentIsExpired) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE),
- /*default_score=*/10, document_store(), schema_store()));
+ Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
+ /*default_score=*/10, document_store()));
DocHitInfo docHitInfo = DocHitInfo(document_id);
@@ -238,9 +213,8 @@ TEST_F(ScorerTest, ShouldGetDefaultDocumentScore) {
document_store()->Put(test_document));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE),
- /*default_score=*/10, document_store(), schema_store()));
+ Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
+ /*default_score=*/10, document_store()));
DocHitInfo docHitInfo = DocHitInfo(document_id);
EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(0));
@@ -261,9 +235,8 @@ TEST_F(ScorerTest, ShouldGetCorrectDocumentScore) {
document_store()->Put(test_document));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
+ /*default_score=*/0, document_store()));
DocHitInfo docHitInfo = DocHitInfo(document_id);
EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(5));
@@ -286,9 +259,8 @@ TEST_F(ScorerTest, QueryIteratorNullRelevanceScoreShouldReturnDefaultScore) {
document_store()->Put(test_document));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE),
- /*default_score=*/10, document_store(), schema_store()));
+ Scorer::Create(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ /*default_score=*/10, document_store()));
DocHitInfo docHitInfo = DocHitInfo(document_id);
EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10));
@@ -318,9 +290,8 @@ TEST_F(ScorerTest, ShouldGetCorrectCreationTimestampScore) {
document_store()->Put(test_document2));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP,
+ /*default_score=*/0, document_store()));
DocHitInfo docHitInfo1 = DocHitInfo(document_id1);
DocHitInfo docHitInfo2 = DocHitInfo(document_id2);
@@ -345,19 +316,16 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageCountScoreForType1) {
// Create 3 scorers for 3 different usage types.
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer1,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT,
+ /*default_score=*/0, document_store()));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer2,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT,
+ /*default_score=*/0, document_store()));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer3,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT,
+ /*default_score=*/0, document_store()));
DocHitInfo docHitInfo = DocHitInfo(document_id);
EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
@@ -389,19 +357,16 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageCountScoreForType2) {
// Create 3 scorers for 3 different usage types.
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer1,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT,
+ /*default_score=*/0, document_store()));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer2,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT,
+ /*default_score=*/0, document_store()));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer3,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT,
+ /*default_score=*/0, document_store()));
DocHitInfo docHitInfo = DocHitInfo(document_id);
EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
@@ -433,19 +398,16 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageCountScoreForType3) {
// Create 3 scorers for 3 different usage types.
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer1,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT,
+ /*default_score=*/0, document_store()));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer2,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT,
+ /*default_score=*/0, document_store()));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer3,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT,
+ /*default_score=*/0, document_store()));
DocHitInfo docHitInfo = DocHitInfo(document_id);
EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
@@ -477,22 +439,19 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType1) {
// Create 3 scorers for 3 different usage types.
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer1,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::
- USAGE_TYPE1_LAST_USED_TIMESTAMP),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP,
+ /*default_score=*/0, document_store()));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer2,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::
- USAGE_TYPE2_LAST_USED_TIMESTAMP),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE2_LAST_USED_TIMESTAMP,
+ /*default_score=*/0, document_store()));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer3,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::
- USAGE_TYPE3_LAST_USED_TIMESTAMP),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE3_LAST_USED_TIMESTAMP,
+ /*default_score=*/0, document_store()));
DocHitInfo docHitInfo = DocHitInfo(document_id);
EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
@@ -540,22 +499,19 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType2) {
// Create 3 scorers for 3 different usage types.
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer1,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::
- USAGE_TYPE1_LAST_USED_TIMESTAMP),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP,
+ /*default_score=*/0, document_store()));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer2,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::
- USAGE_TYPE2_LAST_USED_TIMESTAMP),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE2_LAST_USED_TIMESTAMP,
+ /*default_score=*/0, document_store()));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer3,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::
- USAGE_TYPE3_LAST_USED_TIMESTAMP),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE3_LAST_USED_TIMESTAMP,
+ /*default_score=*/0, document_store()));
DocHitInfo docHitInfo = DocHitInfo(document_id);
EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
@@ -603,22 +559,19 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType3) {
// Create 3 scorers for 3 different usage types.
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer1,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::
- USAGE_TYPE1_LAST_USED_TIMESTAMP),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP,
+ /*default_score=*/0, document_store()));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer2,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::
- USAGE_TYPE2_LAST_USED_TIMESTAMP),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE2_LAST_USED_TIMESTAMP,
+ /*default_score=*/0, document_store()));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer3,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::
- USAGE_TYPE3_LAST_USED_TIMESTAMP),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE3_LAST_USED_TIMESTAMP,
+ /*default_score=*/0, document_store()));
DocHitInfo docHitInfo = DocHitInfo(document_id);
EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
@@ -654,9 +607,8 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType3) {
TEST_F(ScorerTest, NoScorerShouldAlwaysReturnDefaultScore) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::NONE),
- /*default_score=*/3, document_store(), schema_store()));
+ Scorer::Create(ScoringSpecProto::RankingStrategy::NONE,
+ /*default_score=*/3, document_store()));
DocHitInfo docHitInfo1 = DocHitInfo(/*document_id_in=*/0);
DocHitInfo docHitInfo2 = DocHitInfo(/*document_id_in=*/1);
@@ -666,10 +618,8 @@ TEST_F(ScorerTest, NoScorerShouldAlwaysReturnDefaultScore) {
EXPECT_THAT(scorer->GetScore(docHitInfo3), Eq(3));
ICING_ASSERT_OK_AND_ASSIGN(
- scorer,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::NONE),
- /*default_score=*/111, document_store(), schema_store()));
+ scorer, Scorer::Create(ScoringSpecProto::RankingStrategy::NONE,
+ /*default_score=*/111, document_store()));
docHitInfo1 = DocHitInfo(/*document_id_in=*/4);
docHitInfo2 = DocHitInfo(/*document_id_in=*/5);
@@ -693,10 +643,9 @@ TEST_F(ScorerTest, ShouldScaleUsageTimestampScoreForMaxTimestamp) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer1,
- Scorer::Create(CreateScoringSpecForRankingStrategy(
- ScoringSpecProto::RankingStrategy::
- USAGE_TYPE1_LAST_USED_TIMESTAMP),
- /*default_score=*/0, document_store(), schema_store()));
+ Scorer::Create(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP,
+ /*default_score=*/0, document_store()));
DocHitInfo docHitInfo = DocHitInfo(document_id);
// Create usage report for the maximum allowable timestamp.
diff --git a/icing/scoring/scoring-processor.cc b/icing/scoring/scoring-processor.cc
index e36f3bb..24480ef 100644
--- a/icing/scoring/scoring-processor.cc
+++ b/icing/scoring/scoring-processor.cc
@@ -39,20 +39,19 @@ constexpr double kDefaultScoreInAscendingOrder =
libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>>
ScoringProcessor::Create(const ScoringSpecProto& scoring_spec,
- const DocumentStore* document_store,
- const SchemaStore* schema_store) {
+ const DocumentStore* document_store) {
ICING_RETURN_ERROR_IF_NULL(document_store);
- ICING_RETURN_ERROR_IF_NULL(schema_store);
bool is_descending_order =
scoring_spec.order_by() == ScoringSpecProto::Order::DESC;
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<Scorer> scorer,
- Scorer::Create(scoring_spec,
+ Scorer::Create(scoring_spec.rank_by(),
is_descending_order ? kDefaultScoreInDescendingOrder
: kDefaultScoreInAscendingOrder,
- document_store, schema_store));
+ document_store));
+
// Using `new` to access a non-public constructor.
return std::unique_ptr<ScoringProcessor>(
new ScoringProcessor(std::move(scorer)));
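With the SchemaStore dependency removed, ScoringProcessor::Create derives everything from the spec itself: rank_by() selects the Scorer, and order_by() selects which default-score constant is used for documents without a score. A short sketch of the resulting call site, assuming a populated DocumentStore* named document_store inside a function that can use ICING_ASSIGN_OR_RETURN (names outside this diff are hypothetical):

  // Illustrative only: build a descending-order, document-score processor.
  ScoringSpecProto spec;
  spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
  spec.set_order_by(ScoringSpecProto::Order::DESC);
  ICING_ASSIGN_OR_RETURN(std::unique_ptr<ScoringProcessor> processor,
                         ScoringProcessor::Create(spec, document_store));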
diff --git a/icing/scoring/scoring-processor.h b/icing/scoring/scoring-processor.h
index e7d09b1..2289605 100644
--- a/icing/scoring/scoring-processor.h
+++ b/icing/scoring/scoring-processor.h
@@ -40,8 +40,8 @@ class ScoringProcessor {
// A ScoringProcessor on success
// FAILED_PRECONDITION on any null pointer input
static libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>> Create(
- const ScoringSpecProto& scoring_spec, const DocumentStore* document_store,
- const SchemaStore* schema_store);
+ const ScoringSpecProto& scoring_spec,
+ const DocumentStore* document_store);
// Assigns scores to DocHitInfos from the given DocHitInfoIterator and returns
// a vector of ScoredDocumentHits. The size of results is no more than
diff --git a/icing/scoring/scoring-processor_test.cc b/icing/scoring/scoring-processor_test.cc
index b42ba31..125e2a7 100644
--- a/icing/scoring/scoring-processor_test.cc
+++ b/icing/scoring/scoring-processor_test.cc
@@ -34,16 +34,14 @@ namespace lib {
namespace {
using ::testing::ElementsAre;
-using ::testing::Eq;
-using ::testing::Gt;
using ::testing::IsEmpty;
using ::testing::SizeIs;
-constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
- PropertyConfigProto::DataType::STRING;
+constexpr PropertyConfigProto_DataType_Code TYPE_STRING =
+ PropertyConfigProto_DataType_Code_STRING;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto_Cardinality_Code_OPTIONAL;
class ScoringProcessorTest : public testing::Test {
protected:
@@ -60,7 +58,7 @@ class ScoringProcessorTest : public testing::Test {
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
ICING_ASSERT_OK_AND_ASSIGN(
DocumentStore::CreateResult create_result,
@@ -71,24 +69,11 @@ class ScoringProcessorTest : public testing::Test {
// Creates a simple email schema
SchemaProto test_email_schema =
SchemaBuilder()
- .AddType(SchemaTypeConfigBuilder()
- .SetType("email")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(
- TermMatchType::PREFIX,
- StringIndexingConfig::TokenizerType::PLAIN)
- .SetDataType(TYPE_STRING)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("body")
- .SetDataTypeString(
- TermMatchType::PREFIX,
- StringIndexingConfig::TokenizerType::PLAIN)
- .SetDataType(TYPE_STRING)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
ICING_ASSERT_OK(schema_store_->SetSchema(test_email_schema));
}
@@ -101,8 +86,6 @@ class ScoringProcessorTest : public testing::Test {
DocumentStore* document_store() { return document_store_.get(); }
- SchemaStore* schema_store() { return schema_store_.get(); }
-
private:
const std::string test_dir_;
const std::string doc_store_dir_;
@@ -156,46 +139,16 @@ UsageReport CreateUsageReport(std::string name_space, std::string uri,
return usage_report;
}
-TypePropertyWeights CreateTypePropertyWeights(
- std::string schema_type, std::vector<PropertyWeight> property_weights) {
- TypePropertyWeights type_property_weights;
- type_property_weights.set_schema_type(std::move(schema_type));
- type_property_weights.mutable_property_weights()->Reserve(
- property_weights.size());
-
- for (PropertyWeight& property_weight : property_weights) {
- *type_property_weights.add_property_weights() = std::move(property_weight);
- }
-
- return type_property_weights;
-}
-
-PropertyWeight CreatePropertyWeight(std::string path, double weight) {
- PropertyWeight property_weight;
- property_weight.set_path(std::move(path));
- property_weight.set_weight(weight);
- return property_weight;
-}
-
-TEST_F(ScoringProcessorTest, CreationWithNullDocumentStoreShouldFail) {
+TEST_F(ScoringProcessorTest, CreationWithNullPointerShouldFail) {
ScoringSpecProto spec_proto;
- EXPECT_THAT(ScoringProcessor::Create(spec_proto, /*document_store=*/nullptr,
- schema_store()),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
-}
-
-TEST_F(ScoringProcessorTest, CreationWithNullSchemaStoreShouldFail) {
- ScoringSpecProto spec_proto;
- EXPECT_THAT(ScoringProcessor::Create(spec_proto, document_store(),
- /*schema_store=*/nullptr),
+ EXPECT_THAT(ScoringProcessor::Create(spec_proto, /*document_store=*/nullptr),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
TEST_F(ScoringProcessorTest, ShouldCreateInstance) {
ScoringSpecProto spec_proto;
spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
- ICING_EXPECT_OK(
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
+ ICING_EXPECT_OK(ScoringProcessor::Create(spec_proto, document_store()));
}
TEST_F(ScoringProcessorTest, ShouldHandleEmptyDocHitIterator) {
@@ -210,7 +163,7 @@ TEST_F(ScoringProcessorTest, ShouldHandleEmptyDocHitIterator) {
// Creates a ScoringProcessor
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
+ ScoringProcessor::Create(spec_proto, document_store()));
EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/5),
@@ -236,7 +189,7 @@ TEST_F(ScoringProcessorTest, ShouldHandleNonPositiveNumToScore) {
// Creates a ScoringProcessor
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
+ ScoringProcessor::Create(spec_proto, document_store()));
EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/-1),
@@ -266,7 +219,7 @@ TEST_F(ScoringProcessorTest, ShouldRespectNumToScore) {
// Creates a ScoringProcessor
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
+ ScoringProcessor::Create(spec_proto, document_store()));
EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/2),
@@ -298,7 +251,7 @@ TEST_F(ScoringProcessorTest, ShouldScoreByDocumentScore) {
// Creates a ScoringProcessor
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
+ ScoringProcessor::Create(spec_proto, document_store()));
EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/3),
@@ -353,7 +306,7 @@ TEST_F(ScoringProcessorTest,
// Creates a ScoringProcessor
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
+ ScoringProcessor::Create(spec_proto, document_store()));
std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
query_term_iterators;
@@ -363,11 +316,11 @@ TEST_F(ScoringProcessorTest,
// the document's length determines the final score. Documents shorter than the
// average corpus length are slightly boosted.
ScoredDocumentHit expected_scored_doc_hit1(document_id1, section_id_mask,
- /*score=*/0.187114);
+ /*score=*/0.255482);
ScoredDocumentHit expected_scored_doc_hit2(document_id2, section_id_mask,
- /*score=*/0.084904);
+ /*score=*/0.115927);
ScoredDocumentHit expected_scored_doc_hit3(document_id3, section_id_mask,
- /*score=*/0.121896);
+ /*score=*/0.166435);
EXPECT_THAT(
scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/3, &query_term_iterators),
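The boost for shorter documents is standard BM25 length normalization at work. In the generic form of the formula (the exact k1 and b constants Icing uses are not visible in this diff, so treat this as the textbook shape, not Icing's implementation):

  score(t, D) = IDF(t) * f(t, D) * (k1 + 1)
                    / (f(t, D) + k1 * (1 - b + b * |D| / avgdl))

When |D| < avgdl the denominator shrinks and the score rises, matching the comment above; when f(t, D) = 0 the numerator is zero, so the hit scores 0.0, which is exactly what the removed zero-frequency test further down asserts.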
@@ -422,7 +375,7 @@ TEST_F(ScoringProcessorTest,
// Creates a ScoringProcessor
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
+ ScoringProcessor::Create(spec_proto, document_store()));
std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
query_term_iterators;
@@ -431,11 +384,11 @@ TEST_F(ScoringProcessorTest,
// Since the three documents all contain the query term "foo" exactly once
// and they have the same length, they will have the same BM25F score.
ScoredDocumentHit expected_scored_doc_hit1(document_id1, section_id_mask,
- /*score=*/0.118455);
+ /*score=*/0.16173716);
ScoredDocumentHit expected_scored_doc_hit2(document_id2, section_id_mask,
- /*score=*/0.118455);
+ /*score=*/0.16173716);
ScoredDocumentHit expected_scored_doc_hit3(document_id3, section_id_mask,
- /*score=*/0.118455);
+ /*score=*/0.16173716);
EXPECT_THAT(
scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/3, &query_term_iterators),
@@ -495,7 +448,7 @@ TEST_F(ScoringProcessorTest,
// Creates a ScoringProcessor
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
+ ScoringProcessor::Create(spec_proto, document_store()));
std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
query_term_iterators;
@@ -504,11 +457,11 @@ TEST_F(ScoringProcessorTest,
// Since the three documents all have the same length, the score is decided by
// the frequency of the query term "foo".
ScoredDocumentHit expected_scored_doc_hit1(document_id1, section_id_mask1,
- /*score=*/0.226674);
+ /*score=*/0.309497);
ScoredDocumentHit expected_scored_doc_hit2(document_id2, section_id_mask2,
- /*score=*/0.118455);
+ /*score=*/0.16173716);
ScoredDocumentHit expected_scored_doc_hit3(document_id3, section_id_mask3,
- /*score=*/0.196720);
+ /*score=*/0.268599);
EXPECT_THAT(
scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/3, &query_term_iterators),
@@ -517,351 +470,6 @@ TEST_F(ScoringProcessorTest,
EqualsScoredDocumentHit(expected_scored_doc_hit3)));
}
-TEST_F(ScoringProcessorTest,
- ShouldScoreByRelevanceScore_HitTermWithZeroFrequency) {
- DocumentProto document1 =
- CreateDocument("icing", "email/1", kDefaultScore,
- /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
-
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentId document_id1,
- document_store()->Put(document1, /*num_tokens=*/10));
-
- // Document 1 contains the term "foo" 0 times in the "subject" property
- DocHitInfo doc_hit_info1(document_id1);
- doc_hit_info1.UpdateSection(/*section_id*/ 0, /*hit_term_frequency=*/0);
-
- // Creates input doc_hit_infos and expected output scored_document_hits
- std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info1};
-
- // Creates a dummy DocHitInfoIterator with 1 result for the query "foo"
- std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
-
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE);
-
- // Creates a ScoringProcessor
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
-
- std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
- query_term_iterators;
- query_term_iterators["foo"] =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
-
- SectionIdMask section_id_mask1 = 0b00000001;
-
- // Since the document hit has zero frequency, expect a score of zero.
- ScoredDocumentHit expected_scored_doc_hit1(document_id1, section_id_mask1,
- /*score=*/0.000000);
- EXPECT_THAT(
- scoring_processor->Score(std::move(doc_hit_info_iterator),
- /*num_to_score=*/1, &query_term_iterators),
- ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit1)));
-}
-
-TEST_F(ScoringProcessorTest,
- ShouldScoreByRelevanceScore_SameHitFrequencyDifferentPropertyWeights) {
- DocumentProto document1 =
- CreateDocument("icing", "email/1", kDefaultScore,
- /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
- DocumentProto document2 =
- CreateDocument("icing", "email/2", kDefaultScore,
- /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
-
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentId document_id1,
- document_store()->Put(document1, /*num_tokens=*/1));
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentId document_id2,
- document_store()->Put(document2, /*num_tokens=*/1));
-
- // Document 1 contains the term "foo" 1 time in the "body" property
- SectionId body_section_id = 0;
- DocHitInfo doc_hit_info1(document_id1);
- doc_hit_info1.UpdateSection(body_section_id, /*hit_term_frequency=*/1);
-
- // Document 2 contains the term "foo" 1 time in the "subject" property
- SectionId subject_section_id = 1;
- DocHitInfo doc_hit_info2(document_id2);
- doc_hit_info2.UpdateSection(subject_section_id, /*hit_term_frequency=*/1);
-
- // Creates input doc_hit_infos and expected output scored_document_hits
- std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info1, doc_hit_info2};
-
- // Creates a dummy DocHitInfoIterator with 2 results for the query "foo"
- std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
-
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE);
-
- PropertyWeight body_property_weight =
- CreatePropertyWeight(/*path=*/"body", /*weight=*/0.5);
- PropertyWeight subject_property_weight =
- CreatePropertyWeight(/*path=*/"subject", /*weight=*/2.0);
- *spec_proto.add_type_property_weights() = CreateTypePropertyWeights(
- /*schema_type=*/"email", {body_property_weight, subject_property_weight});
-
- // Creates a ScoringProcessor
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
-
- std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
- query_term_iterators;
- query_term_iterators["foo"] =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
-
- SectionIdMask body_section_id_mask = 1U << body_section_id;
- SectionIdMask subject_section_id_mask = 1U << subject_section_id;
-
- // We expect document 2 to have a higher score than document 1 as it matches
- // "foo" in the "subject" property, which is weighed higher than the "body"
- // property. Final scores are computed with smoothing applied.
- ScoredDocumentHit expected_scored_doc_hit1(document_id1, body_section_id_mask,
- /*score=*/0.053624);
- ScoredDocumentHit expected_scored_doc_hit2(document_id2,
- subject_section_id_mask,
- /*score=*/0.153094);
- EXPECT_THAT(
- scoring_processor->Score(std::move(doc_hit_info_iterator),
- /*num_to_score=*/2, &query_term_iterators),
- ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit1),
- EqualsScoredDocumentHit(expected_scored_doc_hit2)));
-}
-
-TEST_F(ScoringProcessorTest,
- ShouldScoreByRelevanceScore_WithImplicitPropertyWeight) {
- DocumentProto document1 =
- CreateDocument("icing", "email/1", kDefaultScore,
- /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
- DocumentProto document2 =
- CreateDocument("icing", "email/2", kDefaultScore,
- /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
-
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentId document_id1,
- document_store()->Put(document1, /*num_tokens=*/1));
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentId document_id2,
- document_store()->Put(document2, /*num_tokens=*/1));
-
- // Document 1 contains the term "foo" 1 time in the "body" property
- SectionId body_section_id = 0;
- DocHitInfo doc_hit_info1(document_id1);
- doc_hit_info1.UpdateSection(body_section_id, /*hit_term_frequency=*/1);
-
- // Document 2 contains the term "foo" 1 time in the "subject" property
- SectionId subject_section_id = 1;
- DocHitInfo doc_hit_info2(document_id2);
- doc_hit_info2.UpdateSection(subject_section_id, /*hit_term_frequency=*/1);
-
- // Creates input doc_hit_infos and expected output scored_document_hits
- std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info1, doc_hit_info2};
-
- // Creates a dummy DocHitInfoIterator with 2 results for the query "foo"
- std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
-
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE);
-
- PropertyWeight body_property_weight =
- CreatePropertyWeight(/*path=*/"body", /*weight=*/0.5);
- *spec_proto.add_type_property_weights() = CreateTypePropertyWeights(
- /*schema_type=*/"email", {body_property_weight});
-
- // Creates a ScoringProcessor
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
-
- std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
- query_term_iterators;
- query_term_iterators["foo"] =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
-
- SectionIdMask body_section_id_mask = 1U << body_section_id;
- SectionIdMask subject_section_id_mask = 1U << subject_section_id;
-
- // We expect document 2 to have a higher score than document 1 as it matches
- // "foo" in the "subject" property, which is weighed higher than the "body"
- // property. This is because the "subject" property is implictly given a
- // a weight of 1.0, the default weight value. Final scores are computed with
- // smoothing applied.
- ScoredDocumentHit expected_scored_doc_hit1(document_id1, body_section_id_mask,
- /*score=*/0.094601);
- ScoredDocumentHit expected_scored_doc_hit2(document_id2,
- subject_section_id_mask,
- /*score=*/0.153094);
- EXPECT_THAT(
- scoring_processor->Score(std::move(doc_hit_info_iterator),
- /*num_to_score=*/2, &query_term_iterators),
- ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit1),
- EqualsScoredDocumentHit(expected_scored_doc_hit2)));
-}
-
-TEST_F(ScoringProcessorTest,
- ShouldScoreByRelevanceScore_WithDefaultPropertyWeight) {
- DocumentProto document1 =
- CreateDocument("icing", "email/1", kDefaultScore,
- /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
- DocumentProto document2 =
- CreateDocument("icing", "email/2", kDefaultScore,
- /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
-
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentId document_id1,
- document_store()->Put(document1, /*num_tokens=*/1));
-
- // Document 1 contains the term "foo" 1 time in the "body" property
- SectionId body_section_id = 0;
- DocHitInfo doc_hit_info1(document_id1);
- doc_hit_info1.UpdateSection(body_section_id, /*hit_term_frequency=*/1);
-
- // Creates input doc_hit_infos and expected output scored_document_hits
- std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info1};
-
- // Creates a dummy DocHitInfoIterator with 1 result for the query "foo"
- std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
-
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE);
-
- *spec_proto.add_type_property_weights() =
- CreateTypePropertyWeights(/*schema_type=*/"email", {});
-
- // Creates a ScoringProcessor with no explicit weights set.
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
-
- ScoringSpecProto spec_proto_with_weights;
- spec_proto_with_weights.set_rank_by(
- ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE);
-
- PropertyWeight body_property_weight = CreatePropertyWeight(/*path=*/"body",
- /*weight=*/1.0);
- *spec_proto_with_weights.add_type_property_weights() =
- CreateTypePropertyWeights(/*schema_type=*/"email",
- {body_property_weight});
-
- // Creates a ScoringProcessor with default weight set for "body" property.
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ScoringProcessor> scoring_processor_with_weights,
- ScoringProcessor::Create(spec_proto_with_weights, document_store(),
- schema_store()));
-
- std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
- query_term_iterators;
- query_term_iterators["foo"] =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
-
- // Create a doc hit iterator
- std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
- query_term_iterators_scoring_with_weights;
- query_term_iterators_scoring_with_weights["foo"] =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
-
- SectionIdMask body_section_id_mask = 1U << body_section_id;
-
- // We expect document 1 to have the same score whether a weight is explicitly
-  // set to 1.0 or implicitly scored with the default weight. Final scores are
- // computed with smoothing applied.
- ScoredDocumentHit expected_scored_doc_hit(document_id1, body_section_id_mask,
- /*score=*/0.208191);
- EXPECT_THAT(
- scoring_processor->Score(std::move(doc_hit_info_iterator),
- /*num_to_score=*/1, &query_term_iterators),
- ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit)));
-
- // Restore ownership of doc hit iterator and query term iterator to test.
- doc_hit_info_iterator =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
- query_term_iterators["foo"] =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
-
- EXPECT_THAT(scoring_processor_with_weights->Score(
- std::move(doc_hit_info_iterator),
- /*num_to_score=*/1, &query_term_iterators),
- ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit)));
-}
-
-TEST_F(ScoringProcessorTest,
- ShouldScoreByRelevanceScore_WithZeroPropertyWeight) {
- DocumentProto document1 =
- CreateDocument("icing", "email/1", kDefaultScore,
- /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
- DocumentProto document2 =
- CreateDocument("icing", "email/2", kDefaultScore,
- /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
-
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentId document_id1,
- document_store()->Put(document1, /*num_tokens=*/1));
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentId document_id2,
- document_store()->Put(document2, /*num_tokens=*/1));
-
- // Document 1 contains the term "foo" 1 time in the "body" property
- SectionId body_section_id = 0;
- DocHitInfo doc_hit_info1(document_id1);
- doc_hit_info1.UpdateSection(body_section_id, /*hit_term_frequency=*/1);
-
- // Document 2 contains the term "foo" 1 time in the "subject" property
- SectionId subject_section_id = 1;
- DocHitInfo doc_hit_info2(document_id2);
- doc_hit_info2.UpdateSection(subject_section_id, /*hit_term_frequency=*/1);
-
- // Creates input doc_hit_infos and expected output scored_document_hits
- std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info1, doc_hit_info2};
-
- // Creates a dummy DocHitInfoIterator with 2 results for the query "foo"
- std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
-
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE);
-
- // Sets property weight for "body" to 0.0.
- PropertyWeight body_property_weight =
- CreatePropertyWeight(/*path=*/"body", /*weight=*/0.0);
- // Sets property weight for "subject" to 1.0.
- PropertyWeight subject_property_weight =
- CreatePropertyWeight(/*path=*/"subject", /*weight=*/1.0);
- *spec_proto.add_type_property_weights() = CreateTypePropertyWeights(
- /*schema_type=*/"email", {body_property_weight, subject_property_weight});
-
- // Creates a ScoringProcessor
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
-
- std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
- query_term_iterators;
- query_term_iterators["foo"] =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
-
- std::vector<ScoredDocumentHit> scored_document_hits =
- scoring_processor->Score(std::move(doc_hit_info_iterator),
- /*num_to_score=*/2, &query_term_iterators);
-
- // We expect document1 to have a score of 0.0 as the query term "foo" matches
- // in the "body" property which has a weight of 0.0. This is a result of the
- // weighted term frequency being scaled down to 0.0 for the hit. We expect
- // document2 to have a positive score as the query term "foo" matches in the
- // "subject" property which has a weight of 1.0.
- EXPECT_THAT(scored_document_hits, SizeIs(2));
- EXPECT_THAT(scored_document_hits.at(0).document_id(), Eq(document_id1));
- EXPECT_THAT(scored_document_hits.at(0).score(), Eq(0.0));
- EXPECT_THAT(scored_document_hits.at(1).document_id(), Eq(document_id2));
- EXPECT_THAT(scored_document_hits.at(1).score(), Gt(0.0));
-}
-
TEST_F(ScoringProcessorTest, ShouldScoreByCreationTimestamp) {
DocumentProto document1 =
CreateDocument("icing", "email/1", kDefaultScore,
@@ -901,7 +509,7 @@ TEST_F(ScoringProcessorTest, ShouldScoreByCreationTimestamp) {
// Creates a ScoringProcessor which ranks in descending order
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
+ ScoringProcessor::Create(spec_proto, document_store()));
EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/3),
@@ -961,7 +569,7 @@ TEST_F(ScoringProcessorTest, ShouldScoreByUsageCount) {
// Creates a ScoringProcessor which ranks in descending order
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
+ ScoringProcessor::Create(spec_proto, document_store()));
EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/3),
@@ -1021,7 +629,7 @@ TEST_F(ScoringProcessorTest, ShouldScoreByUsageTimestamp) {
// Creates a ScoringProcessor which ranks in descending order
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
+ ScoringProcessor::Create(spec_proto, document_store()));
EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/3),
@@ -1057,7 +665,7 @@ TEST_F(ScoringProcessorTest, ShouldHandleNoScores) {
// Creates a ScoringProcessor which ranks in descending order
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
+ ScoringProcessor::Create(spec_proto, document_store()));
EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/4),
ElementsAre(EqualsScoredDocumentHit(scored_document_hit_default),
@@ -1106,7 +714,7 @@ TEST_F(ScoringProcessorTest, ShouldWrapResultsWhenNoScoring) {
// Creates a ScoringProcessor which ranks in descending order
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
+ ScoringProcessor::Create(spec_proto, document_store()));
EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/3),
diff --git a/icing/scoring/section-weights.cc b/icing/scoring/section-weights.cc
deleted file mode 100644
index ed7cd5e..0000000
--- a/icing/scoring/section-weights.cc
+++ /dev/null
@@ -1,151 +0,0 @@
-// Copyright (C) 2021 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/scoring/section-weights.h"
-
-#include <cfloat>
-#include <unordered_map>
-#include <utility>
-
-#include "icing/proto/scoring.pb.h"
-#include "icing/schema/section.h"
-#include "icing/util/logging.h"
-
-namespace icing {
-namespace lib {
-
-namespace {
-
-// Normalizes all weights in the map to be in range [0.0, 1.0], where the max
-// weight is normalized to 1.0. In the case that all weights are equal to 0.0,
-// the normalized weight for each will be 0.0.
-inline void NormalizeSectionWeights(
- double max_weight, std::unordered_map<SectionId, double>& section_weights) {
- if (max_weight == 0.0) {
- return;
- }
- for (auto& raw_weight : section_weights) {
- raw_weight.second = raw_weight.second / max_weight;
- }
-}
-} // namespace
-
-libtextclassifier3::StatusOr<std::unique_ptr<SectionWeights>>
-SectionWeights::Create(const SchemaStore* schema_store,
- const ScoringSpecProto& scoring_spec) {
- ICING_RETURN_ERROR_IF_NULL(schema_store);
-
- std::unordered_map<SchemaTypeId, NormalizedSectionWeights>
- schema_property_weight_map;
- for (const TypePropertyWeights& type_property_weights :
- scoring_spec.type_property_weights()) {
- std::string_view schema_type = type_property_weights.schema_type();
- auto schema_type_id_or = schema_store->GetSchemaTypeId(schema_type);
- if (!schema_type_id_or.ok()) {
- ICING_LOG(WARNING) << "No schema type id found for schema type: "
- << schema_type;
- continue;
- }
- SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
- auto section_metadata_list_or =
- schema_store->GetSectionMetadata(schema_type.data());
- if (!section_metadata_list_or.ok()) {
- ICING_LOG(WARNING) << "No metadata found for schema type: "
- << schema_type;
- continue;
- }
-
- const std::vector<SectionMetadata>* metadata_list =
- section_metadata_list_or.ValueOrDie();
-
- std::unordered_map<std::string, double> property_paths_weights;
- for (const PropertyWeight& property_weight :
- type_property_weights.property_weights()) {
- double property_path_weight = property_weight.weight();
-
- // Return error on negative weights.
- if (property_path_weight < 0.0) {
- return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "Property weight for property path \"%s\" is negative. Negative "
- "weights are invalid.",
- property_weight.path().c_str()));
- }
- property_paths_weights.insert(
- {property_weight.path(), property_path_weight});
- }
- NormalizedSectionWeights normalized_section_weights =
- ExtractNormalizedSectionWeights(property_paths_weights, *metadata_list);
-
- schema_property_weight_map.insert(
- {schema_type_id,
- {/*section_weights*/ std::move(
- normalized_section_weights.section_weights),
- /*default_weight*/ normalized_section_weights.default_weight}});
- }
- // Using `new` to access a non-public constructor.
- return std::unique_ptr<SectionWeights>(
- new SectionWeights(std::move(schema_property_weight_map)));
-}
-
-double SectionWeights::GetNormalizedSectionWeight(SchemaTypeId schema_type_id,
- SectionId section_id) const {
- auto schema_type_map = schema_section_weight_map_.find(schema_type_id);
- if (schema_type_map == schema_section_weight_map_.end()) {
- // Return default weight if the schema type has no weights specified.
- return kDefaultSectionWeight;
- }
-
- auto section_weight =
- schema_type_map->second.section_weights.find(section_id);
- if (section_weight == schema_type_map->second.section_weights.end()) {
- // If there is no entry for SectionId, the weight is implicitly the
- // normalized default weight.
- return schema_type_map->second.default_weight;
- }
- return section_weight->second;
-}
-
-inline SectionWeights::NormalizedSectionWeights
-SectionWeights::ExtractNormalizedSectionWeights(
- const std::unordered_map<std::string, double>& raw_weights,
- const std::vector<SectionMetadata>& metadata_list) {
- double max_weight = -std::numeric_limits<double>::infinity();
- std::unordered_map<SectionId, double> section_weights;
- for (const SectionMetadata& section_metadata : metadata_list) {
- std::string_view metadata_path = section_metadata.path;
- double section_weight = kDefaultSectionWeight;
- auto iter = raw_weights.find(metadata_path.data());
- if (iter != raw_weights.end()) {
- section_weight = iter->second;
- section_weights.insert({section_metadata.id, section_weight});
- }
- // Replace max if we see new max weight.
- max_weight = std::max(max_weight, section_weight);
- }
-
- NormalizeSectionWeights(max_weight, section_weights);
- // Set normalized default weight to 1.0 in case there is no section
- // metadata and max_weight is -INF (we should not see this case).
- double normalized_default_weight =
- max_weight == -std::numeric_limits<double>::infinity()
- ? kDefaultSectionWeight
- : kDefaultSectionWeight / max_weight;
- SectionWeights::NormalizedSectionWeights normalized_section_weights =
- SectionWeights::NormalizedSectionWeights();
- normalized_section_weights.section_weights = std::move(section_weights);
- normalized_section_weights.default_weight = normalized_default_weight;
- return normalized_section_weights;
-}
-} // namespace lib
-} // namespace icing
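The core of the file just removed is max-normalization of raw property weights. A condensed, self-contained sketch of that behavior (illustrative, not part of the patch; the deleted code computes the max across all section metadata, treating unlisted sections as an implicit 1.0, before calling NormalizeSectionWeights with it):

  #include <algorithm>
  #include <unordered_map>

  // Normalize raw weights so the largest becomes 1.0; all-zero maps stay 0.0.
  void NormalizeWeights(std::unordered_map<int, double>& weights) {
    double max_weight = 0.0;
    for (const auto& [id, weight] : weights) {
      max_weight = std::max(max_weight, weight);
    }
    if (max_weight == 0.0) return;  // matches the deleted early-return
    for (auto& [id, weight] : weights) {
      weight /= max_weight;
    }
  }

With the raw weights from the deleted ShouldSetNestedPropertyWeights test, {body: 1.0, subject: 100.0, sender.name: 50.0} normalizes to {0.01, 1.0, 0.5}; per the deleted ExtractNormalizedSectionWeights, sections with no explicit weight would fall back to the normalized default 1.0 / 100.0 = 0.01.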
diff --git a/icing/scoring/section-weights.h b/icing/scoring/section-weights.h
deleted file mode 100644
index 23a9188..0000000
--- a/icing/scoring/section-weights.h
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright (C) 2021 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_SCORING_SECTION_WEIGHTS_H_
-#define ICING_SCORING_SECTION_WEIGHTS_H_
-
-#include <unordered_map>
-
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/schema/schema-store.h"
-#include "icing/store/document-store.h"
-
-namespace icing {
-namespace lib {
-
-inline constexpr double kDefaultSectionWeight = 1.0;
-
-// Provides functions for setting and retrieving section weights for schema
-// type properties. Section weights are used to promote and demote term matches
-// in sections when scoring results. Section weights are provided by property
-// path, and can range from (0, DBL_MAX]. The SectionId is matched to the
-// property path by going over the schema type's section metadata. Weights that
-// correspond to a valid property path are then normalized against the maxmium
-// section weight, and put into map for quick access for scorers. By default,
-// a section is given a raw, pre-normalized weight of 1.0.
-class SectionWeights {
- public:
- // SectionWeights instances should not be copied.
- SectionWeights(const SectionWeights&) = delete;
- SectionWeights& operator=(const SectionWeights&) = delete;
-
- // Factory function to create a SectionWeights instance. Raw weights are
- // provided through the ScoringSpecProto. Provided property paths for weights
- // are validated against the schema type's section metadata. If the property
-  // path doesn't exist, the property weight is ignored. If a weight is
-  // negative, an invalid argument error is returned. Raw weights are then
- // normalized against the maximum weight for that schema type.
- //
- // Returns:
- // A SectionWeights instance on success
- // FAILED_PRECONDITION on any null pointer input
-  //   INVALID_ARGUMENT if a provided weight for a property path is negative.
- static libtextclassifier3::StatusOr<std::unique_ptr<SectionWeights>> Create(
- const SchemaStore* schema_store, const ScoringSpecProto& scoring_spec);
-
- // Returns the normalized section weight by SchemaTypeId and SectionId. If
- // the SchemaTypeId, or the SectionId for a SchemaTypeId, is not found in the
- // normalized weights map, the default weight is returned instead.
- double GetNormalizedSectionWeight(SchemaTypeId schema_type_id,
- SectionId section_id) const;
-
- private:
- // Holds the normalized section weights for a schema type, as well as the
- // normalized default weight for sections that have no weight set.
- struct NormalizedSectionWeights {
- std::unordered_map<SectionId, double> section_weights;
- double default_weight;
- };
-
- explicit SectionWeights(
- const std::unordered_map<SchemaTypeId, NormalizedSectionWeights>
- schema_section_weight_map)
- : schema_section_weight_map_(std::move(schema_section_weight_map)) {}
-
- // Creates a map of section ids to normalized weights from the raw property
- // path weight map and section metadata and calculates the normalized default
- // section weight.
- static inline SectionWeights::NormalizedSectionWeights
- ExtractNormalizedSectionWeights(
- const std::unordered_map<std::string, double>& raw_weights,
- const std::vector<SectionMetadata>& metadata_list);
-
- // A map of (SchemaTypeId -> SectionId -> Normalized Weight), allows for fast
- // look up of normalized weights. This is precomputed when creating a
- // SectionWeights instance.
- std::unordered_map<SchemaTypeId, NormalizedSectionWeights>
- schema_section_weight_map_;
-};
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_SCORING_SECTION_WEIGHTS_H_
diff --git a/icing/scoring/section-weights_test.cc b/icing/scoring/section-weights_test.cc
deleted file mode 100644
index 330faee..0000000
--- a/icing/scoring/section-weights_test.cc
+++ /dev/null
@@ -1,443 +0,0 @@
-// Copyright (C) 2021 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/scoring/section-weights.h"
-
-#include <cfloat>
-
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include "icing/proto/scoring.pb.h"
-#include "icing/schema-builder.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/testing/fake-clock.h"
-#include "icing/testing/tmp-directory.h"
-
-namespace icing {
-namespace lib {
-
-namespace {
-using ::testing::Eq;
-
-class SectionWeightsTest : public testing::Test {
- protected:
- SectionWeightsTest()
- : test_dir_(GetTestTempDir() + "/icing"),
- schema_store_dir_(test_dir_ + "/schema_store") {}
-
- void SetUp() override {
- // Creates file directories
- filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
- filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
-
- SchemaTypeConfigProto sender_schema =
- SchemaTypeConfigBuilder()
- .SetType("sender")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("name")
- .SetDataTypeString(
- TermMatchType::PREFIX,
- StringIndexingConfig::TokenizerType::PLAIN)
- .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL))
- .Build();
- SchemaTypeConfigProto email_schema =
- SchemaTypeConfigBuilder()
- .SetType("email")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(
- TermMatchType::PREFIX,
- StringIndexingConfig::TokenizerType::PLAIN)
- .SetDataType(PropertyConfigProto::DataType::STRING)
- .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("body")
- .SetDataTypeString(
- TermMatchType::PREFIX,
- StringIndexingConfig::TokenizerType::PLAIN)
- .SetDataType(PropertyConfigProto::DataType::STRING)
- .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("sender")
- .SetDataTypeDocument("sender",
- /*index_nested_properties=*/true)
- .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL))
- .Build();
- SchemaProto schema =
- SchemaBuilder().AddType(sender_schema).AddType(email_schema).Build();
-
- ICING_ASSERT_OK(schema_store_->SetSchema(schema));
- }
-
- void TearDown() override {
- schema_store_.reset();
- filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
- }
-
- SchemaStore *schema_store() { return schema_store_.get(); }
-
- private:
- const std::string test_dir_;
- const std::string schema_store_dir_;
- Filesystem filesystem_;
- FakeClock fake_clock_;
- std::unique_ptr<SchemaStore> schema_store_;
-};
-
-TEST_F(SectionWeightsTest, ShouldNormalizeSinglePropertyWeight) {
- ScoringSpecProto spec_proto;
-
- TypePropertyWeights *type_property_weights =
- spec_proto.add_type_property_weights();
- type_property_weights->set_schema_type("sender");
-
- PropertyWeight *property_weight =
- type_property_weights->add_property_weights();
- property_weight->set_weight(5.0);
- property_weight->set_path("name");
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SectionWeights> section_weights,
- SectionWeights::Create(schema_store(), spec_proto));
- ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId sender_schema_type_id,
- schema_store()->GetSchemaTypeId("sender"));
-
- // section_id 0 corresponds to property "name".
- // We expect 1.0 as there is only one property in the "sender" schema type
- // so it should take the max normalized weight of 1.0.
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(sender_schema_type_id,
- /*section_id=*/0),
- Eq(1.0));
-}
-
-TEST_F(SectionWeightsTest, ShouldAcceptMaxWeightValue) {
- ScoringSpecProto spec_proto;
-
- TypePropertyWeights *type_property_weights =
- spec_proto.add_type_property_weights();
- type_property_weights->set_schema_type("sender");
-
- PropertyWeight *property_weight =
- type_property_weights->add_property_weights();
- property_weight->set_weight(DBL_MAX);
- property_weight->set_path("name");
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SectionWeights> section_weights,
- SectionWeights::Create(schema_store(), spec_proto));
- ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId sender_schema_type_id,
- schema_store()->GetSchemaTypeId("sender"));
-
- // section_id 0 corresponds to property "name".
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(sender_schema_type_id,
- /*section_id=*/0),
- Eq(1.0));
-}
-
-TEST_F(SectionWeightsTest, ShouldFailWithNegativeWeights) {
- ScoringSpecProto spec_proto;
-
- TypePropertyWeights *type_property_weights =
- spec_proto.add_type_property_weights();
- type_property_weights->set_schema_type("email");
-
-  PropertyWeight *body_property_weight =
-      type_property_weights->add_property_weights();
-  body_property_weight->set_weight(-100.0);
-  body_property_weight->set_path("body");
-
- EXPECT_THAT(SectionWeights::Create(schema_store(), spec_proto).status(),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-}
-
-TEST_F(SectionWeightsTest, ShouldAcceptZeroWeight) {
- ScoringSpecProto spec_proto;
-
- TypePropertyWeights *type_property_weights =
- spec_proto.add_type_property_weights();
- type_property_weights->set_schema_type("email");
-
- PropertyWeight *body_property_weight =
- type_property_weights->add_property_weights();
- body_property_weight->set_weight(2.0);
- body_property_weight->set_path("body");
-
- PropertyWeight *subject_property_weight =
- type_property_weights->add_property_weights();
- subject_property_weight->set_weight(0.0);
- subject_property_weight->set_path("subject");
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SectionWeights> section_weights,
- SectionWeights::Create(schema_store(), spec_proto));
- ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
- schema_store()->GetSchemaTypeId("email"));
-
- // Normalized weight for "body" property.
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
- /*section_id=*/0),
- Eq(1.0));
- // Normalized weight for "subject" property.
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
- /*section_id=*/2),
- Eq(0.0));
-}
-
-TEST_F(SectionWeightsTest, ShouldNormalizeToZeroWhenAllWeightsZero) {
- ScoringSpecProto spec_proto;
-
- TypePropertyWeights *type_property_weights =
- spec_proto.add_type_property_weights();
- type_property_weights->set_schema_type("email");
-
- PropertyWeight *body_property_weight =
- type_property_weights->add_property_weights();
- body_property_weight->set_weight(0.0);
- body_property_weight->set_path("body");
-
- PropertyWeight *sender_property_weight =
- type_property_weights->add_property_weights();
- sender_property_weight->set_weight(0.0);
- sender_property_weight->set_path("sender.name");
-
- PropertyWeight *subject_property_weight =
- type_property_weights->add_property_weights();
- subject_property_weight->set_weight(0.0);
- subject_property_weight->set_path("subject");
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SectionWeights> section_weights,
- SectionWeights::Create(schema_store(), spec_proto));
- ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
- schema_store()->GetSchemaTypeId("email"));
-
- // Normalized weight for "body" property.
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
- /*section_id=*/0),
- Eq(0.0));
- // Normalized weight for "sender.name" property (the nested property).
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
- /*section_id=*/1),
- Eq(0.0));
- // Normalized weight for "subject" property.
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
- /*section_id=*/2),
- Eq(0.0));
-}
-
-TEST_F(SectionWeightsTest, ShouldReturnDefaultIfTypePropertyWeightsNotSet) {
- ScoringSpecProto spec_proto;
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SectionWeights> section_weights,
- SectionWeights::Create(schema_store(), spec_proto));
- ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
- schema_store()->GetSchemaTypeId("email"));
-
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
- /*section_id=*/0),
- Eq(kDefaultSectionWeight));
-}
-
-TEST_F(SectionWeightsTest, ShouldSetNestedPropertyWeights) {
- ScoringSpecProto spec_proto;
-
- TypePropertyWeights *type_property_weights =
- spec_proto.add_type_property_weights();
- type_property_weights->set_schema_type("email");
-
- PropertyWeight *body_property_weight =
- type_property_weights->add_property_weights();
- body_property_weight->set_weight(1.0);
- body_property_weight->set_path("body");
-
- PropertyWeight *subject_property_weight =
- type_property_weights->add_property_weights();
- subject_property_weight->set_weight(100.0);
- subject_property_weight->set_path("subject");
-
- PropertyWeight *nested_property_weight =
- type_property_weights->add_property_weights();
- nested_property_weight->set_weight(50.0);
- nested_property_weight->set_path("sender.name");
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SectionWeights> section_weights,
- SectionWeights::Create(schema_store(), spec_proto));
- ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
- schema_store()->GetSchemaTypeId("email"));
-
- // Normalized weight for "body" property.
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
- /*section_id=*/0),
- Eq(0.01));
- // Normalized weight for "sender.name" property (the nested property).
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
- /*section_id=*/1),
- Eq(0.5));
- // Normalized weight for "subject" property.
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
- /*section_id=*/2),
- Eq(1.0));
-}
-
-TEST_F(SectionWeightsTest, ShouldNormalizeIfAllWeightsBelowOne) {
- ScoringSpecProto spec_proto;
-
- TypePropertyWeights *type_property_weights =
- spec_proto.add_type_property_weights();
- type_property_weights->set_schema_type("email");
-
- PropertyWeight *body_property_weight =
- type_property_weights->add_property_weights();
- body_property_weight->set_weight(0.1);
- body_property_weight->set_path("body");
-
- PropertyWeight *sender_name_weight =
- type_property_weights->add_property_weights();
- sender_name_weight->set_weight(0.2);
- sender_name_weight->set_path("sender.name");
-
- PropertyWeight *subject_property_weight =
- type_property_weights->add_property_weights();
- subject_property_weight->set_weight(0.4);
- subject_property_weight->set_path("subject");
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SectionWeights> section_weights,
- SectionWeights::Create(schema_store(), spec_proto));
- ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
- schema_store()->GetSchemaTypeId("email"));
-
- // Normalized weight for "body" property.
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
- /*section_id=*/0),
- Eq(1.0 / 4.0));
- // Normalized weight for "sender.name" property (the nested property).
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
- /*section_id=*/1),
- Eq(2.0 / 4.0));
- // Normalized weight for "subject" property.
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
- /*section_id=*/2),
- Eq(1.0));
-}
-
-TEST_F(SectionWeightsTest, ShouldSetNestedPropertyWeightSeparatelyForTypes) {
- ScoringSpecProto spec_proto;
-
- TypePropertyWeights *email_type_property_weights =
- spec_proto.add_type_property_weights();
- email_type_property_weights->set_schema_type("email");
-
- PropertyWeight *body_property_weight =
- email_type_property_weights->add_property_weights();
- body_property_weight->set_weight(1.0);
- body_property_weight->set_path("body");
-
- PropertyWeight *subject_property_weight =
- email_type_property_weights->add_property_weights();
- subject_property_weight->set_weight(100.0);
- subject_property_weight->set_path("subject");
-
- PropertyWeight *sender_name_property_weight =
- email_type_property_weights->add_property_weights();
- sender_name_property_weight->set_weight(50.0);
- sender_name_property_weight->set_path("sender.name");
-
- TypePropertyWeights *sender_type_property_weights =
- spec_proto.add_type_property_weights();
- sender_type_property_weights->set_schema_type("sender");
-
- PropertyWeight *sender_property_weight =
- sender_type_property_weights->add_property_weights();
- sender_property_weight->set_weight(25.0);
- sender_property_weight->set_path("sender");
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SectionWeights> section_weights,
- SectionWeights::Create(schema_store(), spec_proto));
- ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
- schema_store()->GetSchemaTypeId("email"));
- ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId sender_schema_type_id,
- schema_store()->GetSchemaTypeId("sender"));
-
- // Normalized weight for "sender.name" property (the nested property)
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
- /*section_id=*/1),
- Eq(0.5));
- // Normalized weight for "name" property for "sender" schema type. As it is
- // the only property of the type, it should take the max normalized weight of
- // 1.0.
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(sender_schema_type_id,
- /*section_id=*/2),
- Eq(1.0));
-}
-
-TEST_F(SectionWeightsTest, ShouldSkipNonExistentPathWhenSettingWeights) {
- ScoringSpecProto spec_proto;
-
- TypePropertyWeights *type_property_weights =
- spec_proto.add_type_property_weights();
- type_property_weights->set_schema_type("email");
-
- // If this property weight isn't skipped, then the max property weight would
- // be set to 100.0 and all weights would be normalized against the max.
- PropertyWeight *non_valid_property_weight =
- type_property_weights->add_property_weights();
- non_valid_property_weight->set_weight(100.0);
- non_valid_property_weight->set_path("sender.organization");
-
- PropertyWeight *subject_property_weight =
- type_property_weights->add_property_weights();
- subject_property_weight->set_weight(10.0);
- subject_property_weight->set_path("subject");
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SectionWeights> section_weights,
- SectionWeights::Create(schema_store(), spec_proto));
- ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
- schema_store()->GetSchemaTypeId("email"));
-
- // Normalized weight for "body" property. Because the weight is not explicitly
- // set, it is set to the default of 1.0 before being normalized.
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
- /*section_id=*/0),
- Eq(0.1));
- // Normalized weight for "sender.name" property (the nested property). Because
- // the weight is not explicitly set, it is set to the default of 1.0 before
- // being normalized.
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
- /*section_id=*/1),
- Eq(0.1));
- // Normalized weight for "subject" property. Because the invalid property path
- // is skipped when assigning weights, subject takes the max normalized weight
- // of 1.0 instead.
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
- /*section_id=*/2),
- Eq(1.0));
-}
-
-} // namespace
-
-} // namespace lib
-} // namespace icing
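// [Editor's note: a minimal standalone sketch, not part of this patch.]
// The SectionWeightsTest cases removed above all exercise one rule: each
// property weight is normalized against the largest weight configured for
// its schema type, and an all-zero weight set normalizes to zero. The
// function name below is hypothetical:
double NormalizeSectionWeight(double weight, double max_weight) {
  // Guard the all-zero case (ShouldNormalizeToZeroWhenAllWeightsZero above)
  // instead of dividing by zero.
  if (max_weight == 0.0) return 0.0;
  return weight / max_weight;
}
// E.g. weights {body=1.0, sender.name=50.0, subject=100.0} normalize to
// {0.01, 0.5, 1.0}, matching ShouldSetNestedPropertyWeights above.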
diff --git a/icing/store/document-log-creator.cc b/icing/store/document-log-creator.cc
index 5e23a8e..a035f93 100644
--- a/icing/store/document-log-creator.cc
+++ b/icing/store/document-log-creator.cc
@@ -69,24 +69,33 @@ DocumentLogCreator::Create(const Filesystem* filesystem,
const std::string& base_dir) {
bool v0_exists =
filesystem->FileExists(MakeDocumentLogFilenameV0(base_dir).c_str());
+ bool regen_derived_files = false;
+
+#ifdef ENABLE_V1_MIGRATION
bool v1_exists =
filesystem->FileExists(MakeDocumentLogFilenameV1(base_dir).c_str());
- bool new_file = false;
- int preexisting_file_version = kCurrentVersion;
if (v0_exists && !v1_exists) {
ICING_RETURN_IF_ERROR(MigrateFromV0ToV1(filesystem, base_dir));
// Need to regenerate derived files since documents may be written to a
// different file offset in the log.
- preexisting_file_version = 0;
+ regen_derived_files = true;
} else if (!v1_exists) {
// First time initializing a v1 log. There are no existing derived files at
// this point, so we should generate some. "regenerate" here also means
// "generate for the first time", i.e. we shouldn't expect there to be any
// existing derived files.
- new_file = true;
+ regen_derived_files = true;
+ }
+#else // !ENABLE_V1_MIGRATION
+ if (v0_exists) {
+ // If migration from v0 to v1 is not enabled, then simply delete the v0 file
+ // and treat this as if it's our first time initializing a v1 log.
+ regen_derived_files = true;
+ filesystem->DeleteFile(MakeDocumentLogFilenameV0(base_dir).c_str());
}
+#endif // ENABLE_V1_MIGRATION
ICING_ASSIGN_OR_RETURN(
PortableFileBackedProtoLog<DocumentWrapper>::CreateResult
@@ -97,7 +106,7 @@ DocumentLogCreator::Create(const Filesystem* filesystem,
/*compress_in=*/true)));
CreateResult create_result = {std::move(log_create_result),
- preexisting_file_version, new_file};
+ regen_derived_files};
return create_result;
}
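// [Editor's note: a hedged sketch, not part of this patch.] The branches
// above condense to a simple predicate: with ENABLE_V1_MIGRATION defined,
// derived files are regenerated whenever no v1 log exists yet (covering
// both the migrated-from-v0 case and a first-time v1 log); with the macro
// undefined, any leftover v0 log is deleted and regeneration is triggered.
bool ShouldRegenDerivedFiles(bool v0_exists, bool v1_exists,
                             bool migration_enabled) {
  if (migration_enabled) {
    // (v0_exists && !v1_exists) || !v1_exists simplifies to !v1_exists.
    return !v1_exists;
  }
  return v0_exists;  // The v0 log is discarded and the v1 log starts fresh.
}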
diff --git a/icing/store/document-log-creator.h b/icing/store/document-log-creator.h
index be8feed..51cf497 100644
--- a/icing/store/document-log-creator.h
+++ b/icing/store/document-log-creator.h
@@ -30,20 +30,14 @@ namespace lib {
// be necessary.
class DocumentLogCreator {
public:
- // Version 0 refers to FileBackedProtoLog
- // Version 1 refers to PortableFileBackedProtoLog with kFileFormatVersion = 0
- static constexpr int32_t kCurrentVersion = 1;
struct CreateResult {
// The create result passed up from the PortableFileBackedProtoLog::Create.
// Contains the document log.
PortableFileBackedProtoLog<DocumentWrapper>::CreateResult log_create_result;
- // The version number of the pre-existing document log file.
- // If there is no document log file, it will be set to kCurrentVersion.
- int preexisting_file_version;
-
- // Whether the created file is new.
- bool new_file;
+ // Whether the caller needs to also regenerate/generate any derived files
+ // based on the initialized document log.
+ bool regen_derived_files;
};
// Creates the document log in the base_dir. Will create one if it doesn't
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index 8c8369c..226a96b 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -164,32 +164,6 @@ int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms,
return expiration_timestamp_ms;
}
-InitializeStatsProto::RecoveryCause GetRecoveryCause(
- const DocumentLogCreator::CreateResult& create_result,
- bool force_recovery_and_revalidate_documents) {
- if (force_recovery_and_revalidate_documents) {
- return InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC;
- } else if (create_result.log_create_result.has_data_loss()) {
- return InitializeStatsProto::DATA_LOSS;
- } else if (create_result.preexisting_file_version !=
- DocumentLogCreator::kCurrentVersion) {
- return InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT;
- }
- return InitializeStatsProto::NONE;
-}
-
-InitializeStatsProto::DocumentStoreDataStatus GetDataStatus(
- DataLoss data_loss) {
- switch (data_loss) {
- case DataLoss::PARTIAL:
- return InitializeStatsProto::PARTIAL_LOSS;
- case DataLoss::COMPLETE:
- return InitializeStatsProto::COMPLETE_LOSS;
- case DataLoss::NONE:
- return InitializeStatsProto::NO_DATA_LOSS;
- }
-}
-
} // namespace
DocumentStore::DocumentStore(const Filesystem* filesystem,
@@ -262,34 +236,44 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize(
std::move(create_result_or).ValueOrDie();
document_log_ = std::move(create_result.log_create_result.proto_log);
- InitializeStatsProto::RecoveryCause recovery_cause =
- GetRecoveryCause(create_result, force_recovery_and_revalidate_documents);
-
- if (recovery_cause != InitializeStatsProto::NONE || create_result.new_file) {
- ICING_LOG(WARNING) << "Starting Document Store Recovery with cause="
- << recovery_cause << ", and create result { new_file="
- << create_result.new_file << ", preeisting_file_version="
- << create_result.preexisting_file_version << ", data_loss="
- << create_result.log_create_result.data_loss << "} and kCurrentVersion="
- << DocumentLogCreator::kCurrentVersion;
+
+ if (create_result.regen_derived_files ||
+ force_recovery_and_revalidate_documents ||
+ create_result.log_create_result.has_data_loss()) {
// We can't rely on any existing derived files. Recreate them from scratch.
// Currently happens if:
// 1) This is a new log and we don't have derived files yet
// 2) Client wanted us to force a regeneration.
// 3) Log has some data loss, can't rely on existing derived data.
+ if (create_result.log_create_result.has_data_loss() &&
+ initialize_stats != nullptr) {
+ ICING_LOG(WARNING)
+ << "Data loss in document log, regenerating derived files.";
+ initialize_stats->set_document_store_recovery_cause(
+ InitializeStatsProto::DATA_LOSS);
+
+ if (create_result.log_create_result.data_loss == DataLoss::PARTIAL) {
+ // Ground truth is partially lost.
+ initialize_stats->set_document_store_data_status(
+ InitializeStatsProto::PARTIAL_LOSS);
+ } else {
+ // Ground truth is completely lost.
+ initialize_stats->set_document_store_data_status(
+ InitializeStatsProto::COMPLETE_LOSS);
+ }
+ }
+
std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
libtextclassifier3::Status status =
RegenerateDerivedFiles(force_recovery_and_revalidate_documents);
if (initialize_stats != nullptr &&
- recovery_cause != InitializeStatsProto::NONE) {
+ (force_recovery_and_revalidate_documents ||
+ create_result.log_create_result.has_data_loss())) {
// Only consider it a recovery if the client forced a recovery or there
// was data loss. Otherwise, this could just be the first time we're
// initializing and generating derived files.
initialize_stats->set_document_store_recovery_latency_ms(
document_recovery_timer->GetElapsedMilliseconds());
- initialize_stats->set_document_store_recovery_cause(recovery_cause);
- initialize_stats->set_document_store_data_status(
- GetDataStatus(create_result.log_create_result.data_loss));
}
if (!status.ok()) {
ICING_LOG(ERROR)
@@ -298,13 +282,13 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize(
}
} else {
if (!InitializeExistingDerivedFiles().ok()) {
- ICING_LOG(WARNING)
+ ICING_VLOG(1)
<< "Couldn't find derived files or failed to initialize them, "
"regenerating derived files for DocumentStore.";
std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
libtextclassifier3::Status status = RegenerateDerivedFiles(
- /*force_recovery_and_revalidate_documents=*/false);
- if (initialize_stats != nullptr) {
+ /*force_recovery_and_revalidate_documents=*/false);
+ if (initialize_stats != nullptr && num_documents() > 0) {
initialize_stats->set_document_store_recovery_cause(
InitializeStatsProto::IO_ERROR);
initialize_stats->set_document_store_recovery_latency_ms(
@@ -431,19 +415,7 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles(
// Iterates through document log
auto iterator = document_log_->GetIterator();
auto iterator_status = iterator.Advance();
- libtextclassifier3::StatusOr<int64_t> element_size =
- document_log_->GetElementsFileSize();
- libtextclassifier3::StatusOr<int64_t> disk_usage =
- document_log_->GetDiskUsage();
- if (element_size.ok() && disk_usage.ok()) {
- ICING_VLOG(1) << "Starting recovery of document store. Document store "
- "elements file size:"
- << element_size.ValueOrDie()
- << ", disk usage=" << disk_usage.ValueOrDie();
- }
while (iterator_status.ok()) {
- ICING_VLOG(2) << "Attempting to read document at offset="
- << iterator.GetOffset();
libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or =
document_log_->ReadProto(iterator.GetOffset());
@@ -558,7 +530,7 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles(
libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
document_key_mapper_.reset();
- // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status =
KeyMapper<DocumentId>::Delete(*filesystem_, base_dir_);
@@ -568,7 +540,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
return status;
}
- // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
+ // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
// that can support error logging.
auto document_key_mapper_or =
KeyMapper<DocumentId>::Create(*filesystem_, base_dir_, kUriMapperMaxSize);
@@ -584,7 +556,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
document_id_mapper_.reset();
- // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete(
*filesystem_, MakeDocumentIdMapperFilename(base_dir_));
@@ -593,7 +565,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() {
<< "Failed to delete old document_id mapper";
return status;
}
- // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
+ // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
// that can support error logging.
auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
*filesystem_, MakeDocumentIdMapperFilename(base_dir_),
@@ -646,7 +618,7 @@ libtextclassifier3::Status DocumentStore::ResetFilterCache() {
libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
namespace_mapper_.reset();
- // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = KeyMapper<NamespaceId>::Delete(
*filesystem_, MakeNamespaceMapperFilename(base_dir_));
@@ -666,7 +638,7 @@ libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
libtextclassifier3::Status DocumentStore::ResetCorpusMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
corpus_mapper_.reset();
- // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = KeyMapper<CorpusId>::Delete(
*filesystem_, MakeCorpusMapperFilename(base_dir_));
@@ -1777,63 +1749,5 @@ libtextclassifier3::Status DocumentStore::SetUsageScores(
return usage_store_->SetUsageScores(document_id, usage_scores);
}
-libtextclassifier3::StatusOr<
- google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>>
-DocumentStore::CollectCorpusInfo() const {
- google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>
- corpus_info;
- libtextclassifier3::StatusOr<const SchemaProto*> schema_proto_or =
- schema_store_->GetSchema();
- if (!schema_proto_or.ok()) {
- return corpus_info;
- }
- // Maps from CorpusId to the corresponding protocol buffer in the result.
- std::unordered_map<CorpusId, DocumentDebugInfoProto::CorpusInfo*> info_map;
- std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
- namespace_mapper_->GetValuesToKeys();
- const SchemaProto* schema_proto = schema_proto_or.ValueOrDie();
- for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
- ++document_id) {
- if (!InternalDoesDocumentExist(document_id)) {
- continue;
- }
- ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
- filter_cache_->Get(document_id));
- ICING_ASSIGN_OR_RETURN(const DocumentAssociatedScoreData* score_data,
- score_cache_->Get(document_id));
- const std::string& name_space =
- namespace_id_to_namespace[filter_data->namespace_id()];
- const std::string& schema =
- schema_proto->types()[filter_data->schema_type_id()].schema_type();
- auto iter = info_map.find(score_data->corpus_id());
- if (iter == info_map.end()) {
- DocumentDebugInfoProto::CorpusInfo* entry = corpus_info.Add();
- entry->set_namespace_(name_space);
- entry->set_schema(schema);
- iter = info_map.insert({score_data->corpus_id(), entry}).first;
- }
- iter->second->set_total_documents(iter->second->total_documents() + 1);
- iter->second->set_total_token(iter->second->total_token() +
- score_data->length_in_tokens());
- }
- return corpus_info;
-}
-
-libtextclassifier3::StatusOr<DocumentDebugInfoProto>
-DocumentStore::GetDebugInfo(int verbosity) const {
- DocumentDebugInfoProto debug_info;
- *debug_info.mutable_document_storage_info() = GetStorageInfo();
- ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
- debug_info.set_crc(crc.Get());
- if (verbosity > 0) {
- ICING_ASSIGN_OR_RETURN(google::protobuf::RepeatedPtrField<
- DocumentDebugInfoProto::CorpusInfo>
- corpus_info,
- CollectCorpusInfo());
- *debug_info.mutable_corpus_info() = std::move(corpus_info);
- }
- return debug_info;
-}
-
} // namespace lib
} // namespace icing
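// [Editor's note: a hedged restatement, not part of this patch.] After this
// change, Initialize() rebuilds derived files when any one of three
// conditions holds, but records a recovery in InitializeStatsProto only for
// the forced-recovery and data-loss cases; a first-time build of derived
// files is deliberately not counted as a recovery.
bool NeedsDerivedFileRebuild(
    const DocumentLogCreator::CreateResult& create_result,
    bool force_recovery_and_revalidate_documents) {
  return create_result.regen_derived_files ||
         force_recovery_and_revalidate_documents ||
         create_result.log_create_result.has_data_loss();
}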
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index e6d2e5c..a60aab1 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h
@@ -27,7 +27,6 @@
#include "icing/file/file-backed-vector.h"
#include "icing/file/filesystem.h"
#include "icing/file/portable-file-backed-proto-log.h"
-#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/document_wrapper.pb.h"
#include "icing/proto/logging.pb.h"
@@ -423,17 +422,6 @@ class DocumentStore {
// INTERNAL_ERROR on compute error
libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const;
- // Get debug information for the document store.
- // verbosity <= 0, simplest debug information
- // verbosity > 0, also return the total number of documents and tokens in each
- // (namespace, schema type) pair.
- //
- // Returns:
- // DocumentDebugInfoProto on success
- // INTERNAL_ERROR on IO errors, crc compute error
- libtextclassifier3::StatusOr<DocumentDebugInfoProto> GetDebugInfo(
- int verbosity) const;
-
private:
// Use DocumentStore::Create() to instantiate.
DocumentStore(const Filesystem* filesystem, std::string_view base_dir,
@@ -509,6 +497,28 @@ class DocumentStore {
bool force_recovery_and_revalidate_documents,
InitializeStatsProto* initialize_stats);
+ // Initializes a new DocumentStore and sets up any underlying files.
+ //
+ // Returns:
+ // Data loss status on success, effectively always DataLoss::NONE
+ // INTERNAL on I/O error
+ libtextclassifier3::StatusOr<DataLoss> InitializeNewStore(
+ InitializeStatsProto* initialize_stats);
+
+ // Initializes a DocumentStore over an existing directory of files.
+ //
+ // stats will be set if non-null
+ //
+ // Returns:
+ // Data loss status on success
+ // INTERNAL on I/O error
+ libtextclassifier3::StatusOr<DataLoss> InitializeExistingStore(
+ bool force_recovery_and_revalidate_documents,
+ InitializeStatsProto* initialize_stats);
+
+ libtextclassifier3::StatusOr<DataLoss> MigrateFromV0ToV1(
+ InitializeStatsProto* initialize_stats);
+
// Creates sub-components and verifies the integrity of each sub-component.
// This assumes that the underlying files already exist, and will return
// an error if it doesn't find what it's expecting.
@@ -708,13 +718,6 @@ class DocumentStore {
// the document_id_mapper somehow became larger than the filter cache.
DocumentStorageInfoProto CalculateDocumentStatusCounts(
DocumentStorageInfoProto storage_info) const;
-
- // Returns:
- // - on success, a RepeatedPtrField for CorpusInfo collected.
- // - OUT_OF_RANGE, this should never happen.
- libtextclassifier3::StatusOr<google::protobuf::RepeatedPtrField<
- DocumentDebugInfoProto::CorpusInfo>>
- CollectCorpusInfo() const;
};
} // namespace lib
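// [Editor's note: the definitions of the three helpers declared above are
// not part of this diff, so the dispatch below is only a guess at the
// intended shape, kept as commented pseudocode:]
//   if (!document_log_exists) return InitializeNewStore(initialize_stats);
//   return InitializeExistingStore(force_recovery_and_revalidate_documents,
//                                  initialize_stats);
// with MigrateFromV0ToV1() reserved for stores still on the legacy v0 log.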
diff --git a/icing/store/document-store_benchmark.cc b/icing/store/document-store_benchmark.cc
index fc3fd9d..ce608fc 100644
--- a/icing/store/document-store_benchmark.cc
+++ b/icing/store/document-store_benchmark.cc
@@ -32,7 +32,6 @@
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
#include "icing/proto/document.pb.h"
-#include "icing/proto/persist.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/schema-builder.h"
#include "icing/schema/schema-store.h"
@@ -64,13 +63,13 @@ namespace lib {
namespace {
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto_Cardinality_Code_OPTIONAL;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
+ StringIndexingConfig_TokenizerType_Code_PLAIN;
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
+constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
class DestructibleDirectory {
public:
@@ -256,74 +255,6 @@ void BM_Delete(benchmark::State& state) {
}
BENCHMARK(BM_Delete);
-void BM_Create(benchmark::State& state) {
- Filesystem filesystem;
- Clock clock;
-
- std::string directory = GetTestTempDir() + "/icing";
- std::string document_store_dir = directory + "/store";
-
- std::unique_ptr<SchemaStore> schema_store =
- CreateSchemaStore(filesystem, directory, &clock);
-
- // Create an initial document store and put some data in.
- {
- DestructibleDirectory ddir(filesystem, directory);
-
- filesystem.CreateDirectoryRecursively(document_store_dir.data());
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentStore::CreateResult create_result,
- DocumentStore::Create(&filesystem, document_store_dir, &clock,
- schema_store.get()));
- std::unique_ptr<DocumentStore> document_store =
- std::move(create_result.document_store);
-
- DocumentProto document = CreateDocument("namespace", "uri");
- ICING_ASSERT_OK(document_store->Put(document));
- ICING_ASSERT_OK(document_store->PersistToDisk(PersistType::FULL));
- }
-
- // Recreating it with some content to checksum over.
- DestructibleDirectory ddir(filesystem, directory);
-
- filesystem.CreateDirectoryRecursively(document_store_dir.data());
-
- for (auto s : state) {
- benchmark::DoNotOptimize(DocumentStore::Create(
- &filesystem, document_store_dir, &clock, schema_store.get()));
- }
-}
-BENCHMARK(BM_Create);
-
-void BM_ComputeChecksum(benchmark::State& state) {
- Filesystem filesystem;
- Clock clock;
-
- std::string directory = GetTestTempDir() + "/icing";
- DestructibleDirectory ddir(filesystem, directory);
-
- std::string document_store_dir = directory + "/store";
- std::unique_ptr<SchemaStore> schema_store =
- CreateSchemaStore(filesystem, directory, &clock);
-
- filesystem.CreateDirectoryRecursively(document_store_dir.data());
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentStore::CreateResult create_result,
- DocumentStore::Create(&filesystem, document_store_dir, &clock,
- schema_store.get()));
- std::unique_ptr<DocumentStore> document_store =
- std::move(create_result.document_store);
-
- DocumentProto document = CreateDocument("namespace", "uri");
- ICING_ASSERT_OK(document_store->Put(document));
- ICING_ASSERT_OK(document_store->PersistToDisk(PersistType::LITE));
-
- for (auto s : state) {
- benchmark::DoNotOptimize(document_store->ComputeChecksum());
- }
-}
-BENCHMARK(BM_ComputeChecksum);
-
} // namespace
} // namespace lib
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index a30b4e4..3ed4c4e 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -29,6 +29,7 @@
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"
#include "icing/file/mock-filesystem.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/equals-proto.h"
#include "icing/portable/platform.h"
#include "icing/proto/document.pb.h"
@@ -44,7 +45,6 @@
#include "icing/store/namespace-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
-#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
@@ -85,16 +85,16 @@ const NamespaceStorageInfoProto& GetNamespaceStorageInfo(
return std::move(NamespaceStorageInfoProto());
}
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto_Cardinality_Code_OPTIONAL;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
+ StringIndexingConfig_TokenizerType_Code_PLAIN;
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
+constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
-constexpr PropertyConfigProto::DataType::Code TYPE_INT =
- PropertyConfigProto::DataType::INT64;
+constexpr PropertyConfigProto_DataType_Code TYPE_INT =
+ PropertyConfigProto_DataType_Code_INT64;
UsageReport CreateUsageReport(std::string name_space, std::string uri,
int64 timestamp_ms,
@@ -3170,6 +3170,15 @@ TEST_F(DocumentStoreTest, DetectCompleteDataLoss) {
ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE));
}
+// TODO(b/185845269): Re-enable this test by copying over a full valid set of
+// document store files. Right now this test only includes the score_cache and
+// the document store header.
+//
+// This causes a problem now because this CL changes the behavior to not
+// consider an InitializeExistingDerivedFiles failure a recovery when there is
+// nothing to recover because the document store is empty.
+#define DISABLE_BACKWARDS_COMPAT_TEST
+#ifndef DISABLE_BACKWARDS_COMPAT_TEST
TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) {
// The directory testdata/score_cache_without_length_in_tokens/document_store
// contains only the scoring_cache and the document_store_header (holding the
@@ -3185,26 +3194,29 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) {
// Get src files
std::string document_store_without_length_in_tokens;
- if (IsAndroidArm() || IsIosPlatform()) {
+ if (IsAndroidPlatform() || IsIosPlatform()) {
document_store_without_length_in_tokens = GetTestFilePath(
"icing/testdata/score_cache_without_length_in_tokens/"
"document_store_android_ios_compatible");
- } else if (IsAndroidX86()) {
- document_store_without_length_in_tokens = GetTestFilePath(
- "icing/testdata/score_cache_without_length_in_tokens/"
- "document_store_android_x86");
} else {
document_store_without_length_in_tokens = GetTestFilePath(
"icing/testdata/score_cache_without_length_in_tokens/"
"document_store");
}
+ std::vector<std::string> document_store_files;
Filesystem filesystem;
- ICING_LOG(INFO) << "Copying files "
- << document_store_without_length_in_tokens;
- ASSERT_THAT(
- filesystem.CopyDirectory(document_store_without_length_in_tokens.c_str(),
- document_store_dir_.c_str(), /*recursive=*/true),
- true);
+ filesystem.ListDirectory(document_store_without_length_in_tokens.c_str(),
+ &document_store_files);
+
+ ICING_LOG(INFO) << "Copying files " << document_store_without_length_in_tokens
+ << ' ' << document_store_files.size();
+ for (size_t i = 0; i != document_store_files.size(); i++) {
+ std::string src = absl_ports::StrCat(
+ document_store_without_length_in_tokens, "/", document_store_files[i]);
+ std::string dst =
+ absl_ports::StrCat(document_store_dir_, "/", document_store_files[i]);
+ ASSERT_THAT(filesystem_.CopyFile(src.c_str(), dst.c_str()), true);
+ }
InitializeStatsProto initialize_stats;
ICING_ASSERT_OK_AND_ASSIGN(
@@ -3215,11 +3227,12 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) {
&initialize_stats));
std::unique_ptr<DocumentStore> doc_store =
std::move(create_result.document_store);
- // The document log is using the legacy v0 format so that a migration is
- // needed, which will also trigger regeneration.
- EXPECT_EQ(initialize_stats.document_store_recovery_cause(),
- InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT);
+ // The score_cache triggers regeneration because its element size is
+ // inconsistent: expected 20 (current new size), actual 12 (as per the v0
+ // score_cache).
+ EXPECT_TRUE(initialize_stats.has_document_store_recovery_cause());
}
+#endif // DISABLE_BACKWARDS_COMPAT_TEST
TEST_F(DocumentStoreTest, DocumentStoreStorageInfo) {
ICING_ASSERT_OK_AND_ASSIGN(
@@ -3409,22 +3422,18 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) {
{
// Create the document store the second time and force recovery
- InitializeStatsProto initialize_stats;
ICING_ASSERT_OK_AND_ASSIGN(
DocumentStore::CreateResult create_result,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store.get(),
- /*force_recovery_and_revalidate_documents=*/true,
- &initialize_stats));
+ DocumentStore::Create(
+ &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(),
+ /*force_recovery_and_revalidate_documents=*/true));
std::unique_ptr<DocumentStore> doc_store =
std::move(create_result.document_store);
// Ensure that the type id of the email document has been correctly updated.
ICING_ASSERT_OK_AND_ASSIGN(DocumentFilterData filter_data,
doc_store->GetDocumentFilterData(docid));
- EXPECT_THAT(filter_data.schema_type_id(), Eq(1));
- EXPECT_THAT(initialize_stats.document_store_recovery_cause(),
- Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC));
+ ASSERT_THAT(filter_data.schema_type_id(), Eq(1));
}
}
@@ -3547,6 +3556,7 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) {
SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
ASSERT_THAT(schema_store->SetSchema(schema), IsOk());
+ DocumentId docid = kInvalidDocumentId;
DocumentProto docWithBody =
DocumentBuilder()
.SetKey("icing", "email/1")
@@ -3579,12 +3589,8 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) {
std::unique_ptr<DocumentStore> doc_store =
std::move(create_result.document_store);
- DocumentId docid = kInvalidDocumentId;
ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithBody));
- ASSERT_NE(docid, kInvalidDocumentId);
- docid = kInvalidDocumentId;
ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithoutBody));
- ASSERT_NE(docid, kInvalidDocumentId);
ASSERT_THAT(doc_store->Get(docWithBody.namespace_(), docWithBody.uri()),
IsOkAndHolds(EqualsProto(docWithBody)));
@@ -3652,6 +3658,7 @@ TEST_F(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) {
SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
ASSERT_THAT(schema_store->SetSchema(schema), IsOk());
+ DocumentId docid = kInvalidDocumentId;
DocumentProto docWithBody =
DocumentBuilder()
.SetKey("icing", "email/1")
@@ -3684,12 +3691,8 @@ TEST_F(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) {
std::unique_ptr<DocumentStore> doc_store =
std::move(create_result.document_store);
- DocumentId docid = kInvalidDocumentId;
ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithBody));
- ASSERT_NE(docid, kInvalidDocumentId);
- docid = kInvalidDocumentId;
ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithoutBody));
- ASSERT_NE(docid, kInvalidDocumentId);
ASSERT_THAT(doc_store->Get(docWithBody.namespace_(), docWithBody.uri()),
IsOkAndHolds(EqualsProto(docWithBody)));
@@ -3832,8 +3835,7 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) {
// Check that we didn't lose anything. A migration also doesn't technically
// count as a recovery.
EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE));
- EXPECT_EQ(initialize_stats.document_store_recovery_cause(),
- InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT);
+ EXPECT_FALSE(initialize_stats.has_document_store_recovery_cause());
// Document 1 and 3 were put normally, and document 2 was deleted in our
// testdata files.
@@ -3856,164 +3858,6 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) {
}
#endif // DISABLE_BACKWARDS_COMPAT_TEST
-TEST_F(DocumentStoreTest, GetDebugInfo) {
- SchemaProto schema =
- SchemaBuilder()
- .AddType(SchemaTypeConfigBuilder()
- .SetType("email")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("body")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
- .AddType(SchemaTypeConfigBuilder().SetType("person").AddProperty(
- PropertyConfigBuilder()
- .SetName("name")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
- .Build();
- std::string schema_store_dir = schema_store_dir_ + "_custom";
- filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
- filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
-
- ICING_ASSERT_OK(schema_store->SetSchema(schema));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentStore::CreateResult create_result,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store.get()));
- std::unique_ptr<DocumentStore> document_store =
- std::move(create_result.document_store);
-
- DocumentProto document1 = DocumentBuilder()
- .SetKey("namespace1", "email/1")
- .SetSchema("email")
- .AddStringProperty("subject", "aa bb cc")
- .AddStringProperty("body", "dd ee")
- .SetCreationTimestampMs(1)
- .Build();
- ICING_ASSERT_OK(document_store->Put(document1, 5));
-
- DocumentProto document2 = DocumentBuilder()
- .SetKey("namespace2", "email/2")
- .SetSchema("email")
- .AddStringProperty("subject", "aa bb")
- .AddStringProperty("body", "cc")
- .SetCreationTimestampMs(1)
- .Build();
- ICING_ASSERT_OK(document_store->Put(document2, 3));
-
- DocumentProto document3 = DocumentBuilder()
- .SetKey("namespace2", "email/3")
- .SetSchema("email")
- .AddStringProperty("subject", "aa")
- .AddStringProperty("body", "")
- .SetCreationTimestampMs(1)
- .Build();
- ICING_ASSERT_OK(document_store->Put(document3, 1));
-
- DocumentProto document4 = DocumentBuilder()
- .SetKey("namespace1", "person/1")
- .SetSchema("person")
- .AddStringProperty("name", "test test")
- .SetCreationTimestampMs(1)
- .Build();
- ICING_ASSERT_OK(document_store->Put(document4, 2));
-
- ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out1,
- document_store->GetDebugInfo(/*verbosity=*/1));
- EXPECT_THAT(out1.crc(), Gt(0));
- EXPECT_THAT(out1.document_storage_info().num_alive_documents(), Eq(4));
- EXPECT_THAT(out1.document_storage_info().num_deleted_documents(), Eq(0));
- EXPECT_THAT(out1.document_storage_info().num_expired_documents(), Eq(0));
-
- DocumentDebugInfoProto::CorpusInfo info1, info2, info3;
- info1.set_namespace_("namespace1");
- info1.set_schema("email");
- info1.set_total_documents(1); // document1
- info1.set_total_token(5);
-
- info2.set_namespace_("namespace2");
- info2.set_schema("email");
- info2.set_total_documents(2); // document2 and document3
- info2.set_total_token(4); // 3 + 1
-
- info3.set_namespace_("namespace1");
- info3.set_schema("person");
- info3.set_total_documents(1); // document4
- info3.set_total_token(2);
-
- EXPECT_THAT(out1.corpus_info(),
- UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2),
- EqualsProto(info3)));
-
- // Delete document3.
- ICING_ASSERT_OK(document_store->Delete("namespace2", "email/3"));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out2,
- document_store->GetDebugInfo(/*verbosity=*/1));
- EXPECT_THAT(out2.crc(), Gt(0));
- EXPECT_THAT(out2.crc(), Not(Eq(out1.crc())));
- EXPECT_THAT(out2.document_storage_info().num_alive_documents(), Eq(3));
- EXPECT_THAT(out2.document_storage_info().num_deleted_documents(), Eq(1));
- EXPECT_THAT(out2.document_storage_info().num_expired_documents(), Eq(0));
- info2.set_total_documents(1); // document2
- info2.set_total_token(3);
- EXPECT_THAT(out2.corpus_info(),
- UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2),
- EqualsProto(info3)));
-
- ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out3,
- document_store->GetDebugInfo(/*verbosity=*/0));
- EXPECT_THAT(out3.corpus_info(), IsEmpty());
-}
-
-TEST_F(DocumentStoreTest, GetDebugInfoWithoutSchema) {
- std::string schema_store_dir = schema_store_dir_ + "_custom";
- filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
- filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentStore::CreateResult create_result,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store.get()));
- std::unique_ptr<DocumentStore> document_store =
- std::move(create_result.document_store);
- ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out,
- document_store->GetDebugInfo(/*verbosity=*/1));
- EXPECT_THAT(out.crc(), Gt(0));
- EXPECT_THAT(out.document_storage_info().num_alive_documents(), Eq(0));
- EXPECT_THAT(out.document_storage_info().num_deleted_documents(), Eq(0));
- EXPECT_THAT(out.document_storage_info().num_expired_documents(), Eq(0));
- EXPECT_THAT(out.corpus_info(), IsEmpty());
-}
-
-TEST_F(DocumentStoreTest, GetDebugInfoForEmptyDocumentStore) {
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentStore::CreateResult create_result,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
- std::unique_ptr<DocumentStore> document_store =
- std::move(create_result.document_store);
- ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out,
- document_store->GetDebugInfo(/*verbosity=*/1));
- EXPECT_THAT(out.crc(), Gt(0));
- EXPECT_THAT(out.document_storage_info().num_alive_documents(), Eq(0));
- EXPECT_THAT(out.document_storage_info().num_deleted_documents(), Eq(0));
- EXPECT_THAT(out.document_storage_info().num_expired_documents(), Eq(0));
- EXPECT_THAT(out.corpus_info(), IsEmpty());
-}
-
} // namespace
} // namespace lib
diff --git a/icing/store/namespace-checker-impl.h b/icing/store/namespace-checker-impl.h
deleted file mode 100644
index bcd0643..0000000
--- a/icing/store/namespace-checker-impl.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (C) 2021 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_STORE_NAMESPACE_CHECKER_IMPL_H_
-#define ICING_STORE_NAMESPACE_CHECKER_IMPL_H_
-
-#include "icing/store/document-id.h"
-#include "icing/store/document-store.h"
-#include "icing/store/namespace-checker.h"
-#include "icing/store/namespace-id.h"
-
-namespace icing {
-namespace lib {
-
-class NamespaceCheckerImpl : public NamespaceChecker {
- public:
- explicit NamespaceCheckerImpl(
- const DocumentStore* document_store,
- std::unordered_set<NamespaceId> target_namespace_ids)
- : document_store_(*document_store),
- target_namespace_ids_(std::move(target_namespace_ids)) {}
-
- bool BelongsToTargetNamespaces(DocumentId document_id) const override {
- if (target_namespace_ids_.empty()) {
- return true;
- }
- auto document_filter_data_or_ =
- document_store_.GetDocumentFilterData(document_id);
- return document_filter_data_or_.ok() &&
- target_namespace_ids_.count(
- document_filter_data_or_.ValueOrDie().namespace_id()) > 0;
- }
- const DocumentStore& document_store_;
- std::unordered_set<NamespaceId> target_namespace_ids_;
-};
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_STORE_NAMESPACE_CHECKER_IMPL_H_
\ No newline at end of file
diff --git a/icing/store/namespace-checker.h b/icing/store/namespace-checker.h
deleted file mode 100644
index 8812ab1..0000000
--- a/icing/store/namespace-checker.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (C) 2021 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_STORE_NAMESPACE_CHECKER_H_
-#define ICING_STORE_NAMESPACE_CHECKER_H_
-
-#include "icing/store/document-id.h"
-
-namespace icing {
-namespace lib {
-
-class NamespaceChecker {
- public:
- virtual ~NamespaceChecker() = default;
-
- // Check whether the given document id belongs to the target namespaces.
- // Returns:
- // On success,
- // - true: the given document id belongs to the target namespaces
- // - false: the given document id doesn't belong to the target namespaces
- // OUT_OF_RANGE if document_id is negative or exceeds previously seen
- // DocumentIds
- // NOT_FOUND if the document or the filter data is not found
- // INTERNAL_ERROR on all other errors
- virtual bool BelongsToTargetNamespaces(DocumentId document_id) const = 0;
-};
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_STORE_NAMESPACE_CHECKER_H_
diff --git a/icing/testing/always-true-namespace-checker-impl.h b/icing/testing/always-true-namespace-checker-impl.h
deleted file mode 100644
index f7744b6..0000000
--- a/icing/testing/always-true-namespace-checker-impl.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (C) 2021 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_TESTING_ALWAYS_TRUE_NAMESPACE_CHECKER_IMPL_H_
-#define ICING_TESTING_ALWAYS_TRUE_NAMESPACE_CHECKER_IMPL_H_
-
-#include "icing/store/document-id.h"
-#include "icing/store/namespace-checker.h"
-
-namespace icing {
-namespace lib {
-
-class AlwaysTrueNamespaceCheckerImpl : public NamespaceChecker {
- public:
- bool BelongsToTargetNamespaces(DocumentId document_id) const override {
- return true;
- }
-};
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_TESTING_ALWAYS_TRUE_NAMESPACE_CHECKER_IMPL_H_
\ No newline at end of file
diff --git a/icing/testing/common-matchers.h b/icing/testing/common-matchers.h
index f83fe0a..8d8bdf2 100644
--- a/icing/testing/common-matchers.h
+++ b/icing/testing/common-matchers.h
@@ -121,6 +121,7 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") {
const SchemaStore::SetSchemaResult& actual = arg;
if (actual.success == expected.success &&
+ actual.index_incompatible == expected.index_incompatible &&
actual.old_schema_type_ids_changed ==
expected.old_schema_type_ids_changed &&
actual.schema_types_deleted_by_name ==
@@ -130,12 +131,7 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") {
actual.schema_types_incompatible_by_name ==
expected.schema_types_incompatible_by_name &&
actual.schema_types_incompatible_by_id ==
- expected.schema_types_incompatible_by_id &&
- actual.schema_types_new_by_name == expected.schema_types_new_by_name &&
- actual.schema_types_changed_fully_compatible_by_name ==
- expected.schema_types_changed_fully_compatible_by_name &&
- actual.schema_types_index_incompatible_by_name ==
- expected.schema_types_index_incompatible_by_name) {
+ expected.schema_types_incompatible_by_id) {
return true;
}
@@ -195,82 +191,37 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") {
absl_ports::NumberFormatter()),
"]");
- // Format schema_types_new_by_name
- std::string actual_schema_types_new_by_name = absl_ports::StrCat(
- "[", absl_ports::StrJoin(actual.schema_types_new_by_name, ","), "]");
-
- std::string expected_schema_types_new_by_name = absl_ports::StrCat(
- "[", absl_ports::StrJoin(expected.schema_types_new_by_name, ","), "]");
-
- // Format schema_types_changed_fully_compatible_by_name
- std::string actual_schema_types_changed_fully_compatible_by_name =
- absl_ports::StrCat(
- "[",
- absl_ports::StrJoin(
- actual.schema_types_changed_fully_compatible_by_name, ","),
- "]");
-
- std::string expected_schema_types_changed_fully_compatible_by_name =
- absl_ports::StrCat(
- "[",
- absl_ports::StrJoin(
- expected.schema_types_changed_fully_compatible_by_name, ","),
- "]");
-
- // Format schema_types_index_incompatible_by_name
- std::string actual_schema_types_index_incompatible_by_name =
- absl_ports::StrCat(
- "[",
- absl_ports::StrJoin(actual.schema_types_index_incompatible_by_name,
- ","),
- "]");
-
- std::string expected_schema_types_index_incompatible_by_name =
- absl_ports::StrCat(
- "[",
- absl_ports::StrJoin(expected.schema_types_index_incompatible_by_name,
- ","),
- "]");
-
*result_listener << IcingStringUtil::StringPrintf(
"\nExpected {\n"
"\tsuccess=%d,\n"
+ "\tindex_incompatible=%d,\n"
"\told_schema_type_ids_changed=%s,\n"
"\tschema_types_deleted_by_name=%s,\n"
"\tschema_types_deleted_by_id=%s,\n"
"\tschema_types_incompatible_by_name=%s,\n"
"\tschema_types_incompatible_by_id=%s\n"
- "\tschema_types_new_by_name=%s,\n"
- "\tschema_types_index_incompatible_by_name=%s,\n"
- "\tschema_types_changed_fully_compatible_by_name=%s\n"
"}\n"
"Actual {\n"
"\tsuccess=%d,\n"
+ "\tindex_incompatible=%d,\n"
"\told_schema_type_ids_changed=%s,\n"
"\tschema_types_deleted_by_name=%s,\n"
"\tschema_types_deleted_by_id=%s,\n"
"\tschema_types_incompatible_by_name=%s,\n"
"\tschema_types_incompatible_by_id=%s\n"
- "\tschema_types_new_by_name=%s,\n"
- "\tschema_types_index_incompatible_by_name=%s,\n"
- "\tschema_types_changed_fully_compatible_by_name=%s\n"
"}\n",
- expected.success, expected_old_schema_type_ids_changed.c_str(),
+ expected.success, expected.index_incompatible,
+ expected_old_schema_type_ids_changed.c_str(),
expected_schema_types_deleted_by_name.c_str(),
expected_schema_types_deleted_by_id.c_str(),
expected_schema_types_incompatible_by_name.c_str(),
- expected_schema_types_incompatible_by_id.c_str(),
- expected_schema_types_new_by_name.c_str(),
- expected_schema_types_changed_fully_compatible_by_name.c_str(),
- expected_schema_types_index_incompatible_by_name.c_str(), actual.success,
- actual_old_schema_type_ids_changed.c_str(),
+ expected_schema_types_incompatible_by_id.c_str(), actual.success,
+ actual.index_incompatible, actual_old_schema_type_ids_changed.c_str(),
actual_schema_types_deleted_by_name.c_str(),
actual_schema_types_deleted_by_id.c_str(),
actual_schema_types_incompatible_by_name.c_str(),
- actual_schema_types_incompatible_by_id.c_str(),
- actual_schema_types_new_by_name.c_str(),
- actual_schema_types_changed_fully_compatible_by_name.c_str(),
- actual_schema_types_index_incompatible_by_name.c_str());
+ actual_schema_types_incompatible_by_id.c_str());
+
return false;
}
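// [Editor's note: an illustrative use of the trimmed matcher, not part of
// this patch; `actual_result` stands for whatever SetSchema() returned.]
SchemaStore::SetSchemaResult expected;
expected.success = true;
expected.index_incompatible = false;
EXPECT_THAT(actual_result, EqualsSetSchemaResult(expected));
// Only the fields the matcher still compares (success, index_incompatible,
// old_schema_type_ids_changed, and the deleted/incompatible sets) can now
// affect the outcome.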
diff --git a/icing/testing/random-string.cc b/icing/testing/random-string.cc
deleted file mode 100644
index 27f83bc..0000000
--- a/icing/testing/random-string.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/testing/random-string.h"
-
-namespace icing {
-namespace lib {
-
-std::vector<std::string> GenerateUniqueTerms(int num_terms) {
- char before_a = 'a' - 1;
- std::string term(1, before_a);
- std::vector<std::string> terms;
- int current_char = 0;
- for (int permutation = 0; permutation < num_terms; ++permutation) {
- if (term[current_char] != 'z') {
- ++term[current_char];
- } else {
- if (current_char < term.length() - 1) {
- // The string currently looks something like this "zzzaa"
- // 1. Find the first char after this one that isn't 'z'
- current_char = term.find_first_not_of('z', current_char);
- if (current_char != std::string::npos) {
- // 2. Increment that character
- ++term[current_char];
-
- // 3. Set every character prior to current_char to 'a'
- term.replace(0, current_char, current_char, 'a');
- } else {
- // Every character in this string is a 'z'. We need to grow.
- term = std::string(term.length() + 1, 'a');
- }
- } else {
- term = std::string(term.length() + 1, 'a');
- }
- current_char = 0;
- }
- terms.push_back(term);
- }
- return terms;
-}
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/testing/random-string.h b/icing/testing/random-string.h
index fd8d87b..ac36924 100644
--- a/icing/testing/random-string.h
+++ b/icing/testing/random-string.h
@@ -15,7 +15,6 @@
#ifndef ICING_TESTING_RANDOM_STRING_H_
#define ICING_TESTING_RANDOM_STRING_H_
-#include <algorithm>
#include <random>
#include <string>
@@ -37,10 +36,6 @@ std::string RandomString(const std::string_view alphabet, size_t len,
return result;
}
-// Returns a vector containing num_terms unique terms. Terms are created in
-// non-random order starting with "a" to "z" to "aa" to "zz", etc.
-std::vector<std::string> GenerateUniqueTerms(int num_terms);
-
} // namespace lib
} // namespace icing
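// [Editor's note: RandomString itself survives this cleanup. Assuming the
// elided third parameter is a pointer to a standard random engine (it is
// not visible in this hunk), usage would look like:]
//   std::default_random_engine gen(/*seed=*/12345);
//   std::string s = RandomString("abcdef", /*len=*/8, &gen);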
diff --git a/icing/testing/random-string_test.cc b/icing/testing/random-string_test.cc
deleted file mode 100644
index 759fec0..0000000
--- a/icing/testing/random-string_test.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/testing/random-string.h"
-
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-
-using ::testing::ElementsAre;
-using ::testing::Eq;
-using ::testing::IsEmpty;
-
-namespace icing {
-namespace lib {
-
-namespace {
-
-TEST(RandomStringTest, GenerateUniqueTerms) {
- EXPECT_THAT(GenerateUniqueTerms(0), IsEmpty());
- EXPECT_THAT(GenerateUniqueTerms(1), ElementsAre("a"));
- EXPECT_THAT(GenerateUniqueTerms(4), ElementsAre("a", "b", "c", "d"));
- EXPECT_THAT(GenerateUniqueTerms(29),
- ElementsAre("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k",
- "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v",
- "w", "x", "y", "z", "aa", "ba", "ca"));
- EXPECT_THAT(GenerateUniqueTerms(56),
- ElementsAre("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k",
- "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v",
- "w", "x", "y", "z", "aa", "ba", "ca", "da", "ea",
- "fa", "ga", "ha", "ia", "ja", "ka", "la", "ma", "na",
- "oa", "pa", "qa", "ra", "sa", "ta", "ua", "va", "wa",
- "xa", "ya", "za", "ab", "bb", "cb", "db"));
- EXPECT_THAT(GenerateUniqueTerms(56).at(54), Eq("cb"));
- EXPECT_THAT(GenerateUniqueTerms(26 * 26 * 26).at(26), Eq("aa"));
- EXPECT_THAT(GenerateUniqueTerms(26 * 26 * 26).at(26 * 27), Eq("aaa"));
- EXPECT_THAT(GenerateUniqueTerms(26 * 26 * 26).at(26 * 27 - 6), Eq("uz"));
- EXPECT_THAT(GenerateUniqueTerms(26 * 26 * 26).at(26 * 27 + 5), Eq("faa"));
-}
-
-} // namespace
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/testing/snippet-helpers.cc b/icing/testing/snippet-helpers.cc
index 7a71987..cfd20c2 100644
--- a/icing/testing/snippet-helpers.cc
+++ b/icing/testing/snippet-helpers.cc
@@ -77,16 +77,6 @@ std::vector<std::string_view> GetMatches(
return matches;
}
-std::vector<std::string_view> GetSubMatches(
- std::string_view content, const SnippetProto::EntryProto& snippet_proto) {
- std::vector<std::string_view> matches;
- for (const SnippetMatchProto& match : snippet_proto.snippet_matches()) {
- matches.push_back(content.substr(match.exact_match_byte_position(),
- match.submatch_byte_length()));
- }
- return matches;
-}
-
std::string_view GetString(const DocumentProto* document,
std::string_view property_path) {
std::vector<std::string_view> properties =
diff --git a/icing/testing/snippet-helpers.h b/icing/testing/snippet-helpers.h
index 73b2ce2..defadeb 100644
--- a/icing/testing/snippet-helpers.h
+++ b/icing/testing/snippet-helpers.h
@@ -40,10 +40,6 @@ std::vector<std::string_view> GetWindows(
std::vector<std::string_view> GetMatches(
std::string_view content, const SnippetProto::EntryProto& snippet_proto);
-// Retrieves all submatches defined by the snippet_proto for the content.
-std::vector<std::string_view> GetSubMatches(
- std::string_view content, const SnippetProto::EntryProto& snippet_proto);
-
// Retrieves the string value held in the document corresponding to the
// property_path.
// Example:
diff --git a/icing/tokenization/combined-tokenizer_test.cc b/icing/tokenization/combined-tokenizer_test.cc
deleted file mode 100644
index 0212e4f..0000000
--- a/icing/tokenization/combined-tokenizer_test.cc
+++ /dev/null
@@ -1,232 +0,0 @@
-// Copyright (C) 2022 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string_view>
-#include <vector>
-
-#include "testing/base/public/gmock.h"
-#include "testing/base/public/gunit.h"
-#include "third_party/icing/portable/platform.h"
-#include "third_party/icing/proto/schema_proto_portable.pb.h"
-#include "third_party/icing/testing/common-matchers.h"
-#include "third_party/icing/testing/icu-data-file-helper.h"
-#include "third_party/icing/testing/jni-test-helpers.h"
-#include "third_party/icing/testing/test-data.h"
-#include "third_party/icing/tokenization/language-segmenter-factory.h"
-#include "third_party/icing/tokenization/language-segmenter.h"
-#include "third_party/icing/tokenization/tokenizer-factory.h"
-#include "third_party/icing/tokenization/tokenizer.h"
-#include "third_party/icu/include/unicode/uloc.h"
-
-namespace icing {
-namespace lib {
-
-namespace {
-
-using ::testing::ElementsAre;
-
-// This test exists to ensure that the different tokenizers treat different
-// segments of text in the same manner.
-class CombinedTokenizerTest : public ::testing::Test {
- protected:
- void SetUp() override {
- if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
- ICING_ASSERT_OK(
- // File generated via icu_data_file rule in //third_party/icing/BUILD.
- icu_data_file_helper::SetUpICUDataFile(
- GetTestFilePath("third_party/icing/icu.dat")));
- }
- jni_cache_ = GetTestJniCache();
-
- language_segmenter_factory::SegmenterOptions options(ULOC_US,
- jni_cache_.get());
- ICING_ASSERT_OK_AND_ASSIGN(
- lang_segmenter_,
- language_segmenter_factory::Create(std::move(options)));
- }
-
- std::unique_ptr<const JniCache> jni_cache_;
- std::unique_ptr<LanguageSegmenter> lang_segmenter_;
-};
-
-std::vector<std::string> GetTokenTerms(const std::vector<Token>& tokens) {
- std::vector<std::string> terms;
- terms.reserve(tokens.size());
- for (const Token& token : tokens) {
- if (token.type == Token::Type::REGULAR) {
- terms.push_back(std::string(token.text));
- }
- }
- return terms;
-}
-
-} // namespace
-
-TEST_F(CombinedTokenizerTest, SpecialCharacters) {
- const std::string_view kText = "😊 Hello! Goodbye?";
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> indexing_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> query_tokenizer,
- CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
- lang_segmenter_.get()));
-
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
- indexing_tokenizer->TokenizeAll(kText));
- std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
- EXPECT_THAT(indexing_terms, ElementsAre("😊", "Hello", "Goodbye"));
-
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
- query_tokenizer->TokenizeAll(kText));
- std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
- EXPECT_THAT(query_terms, ElementsAre("😊", "Hello", "Goodbye"));
-}
-
-TEST_F(CombinedTokenizerTest, Parentheses) {
- const std::string_view kText = "((paren1)(paren2) (last paren))";
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> indexing_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> query_tokenizer,
- CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
- lang_segmenter_.get()));
-
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
- indexing_tokenizer->TokenizeAll(kText));
- std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
- EXPECT_THAT(indexing_terms, ElementsAre("paren1", "paren2", "last", "paren"));
-
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
- query_tokenizer->TokenizeAll(kText));
- std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
- EXPECT_THAT(query_terms, ElementsAre("paren1", "paren2", "last", "paren"));
-}
-
-TEST_F(CombinedTokenizerTest, Negation) {
- const std::string_view kText = "-foo -bar -baz";
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> indexing_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> query_tokenizer,
- CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
- lang_segmenter_.get()));
-
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
- indexing_tokenizer->TokenizeAll(kText));
- std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
- EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz"));
-
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
- query_tokenizer->TokenizeAll(kText));
- std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
- EXPECT_THAT(query_terms, ElementsAre("foo", "bar", "baz"));
-}
-
-TEST_F(CombinedTokenizerTest, Colons) {
- const std::string_view kText = ":foo: :bar baz:";
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> indexing_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> query_tokenizer,
- CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
- lang_segmenter_.get()));
-
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
- indexing_tokenizer->TokenizeAll(kText));
- std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
- EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz"));
-
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
- query_tokenizer->TokenizeAll(kText));
- std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
- EXPECT_THAT(query_terms, ElementsAre("foo", "bar", "baz"));
-}
-
-TEST_F(CombinedTokenizerTest, ColonsPropertyRestricts) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> indexing_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> query_tokenizer,
- CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
- lang_segmenter_.get()));
-
- // This is a difference between the two tokenizers. "foo:bar" is a single
- // token to the plain tokenizer because ':' is a word connector. But "foo:bar"
- // is a property restrict to the query tokenizer - so "foo" is the property
- // and "bar" is the only text term.
- constexpr std::string_view kText = "foo:bar";
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
- indexing_tokenizer->TokenizeAll(kText));
- std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
- EXPECT_THAT(indexing_terms, ElementsAre("foo:bar"));
-
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
- query_tokenizer->TokenizeAll(kText));
- std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
- EXPECT_THAT(query_terms, ElementsAre("bar"));
-
- // This difference, however, should only apply to the first ':'. A
- // second ':' should be treated by both tokenizers as a word connector.
- constexpr std::string_view kText2 = "foo:bar:baz";
- ICING_ASSERT_OK_AND_ASSIGN(indexing_tokens,
- indexing_tokenizer->TokenizeAll(kText2));
- indexing_terms = GetTokenTerms(indexing_tokens);
- EXPECT_THAT(indexing_terms, ElementsAre("foo:bar:baz"));
-
- ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
- query_tokenizer->TokenizeAll(kText2));
- query_terms = GetTokenTerms(query_tokens);
- EXPECT_THAT(query_terms, ElementsAre("bar:baz"));
-}
-
-TEST_F(CombinedTokenizerTest, Punctuation) {
- const std::string_view kText = "Who? What!? Why & How.";
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> indexing_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> query_tokenizer,
- CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
- lang_segmenter_.get()));
-
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
- indexing_tokenizer->TokenizeAll(kText));
- std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
- EXPECT_THAT(indexing_terms, ElementsAre("Who", "What", "Why", "How"));
-
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
- query_tokenizer->TokenizeAll(kText));
- std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
- EXPECT_THAT(query_terms, ElementsAre("Who", "What", "Why", "How"));
-}
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc
index dc7b0a4..cb31441 100644
--- a/icing/tokenization/icu/icu-language-segmenter.cc
+++ b/icing/tokenization/icu/icu-language-segmenter.cc
@@ -59,7 +59,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
~IcuLanguageSegmenterIterator() {
ubrk_close(break_iterator_);
- utext_close(u_text_);
+ utext_close(&u_text_);
}
// Advances to the next term. Returns false if it has reached the end.
@@ -83,6 +83,9 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
return false;
}
+ if (!IsValidSegment()) {
+ return Advance();
+ }
return true;
}
@@ -223,7 +226,8 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
return absl_ports::AbortedError(
"Could not retrieve valid utf8 character!");
}
- if (term_end_index_exclusive_ > offset_iterator_.utf8_index()) {
+ if (term_end_index_exclusive_ > offset_iterator_.utf8_index() ||
+ !IsValidSegment()) {
return ResetToTermEndingBeforeUtf32(term_start_iterator.utf32_index());
}
return term_start_iterator.utf32_index();
@@ -249,7 +253,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
: break_iterator_(nullptr),
text_(text),
locale_(locale),
- u_text_(nullptr),
+ u_text_(UTEXT_INITIALIZER),
offset_iterator_(text),
term_start_index_(0),
term_end_index_exclusive_(0) {}
@@ -257,13 +261,10 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// Returns true on success
bool Initialize() {
UErrorCode status = U_ZERO_ERROR;
- u_text_ = utext_openUTF8(nullptr, text_.data(), text_.length(), &status);
- if (u_text_ == nullptr) {
- return false;
- }
+ utext_openUTF8(&u_text_, text_.data(), text_.length(), &status);
break_iterator_ = ubrk_open(UBRK_WORD, locale_.data(), /*text=*/nullptr,
/*textLength=*/0, &status);
- ubrk_setUText(break_iterator_, u_text_, &status);
+ ubrk_setUText(break_iterator_, &u_text_, &status);
return !U_FAILURE(status);
}
@@ -290,6 +291,23 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
term_start_index_ = 0;
}
+ bool IsValidSegment() const {
+ // Rule 1: all ASCII terms will be returned.
+ // We know it's an ASCII term by checking the first char.
+ if (i18n_utils::IsAscii(text_[term_start_index_])) {
+ return true;
+ }
+
+ UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(),
+ term_start_index_);
+ // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
+ // We know it's an alphabetic term by checking the first unicode character.
+ if (u_isUAlphabetic(uchar32)) {
+ return true;
+ }
+ return false;
+ }
+
// The underlying class that does the segmentation; ubrk_close() must be
// called after use.
UBreakIterator* break_iterator_;
@@ -303,8 +321,8 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
std::string_view locale_;
// A thin wrapper around the input UTF8 text, needed by break_iterator_.
- // Allocated by calling utext_openUtf8() and freed by calling utext_close().
- UText* u_text_;
+ // utext_close() must be called after use.
+ UText u_text_;
// Offset iterator. This iterator is not guaranteed to point to any particular
// character, but is guaranteed to point to a valid UTF character sequence.
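For context, a minimal sketch of the UText lifecycle the hunks above move to,
assuming ICU4C's utext API from unicode/utext.h (the function names are ICU's;
the sketch itself is illustrative, not part of the commit):

    #include <unicode/utext.h>

    // A stack-allocated UText must start life as UTEXT_INITIALIZER.
    // utext_openUTF8() then fills the caller-provided struct in place rather
    // than heap-allocating one, so there is no returned pointer to null-check;
    // utext_close() frees any internal buffers but not the struct itself.
    void UTextLifecycleSketch(const char* utf8, int64_t len) {
      UErrorCode status = U_ZERO_ERROR;
      UText u_text = UTEXT_INITIALIZER;
      utext_openUTF8(&u_text, utf8, len, &status);
      if (U_FAILURE(status)) {
        return;
      }
      // ... hand &u_text to ubrk_setUText() and iterate ...
      utext_close(&u_text);  // safe for stack instances
    }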
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
index 4098be5..01eb7d8 100644
--- a/icing/tokenization/icu/icu-language-segmenter_test.cc
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc
@@ -21,8 +21,8 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
-#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/icu-i18n-test-utils.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
@@ -191,7 +191,7 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) {
// Full-width (non-ASCII) punctuation marks and special characters are left
// out.
EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"),
- IsOkAndHolds(ElementsAre("。", "?", "·", "Hello", "!", "×")));
+ IsOkAndHolds(ElementsAre("Hello")));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) {
@@ -252,9 +252,9 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) {
// Connectors don't connect if one side is an invalid term (?)
EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz:?"),
- IsOkAndHolds(ElementsAre("bar:baz", ":", "?")));
+ IsOkAndHolds(ElementsAre("bar:baz", ":")));
EXPECT_THAT(language_segmenter->GetAllTerms("?:bar:baz"),
- IsOkAndHolds(ElementsAre("?", ":", "bar:baz")));
+ IsOkAndHolds(ElementsAre(":", "bar:baz")));
EXPECT_THAT(language_segmenter->GetAllTerms("3:14"),
IsOkAndHolds(ElementsAre("3", ":", "14")));
EXPECT_THAT(language_segmenter->GetAllTerms("私:は"),
@@ -372,15 +372,6 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) {
IsOkAndHolds(ElementsAre("-", "123")));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, FullWidthNumbers) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(
- GetSegmenterOptions(GetLocale(), jni_cache_.get())));
- EXPECT_THAT(language_segmenter->GetAllTerms("0123456789"),
- IsOkAndHolds(ElementsAre("0123456789")));
-}
-
TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
@@ -417,16 +408,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) {
// have whitespaces as word delimiter.
// Chinese
- EXPECT_THAT(
- language_segmenter->GetAllTerms("我每天走路去上班。"),
- IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班", "。")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"),
+ IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班")));
// Japanese
EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"),
IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩",
- "い", "てい", "ます", "。")));
+ "い", "てい", "ます")));
// Khmer
EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
- IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ", "។")));
+ IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ")));
// Thai
EXPECT_THAT(
language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"),
@@ -859,19 +849,16 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kChinese));
// String: "我每天走路去上班。"
- // ^ ^ ^ ^^ ^
- // UTF-8 idx: 0 3 9 15 18 24
- // UTF-32 idx: 0 1 3 5 6 8
+ // ^ ^ ^ ^^
+ // UTF-8 idx: 0 3 9 15 18
+ // UTF-32 idx: 0 1 3 5 6
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("每天"));
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("走路"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
- EXPECT_THAT(itr->GetTerm(), Eq("。"));
-
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(8),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
@@ -886,21 +873,18 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kJapanese));
// String: "私は毎日仕事に歩いています。"
- // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
- // UTF-8 idx: 0 3 6 12 18 21 24 27 33 39
- // UTF-32 idx: 0 1 2 4 6 7 8 9 11 13
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 6 12 18 21 24 27 33
+ // UTF-32 idx: 0 1 2 4 6 7 8 9 11
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("は"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(13),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(11),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4)));
EXPECT_THAT(itr->GetTerm(), Eq("仕事"));
-
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13)));
- EXPECT_THAT(itr->GetTerm(), Eq("。"));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfterUtf32) {
@@ -912,16 +896,13 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kKhmer));
// String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
- // ^ ^ ^ ^ ^
- // UTF-8 idx: 0 9 24 45 69
- // UTF-32 idx: 0 3 8 15 23
+ // ^ ^ ^ ^
+ // UTF-8 idx: 0 9 24 45
+ // UTF-32 idx: 0 3 8 15
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), IsOkAndHolds(Eq(23)));
- EXPECT_THAT(itr->GetTerm(), Eq("។"));
-
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(23),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
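The expectation changes in this file follow from the IsValidSegment() filter
added above: a segment whose first character is non-ASCII is kept only if that
character is alphabetic, so CJK and Khmer punctuation such as "。" and "។" no
longer comes back as a term. The same property check explains the deleted
FullWidthNumbers test, assuming standard Unicode property data: full-width
digits are general category Nd and do not carry the Alphabetic property.

    u_isUAlphabetic(U'0');   // false: U+FF10 is a digit (Nd), not Alphabetic
    u_isUAlphabetic(U'我');  // true: Han ideographs are Alphabetic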
diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc
index 3aff45c..d293581 100644
--- a/icing/tokenization/language-segmenter-iterator_test.cc
+++ b/icing/tokenization/language-segmenter-iterator_test.cc
@@ -15,9 +15,9 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/platform.h"
#include "icing/testing/common-matchers.h"
-#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc
index 6f7d4df..bd86169 100644
--- a/icing/tokenization/language-segmenter_benchmark.cc
+++ b/icing/tokenization/language-segmenter_benchmark.cc
@@ -14,8 +14,8 @@
#include "testing/base/public/benchmark.h"
#include "gmock/gmock.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
-#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
diff --git a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc
index 7a1949f..13fe550 100644
--- a/icing/tokenization/plain-tokenizer.cc
+++ b/icing/tokenization/plain-tokenizer.cc
@@ -66,9 +66,9 @@ class PlainTokenIterator : public Tokenizer::Iterator {
Token GetToken() const override {
if (current_term_.empty()) {
- return Token(Token::Type::INVALID);
+ return Token(Token::INVALID);
}
- return Token(Token::Type::REGULAR, current_term_);
+ return Token(Token::REGULAR, current_term_);
}
libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
@@ -81,8 +81,8 @@ class PlainTokenIterator : public Tokenizer::Iterator {
return base_iterator_->CalculateTermEndExclusive();
}
- bool ResetToTokenStartingAfter(int32_t utf32_offset) override {
- if (!base_iterator_->ResetToTermStartingAfterUtf32(utf32_offset).ok()) {
+ bool ResetToTokenAfter(int32_t offset) override {
+ if (!base_iterator_->ResetToTermStartingAfterUtf32(offset).ok()) {
return false;
}
current_term_ = base_iterator_->GetTerm();
@@ -93,17 +93,15 @@ class PlainTokenIterator : public Tokenizer::Iterator {
return true;
}
- bool ResetToTokenEndingBefore(int32_t utf32_offset) override {
+ bool ResetToTokenBefore(int32_t offset) override {
ICING_ASSIGN_OR_RETURN(
- utf32_offset,
- base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false);
+ offset, base_iterator_->ResetToTermEndingBeforeUtf32(offset), false);
current_term_ = base_iterator_->GetTerm();
while (!IsValidTerm(current_term_)) {
// Haven't found a valid term yet. Retrieve the term prior to this one
// from the segmenter.
ICING_ASSIGN_OR_RETURN(
- utf32_offset,
- base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false);
+ offset, base_iterator_->ResetToTermEndingBeforeUtf32(offset), false);
current_term_ = base_iterator_->GetTerm();
}
return true;
diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc
index c48b51e..7490bfa 100644
--- a/icing/tokenization/plain-tokenizer_test.cc
+++ b/icing/tokenization/plain-tokenizer_test.cc
@@ -18,9 +18,9 @@
#include "gmock/gmock.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/platform.h"
#include "icing/testing/common-matchers.h"
-#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/icu-i18n-test-utils.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
@@ -68,27 +68,26 @@ TEST_F(PlainTokenizerTest, Simple) {
EXPECT_THAT(plain_tokenizer->TokenizeAll(""), IsOkAndHolds(IsEmpty()));
- EXPECT_THAT(
- plain_tokenizer->TokenizeAll("Hello World"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
- EqualsToken(Token::Type::REGULAR, "World"))));
+ EXPECT_THAT(plain_tokenizer->TokenizeAll("Hello World"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
+ EqualsToken(Token::REGULAR, "World"))));
EXPECT_THAT(
plain_tokenizer->TokenizeAll(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. "
"Duis efficitur iaculis auctor."),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Lorem"),
- EqualsToken(Token::Type::REGULAR, "ipsum"),
- EqualsToken(Token::Type::REGULAR, "dolor"),
- EqualsToken(Token::Type::REGULAR, "sit"),
- EqualsToken(Token::Type::REGULAR, "amet"),
- EqualsToken(Token::Type::REGULAR, "consectetur"),
- EqualsToken(Token::Type::REGULAR, "adipiscing"),
- EqualsToken(Token::Type::REGULAR, "elit"),
- EqualsToken(Token::Type::REGULAR, "Duis"),
- EqualsToken(Token::Type::REGULAR, "efficitur"),
- EqualsToken(Token::Type::REGULAR, "iaculis"),
- EqualsToken(Token::Type::REGULAR, "auctor"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Lorem"),
+ EqualsToken(Token::REGULAR, "ipsum"),
+ EqualsToken(Token::REGULAR, "dolor"),
+ EqualsToken(Token::REGULAR, "sit"),
+ EqualsToken(Token::REGULAR, "amet"),
+ EqualsToken(Token::REGULAR, "consectetur"),
+ EqualsToken(Token::REGULAR, "adipiscing"),
+ EqualsToken(Token::REGULAR, "elit"),
+ EqualsToken(Token::REGULAR, "Duis"),
+ EqualsToken(Token::REGULAR, "efficitur"),
+ EqualsToken(Token::REGULAR, "iaculis"),
+ EqualsToken(Token::REGULAR, "auctor"))));
}
TEST_F(PlainTokenizerTest, Whitespace) {
@@ -108,18 +107,16 @@ TEST_F(PlainTokenizerTest, Whitespace) {
// 0x0009 is horizontal tab, considered as a whitespace
std::string text_with_horizontal_tab =
absl_ports::StrCat("Hello", UCharToString(0x0009), "World");
- EXPECT_THAT(
- plain_tokenizer->TokenizeAll(text_with_horizontal_tab),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
- EqualsToken(Token::Type::REGULAR, "World"))));
+ EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_horizontal_tab),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
+ EqualsToken(Token::REGULAR, "World"))));
// 0x000B is vertical tab, considered as a whitespace
std::string text_with_vertical_tab =
absl_ports::StrCat("Hello", UCharToString(0x000B), "World");
- EXPECT_THAT(
- plain_tokenizer->TokenizeAll(text_with_vertical_tab),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
- EqualsToken(Token::Type::REGULAR, "World"))));
+ EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_vertical_tab),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
+ EqualsToken(Token::REGULAR, "World"))));
}
TEST_F(PlainTokenizerTest, Punctuation) {
@@ -134,39 +131,38 @@ TEST_F(PlainTokenizerTest, Punctuation) {
language_segmenter.get()));
// Half-width punctuation marks are filtered out.
- EXPECT_THAT(
- plain_tokenizer->TokenizeAll(
- "Hello, World! Hello: World. \"Hello\" World?"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
- EqualsToken(Token::Type::REGULAR, "World"),
- EqualsToken(Token::Type::REGULAR, "Hello"),
- EqualsToken(Token::Type::REGULAR, "World"),
- EqualsToken(Token::Type::REGULAR, "Hello"),
- EqualsToken(Token::Type::REGULAR, "World"))));
+ EXPECT_THAT(plain_tokenizer->TokenizeAll(
+ "Hello, World! Hello: World. \"Hello\" World?"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
+ EqualsToken(Token::REGULAR, "World"),
+ EqualsToken(Token::REGULAR, "Hello"),
+ EqualsToken(Token::REGULAR, "World"),
+ EqualsToken(Token::REGULAR, "Hello"),
+ EqualsToken(Token::REGULAR, "World"))));
// Full-width punctuation marks are filtered out.
std::vector<std::string_view> exp_tokens;
if (IsCfStringTokenization()) {
EXPECT_THAT(
plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "你"),
- EqualsToken(Token::Type::REGULAR, "好"),
- EqualsToken(Token::Type::REGULAR, "世界"),
- EqualsToken(Token::Type::REGULAR, "你"),
- EqualsToken(Token::Type::REGULAR, "好"),
- EqualsToken(Token::Type::REGULAR, "世界"),
- EqualsToken(Token::Type::REGULAR, "你"),
- EqualsToken(Token::Type::REGULAR, "好"),
- EqualsToken(Token::Type::REGULAR, "世界"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "你"),
+ EqualsToken(Token::REGULAR, "好"),
+ EqualsToken(Token::REGULAR, "世界"),
+ EqualsToken(Token::REGULAR, "你"),
+ EqualsToken(Token::REGULAR, "好"),
+ EqualsToken(Token::REGULAR, "世界"),
+ EqualsToken(Token::REGULAR, "你"),
+ EqualsToken(Token::REGULAR, "好"),
+ EqualsToken(Token::REGULAR, "世界"))));
} else {
EXPECT_THAT(
plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "你好"),
- EqualsToken(Token::Type::REGULAR, "世界"),
- EqualsToken(Token::Type::REGULAR, "你好"),
- EqualsToken(Token::Type::REGULAR, "世界"),
- EqualsToken(Token::Type::REGULAR, "你好"),
- EqualsToken(Token::Type::REGULAR, "世界"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "你好"),
+ EqualsToken(Token::REGULAR, "世界"),
+ EqualsToken(Token::REGULAR, "你好"),
+ EqualsToken(Token::REGULAR, "世界"),
+ EqualsToken(Token::REGULAR, "你好"),
+ EqualsToken(Token::REGULAR, "世界"))));
}
}
@@ -184,16 +180,14 @@ TEST_F(PlainTokenizerTest, SpecialCharacters) {
// Right now we don't have special logic for these characters, just output
// them as tokens.
- EXPECT_THAT(
- plain_tokenizer->TokenizeAll("1+1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "1"),
- EqualsToken(Token::Type::REGULAR, "+"),
- EqualsToken(Token::Type::REGULAR, "1"))));
+ EXPECT_THAT(plain_tokenizer->TokenizeAll("1+1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "1"),
+ EqualsToken(Token::REGULAR, "+"),
+ EqualsToken(Token::REGULAR, "1"))));
- EXPECT_THAT(
- plain_tokenizer->TokenizeAll("$50"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "$"),
- EqualsToken(Token::Type::REGULAR, "50"))));
+ EXPECT_THAT(plain_tokenizer->TokenizeAll("$50"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "$"),
+ EqualsToken(Token::REGULAR, "50"))));
}
TEST_F(PlainTokenizerTest, CJKT) {
@@ -209,13 +203,12 @@ TEST_F(PlainTokenizerTest, CJKT) {
tokenizer_factory::CreateIndexingTokenizer(
StringIndexingConfig::TokenizerType::PLAIN,
language_segmenter.get()));
- EXPECT_THAT(
- plain_tokenizer->TokenizeAll("我每天走路去上班。"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "我"),
- EqualsToken(Token::Type::REGULAR, "每天"),
- EqualsToken(Token::Type::REGULAR, "走路"),
- EqualsToken(Token::Type::REGULAR, "去"),
- EqualsToken(Token::Type::REGULAR, "上班"))));
+ EXPECT_THAT(plain_tokenizer->TokenizeAll("我每天走路去上班。"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "我"),
+ EqualsToken(Token::REGULAR, "每天"),
+ EqualsToken(Token::REGULAR, "走路"),
+ EqualsToken(Token::REGULAR, "去"),
+ EqualsToken(Token::REGULAR, "上班"))));
// Japanese
options = language_segmenter_factory::SegmenterOptions(ULOC_JAPANESE,
jni_cache_.get());
@@ -227,44 +220,41 @@ TEST_F(PlainTokenizerTest, CJKT) {
StringIndexingConfig::TokenizerType::PLAIN,
language_segmenter.get()));
if (IsCfStringTokenization()) {
- EXPECT_THAT(
- plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "私"),
- EqualsToken(Token::Type::REGULAR, "は"),
- EqualsToken(Token::Type::REGULAR, "毎日"),
- EqualsToken(Token::Type::REGULAR, "仕事"),
- EqualsToken(Token::Type::REGULAR, "に"),
- EqualsToken(Token::Type::REGULAR, "歩い"),
- EqualsToken(Token::Type::REGULAR, "て"),
- EqualsToken(Token::Type::REGULAR, "い"),
- EqualsToken(Token::Type::REGULAR, "ます"))));
+ EXPECT_THAT(plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "私"),
+ EqualsToken(Token::REGULAR, "は"),
+ EqualsToken(Token::REGULAR, "毎日"),
+ EqualsToken(Token::REGULAR, "仕事"),
+ EqualsToken(Token::REGULAR, "に"),
+ EqualsToken(Token::REGULAR, "歩い"),
+ EqualsToken(Token::REGULAR, "て"),
+ EqualsToken(Token::REGULAR, "い"),
+ EqualsToken(Token::REGULAR, "ます"))));
} else {
- EXPECT_THAT(
- plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "私"),
- EqualsToken(Token::Type::REGULAR, "は"),
- EqualsToken(Token::Type::REGULAR, "毎日"),
- EqualsToken(Token::Type::REGULAR, "仕事"),
- EqualsToken(Token::Type::REGULAR, "に"),
- EqualsToken(Token::Type::REGULAR, "歩"),
- EqualsToken(Token::Type::REGULAR, "い"),
- EqualsToken(Token::Type::REGULAR, "てい"),
- EqualsToken(Token::Type::REGULAR, "ます"))));
+ EXPECT_THAT(plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "私"),
+ EqualsToken(Token::REGULAR, "は"),
+ EqualsToken(Token::REGULAR, "毎日"),
+ EqualsToken(Token::REGULAR, "仕事"),
+ EqualsToken(Token::REGULAR, "に"),
+ EqualsToken(Token::REGULAR, "歩"),
+ EqualsToken(Token::REGULAR, "い"),
+ EqualsToken(Token::REGULAR, "てい"),
+ EqualsToken(Token::REGULAR, "ます"))));
}
// Khmer
- EXPECT_THAT(
- plain_tokenizer->TokenizeAll("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "ញុំ"),
- EqualsToken(Token::Type::REGULAR, "ដើរទៅ"),
- EqualsToken(Token::Type::REGULAR, "ធ្វើការ"),
- EqualsToken(Token::Type::REGULAR, "រាល់ថ្ងៃ"))));
+ EXPECT_THAT(plain_tokenizer->TokenizeAll("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ញុំ"),
+ EqualsToken(Token::REGULAR, "ដើរទៅ"),
+ EqualsToken(Token::REGULAR, "ធ្វើការ"),
+ EqualsToken(Token::REGULAR, "រាល់ថ្ងៃ"))));
// Korean
- EXPECT_THAT(plain_tokenizer->TokenizeAll("나는 매일 출근합니다."),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::REGULAR, "나는"),
- EqualsToken(Token::Type::REGULAR, "매일"),
- EqualsToken(Token::Type::REGULAR, "출근합니다"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("나는 매일 출근합니다."),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "나는"),
+ EqualsToken(Token::REGULAR, "매일"),
+ EqualsToken(Token::REGULAR, "출근합니다"))));
// Thai
// DIFFERENCE!! Disagreement over how to segment "ทุกวัน" (iOS groups it as one token; ICU splits it into "ทุก" and "วัน").
@@ -274,24 +264,23 @@ TEST_F(PlainTokenizerTest, CJKT) {
std::vector<Token> tokens,
plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"));
- EXPECT_THAT(tokens, ElementsAre(EqualsToken(Token::Type::REGULAR, "ฉัน"),
- EqualsToken(Token::Type::REGULAR, "เดิน"),
- EqualsToken(Token::Type::REGULAR, "ไป"),
- EqualsToken(Token::Type::REGULAR, "ทำงาน"),
- EqualsToken(Token::Type::REGULAR, "ทุกวัน")));
+ EXPECT_THAT(tokens, ElementsAre(EqualsToken(Token::REGULAR, "ฉัน"),
+ EqualsToken(Token::REGULAR, "เดิน"),
+ EqualsToken(Token::REGULAR, "ไป"),
+ EqualsToken(Token::REGULAR, "ทำงาน"),
+ EqualsToken(Token::REGULAR, "ทุกวัน")));
} else {
- EXPECT_THAT(
- plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "ฉัน"),
- EqualsToken(Token::Type::REGULAR, "เดิน"),
- EqualsToken(Token::Type::REGULAR, "ไป"),
- EqualsToken(Token::Type::REGULAR, "ทำงาน"),
- EqualsToken(Token::Type::REGULAR, "ทุก"),
- EqualsToken(Token::Type::REGULAR, "วัน"))));
+ EXPECT_THAT(plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ฉัน"),
+ EqualsToken(Token::REGULAR, "เดิน"),
+ EqualsToken(Token::REGULAR, "ไป"),
+ EqualsToken(Token::REGULAR, "ทำงาน"),
+ EqualsToken(Token::REGULAR, "ทุก"),
+ EqualsToken(Token::REGULAR, "วัน"))));
}
}
-TEST_F(PlainTokenizerTest, ResetToTokenStartingAfterSimple) {
+TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) {
language_segmenter_factory::SegmenterOptions options(ULOC_US,
jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -305,13 +294,13 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfterSimple) {
constexpr std::string_view kText = "f b";
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
- EXPECT_TRUE(iterator->ResetToTokenStartingAfter(0));
- EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "b"));
+ EXPECT_TRUE(iterator->ResetToTokenAfter(0));
+ EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "b"));
- EXPECT_FALSE(iterator->ResetToTokenStartingAfter(2));
+ EXPECT_FALSE(iterator->ResetToTokenAfter(2));
}
-TEST_F(PlainTokenizerTest, ResetToTokenEndingBeforeSimple) {
+TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) {
language_segmenter_factory::SegmenterOptions options(ULOC_US,
jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -325,13 +314,13 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBeforeSimple) {
constexpr std::string_view kText = "f b";
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
- EXPECT_TRUE(iterator->ResetToTokenEndingBefore(2));
- EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "f"));
+ EXPECT_TRUE(iterator->ResetToTokenBefore(2));
+ EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "f"));
- EXPECT_FALSE(iterator->ResetToTokenEndingBefore(0));
+ EXPECT_FALSE(iterator->ResetToTokenBefore(0));
}
-TEST_F(PlainTokenizerTest, ResetToTokenStartingAfter) {
+TEST_F(PlainTokenizerTest, ResetToTokenAfter) {
language_segmenter_factory::SegmenterOptions options(ULOC_US,
jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -343,12 +332,11 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfter) {
language_segmenter.get()));
constexpr std::string_view kText = " foo . bar baz.. bat ";
- EXPECT_THAT(
- plain_tokenizer->TokenizeAll(kText),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"),
- EqualsToken(Token::Type::REGULAR, "bar"),
- EqualsToken(Token::Type::REGULAR, "baz"),
- EqualsToken(Token::Type::REGULAR, "bat"))));
+ EXPECT_THAT(plain_tokenizer->TokenizeAll(kText),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"),
+ EqualsToken(Token::REGULAR, "bar"),
+ EqualsToken(Token::REGULAR, "baz"),
+ EqualsToken(Token::REGULAR, "bat"))));
std::vector<std::string> expected_text = {
"foo", // 0: " foo . bar"
"bar", // 1: "foo . bar "
@@ -371,19 +359,19 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfter) {
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
EXPECT_TRUE(iterator->Advance());
- EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "foo"));
+ EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo"));
for (int i = 0; i < kText.length(); ++i) {
if (i < expected_text.size()) {
- EXPECT_TRUE(iterator->ResetToTokenStartingAfter(i));
+ EXPECT_TRUE(iterator->ResetToTokenAfter(i));
EXPECT_THAT(iterator->GetToken(),
- EqualsToken(Token::Type::REGULAR, expected_text[i]));
+ EqualsToken(Token::REGULAR, expected_text[i]));
} else {
- EXPECT_FALSE(iterator->ResetToTokenStartingAfter(i));
+ EXPECT_FALSE(iterator->ResetToTokenAfter(i));
}
}
}
-TEST_F(PlainTokenizerTest, ResetToTokenEndingBefore) {
+TEST_F(PlainTokenizerTest, ResetToTokenBefore) {
language_segmenter_factory::SegmenterOptions options(ULOC_US,
jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -395,12 +383,11 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBefore) {
language_segmenter.get()));
constexpr std::string_view kText = " foo . bar baz.. bat ";
- EXPECT_THAT(
- plain_tokenizer->TokenizeAll(kText),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"),
- EqualsToken(Token::Type::REGULAR, "bar"),
- EqualsToken(Token::Type::REGULAR, "baz"),
- EqualsToken(Token::Type::REGULAR, "bat"))));
+ EXPECT_THAT(plain_tokenizer->TokenizeAll(kText),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"),
+ EqualsToken(Token::REGULAR, "bar"),
+ EqualsToken(Token::REGULAR, "baz"),
+ EqualsToken(Token::REGULAR, "bat"))));
std::vector<std::string> expected_text = {
"bat", // 20: "baz.. bat "
"baz", // 19: " baz.. bat"
@@ -423,16 +410,15 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBefore) {
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
EXPECT_TRUE(iterator->Advance());
- EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "foo"));
+ EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo"));
for (int i = kText.length() - 1; i >= 0; --i) {
int expected_index = kText.length() - 1 - i;
if (expected_index < expected_text.size()) {
- EXPECT_TRUE(iterator->ResetToTokenEndingBefore(i));
- EXPECT_THAT(
- iterator->GetToken(),
- EqualsToken(Token::Type::REGULAR, expected_text[expected_index]));
+ EXPECT_TRUE(iterator->ResetToTokenBefore(i));
+ EXPECT_THAT(iterator->GetToken(),
+ EqualsToken(Token::REGULAR, expected_text[expected_index]));
} else {
- EXPECT_FALSE(iterator->ResetToTokenEndingBefore(i));
+ EXPECT_FALSE(iterator->ResetToTokenBefore(i));
}
}
}
diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc
index ff449a7..205d3a2 100644
--- a/icing/tokenization/raw-query-tokenizer.cc
+++ b/icing/tokenization/raw-query-tokenizer.cc
@@ -14,8 +14,9 @@
#include "icing/tokenization/raw-query-tokenizer.h"
+#include <stddef.h>
+
#include <cctype>
-#include <cstddef>
#include <memory>
#include <string>
#include <string_view>
@@ -102,7 +103,7 @@ enum State {
// When seeing right parentheses
CLOSING_PARENTHESES = 8,
- PROCESSING_NON_ASCII_ALPHANUMERIC_TERM = 9,
+ PROCESSING_NON_ASCII_ALPHABETIC_TERM = 9,
PROCESSING_PROPERTY_TERM_APPENDING = 10,
@@ -119,7 +120,7 @@ enum TermType {
// A term that consists of ASCII alphabetic and numeric characters
ASCII_ALPHANUMERIC_TERM = 1,
- NON_ASCII_ALPHANUMERIC_TERM = 2,
+ NON_ASCII_ALPHABETIC_TERM = 2,
// "("
LEFT_PARENTHESES = 3,
@@ -208,7 +209,7 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) {
// PROCESSING_OR = 6
// OPENING_PARENTHESES = 7
// CLOSING_PARENTHESES = 8
-// PROCESSING_NON_ASCII_ALPHANUMERIC_TERM = 9
+// PROCESSING_NON_ASCII_ALPHABETIC_TERM = 9
// PROCESSING_PROPERTY_TERM_APPENDING = 10
//
// Actions:
@@ -252,40 +253,40 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) {
// like "+", "&", "@", "#" in indexing and query tokenizers.
constexpr State state_transition_rules[STATE_COUNT][TYPE_COUNT] = {
/*State: Ready*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION,
PROCESSING_OR, READY, READY},
/*State: PROCESSING_ALPHANUMERIC_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID,
PROCESSING_PROPERTY_RESTRICT, READY},
/*State: PROCESSING_EXCLUSION*/
{READY, PROCESSING_EXCLUSION_TERM, PROCESSING_EXCLUSION_TERM, INVALID,
CLOSING_PARENTHESES, PROCESSING_EXCLUSION, INVALID, INVALID, READY},
/*State: PROCESSING_EXCLUSION_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, INVALID, READY},
/*State: PROCESSING_PROPERTY_RESTRICT*/
{READY, PROCESSING_PROPERTY_TERM, PROCESSING_PROPERTY_TERM, INVALID,
CLOSING_PARENTHESES, INVALID, INVALID, PROCESSING_PROPERTY_RESTRICT,
READY},
/*State: PROCESSING_PROPERTY_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID,
PROCESSING_PROPERTY_TERM_APPENDING, READY},
/*State: PROCESSING_OR*/
{READY, INVALID, INVALID, OPENING_PARENTHESES, CLOSING_PARENTHESES, INVALID,
INVALID, INVALID, READY},
/*State: OPENING_PARENTHESES*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION,
OPENING_PARENTHESES, READY, READY},
/*State: CLOSING_PARENTHESES*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION,
PROCESSING_OR, INVALID, READY},
- /*State: PROCESSING_NON_ASCII_ALPHANUMERIC_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
+ /*State: PROCESSING_NON_ASCII_ALPHABETIC_TERM*/
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, INVALID, READY},
/*State: PROCESSING_PROPERTY_TERM_APPENDING*/
{READY, PROCESSING_PROPERTY_TERM_APPENDING,
@@ -326,7 +327,7 @@ constexpr ActionOrError action_rules[STATE_COUNT][TYPE_COUNT] = {
/*State: CLOSING_PARENTHESES*/
{OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT,
ERROR_GROUP_AS_PROPERTY_NAME, OUTPUT},
- /*State: PROCESSING_NON_ASCII_ALPHANUMERIC_TERM*/
+ /*State: PROCESSING_NON_ASCII_ALPHABETIC_TERM*/
{OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT,
ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NON_ASCII_AS_PROPERTY_NAME, OUTPUT},
/*State: PROCESSING_PROPERTY_TERM_APPENDING*/
@@ -345,40 +346,6 @@ std::pair<TermType, std::string_view> GetWhitespaceTerm(std::string_view text,
return std::make_pair(WHITESPACE, text.substr(pos, cur - pos));
}
-TermType GetContentTermType(std::string_view text, size_t pos) {
- if (i18n_utils::IsPunctuationAt(text, pos)) {
- return OTHER;
- } else if (i18n_utils::IsAscii(text[pos])) {
- return ASCII_ALPHANUMERIC_TERM;
- }
- return NON_ASCII_ALPHANUMERIC_TERM;
-}
-
-bool IsContentTermType(TermType term_type) {
- switch (term_type) {
- case ASCII_ALPHANUMERIC_TERM:
- [[fallthrough]];
- case NON_ASCII_ALPHANUMERIC_TERM:
- [[fallthrough]];
- case OTHER:
- return true;
- case WHITESPACE:
- [[fallthrough]];
- case LEFT_PARENTHESES:
- [[fallthrough]];
- case RIGHT_PARENTHESES:
- [[fallthrough]];
- case EXCLUSION_OPERATOR:
- [[fallthrough]];
- case OR_OPERATOR:
- [[fallthrough]];
- case COLON:
- [[fallthrough]];
- case TYPE_COUNT:
- return false;
- }
-}
-
// Determines the length of the potential content term beginning at text[pos]
// and returns a pair with the appropriate TermType and a string_view of the
// content term.
@@ -391,7 +358,12 @@ std::pair<TermType, std::string_view> GetContentTerm(std::string_view text,
size_t pos) {
size_t len = 0;
// Checks the first char to see if it's an ASCII term
- TermType type = GetContentTermType(text, pos);
+ TermType type = ASCII_ALPHANUMERIC_TERM;
+ if (!i18n_utils::IsAscii(text[pos])) {
+ type = NON_ASCII_ALPHABETIC_TERM;
+ } else if (!std::isalnum(text[pos])) {
+ type = OTHER;
+ }
for (size_t cur = pos; cur < text.length() && len == 0; ++cur) {
switch (text[cur]) {
case kLeftParentheses:
@@ -451,7 +423,7 @@ std::pair<TermType, std::string_view> GetTerm(std::string_view text,
// and [(cat OR)]. This helps assert extra rule 3: "OR" is ignored if there's no
// valid token on its right.
void RemoveLastTokenIfOrOperator(std::vector<Token>* tokens) {
- if (!tokens->empty() && tokens->back().type == Token::Type::QUERY_OR) {
+ if (!tokens->empty() && tokens->back().type == Token::QUERY_OR) {
tokens->pop_back();
}
}
@@ -465,11 +437,11 @@ libtextclassifier3::Status OutputOrOperatorToken(std::vector<Token>* tokens) {
}
Token::Type last_token_type = tokens->back().type;
switch (last_token_type) {
- case Token::Type::REGULAR:
- case Token::Type::QUERY_RIGHT_PARENTHESES:
- tokens->emplace_back(Token::Type::QUERY_OR);
+ case Token::REGULAR:
+ case Token::QUERY_RIGHT_PARENTHESES:
+ tokens->emplace_back(Token::QUERY_OR);
break;
- case Token::Type::QUERY_OR:
+ case Token::QUERY_OR:
// Ignores "OR" because there's already an "OR", e.g. "term1 OR OR term2"
break;
default:
@@ -499,7 +471,7 @@ libtextclassifier3::Status OutputToken(State new_state,
switch (current_term_type) {
case ASCII_ALPHANUMERIC_TERM:
[[fallthrough]];
- case NON_ASCII_ALPHANUMERIC_TERM:
+ case NON_ASCII_ALPHABETIC_TERM:
if (new_state == PROCESSING_PROPERTY_TERM) {
// Asserts extra rule 1: each property name in the property path is a
// valid term.
@@ -510,21 +482,21 @@ libtextclassifier3::Status OutputToken(State new_state,
GetErrorMessage(ERROR_NON_ASCII_AS_PROPERTY_NAME));
}
}
- tokens->emplace_back(Token::Type::QUERY_PROPERTY, current_term);
+ tokens->emplace_back(Token::QUERY_PROPERTY, current_term);
} else {
- tokens->emplace_back(Token::Type::REGULAR, current_term);
+ tokens->emplace_back(Token::REGULAR, current_term);
}
break;
case LEFT_PARENTHESES:
- tokens->emplace_back(Token::Type::QUERY_LEFT_PARENTHESES);
+ tokens->emplace_back(Token::QUERY_LEFT_PARENTHESES);
break;
case RIGHT_PARENTHESES:
// Ignores "OR" if it's followed by right parentheses.
RemoveLastTokenIfOrOperator(tokens);
- tokens->emplace_back(Token::Type::QUERY_RIGHT_PARENTHESES);
+ tokens->emplace_back(Token::QUERY_RIGHT_PARENTHESES);
break;
case EXCLUSION_OPERATOR:
- tokens->emplace_back(Token::Type::QUERY_EXCLUSION);
+ tokens->emplace_back(Token::QUERY_EXCLUSION);
break;
case OR_OPERATOR:
return OutputOrOperatorToken(tokens);
@@ -569,8 +541,10 @@ libtextclassifier3::Status ProcessTerm(
ICING_ASSIGN_OR_RETURN(std::vector<std::string_view> content_terms,
language_segmenter->GetAllTerms(*current_term));
for (std::string_view term : content_terms) {
- TermType type = GetContentTermType(term, 0);
- if (type == OTHER) {
+ TermType type = ASCII_ALPHANUMERIC_TERM;
+ if (!i18n_utils::IsAscii(term[0])) {
+ type = NON_ASCII_ALPHABETIC_TERM;
+ } else if (!std::isalnum(term[0])) {
// Skip OTHER tokens here.
continue;
}
@@ -616,7 +590,9 @@ libtextclassifier3::StatusOr<std::vector<Token>> ProcessTerms(
for (int i = 0; i < prescanned_terms.size(); ++i) {
const std::pair<TermType, std::string_view>& prescanned_term =
prescanned_terms.at(i);
- if (!IsContentTermType(prescanned_term.first)) {
+ if (prescanned_term.first != ASCII_ALPHANUMERIC_TERM &&
+ prescanned_term.first != NON_ASCII_ALPHABETIC_TERM &&
+ prescanned_term.first != OTHER) {
// This can't be a property restrict. Just pass it in.
ICING_RETURN_IF_ERROR(
ProcessTerm(&current_state, &current_term, &current_term_type,
@@ -628,15 +604,18 @@ libtextclassifier3::StatusOr<std::vector<Token>> ProcessTerms(
std::vector<std::string_view> content_terms,
language_segmenter->GetAllTerms(prescanned_term.second));
for (std::string_view term : content_terms) {
- TermType type = GetContentTermType(term, 0);
+ TermType type = ASCII_ALPHANUMERIC_TERM;
if (term == kOrOperator) {
// TODO(tjbarron) Decide whether we should revise this and other
// handled syntax. This is used to allow queries like "term1,OR,term2"
// to succeed. It's not clear if we should allow this or require
// clients to ensure that OR operators are always surrounded by
// whitespace.
- // Override the type if this is actually an OR operator.
type = OR_OPERATOR;
+ } else if (!i18n_utils::IsAscii(term[0])) {
+ type = NON_ASCII_ALPHABETIC_TERM;
+ } else if (!std::isalnum(term[0])) {
+ type = OTHER;
}
ICING_RETURN_IF_ERROR(ProcessTerm(&current_state, &current_term,
&current_term_type,
@@ -670,7 +649,7 @@ class RawQueryTokenIterator : public Tokenizer::Iterator {
Token GetToken() const override {
if (current_ < 0 || current_ >= tokens_.size()) {
- return Token(Token::Type::INVALID);
+ return Token(Token::INVALID);
}
return tokens_.at(current_);
}
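For orientation: this revert inlines the same first-character classification at
three call sites (GetContentTerm(), ProcessTerm(), ProcessTerms()). A hedged
sketch of that logic as a single helper, reusing this file's enum and
i18n_utils (the helper name is illustrative):

    // Non-ASCII first byte -> NON_ASCII_ALPHABETIC_TERM; ASCII punctuation
    // and symbols -> OTHER (skipped when tokenizing query content); anything
    // else is an ASCII alphanumeric term.
    TermType ClassifyByFirstChar(std::string_view term) {
      if (!i18n_utils::IsAscii(term[0])) {
        return NON_ASCII_ALPHABETIC_TERM;
      }
      if (!std::isalnum(static_cast<unsigned char>(term[0]))) {
        return OTHER;  // e.g. "$", "+", "."
      }
      return ASCII_ALPHANUMERIC_TERM;  // e.g. "term1", "50"
    }

Note the negation on the std::isalnum() check; without it, ordinary
alphanumeric terms would be classified as OTHER and dropped from queries.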
diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc
index b1dcc73..500efa0 100644
--- a/icing/tokenization/raw-query-tokenizer_test.cc
+++ b/icing/tokenization/raw-query-tokenizer_test.cc
@@ -16,9 +16,9 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/platform.h"
#include "icing/testing/common-matchers.h"
-#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/tokenizer-factory.h"
@@ -59,38 +59,13 @@ TEST_F(RawQueryTokenizerTest, Simple) {
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("Hello World!"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
- EqualsToken(Token::Type::REGULAR, "World"))));
-
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("hElLo WORLD"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "hElLo"),
- EqualsToken(Token::Type::REGULAR, "WORLD"))));
-}
-
-TEST_F(RawQueryTokenizerTest, Emoji) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> raw_query_tokenizer,
- tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
- language_segmenter.get()));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("Hello World!"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
+ EqualsToken(Token::REGULAR, "World"))));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("😊 Hello! Goodbye?"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "😊"),
- EqualsToken(Token::Type::REGULAR, "Hello"),
- EqualsToken(Token::Type::REGULAR, "Goodbye"))));
-
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("Hello😊 ! Goodbye?"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
- EqualsToken(Token::Type::REGULAR, "😊"),
- EqualsToken(Token::Type::REGULAR, "Goodbye"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("hElLo WORLD"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "hElLo"),
+ EqualsToken(Token::REGULAR, "WORLD"))));
}
TEST_F(RawQueryTokenizerTest, Parentheses) {
@@ -103,96 +78,84 @@ TEST_F(RawQueryTokenizerTest, Parentheses) {
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
- raw_query_tokenizer->TokenizeAll("()"));
- EXPECT_THAT(
- query_tokens,
- ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("()"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
- ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
- raw_query_tokenizer->TokenizeAll("( )"));
- EXPECT_THAT(
- query_tokens,
- ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( )"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
- ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
- raw_query_tokenizer->TokenizeAll("(term1 term2)"));
- EXPECT_THAT(
- query_tokens,
- ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::REGULAR, "term2"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 term2)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::REGULAR, "term2"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1 term2) (term3 term4))"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::REGULAR, "term2"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term3"),
+ EqualsToken(Token::REGULAR, "term4"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1(term2)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term2"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
- ICING_ASSERT_OK_AND_ASSIGN(
- query_tokens,
- raw_query_tokenizer->TokenizeAll("((term1 term2) (term3 term4))"));
- EXPECT_THAT(
- query_tokens,
- ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::REGULAR, "term2"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term3"),
- EqualsToken(Token::Type::REGULAR, "term4"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
-
- ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
- raw_query_tokenizer->TokenizeAll("term1(term2)"));
EXPECT_THAT(
- query_tokens,
- ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term2"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
-
- ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
- raw_query_tokenizer->TokenizeAll("(term1)term2"));
- EXPECT_THAT(query_tokens,
- ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term2")));
+ raw_query_tokenizer->TokenizeAll("(term1)term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term2"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)(term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term2"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
-
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)-term2"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
- EqualsToken(Token::Type::REGULAR, "term2"))));
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term2"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)OR term2"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_OR, ""),
- EqualsToken(Token::Type::REGULAR, "term2"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("(term1)-term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::REGULAR, "term2"))));
+
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("(term1)OR term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::QUERY_OR, ""),
+ EqualsToken(Token::REGULAR, "term2"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)OR(term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_OR, ""),
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term2"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::QUERY_OR, ""),
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term2"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1):term2"),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
@@ -217,49 +180,44 @@ TEST_F(RawQueryTokenizerTest, Exclustion) {
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("-term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
- EqualsToken(Token::Type::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::REGULAR, "term1"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(-term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
// Exclusion operator is ignored
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("- term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("- term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
// Exclusion operator is ignored
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("term1- term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::REGULAR, "term2"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1- term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::REGULAR, "term2"))));
// Exclusion operator is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 -)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
// First exclusion operator is ignored
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("--term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
- EqualsToken(Token::Type::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("--term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::REGULAR, "term1"))));
// First "-" is exclusion operator, second is not and will be discarded.
// In other words, exclusion only applies to the term right after it.
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("-term1-term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::REGULAR, "term2"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1-term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::REGULAR, "term2"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-(term1)"),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
@@ -291,75 +249,73 @@ TEST_F(RawQueryTokenizerTest, PropertyRestriction) {
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::Type::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("property1:term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::REGULAR, "term1"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(property1:term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
// Colon is ignored
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll(":term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll(":term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
// Colon is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(:term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
// Colon is ignored
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("term1:"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1:"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
// property name can be a path
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("email.title:hello"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_PROPERTY, "email.title"),
- EqualsToken(Token::Type::REGULAR, "hello"))));
+ IsOkAndHolds(
+ ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "email.title"),
+ EqualsToken(Token::REGULAR, "hello"))));
// The first colon ":" triggers property restriction, the second colon is used
// as a word connector per ICU's rule
// (https://unicode.org/reports/tr29/#Word_Boundaries).
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property:foo:bar"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_PROPERTY, "property"),
- EqualsToken(Token::Type::REGULAR, "foo:bar"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("property:foo:bar"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property"),
+ EqualsToken(Token::REGULAR, "foo:bar"))));
// Property restriction only applies to the term right after it.
// Note: "term1:term2" is not a term but 2 terms because word connectors
// don't apply to numbers and alphabets.
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1:term2"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::REGULAR, "term2"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("property1:term1:term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::REGULAR, "term2"))));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:今天:天气"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::Type::REGULAR, "今天"),
- EqualsToken(Token::Type::REGULAR, "天气"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("property1:今天:天气"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::REGULAR, "今天"),
+ EqualsToken(Token::REGULAR, "天气"))));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1-"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::Type::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("property1:term1-"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::REGULAR, "term1"))));
// Multiple continuous colons will still be recognized as a property
// restriction operator
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1::term1"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::Type::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("property1::term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::REGULAR, "term1"))));
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("property1:(term1)"),
@@ -389,109 +345,105 @@ TEST_F(RawQueryTokenizerTest, OR) {
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("term1 OR term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_OR, ""),
- EqualsToken(Token::Type::REGULAR, "term2"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_OR, ""),
+ EqualsToken(Token::REGULAR, "term2"))));
// Two continuous "OR"s are treated as one
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("term1 OR OR term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_OR, ""),
- EqualsToken(Token::Type::REGULAR, "term2"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR OR term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_OR, ""),
+ EqualsToken(Token::REGULAR, "term2"))));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1) OR term2"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_OR, ""),
- EqualsToken(Token::Type::REGULAR, "term2"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("(term1) OR term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::QUERY_OR, ""),
+ EqualsToken(Token::REGULAR, "term2"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR (term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_OR, ""),
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term2"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_OR, ""),
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term2"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1) OR (term2))"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_OR, ""),
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term2"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::QUERY_OR, ""),
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term2"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
// Only "OR" (all in uppercase) is the operator
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("term1 or term2 Or term3 oR term4"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::REGULAR, "or"),
- EqualsToken(Token::Type::REGULAR, "term2"),
- EqualsToken(Token::Type::REGULAR, "Or"),
- EqualsToken(Token::Type::REGULAR, "term3"),
- EqualsToken(Token::Type::REGULAR, "oR"),
- EqualsToken(Token::Type::REGULAR, "term4"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::REGULAR, "or"),
+ EqualsToken(Token::REGULAR, "term2"),
+ EqualsToken(Token::REGULAR, "Or"),
+ EqualsToken(Token::REGULAR, "term3"),
+ EqualsToken(Token::REGULAR, "oR"),
+ EqualsToken(Token::REGULAR, "term4"))));
// "OR" is ignored
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("OR term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("OR term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
// "OR" is ignored
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("term1 OR"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(OR term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR )"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR )"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR(term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_OR, ""),
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term2"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_OR, ""),
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term2"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("term1 OR-term2"),
@@ -520,31 +472,31 @@ TEST_F(RawQueryTokenizerTest, CJKT) {
if (IsCfStringTokenization()) {
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("-今天天气很好"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
- EqualsToken(Token::Type::REGULAR, "今天"),
- EqualsToken(Token::Type::REGULAR, "天气"),
- EqualsToken(Token::Type::REGULAR, "很"),
- EqualsToken(Token::Type::REGULAR, "好"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::REGULAR, "今天"),
+ EqualsToken(Token::REGULAR, "天气"),
+ EqualsToken(Token::REGULAR, "很"),
+ EqualsToken(Token::REGULAR, "好"))));
} else {
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("-今天天气很好"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
- EqualsToken(Token::Type::REGULAR, "今天"),
- EqualsToken(Token::Type::REGULAR, "天气"),
- EqualsToken(Token::Type::REGULAR, "很好"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::REGULAR, "今天"),
+ EqualsToken(Token::REGULAR, "天气"),
+ EqualsToken(Token::REGULAR, "很好"))));
}
if (IsCfStringTokenization()) {
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:你好"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::Type::REGULAR, "你"),
- EqualsToken(Token::Type::REGULAR, "好"))));
+ IsOkAndHolds(
+ ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::REGULAR, "你"),
+ EqualsToken(Token::REGULAR, "好"))));
} else {
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:你好"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::Type::REGULAR, "你好"))));
+ IsOkAndHolds(
+ ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::REGULAR, "你好"))));
}
EXPECT_THAT(
@@ -552,11 +504,10 @@ TEST_F(RawQueryTokenizerTest, CJKT) {
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
HasSubstr("Characters in property name must all be ASCII")));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("cat OR ねこ"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "cat"),
- EqualsToken(Token::Type::QUERY_OR, ""),
- EqualsToken(Token::Type::REGULAR, "ねこ"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("cat OR ねこ"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "cat"),
+ EqualsToken(Token::QUERY_OR, ""),
+ EqualsToken(Token::REGULAR, "ねこ"))));
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("cat ORねこ"),
@@ -592,45 +543,40 @@ TEST_F(RawQueryTokenizerTest, OtherChars) {
language_segmenter.get()));
// Comma is ignored
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll(",term1, ,"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll(",term1, ,"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(,term1),"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
// Exclusion operator and comma are ignored
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("-,term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-,term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("-term1,"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
- EqualsToken(Token::Type::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1,"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::REGULAR, "term1"))));
// Colon and comma are ignored
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1:,term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "property1"),
- EqualsToken(Token::Type::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:,term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "property1"),
+ EqualsToken(Token::REGULAR, "term1"))));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1,term2"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::REGULAR, "term2"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("property1:term1,term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::REGULAR, "term2"))));
  // This is a special case for OR: unknown chars are treated the same as
// whitespaces before and after OR.
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("term1,OR,term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_OR, ""),
- EqualsToken(Token::Type::REGULAR, "term2"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1,OR,term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
+ EqualsToken(Token::QUERY_OR, ""),
+ EqualsToken(Token::REGULAR, "term2"))));
}
TEST_F(RawQueryTokenizerTest, Mix) {
@@ -647,38 +593,37 @@ TEST_F(RawQueryTokenizerTest, Mix) {
EXPECT_THAT(raw_query_tokenizer->TokenizeAll(
"こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::REGULAR, "こんにちは"),
- EqualsToken(Token::Type::REGULAR, "good"),
- EqualsToken(Token::Type::REGULAR, "afternoon"),
- EqualsToken(Token::Type::QUERY_PROPERTY, "title"),
- EqualsToken(Token::Type::REGULAR, "今天"),
- EqualsToken(Token::Type::QUERY_OR, ""),
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "ใน"),
- EqualsToken(Token::Type::REGULAR, "วันนี้"),
- EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
- EqualsToken(Token::Type::REGULAR, "B12"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::REGULAR, "こんにちは"),
+ EqualsToken(Token::REGULAR, "good"),
+ EqualsToken(Token::REGULAR, "afternoon"),
+ EqualsToken(Token::QUERY_PROPERTY, "title"),
+ EqualsToken(Token::REGULAR, "今天"),
+ EqualsToken(Token::QUERY_OR, ""),
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "ใน"),
+ EqualsToken(Token::REGULAR, "วันนี้"),
+ EqualsToken(Token::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::REGULAR, "B12"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
} else {
ICING_ASSERT_OK_AND_ASSIGN(
std::vector<Token> tokens,
raw_query_tokenizer->TokenizeAll(
"こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)"));
- EXPECT_THAT(
- tokens,
- ElementsAre(EqualsToken(Token::Type::REGULAR, "こんにちは"),
- EqualsToken(Token::Type::REGULAR, "good"),
- EqualsToken(Token::Type::REGULAR, "afternoon"),
- EqualsToken(Token::Type::QUERY_PROPERTY, "title"),
- EqualsToken(Token::Type::REGULAR, "今天"),
- EqualsToken(Token::Type::QUERY_OR, ""),
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "ใน"),
- EqualsToken(Token::Type::REGULAR, "วัน"),
- EqualsToken(Token::Type::REGULAR, "นี้"),
- EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
- EqualsToken(Token::Type::REGULAR, "B12"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsToken(Token::REGULAR, "こんにちは"),
+ EqualsToken(Token::REGULAR, "good"),
+ EqualsToken(Token::REGULAR, "afternoon"),
+ EqualsToken(Token::QUERY_PROPERTY, "title"),
+ EqualsToken(Token::REGULAR, "今天"),
+ EqualsToken(Token::QUERY_OR, ""),
+ EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::REGULAR, "ใน"),
+ EqualsToken(Token::REGULAR, "วัน"),
+ EqualsToken(Token::REGULAR, "นี้"),
+ EqualsToken(Token::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::REGULAR, "B12"),
+ EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")));
}
}
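
Editor's note: the tests above all drive the tokenizer through the same three steps: build a language segmenter, wrap it in a raw-query tokenizer, and call TokenizeAll(). A minimal sketch of that flow outside the test fixture follows. It assumes Token exposes `type` and `text` members (implied by the EqualsToken matcher but not shown in this diff) and that the usual icing status macros are available.

#include <iostream>
#include <memory>
#include <string_view>
#include <vector>

#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/tokenizer-factory.h"
#include "unicode/uloc.h"

libtextclassifier3::Status PrintQueryTokens(std::string_view query) {
  language_segmenter_factory::SegmenterOptions options(ULOC_US);
  ICING_ASSIGN_OR_RETURN(
      auto segmenter, language_segmenter_factory::Create(std::move(options)));
  ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer> tokenizer,
                         tokenizer_factory::CreateQueryTokenizer(
                             tokenizer_factory::RAW_QUERY, segmenter.get()));
  ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens,
                         tokenizer->TokenizeAll(query));
  for (const Token& token : tokens) {
    // "property1:term1" would print QUERY_PROPERTY/property1, REGULAR/term1.
    std::cout << token.type << " / " << token.text << "\n";
  }
  return libtextclassifier3::Status::OK;
}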
diff --git a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc
index 8e1e563..6b1cb3a 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc
@@ -15,10 +15,10 @@
#include "icing/tokenization/reverse_jni/reverse-jni-break-iterator.h"
#include <jni.h>
+#include <math.h>
#include <cassert>
#include <cctype>
-#include <cmath>
#include <map>
#include "icing/jni/jni-cache.h"
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
index e5de6e6..76219b5 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
@@ -51,9 +51,9 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
if (term_end_exclusive_.utf16_index() == 0) {
int first = break_iterator_->First();
if (!term_start_.MoveToUtf16(first)) {
- // First is guaranteed to succeed and return a position within bounds.
- // So the only possible failure could be an invalid sequence. Mark as
- // DONE and return.
+ // First is guaranteed to succeed and return a position within bounds. So
+ // the only possible failure could be an invalid sequence. Mark as DONE
+ // and return.
MarkAsDone();
return false;
}
@@ -74,7 +74,14 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
MarkAsDone();
return false;
}
- return true;
+
+ // Check if the current term is valid. We consider any term valid if its
+ // first character is valid. If it's not valid, then we need to advance to
+ // the next term.
+ if (IsValidTerm()) {
+ return true;
+ }
+ return Advance();
}
// Returns the current term. It can be called only when Advance() returns
@@ -237,7 +244,7 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// 4. The start and end indices point to a segment, but we need to ensure
// that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
// need a segment prior to this one.
- if (term_end_exclusive_.utf32_index() > offset) {
+ if (term_end_exclusive_.utf32_index() > offset || !IsValidTerm()) {
return ResetToTermEndingBeforeUtf32(term_start_.utf32_index());
}
return term_start_.utf32_index();
@@ -277,6 +284,21 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
return term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone;
}
+ bool IsValidTerm() const {
+ // Rule 1: all ASCII terms will be returned.
+ // We know it's an ASCII term by checking the first char.
+ if (i18n_utils::IsAscii(text_[term_start_.utf8_index()])) {
+ return true;
+ }
+
+ // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
+ // We know it's an alphabetic term by checking the first unicode character.
+ if (i18n_utils::IsAlphabeticAt(text_, term_start_.utf8_index())) {
+ return true;
+ }
+ return false;
+ }
+
// All of ReverseJniBreakIterator's functions return UTF-16 boundaries. So
// this class needs to maintain state to convert between UTF-16 and UTF-8.
std::unique_ptr<ReverseJniBreakIterator> break_iterator_;
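
Editor's note: with IsValidTerm() in place, Advance() now silently skips segments whose first character is neither ASCII nor alphabetic, which is what the segmenter-test changes further down (dropped "。", "？", "×" and similar terms) rely on. A standalone restatement of the two rules, under the assumption that the term text is passed in directly rather than read through term_start_:

#include <string_view>

#include "icing/util/i18n-utils.h"

// Mirrors ReverseJniLanguageSegmenterIterator::IsValidTerm() above, but takes
// the term itself instead of reading it out of the enclosing iterator state.
bool IsValidTermSketch(std::string_view term) {
  if (term.empty()) return false;
  // Rule 1: any ASCII term is returned, e.g. "bar" or "3".
  if (i18n_utils::IsAscii(term[0])) return true;
  // Rule 2: a non-ASCII term is returned only if it is alphabetic, so "你好"
  // survives while a lone "。" makes Advance() move on to the next segment.
  return i18n_utils::IsAlphabeticAt(term, /*position=*/0);
}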
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
index 277ece6..b1a8f72 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
@@ -185,7 +185,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, Non_ASCII_Non_Alphabetic) {
// Full-width (non-ASCII) punctuation marks and special characters are left
// out.
EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"),
- IsOkAndHolds(ElementsAre("。", "?", "·", "Hello", "!", "×")));
+ IsOkAndHolds(ElementsAre("Hello")));
}
TEST_P(ReverseJniLanguageSegmenterTest, Acronym) {
@@ -246,9 +246,9 @@ TEST_P(ReverseJniLanguageSegmenterTest, WordConnector) {
// Connectors don't connect if one side is an invalid term (?)
EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz:?"),
- IsOkAndHolds(ElementsAre("bar:baz", ":", "?")));
+ IsOkAndHolds(ElementsAre("bar:baz", ":")));
EXPECT_THAT(language_segmenter->GetAllTerms("?:bar:baz"),
- IsOkAndHolds(ElementsAre("?", ":", "bar:baz")));
+ IsOkAndHolds(ElementsAre(":", "bar:baz")));
EXPECT_THAT(language_segmenter->GetAllTerms("3:14"),
IsOkAndHolds(ElementsAre("3", ":", "14")));
EXPECT_THAT(language_segmenter->GetAllTerms("私:は"),
@@ -366,17 +366,6 @@ TEST_P(ReverseJniLanguageSegmenterTest, Number) {
IsOkAndHolds(ElementsAre("-", "123")));
}
-TEST_P(ReverseJniLanguageSegmenterTest, FullWidthNumbers) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(
- GetSegmenterOptions(GetLocale(), jni_cache_.get())));
-
- EXPECT_THAT(language_segmenter->GetAllTerms("0123456789"),
- IsOkAndHolds(ElementsAre("0", "1", "2", "3", "4", "5", "6",
- "7", "8", "9")));
-}
-
TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespaces) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
@@ -413,17 +402,15 @@ TEST_P(ReverseJniLanguageSegmenterTest, CJKT) {
// have whitespaces as word delimiter.
// Chinese
- EXPECT_THAT(
- language_segmenter->GetAllTerms("我每天走路去上班。"),
- IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班", "。")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"),
+ IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班")));
// Japanese
EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"),
IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩",
- "い", "てい", "ます", "。")));
+ "い", "てい", "ます")));
// Khmer
EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
- IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ", "។")));
-
+ IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ")));
// Thai
EXPECT_THAT(
language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"),
@@ -854,19 +841,16 @@ TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kChinese));
// String: "我每天走路去上班。"
- // ^ ^ ^ ^^ ^
- // UTF-8 idx: 0 3 9 15 18 24
- // UTF-32 idx: 0 1 3 5 6 8
+ // ^ ^ ^ ^^
+ // UTF-8 idx: 0 3 9 15 18
+ // UTF-32 idx: 0 1 3 5 6
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("每天"));
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("走路"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
- EXPECT_THAT(itr->GetTerm(), Eq("。"));
-
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(8),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
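
Editor's note: the UTF-8/UTF-32 index rows above line up because every CJK character in this string occupies three UTF-8 bytes but exactly one UTF-32 code point: "每天" starts at byte 3 yet code point 1, and the trailing "。" (byte 24, code point 8) is no longer a reachable term after the validity check. A small sketch with CharacterIterator, whose constructor and index accessors are assumed from their other uses in this change:

#include "icing/util/character-iterator.h"

void IndexDemo() {
  CharacterIterator itr("我每天走路去上班。");
  itr.AdvanceToUtf32(1);  // move to the second code point, "每"
  // itr.utf8_index() == 3 and itr.utf32_index() == 1, matching the table.
}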
@@ -881,21 +865,18 @@ TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kJapanese));
// String: "私は毎日仕事に歩いています。"
- // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
- // UTF-8 idx: 0 3 6 12 18 21 24 27 33 39
- // UTF-32 idx: 0 1 2 4 6 7 8 9 11 13
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 6 12 18 21 24 27 33
+ // UTF-32 idx: 0 1 2 4 6 7 8 9 11
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("は"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(13),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(11),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4)));
EXPECT_THAT(itr->GetTerm(), Eq("仕事"));
-
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13)));
- EXPECT_THAT(itr->GetTerm(), Eq("。"));
}
TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfterUtf32) {
@@ -907,16 +888,13 @@ TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kKhmer));
// String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
- // ^ ^ ^ ^ ^
- // UTF-8 idx: 0 9 24 45 69
- // UTF-32 idx: 0 3 8 15 23
+ // ^ ^ ^ ^
+ // UTF-8 idx: 0 9 24 45
+ // UTF-32 idx: 0 3 8 15
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), IsOkAndHolds(Eq(23)));
- EXPECT_THAT(itr->GetTerm(), Eq("។"));
-
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(23),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
diff --git a/icing/tokenization/token.h b/icing/tokenization/token.h
index 0c268be..dda9efc 100644
--- a/icing/tokenization/token.h
+++ b/icing/tokenization/token.h
@@ -21,14 +21,11 @@ namespace icing {
namespace lib {
struct Token {
- enum class Type {
+ enum Type {
// Common types
REGULAR, // A token without special meanings, the value of it will be
// indexed or searched directly
- VERBATIM, // A token that should be indexed and searched without any
- // modifications to the raw text
-
// Types only used in raw query
QUERY_OR, // Indicates OR logic between its left and right tokens
QUERY_EXCLUSION, // Indicates exclusion operation on next token
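
Editor's note: dropping `enum class` here is what lets every call site above shorten `Token::Type::REGULAR` to `Token::REGULAR`: the enumerators of an unscoped nested enum are injected into Token's own scope. Both spellings remain legal, as this sketch (using the two-argument Token constructor seen in the deleted verbatim tokenizer) shows:

Token t1(Token::REGULAR, "hello");        // new, shorter spelling
Token t2(Token::Type::REGULAR, "hello");  // still valid: C++ allows qualifying
                                          // through the enum name itself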
diff --git a/icing/tokenization/tokenizer-factory.cc b/icing/tokenization/tokenizer-factory.cc
index b2508f7..9b59acf 100644
--- a/icing/tokenization/tokenizer-factory.cc
+++ b/icing/tokenization/tokenizer-factory.cc
@@ -23,7 +23,6 @@
#include "icing/tokenization/plain-tokenizer.h"
#include "icing/tokenization/raw-query-tokenizer.h"
#include "icing/tokenization/tokenizer.h"
-#include "icing/tokenization/verbatim-tokenizer.h"
#include "icing/util/status-macros.h"
namespace icing {
@@ -39,8 +38,6 @@ CreateIndexingTokenizer(StringIndexingConfig::TokenizerType::Code type,
switch (type) {
case StringIndexingConfig::TokenizerType::PLAIN:
return std::make_unique<PlainTokenizer>(lang_segmenter);
- case StringIndexingConfig::TokenizerType::VERBATIM:
- return std::make_unique<VerbatimTokenizer>();
case StringIndexingConfig::TokenizerType::NONE:
[[fallthrough]];
default:
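
Editor's note: with the VERBATIM case deleted, a request for that tokenizer type now falls through to the default branch, whose body is cut off by this hunk (presumably the existing invalid-type error path). Callers in this change only construct PLAIN indexing tokenizers, along these lines:

ICING_ASSERT_OK_AND_ASSIGN(
    std::unique_ptr<Tokenizer> tokenizer,
    tokenizer_factory::CreateIndexingTokenizer(
        StringIndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));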
diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h
index 24f8269..b4f0c6e 100644
--- a/icing/tokenization/tokenizer.h
+++ b/icing/tokenization/tokenizer.h
@@ -40,6 +40,14 @@ class Tokenizer {
public:
virtual ~Tokenizer() = default;
+ enum Type {
+ // Index tokenizers
+ PLAIN, // Used to tokenize plain text input
+
+ // Query tokenizers
+ RAW_QUERY, // Used to tokenize raw queries
+ };
+
// An iterator helping to get tokens.
// Example usage:
//
@@ -75,26 +83,22 @@ class Tokenizer {
// offset.
// Ex.
// auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
- // iterator.ResetToTokenStartingAfter(4);
+ // iterator.ResetToTokenAfter(4);
// // The first full token starting after position 4 (the 'b' in "bar") is
// // "baz".
// PrintToken(iterator.GetToken()); // prints "baz"
- virtual bool ResetToTokenStartingAfter(int32_t utf32_offset) {
- return false;
- }
+ virtual bool ResetToTokenAfter(int32_t offset) { return false; }
// Sets the tokenizer to point at the first token that *ends* *before*
// offset. Returns false if there are no valid tokens ending
// before offset.
// Ex.
// auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
- // iterator.ResetToTokenEndingBefore(4);
+ // iterator.ResetToTokenBefore(4);
// // The first full token ending before position 4 (the 'b' in "bar") is
// // "foo".
// PrintToken(iterator.GetToken()); // prints "foo"
- virtual bool ResetToTokenEndingBefore(int32_t utf32_offset) {
- return false;
- }
+ virtual bool ResetToTokenBefore(int32_t offset) { return false; }
virtual bool ResetToStart() { return false; }
};
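
Editor's note: putting the renamed hooks together, usage follows the doc comments above (ValueOrDie and PrintToken are taken from those comments; production code would check the StatusOr first):

auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
if (iterator->ResetToTokenAfter(4)) {
  PrintToken(iterator->GetToken());  // prints "baz"
}
if (iterator->ResetToTokenBefore(4)) {
  PrintToken(iterator->GetToken());  // prints "foo"
}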
diff --git a/icing/tokenization/verbatim-tokenizer.cc b/icing/tokenization/verbatim-tokenizer.cc
deleted file mode 100644
index 0d3a320..0000000
--- a/icing/tokenization/verbatim-tokenizer.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-// Copyright (C) 2021 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/tokenization/verbatim-tokenizer.h"
-
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/util/character-iterator.h"
-#include "icing/util/status-macros.h"
-
-namespace icing {
-namespace lib {
-
-class VerbatimTokenIterator : public Tokenizer::Iterator {
- public:
- explicit VerbatimTokenIterator(std::string_view text)
- : term_(std::move(text)) {}
-
- bool Advance() override {
- if (term_.empty() || has_advanced_to_end_) {
- return false;
- }
-
- has_advanced_to_end_ = true;
- return true;
- }
-
- Token GetToken() const override {
- if (term_.empty() || !has_advanced_to_end_) {
- return Token(Token::Type::INVALID);
- }
-
- return Token(Token::Type::VERBATIM, term_);
- }
-
- libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
- override {
- if (term_.empty()) {
- return absl_ports::AbortedError(
- "Could not calculate start of empty token.");
- }
-
- return CharacterIterator(term_, 0, 0, 0);
- }
-
- libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive()
- override {
- if (term_.empty()) {
- return absl_ports::AbortedError(
- "Could not calculate end of empty token.");
- }
-
- if (token_end_iterator_.utf8_index() >= 0) {
- return token_end_iterator_;
- }
-
- bool moved_to_token_end = token_end_iterator_.MoveToUtf8(term_.length());
- if (moved_to_token_end) {
- return token_end_iterator_;
- } else {
- return absl_ports::AbortedError("Could not move to end of token.");
- }
- }
-
- bool ResetToTokenStartingAfter(int32_t utf32_offset) override {
- // We can only reset to the sole verbatim token, so we must have a negative
- // offset for it to be considered the token after.
- if (utf32_offset < 0) {
- // Because we are now at the sole verbatim token, we should ensure we can
- // no longer advance past it.
- has_advanced_to_end_ = true;
- return true;
- }
- return false;
- }
-
- bool ResetToTokenEndingBefore(int32_t utf32_offset) override {
- // We can only reset to the sole verbatim token, so we must have an offset
- // after the end of the token for the reset to be valid. This means the
- // provided utf-32 offset must be equal to or greater than the utf-32 length
- // of the token.
- if (token_end_iterator_.utf8_index() < 0) {
- // Moves one index past the end of the term.
- bool moved_to_token_end = token_end_iterator_.MoveToUtf8(term_.length());
- if (!moved_to_token_end) {
- // We're unable to reset as we failed to move to the end of the term.
- return false;
- }
- }
-
- if (utf32_offset >= token_end_iterator_.utf32_index()) {
- // Because we are now at the sole verbatim token, we should ensure we can
- // no longer advance past it.
- has_advanced_to_end_ = true;
- return true;
- }
- return false;
- }
-
- bool ResetToStart() override {
- has_advanced_to_end_ = true;
- return true;
- }
-
- private:
- std::string_view term_;
- CharacterIterator token_end_iterator_ = CharacterIterator(term_, -1, -1, -1);
- // Used to determine whether we have advanced on the sole verbatim token
- bool has_advanced_to_end_ = false;
-};
-
-libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
-VerbatimTokenizer::Tokenize(std::string_view text) const {
- return std::make_unique<VerbatimTokenIterator>(text);
-}
-
-libtextclassifier3::StatusOr<std::vector<Token>> VerbatimTokenizer::TokenizeAll(
- std::string_view text) const {
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
- Tokenize(text));
- std::vector<Token> tokens;
- while (iterator->Advance()) {
- tokens.push_back(iterator->GetToken());
- }
- return tokens;
-}
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/tokenization/verbatim-tokenizer.h b/icing/tokenization/verbatim-tokenizer.h
deleted file mode 100644
index 8404cf1..0000000
--- a/icing/tokenization/verbatim-tokenizer.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (C) 2021 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_TOKENIZATION_VERBATIM_H_
-#define ICING_TOKENIZATION_VERBATIM_H_
-
-#include <memory>
-#include <string_view>
-#include <vector>
-
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/tokenization/tokenizer.h"
-
-namespace icing {
-namespace lib {
-
-// Provides verbatim tokenization on input text
-class VerbatimTokenizer : public Tokenizer {
- public:
- libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize(
- std::string_view text) const override;
-
- libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll(
- std::string_view text) const override;
-};
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_TOKENIZATION_VERBATIM_H_
diff --git a/icing/tokenization/verbatim-tokenizer_test.cc b/icing/tokenization/verbatim-tokenizer_test.cc
deleted file mode 100644
index e38c7aa..0000000
--- a/icing/tokenization/verbatim-tokenizer_test.cc
+++ /dev/null
@@ -1,209 +0,0 @@
-// Copyright (C) 2021 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string_view>
-
-#include "gmock/gmock.h"
-#include "icing/portable/platform.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/testing/icu-data-file-helper.h"
-#include "icing/testing/jni-test-helpers.h"
-#include "icing/testing/test-data.h"
-#include "icing/tokenization/language-segmenter-factory.h"
-#include "icing/tokenization/tokenizer-factory.h"
-#include "icing/util/character-iterator.h"
-#include "unicode/uloc.h"
-
-namespace icing {
-namespace lib {
-namespace {
-using ::testing::ElementsAre;
-using ::testing::Eq;
-using ::testing::IsEmpty;
-
-class VerbatimTokenizerTest : public ::testing::Test {
- protected:
- void SetUp() override {
- if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
- ICING_ASSERT_OK(
- // File generated via icu_data_file rule in //icing/BUILD.
- icu_data_file_helper::SetUpICUDataFile(
- GetTestFilePath("icing/icu.dat")));
- }
-
- jni_cache_ = GetTestJniCache();
- language_segmenter_factory::SegmenterOptions options(ULOC_US,
- jni_cache_.get());
- ICING_ASSERT_OK_AND_ASSIGN(
- language_segmenter_,
- language_segmenter_factory::Create(std::move(options)));
- }
-
- std::unique_ptr<const JniCache> jni_cache_;
- std::unique_ptr<LanguageSegmenter> language_segmenter_;
-};
-
-TEST_F(VerbatimTokenizerTest, Empty) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- StringIndexingConfig::TokenizerType::VERBATIM,
- language_segmenter_.get()));
-
- EXPECT_THAT(verbatim_tokenizer->TokenizeAll(""), IsOkAndHolds(IsEmpty()));
-}
-
-TEST_F(VerbatimTokenizerTest, Simple) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- StringIndexingConfig::TokenizerType::VERBATIM,
- language_segmenter_.get()));
-
- EXPECT_THAT(
- verbatim_tokenizer->TokenizeAll("foo bar"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::VERBATIM, "foo bar"))));
-}
-
-TEST_F(VerbatimTokenizerTest, Punctuation) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- StringIndexingConfig::TokenizerType::VERBATIM,
- language_segmenter_.get()));
-
- EXPECT_THAT(verbatim_tokenizer->TokenizeAll("Hello, world!"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::VERBATIM, "Hello, world!"))));
-}
-
-TEST_F(VerbatimTokenizerTest, InvalidTokenBeforeAdvancing) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- StringIndexingConfig::TokenizerType::VERBATIM,
- language_segmenter_.get()));
-
- constexpr std::string_view kText = "Hello, world!";
- auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
-
- // We should get an invalid token if we get the token before advancing.
- EXPECT_THAT(token_iterator->GetToken(),
- EqualsToken(Token::Type::INVALID, ""));
-}
-
-TEST_F(VerbatimTokenizerTest, ResetToTokenEndingBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- StringIndexingConfig::TokenizerType::VERBATIM,
- language_segmenter_.get()));
-
- constexpr std::string_view kText = "Hello, world!";
- auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
-
- // Reset to the beginning of the verbatim token. We provide an offset of 13 as it
- // is larger than the final index (12) of the verbatim token.
- EXPECT_TRUE(token_iterator->ResetToTokenEndingBefore(13));
- EXPECT_THAT(token_iterator->GetToken(),
- EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
-
- // Ensure our cached character iterator properly maintains the end of the
- // verbatim token.
- EXPECT_TRUE(token_iterator->ResetToTokenEndingBefore(13));
- EXPECT_THAT(token_iterator->GetToken(),
- EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
-
- // We should not be able to reset with an offset before or within
- // the verbatim token's utf-32 length.
- EXPECT_FALSE(token_iterator->ResetToTokenEndingBefore(0));
- EXPECT_FALSE(token_iterator->ResetToTokenEndingBefore(12));
-}
-
-TEST_F(VerbatimTokenizerTest, ResetToTokenStartingAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- StringIndexingConfig::TokenizerType::VERBATIM,
- language_segmenter_.get()));
-
- constexpr std::string_view kText = "Hello, world!";
- auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
-
- // Get token without resetting
- EXPECT_TRUE(token_iterator->Advance());
- EXPECT_THAT(token_iterator->GetToken(),
- EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
-
- // We expect a sole verbatim token, so it's not possible to reset after the
- // start of the token.
- EXPECT_FALSE(token_iterator->ResetToTokenStartingAfter(1));
-
- // We expect to be reset to the sole verbatim token when the offset is
- // negative.
- EXPECT_TRUE(token_iterator->ResetToTokenStartingAfter(-1));
- EXPECT_THAT(token_iterator->GetToken(),
- EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
-}
-
-TEST_F(VerbatimTokenizerTest, ResetToStart) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- StringIndexingConfig::TokenizerType::VERBATIM,
- language_segmenter_.get()));
-
- constexpr std::string_view kText = "Hello, world!";
- auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
-
- // Get token without resetting
- EXPECT_TRUE(token_iterator->Advance());
- EXPECT_THAT(token_iterator->GetToken(),
- EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
-
- // Retrieve token again after resetting to start
- EXPECT_TRUE(token_iterator->ResetToStart());
- EXPECT_THAT(token_iterator->GetToken(),
- EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
-}
-
-TEST_F(VerbatimTokenizerTest, CalculateTokenStart) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- StringIndexingConfig::TokenizerType::VERBATIM,
- language_segmenter_.get()));
-
- constexpr std::string_view kText = "Hello, world!";
- auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
-
- ICING_ASSERT_OK_AND_ASSIGN(CharacterIterator start_character_iterator,
- token_iterator->CalculateTokenStart());
-
- // We should retrieve the character 'H', the first character of the token.
- EXPECT_THAT(start_character_iterator.GetCurrentChar(), Eq('H'));
-}
-
-TEST_F(VerbatimTokenizerTest, CalculateTokenEnd) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- StringIndexingConfig::TokenizerType::VERBATIM,
- language_segmenter_.get()));
-
- constexpr std::string_view kText = "Hello, world!";
- auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
-
- ICING_ASSERT_OK_AND_ASSIGN(CharacterIterator end_character_iterator,
- token_iterator->CalculateTokenEndExclusive());
-
- // We should retrieve the null character, as the returned character
- // iterator will be set one past the end of the token.
- EXPECT_THAT(end_character_iterator.GetCurrentChar(), Eq('\0'));
-}
-
-} // namespace
-} // namespace lib
-} // namespace icing
diff --git a/icing/transform/icu/icu-normalizer.cc b/icing/transform/icu/icu-normalizer.cc
index aceb11d..eb0eead 100644
--- a/icing/transform/icu/icu-normalizer.cc
+++ b/icing/transform/icu/icu-normalizer.cc
@@ -29,7 +29,6 @@
#include "icing/util/status-macros.h"
#include "unicode/umachine.h"
#include "unicode/unorm2.h"
-#include "unicode/ustring.h"
#include "unicode/utrans.h"
namespace icing {
@@ -158,18 +157,14 @@ std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2,
const std::string_view term) const {
std::string result;
result.reserve(term.length());
- int current_pos = 0;
- while (current_pos < term.length()) {
- if (i18n_utils::IsAscii(term[current_pos])) {
- result.push_back(std::tolower(term[current_pos]));
- ++current_pos;
- } else {
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(term.data(), term.length(), current_pos);
+ for (int i = 0; i < term.length(); i++) {
+ if (i18n_utils::IsAscii(term[i])) {
+ result.push_back(std::tolower(term[i]));
+ } else if (i18n_utils::IsLeadUtf8Byte(term[i])) {
+ UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i);
if (uchar32 == i18n_utils::kInvalidUChar32) {
ICING_LOG(WARNING) << "Unable to get uchar32 from " << term
- << " at position" << current_pos;
- current_pos += i18n_utils::GetUtf8Length(uchar32);
+ << " at position" << i;
continue;
}
char ascii_char;
@@ -182,9 +177,8 @@ std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2,
// tokenized. We handle it here in case there's something wrong with
// the tokenizers.
int utf8_length = i18n_utils::GetUtf8Length(uchar32);
- absl_ports::StrAppend(&result, term.substr(current_pos, utf8_length));
+ absl_ports::StrAppend(&result, term.substr(i, utf8_length));
}
- current_pos += i18n_utils::GetUtf8Length(uchar32);
}
}
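
Editor's note: the rewritten loop visits every byte but acts on only two kinds: ASCII bytes are lowercased, and UTF-8 lead bytes trigger the diacritic fold; continuation bytes fail both tests and fall through, which is what replaces the old manual current_pos arithmetic. A minimal self-contained sketch of that control flow, with the ICU-dependent fold stubbed out:

#include <cctype>
#include <string>
#include <string_view>

std::string NormalizeLatinSketch(std::string_view term) {
  std::string result;
  result.reserve(term.length());
  for (size_t i = 0; i < term.length(); ++i) {
    unsigned char c = term[i];
    if (c < 0x80) {                    // i18n_utils::IsAscii
      result.push_back(static_cast<char>(std::tolower(c)));
    } else if ((c & 0xC0) != 0x80) {   // i18n_utils::IsLeadUtf8Byte
      // The real code decodes the code point here and tries
      // DiacriticCharToAscii; this sketch just copies the sequence through.
      size_t len = (c >= 0xF0) ? 4 : (c >= 0xE0) ? 3 : 2;
      result.append(term.substr(i, len));
    }
    // Continuation bytes (0b10xxxxxx) match neither branch and are skipped,
    // mirroring the IsLeadUtf8Byte() guard in the loop above.
  }
  return result;
}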
@@ -267,106 +261,5 @@ std::string IcuNormalizer::TermTransformer::Transform(
return std::move(utf8_term_or).ValueOrDie();
}
-CharacterIterator FindNormalizedLatinMatchEndPosition(
- const UNormalizer2* normalizer2, std::string_view term,
- CharacterIterator char_itr, std::string_view normalized_term) {
- CharacterIterator normalized_char_itr(normalized_term);
- char ascii_char;
- while (char_itr.utf8_index() < term.length() &&
- normalized_char_itr.utf8_index() < normalized_term.length()) {
- UChar32 c = char_itr.GetCurrentChar();
- if (i18n_utils::IsAscii(c)) {
- c = std::tolower(c);
- } else if (DiacriticCharToAscii(normalizer2, c, &ascii_char)) {
- c = ascii_char;
- }
- UChar32 normalized_c = normalized_char_itr.GetCurrentChar();
- if (c != normalized_c) {
- return char_itr;
- }
- char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1);
- normalized_char_itr.AdvanceToUtf32(normalized_char_itr.utf32_index() + 1);
- }
- return char_itr;
-}
-
-CharacterIterator
-IcuNormalizer::TermTransformer::FindNormalizedNonLatinMatchEndPosition(
- std::string_view term, CharacterIterator char_itr,
- std::string_view normalized_term) const {
- CharacterIterator normalized_char_itr(normalized_term);
- UErrorCode status = U_ZERO_ERROR;
-
- constexpr int kUtf16CharBufferLength = 6;
- UChar c16[kUtf16CharBufferLength];
- int32_t c16_length;
- int32_t limit;
-
- constexpr int kCharBufferLength = 3 * 4;
- char normalized_buffer[kCharBufferLength];
- int32_t c8_length;
- while (char_itr.utf8_index() < term.length() &&
- normalized_char_itr.utf8_index() < normalized_term.length()) {
- UChar32 c = char_itr.GetCurrentChar();
-    int c_length = i18n_utils::GetUtf8Length(c);
-    u_strFromUTF8(c16, kUtf16CharBufferLength, &c16_length,
-                  term.data() + char_itr.utf8_index(),
-                  /*srcLength=*/c_length, &status);
- if (U_FAILURE(status)) {
- break;
- }
-
- limit = c16_length;
- utrans_transUChars(u_transliterator_, c16, &c16_length,
- kUtf16CharBufferLength,
- /*start=*/0, &limit, &status);
- if (U_FAILURE(status)) {
- break;
- }
-
- u_strToUTF8(normalized_buffer, kCharBufferLength, &c8_length, c16,
- c16_length, &status);
- if (U_FAILURE(status)) {
- break;
- }
-
- for (int i = 0; i < c8_length; ++i) {
- if (normalized_buffer[i] !=
- normalized_term[normalized_char_itr.utf8_index() + i]) {
- return char_itr;
- }
- }
- normalized_char_itr.AdvanceToUtf8(normalized_char_itr.utf8_index() +
- c8_length);
- char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1);
- }
- if (U_FAILURE(status)) {
-    // Failed to transform; return the term's original form.
- ICING_LOG(WARNING) << "Failed to normalize UTF8 term: " << term;
- }
- return char_itr;
-}
-
-CharacterIterator IcuNormalizer::FindNormalizedMatchEndPosition(
- std::string_view term, std::string_view normalized_term) const {
- UErrorCode status = U_ZERO_ERROR;
- // ICU manages the singleton instance
- const UNormalizer2* normalizer2 = unorm2_getNFCInstance(&status);
- if (U_FAILURE(status)) {
- ICING_LOG(WARNING) << "Failed to create a UNormalizer2 instance";
- }
-
- CharacterIterator char_itr(term);
- UChar32 first_uchar32 = char_itr.GetCurrentChar();
- if (normalizer2 != nullptr && first_uchar32 != i18n_utils::kInvalidUChar32 &&
- DiacriticCharToAscii(normalizer2, first_uchar32, /*char_out=*/nullptr)) {
- return FindNormalizedLatinMatchEndPosition(normalizer2, term, char_itr,
- normalized_term);
- } else {
- return term_transformer_->FindNormalizedNonLatinMatchEndPosition(
- term, char_itr, normalized_term);
- }
-}
-
} // namespace lib
} // namespace icing
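
The rewritten NormalizeLatin loop above advances one byte at a time and only decodes at UTF-8 lead bytes, so continuation bytes are skipped implicitly by the for-loop increment. A minimal standalone sketch of that pattern follows; IsLeadUtf8ByteSketch is a local re-implementation for illustration, not the library's i18n_utils version.

    #include <string_view>

    // Continuation bytes have the form 10xxxxxx; anything else begins a
    // character (ASCII bytes included).
    bool IsLeadUtf8ByteSketch(char c) {
      return (static_cast<unsigned char>(c) & 0xC0) != 0x80;
    }

    int CountUtf8CharactersSketch(std::string_view text) {
      int count = 0;
      for (size_t i = 0; i < text.length(); ++i) {
        if (IsLeadUtf8ByteSketch(text[i])) {
          ++count;  // Process only at lead bytes; the loop skips continuations.
        }
      }
      return count;
    }
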
diff --git a/icing/transform/icu/icu-normalizer.h b/icing/transform/icu/icu-normalizer.h
index d4f1ebd..f20a9fb 100644
--- a/icing/transform/icu/icu-normalizer.h
+++ b/icing/transform/icu/icu-normalizer.h
@@ -21,7 +21,6 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/transform/normalizer.h"
-#include "icing/util/character-iterator.h"
#include "unicode/unorm2.h"
#include "unicode/utrans.h"
@@ -57,17 +56,6 @@ class IcuNormalizer : public Normalizer {
// result in the non-Latin characters not properly being normalized
std::string NormalizeTerm(std::string_view term) const override;
- // Returns a CharacterIterator pointing to one past the end of the segment of
- // term that (once normalized) matches with normalized_term.
- //
- // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return
- // CharacterIterator(u8:4, u16:4, u32:4).
- //
- // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return
- // CharacterIterator(u8:0, u16:0, u32:0).
- CharacterIterator FindNormalizedMatchEndPosition(
- std::string_view term, std::string_view normalized_term) const override;
-
private:
// A handler class that helps manage the lifecycle of UTransliterator. It's
// used in IcuNormalizer to transform terms into the formats we need.
@@ -87,12 +75,6 @@ class IcuNormalizer : public Normalizer {
// Transforms the text based on our rules described at top of this file
std::string Transform(std::string_view term) const;
- // Returns a CharacterIterator pointing to one past the end of the segment
- // of a non-latin term that (once normalized) matches with normalized_term.
- CharacterIterator FindNormalizedNonLatinMatchEndPosition(
- std::string_view term, CharacterIterator char_itr,
- std::string_view normalized_term) const;
-
private:
explicit TermTransformer(UTransliterator* u_transliterator);
diff --git a/icing/transform/icu/icu-normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc
index fdd4c70..b037538 100644
--- a/icing/transform/icu/icu-normalizer_benchmark.cc
+++ b/icing/transform/icu/icu-normalizer_benchmark.cc
@@ -14,8 +14,8 @@
#include "testing/base/public/benchmark.h"
#include "gmock/gmock.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
-#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
@@ -161,124 +161,6 @@ BENCHMARK(BM_NormalizeHiragana)
->Arg(2048000)
->Arg(4096000);
-void BM_UppercaseSubTokenLength(benchmark::State& state) {
- bool run_via_adb = absl::GetFlag(FLAGS_adb);
- if (!run_via_adb) {
- ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
- GetTestFilePath("icing/icu.dat")));
- }
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Normalizer> normalizer,
- normalizer_factory::Create(
- /*max_term_byte_size=*/std::numeric_limits<int>::max()));
-
- std::string input_string(state.range(0), 'A');
- std::string normalized_input_string(state.range(0), 'a');
- for (auto _ : state) {
- normalizer->FindNormalizedMatchEndPosition(input_string,
- normalized_input_string);
- }
-}
-BENCHMARK(BM_UppercaseSubTokenLength)
- ->Arg(1000)
- ->Arg(2000)
- ->Arg(4000)
- ->Arg(8000)
- ->Arg(16000)
- ->Arg(32000)
- ->Arg(64000)
- ->Arg(128000)
- ->Arg(256000)
- ->Arg(384000)
- ->Arg(512000)
- ->Arg(1024000)
- ->Arg(2048000)
- ->Arg(4096000);
-
-void BM_AccentSubTokenLength(benchmark::State& state) {
- bool run_via_adb = absl::GetFlag(FLAGS_adb);
- if (!run_via_adb) {
- ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
- GetTestFilePath("icing/icu.dat")));
- }
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Normalizer> normalizer,
- normalizer_factory::Create(
- /*max_term_byte_size=*/std::numeric_limits<int>::max()));
-
- std::string input_string;
- std::string normalized_input_string;
- while (input_string.length() < state.range(0)) {
- input_string.append("àáâãā");
- normalized_input_string.append("aaaaa");
- }
-
- for (auto _ : state) {
- normalizer->FindNormalizedMatchEndPosition(input_string,
- normalized_input_string);
- }
-}
-BENCHMARK(BM_AccentSubTokenLength)
- ->Arg(1000)
- ->Arg(2000)
- ->Arg(4000)
- ->Arg(8000)
- ->Arg(16000)
- ->Arg(32000)
- ->Arg(64000)
- ->Arg(128000)
- ->Arg(256000)
- ->Arg(384000)
- ->Arg(512000)
- ->Arg(1024000)
- ->Arg(2048000)
- ->Arg(4096000);
-
-void BM_HiraganaSubTokenLength(benchmark::State& state) {
- bool run_via_adb = absl::GetFlag(FLAGS_adb);
- if (!run_via_adb) {
- ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
- GetTestFilePath("icing/icu.dat")));
- }
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Normalizer> normalizer,
- normalizer_factory::Create(
- /*max_term_byte_size=*/std::numeric_limits<int>::max()));
-
- std::string input_string;
- std::string normalized_input_string;
- while (input_string.length() < state.range(0)) {
- input_string.append("あいうえお");
- normalized_input_string.append("アイウエオ");
- }
-
- for (auto _ : state) {
- normalizer->FindNormalizedMatchEndPosition(input_string,
- normalized_input_string);
- }
-}
-BENCHMARK(BM_HiraganaSubTokenLength)
- ->Arg(1000)
- ->Arg(2000)
- ->Arg(4000)
- ->Arg(8000)
- ->Arg(16000)
- ->Arg(32000)
- ->Arg(64000)
- ->Arg(128000)
- ->Arg(256000)
- ->Arg(384000)
- ->Arg(512000)
- ->Arg(1024000)
- ->Arg(2048000)
- ->Arg(4096000);
-
} // namespace
} // namespace lib
diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc
index 143da17..f5d20ff 100644
--- a/icing/transform/icu/icu-normalizer_test.cc
+++ b/icing/transform/icu/icu-normalizer_test.cc
@@ -16,8 +16,8 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
-#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/icu-i18n-test-utils.h"
#include "icing/testing/test-data.h"
#include "icing/transform/normalizer-factory.h"
@@ -231,104 +231,6 @@ TEST_F(IcuNormalizerTest, Truncate) {
}
}
-TEST_F(IcuNormalizerTest, PrefixMatchLength) {
- // Verify that FindNormalizedMatchEndPosition will properly find the length of
- // the prefix match when given a non-normalized term and a normalized term
- // that is a prefix of the non-normalized one.
- ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
- /*max_term_byte_size=*/1000));
-
- // Upper to lower
- std::string term = "MDI";
- CharacterIterator match_end =
- normalizer->FindNormalizedMatchEndPosition(term, "md");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("MD"));
-
- term = "Icing";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "icin");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Icin"));
-
- // Full-width
- term = "525600";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "525");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("525"));
-
- term = "FULLWIDTH";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "full");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("FULL"));
-
- // Hiragana to Katakana
- term = "あいうえお";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイ");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい"));
-
- term = "かきくけこ";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "カ");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か"));
-
- // Latin accents
- term = "Zürich";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "zur");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür"));
-
- term = "après-midi";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "apre");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè"));
-
- term = "Buenos días";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "buenos di");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Buenos dí"));
-}
-
-TEST_F(IcuNormalizerTest, SharedPrefixMatchLength) {
- // Verify that FindNormalizedMatchEndPosition will properly find the length of
- // the prefix match when given a non-normalized term and a normalized term
- // that share a common prefix.
- ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
- /*max_term_byte_size=*/1000));
-
- // Upper to lower
- std::string term = "MDI";
- CharacterIterator match_end =
- normalizer->FindNormalizedMatchEndPosition(term, "mgm");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("M"));
-
- term = "Icing";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "icky");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Ic"));
-
- // Full-width
- term = "525600";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "525788");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("525"));
-
- term = "FULLWIDTH";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "fully");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("FULL"));
-
- // Hiragana to Katakana
- term = "あいうえお";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイエオ");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい"));
-
- term = "かきくけこ";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "カケコ");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か"));
-
- // Latin accents
- term = "Zürich";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "zurg");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür"));
-
- term = "après-midi";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "apreciate");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè"));
-
- term = "días";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "diamond");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("día"));
-}
-
} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/transform/map/map-normalizer.cc b/icing/transform/map/map-normalizer.cc
index 61fce65..c888551 100644
--- a/icing/transform/map/map-normalizer.cc
+++ b/icing/transform/map/map-normalizer.cc
@@ -14,7 +14,8 @@
#include "icing/transform/map/map-normalizer.h"
-#include <cctype>
+#include <ctype.h>
+
#include <string>
#include <string_view>
#include <unordered_map>
@@ -22,7 +23,6 @@
#include "icing/absl_ports/str_cat.h"
#include "icing/transform/map/normalization-map.h"
-#include "icing/util/character-iterator.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/logging.h"
#include "unicode/utypes.h"
@@ -30,70 +30,48 @@
namespace icing {
namespace lib {
-namespace {
-
-UChar32 NormalizeChar(UChar32 c) {
- if (i18n_utils::GetUtf16Length(c) > 1) {
- // All the characters we need to normalize can be encoded into a
- // single char16_t. If this character needs more than 1 char16_t code
- // unit, we can skip normalization and append it directly.
- return c;
- }
-
- // The original character can be encoded into a single char16_t.
- const std::unordered_map<char16_t, char16_t>* normalization_map =
- GetNormalizationMap();
- if (normalization_map == nullptr) {
- // Normalization map couldn't be properly initialized, append the original
- // character.
- ICING_LOG(WARNING) << "Unable to get a valid pointer to normalization map!";
- return c;
- }
- auto iterator = normalization_map->find(static_cast<char16_t>(c));
- if (iterator == normalization_map->end()) {
- // Normalization mapping not found, append the original character.
- return c;
- }
-
- // Found a normalization mapping. The normalized character (stored in a
- // char16_t) can have 1 or 2 bytes.
- if (i18n_utils::IsAscii(iterator->second)) {
- // The normalized character has 1 byte. It may be an upper-case char.
- // Lower-case it before returning it.
- return std::tolower(static_cast<char>(iterator->second));
- } else {
- return iterator->second;
- }
-}
-
-} // namespace
-
std::string MapNormalizer::NormalizeTerm(std::string_view term) const {
std::string normalized_text;
normalized_text.reserve(term.length());
- int current_pos = 0;
- while (current_pos < term.length()) {
- if (i18n_utils::IsAscii(term[current_pos])) {
- normalized_text.push_back(std::tolower(term[current_pos]));
- ++current_pos;
- } else {
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(term.data(), term.length(), current_pos);
+ for (int i = 0; i < term.length(); ++i) {
+ if (i18n_utils::IsAscii(term[i])) {
+ // The original character has 1 byte.
+ normalized_text.push_back(std::tolower(term[i]));
+ } else if (i18n_utils::IsLeadUtf8Byte(term[i])) {
+ UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i);
if (uchar32 == i18n_utils::kInvalidUChar32) {
ICING_LOG(WARNING) << "Unable to get uchar32 from " << term
- << " at position" << current_pos;
- ++current_pos;
+                           << " at position " << i;
continue;
}
- UChar32 normalized_char32 = NormalizeChar(uchar32);
- if (i18n_utils::IsAscii(normalized_char32)) {
- normalized_text.push_back(normalized_char32);
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+ if (i18n_utils::GetUtf16Length(uchar32) > 1) {
+ // All the characters we need to normalize can be encoded into a
+ // single char16_t. If this character needs more than 1 char16_t code
+ // unit, we can skip normalization and append it directly.
+ absl_ports::StrAppend(&normalized_text, term.substr(i, utf8_length));
+ continue;
+ }
+ // The original character can be encoded into a single char16_t.
+ const std::unordered_map<char16_t, char16_t>& normalization_map =
+ GetNormalizationMap();
+ auto iterator = normalization_map.find(static_cast<char16_t>(uchar32));
+ if (iterator != normalization_map.end()) {
+ // Found a normalization mapping. The normalized character (stored in a
+ // char16_t) can have 1 or 2 bytes.
+ if (i18n_utils::IsAscii(iterator->second)) {
+ // The normalized character has 1 byte.
+ normalized_text.push_back(
+ std::tolower(static_cast<char>(iterator->second)));
+ } else {
+ // The normalized character has 2 bytes.
+ i18n_utils::AppendUchar32ToUtf8(&normalized_text, iterator->second);
+ }
} else {
- // The normalized character has 2 bytes.
- i18n_utils::AppendUchar32ToUtf8(&normalized_text, normalized_char32);
+ // Normalization mapping not found, append the original character.
+ absl_ports::StrAppend(&normalized_text, term.substr(i, utf8_length));
}
- current_pos += i18n_utils::GetUtf8Length(uchar32);
}
}
@@ -104,27 +82,5 @@ std::string MapNormalizer::NormalizeTerm(std::string_view term) const {
return normalized_text;
}
-CharacterIterator MapNormalizer::FindNormalizedMatchEndPosition(
- std::string_view term, std::string_view normalized_term) const {
- CharacterIterator char_itr(term);
- CharacterIterator normalized_char_itr(normalized_term);
- while (char_itr.utf8_index() < term.length() &&
- normalized_char_itr.utf8_index() < normalized_term.length()) {
- UChar32 c = char_itr.GetCurrentChar();
- if (i18n_utils::IsAscii(c)) {
- c = std::tolower(c);
- } else {
- c = NormalizeChar(c);
- }
- UChar32 normalized_c = normalized_char_itr.GetCurrentChar();
- if (c != normalized_c) {
- return char_itr;
- }
- char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1);
- normalized_char_itr.AdvanceToUtf32(normalized_char_itr.utf32_index() + 1);
- }
- return char_itr;
-}
-
} // namespace lib
} // namespace icing
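
A short usage sketch of the rewritten NormalizeTerm, in the same style as the tests later in this patch; the expected outputs follow from the mappings those tests exercise (lowercasing of ASCII, accent stripping via the normalization map):

    ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
                                                    /*max_term_byte_size=*/1000));
    // ASCII bytes are lowercased in place; mapped characters go through the
    // normalization map (accents stripped, full-width folded, etc.).
    EXPECT_THAT(normalizer->NormalizeTerm("Zürich"), Eq("zurich"));
    EXPECT_THAT(normalizer->NormalizeTerm("MDI"), Eq("mdi"));
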
diff --git a/icing/transform/map/map-normalizer.h b/icing/transform/map/map-normalizer.h
index ed996ae..f9c0e42 100644
--- a/icing/transform/map/map-normalizer.h
+++ b/icing/transform/map/map-normalizer.h
@@ -19,7 +19,6 @@
#include <string_view>
#include "icing/transform/normalizer.h"
-#include "icing/util/character-iterator.h"
namespace icing {
namespace lib {
@@ -40,17 +39,6 @@ class MapNormalizer : public Normalizer {
// Read more mapping details in normalization-map.cc
std::string NormalizeTerm(std::string_view term) const override;
- // Returns a CharacterIterator pointing to one past the end of the segment of
- // term that (once normalized) matches with normalized_term.
- //
- // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return
- // CharacterIterator(u8:4, u16:4, u32:4).
- //
- // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return
- // CharacterIterator(u8:0, u16:0, u32:0).
- CharacterIterator FindNormalizedMatchEndPosition(
- std::string_view term, std::string_view normalized_term) const override;
-
private:
// The maximum term length allowed after normalization.
int max_term_byte_size_;
diff --git a/icing/transform/map/map-normalizer_benchmark.cc b/icing/transform/map/map-normalizer_benchmark.cc
index 8268541..691afc6 100644
--- a/icing/transform/map/map-normalizer_benchmark.cc
+++ b/icing/transform/map/map-normalizer_benchmark.cc
@@ -143,104 +143,6 @@ BENCHMARK(BM_NormalizeHiragana)
->Arg(2048000)
->Arg(4096000);
-void BM_UppercaseSubTokenLength(benchmark::State& state) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Normalizer> normalizer,
- normalizer_factory::Create(
- /*max_term_byte_size=*/std::numeric_limits<int>::max()));
-
- std::string input_string(state.range(0), 'A');
- std::string normalized_input_string(state.range(0), 'a');
- for (auto _ : state) {
- normalizer->FindNormalizedMatchEndPosition(input_string,
- normalized_input_string);
- }
-}
-BENCHMARK(BM_UppercaseSubTokenLength)
- ->Arg(1000)
- ->Arg(2000)
- ->Arg(4000)
- ->Arg(8000)
- ->Arg(16000)
- ->Arg(32000)
- ->Arg(64000)
- ->Arg(128000)
- ->Arg(256000)
- ->Arg(384000)
- ->Arg(512000)
- ->Arg(1024000)
- ->Arg(2048000)
- ->Arg(4096000);
-
-void BM_AccentSubTokenLength(benchmark::State& state) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Normalizer> normalizer,
- normalizer_factory::Create(
- /*max_term_byte_size=*/std::numeric_limits<int>::max()));
-
- std::string input_string;
- std::string normalized_input_string;
- while (input_string.length() < state.range(0)) {
- input_string.append("àáâãā");
- normalized_input_string.append("aaaaa");
- }
-
- for (auto _ : state) {
- normalizer->FindNormalizedMatchEndPosition(input_string,
- normalized_input_string);
- }
-}
-BENCHMARK(BM_AccentSubTokenLength)
- ->Arg(1000)
- ->Arg(2000)
- ->Arg(4000)
- ->Arg(8000)
- ->Arg(16000)
- ->Arg(32000)
- ->Arg(64000)
- ->Arg(128000)
- ->Arg(256000)
- ->Arg(384000)
- ->Arg(512000)
- ->Arg(1024000)
- ->Arg(2048000)
- ->Arg(4096000);
-
-void BM_HiraganaSubTokenLength(benchmark::State& state) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Normalizer> normalizer,
- normalizer_factory::Create(
- /*max_term_byte_size=*/std::numeric_limits<int>::max()));
-
- std::string input_string;
- std::string normalized_input_string;
- while (input_string.length() < state.range(0)) {
- input_string.append("あいうえお");
- normalized_input_string.append("アイウエオ");
- }
-
- for (auto _ : state) {
- normalizer->FindNormalizedMatchEndPosition(input_string,
- normalized_input_string);
- }
-}
-BENCHMARK(BM_HiraganaSubTokenLength)
- ->Arg(1000)
- ->Arg(2000)
- ->Arg(4000)
- ->Arg(8000)
- ->Arg(16000)
- ->Arg(32000)
- ->Arg(64000)
- ->Arg(128000)
- ->Arg(256000)
- ->Arg(384000)
- ->Arg(512000)
- ->Arg(1024000)
- ->Arg(2048000)
- ->Arg(4096000);
-
} // namespace
} // namespace lib
diff --git a/icing/transform/map/map-normalizer_test.cc b/icing/transform/map/map-normalizer_test.cc
index adc5623..b62ae0e 100644
--- a/icing/transform/map/map-normalizer_test.cc
+++ b/icing/transform/map/map-normalizer_test.cc
@@ -23,7 +23,6 @@
#include "icing/testing/icu-i18n-test-utils.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
-#include "icing/util/character-iterator.h"
namespace icing {
namespace lib {
@@ -200,104 +199,6 @@ TEST(MapNormalizerTest, Truncate) {
}
}
-TEST(MapNormalizerTest, PrefixMatchLength) {
- // Verify that FindNormalizedMatchEndPosition will properly find the length of
- // the prefix match when given a non-normalized term and a normalized term
- // that is a prefix of the non-normalized one.
- ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
- /*max_term_byte_size=*/1000));
-
- // Upper to lower
- std::string term = "MDI";
- CharacterIterator match_end =
- normalizer->FindNormalizedMatchEndPosition(term, "md");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("MD"));
-
- term = "Icing";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "icin");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Icin"));
-
- // Full-width
- term = "525600";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "525");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("525"));
-
- term = "FULLWIDTH";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "full");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("FULL"));
-
- // Hiragana to Katakana
- term = "あいうえお";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイ");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい"));
-
- term = "かきくけこ";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "カ");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か"));
-
- // Latin accents
- term = "Zürich";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "zur");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür"));
-
- term = "après-midi";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "apre");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè"));
-
- term = "Buenos días";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "buenos di");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Buenos dí"));
-}
-
-TEST(MapNormalizerTest, SharedPrefixMatchLength) {
- // Verify that FindNormalizedMatchEndPosition will properly find the length of
- // the prefix match when given a non-normalized term and a normalized term
- // that share a common prefix.
- ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
- /*max_term_byte_size=*/1000));
-
- // Upper to lower
- std::string term = "MDI";
- CharacterIterator match_end =
- normalizer->FindNormalizedMatchEndPosition(term, "mgm");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("M"));
-
- term = "Icing";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "icky");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Ic"));
-
- // Full-width
- term = "525600";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "525788");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("525"));
-
- term = "FULLWIDTH";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "fully");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("FULL"));
-
- // Hiragana to Katakana
- term = "あいうえお";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイエオ");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい"));
-
- term = "かきくけこ";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "カケコ");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か"));
-
- // Latin accents
- term = "Zürich";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "zurg");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür"));
-
- term = "après-midi";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "apreciate");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè"));
-
- term = "días";
- match_end = normalizer->FindNormalizedMatchEndPosition(term, "diamond");
- EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("día"));
-}
-
} // namespace
} // namespace lib
diff --git a/icing/transform/map/normalization-map.cc b/icing/transform/map/normalization-map.cc
index 0994ab8..c318036 100644
--- a/icing/transform/map/normalization-map.cc
+++ b/icing/transform/map/normalization-map.cc
@@ -691,21 +691,19 @@ constexpr NormalizationPair kNormalizationMappings[] = {
} // namespace
-const std::unordered_map<char16_t, char16_t> *GetNormalizationMap() {
+const std::unordered_map<char16_t, char16_t>& GetNormalizationMap() {
// The map is allocated dynamically the first time this function is executed.
- static const std::unordered_map<char16_t, char16_t> *const normalization_map =
- [] {
- auto *map = new std::unordered_map<char16_t, char16_t>();
- // Size of all the mappings is about 2.5 KiB.
- constexpr int numMappings =
- sizeof(kNormalizationMappings) / sizeof(NormalizationPair);
- map->reserve(numMappings);
- for (size_t i = 0; i < numMappings; ++i) {
- map->emplace(kNormalizationMappings[i].from,
- kNormalizationMappings[i].to);
- }
- return map;
- }();
+ static const std::unordered_map<char16_t, char16_t> normalization_map = [] {
+ std::unordered_map<char16_t, char16_t> map;
+ // Size of all the mappings is about 2.5 KiB.
+ constexpr int numMappings =
+ sizeof(kNormalizationMappings) / sizeof(NormalizationPair);
+ map.reserve(numMappings);
+ for (size_t i = 0; i < numMappings; ++i) {
+ map.emplace(kNormalizationMappings[i].from, kNormalizationMappings[i].to);
+ }
+ return map;
+ }();
return normalization_map;
}
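
The change above swaps a leaked heap allocation for a function-local static. In isolation the pattern looks like the sketch below; since C++11 the initialization is lazy, runs exactly once, and is thread-safe, which is what makes returning a reference (with no nullptr failure mode) sound:

    #include <unordered_map>

    const std::unordered_map<int, int>& GetLookupTable() {
      // Initialized on first call only; guaranteed thread-safe since C++11.
      static const std::unordered_map<int, int> table = [] {
        std::unordered_map<int, int> t;
        t.emplace(1, 2);  // illustrative entries
        return t;
      }();
      return table;
    }
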
diff --git a/icing/transform/map/normalization-map.h b/icing/transform/map/normalization-map.h
index ac7872b..aea85bd 100644
--- a/icing/transform/map/normalization-map.h
+++ b/icing/transform/map/normalization-map.h
@@ -23,7 +23,7 @@ namespace lib {
// Returns a map containing normalization mappings. A mapping (A -> B) means
// that we'll transform every character 'A' into 'B'. See normalization-map.cc
// for mapping details.
-const std::unordered_map<char16_t, char16_t>* GetNormalizationMap();
+const std::unordered_map<char16_t, char16_t>& GetNormalizationMap();
} // namespace lib
} // namespace icing
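
At call sites, moving from a pointer to a reference removes the null-check branch. A sketch of the before/after shape (the u'ü' lookup is illustrative; the actual entries live in normalization-map.cc):

    // Before: pointer API, callers had to handle nullptr.
    const std::unordered_map<char16_t, char16_t>* map_ptr = GetNormalizationMap();
    if (map_ptr == nullptr) { /* fall back to the original character */ }

    // After: reference API, only the missing-entry case remains.
    const std::unordered_map<char16_t, char16_t>& map = GetNormalizationMap();
    auto it = map.find(u'ü');
    if (it != map.end()) { /* use it->second */ }
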
diff --git a/icing/transform/normalizer.h b/icing/transform/normalizer.h
index 2110f0f..4cbfa63 100644
--- a/icing/transform/normalizer.h
+++ b/icing/transform/normalizer.h
@@ -20,7 +20,6 @@
#include <string_view>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/util/character-iterator.h"
namespace icing {
namespace lib {
@@ -40,17 +39,6 @@ class Normalizer {
// Normalizes the input term based on rules. See implementation classes for
// specific transformation rules.
virtual std::string NormalizeTerm(std::string_view term) const = 0;
-
- // Returns a CharacterIterator pointing to one past the end of the segment of
- // term that (once normalized) matches with normalized_term.
- //
- // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return
- // CharacterIterator(u8:4, u16:4, u32:4).
- //
- // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return
- // CharacterIterator(u8:0, u16:0, u32:0).
- virtual CharacterIterator FindNormalizedMatchEndPosition(
- std::string_view term, std::string_view normalized_term) const = 0;
};
} // namespace lib
diff --git a/icing/transform/simple/none-normalizer-factory.cc b/icing/transform/simple/none-normalizer-factory.cc
new file mode 100644
index 0000000..6b35270
--- /dev/null
+++ b/icing/transform/simple/none-normalizer-factory.cc
@@ -0,0 +1,53 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_
+#define ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_
+
+#include <memory>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/transform/normalizer.h"
+#include "icing/transform/simple/none-normalizer.h"
+
+namespace icing {
+namespace lib {
+
+namespace normalizer_factory {
+
+// Creates a dummy normalizer. The term is not normalized, but
+// the text will be truncated to max_term_byte_size if it exceeds the max size.
+//
+// Returns:
+// A normalizer on success
+// INVALID_ARGUMENT if max_term_byte_size <= 0
+// INTERNAL_ERROR on errors
+libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
+ int max_term_byte_size) {
+ if (max_term_byte_size <= 0) {
+ return absl_ports::InvalidArgumentError(
+ "max_term_byte_size must be greater than zero.");
+ }
+
+ return std::make_unique<NoneNormalizer>(max_term_byte_size);
+}
+
+} // namespace normalizer_factory
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_
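
A usage sketch for this factory, following the StatusOr conventions visible elsewhere in this patch (ValueOrDie, ok, and status are assumed from libtextclassifier3's StatusOr):

    auto normalizer_or = normalizer_factory::Create(/*max_term_byte_size=*/100);
    if (!normalizer_or.ok()) {
      // INVALID_ARGUMENT is returned when max_term_byte_size <= 0.
      return normalizer_or.status();
    }
    std::unique_ptr<Normalizer> normalizer = std::move(normalizer_or).ValueOrDie();
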
diff --git a/icing/transform/simple/none-normalizer.h b/icing/transform/simple/none-normalizer.h
new file mode 100644
index 0000000..47085e1
--- /dev/null
+++ b/icing/transform/simple/none-normalizer.h
@@ -0,0 +1,51 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_
+#define ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_
+
+#include <string>
+#include <string_view>
+
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+
+// This normalizer is not meant for production use. Currently only used to get
+// the Icing library to compile in Jetpack.
+//
+// No normalization is done, but the term is truncated if it exceeds
+// max_term_byte_size.
+class NoneNormalizer : public Normalizer {
+ public:
+ explicit NoneNormalizer(int max_term_byte_size)
+      : max_term_byte_size_(max_term_byte_size) {}
+
+ std::string NormalizeTerm(std::string_view term) const override {
+ if (term.length() > max_term_byte_size_) {
+ return std::string(term.substr(0, max_term_byte_size_));
+ }
+ return std::string(term);
+ }
+
+ private:
+ // The maximum term length allowed after normalization.
+ int max_term_byte_size_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_
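
One behavior worth noting, inferred from the substr call above rather than from any stated guarantee: truncation happens at a byte offset, so a limit that lands inside a multi-byte UTF-8 character splits it:

    NoneNormalizer normalizer(/*max_term_byte_size=*/2);
    // "ü" is two bytes in UTF-8 (0xC3 0xBC), so the cut falls mid-character:
    std::string out = normalizer.NormalizeTerm("Zürich");  // "Z" followed by 0xC3
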
diff --git a/icing/transform/simple/none-normalizer_test.cc b/icing/transform/simple/none-normalizer_test.cc
new file mode 100644
index 0000000..e074828
--- /dev/null
+++ b/icing/transform/simple/none-normalizer_test.cc
@@ -0,0 +1,74 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+using ::testing::Eq;
+
+TEST(NoneNormalizerTest, Creation) {
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/5),
+ IsOk());
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/0),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(NoneNormalizerTest, NoNormalizationDone) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+ EXPECT_THAT(normalizer->NormalizeTerm(""), Eq(""));
+ EXPECT_THAT(normalizer->NormalizeTerm("hello world"), Eq("hello world"));
+
+ // Capitalization
+ EXPECT_THAT(normalizer->NormalizeTerm("MDI"), Eq("MDI"));
+
+ // Accents
+ EXPECT_THAT(normalizer->NormalizeTerm("Zürich"), Eq("Zürich"));
+
+ // Full-width punctuation to ASCII punctuation
+ EXPECT_THAT(normalizer->NormalizeTerm("。,!?:”"), Eq("。,!?:”"));
+
+ // Half-width katakana
+ EXPECT_THAT(normalizer->NormalizeTerm("カ"), Eq("カ"));
+}
+
+TEST(NoneNormalizerTest, Truncate) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/5));
+
+ // Won't be truncated
+ EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi"));
+ EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello"));
+
+ // Truncated to length 5.
+ EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello"));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/character-iterator.cc b/icing/util/character-iterator.cc
index 0ab1e50..6c5faef 100644
--- a/icing/util/character-iterator.cc
+++ b/icing/util/character-iterator.cc
@@ -14,8 +14,6 @@
#include "icing/util/character-iterator.h"
-#include "icing/util/i18n-utils.h"
-
namespace icing {
namespace lib {
@@ -32,37 +30,22 @@ int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
} // namespace
-UChar32 CharacterIterator::GetCurrentChar() {
- if (cached_current_char_ == i18n_utils::kInvalidUChar32) {
- // Our indices point to the right character, we just need to read that
- // character. No need to worry about an error. If GetUChar32At fails, then
- // current_char will be i18n_utils::kInvalidUChar32.
- cached_current_char_ =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
- }
- return cached_current_char_;
-}
-
bool CharacterIterator::MoveToUtf8(int desired_utf8_index) {
return (desired_utf8_index > utf8_index_) ? AdvanceToUtf8(desired_utf8_index)
: RewindToUtf8(desired_utf8_index);
}
bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
- ResetToStartIfNecessary();
-
if (desired_utf8_index > text_.length()) {
// Enforce the requirement.
return false;
}
// Need to work forwards.
- UChar32 uchar32 = cached_current_char_;
while (utf8_index_ < desired_utf8_index) {
- uchar32 =
+ UChar32 uchar32 =
i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
if (uchar32 == i18n_utils::kInvalidUChar32) {
// Unable to retrieve a valid UTF-32 character at the previous position.
- cached_current_char_ = i18n_utils::kInvalidUChar32;
return false;
}
int utf8_length = i18n_utils::GetUtf8Length(uchar32);
@@ -74,8 +57,6 @@ bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
++utf32_index_;
}
- cached_current_char_ =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
return true;
}
@@ -85,30 +66,21 @@ bool CharacterIterator::RewindToUtf8(int desired_utf8_index) {
return false;
}
// Need to work backwards.
- UChar32 uchar32 = cached_current_char_;
while (utf8_index_ > desired_utf8_index) {
- int utf8_index = utf8_index_ - 1;
- utf8_index = GetUTF8StartPosition(text_, utf8_index);
- if (utf8_index < 0) {
+ --utf8_index_;
+ utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
+ if (utf8_index_ < 0) {
// Somehow, there wasn't a single UTF-8 lead byte at
// requested_byte_index or an earlier byte.
- cached_current_char_ = i18n_utils::kInvalidUChar32;
return false;
}
// We've found the start of a unicode char!
- uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
- int expected_length = utf8_index_ - utf8_index;
- if (uchar32 == i18n_utils::kInvalidUChar32 ||
- expected_length != i18n_utils::GetUtf8Length(uchar32)) {
- // Either unable to retrieve a valid UTF-32 character at the previous
- // position or we skipped past an invalid sequence while seeking the
- // previous start position.
- cached_current_char_ = i18n_utils::kInvalidUChar32;
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
return false;
}
- cached_current_char_ = uchar32;
- utf8_index_ = utf8_index;
utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
--utf32_index_;
}
@@ -122,15 +94,11 @@ bool CharacterIterator::MoveToUtf16(int desired_utf16_index) {
}
bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) {
- ResetToStartIfNecessary();
-
- UChar32 uchar32 = cached_current_char_;
while (utf16_index_ < desired_utf16_index) {
- uchar32 =
+ UChar32 uchar32 =
i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
if (uchar32 == i18n_utils::kInvalidUChar32) {
// Unable to retrieve a valid UTF-32 character at the previous position.
- cached_current_char_ = i18n_utils::kInvalidUChar32;
return false;
}
int utf16_length = i18n_utils::GetUtf16Length(uchar32);
@@ -141,15 +109,12 @@ bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) {
int utf8_length = i18n_utils::GetUtf8Length(uchar32);
if (utf8_index_ + utf8_length > text_.length()) {
// Enforce the requirement.
- cached_current_char_ = i18n_utils::kInvalidUChar32;
return false;
}
utf8_index_ += utf8_length;
utf16_index_ += utf16_length;
++utf32_index_;
}
- cached_current_char_ =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
return true;
}
@@ -157,30 +122,21 @@ bool CharacterIterator::RewindToUtf16(int desired_utf16_index) {
if (desired_utf16_index < 0) {
return false;
}
- UChar32 uchar32 = cached_current_char_;
while (utf16_index_ > desired_utf16_index) {
- int utf8_index = utf8_index_ - 1;
- utf8_index = GetUTF8StartPosition(text_, utf8_index);
- if (utf8_index < 0) {
+ --utf8_index_;
+ utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
+ if (utf8_index_ < 0) {
// Somehow, there wasn't a single UTF-8 lead byte at
// requested_byte_index or an earlier byte.
- cached_current_char_ = i18n_utils::kInvalidUChar32;
return false;
}
// We've found the start of a unicode char!
- uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
- int expected_length = utf8_index_ - utf8_index;
- if (uchar32 == i18n_utils::kInvalidUChar32 ||
- expected_length != i18n_utils::GetUtf8Length(uchar32)) {
- // Either unable to retrieve a valid UTF-32 character at the previous
- // position or we skipped past an invalid sequence while seeking the
- // previous start position.
- cached_current_char_ = i18n_utils::kInvalidUChar32;
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
return false;
}
- cached_current_char_ = uchar32;
- utf8_index_ = utf8_index;
utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
--utf32_index_;
}
@@ -194,30 +150,23 @@ bool CharacterIterator::MoveToUtf32(int desired_utf32_index) {
}
bool CharacterIterator::AdvanceToUtf32(int desired_utf32_index) {
- ResetToStartIfNecessary();
-
- UChar32 uchar32 = cached_current_char_;
while (utf32_index_ < desired_utf32_index) {
- uchar32 =
+ UChar32 uchar32 =
i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
if (uchar32 == i18n_utils::kInvalidUChar32) {
// Unable to retrieve a valid UTF-32 character at the previous position.
- cached_current_char_ = i18n_utils::kInvalidUChar32;
return false;
}
int utf16_length = i18n_utils::GetUtf16Length(uchar32);
int utf8_length = i18n_utils::GetUtf8Length(uchar32);
if (utf8_index_ + utf8_length > text_.length()) {
// Enforce the requirement.
- cached_current_char_ = i18n_utils::kInvalidUChar32;
return false;
}
utf8_index_ += utf8_length;
utf16_index_ += utf16_length;
++utf32_index_;
}
- cached_current_char_ =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
return true;
}
@@ -225,45 +174,26 @@ bool CharacterIterator::RewindToUtf32(int desired_utf32_index) {
if (desired_utf32_index < 0) {
return false;
}
- UChar32 uchar32 = cached_current_char_;
while (utf32_index_ > desired_utf32_index) {
- int utf8_index = utf8_index_ - 1;
- utf8_index = GetUTF8StartPosition(text_, utf8_index);
- if (utf8_index < 0) {
+ --utf8_index_;
+ utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
+ if (utf8_index_ < 0) {
// Somehow, there wasn't a single UTF-8 lead byte at
// requested_byte_index or an earlier byte.
- cached_current_char_ = i18n_utils::kInvalidUChar32;
return false;
}
// We've found the start of a unicode char!
- uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
- int expected_length = utf8_index_ - utf8_index;
- if (uchar32 == i18n_utils::kInvalidUChar32 ||
- expected_length != i18n_utils::GetUtf8Length(uchar32)) {
- // Either unable to retrieve a valid UTF-32 character at the previous
- // position or we skipped past an invalid sequence while seeking the
- // previous start position.
- cached_current_char_ = i18n_utils::kInvalidUChar32;
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
return false;
}
- cached_current_char_ = uchar32;
- utf8_index_ = utf8_index;
utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
--utf32_index_;
}
return true;
}
-void CharacterIterator::ResetToStartIfNecessary() {
- if (utf8_index_ < 0 || utf16_index_ < 0 || utf32_index_ < 0) {
- utf8_index_ = 0;
- utf16_index_ = 0;
- utf32_index_ = 0;
- cached_current_char_ =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), 0);
- }
-}
-
} // namespace lib
} // namespace icing
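
The rewind paths above depend on GetUTF8StartPosition to back up to the nearest lead byte; its body sits outside these hunks. A plausible implementation under standard UTF-8 encoding rules, offered as an assumption rather than the file's actual code:

    #include <string_view>

    int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
      // Continuation bytes match 10xxxxxx; walk left until we exit them.
      while (current_byte_index >= 0 &&
             (static_cast<unsigned char>(text[current_byte_index]) & 0xC0) ==
                 0x80) {
        --current_byte_index;
      }
      return current_byte_index;  // -1 when no lead byte exists to the left
    }
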
diff --git a/icing/util/character-iterator.h b/icing/util/character-iterator.h
index 893718a..9df7bee 100644
--- a/icing/util/character-iterator.h
+++ b/icing/util/character-iterator.h
@@ -29,15 +29,10 @@ class CharacterIterator {
CharacterIterator(std::string_view text, int utf8_index, int utf16_index,
int utf32_index)
: text_(text),
- cached_current_char_(i18n_utils::kInvalidUChar32),
utf8_index_(utf8_index),
utf16_index_(utf16_index),
utf32_index_(utf32_index) {}
- // Returns the character that the iterator currently points to.
- // i18n_utils::kInvalidUChar32 if unable to read that character.
- UChar32 GetCurrentChar();
-
// Moves current position to desired_utf8_index.
// REQUIRES: 0 <= desired_utf8_index <= text_.length()
bool MoveToUtf8(int desired_utf8_index);
@@ -87,8 +82,6 @@ class CharacterIterator {
int utf32_index() const { return utf32_index_; }
bool operator==(const CharacterIterator& rhs) const {
- // cached_current_char_ is just that: a cached value. As such, it's not
- // considered for equality.
return text_ == rhs.text_ && utf8_index_ == rhs.utf8_index_ &&
utf16_index_ == rhs.utf16_index_ && utf32_index_ == rhs.utf32_index_;
}
@@ -99,12 +92,7 @@ class CharacterIterator {
}
private:
- // Resets the character iterator to the start of the text if any of the
- // indices are negative.
- void ResetToStartIfNecessary();
-
std::string_view text_;
- UChar32 cached_current_char_;
int utf8_index_;
int utf16_index_;
int utf32_index_;
diff --git a/icing/util/character-iterator_test.cc b/icing/util/character-iterator_test.cc
deleted file mode 100644
index 195a47b..0000000
--- a/icing/util/character-iterator_test.cc
+++ /dev/null
@@ -1,266 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/util/character-iterator.h"
-
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include "icing/testing/icu-i18n-test-utils.h"
-
-namespace icing {
-namespace lib {
-
-using ::testing::Eq;
-using ::testing::IsFalse;
-using ::testing::IsTrue;
-
-TEST(CharacterIteratorTest, BasicUtf8) {
- constexpr std::string_view kText = "¿Dónde está la biblioteca?";
- CharacterIterator iterator(kText);
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
-
- EXPECT_THAT(iterator.AdvanceToUtf8(4), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2,
- /*utf32_index=*/2)));
-
- EXPECT_THAT(iterator.AdvanceToUtf8(18), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15,
- /*utf32_index=*/15)));
-
- EXPECT_THAT(iterator.AdvanceToUtf8(28), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25,
- /*utf32_index=*/25)));
-
- EXPECT_THAT(iterator.AdvanceToUtf8(29), IsTrue());
- EXPECT_THAT(iterator.GetCurrentChar(), Eq(0));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/29, /*utf16_index=*/26,
- /*utf32_index=*/26)));
-
- EXPECT_THAT(iterator.RewindToUtf8(28), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25,
- /*utf32_index=*/25)));
-
- EXPECT_THAT(iterator.RewindToUtf8(18), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15,
- /*utf32_index=*/15)));
-
- EXPECT_THAT(iterator.RewindToUtf8(4), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2,
- /*utf32_index=*/2)));
-
- EXPECT_THAT(iterator.RewindToUtf8(0), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/0, /*utf16_index=*/0,
- /*utf32_index=*/0)));
-}
-
-TEST(CharacterIteratorTest, BasicUtf16) {
- constexpr std::string_view kText = "¿Dónde está la biblioteca?";
- CharacterIterator iterator(kText);
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
-
- EXPECT_THAT(iterator.AdvanceToUtf16(2), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2,
- /*utf32_index=*/2)));
-
- EXPECT_THAT(iterator.AdvanceToUtf16(15), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15,
- /*utf32_index=*/15)));
-
- EXPECT_THAT(iterator.AdvanceToUtf16(25), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25,
- /*utf32_index=*/25)));
-
- EXPECT_THAT(iterator.AdvanceToUtf16(26), IsTrue());
- EXPECT_THAT(iterator.GetCurrentChar(), Eq(0));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/29, /*utf16_index=*/26,
- /*utf32_index=*/26)));
-
- EXPECT_THAT(iterator.RewindToUtf16(25), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25,
- /*utf32_index=*/25)));
-
- EXPECT_THAT(iterator.RewindToUtf16(15), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15,
- /*utf32_index=*/15)));
-
- EXPECT_THAT(iterator.RewindToUtf16(2), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2,
- /*utf32_index=*/2)));
-
- EXPECT_THAT(iterator.RewindToUtf8(0), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/0, /*utf16_index=*/0,
- /*utf32_index=*/0)));
-}
-
-TEST(CharacterIteratorTest, BasicUtf32) {
- constexpr std::string_view kText = "¿Dónde está la biblioteca?";
- CharacterIterator iterator(kText);
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
-
- EXPECT_THAT(iterator.AdvanceToUtf32(2), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2,
- /*utf32_index=*/2)));
-
- EXPECT_THAT(iterator.AdvanceToUtf32(15), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15,
- /*utf32_index=*/15)));
-
- EXPECT_THAT(iterator.AdvanceToUtf32(25), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25,
- /*utf32_index=*/25)));
-
- EXPECT_THAT(iterator.AdvanceToUtf32(26), IsTrue());
- EXPECT_THAT(iterator.GetCurrentChar(), Eq(0));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/29, /*utf16_index=*/26,
- /*utf32_index=*/26)));
-
- EXPECT_THAT(iterator.RewindToUtf32(25), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25,
- /*utf32_index=*/25)));
-
- EXPECT_THAT(iterator.RewindToUtf32(15), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15,
- /*utf32_index=*/15)));
-
- EXPECT_THAT(iterator.RewindToUtf32(2), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2,
- /*utf32_index=*/2)));
-
- EXPECT_THAT(iterator.RewindToUtf32(0), IsTrue());
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
- EXPECT_THAT(iterator,
- Eq(CharacterIterator(kText, /*utf8_index=*/0, /*utf16_index=*/0,
- /*utf32_index=*/0)));
-}
-
-TEST(CharacterIteratorTest, InvalidUtf) {
- // "\255" is an invalid sequence.
- constexpr std::string_view kText = "foo \255 bar";
- CharacterIterator iterator(kText);
-
- // Try to advance to the 'b' in 'bar'. This will fail and leave us pointed at
- // the invalid sequence '\255'. GetCurrentChar() should return an invalid
- // character.
- EXPECT_THAT(iterator.AdvanceToUtf8(6), IsFalse());
- EXPECT_THAT(iterator.GetCurrentChar(), Eq(i18n_utils::kInvalidUChar32));
- CharacterIterator exp_iterator(kText, /*utf8_index=*/4, /*utf16_index=*/4,
- /*utf32_index=*/4);
- EXPECT_THAT(iterator, Eq(exp_iterator));
-
- EXPECT_THAT(iterator.AdvanceToUtf16(6), IsFalse());
- EXPECT_THAT(iterator.GetCurrentChar(), Eq(i18n_utils::kInvalidUChar32));
- EXPECT_THAT(iterator, Eq(exp_iterator));
-
- EXPECT_THAT(iterator.AdvanceToUtf32(6), IsFalse());
- EXPECT_THAT(iterator.GetCurrentChar(), Eq(i18n_utils::kInvalidUChar32));
- EXPECT_THAT(iterator, Eq(exp_iterator));
-
- // Create the iterator with it pointing at the 'b' in 'bar'.
- iterator = CharacterIterator(kText, /*utf8_index=*/6, /*utf16_index=*/6,
- /*utf32_index=*/6);
- EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
-
- // Try to advance to the last 'o' in 'foo'. This will fail and leave us
- // pointed at the ' ' before the invalid sequence '\255'.
- exp_iterator = CharacterIterator(kText, /*utf8_index=*/5, /*utf16_index=*/5,
- /*utf32_index=*/5);
- EXPECT_THAT(iterator.RewindToUtf8(2), IsFalse());
- EXPECT_THAT(iterator.GetCurrentChar(), Eq(' '));
- EXPECT_THAT(iterator, Eq(exp_iterator));
-
- EXPECT_THAT(iterator.RewindToUtf16(2), IsFalse());
- EXPECT_THAT(iterator.GetCurrentChar(), Eq(' '));
- EXPECT_THAT(iterator, Eq(exp_iterator));
-
- EXPECT_THAT(iterator.RewindToUtf32(2), IsFalse());
- EXPECT_THAT(iterator.GetCurrentChar(), Eq(' '));
- EXPECT_THAT(iterator, Eq(exp_iterator));
-}
-
-TEST(CharacterIteratorTest, MoveToUtfNegativeIndex) {
- constexpr std::string_view kText = "¿Dónde está la biblioteca?";
-
- CharacterIterator iterator_utf8(kText, /*utf8_index=*/-1, /*utf16_index=*/0,
- /*utf32_index=*/0);
- // We should be able to successfully move when the index is negative.
- EXPECT_THAT(iterator_utf8.MoveToUtf8(0), IsTrue());
- // The character cache should be reset and contain the first character when
- // resetting to index 0.
- EXPECT_THAT(UCharToString(iterator_utf8.GetCurrentChar()), Eq("¿"));
- EXPECT_THAT(iterator_utf8.utf8_index(), Eq(0));
- EXPECT_THAT(iterator_utf8.utf16_index(), Eq(0));
- EXPECT_THAT(iterator_utf8.utf32_index(), Eq(0));
-
- CharacterIterator iterator_utf16(kText, /*utf8_index=*/0, /*utf16_index=*/-1,
- /*utf32_index=*/0);
- EXPECT_THAT(iterator_utf16.MoveToUtf16(1), IsTrue());
- EXPECT_THAT(iterator_utf16.GetCurrentChar(), Eq('D'));
- EXPECT_THAT(iterator_utf16.utf8_index(), Eq(2));
- EXPECT_THAT(iterator_utf16.utf16_index(), Eq(1));
- EXPECT_THAT(iterator_utf16.utf32_index(), Eq(1));
-
- CharacterIterator iterator_utf32(kText, /*utf8_index=*/0, /*utf16_index=*/0,
- /*utf32_index=*/-1);
- EXPECT_THAT(iterator_utf32.MoveToUtf32(2), IsTrue());
- EXPECT_THAT(UCharToString(iterator_utf32.GetCurrentChar()), Eq("ó"));
- EXPECT_THAT(iterator_utf32.utf8_index(), Eq(3));
- EXPECT_THAT(iterator_utf32.utf16_index(), Eq(2));
- EXPECT_THAT(iterator_utf32.utf32_index(), Eq(2));
-}
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/util/document-validator_test.cc b/icing/util/document-validator_test.cc
index 45c23e0..cb013d7 100644
--- a/icing/util/document-validator_test.cc
+++ b/icing/util/document-validator_test.cc
@@ -46,15 +46,15 @@ constexpr char kPropertyEmails[] = "emails";
constexpr char kDefaultNamespace[] = "icing";
constexpr char kDefaultString[] = "This is a string.";
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
- PropertyConfigProto::Cardinality::REQUIRED;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
- PropertyConfigProto::Cardinality::REPEATED;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto_Cardinality_Code_OPTIONAL;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED =
+ PropertyConfigProto_Cardinality_Code_REQUIRED;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED =
+ PropertyConfigProto_Cardinality_Code_REPEATED;
-constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
- PropertyConfigProto::DataType::STRING;
+constexpr PropertyConfigProto_DataType_Code TYPE_STRING =
+ PropertyConfigProto_DataType_Code_STRING;
class DocumentValidatorTest : public ::testing::Test {
protected:
@@ -93,11 +93,9 @@ class DocumentValidatorTest : public ::testing::Test {
.SetCardinality(CARDINALITY_REPEATED)))
.Build();
- schema_dir_ = GetTestTempDir() + "/schema_store";
- ASSERT_TRUE(filesystem_.CreateDirectory(schema_dir_.c_str()));
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
- SchemaStore::Create(&filesystem_, schema_dir_, &fake_clock_));
+ SchemaStore::Create(&filesystem_, GetTestTempDir(), &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
document_validator_ =
@@ -124,7 +122,6 @@ class DocumentValidatorTest : public ::testing::Test {
SimpleEmailBuilder().Build());
}
- std::string schema_dir_;
std::unique_ptr<DocumentValidator> document_validator_;
std::unique_ptr<SchemaStore> schema_store_;
Filesystem filesystem_;
diff --git a/icing/util/i18n-utils.cc b/icing/util/i18n-utils.cc
index ec327ad..cd0a227 100644
--- a/icing/util/i18n-utils.cc
+++ b/icing/util/i18n-utils.cc
@@ -116,8 +116,6 @@ bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); }
bool IsAscii(UChar32 c) { return U8_LENGTH(c) == 1; }
-bool IsAlphaNumeric(UChar32 c) { return u_isalnum(c); }
-
int GetUtf8Length(UChar32 c) { return U8_LENGTH(c); }
int GetUtf16Length(UChar32 c) { return U16_LENGTH(c); }
diff --git a/icing/util/i18n-utils.h b/icing/util/i18n-utils.h
index 491df6b..82ae828 100644
--- a/icing/util/i18n-utils.h
+++ b/icing/util/i18n-utils.h
@@ -67,9 +67,6 @@ bool IsAscii(char c);
// Checks if the Unicode char is within ASCII range.
bool IsAscii(UChar32 c);
-// Checks if the Unicode char is alphanumeric.
-bool IsAlphaNumeric(UChar32 c);
-
// Returns how many code units (char) are used for the UTF-8 encoding of this
// Unicode character. Returns 0 if not valid.
int GetUtf8Length(UChar32 c);
diff --git a/java/Android.bp b/java/Android.bp
index 6133230..ef417ba 100644
--- a/java/Android.bp
+++ b/java/Android.bp
@@ -32,6 +32,5 @@ java_library {
"androidx.annotation_annotation",
],
sdk_version: "current",
- min_sdk_version: "Tiramisu",
apex_available: ["com.android.appsearch"],
}
diff --git a/java/src/com/google/android/icing/IcingSearchEngine.java b/java/src/com/google/android/icing/IcingSearchEngine.java
index 95e0c84..1f5fb51 100644
--- a/java/src/com/google/android/icing/IcingSearchEngine.java
+++ b/java/src/com/google/android/icing/IcingSearchEngine.java
@@ -43,8 +43,6 @@ import com.google.android.icing.proto.SearchSpecProto;
import com.google.android.icing.proto.SetSchemaResultProto;
import com.google.android.icing.proto.StatusProto;
import com.google.android.icing.proto.StorageInfoResultProto;
-import com.google.android.icing.proto.SuggestionResponse;
-import com.google.android.icing.proto.SuggestionSpecProto;
import com.google.android.icing.proto.UsageReport;
import com.google.protobuf.ExtensionRegistryLite;
import com.google.protobuf.InvalidProtocolBufferException;
@@ -372,26 +370,6 @@ public class IcingSearchEngine implements Closeable {
}
@NonNull
- public SuggestionResponse searchSuggestions(@NonNull SuggestionSpecProto suggestionSpec) {
- byte[] suggestionResponseBytes = nativeSearchSuggestions(this, suggestionSpec.toByteArray());
- if (suggestionResponseBytes == null) {
- Log.e(TAG, "Received null suggestionResponseBytes from native.");
- return SuggestionResponse.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
-
- try {
- return SuggestionResponse.parseFrom(suggestionResponseBytes, EXTENSION_REGISTRY_LITE);
- } catch (InvalidProtocolBufferException e) {
- Log.e(TAG, "Error parsing suggestionResponseBytes.", e);
- return SuggestionResponse.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
- }
-
- @NonNull
public DeleteByNamespaceResultProto deleteByNamespace(@NonNull String namespace) {
throwIfClosed();
@@ -626,7 +604,4 @@ public class IcingSearchEngine implements Closeable {
private static native byte[] nativeGetStorageInfo(IcingSearchEngine instance);
private static native byte[] nativeReset(IcingSearchEngine instance);
-
- private static native byte[] nativeSearchSuggestions(
- IcingSearchEngine instance, byte[] suggestionSpecBytes);
}
diff --git a/java/tests/instrumentation/src/androidx/appsearch/smoketest/AndroidXSmokeTest.java b/java/tests/instrumentation/src/androidx/appsearch/smoketest/AppSearchSmokeTest.java
index 98b1b25..8fae104 100644
--- a/java/tests/instrumentation/src/androidx/appsearch/smoketest/AndroidXSmokeTest.java
+++ b/java/tests/instrumentation/src/androidx/appsearch/smoketest/AppSearchSmokeTest.java
@@ -24,7 +24,6 @@ import androidx.appsearch.app.AppSearchSchema;
import androidx.appsearch.app.AppSearchSchema.PropertyConfig;
import androidx.appsearch.app.AppSearchSchema.StringPropertyConfig;
import androidx.appsearch.app.AppSearchSession;
-import androidx.appsearch.app.GenericDocument;
import androidx.appsearch.app.PutDocumentsRequest;
import androidx.appsearch.app.SearchResult;
import androidx.appsearch.app.SearchResults;
@@ -33,16 +32,15 @@ import androidx.appsearch.app.SetSchemaRequest;
import androidx.appsearch.localstorage.LocalStorage;
import androidx.appsearch.localstorage.LocalStorage.SearchContext;
import androidx.test.core.app.ApplicationProvider;
-import androidx.test.ext.junit.runners.AndroidJUnit4;
+import androidx.test.filters.SmallTest;
import org.junit.Before;
import org.junit.Test;
-import org.junit.runner.RunWith;
import java.util.List;
-@RunWith(AndroidJUnit4.class)
-public class AndroidXSmokeTest {
+@SmallTest
+public class AppSearchSmokeTest {
private AppSearchSession appSearch;
@Before
@@ -50,8 +48,7 @@ public class AndroidXSmokeTest {
appSearch =
LocalStorage.createSearchSession(
new SearchContext.Builder(
- ApplicationProvider.getApplicationContext(),
- "database")
+ ApplicationProvider.getApplicationContext())
.build())
.get();
// Remove all data before test
@@ -82,7 +79,7 @@ public class AndroidXSmokeTest {
.build())
.get();
- TestDocument input = new TestDocument("namespace", "id1", "avocado");
+ TestDocument input = new TestDocument("uri1", "avocado");
appSearch
.put(new PutDocumentsRequest.Builder().addDocuments(input).build())
.get()
@@ -98,11 +95,10 @@ public class AndroidXSmokeTest {
SearchResult result = page.get(0);
assertThat(results.getNextPage().get()).isEmpty();
- GenericDocument genericOutput = result.getGenericDocument();
- assertEquals("id1", genericOutput.getId());
- assertEquals("avocado", genericOutput.getPropertyString("body"));
- TestDocument output = genericOutput.toDocumentClass(TestDocument.class);
- assertEquals("id1", output.getId());
+ assertEquals("uri1", result.getDocument().getUri());
+ assertEquals("avocado", result.getDocument().getPropertyString("body"));
+ TestDocument output = result.getDocument().toDocumentClass(TestDocument.class);
+ assertEquals("uri1", output.getUri());
assertEquals("avocado", output.getBody());
}
}
diff --git a/java/tests/instrumentation/src/androidx/appsearch/smoketest/TestDocument.java b/java/tests/instrumentation/src/androidx/appsearch/smoketest/TestDocument.java
index ebf32e4..089ff55 100644
--- a/java/tests/instrumentation/src/androidx/appsearch/smoketest/TestDocument.java
+++ b/java/tests/instrumentation/src/androidx/appsearch/smoketest/TestDocument.java
@@ -21,28 +21,21 @@ import androidx.appsearch.app.AppSearchSchema.StringPropertyConfig;
@Document
public class TestDocument {
- @Document.Namespace private final String mNamespace;
+ @Document.Uri private final String uri;
- @Document.Id private final String mId;
+ @Document.Property(indexingType = StringPropertyConfig.INDEXING_TYPE_PREFIXES)
+ private final String body;
- @Document.StringProperty(indexingType = StringPropertyConfig.INDEXING_TYPE_PREFIXES)
- private final String mBody;
-
- TestDocument(String namespace, String id, String body) {
- mNamespace = namespace;
- mId = id;
- mBody = body;
- }
-
- public String getNamespace() {
- return mNamespace;
+ TestDocument(String uri, String body) {
+ this.uri = uri;
+ this.body = body;
}
- public String getId() {
- return mId;
+ public String getUri() {
+ return uri;
}
public String getBody() {
- return mBody;
+ return body;
}
}
diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
index a46814c..64f98f6 100644
--- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
+++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
@@ -51,11 +51,7 @@ import com.google.android.icing.proto.StatusProto;
import com.google.android.icing.proto.StorageInfoResultProto;
import com.google.android.icing.proto.StringIndexingConfig;
import com.google.android.icing.proto.StringIndexingConfig.TokenizerType;
-import com.google.android.icing.proto.SuggestionResponse;
-import com.google.android.icing.proto.SuggestionSpecProto;
-import com.google.android.icing.proto.SuggestionSpecProto.SuggestionScoringSpecProto;
import com.google.android.icing.proto.TermMatchType;
-import com.google.android.icing.proto.TermMatchType.Code;
import com.google.android.icing.proto.UsageReport;
import com.google.android.icing.IcingSearchEngine;
import java.io.File;
@@ -63,6 +59,7 @@ import java.util.HashMap;
import java.util.Map;
import org.junit.After;
import org.junit.Before;
+import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
@@ -493,6 +490,7 @@ public final class IcingSearchEngineTest {
}
@Test
+ @Ignore("b/190845688")
public void testCJKTSnippets() throws Exception {
assertStatusOk(icingSearchEngine.initialize().getStatus());
@@ -500,13 +498,12 @@ public final class IcingSearchEngineTest {
assertStatusOk(
icingSearchEngine.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false).getStatus());
- // String: "天是蓝的"
- // ^ ^^ ^
- // UTF16 idx: 0 1 2 3
- // Breaks into segments: "天", "是", "蓝", "的"
- // "The sky is blue"
- String chinese = "天是蓝的";
- assertThat(chinese.length()).isEqualTo(4);
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // UTF16 idx: 0 1 3 5 6
+ // Breaks into segments: "我", "每天", "走路", "去", "上班"
+ String chinese = "我每天走路去上班。";
+ assertThat(chinese.length()).isEqualTo(9);
DocumentProto emailDocument1 =
createEmailDocument("namespace", "uri1").toBuilder()
.addProperties(PropertyProto.newBuilder().setName("subject").addStringValues(chinese))
@@ -516,7 +513,7 @@ public final class IcingSearchEngineTest {
// Search and request snippet matching but no windowing.
SearchSpecProto searchSpec =
SearchSpecProto.newBuilder()
- .setQuery("是")
+ .setQuery("每")
.setTermMatchType(TermMatchType.Code.PREFIX)
.build();
ResultSpecProto resultSpecProto =
@@ -555,9 +552,9 @@ public final class IcingSearchEngineTest {
int matchStart = matchProto.getExactMatchUtf16Position();
int matchEnd = matchStart + matchProto.getExactMatchUtf16Length();
assertThat(matchStart).isEqualTo(1);
- assertThat(matchEnd).isEqualTo(2);
+ assertThat(matchEnd).isEqualTo(3);
String match = content.substring(matchStart, matchEnd);
- assertThat(match).isEqualTo("是");
+ assertThat(match).isEqualTo("每天");
}
@Test
@@ -627,47 +624,6 @@ public final class IcingSearchEngineTest {
assertThat(match).isEqualTo("𐀂𐀃");
}
- @Test
- public void testSearchSuggestions() {
- assertStatusOk(icingSearchEngine.initialize().getStatus());
-
- SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
- SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
- assertThat(
- icingSearchEngine
- .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
- .getStatus()
- .getCode())
- .isEqualTo(StatusProto.Code.OK);
-
- DocumentProto emailDocument1 =
- createEmailDocument("namespace", "uri1").toBuilder()
- .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("fo"))
- .build();
- DocumentProto emailDocument2 =
- createEmailDocument("namespace", "uri2").toBuilder()
- .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("foo"))
- .build();
- assertStatusOk(icingSearchEngine.put(emailDocument1).getStatus());
- assertStatusOk(icingSearchEngine.put(emailDocument2).getStatus());
-
- SuggestionSpecProto suggestionSpec =
- SuggestionSpecProto.newBuilder()
- .setPrefix("f")
- .setNumToReturn(10)
- .setScoringSpec(
- SuggestionScoringSpecProto.newBuilder()
- .setScoringMatchType(Code.EXACT_ONLY)
- .build())
- .build();
-
- SuggestionResponse response = icingSearchEngine.searchSuggestions(suggestionSpec);
- assertStatusOk(response.getStatus());
- assertThat(response.getSuggestionsList()).hasSize(2);
- assertThat(response.getSuggestions(0).getQuery()).isEqualTo("foo");
- assertThat(response.getSuggestions(1).getQuery()).isEqualTo("fo");
- }
-
private static void assertStatusOk(StatusProto status) {
assertWithMessage(status.getMessage()).that(status.getCode()).isEqualTo(StatusProto.Code.OK);
}
diff --git a/proto/Android.bp b/proto/Android.bp
index cda0ec2..4fb0c18 100644
--- a/proto/Android.bp
+++ b/proto/Android.bp
@@ -43,5 +43,4 @@ cc_library_static {
export_proto_headers: true,
},
srcs: ["icing/**/*.proto"],
- min_sdk_version: "Tiramisu",
}
diff --git a/proto/icing/proto/debug.proto b/proto/icing/proto/debug.proto
deleted file mode 100644
index 504ae43..0000000
--- a/proto/icing/proto/debug.proto
+++ /dev/null
@@ -1,127 +0,0 @@
-// Copyright 2022 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-syntax = "proto2";
-
-package icing.lib;
-
-import "icing/proto/schema.proto";
-import "icing/proto/status.proto";
-import "icing/proto/storage.proto";
-
-option java_package = "com.google.android.icing.proto";
-option java_multiple_files = true;
-option objc_class_prefix = "ICNG";
-
-// Next tag: 4
-message IndexDebugInfoProto {
- // Storage information of the index.
- optional IndexStorageInfoProto index_storage_info = 1;
-
- message MainIndexDebugInfoProto {
- // Information about the main lexicon.
- // TODO(b/222349894) Convert the string output to a protocol buffer instead.
- optional string lexicon_info = 1;
-
- // Last added document id.
- optional uint32 last_added_document_id = 2;
-
- // If verbosity > 0, return information about the posting list storage.
- // TODO(b/222349894) Convert the string output to a protocol buffer instead.
- optional string flash_index_storage_info = 3;
- }
- optional MainIndexDebugInfoProto main_index_info = 2;
-
- message LiteIndexDebugInfoProto {
- // Current number of hits.
- optional uint32 curr_size = 1;
-
- // The maximum possible number of hits.
- optional uint32 hit_buffer_size = 2;
-
- // Last added document id.
- optional uint32 last_added_document_id = 3;
-
- // The first position in the hit buffer that is not sorted yet,
- // or curr_size if all hits are sorted.
- optional uint32 searchable_end = 4;
-
- // The most recent checksum of the lite index, by calling
- // LiteIndex::ComputeChecksum().
- optional uint32 index_crc = 5;
-
- // Information about the lite lexicon.
- // TODO(b/222349894) Convert the string output to a protocol buffer instead.
- optional string lexicon_info = 6;
- }
- optional LiteIndexDebugInfoProto lite_index_info = 3;
-}
-
-// Next tag: 4
-message DocumentDebugInfoProto {
- // Storage information of the document store.
- optional DocumentStorageInfoProto document_storage_info = 1;
-
- // The most recent checksum of the document store, by calling
- // DocumentStore::ComputeChecksum().
- optional uint32 crc = 2;
-
- message CorpusInfo {
- optional string namespace = 1;
- optional string schema = 2;
- optional uint32 total_documents = 3;
- optional uint32 total_token = 4;
- }
-
- // If verbosity > 0, return the total number of documents and tokens in each
- // (namespace, schema type) pair.
- // Note that deleted and expired documents are skipped in the output.
- repeated CorpusInfo corpus_info = 3;
-}
-
-// Next tag: 3
-message SchemaDebugInfoProto {
- // Copy of the SchemaProto if it has been set in the schema store.
- // Modifying this does not affect the Schema that IcingSearchEngine holds.
- optional SchemaProto schema = 1;
-
- // The most recent checksum of the schema store, by calling
- // SchemaStore::ComputeChecksum().
- optional uint32 crc = 2;
-}
-
-// Next tag: 4
-message DebugInfoProto {
- // Debug information of the index.
- optional IndexDebugInfoProto index_info = 1;
-
- // Debug information of the document store.
- optional DocumentDebugInfoProto document_info = 2;
-
- // Debug information of the schema store.
- optional SchemaDebugInfoProto schema_info = 3;
-}
-
-// Next tag: 3
-message DebugInfoResultProto {
- // Status code can be one of:
- // OK
- // FAILED_PRECONDITION
- //
- // See status.proto for more details.
- optional StatusProto status = 1;
-
- // Debug information for Icing.
- optional DebugInfoProto debug_info = 2;
-}
diff --git a/proto/icing/proto/document.proto b/proto/icing/proto/document.proto
index 1a501e7..9a4e5b9 100644
--- a/proto/icing/proto/document.proto
+++ b/proto/icing/proto/document.proto
@@ -209,7 +209,7 @@ message DeleteBySchemaTypeResultProto {
}
// Result of a call to IcingSearchEngine.DeleteByQuery
-// Next tag: 5
+// Next tag: 3
message DeleteByQueryResultProto {
// Status code can be one of:
// OK
@@ -224,20 +224,5 @@ message DeleteByQueryResultProto {
optional StatusProto status = 1;
// Stats for delete execution performance.
- optional DeleteByQueryStatsProto delete_by_query_stats = 3;
-
- // Used by DeleteByQueryResultProto to return information about deleted
- // documents.
- message DocumentGroupInfo {
- optional string namespace = 1;
- optional string schema = 2;
- repeated string uris = 3;
- }
-
- // Additional return message that shows the uris of the deleted documents, if
- // users set return_deleted_document_info to true.
- // The result is grouped by the corresponding namespace and type.
- repeated DocumentGroupInfo deleted_documents = 4;
-
- reserved 2;
+ optional DeleteStatsProto delete_stats = 2;
}
diff --git a/proto/icing/proto/initialize.proto b/proto/icing/proto/initialize.proto
index 7fe1e6f..ab2556d 100644
--- a/proto/icing/proto/initialize.proto
+++ b/proto/icing/proto/initialize.proto
@@ -30,6 +30,19 @@ message IcingSearchEngineOptions {
// the index saved by the last instance.
optional string base_dir = 1;
+ // The maximum number of tokens allowed per document. If a document exceeds
+ // this number of tokens, then only the first max_tokens_per_doc tokens
+ // will be indexed.
+ //
+ // Clients may use this value to prevent a select few documents from
+ // exhausting limits in the index that are shared between all documents
+ // (i.e. the max allowed index size).
+ //
+ // Valid values: [1, INT_MAX]. The current default is 1/5 of the default
+ // max_document_size.
+ // Optional.
+ optional int32 max_tokens_per_doc = 2 [default = 13107];
+
// The maximum allowable token length. All tokens in excess of this size
// will be truncated to max_token_length before being indexed.
//
@@ -57,8 +70,6 @@ message IcingSearchEngineOptions {
// Valid values: [1, INT_MAX]
// Optional.
optional int32 index_merge_size = 4 [default = 1048576]; // 1 MiB
-
- reserved 2;
}
// Result of a call to IcingSearchEngine.Initialize
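The hunk above restores the max_tokens_per_doc option. A minimal sketch of wiring it up through the generated Java proto builders follows, assuming the IcingSearchEngine(IcingSearchEngineOptions) constructor used elsewhere in this codebase; the base directory value is a placeholder and the setter names are the standard protobuf-Java derivations of the field names in this file:

    import com.google.android.icing.IcingSearchEngine;
    import com.google.android.icing.proto.IcingSearchEngineOptions;

    final class OptionsSketch {
      static IcingSearchEngine create(String baseDir) {
        IcingSearchEngineOptions options =
            IcingSearchEngineOptions.newBuilder()
                .setBaseDir(baseDir)        // placeholder path, e.g. an app files dir
                .setMaxTokensPerDoc(13107)  // the restored field; default shown above
                .setIndexMergeSize(1 << 20) // 1 MiB, matching the default above
                .build();
        return new IcingSearchEngine(options);
      }
    }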
diff --git a/proto/icing/proto/logging.proto b/proto/icing/proto/logging.proto
index 0a7c4a6..29f7f80 100644
--- a/proto/icing/proto/logging.proto
+++ b/proto/icing/proto/logging.proto
@@ -23,7 +23,7 @@ option java_multiple_files = true;
option objc_class_prefix = "ICNG";
// Stats of the top-level function IcingSearchEngine::Initialize().
-// Next tag: 12
+// Next tag: 11
message InitializeStatsProto {
// Overall time used for the function call.
optional int32 latency_ms = 1;
@@ -46,9 +46,6 @@ message InitializeStatsProto {
// Random I/O errors.
IO_ERROR = 4;
-
- // The document log is using legacy format.
- LEGACY_DOCUMENT_LOG_FORMAT = 5;
}
// Possible recovery causes for document store:
@@ -95,10 +92,6 @@ message InitializeStatsProto {
// Number of schema types currently in schema store.
optional int32 num_schema_types = 10;
-
- // Number of consecutive initialization failures that immediately preceded
- // this initialization.
- optional int32 num_previous_init_failures = 11;
}
// Stats of the top-level function IcingSearchEngine::Put().
@@ -121,10 +114,12 @@ message PutDocumentStatsProto {
optional int32 document_size = 5;
message TokenizationStats {
+ // Whether the number of tokens to be indexed exceeded the max number of
+ // tokens per document.
+ optional bool exceeded_max_token_num = 2;
+
// Number of tokens added to the index.
optional int32 num_tokens_indexed = 1;
-
- reserved 2;
}
optional TokenizationStats tokenization_stats = 6;
}
@@ -186,7 +181,8 @@ message QueryStatsProto {
}
// Stats of the top-level functions IcingSearchEngine::Delete,
-// IcingSearchEngine::DeleteByNamespace, IcingSearchEngine::DeleteBySchemaType.
+// IcingSearchEngine::DeleteByNamespace, IcingSearchEngine::DeleteBySchemaType,
+// IcingSearchEngine::DeleteByQuery.
// Next tag: 4
message DeleteStatsProto {
// Overall time used for the function call.
@@ -200,10 +196,8 @@ message DeleteStatsProto {
// Delete one document.
SINGLE = 1;
- // Delete by query. This value is deprecated.
- // IcingSearchEngine::DeleteByQuery will return a DeleteByQueryStatsProto
- // rather than a DeleteStatsProto.
- DEPRECATED_QUERY = 2 [deprecated = true];
+ // Delete by query.
+ QUERY = 2;
// Delete by namespace.
NAMESPACE = 3;
@@ -217,32 +211,3 @@ message DeleteStatsProto {
// Number of documents deleted by this call.
optional int32 num_documents_deleted = 3;
}
-
-// Stats of the top-level functions IcingSearchEngine::DeleteByQuery.
-// Next tag: 9
-message DeleteByQueryStatsProto {
- // Overall time used for the function call.
- optional int32 latency_ms = 1;
-
- // Number of documents deleted by this call.
- optional int32 num_documents_deleted = 2;
-
- // The UTF-8 length of the query string
- optional int32 query_length = 3;
-
- // Number of terms in the query string.
- optional int32 num_terms = 4;
-
- // Number of namespaces filtered.
- optional int32 num_namespaces_filtered = 5;
-
- // Number of schema types filtered.
- optional int32 num_schema_types_filtered = 6;
-
- // Time used to parse the query, including 2 parts: tokenizing and
- // transforming tokens into an iterator tree.
- optional int32 parse_query_latency_ms = 7;
-
- // Time used to delete each document.
- optional int32 document_removal_latency_ms = 8;
-}
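With DeleteByQueryStatsProto removed, DeleteByQuery reports through the shared DeleteStatsProto again, carried on DeleteByQueryResultProto.delete_stats per the document.proto hunk earlier in this change. A sketch of reading it, assuming DeleteStatsProto keeps its usual delete_type field (that field is not visible in this hunk):

    import com.google.android.icing.proto.DeleteByQueryResultProto;
    import com.google.android.icing.proto.DeleteStatsProto;

    final class DeleteStatsSketch {
      static void logDeleteByQuery(DeleteByQueryResultProto result) {
        DeleteStatsProto stats = result.getDeleteStats();
        // QUERY replaces the DEPRECATED_QUERY value removed above.
        if (stats.getDeleteType() == DeleteStatsProto.DeleteType.Code.QUERY) {
          System.out.println("Deleted " + stats.getNumDocumentsDeleted()
              + " documents in " + stats.getLatencyMs() + " ms");
        }
      }
    }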
diff --git a/proto/icing/proto/schema.proto b/proto/icing/proto/schema.proto
index ffb6f2c..4188a8c 100644
--- a/proto/icing/proto/schema.proto
+++ b/proto/icing/proto/schema.proto
@@ -91,14 +91,6 @@ message StringIndexingConfig {
// Tokenization for plain text.
PLAIN = 1;
-
- // Tokenizes the text verbatim. This means no normalization or segmentation
- // is applied to string values that are tokenized using this type.
- // Therefore, the output token is equivalent to the raw string text. For
- // example, "Hello, world!" would be tokenized as "Hello, world!",
- // preserving punctuation and capitalization, and not splitting at the
- // space.
- VERBATIM = 2;
}
}
optional TokenizerType.Code tokenizer_type = 2;
@@ -205,7 +197,7 @@ message SchemaProto {
}
// Result of a call to IcingSearchEngine.SetSchema
-// Next tag: 8
+// Next tag: 4
message SetSchemaResultProto {
// Status code can be one of:
// OK
@@ -229,21 +221,6 @@ message SetSchemaResultProto {
// documents that fail validation against the new schema types would also be
// deleted.
repeated string incompatible_schema_types = 3;
-
- // Schema types that did not exist in the previous schema and were added with
- // the new schema type.
- repeated string new_schema_types = 4;
-
- // Schema types that were changed in a way that was backwards compatible and
- // didn't invalidate the index.
- repeated string fully_compatible_changed_schema_types = 5;
-
- // Schema types that were changed in a way that was backwards compatible, but
- // invalidated the index.
- repeated string index_incompatible_changed_schema_types = 6;
-
- // Overall time used for the function call.
- optional int32 latency_ms = 7;
}
// Result of a call to IcingSearchEngine.GetSchema
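After the SetSchemaResultProto trim above, only the deleted and incompatible types are reported. A sketch of inspecting the surviving fields, using the setSchema(schema, ignoreErrorsAndDeleteDocuments) signature exercised by the Java tests in this change (the getter follows from the remaining field name):

    import com.google.android.icing.IcingSearchEngine;
    import com.google.android.icing.proto.SchemaProto;
    import com.google.android.icing.proto.SetSchemaResultProto;

    final class SetSchemaSketch {
      static void logIncompatibleTypes(IcingSearchEngine icing, SchemaProto schema) {
        SetSchemaResultProto result =
            icing.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false);
        for (String type : result.getIncompatibleSchemaTypesList()) {
          System.out.println("Incompatible schema type: " + type);
        }
      }
    }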
diff --git a/proto/icing/proto/scoring.proto b/proto/icing/proto/scoring.proto
index 71c943e..6186fde 100644
--- a/proto/icing/proto/scoring.proto
+++ b/proto/icing/proto/scoring.proto
@@ -23,7 +23,7 @@ option objc_class_prefix = "ICNG";
// Encapsulates the configurations on how Icing should score and rank the search
// results.
// TODO(b/170347684): Change all timestamps to seconds.
-// Next tag: 4
+// Next tag: 3
message ScoringSpecProto {
// OPTIONAL: Indicates how the search results will be ranked.
message RankingStrategy {
@@ -83,42 +83,4 @@ message ScoringSpecProto {
}
}
optional Order.Code order_by = 2;
-
- // OPTIONAL: Specifies property weights for RELEVANCE_SCORE scoring strategy.
- // Property weights are used for promoting or demoting query term matches in a
- // document property. When property weights are provided, the term frequency
- // is multiplied by the normalized property weight when computing the
- // normalized term frequency component of BM25F. To prefer query term matches
- // in the "subject" property over the "body" property of "Email" documents,
- // set a higher property weight value for "subject" than "body". By default,
- // all properties that are not specified are given a raw, pre-normalized
- // weight of 1.0 when scoring.
- repeated TypePropertyWeights type_property_weights = 3;
-}
-
-// Next tag: 3
-message TypePropertyWeights {
- // Schema type to apply property weights to.
- optional string schema_type = 1;
-
- // Property weights to apply to the schema type.
- repeated PropertyWeight property_weights = 2;
-}
-
-// Next tag: 3
-message PropertyWeight {
- // Property path to assign property weight to. Property paths must be composed
- // only of property names and property separators (the '.' character).
- // For example, if an "Email" schema type has string property "subject" and
- // document property "sender", which has string property "name", the property
- // path for the email's subject would just be "subject" and the property path
- // for the sender's name would be "sender.name". If an invalid path is
- // specified, the property weight is discarded.
- optional string path = 1;
-
- // Property weight, valid values are positive and zero. Setting a zero
- // property weight will remove scoring contribution for a query term match in
- // the property. Negative weights are invalid and will result in an error.
- // By default, a property is given a raw, pre-normalized weight of 1.0.
- optional double weight = 2;
}
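For reference, the property-weight API removed above was used to boost matches in one property over another under the RELEVANCE_SCORE strategy. A sketch of how it composed before this change, with builder names derived mechanically from the removed fields and the 2.0/1.0 weights chosen arbitrarily:

    import com.google.android.icing.proto.PropertyWeight;
    import com.google.android.icing.proto.ScoringSpecProto;
    import com.google.android.icing.proto.TypePropertyWeights;

    final class PropertyWeightsSketch {
      // Prefer "subject" matches over "body" matches on "Email" documents,
      // per the removed comment above.
      static ScoringSpecProto.Builder addEmailWeights(ScoringSpecProto.Builder spec) {
        return spec.addTypePropertyWeights(
            TypePropertyWeights.newBuilder()
                .setSchemaType("Email")
                .addPropertyWeights(
                    PropertyWeight.newBuilder().setPath("subject").setWeight(2.0))
                .addPropertyWeights(
                    PropertyWeight.newBuilder().setPath("body").setWeight(1.0)));
      }
    }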
diff --git a/proto/icing/proto/search.proto b/proto/icing/proto/search.proto
index f005c76..66fdbe6 100644
--- a/proto/icing/proto/search.proto
+++ b/proto/icing/proto/search.proto
@@ -85,16 +85,16 @@ message ResultSpecProto {
// have snippet information provided. If set to 0, snippeting is disabled.
optional int32 num_matches_per_property = 2;
- // How large of a window to provide. Windows start at
- // max_window_utf32_length / 2 bytes before the middle of the matching token
- // and end at max_window_utf32_length / 2 bytes after the middle of the
- // matching token. Windowing respects token boundaries. Therefore, the
- // returned window may be smaller than requested. Setting
- // max_window_utf32_length to 0 will disable windowing information. If
- // matches enabled is also set to false, then snippeting is disabled. Ex.
- // max_window_utf32_length = 16. "foo bar baz bat rat" with a query of "baz"
+ // How large of a window to provide. Windows start at max_window_bytes / 2
+ // bytes before the middle of the matching token and end at max_window_bytes
+ // / 2 bytes after the middle of the matching token. Windowing respects
+ // token boundaries.
+ // Therefore, the returned window may be smaller than requested. Setting
+ // max_window_bytes to 0 will disable windowing information. If matches
+ // enabled is also set to false, then snippeting is disabled.
+ // Ex. max_window_bytes = 16. "foo bar baz bat rat" with a query of "baz"
// will return a window of "bar baz bat" which is only 11 bytes long.
- optional int32 max_window_utf32_length = 3;
+ optional int32 max_window_bytes = 3;
}
optional SnippetSpecProto snippet_spec = 3;
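The renamed windowing comment above is easiest to follow with its own example: for "foo bar baz bat rat" and query "baz", max_window_bytes = 16 centers a window on "baz" and then shrinks it to token boundaries, returning "bar baz bat" (11 bytes). A builder sketch, assuming SnippetSpecProto is nested in ResultSpecProto as the field above suggests:

    import com.google.android.icing.proto.ResultSpecProto;

    final class SnippetSpecSketch {
      static ResultSpecProto windowedSnippets() {
        return ResultSpecProto.newBuilder()
            .setSnippetSpec(
                ResultSpecProto.SnippetSpecProto.newBuilder()
                    .setNumMatchesPerProperty(1)
                    // 16-byte window around "baz" in "foo bar baz bat rat"
                    // -> "bar baz bat" (11 bytes; token boundaries respected).
                    .setMaxWindowBytes(16))
            .build();
      }
    }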
@@ -136,57 +136,27 @@ message ResultSpecProto {
}
// The representation of a single match within a DocumentProto property.
-//
-// Example : A document whose content is "Necesito comprar comida mañana." and a
-// query for "mana" with window=15
-// Next tag: 12
+// Next tag: 10
message SnippetMatchProto {
// The index of the byte in the string at which the match begins and the
// length in bytes of the match.
- //
- // For the example above, the values of these fields would be
- // exact_match_byte_position=24, exact_match_byte_length=7 "mañana"
optional int32 exact_match_byte_position = 2;
optional int32 exact_match_byte_length = 3;
- // The length in bytes of the subterm that matches the query. The beginning of
- // the submatch is the same as exact_match_byte_position.
- //
- // For the example above, the value of this field would be 5. With
- // exact_match_byte_position=24 above, it would produce the substring "maña"
- optional int32 submatch_byte_length = 10;
-
// The index of the UTF-16 code unit in the string at which the match begins
// and the length in UTF-16 code units of the match. This is for use with
// UTF-16 encoded strings like Java.lang.String.
- //
- // For the example above, the values of these fields would be
- // exact_match_utf16_position=24, exact_match_utf16_length=6 "mañana"
optional int32 exact_match_utf16_position = 6;
optional int32 exact_match_utf16_length = 7;
- // The length in UTF-16 code units of the subterm that matches the query. The
- // beginning of the submatch is the same as exact_match_utf16_position. This
- // is for use with UTF-16 encoded strings like Java.lang.String.
- //
- // For the example above, the value of this field would be 4. With
- // exact_match_utf16_position=24 above, it would produce the substring "maña"
- optional int32 submatch_utf16_length = 11;
-
// The index of the byte in the string at which the suggested snippet window
// begins and the length in bytes of the window.
- //
- // For the example above, the values of these fields would be
- // window_byte_position=17, window_byte_length=15 "comida mañana."
optional int32 window_byte_position = 4;
optional int32 window_byte_length = 5;
// The index of the UTF-16 code unit in the string at which the suggested
// snippet window begins and the length in UTF-16 code units of the window.
// This is for use with UTF-16 encoded strings like Java.lang.String.
- //
- // For the example above, the values of these fields would be
- // window_utf16_position=17, window_utf16_length=14 "comida mañana."
optional int32 window_utf16_position = 8;
optional int32 window_utf16_length = 9;
@@ -308,54 +278,3 @@ message GetResultSpecProto {
// type will be retrieved.
repeated TypePropertyMask type_property_masks = 1;
}
-
-// Next tag: 5
-message SuggestionSpecProto {
- // REQUIRED: The "raw" prefix string that users may type. For example, "f"
- // will search for suggested query that start with "f" like "foo", "fool".
- optional string prefix = 1;
-
- // OPTIONAL: Only search for suggestions that under the specified namespaces.
- // If unset, the suggestion will search over all namespaces. Note that this
- // applies to the entire 'prefix'. To issue different suggestions for
- // different namespaces, separate RunSuggestion()'s will need to be made.
- repeated string namespace_filters = 2;
-
- // REQUIRED: The number of suggestions to be returned.
- optional int32 num_to_return = 3;
-
- // Indicates how the suggestion terms should be scored and ranked.
- message SuggestionScoringSpecProto {
- // TermMatchType.Code=UNKNOWN
- // Should never purposely be set and may lead to undefined behavior. This is
- // used for backwards compatibility reasons.
- //
- // TermMatchType.Code=EXACT_ONLY
- // Only exact hits will be counted to score a suggestion term.
- //
- // TermMatchType.Code=PREFIX
- // Both exact hits and prefix hits will be counted to score a suggestion
- // term.
- optional TermMatchType.Code scoring_match_type = 1;
- }
-
- optional SuggestionScoringSpecProto scoring_spec = 4;
-}
-
-// Next tag: 3
-message SuggestionResponse {
- message Suggestion {
- // The suggested query string for client to search for.
- optional string query = 1;
- }
-
- // Status code can be one of:
- // OK
- // FAILED_PRECONDITION
- // INTERNAL
- //
- // See status.proto for more details.
- optional StatusProto status = 1;
-
- repeated Suggestion suggestions = 2;
-}
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index 73d349b..35ad6d9 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=436284873)
+set(synced_AOSP_CL_number=378695940)